From be0e5c097fc206b863ce9fe6b3cfd6974b0110f4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 26 Jan 2007 15:51:26 -0500
Subject: Btrfs: Initial checkin, basic working tree code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 810 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 810 insertions(+)
 create mode 100644 fs/btrfs/ctree.c

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
new file mode 100644
index 00000000000..893fd56960a
--- /dev/null
+++ b/fs/btrfs/ctree.c
@@ -0,0 +1,810 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+
+#define BLOCKSIZE 4096
+
+struct key {
+	u64 objectid;
+	u32 flags;
+	u64 offset;
+} __attribute__ ((__packed__));
+
+struct header {
+	u64 fsid[2]; /* FS specific uuid */
+	u64 blocknum;
+	u64 parentid;
+	u32 csum;
+	u32 ham;
+	u16 nritems;
+	u16 flags;
+} __attribute__ ((__packed__));
+
+#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \
+			    (sizeof(struct key) + sizeof(u64)))
+
+#define LEVEL_BITS 3
+#define MAX_LEVEL (1 << LEVEL_BITS)
+#define node_level(f) ((f) & (MAX_LEVEL-1))
+#define is_leaf(f) (node_level(f) == 0)
+
+struct ctree_root {
+	struct node *node;
+};
+
+struct item {
+	struct key key;
+	u16 offset;
+	u16 size;
+} __attribute__ ((__packed__));
+
+#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header))
+struct leaf {
+	struct header header;
+	union {
+		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+		u8 data[BLOCKSIZE-sizeof(struct header)];
+	};
+} __attribute__ ((__packed__));
+
+struct node {
+	struct header header;
+	struct key keys[NODEPTRS_PER_BLOCK];
+	u64 blockptrs[NODEPTRS_PER_BLOCK];
+} __attribute__ ((__packed__));
+
+struct ctree_path {
+	struct node *nodes[MAX_LEVEL];
+	int slots[MAX_LEVEL];
+};
+
+static inline void init_path(struct ctree_path *p)
+{
+	memset(p, 0, sizeof(*p));
+}
+
+static inline unsigned int leaf_data_end(struct leaf *leaf)
+{
+	unsigned int nr = leaf->header.nritems;
+	if (nr == 0)
+		return ARRAY_SIZE(leaf->data); /* empty leaf: no data yet, so the data region starts at the end of the block */
+	return leaf->items[nr-1].offset; /* item data is packed downward from the end, so the last item has the lowest offset in use */
+}
+
+static inline int leaf_free_space(struct leaf *leaf) /* bytes left between the item headers (growing up) and the item data (growing down) */
+{
+	int data_end = leaf_data_end(leaf);
+	int nritems = leaf->header.nritems;
+	char *items_end = (char *)(leaf->items + nritems + 1); /* note the + 1: one extra item header is counted as already used */
+	return (char *)(leaf->data + data_end) - (char *)items_end; /* items[] and data[] share one union, so this is a plain byte distance; negative if they overlap */
+}
+
+int comp_keys(struct key *k1, struct key *k2) /* three-way compare: 1 if k1 > k2, -1 if k1 < k2, 0 if equal */
+{
+	if (k1->objectid > k2->objectid) /* objectid is the most significant field */
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->flags > k2->flags) /* flags break ties between equal objectids */
+		return 1;
+	if (k1->flags < k2->flags)
+		return -1;
+	if (k1->offset > k2->offset) /* offset is the least significant field */
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0; /* all three fields match */
+}
+int generic_bin_search(char *p, int item_size, struct key *key, /* p: array of max records spaced item_size bytes apart, each beginning with a struct key */
+		       int max, int *slot)
+{
+	int low = 0;
+	int high = max; /* the search window is the half-open range [low, high) */
+	int mid;
+	int ret;
+	struct key *tmp;
+
+	while(low < high) {
+		mid = (low + high) / 2;
+		tmp = (struct key *)(p + mid * item_size); /* key at the start of record mid */
+		ret = comp_keys(tmp, key);
+
+		if (ret < 0)
+			low = mid + 1;
+		else if (ret > 0)
+			high = mid;
+		else {
+			*slot = mid; /* exact match: report its index and return 0 */
+			return 0;
+		}
+	}
+	*slot = low; /* no match: return 1 with the index where key would be inserted (assumes the records are sorted ascending) */
+	return 1;
+}
+
+int bin_search(struct node *c, struct key *key, int *slot) /* search one block for key; return value and *slot are those of generic_bin_search */
+{
+	if (is_leaf(c->header.flags)) {
+		struct leaf *l = (struct leaf *)c; /* a level-0 block is laid out as a struct leaf */
+		return generic_bin_search((void *)l->items, sizeof(struct item), /* stride over whole items; the key is the first member of each */
+					  key, c->header.nritems, slot);
+	} else {
+		return generic_bin_search((void *)c->keys, sizeof(struct key), /* interior node: keys are stored back to back */
+					  key, c->header.nritems, slot);
+	}
+	return -1;
+}
+
+void *read_block(u64 blocknum)
+{
+	return (void *)blocknum;
+}
+
+int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
+{
+	struct node *c = root->node;
+	int slot;
+	int ret;
+	int level;
+	while (c) {
+		level = node_level(c->header.flags);
+		p->nodes[level] = c;
+		ret = bin_search(c, key, &slot);
+		if (!is_leaf(c->header.flags)) {
+			if (ret && slot > 0)
+				slot -= 1;
+			p->slots[level] = slot;
+			c = read_block(c->blockptrs[slot]);
+			continue;
+		} else {
+			p->slots[level] = slot;
+			return ret;
+		}
+	}
+	return -1;
+}
+
+static void fixup_low_keys(struct ctree_path *path, struct key *key,
+			     int level)
+{
+	int i;
+	/* adjust the pointers going up the tree */
+	for (i = level; i < MAX_LEVEL; i++) {
+		struct node *t = path->nodes[i];
+		int tslot = path->slots[i];
+		if (!t)
+			break;
+		memcpy(t->keys + tslot, key, sizeof(*key));
+		if (tslot != 0)
+			break;
+	}
+}
+
+int __insert_ptr(struct ctree_root *root, /* store (key, blocknr) at index slot of the node at path->nodes[level]; always returns 0, aborts via BUG() on inconsistency */
+		struct ctree_path *path, struct key *key, /* if no node exists at this level, a new two-entry root is built over path->nodes[level-1] and blocknr instead */
+		u64 blocknr, int slot, int level)
+{
+	struct node *c; /* NOTE(review): the memset of c below passes sizeof(c), the size of the pointer, so the freshly allocated root is not fully zeroed */
+	struct node *lower;
+	struct key *lower_key;
+	int nritems;
+	/* need a new root */
+	if (!path->nodes[level]) {
+		c = malloc(sizeof(struct node));
+		memset(c, 0, sizeof(c));
+		c->header.nritems = 2;
+		c->header.flags = node_level(level);
+		lower = path->nodes[level-1];
+		if (is_leaf(lower->header.flags))
+			lower_key = &((struct leaf *)lower)->items[0].key;
+		else
+			lower_key = lower->keys;
+		memcpy(c->keys, lower_key, sizeof(struct key));
+		memcpy(c->keys + 1, key, sizeof(struct key));
+		c->blockptrs[0] = (u64)lower;
+		c->blockptrs[1] = blocknr;
+		root->node = c;
+		path->nodes[level] = c;
+		path->slots[level] = 0;
+		if (c->keys[1].objectid == 0)
+			BUG();
+		return 0;
+	}
+	lower = path->nodes[level];
+	nritems = lower->header.nritems;
+	if (slot > nritems)
+		BUG();
+	if (nritems == NODEPTRS_PER_BLOCK) /* the node must not be full here; no splitting is done in this function */
+		BUG();
+	if (slot != nritems) { /* not appending: shift keys and block pointers up by one to open a gap at slot */
+		memmove(lower->keys + slot + 1, lower->keys + slot,
+			(nritems - slot) * sizeof(struct key));
+		memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
+			(nritems - slot) * sizeof(u64));
+	}
+	memcpy(lower->keys + slot, key, sizeof(struct key)); /* fill the gap (or the first unused entry) with the new pair */
+	lower->blockptrs[slot] = blocknr;
+	lower->header.nritems++;
+	if (lower->keys[1].objectid == 0) /* sanity check: a zero objectid in entry 1 is treated as corruption */
+			BUG();
+	return 0;
+}
+
+int push_node_left(struct ctree_root *root, struct ctree_path *path, int level) /* move leading entries of path->nodes[level] into its left sibling; 0 if entries moved, 1 if nothing could be done */
+{
+	int slot;
+	struct node *left;
+	struct node *right;
+	int push_items = 0;
+	int left_nritems;
+	int right_nritems;
+
+	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0) /* no parent in the path, hence no sibling to reach */
+		return 1;
+	slot = path->slots[level + 1];
+	if (slot == 0) /* first child of its parent: there is no left sibling under this parent */
+		return 1;
+
+	left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]); /* left sibling is the parent's previous pointer */
+	right = path->nodes[level];
+	left_nritems = left->header.nritems;
+	right_nritems = right->header.nritems;
+	push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1); /* fill left only up to one entry short of full */
+	if (push_items <= 0)
+		return 1;
+
+	if (right_nritems < push_items) /* cannot move more than right holds */
+		push_items = right_nritems;
+	memcpy(left->keys + left_nritems, right->keys, /* append right's first push_items keys to left */
+		push_items * sizeof(struct key));
+	memcpy(left->blockptrs + left_nritems, right->blockptrs, /* and the matching block pointers */
+		push_items * sizeof(u64));
+	memmove(right->keys, right->keys + push_items, /* slide what remains in right down to index 0 */
+		(right_nritems - push_items) * sizeof(struct key));
+	memmove(right->blockptrs, right->blockptrs + push_items,
+		(right_nritems - push_items) * sizeof(u64));
+	right->header.nritems -= push_items;
+	left->header.nritems += push_items;
+
+	/* adjust the pointers going up the tree */
+	fixup_low_keys(path, right->keys, level + 1); /* right's first key changed, so rewrite the parent's copy of it */
+
+	/* then fixup the leaf pointer in the path */
+	if (path->slots[level] < push_items) { /* the entry the path pointed at was moved: follow it into left */
+		path->slots[level] += left_nritems;
+		path->nodes[level] = (struct node*)left;
+		path->slots[level + 1] -= 1;
+	} else {
+		path->slots[level] -= push_items; /* still in right, but shifted down by the number of entries removed */
+	}
+	return 0;
+}
+
+int push_node_right(struct ctree_root *root, struct ctree_path *path, int level) /* move trailing entries of path->nodes[level] into its right sibling; 0 if entries moved, 1 if nothing could be done */
+{
+	int slot;
+	struct node *dst;
+	struct node *src;
+	int push_items = 0;
+	int dst_nritems;
+	int src_nritems;
+
+	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0) /* no parent in the path, hence no sibling to reach */
+		return 1;
+	slot = path->slots[level + 1];
+	if (slot == NODEPTRS_PER_BLOCK - 1) /* already in the last possible parent slot */
+		return 1;
+
+	if (slot >= path->nodes[level + 1]->header.nritems -1) /* last child actually present in the parent: no right sibling */
+		return 1;
+
+	dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]); /* right sibling is the parent's next pointer */
+	src = path->nodes[level];
+	dst_nritems = dst->header.nritems;
+	src_nritems = src->header.nritems;
+	push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1); /* fill dst only up to one entry short of full */
+	if (push_items <= 0)
+		return 1;
+
+	if (src_nritems < push_items) /* cannot move more than src holds */
+		push_items = src_nritems;
+	memmove(dst->keys + push_items, dst->keys, /* make room at the front of dst */
+		dst_nritems * sizeof(struct key));
+	memcpy(dst->keys, src->keys + src_nritems - push_items, /* copy the last push_items keys of src into that room */
+		push_items * sizeof(struct key));
+
+	memmove(dst->blockptrs + push_items, dst->blockptrs, /* same two steps for the block pointers */
+		dst_nritems * sizeof(u64));
+	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
+		push_items * sizeof(u64));
+
+	src->header.nritems -= push_items;
+	dst->header.nritems += push_items;
+
+	/* adjust the pointers going up the tree */
+	memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1, /* dst has a new first key: store it in the parent entry for dst */
+		dst->keys, sizeof(struct key));
+	/* then fixup the leaf pointer in the path */
+	if (path->slots[level] >= src->header.nritems) { /* the entry the path pointed at was moved: follow it into dst */
+		path->slots[level] -= src->header.nritems;
+		path->nodes[level] = (struct node*)dst;
+		path->slots[level + 1] += 1;
+	}
+	return 0;
+}
+
+int insert_ptr(struct ctree_root *root, /* insert (key, blocknr) after path->slots[level] in the node at level, making room first; returns __insert_ptr's result */
+		struct ctree_path *path, struct key *key,
+		u64 blocknr, int level)
+{
+	struct node *c = path->nodes[level];
+	struct node *b;
+	struct node *bal[MAX_LEVEL]; /* bal[l] is the new right half created by splitting the node at level l, or NULL */
+	int bal_level = level;
+	int mid;
+	int bal_start = -1; /* highest level that was split; stays -1 if no split was needed */
+
+	memset(bal, 0, sizeof(bal)); /* clear every slot: the loop below tests bal[] entries for NULL (ARRAY_SIZE gave the element count, not the byte count) */
+	while(c && c->header.nritems == NODEPTRS_PER_BLOCK) { /* walk up while the node on the path is full */
+		if (push_node_left(root, path, /* first try to shed entries into a sibling; that frees a slot without splitting */
+		   node_level(c->header.flags)) == 0)
+			break;
+		if (push_node_right(root, path,
+		   node_level(c->header.flags)) == 0)
+			break;
+		bal_start = bal_level;
+		if (bal_level == MAX_LEVEL - 1) /* the tree cannot grow past MAX_LEVEL levels */
+			BUG();
+		b = malloc(sizeof(struct node)); /* NOTE(review): result is not checked and only flags/nritems of the header are set */
+		b->header.flags = c->header.flags;
+		mid = (c->header.nritems + 1) / 2;
+		memcpy(b->keys, c->keys + mid, /* upper half of c moves into the new node b */
+			(c->header.nritems - mid) * sizeof(struct key));
+		memcpy(b->blockptrs, c->blockptrs + mid,
+			(c->header.nritems - mid) * sizeof(u64));
+		b->header.nritems = c->header.nritems - mid;
+		c->header.nritems = mid;
+		bal[bal_level] = b; /* remember b; its parent pointer is inserted by the loop below */
+		if (bal_level == MAX_LEVEL - 1)
+			break;
+		bal_level += 1;
+		c = path->nodes[bal_level]; /* the parent needs a free slot for b, so check it next */
+	}
+	while(bal_start > 0) { /* top-down: link each new half into the level above it */
+		b = bal[bal_start];
+		c = path->nodes[bal_start];
+		__insert_ptr(root, path, b->keys, (u64)b, /* b goes right after the slot that points at c */
+				path->slots[bal_start + 1] + 1, bal_start + 1);
+		if (path->slots[bal_start] >= c->header.nritems) { /* the path's slot landed in the half that moved: repoint the path at b */
+			path->slots[bal_start] -= c->header.nritems;
+			path->nodes[bal_start] = b;
+			path->slots[bal_start + 1] += 1;
+		}
+		bal_start--;
+		if (!bal[bal_start]) /* no split recorded at the next level down: done */
+			break;
+	}
+	return __insert_ptr(root, path, key, blocknr, path->slots[level] + 1, /* room is available now: do the insertion that was asked for */
+			    level);
+}
+
+int leaf_space_used(struct leaf *l, int start, int nr) /* bytes (item headers plus data) taken by the nr items beginning at index start */
+{
+	int data_len;
+	int end = start + nr - 1; /* index of the last item in the range */
+
+	if (!nr)
+		return 0;
+	data_len = l->items[start].offset + l->items[start].size; /* upper end of the range's data: where the first item's data stops */
+	data_len = data_len - l->items[end].offset; /* minus the lower end: where the last item's data starts */
+	data_len += sizeof(struct item) * nr; /* plus one header per item */
+	return data_len;
+}
+
+int push_leaf_left(struct ctree_root *root, struct ctree_path *path, /* move leading items of the leaf at path->nodes[0] into its left sibling; 0 if items moved, 1 if not */
+		   int data_size) /* data_size: payload size of an item the caller intends to add at path->slots[0] */
+{
+	struct leaf *right = (struct leaf *)path->nodes[0];
+	struct leaf *left;
+	int slot;
+	int i;
+	int free_space;
+	int push_space = 0;
+	int push_items = 0;
+	struct item *item;
+	int old_left_nritems;
+
+	slot = path->slots[1];
+	if (slot == 0) { /* first child of the parent (or no parent at all, since slots are zeroed by init_path) */
+		return 1;
+	}
+	if (!path->nodes[1]) {
+		return 1;
+	}
+	left = read_block(path->nodes[1]->blockptrs[slot - 1]); /* left sibling is the parent's previous pointer */
+	free_space = leaf_free_space(left);
+	if (free_space < data_size + sizeof(struct item)) { /* left cannot even take the pending item: give up */
+		return 1;
+	}
+	for (i = 0; i < right->header.nritems; i++) { /* count how many leading items of right fit into left */
+		item = right->items + i;
+		if (path->slots[0] == i) /* on reaching the insertion slot, also reserve room for the pending item */
+			push_space += data_size + sizeof(*item);
+		if (item->size + sizeof(*item) + push_space > free_space)
+			break;
+		push_items++;
+		push_space += item->size + sizeof(*item);
+	}
+	if (push_items == 0) {
+		return 1;
+	}
+	/* push data from right to left */
+	memcpy(left->items + left->header.nritems, /* item headers are appended after left's existing headers */
+		right->items, push_items * sizeof(struct item));
+	push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset; /* data bytes owned by the pushed items: from the last pushed item's offset to the end */
+	memcpy(left->data + leaf_data_end(left) - push_space, /* place that data directly below left's current data */
+		right->data + right->items[push_items - 1].offset,
+		push_space);
+	old_left_nritems = left->header.nritems;
+	for(i = old_left_nritems; i < old_left_nritems + push_items; i++) { /* rebase the copied offsets by the amount of data left already held */
+		left->items[i].offset -= LEAF_DATA_SIZE -
+			left->items[old_left_nritems -1].offset; /* reads items[-1] if left was empty; presumably empty leaves never stay linked in the tree (confirm) */
+	}
+	left->header.nritems += push_items;
+
+	/* fixup right node */
+	push_space = right->items[push_items-1].offset - leaf_data_end(right); /* data bytes that stay in right */
+	memmove(right->data + LEAF_DATA_SIZE - push_space, right->data + /* slide them up against the end of the data area */
+		leaf_data_end(right), push_space);
+	memmove(right->items, right->items + push_items, /* and slide the remaining headers down to index 0 */
+		(right->header.nritems - push_items) * sizeof(struct item));
+	right->header.nritems -= push_items;
+	push_space = LEAF_DATA_SIZE;
+	for (i = 0; i < right->header.nritems; i++) { /* recompute every offset from the end of the data area using the item sizes */
+		right->items[i].offset = push_space - right->items[i].size;
+		push_space = right->items[i].offset;
+	}
+	fixup_low_keys(path, &right->items[0].key, 1); /* right has a new first key: rewrite the parent's copy */
+
+	/* then fixup the leaf pointer in the path */
+	if (path->slots[0] < push_items) { /* the slot the path pointed at was moved: follow it into left */
+		path->slots[0] += old_left_nritems;
+		path->nodes[0] = (struct node*)left;
+		path->slots[1] -= 1;
+	} else {
+		path->slots[0] -= push_items; /* still in right, shifted down by the number of items removed */
+	}
+	return 0;
+}
+
+int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size) /* make room in the leaf at path->nodes[0] for an item with data_size bytes of data */
+{
+	struct leaf *l = (struct leaf *)path->nodes[0];
+	int nritems = l->header.nritems;
+	int mid = (nritems + 1)/ 2; /* items [mid, nritems) go to the new leaf */
+	int slot = path->slots[0];
+	struct leaf *right;
+	int space_needed = data_size + sizeof(struct item);
+	int data_copy_size;
+	int rt_data_off;
+	int i;
+	int ret;
+
+	if (push_leaf_left(root, path, data_size) == 0) { /* cheaper option first: shifting items into the left sibling avoids a split */
+		return 0;
+	}
+	right = malloc(sizeof(struct leaf)); /* NOTE(review): result is not checked before the memset */
+	memset(right, 0, sizeof(*right));
+	if (mid <= slot) { /* sanity checks: abort if the half that will receive the new item could not hold it */
+		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
+			LEAF_DATA_SIZE)
+			BUG();
+	} else {
+		if (leaf_space_used(l, 0, mid + 1) + space_needed >
+			LEAF_DATA_SIZE)
+			BUG();
+	}
+	right->header.nritems = nritems - mid;
+	data_copy_size = l->items[mid].offset + l->items[mid].size - /* bytes of data owned by items mid..nritems-1 */
+			 leaf_data_end(l);
+	memcpy(right->items, l->items + mid, /* copy the upper half of the item headers */
+	       (nritems - mid) * sizeof(struct item));
+	memcpy(right->data + LEAF_DATA_SIZE - data_copy_size, /* and their data, placed against the end of the new leaf */
+	       l->data + leaf_data_end(l), data_copy_size);
+	rt_data_off = LEAF_DATA_SIZE - /* distance the data moved up, so the copied offsets can be rebased */
+		     (l->items[mid].offset + l->items[mid].size);
+	for (i = 0; i < right->header.nritems; i++) {
+		right->items[i].offset += rt_data_off;
+	}
+	l->header.nritems = mid; /* the old leaf keeps only the lower half */
+	ret = insert_ptr(root, path, &right->items[0].key, /* link the new leaf into the level-1 node, keyed by its first item */
+			  (u64)right, 1);
+	if (mid <= slot) { /* the insertion slot is in the half that moved: repoint the path at the new leaf */
+		path->nodes[0] = (struct node *)right;
+		path->slots[0] -= mid;
+		path->slots[1] += 1;
+	}
+	return ret; /* 0 after a successful push_leaf_left, otherwise whatever insert_ptr returned */
+}
+
+int insert_item(struct ctree_root *root, struct key *key, /* add an item with the given key and data_size bytes of data; 0 on success, -EEXIST if the key is already present */
+			  void *data, int data_size)
+{
+	int ret;
+	int slot;
+	struct leaf *leaf;
+	unsigned int nritems;
+	unsigned int data_end;
+	struct ctree_path path;
+
+	init_path(&path);
+	ret = search_slot(root, key, &path); /* 0 means an exact match; otherwise path.slots[0] is the insertion index */
+	if (ret == 0)
+		return -EEXIST;
+
+	leaf = (struct leaf *)path.nodes[0];
+	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) /* compared as unsigned, since sizeof yields size_t */
+		split_leaf(root, &path, data_size); /* return value is ignored; the BUG() check below catches a leaf that is still too full */
+	leaf = (struct leaf *)path.nodes[0]; /* reload: split_leaf may have repointed the path at another leaf */
+	nritems = leaf->header.nritems;
+	data_end = leaf_data_end(leaf);
+	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
+		BUG();
+
+	slot = path.slots[0];
+	if (slot == 0) /* the new item becomes the leaf's first key, so the copies held by the parents must change */
+		fixup_low_keys(&path, key, 1);
+	if (slot != nritems) { /* inserting before existing items: open a gap in both the headers and the data */
+		int i;
+		unsigned int old_data = leaf->items[slot].offset + /* end of the data belonging to the item currently at slot */
+					leaf->items[slot].size;
+
+		/*
+		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+		 */
+		/* first correct the data pointers */
+		for (i = slot; i < nritems; i++)
+			leaf->items[i].offset -= data_size;
+
+		/* shift the items */
+		memmove(leaf->items + slot + 1, leaf->items + slot,
+		        (nritems - slot) * sizeof(struct item));
+
+		/* shift the data */
+		memmove(leaf->data + data_end - data_size, leaf->data + /* data of items slot..nritems-1 moves down by data_size */
+		        data_end, old_data - data_end);
+		data_end = old_data; /* the freed gap ends here; the new data is written just below it */
+	}
+	memcpy(&leaf->items[slot].key, key, sizeof(struct key));
+	leaf->items[slot].offset = data_end - data_size;
+	leaf->items[slot].size = data_size;
+	memcpy(leaf->data + data_end - data_size, data, data_size);
+	leaf->header.nritems += 1;
+	if (leaf_free_space(leaf) < 0) /* headers and data must not overlap after the insert */
+		BUG();
+	return 0;
+}
+
+int del_ptr(struct ctree_root *root, struct ctree_path *path, int level) /* remove the entry at path->slots[level] from the node at level, then continue upward while nodes become empty; always returns 0 */
+{
+	int slot;
+	struct node *node;
+	int nritems;
+
+	while(1) {
+		node = path->nodes[level];
+		if (!node) /* ran off the top of the path */
+			break;
+		slot = path->slots[level];
+		nritems = node->header.nritems;
+
+		if (slot != nritems -1) { /* not the last entry: close the gap in keys and block pointers */
+			memmove(node->keys + slot, node->keys + slot + 1,
+				sizeof(struct key) * (nritems - slot - 1));
+			memmove(node->blockptrs + slot,
+				node->blockptrs + slot + 1,
+				sizeof(u64) * (nritems - slot - 1));
+		}
+		node->header.nritems--;
+		if (node->header.nritems != 0) {
+			int tslot;
+			if (slot == 0) /* the node's first key changed: update the copies held by the parents */
+				fixup_low_keys(path, node->keys, level + 1);
+			tslot = path->slots[level+1]; /* saved because the push helpers may move the path to a sibling */
+			push_node_left(root, path, level); /* try to merge what is left into the siblings */
+			if (node->header.nritems) {
+				push_node_right(root, path, level);
+			}
+			path->slots[level+1] = tslot;
+			if (node->header.nritems)
+				break;
+		}
+		if (node == root->node) {
+			printf("root is now null!\n");
+			root->node = NULL; /* NOTE(review): the emptied root node itself is not freed on this path */
+			break;
+		}
+		level++; /* node is empty: its pointer in the parent is removed on the next iteration */
+		if (!path->nodes[level])
+			BUG();
+		free(node);
+	}
+	return 0;
+}
+
+int del_item(struct ctree_root *root, struct key *key)
+{
+	int ret;
+	int slot;
+	struct leaf *leaf;
+	struct ctree_path path;
+	int doff;
+	int dsize;
+
+	init_path(&path);
+	ret = search_slot(root, key, &path);
+	if (ret != 0)
+		return -1;
+
+	leaf = (struct leaf *)path.nodes[0];
+	slot = path.slots[0];
+	doff = leaf->items[slot].offset;
+	dsize = leaf->items[slot].size;
+
+	if (slot != leaf->header.nritems - 1) {
+		int i;
+		int data_end = leaf_data_end(leaf);
+		memmove(leaf->data + data_end + dsize,
+			leaf->data + data_end,
+			doff - data_end);
+		for (i = slot + 1; i < leaf->header.nritems; i++)
+			leaf->items[i].offset += dsize;
+		memmove(leaf->items + slot, leaf->items + slot + 1,
+			sizeof(struct item) *
+			(leaf->header.nritems - slot - 1));
+	}
+	leaf->header.nritems -= 1;
+	if (leaf->header.nritems == 0) {
+		free(leaf);
+		del_ptr(root, &path, 1);
+	} else {
+		if (slot == 0)
+			fixup_low_keys(&path, &leaf->items[0].key, 1);
+		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
+		    LEAF_DATA_SIZE / 4) {
+			/* push_leaf_left fixes the path.
+			 * make sure the path still points to our leaf
+			 * for possible call to del_ptr below
+			 */
+			slot = path.slots[1];
+			push_leaf_left(root, &path, 1);
+			path.slots[1] = slot;
+			if (leaf->header.nritems == 0) {
+				free(leaf);
+				del_ptr(root, &path, 1);
+			}
+		}
+	}
+	return 0;
+}
+
+void print_leaf(struct leaf *l) /* debugging aid: dump one leaf and all of its items to stdout */
+{
+	int i;
+	int nr = l->header.nritems;
+	struct item *item;
+	printf("leaf %p total ptrs %d free space %d\n", l, nr,
+	       leaf_free_space(l));
+	fflush(stdout); /* flushed after every line so output is not lost if a later access crashes */
+	for (i = 0 ; i < nr ; i++) {
+		item = l->items + i;
+		printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n", /* %lu matches u64 only because kerncompat.h typedefs it as unsigned long */
+			i,
+			item->key.objectid, item->key.flags, item->key.offset,
+			item->offset, item->size);
+		fflush(stdout);
+		printf("\t\titem data %.*s\n", item->size, l->data+item->offset); /* data is printed as text, limited to item->size bytes */
+		fflush(stdout);
+	}
+}
+void print_tree(struct node *c) /* debugging aid: recursively dump the subtree rooted at c, checking child levels on the way */
+{
+	int i;
+	int nr;
+
+	if (!c) /* empty tree: nothing to print */
+		return;
+	nr = c->header.nritems;
+	if (is_leaf(c->header.flags)) {
+		print_leaf((struct leaf *)c);
+		return;
+	}
+	printf("node %p level %d total ptrs %d free spc %lu\n", c,
+	        node_level(c->header.flags), c->header.nritems,
+		NODEPTRS_PER_BLOCK - c->header.nritems); /* unused pointer slots in this node */
+	fflush(stdout);
+	for (i = 0; i < nr; i++) { /* first pass: list this node's own keys and block pointers */
+		printf("\tkey %d (%lu %u %lu) block %lx\n",
+		       i,
+		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
+		       c->blockptrs[i]);
+		fflush(stdout);
+	}
+	for (i = 0; i < nr; i++) { /* second pass: descend into every child */
+		struct node *next = read_block(c->blockptrs[i]);
+		if (is_leaf(next->header.flags) && /* leaves may only hang off level-1 nodes */
+		    node_level(c->header.flags) != 1)
+			BUG();
+		if (node_level(next->header.flags) != /* every child must be exactly one level below its parent */
+			node_level(c->header.flags) - 1)
+			BUG();
+		print_tree(next);
+	}
+
+}
+
+/* for testing only */
+int next_key(int i, int max_key) { /* test helper: next key to use, a pseudo-random value in [0, max_key) */
+	return rand() % max_key; /* i is ignored here; reseeding with srand() replays the same sequence */
+	// return i;
+}
+
+int main() {
+	struct leaf *first_node = malloc(sizeof(struct leaf));
+	struct ctree_root root;
+	struct key ins;
+	char *buf;
+	int i;
+	int num;
+	int ret;
+	int run_size = 10000000;
+	int max_key = 100000000;
+	int tree_size = 0;
+	struct ctree_path path;
+
+
+	srand(55);
+	root.node = (struct node *)first_node;
+	memset(first_node, 0, sizeof(*first_node));
+	for (i = 0; i < run_size; i++) {
+		buf = malloc(64);
+		num = next_key(i, max_key);
+		// num = i;
+		sprintf(buf, "string-%d", num);
+		// printf("insert %d\n", num);
+		ins.objectid = num;
+		ins.offset = 0;
+		ins.flags = 0;
+		ret = insert_item(&root, &ins, buf, strlen(buf));
+		if (!ret)
+			tree_size++;
+	}
+	srand(55);
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		ins.offset = 0;
+		ins.flags = 0;
+		init_path(&path);
+		ret = search_slot(&root, &ins, &path);
+		if (ret) {
+			print_tree(root.node);
+			printf("unable to find %d\n", num);
+			exit(1);
+		}
+	}
+	printf("node %p level %d total ptrs %d free spc %lu\n", root.node,
+	        node_level(root.node->header.flags), root.node->header.nritems,
+		NODEPTRS_PER_BLOCK - root.node->header.nritems);
+	// print_tree(root.node);
+	printf("all searches good\n");
+	i = 0;
+	srand(55);
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		del_item(&root, &ins);
+	}
+	print_tree(root.node);
+	return 0;
+}
-- 
cgit v1.2.3


From 4920c9ac9a4bbc6bf9acd8c614987ee6b378e78f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 26 Jan 2007 16:38:42 -0500
Subject: Btrfs: Faster deletes, add Makefile and kerncompat

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |  7 +++++
 fs/btrfs/ctree.c      | 82 ++++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/kerncompat.h | 68 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 136 insertions(+), 21 deletions(-)
 create mode 100644 fs/btrfs/Makefile
 create mode 100644 fs/btrfs/kerncompat.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
new file mode 100644
index 00000000000..9f84c08baab
--- /dev/null
+++ b/fs/btrfs/Makefile
@@ -0,0 +1,7 @@
+
+ctree: ctree.o
+	gcc -g -O2 -Wall -o ctree ctree.c
+
+clean:
+	rm ctree ctree.o
+
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 893fd56960a..4bf5e92584b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -615,9 +615,9 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 			if (node->header.nritems) {
 				push_node_right(root, path, level);
 			}
-			path->slots[level+1] = tslot;
 			if (node->header.nritems)
 				break;
+			path->slots[level+1] = tslot;
 		}
 		if (node == root->node) {
 			printf("root is now null!\n");
@@ -632,22 +632,15 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
-int del_item(struct ctree_root *root, struct key *key)
+int del_item(struct ctree_root *root, struct ctree_path *path)
 {
-	int ret;
 	int slot;
 	struct leaf *leaf;
-	struct ctree_path path;
 	int doff;
 	int dsize;
 
-	init_path(&path);
-	ret = search_slot(root, key, &path);
-	if (ret != 0)
-		return -1;
-
-	leaf = (struct leaf *)path.nodes[0];
-	slot = path.slots[0];
+	leaf = (struct leaf *)path->nodes[0];
+	slot = path->slots[0];
 	doff = leaf->items[slot].offset;
 	dsize = leaf->items[slot].size;
 
@@ -665,23 +658,26 @@ int del_item(struct ctree_root *root, struct key *key)
 	}
 	leaf->header.nritems -= 1;
 	if (leaf->header.nritems == 0) {
+		if (leaf == (struct leaf *)root->node)
+			root->node = NULL;
+		else
+			del_ptr(root, path, 1);
 		free(leaf);
-		del_ptr(root, &path, 1);
 	} else {
 		if (slot == 0)
-			fixup_low_keys(&path, &leaf->items[0].key, 1);
+			fixup_low_keys(path, &leaf->items[0].key, 1);
 		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
 		    LEAF_DATA_SIZE / 4) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
 			 */
-			slot = path.slots[1];
-			push_leaf_left(root, &path, 1);
-			path.slots[1] = slot;
+			slot = path->slots[1];
+			push_leaf_left(root, path, 1);
 			if (leaf->header.nritems == 0) {
 				free(leaf);
-				del_ptr(root, &path, 1);
+				path->slots[1] = slot;
+				del_ptr(root, path, 1);
 			}
 		}
 	}
@@ -753,11 +749,12 @@ int main() {
 	struct leaf *first_node = malloc(sizeof(struct leaf));
 	struct ctree_root root;
 	struct key ins;
+	struct key last = { (u64)-1, 0, 0};
 	char *buf;
 	int i;
 	int num;
 	int ret;
-	int run_size = 10000000;
+	int run_size = 100000;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
@@ -783,8 +780,6 @@ int main() {
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
-		ins.offset = 0;
-		ins.flags = 0;
 		init_path(&path);
 		ret = search_slot(&root, &ins, &path);
 		if (ret) {
@@ -800,11 +795,56 @@ int main() {
 	printf("all searches good\n");
 	i = 0;
 	srand(55);
+	for (i = 0 ; i < run_size/4; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		ret = search_slot(&root, &ins, &path);
+		if (ret)
+			continue;
+		ret = del_item(&root, &path);
+		if (ret != 0)
+			BUG();
+		tree_size--;
+	}
+	srand(128);
 	for (i = 0; i < run_size; i++) {
+		buf = malloc(64);
 		num = next_key(i, max_key);
+		sprintf(buf, "string-%d", num);
 		ins.objectid = num;
-		del_item(&root, &ins);
+		ret = insert_item(&root, &ins, buf, strlen(buf));
+		if (!ret)
+			tree_size++;
+	}
+	while(root.node) {
+		struct leaf *leaf;
+		int slot;
+		ins.objectid = (u64)-1;
+		init_path(&path);
+		ret = search_slot(&root, &ins, &path);
+		if (ret == 0)
+			BUG();
+
+		leaf = (struct leaf *)(path.nodes[0]);
+		slot = path.slots[0];
+		if (slot != leaf->header.nritems)
+			BUG();
+		while(path.slots[0] > 0) {
+			path.slots[0] -= 1;
+			slot = path.slots[0];
+			leaf = (struct leaf *)(path.nodes[0]);
+
+			if (comp_keys(&last, &leaf->items[slot].key) <= 0)
+				BUG();
+			memcpy(&last, &leaf->items[slot].key, sizeof(last));
+			ret = del_item(&root, &path);
+			if (ret != 0)
+				BUG();
+			tree_size--;
+		}
 	}
 	print_tree(root.node);
+	printf("tree size is now %d\n", tree_size);
 	return 0;
 }
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
new file mode 100644
index 00000000000..3a4bb4d661f
--- /dev/null
+++ b/fs/btrfs/kerncompat.h
@@ -0,0 +1,68 @@
+#ifndef __KERNCOMPAT
+#define __KERNCOMPAT
+#define gfp_t int /* plain int in this userspace build */
+#define get_cpu_var(p) (p) /* expands to its argument unchanged */
+#define __get_cpu_var(p) (p) /* expands to its argument unchanged */
+#define BITS_PER_LONG 64 /* hard-coded: the bit helpers below index unsigned long words with this width */
+#define __GFP_BITS_SHIFT 20
+#define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1)) /* low __GFP_BITS_SHIFT bits set */
+#define __read_mostly /* expands to nothing */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) /* element count; only meaningful for real arrays, not pointers */
+#define __force /* expands to nothing */
+#define PAGE_SHIFT 12
+#define ULONG_MAX       (~0UL) /* all-ones unsigned long */
+#define BUG() abort() /* terminate the process on an internal inconsistency */
+
+typedef unsigned int u32;
+typedef unsigned long u64; /* 64 bits wide only where unsigned long is (e.g. LP64 targets) */
+typedef unsigned char u8;
+typedef unsigned short u16;
+
+typedef unsigned long pgoff_t;
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct vma_shared { int prio_tree_node; };
+struct vm_area_struct {
+	unsigned long vm_pgoff;
+	unsigned long vm_start;
+	unsigned long vm_end;
+	struct vma_shared shared;
+};
+
+struct page {
+	unsigned long index;
+};
+
+static inline void preempt_enable(void) { do {; } while(0);}
+static inline void preempt_disable(void) { do {; } while(0);}
+
+static inline void __set_bit(int bit, unsigned long *map) { /* set bit number bit in the bitmap starting at map */
+	unsigned long *p = map + bit / BITS_PER_LONG; /* word that holds the bit */
+	bit = bit & (BITS_PER_LONG -1); /* position of the bit inside that word */
+	*p |= 1UL << bit; /* plain read-modify-write */
+}
+
+static inline int test_bit(int bit, unsigned long *map) { /* return 1 if bit number bit is set in the bitmap at map, else 0 */
+	unsigned long *p = map + bit / BITS_PER_LONG; /* word that holds the bit */
+	bit = bit & (BITS_PER_LONG -1); /* position of the bit inside that word */
+	return *p & (1UL << bit) ? 1 : 0;
+}
+
+static inline void __clear_bit(int bit, unsigned long *map) { /* clear bit number bit in the bitmap starting at map */
+	unsigned long *p = map + bit / BITS_PER_LONG; /* word that holds the bit */
+	bit = bit & (BITS_PER_LONG -1); /* position of the bit inside that word */
+	*p &= ~(1UL << bit); /* plain read-modify-write */
+}
+#define BUG_ON(c) do { if (c) abort(); } while (0)
+
+#define container_of(ptr, type, member) ({                      \
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
+	        (type *)( (char *)__mptr - __builtin_offsetof(type,member) );})
+
+#endif
+
+#define ENOMEM 5
+#define EEXIST 6
-- 
cgit v1.2.3


From eb60ceac0ebd11e9896ed1ca84fcad71a4b23601 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Feb 2007 09:18:22 -0500
Subject: Btrfs: Add backing store, memory management

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |  11 +-
 fs/btrfs/ctree.c      | 432 ++++++++++++++++----------
 fs/btrfs/ctree.h      |  62 ++++
 fs/btrfs/disk-io.c    | 174 +++++++++++
 fs/btrfs/disk-io.h    |  21 ++
 fs/btrfs/kerncompat.h |   1 +
 fs/btrfs/radix-tree.c | 836 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/radix-tree.h |  73 +++++
 8 files changed, 1447 insertions(+), 163 deletions(-)
 create mode 100644 fs/btrfs/ctree.h
 create mode 100644 fs/btrfs/disk-io.c
 create mode 100644 fs/btrfs/disk-io.h
 create mode 100644 fs/btrfs/radix-tree.c
 create mode 100644 fs/btrfs/radix-tree.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9f84c08baab..63360212a07 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,7 +1,12 @@
 
-ctree: ctree.o
-	gcc -g -O2 -Wall -o ctree ctree.c
+CFLAGS= -g -Wall
+
+.c.o:
+	$(CC) $(CFLAGS) -c $<
+
+ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h
+	gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o
 
 clean:
-	rm ctree ctree.o
+	rm ctree *.o
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bf5e92584b..6f0522f2108 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,68 +1,25 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "kerncompat.h"
-
-#define BLOCKSIZE 4096
-
-struct key {
-	u64 objectid;
-	u32 flags;
-	u64 offset;
-} __attribute__ ((__packed__));
-
-struct header {
-	u64 fsid[2]; /* FS specific uuid */
-	u64 blocknum;
-	u64 parentid;
-	u32 csum;
-	u32 ham;
-	u16 nritems;
-	u16 flags;
-} __attribute__ ((__packed__));
-
-#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \
-			    (sizeof(struct key) + sizeof(u64)))
-
-#define LEVEL_BITS 3
-#define MAX_LEVEL (1 << LEVEL_BITS)
-#define node_level(f) ((f) & (MAX_LEVEL-1))
-#define is_leaf(f) (node_level(f) == 0)
-
-struct ctree_root {
-	struct node *node;
-};
-
-struct item {
-	struct key key;
-	u16 offset;
-	u16 size;
-} __attribute__ ((__packed__));
-
-#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header))
-struct leaf {
-	struct header header;
-	union {
-		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
-		u8 data[BLOCKSIZE-sizeof(struct header)];
-	};
-} __attribute__ ((__packed__));
-
-struct node {
-	struct header header;
-	struct key keys[NODEPTRS_PER_BLOCK];
-	u64 blockptrs[NODEPTRS_PER_BLOCK];
-} __attribute__ ((__packed__));
-
-struct ctree_path {
-	struct node *nodes[MAX_LEVEL];
-	int slots[MAX_LEVEL];
-};
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
 
 static inline void init_path(struct ctree_path *p)
 {
 	memset(p, 0, sizeof(*p));
 }
 
+static void release_path(struct ctree_root *root, struct ctree_path *p) /* hand every buffer recorded in the path to tree_block_release */
+{
+	int i;
+	for (i = 0; i < MAX_LEVEL; i++) { /* walk from level 0 upward */
+		if (!p->nodes[i]) /* first empty level ends the path */
+			break;
+		tree_block_release(root, p->nodes[i]); /* presumably drops the reference taken while searching; confirm in disk-io.c */
+	}
+}
+
 static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
 	unsigned int nr = leaf->header.nritems;
@@ -135,26 +92,25 @@ int bin_search(struct node *c, struct key *key, int *slot)
 	return -1;
 }
 
-void *read_block(u64 blocknum)
-{
-	return (void *)blocknum;
-}
-
 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 {
-	struct node *c = root->node;
+	struct tree_buffer *b = root->node;
+	struct node *c;
+
 	int slot;
 	int ret;
 	int level;
-	while (c) {
+	b->count++;
+	while (b) {
+		c = &b->node;
 		level = node_level(c->header.flags);
-		p->nodes[level] = c;
+		p->nodes[level] = b;
 		ret = bin_search(c, key, &slot);
 		if (!is_leaf(c->header.flags)) {
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			c = read_block(c->blockptrs[slot]);
+			b = read_tree_block(root, c->blockptrs[slot]);
 			continue;
 		} else {
 			p->slots[level] = slot;
@@ -164,17 +120,20 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 	return -1;
 }
 
-static void fixup_low_keys(struct ctree_path *path, struct key *key,
-			     int level)
+static void fixup_low_keys(struct ctree_root *root,
+			   struct ctree_path *path, struct key *key,
+			   int level)
 {
 	int i;
 	/* adjust the pointers going up the tree */
 	for (i = level; i < MAX_LEVEL; i++) {
-		struct node *t = path->nodes[i];
+		struct node *t;
 		int tslot = path->slots[i];
-		if (!t)
+		if (!path->nodes[i])
 			break;
+		t = &path->nodes[i]->node;
 		memcpy(t->keys + tslot, key, sizeof(*key));
+		write_tree_block(root, path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
@@ -190,27 +149,34 @@ int __insert_ptr(struct ctree_root *root,
 	int nritems;
 	/* need a new root */
 	if (!path->nodes[level]) {
-		c = malloc(sizeof(struct node));
+		struct tree_buffer *t;
+		t = alloc_free_block(root);
+		c = &t->node;
 		memset(c, 0, sizeof(c));
 		c->header.nritems = 2;
 		c->header.flags = node_level(level);
-		lower = path->nodes[level-1];
+		c->header.blocknr = t->blocknr;
+		lower = &path->nodes[level-1]->node;
 		if (is_leaf(lower->header.flags))
 			lower_key = &((struct leaf *)lower)->items[0].key;
 		else
 			lower_key = lower->keys;
 		memcpy(c->keys, lower_key, sizeof(struct key));
 		memcpy(c->keys + 1, key, sizeof(struct key));
-		c->blockptrs[0] = (u64)lower;
+		c->blockptrs[0] = path->nodes[level-1]->blocknr;
 		c->blockptrs[1] = blocknr;
-		root->node = c;
-		path->nodes[level] = c;
+		/* the path has an extra ref to root->node */
+		tree_block_release(root, root->node);
+		root->node = t;
+		t->count++;
+		write_tree_block(root, t);
+		path->nodes[level] = t;
 		path->slots[level] = 0;
 		if (c->keys[1].objectid == 0)
 			BUG();
 		return 0;
 	}
-	lower = path->nodes[level];
+	lower = &path->nodes[level]->node;
 	nritems = lower->header.nritems;
 	if (slot > nritems)
 		BUG();
@@ -227,6 +193,7 @@ int __insert_ptr(struct ctree_root *root,
 	lower->header.nritems++;
 	if (lower->keys[1].objectid == 0)
 			BUG();
+	write_tree_block(root, path->nodes[level]);
 	return 0;
 }
 
@@ -238,6 +205,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	int push_items = 0;
 	int left_nritems;
 	int right_nritems;
+	struct tree_buffer *t;
+	struct tree_buffer *right_buf;
 
 	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
 		return 1;
@@ -245,13 +214,18 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	if (slot == 0)
 		return 1;
 
-	left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]);
-	right = path->nodes[level];
+	t = read_tree_block(root,
+		            path->nodes[level + 1]->node.blockptrs[slot - 1]);
+	left = &t->node;
+	right_buf = path->nodes[level];
+	right = &right_buf->node;
 	left_nritems = left->header.nritems;
 	right_nritems = right->header.nritems;
 	push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
-	if (push_items <= 0)
+	if (push_items <= 0) {
+		tree_block_release(root, t);
 		return 1;
+	}
 
 	if (right_nritems < push_items)
 		push_items = right_nritems;
@@ -267,15 +241,20 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	left->header.nritems += push_items;
 
 	/* adjust the pointers going up the tree */
-	fixup_low_keys(path, right->keys, level + 1);
+	fixup_low_keys(root, path, right->keys, level + 1);
+
+	write_tree_block(root, t);
+	write_tree_block(root, right_buf);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[level] < push_items) {
 		path->slots[level] += left_nritems;
-		path->nodes[level] = (struct node*)left;
+		tree_block_release(root, path->nodes[level]);
+		path->nodes[level] = t;
 		path->slots[level + 1] -= 1;
 	} else {
 		path->slots[level] -= push_items;
+		tree_block_release(root, t);
 	}
 	return 0;
 }
@@ -283,6 +262,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
+	struct tree_buffer *t;
+	struct tree_buffer *src_buffer;
 	struct node *dst;
 	struct node *src;
 	int push_items = 0;
@@ -295,16 +276,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	if (slot == NODEPTRS_PER_BLOCK - 1)
 		return 1;
 
-	if (slot >= path->nodes[level + 1]->header.nritems -1)
+	if (slot >= path->nodes[level + 1]->node.header.nritems -1)
 		return 1;
 
-	dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]);
-	src = path->nodes[level];
+	t = read_tree_block(root,
+			    path->nodes[level + 1]->node.blockptrs[slot + 1]);
+	dst = &t->node;
+	src_buffer = path->nodes[level];
+	src = &src_buffer->node;
 	dst_nritems = dst->header.nritems;
 	src_nritems = src->header.nritems;
 	push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
-	if (push_items <= 0)
+	if (push_items <= 0) {
+		tree_block_release(root, t);
 		return 1;
+	}
 
 	if (src_nritems < push_items)
 		push_items = src_nritems;
@@ -322,13 +308,21 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	dst->header.nritems += push_items;
 
 	/* adjust the pointers going up the tree */
-	memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1,
+	memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
 		dst->keys, sizeof(struct key));
+
+	write_tree_block(root, path->nodes[level + 1]);
+	write_tree_block(root, t);
+	write_tree_block(root, src_buffer);
+
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[level] >= src->header.nritems) {
 		path->slots[level] -= src->header.nritems;
-		path->nodes[level] = (struct node*)dst;
+		tree_block_release(root, path->nodes[level]);
+		path->nodes[level] = t;
 		path->slots[level + 1] += 1;
+	} else {
+		tree_block_release(root, t);
 	}
 	return 0;
 }
@@ -337,15 +331,18 @@ int insert_ptr(struct ctree_root *root,
 		struct ctree_path *path, struct key *key,
 		u64 blocknr, int level)
 {
-	struct node *c = path->nodes[level];
+	struct tree_buffer *t = path->nodes[level];
+	struct node *c = &path->nodes[level]->node;
 	struct node *b;
-	struct node *bal[MAX_LEVEL];
+	struct tree_buffer *b_buffer;
+	struct tree_buffer *bal[MAX_LEVEL];
 	int bal_level = level;
 	int mid;
 	int bal_start = -1;
 
 	memset(bal, 0, ARRAY_SIZE(bal));
-	while(c && c->header.nritems == NODEPTRS_PER_BLOCK) {
+	while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
+		c = &t->node;
 		if (push_node_left(root, path,
 		   node_level(c->header.flags)) == 0)
 			break;
@@ -355,8 +352,10 @@ int insert_ptr(struct ctree_root *root,
 		bal_start = bal_level;
 		if (bal_level == MAX_LEVEL - 1)
 			BUG();
-		b = malloc(sizeof(struct node));
+		b_buffer = alloc_free_block(root);
+		b = &b_buffer->node;
 		b->header.flags = c->header.flags;
+		b->header.blocknr = b_buffer->blocknr;
 		mid = (c->header.nritems + 1) / 2;
 		memcpy(b->keys, c->keys + mid,
 			(c->header.nritems - mid) * sizeof(struct key));
@@ -364,21 +363,28 @@ int insert_ptr(struct ctree_root *root,
 			(c->header.nritems - mid) * sizeof(u64));
 		b->header.nritems = c->header.nritems - mid;
 		c->header.nritems = mid;
-		bal[bal_level] = b;
+
+		write_tree_block(root, t);
+		write_tree_block(root, b_buffer);
+
+		bal[bal_level] = b_buffer;
 		if (bal_level == MAX_LEVEL - 1)
 			break;
 		bal_level += 1;
-		c = path->nodes[bal_level];
+		t = path->nodes[bal_level];
 	}
 	while(bal_start > 0) {
-		b = bal[bal_start];
-		c = path->nodes[bal_start];
-		__insert_ptr(root, path, b->keys, (u64)b,
+		b_buffer = bal[bal_start];
+		c = &path->nodes[bal_start]->node;
+		__insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
 				path->slots[bal_start + 1] + 1, bal_start + 1);
 		if (path->slots[bal_start] >= c->header.nritems) {
 			path->slots[bal_start] -= c->header.nritems;
-			path->nodes[bal_start] = b;
+			tree_block_release(root, path->nodes[bal_start]);
+			path->nodes[bal_start] = b_buffer;
 			path->slots[bal_start + 1] += 1;
+		} else {
+			tree_block_release(root, b_buffer);
 		}
 		bal_start--;
 		if (!bal[bal_start])
@@ -404,7 +410,9 @@ int leaf_space_used(struct leaf *l, int start, int nr)
 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		   int data_size)
 {
-	struct leaf *right = (struct leaf *)path->nodes[0];
+	struct tree_buffer *right_buf = path->nodes[0];
+	struct leaf *right = &right_buf->leaf;
+	struct tree_buffer *t;
 	struct leaf *left;
 	int slot;
 	int i;
@@ -421,9 +429,11 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	if (!path->nodes[1]) {
 		return 1;
 	}
-	left = read_block(path->nodes[1]->blockptrs[slot - 1]);
+	t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
+	left = &t->leaf;
 	free_space = leaf_free_space(left);
 	if (free_space < data_size + sizeof(struct item)) {
+		tree_block_release(root, t);
 		return 1;
 	}
 	for (i = 0; i < right->header.nritems; i++) {
@@ -436,6 +446,7 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		push_space += item->size + sizeof(*item);
 	}
 	if (push_items == 0) {
+		tree_block_release(root, t);
 		return 1;
 	}
 	/* push data from right to left */
@@ -446,6 +457,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		right->data + right->items[push_items - 1].offset,
 		push_space);
 	old_left_nritems = left->header.nritems;
+	BUG_ON(old_left_nritems < 0);
+
 	for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		left->items[i].offset -= LEAF_DATA_SIZE -
 			left->items[old_left_nritems -1].offset;
@@ -460,30 +473,40 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		(right->header.nritems - push_items) * sizeof(struct item));
 	right->header.nritems -= push_items;
 	push_space = LEAF_DATA_SIZE;
+
 	for (i = 0; i < right->header.nritems; i++) {
 		right->items[i].offset = push_space - right->items[i].size;
 		push_space = right->items[i].offset;
 	}
-	fixup_low_keys(path, &right->items[0].key, 1);
+
+	write_tree_block(root, t);
+	write_tree_block(root, right_buf);
+
+	fixup_low_keys(root, path, &right->items[0].key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		path->nodes[0] = (struct node*)left;
+		tree_block_release(root, path->nodes[0]);
+		path->nodes[0] = t;
 		path->slots[1] -= 1;
 	} else {
+		tree_block_release(root, t);
 		path->slots[0] -= push_items;
 	}
+	BUG_ON(path->slots[0] < 0);
 	return 0;
 }
 
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 {
-	struct leaf *l = (struct leaf *)path->nodes[0];
-	int nritems = l->header.nritems;
-	int mid = (nritems + 1)/ 2;
-	int slot = path->slots[0];
+	struct tree_buffer *l_buf = path->nodes[0];
+	struct leaf *l = &l_buf->leaf;
+	int nritems;
+	int mid;
+	int slot;
 	struct leaf *right;
+	struct tree_buffer *right_buffer;
 	int space_needed = data_size + sizeof(struct item);
 	int data_copy_size;
 	int rt_data_off;
@@ -491,9 +514,19 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	int ret;
 
 	if (push_leaf_left(root, path, data_size) == 0) {
-		return 0;
+		l_buf = path->nodes[0];
+		l = &l_buf->leaf;
+		if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+			return 0;
 	}
-	right = malloc(sizeof(struct leaf));
+	slot = path->slots[0];
+	nritems = l->header.nritems;
+	mid = (nritems + 1)/ 2;
+
+	right_buffer = alloc_free_block(root);
+	BUG_ON(!right_buffer);
+	BUG_ON(mid == nritems);
+	right = &right_buffer->leaf;
 	memset(right, 0, sizeof(*right));
 	if (mid <= slot) {
 		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -505,6 +538,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 			BUG();
 	}
 	right->header.nritems = nritems - mid;
+	right->header.blocknr = right_buffer->blocknr;
+	right->header.flags = node_level(0);
 	data_copy_size = l->items[mid].offset + l->items[mid].size -
 			 leaf_data_end(l);
 	memcpy(right->items, l->items + mid,
@@ -518,12 +553,20 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	}
 	l->header.nritems = mid;
 	ret = insert_ptr(root, path, &right->items[0].key,
-			  (u64)right, 1);
+			  right_buffer->blocknr, 1);
+
+	write_tree_block(root, right_buffer);
+	write_tree_block(root, l_buf);
+
+	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
-		path->nodes[0] = (struct node *)right;
+		tree_block_release(root, path->nodes[0]);
+		path->nodes[0] = right_buffer;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
-	}
+	} else
+		tree_block_release(root, right_buffer);
+	BUG_ON(path->slots[0] < 0);
 	return ret;
 }
 
@@ -532,28 +575,48 @@ int insert_item(struct ctree_root *root, struct key *key,
 {
 	int ret;
 	int slot;
+	int slot_orig;
 	struct leaf *leaf;
+	struct tree_buffer *leaf_buf;
 	unsigned int nritems;
 	unsigned int data_end;
 	struct ctree_path path;
 
+	if (!root->node) {
+		struct tree_buffer *t;
+		t = alloc_free_block(root);
+		BUG_ON(!t);
+		t->node.header.nritems = 0;
+		t->node.header.flags = node_level(0);
+		t->node.header.blocknr = t->blocknr;
+		root->node = t;
+		write_tree_block(root, t);
+	}
 	init_path(&path);
 	ret = search_slot(root, key, &path);
-	if (ret == 0)
+	if (ret == 0) {
+		release_path(root, &path);
 		return -EEXIST;
+	}
 
-	leaf = (struct leaf *)path.nodes[0];
-	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
+	slot_orig = path.slots[0];
+	leaf_buf = path.nodes[0];
+	leaf = &leaf_buf->leaf;
+	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
 		split_leaf(root, &path, data_size);
-	leaf = (struct leaf *)path.nodes[0];
+		leaf_buf = path.nodes[0];
+		leaf = &path.nodes[0]->leaf;
+	}
 	nritems = leaf->header.nritems;
 	data_end = leaf_data_end(leaf);
+
 	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
 		BUG();
 
 	slot = path.slots[0];
+	BUG_ON(slot < 0);
 	if (slot == 0)
-		fixup_low_keys(&path, key, 1);
+		fixup_low_keys(root, &path, key, 1);
 	if (slot != nritems) {
 		int i;
 		unsigned int old_data = leaf->items[slot].offset +
@@ -580,21 +643,25 @@ int insert_item(struct ctree_root *root, struct key *key,
 	leaf->items[slot].size = data_size;
 	memcpy(leaf->data + data_end - data_size, data, data_size);
 	leaf->header.nritems += 1;
+	write_tree_block(root, leaf_buf);
 	if (leaf_free_space(leaf) < 0)
 		BUG();
+	release_path(root, &path);
 	return 0;
 }
 
 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
+	struct tree_buffer *t;
 	struct node *node;
 	int nritems;
 
 	while(1) {
-		node = path->nodes[level];
-		if (!node)
+		t = path->nodes[level];
+		if (!t)
 			break;
+		node = &t->node;
 		slot = path->slots[level];
 		nritems = node->header.nritems;
 
@@ -606,28 +673,34 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 				sizeof(u64) * (nritems - slot - 1));
 		}
 		node->header.nritems--;
+		write_tree_block(root, t);
 		if (node->header.nritems != 0) {
 			int tslot;
 			if (slot == 0)
-				fixup_low_keys(path, node->keys, level + 1);
+				fixup_low_keys(root, path, node->keys,
+					       level + 1);
 			tslot = path->slots[level+1];
+			t->count++;
 			push_node_left(root, path, level);
 			if (node->header.nritems) {
 				push_node_right(root, path, level);
 			}
-			if (node->header.nritems)
+			if (node->header.nritems) {
+				tree_block_release(root, t);
 				break;
+			}
+			tree_block_release(root, t);
 			path->slots[level+1] = tslot;
 		}
-		if (node == root->node) {
-			printf("root is now null!\n");
-			root->node = NULL;
+		if (t == root->node) {
+			/* just turn the root into a leaf and break */
+			root->node->node.header.flags = node_level(0);
+			write_tree_block(root, t);
 			break;
 		}
 		level++;
 		if (!path->nodes[level])
 			BUG();
-		free(node);
 	}
 	return 0;
 }
@@ -636,10 +709,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
 	struct leaf *leaf;
+	struct tree_buffer *leaf_buf;
 	int doff;
 	int dsize;
 
-	leaf = (struct leaf *)path->nodes[0];
+	leaf_buf = path->nodes[0];
+	leaf = &leaf_buf->leaf;
 	slot = path->slots[0];
 	doff = leaf->items[slot].offset;
 	dsize = leaf->items[slot].size;
@@ -658,14 +733,15 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	}
 	leaf->header.nritems -= 1;
 	if (leaf->header.nritems == 0) {
-		if (leaf == (struct leaf *)root->node)
-			root->node = NULL;
-		else
+		if (leaf_buf == root->node) {
+			leaf->header.flags = node_level(0);
+			write_tree_block(root, leaf_buf);
+		} else
 			del_ptr(root, path, 1);
-		free(leaf);
 	} else {
 		if (slot == 0)
-			fixup_low_keys(path, &leaf->items[0].key, 1);
+			fixup_low_keys(root, path, &leaf->items[0].key, 1);
+		write_tree_block(root, leaf_buf);
 		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
 		    LEAF_DATA_SIZE / 4) {
 			/* push_leaf_left fixes the path.
@@ -673,12 +749,13 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			 * for possible call to del_ptr below
 			 */
 			slot = path->slots[1];
+			leaf_buf->count++;
 			push_leaf_left(root, path, 1);
 			if (leaf->header.nritems == 0) {
-				free(leaf);
 				path->slots[1] = slot;
 				del_ptr(root, path, 1);
 			}
+			tree_block_release(root, leaf_buf);
 		}
 	}
 	return 0;
@@ -689,7 +766,7 @@ void print_leaf(struct leaf *l)
 	int i;
 	int nr = l->header.nritems;
 	struct item *item;
-	printf("leaf %p total ptrs %d free space %d\n", l, nr,
+	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
 	       leaf_free_space(l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
@@ -703,38 +780,45 @@ void print_leaf(struct leaf *l)
 		fflush(stdout);
 	}
 }
-void print_tree(struct node *c)
+void print_tree(struct ctree_root *root, struct tree_buffer *t)
 {
 	int i;
 	int nr;
+	struct node *c;
 
-	if (!c)
+	if (!t)
 		return;
+	c = &t->node;	/* look at the buffer's block through the node layout */
 	nr = c->header.nritems;
+	if (c->header.blocknr != t->blocknr)	/* header must name its own block */
+		BUG();
 	if (is_leaf(c->header.flags)) {
 		print_leaf((struct leaf *)c);
 		return;
 	}
-	printf("node %p level %d total ptrs %d free spc %lu\n", c,
+	printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
 	        node_level(c->header.flags), c->header.nritems,
 		NODEPTRS_PER_BLOCK - c->header.nritems);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%lu %u %lu) block %lx\n",
+		printf("\tkey %d (%lu %u %lu) block %lu\n",
 		       i,
 		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
 		       c->blockptrs[i]);
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
-		struct node *next = read_block(c->blockptrs[i]);
+		struct tree_buffer *next_buf = read_tree_block(root,
+							    c->blockptrs[i]);
+		struct node *next = &next_buf->node;
 		if (is_leaf(next->header.flags) &&
 		    node_level(c->header.flags) != 1)
 			BUG();
 		if (node_level(next->header.flags) !=
 			node_level(c->header.flags) - 1)
 			BUG();
-		print_tree(next);
+		print_tree(root, next_buf);
+		tree_block_release(root, next_buf);	/* drop read_tree_block()'s ref */
 	}
 
 }
@@ -746,23 +830,24 @@ int next_key(int i, int max_key) {
 }
 
 int main() {
-	struct leaf *first_node = malloc(sizeof(struct leaf));
-	struct ctree_root root;
+	struct ctree_root *root;
 	struct key ins;
 	struct key last = { (u64)-1, 0, 0};
 	char *buf;
 	int i;
 	int num;
 	int ret;
-	int run_size = 100000;
+	int run_size = 1000000;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
 
+	radix_tree_init();
+
+
+	root = open_ctree("dbfile");
 
 	srand(55);
-	root.node = (struct node *)first_node;
-	memset(first_node, 0, sizeof(*first_node));
 	for (i = 0; i < run_size; i++) {
 		buf = malloc(64);
 		num = next_key(i, max_key);
@@ -772,39 +857,46 @@ int main() {
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
-		ret = insert_item(&root, &ins, buf, strlen(buf));
+		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 	}
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("starting search\n");
 	srand(55);
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret) {
-			print_tree(root.node);
+			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
 			exit(1);
 		}
-	}
-	printf("node %p level %d total ptrs %d free spc %lu\n", root.node,
-	        node_level(root.node->header.flags), root.node->header.nritems,
-		NODEPTRS_PER_BLOCK - root.node->header.nritems);
-	// print_tree(root.node);
-	printf("all searches good\n");
+		release_path(root, &path);
+	}
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
+	        node_level(root->node->node.header.flags),
+		root->node->node.header.nritems,
+		NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+	printf("all searches good, deleting some items\n");
 	i = 0;
 	srand(55);
 	for (i = 0 ; i < run_size/4; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret)
 			continue;
-		ret = del_item(&root, &path);
+		ret = del_item(root, &path);
 		if (ret != 0)
 			BUG();
+		release_path(root, &path);
 		tree_size--;
 	}
 	srand(128);
@@ -813,38 +905,58 @@ int main() {
 		num = next_key(i, max_key);
 		sprintf(buf, "string-%d", num);
 		ins.objectid = num;
-		ret = insert_item(&root, &ins, buf, strlen(buf));
+		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 	}
-	while(root.node) {
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("starting search2\n");
+	srand(128);
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		ret = search_slot(root, &ins, &path);
+		if (ret) {
+			print_tree(root, root->node);
+			printf("unable to find %d\n", num);
+			exit(1);
+		}
+		release_path(root, &path);
+	}
+	printf("starting big long delete run\n");
+	while(root->node && root->node->node.header.nritems > 0) {
 		struct leaf *leaf;
 		int slot;
 		ins.objectid = (u64)-1;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret == 0)
 			BUG();
 
-		leaf = (struct leaf *)(path.nodes[0]);
+		leaf = &path.nodes[0]->leaf;
 		slot = path.slots[0];
 		if (slot != leaf->header.nritems)
 			BUG();
 		while(path.slots[0] > 0) {
 			path.slots[0] -= 1;
 			slot = path.slots[0];
-			leaf = (struct leaf *)(path.nodes[0]);
+			leaf = &path.nodes[0]->leaf;
 
 			if (comp_keys(&last, &leaf->items[slot].key) <= 0)
 				BUG();
 			memcpy(&last, &leaf->items[slot].key, sizeof(last));
-			ret = del_item(&root, &path);
-			if (ret != 0)
+			ret = del_item(root, &path);
+			if (ret != 0) {
+				printf("del_item returned %d\n", ret);
 				BUG();
+			}
 			tree_size--;
 		}
+		release_path(root, &path);
 	}
-	print_tree(root.node);
+	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 00000000000..586bf186604
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,62 @@
+#ifndef __CTREE__
+#define __CTREE__
+
+#define CTREE_BLOCKSIZE 4096
+
+struct key {
+	u64 objectid;
+	u32 flags;
+	u64 offset;
+} __attribute__ ((__packed__));	/* embedded in node.keys[] and item.key */
+
+struct header {
+	u64 fsid[2]; /* FS specific uuid */
+	u64 blocknr;	/* this block's number; checked against tree_buffer.blocknr */
+	u64 parentid;
+	u32 csum;
+	u32 ham;
+	u16 nritems;	/* keys (node) or items (leaf) currently in use */
+	u16 flags;	/* low LEVEL_BITS bits: tree level, 0 for a leaf */
+} __attribute__ ((__packed__));
+
+#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
+			    (sizeof(struct key) + sizeof(u64))) /* key + ptr per slot */
+
+#define LEVEL_BITS 3	/* bits of header.flags that hold the level */
+#define MAX_LEVEL (1 << LEVEL_BITS)	/* also sizes the ctree_path arrays */
+#define node_level(f) ((f) & (MAX_LEVEL-1))	/* mask the level out of flags */
+#define is_leaf(f) (node_level(f) == 0)	/* level 0 blocks are leaves */
+
+struct tree_buffer;
+struct ctree_root {
+	struct tree_buffer *node;	/* buffer of the root block, or NULL */
+	int fp;				/* descriptor returned by open() */
+	struct radix_tree_root cache_radix;	/* blocknr -> tree_buffer */
+};
+
+struct item {
+	struct key key;
+	u16 offset;	/* start of this item's data, as an index into leaf.data */
+	u16 size;	/* length of that data in bytes */
+} __attribute__ ((__packed__));
+
+#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
+struct leaf {
+	struct header header;
+	union {		/* item headers and item data overlay the same bytes */
+		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+		u8 data[CTREE_BLOCKSIZE-sizeof(struct header)];
+	};
+} __attribute__ ((__packed__));
+
+struct node {
+	struct header header;
+	struct key keys[NODEPTRS_PER_BLOCK];	/* keys[i] goes with blockptrs[i] */
+	u64 blockptrs[NODEPTRS_PER_BLOCK];	/* block numbers of the children */
+} __attribute__ ((__packed__));
+
+struct ctree_path {
+	struct tree_buffer *nodes[MAX_LEVEL];	/* buffer visited at each level */
+	int slots[MAX_LEVEL];			/* slot chosen in nodes[level] */
+};
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 00000000000..8d51a07051d
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,174 @@
+#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+static int allocated_blocks = 0;	/* live tree_buffers; printed by close_ctree() */
+
+struct ctree_header {
+	u64 root_block;	/* root's block number; read/written at file offset 0 */
+} __attribute__ ((__packed__));
+
+/* Append one block to the file; *block gets its number.  0 on success. */
+static int get_free_block(struct ctree_root *root, u64 *block)
+{
+	struct stat st;
+	u64 nr = 0;
+
+	/* never resize a file whose current size is unknown */
+	if (fstat(root->fp, &st))
+		return -1;
+	/* blocks sit back to back after the ctree_header */
+	if (st.st_size > sizeof(struct ctree_header))
+		nr = (st.st_size - sizeof(struct ctree_header)) /
+			CTREE_BLOCKSIZE;
+	*block = nr;
+	return ftruncate(root->fp, sizeof(struct ctree_header) +
+			 (nr + 1) * CTREE_BLOCKSIZE);
+}
+
+/* Cache a new buffer for blocknr with one ref for the caller; NULL on error. */
+struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
+{
+	struct tree_buffer *buf = malloc(sizeof(*buf));
+	int ret;
+	if (!buf)
+		return NULL;
+	buf->blocknr = blocknr;
+	buf->count = 1;
+	radix_tree_preload(GFP_KERNEL);
+	ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
+	radix_tree_preload_end();
+	if (ret) {
+		free(buf);
+		return NULL;
+	}
+	allocated_blocks++;	/* count only buffers that reached the cache */
+	return buf;
+}
+
+/* Append a block to the file; returns a zeroed, cached buffer for it. */
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+	u64 free_block;
+	struct tree_buffer *buf = NULL;
+	if (!get_free_block(root, &free_block))
+		buf = alloc_tree_block(root, free_block);
+	if (!buf) {
+		BUG();
+		return NULL;
+	}
+	/* some callers set only a few header fields before writing it out */
+	memset(&buf->leaf, 0, sizeof(buf->leaf));
+	return buf;
+}
+
+/* Return blocknr's buffer, from the cache or the file, with a reference the
+ * caller drops via tree_block_release().  NULL on allocation/read failure. */
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
+{
+	loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+	struct tree_buffer *buf;
+	buf = radix_tree_lookup(&root->cache_radix, blocknr);
+	if (buf) {
+		buf->count++;
+		if (buf->blocknr != blocknr ||
+		    buf->blocknr != buf->node.header.blocknr)
+			BUG();
+		return buf;
+	}
+	buf = alloc_tree_block(root, blocknr);
+	if (!buf)
+		return NULL;
+	if (pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset) !=
+	    CTREE_BLOCKSIZE) {
+		/* also unhooks buf from the cache; a bare free() would not */
+		tree_block_release(root, buf);
+		return NULL;
+	}
+	if (buf->blocknr != buf->node.header.blocknr)
+		BUG();
+	return buf;
+}
+
+/* Write buf's block to its slot in the file.  Returns pwrite()'s result if the
+ * write was short, else 0 -- or update_root_block()'s result for the root. */
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+	loff_t pos = buf->blocknr * CTREE_BLOCKSIZE +
+		     sizeof(struct ctree_header);
+	int written;
+
+	if (buf->node.header.blocknr != buf->blocknr)
+		BUG();
+	written = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, pos);
+	if (written != CTREE_BLOCKSIZE)
+		return written;
+	return buf == root->node ? update_root_block(root) : 0;
+}
+
+/* Open (creating if needed) the tree file and load the root block its header
+ * names; root->node stays NULL otherwise.  NULL if malloc() or open() fails. */
+struct ctree_root *open_ctree(char *filename)
+{
+	struct ctree_root *root = malloc(sizeof(*root));
+	u64 root_block;
+	if (!root)
+		return NULL;
+	/* open() reads a mode argument whenever O_CREAT is given */
+	root->fp = open(filename, O_CREAT | O_RDWR, 0644);
+	if (root->fp < 0) {
+		free(root);
+		return NULL;
+	}
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	root->node = NULL;
+	if (pread(root->fp, &root_block, sizeof(u64), 0) == sizeof(u64)) {
+		printf("reading root node at block %lu\n", root_block);
+		root->node = read_tree_block(root, root_block);
+	}
+	return root;
+}
+
+int close_ctree(struct ctree_root *root)
+{
+	int ret = close(root->fp);	/* keep the result instead of dropping it */
+	if (root->node)
+		tree_block_release(root, root->node);	/* drop the root's ref */
+	free(root);
+	printf("on close %d blocks are allocated\n", allocated_blocks);
+	return ret;			/* 0, or -1 if close() failed */
+}
+
+/*
+ * Store the root's block number at offset 0 of the file.  Returns 0 when all
+ * sizeof(u64) bytes were written, otherwise pwrite()'s result.
+ */
+int update_root_block(struct ctree_root *root)
+{
+	u64 nr = root->node->blocknr;
+	int written = pwrite(root->fp, &nr, sizeof(nr), 0);
+	return written == sizeof(nr) ? 0 : written;
+}
+
+/* Drop one reference; the last one unhooks buf from the cache and frees it. */
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
+{
+	if (--buf->count != 0)
+		return;
+	if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
+		BUG();
+	radix_tree_delete(&root->cache_radix, buf->blocknr);
+	memset(buf, 0, sizeof(*buf));	/* scrub the buffer before freeing it */
+	free(buf);
+	BUG_ON(allocated_blocks == 0);
+	allocated_blocks--;
+}
+
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 00000000000..ee95fa05c4a
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,21 @@
+#ifndef __DISKIO__
+#define __DISKIO__
+
+struct tree_buffer {
+	u64 blocknr;	/* block number; key in ctree_root.cache_radix */
+	int count;	/* reference count; tree_block_release() frees at 0 */
+	union {		/* two views of the same CTREE_BLOCKSIZE block */
+		struct node node;
+		struct leaf leaf;
+	};
+};
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+struct ctree_root *open_ctree(char *filename);
+int close_ctree(struct ctree_root *root);
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int update_root_block(struct ctree_root *root);
+
+#endif
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index 3a4bb4d661f..347ca06205c 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -6,6 +6,7 @@
 #define BITS_PER_LONG 64
 #define __GFP_BITS_SHIFT 20
 #define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
+#define GFP_KERNEL 0
 #define __read_mostly
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define __force
diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c
new file mode 100644
index 00000000000..baa25ca1c2a
--- /dev/null
+++ b/fs/btrfs/radix-tree.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "kerncompat.h"
+#include "radix-tree.h"
+/* number of index bits consumed by each level of the tree */
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#endif
+
+/* slots per node, and the mask that extracts one level's slot number */
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+/* longs needed to hold one bit per slot; each tag has one such bitmap */
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+/* one node of the radix tree */
+struct radix_tree_node {
+	/* number of slots[] entries currently in use */
+	unsigned int	count;
+	/* child nodes, or the stored items at the bottom level */
+	void		*slots[RADIX_TREE_MAP_SIZE];
+	/* for each tag, a bitmap with one bit per slot */
+	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+/*
+ * One step of a root-to-bottom walk: the node that was visited and the
+ * slot number followed inside it.  Used by radix_tree_tag_clear() and
+ * radix_tree_delete() to walk back up the tree.
+ */
+struct radix_tree_path {
+	struct radix_tree_node *node;
+	int offset;
+};
+
+/* number of bits in a tree index (an unsigned long) */
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+/* size of the per-height table below and of the path arrays used when walking */
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+/* largest index a tree of each height can hold; filled by radix_tree_init_maxindex() */
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct radix_tree_preload {
+	/* number of valid entries in nodes[] */
+	int nr;
+	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+};
+/* the pool instance that radix_tree_preload() fills */
+struct radix_tree_preload radix_tree_preloads = { 0, };
+
+/*
+ * Return root->gfp_mask restricted to the __GFP_BITS_MASK bits, i.e.
+ * without the root tag bits that are stored above __GFP_BITS_SHIFT.
+ */
+static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+{
+	return root->gfp_mask & __GFP_BITS_MASK;
+}
+
+/* number of radix_tree_node objects allocated and not yet freed */
+static int internal_nodes = 0;
+
+/*
+ * Allocate one zero-filled radix tree node with malloc() and account for
+ * it in internal_nodes.  @root is accepted but not consulted.
+ *
+ * Returns the new node, or NULL if the allocation failed.
+ */
+static struct radix_tree_node *
+radix_tree_node_alloc(struct radix_tree_root *root)
+{
+	struct radix_tree_node *node;
+
+	node = malloc(sizeof(struct radix_tree_node));
+	if (!node)
+		return node;
+
+	memset(node, 0, sizeof(struct radix_tree_node));
+	internal_nodes++;
+	return node;
+}
+
+/*
+ * Release a node obtained from radix_tree_node_alloc() and drop it from
+ * the internal_nodes count.
+ */
+static inline void
+radix_tree_node_free(struct radix_tree_node *node)
+{
+	free(node);
+	internal_nodes--;
+}
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ *
+ * NOTE(review): radix_tree_node_alloc() in this file always calls malloc()
+ * and never takes nodes out of this pool, so the pool is only ever filled.
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+	struct radix_tree_preload *rtp;
+	struct radix_tree_node *node;
+	int ret = -ENOMEM;
+
+	preempt_disable();
+	rtp = &__get_cpu_var(radix_tree_preloads);
+	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
+		/* allocate with preemption enabled ... */
+		preempt_enable();
+		node = radix_tree_node_alloc(NULL);
+		if (node == NULL)
+			goto out;
+		/* ... then re-fetch the pool and re-check that it still has room */
+		preempt_disable();
+		rtp = &__get_cpu_var(radix_tree_preloads);
+		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
+			rtp->nodes[rtp->nr++] = node;
+		else
+			radix_tree_node_free(node);
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+/* set the bit for slot @offset in @node's bitmap for @tag */
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__set_bit(offset, node->tags[tag]);
+}
+
+/* clear the bit for slot @offset in @node's bitmap for @tag */
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__clear_bit(offset, node->tags[tag]);
+}
+
+/* test the bit for slot @offset in @node's bitmap for @tag; result comes from test_bit() */
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	return test_bit(offset, node->tags[tag]);
+}
+
+/* set the root-level bit for @tag, kept in gfp_mask at bit (tag + __GFP_BITS_SHIFT) */
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+
+/* clear the root-level bit for @tag, leaving all other gfp_mask bits alone */
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/* clear every root-level tag bit by keeping only the __GFP_BITS_MASK bits */
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+/*
+ * Test the root-level bit for @tag.  Returns non-zero (the masked bit
+ * itself, not necessarily 1) when it is set, 0 otherwise.
+ */
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/*
+ * Report whether @tag is set on at least one slot of @node.
+ * Returns 1 if any word of the tag's bitmap is non-zero, else 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+	int word = 0;
+
+	while (word < RADIX_TREE_TAG_LONGS) {
+		if (node->tags[tag][word] != 0)
+			return 1;
+		word++;
+	}
+	return 0;
+}
+
+/*
+ *	Return the maximum key which can be stored in a
+ *	radix tree with height HEIGHT.  The value is read from the
+ *	height_to_maxindex[] table; @height is not range-checked here.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+	return height_to_maxindex[height];
+}
+
+/*
+ *	Extend a radix tree so it can store key @index.
+ *
+ *	Returns 0 on success or -ENOMEM if a new root node could not be
+ *	allocated.  On -ENOMEM any levels already added stay in place.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_node *node;
+	unsigned int height;
+	int tag;
+
+	/* Figure out what the height should be.  */
+	height = root->height + 1;
+	while (index > radix_tree_maxindex(height))
+		height++;
+
+	/* empty tree: only the height is recorded, no node is allocated here */
+	if (root->rnode == NULL) {
+		root->height = height;
+		goto out;
+	}
+
+	/* stack new roots on top of the old one until the tree is tall enough */
+	do {
+		if (!(node = radix_tree_node_alloc(root)))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->slots[0] = root->rnode;
+
+		/* Propagate the aggregated tag info into the new root */
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+			if (root_tag_get(root, tag))
+				tag_set(node, tag, 0);
+		}
+
+		/* the old root is the new node's only child */
+		node->count = 1;
+		root->rnode = node;
+		root->height++;
+	} while (height > root->height);
+out:
+	return 0;
+}
+
+/**
+ *	radix_tree_insert    -    insert into a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@item:		item to insert
+ *
+ *	Insert an item into the radix tree at position @index.
+ *
+ *	Returns 0 on success, -EEXIST if the slot for @index is already
+ *	occupied, or -ENOMEM if a node could not be allocated.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
+{
+	struct radix_tree_node *node = NULL, *slot;
+	unsigned int height, shift;
+	int offset;
+	int error;
+
+	/* Make sure the tree is high enough.  */
+	if (index > radix_tree_maxindex(root->height)) {
+		error = radix_tree_extend(root, index);
+		if (error)
+			return error;
+	}
+
+	slot = root->rnode;
+	height = root->height;
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+	offset = 0;			/* uninitialised var warning */
+	/* walk from the root to the bottom level, creating missing nodes */
+	while (height > 0) {
+		if (slot == NULL) {
+			/* Have to add a child node.  */
+			if (!(slot = radix_tree_node_alloc(root)))
+				return -ENOMEM;
+			if (node) {
+				node->slots[offset] = slot;
+				node->count++;
+			} else
+				root->rnode = slot;
+		}
+
+		/* Go a level down */
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		node = slot;
+		slot = node->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	/* slot now holds whatever is currently stored at @index */
+	if (slot != NULL)
+		return -EEXIST;
+
+	if (node) {
+		node->count++;
+		node->slots[offset] = item;
+		/* a previously empty slot must not have tag bits set */
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+	} else {
+		/* height 0: the single item is stored directly in the root */
+		root->rnode = item;
+		BUG_ON(root_tag_get(root, 0));
+		BUG_ON(root_tag_get(root, 1));
+	}
+
+	return 0;
+}
+
+/*
+ * Find the slot that holds (or would hold) the item for @index.
+ *
+ * Returns NULL when @index is larger than the tree's current height can
+ * address, or when a node on the way down is missing.  Otherwise returns
+ * the address of the slot; the slot itself may still contain NULL.
+ */
+static inline void **__lookup_slot(struct radix_tree_root *root,
+				   unsigned long index)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+
+	height = root->height;
+
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	/* height 0: the item, if any, is stored directly in the root */
+	if (height == 0 && root->rnode)
+		return (void **)&root->rnode;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	while (height > 0) {
+		if (*slot == NULL)
+			return NULL;
+
+		/* step to the child slot selected by this level's index bits */
+		slot = (struct radix_tree_node **)
+			((*slot)->slots +
+				((index >> shift) & RADIX_TREE_MAP_MASK));
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	return (void **)slot;
+}
+
+/**
+ *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Lookup the slot corresponding to the position @index in the radix tree
+ *	@root. This is useful for update-if-exists operations.
+ *
+ *	Returns whatever __lookup_slot() returns: NULL if no slot could be
+ *	reached, otherwise the slot's address (whose content may be NULL).
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+	return __lookup_slot(root, index);
+}
+
+/**
+ *	radix_tree_lookup    -    fetch the item stored at an index
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Returns the item stored at @index in @root, or NULL when no slot
+ *	for @index can be reached or the slot is empty.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+	void **found = __lookup_slot(root, index);
+
+	if (found == NULL)
+		return NULL;
+	return *found;
+}
+
+/**
+ *	radix_tree_tag_set - set a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  From
+ *	the root all the way down to the leaf node.
+ *
+ *	Returns the address of the tagged item.   Setting a tag on a not-present
+ *	item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node *slot;
+
+	height = root->height;
+	BUG_ON(index > radix_tree_maxindex(height));
+
+	slot = root->rnode;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+
+	/* tag the slot followed at every level on the way down */
+	while (height > 0) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if (!tag_get(slot, tag, offset))
+			tag_set(slot, tag, offset);
+		slot = slot->slots[offset];
+		/* every node on the path, and the item itself, must exist */
+		BUG_ON(slot == NULL);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	/* set the root's tag bit */
+	if (slot && !root_tag_get(root, tag))
+		root_tag_set(root, tag);
+
+	return slot;
+}
+
+/**
+ *	radix_tree_tag_clear - clear a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  If
+ *	this causes the leaf node to have no tags set then clear the tag in the
+ *	next-to-leaf node, etc.
+ *
+ *	Returns the address of the tagged item on success, else NULL.  ie:
+ *	has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_node *slot = NULL;
+	unsigned int height, shift;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	/* path[0] is a sentinel: its NULL node ends the upward walk below */
+	pathp->node = NULL;
+	slot = root->rnode;
+
+	/* walk down to the item, recording each node and slot taken */
+	while (height > 0) {
+		int offset;
+
+		if (slot == NULL)
+			goto out;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
+		pathp[1].node = slot;
+		slot = slot->slots[offset];
+		pathp++;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	if (slot == NULL)
+		goto out;
+
+	/*
+	 * walk back up clearing the tag; stop at the first node where the
+	 * tag was not set, or that still has the tag on another slot
+	 */
+	while (pathp->node) {
+		if (!tag_get(pathp->node, tag, pathp->offset))
+			goto out;
+		tag_clear(pathp->node, tag, pathp->offset);
+		if (any_tag_set(pathp->node, tag))
+			goto out;
+		pathp--;
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+
+out:
+	return slot;
+}
+
+#ifndef __KERNEL__	/* Only the test harness uses this at present */
+/**
+ * radix_tree_tag_get - get a tag on a radix tree node
+ * @root:		radix tree root
+ * @index:		index key
+ * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Return values:
+ *
+ *  0: tag not present or not set
+ *  1: tag set
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node *slot;
+	int saw_unset_tag = 0;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return 0;
+
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	/* height 0: the root's tag bit is the only tag state there is */
+	if (height == 0)
+		return 1;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	for ( ; ; ) {
+		int offset;
+
+		if (slot == NULL)
+			return 0;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		/*
+		 * This is just a debug check.  Later, we can bale as soon as
+		 * we see an unset tag.
+		 */
+		if (!tag_get(slot, tag, offset))
+			saw_unset_tag = 1;
+		if (height == 1) {
+			int ret = tag_get(slot, tag, offset);
+
+			/* a tag set at the bottom must be set on the whole path above it */
+			BUG_ON(ret && saw_unset_tag);
+			return !!ret;
+		}
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+}
+#endif
+
+/*
+ * Helper for radix_tree_gang_lookup(): starting at @index, descend to one
+ * bottom-level node and copy the items found in it (from @index's slot to
+ * the end of that node) into @results, at most @max_items of them.
+ *
+ * *@next_index is set to the index at which the scan stopped, so the
+ * caller can resume from there; it is 0 if the index wrapped around.
+ * Returns the number of items stored in @results.
+ */
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift, height;
+	struct radix_tree_node *slot;
+	unsigned long i;
+
+	height = root->height;
+	/* height 0: at most one item, stored in the root at index 0 */
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
+		goto out;
+	}
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	for ( ; height > 1; height--) {
+
+		/*
+		 * find the first populated child at or after index's slot;
+		 * each empty slot skipped moves index to the start of the
+		 * next child's range
+		 */
+		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+				i < RADIX_TREE_MAP_SIZE; i++) {
+			if (slot->slots[i] != NULL)
+				break;
+			index &= ~((1UL << shift) - 1);
+			index += 1UL << shift;
+			if (index == 0)
+				goto out;	/* 32-bit wraparound */
+		}
+		/* nothing left in this node */
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	}
+
+	/* Bottom level: grab some items */
+	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+		index++;
+		if (slot->slots[i]) {
+			results[nr_found++] = slot->slots[i];
+			if (nr_found == max_items)
+				goto out;
+		}
+	}
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup - collect several present items in index order
+ *	@root:		radix tree root
+ *	@results:	array that receives the items found
+ *	@first_index:	index at which the scan starts
+ *	@max_items:	capacity of @results
+ *
+ *	Scans the tree in ascending index order starting at @first_index and
+ *	stores the present items in @results.  The scan ends when @max_items
+ *	items were stored, when the position passes the largest index the
+ *	tree's height can address, or when the position wraps to 0.
+ *
+ *	Returns the number of items stored in @results.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items)
+{
+	const unsigned long limit = radix_tree_maxindex(root->height);
+	unsigned long pos = first_index;
+	unsigned int total = 0;
+
+	while (total < max_items && pos <= limit) {
+		unsigned long resume;	/* where the next pass should start */
+
+		total += __lookup(root, results + total, pos,
+				  max_items - total, &resume);
+		if (resume == 0)
+			break;
+		pos = resume;
+	}
+	return total;
+}
+
+/*
+ * Helper for radix_tree_gang_lookup_tag(): like __lookup(), but follows
+ * only slots that have @tag set and collects only tagged items.
+ * *@next_index is set to the index at which the scan stopped; the return
+ * value is the number of items stored in @results.
+ *
+ * FIXME: the two tag_get()s here should use find_next_bit() instead of
+ * open-coding the search.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index, unsigned int tag)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift;
+	unsigned int height = root->height;
+	struct radix_tree_node *slot;
+
+	/* height 0: at most one item, stored in the root at index 0 */
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
+		goto out;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	do {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		/*
+		 * find the first tagged slot at or after index's slot; each
+		 * untagged slot skipped moves index to the start of the next
+		 * slot's range
+		 */
+		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+			if (tag_get(slot, tag, i)) {
+				BUG_ON(slot->slots[i] == NULL);
+				break;
+			}
+			index &= ~((1UL << shift) - 1);
+			index += 1UL << shift;
+			if (index == 0)
+				goto out;	/* 32-bit wraparound */
+		}
+		/* no tagged slot left in this node */
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+		height--;
+		if (height == 0) {	/* Bottom level: grab some items */
+			unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				index++;
+				if (tag_get(slot, tag, j)) {
+					BUG_ON(slot->slots[j] == NULL);
+					results[nr_found++] = slot->slots[j];
+					if (nr_found == max_items)
+						goto out;
+				}
+			}
+		}
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	} while (height > 0);
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup_tag - collect several tagged items in index order
+ *	@root:		radix tree root
+ *	@results:	array that receives the items found
+ *	@first_index:	index at which the scan starts
+ *	@max_items:	capacity of @results
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ *	Scans the tree in ascending index order starting at @first_index and
+ *	stores the present items that have @tag set in @results.  Nothing is
+ *	scanned when the root's bit for @tag is clear.
+ *
+ *	Returns the number of items stored in @results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag)
+{
+	const unsigned long limit = radix_tree_maxindex(root->height);
+	unsigned long pos = first_index;
+	unsigned int total = 0;
+
+	/* the root's tag bit summarises the whole tree */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	while (total < max_items && pos <= limit) {
+		unsigned long resume;	/* where the next pass should start */
+
+		total += __lookup_tag(root, results + total, pos,
+				      max_items - total, &resume, tag);
+		if (resume == 0)
+			break;
+		pos = resume;
+	}
+	return total;
+}
+
+/**
+ *	radix_tree_shrink    -    remove redundant levels from the top of a tree
+ *	@root		radix tree root
+ *
+ *	While the root node's only entry is in slot 0, that entry becomes the
+ *	new root and the old root node is freed, reducing the height by one.
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+	for (;;) {
+		struct radix_tree_node *top;
+
+		if (root->height == 0)
+			break;
+
+		top = root->rnode;
+		if (top->count != 1 || !top->slots[0])
+			break;
+
+		/* promote the only child and drop one level */
+		root->rnode = top->slots[0];
+		root->height--;
+
+		/* hand the old root back completely zeroed */
+		tag_clear(top, 0, 0);
+		tag_clear(top, 1, 0);
+		top->slots[0] = NULL;
+		top->count = 0;
+		radix_tree_node_free(top);
+	}
+}
+
+/**
+ *	radix_tree_delete    -    delete an item from a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Remove the item at @index from the radix tree rooted at @root.
+ *
+ *	Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_node *slot = NULL;
+	unsigned int height, shift;
+	int tag;
+	int offset;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	slot = root->rnode;
+	/* height 0: the single item lives directly in the root */
+	if (height == 0 && root->rnode) {
+		root_tag_clear_all(root);
+		root->rnode = NULL;
+		goto out;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	/* path[0] is a sentinel: its NULL node ends the upward walk below */
+	pathp->node = NULL;
+
+	/* walk down to the item, recording each node and slot taken */
+	do {
+		if (slot == NULL)
+			goto out;
+
+		pathp++;
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp->offset = offset;
+		pathp->node = slot;
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	} while (height > 0);
+
+	if (slot == NULL)
+		goto out;
+
+	/*
+	 * Clear all tags associated with the just-deleted item
+	 */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (tag_get(pathp->node, tag, pathp->offset))
+			radix_tree_tag_clear(root, index, tag);
+	}
+
+	/* Now free the nodes we do not need anymore */
+	while (pathp->node) {
+		pathp->node->slots[pathp->offset] = NULL;
+		pathp->node->count--;
+
+		/* node still has other entries: stop, shrinking only at the root */
+		if (pathp->node->count) {
+			if (pathp->node == root->rnode)
+				radix_tree_shrink(root);
+			goto out;
+		}
+
+		/* Node with zero slots in use so free it */
+		radix_tree_node_free(pathp->node);
+
+		pathp--;
+	}
+	/* every node on the path was freed: the tree is now empty */
+	root_tag_clear_all(root);
+	root->height = 0;
+	root->rnode = NULL;
+
+out:
+	return slot;
+}
+
+/**
+ *	radix_tree_tagged - test whether any items in the tree are tagged
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ *
+ *	Returns the root's bit for @tag as reported by root_tag_get():
+ *	non-zero (not necessarily 1) if set, 0 otherwise.
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+	return root_tag_get(root, tag);
+}
+
+/*
+ * Compute the largest index a tree of the given height can hold, i.e. a
+ * value with the low height * RADIX_TREE_MAP_SHIFT bits set.
+ *
+ * Heights that cover RADIX_TREE_INDEX_BITS or more return ~0UL.  That case
+ * is tested before shifting: height_to_maxindex[] has entries for such
+ * heights, and shifting first would use an out-of-range shift count there,
+ * which is undefined behaviour in C.
+ */
+static unsigned long __maxindex(unsigned int height)
+{
+	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+
+	if (tmp >= RADIX_TREE_INDEX_BITS)
+		return ~0UL;
+
+	/* two-step shift so that tmp == 0 yields 0 without shifting by the full width */
+	return (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+}
+
+/* fill height_to_maxindex[] with the largest index for every height it covers */
+static void radix_tree_init_maxindex(void)
+{
+	unsigned int height = 0;
+
+	while (height < ARRAY_SIZE(height_to_maxindex)) {
+		height_to_maxindex[height] = __maxindex(height);
+		height++;
+	}
+}
+
+/*
+ * One-time setup: fills the height_to_maxindex[] table that
+ * radix_tree_maxindex() reads.
+ */
+void radix_tree_init(void)
+{
+	radix_tree_init_maxindex();
+}
diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h
new file mode 100644
index 00000000000..c3ce88137f7
--- /dev/null
+++ b/fs/btrfs/radix-tree.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_RADIX_TREE_H
+#define _LINUX_RADIX_TREE_H
+
+/* number of independent tags that can be set on each item */
+#define RADIX_TREE_MAX_TAGS 2
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+struct radix_tree_root {
+	/* number of node levels; 0 means empty or a single item held in rnode */
+	unsigned int		height;
+	gfp_t			gfp_mask;
+	/* top node, or the item itself when height is 0 */
+	struct radix_tree_node	*rnode;
+};
+
+/* static initializer for an empty tree with the given gfp mask */
+#define RADIX_TREE_INIT(mask)	{					\
+	.height = 0,							\
+	.gfp_mask = (mask),						\
+	.rnode = NULL,							\
+}
+
+/* define a struct radix_tree_root variable called name, initialized empty */
+#define RADIX_TREE(name, mask) \
+	struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+/* reset an existing root (passed by pointer) to the empty state */
+#define INIT_RADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->gfp_mask = (mask);					\
+	(root)->rnode = NULL;						\
+} while (0)
+
+/* insertion, lookup and deletion of single items */
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+/* ascending scan that collects up to max_items present items */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items);
+int radix_tree_preload(gfp_t gfp_mask);
+/* must be called once before the tree functions are used */
+void radix_tree_init(void);
+/* per-item tags (tag must be < RADIX_TREE_MAX_TAGS) */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+
+/*
+ * Re-enable preemption.  NOTE(review): presumably meant to be paired with
+ * a successful radix_tree_preload() call — confirm against callers.
+ */
+static inline void radix_tree_preload_end(void)
+{
+	preempt_enable();
+}
+
+#endif /* _LINUX_RADIX_TREE_H */
-- 
cgit v1.2.3


From 74123bd72a4e34c66e5ad95ce233dd352bebc572 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Feb 2007 11:05:29 -0500
Subject: Btrfs: Commenting/cleanup

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 249 +++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 186 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6f0522f2108..6b64f49a027 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -20,6 +20,11 @@ static void release_path(struct ctree_root *root, struct ctree_path *p)
 	}
 }
 
+/*
+ * The leaf data grows from end-to-front in the node.
+ * this returns the offset of the start of the last item's data,
+ * which is the top of the leaf data stack
+ */
 static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
 	unsigned int nr = leaf->header.nritems;
@@ -28,6 +33,11 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
 	return leaf->items[nr-1].offset;
 }
 
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data.  IOW, how much room
+ * the leaf has left for both items and data
+ */
 static inline int leaf_free_space(struct leaf *leaf)
 {
 	int data_end = leaf_data_end(leaf);
@@ -36,6 +46,9 @@ static inline int leaf_free_space(struct leaf *leaf)
 	return (char *)(leaf->data + data_end) - (char *)items_end;
 }
 
+/*
+ * compare two keys in a memcmp fashion
+ */
 int comp_keys(struct key *k1, struct key *k2)
 {
 	if (k1->objectid > k2->objectid)
@@ -52,6 +65,16 @@ int comp_keys(struct key *k1, struct key *k2)
 		return -1;
 	return 0;
 }
+
+/*
+ * search for key in the array p.  items in p are item_size bytes apart
+ * and there are 'max' items in p
+ * the slot in the array is returned via slot, and it points to
+ * the place where you would insert key if it is not found in
+ * the array.
+ *
+ * slot may point to max if the key is bigger than all of the keys
+ */
 int generic_bin_search(char *p, int item_size, struct key *key,
 		       int max, int *slot)
 {
@@ -92,6 +115,14 @@ int bin_search(struct node *c, struct key *key, int *slot)
 	return -1;
 }
 
+/*
+ * look for key in the tree.  path is filled in with nodes along the way
+ * if key is found, we return zero and you can find the item in the leaf
+ * level of the path (level 0)
+ *
+ * If the key isn't found, the path points to the slot where it should
+ * be inserted.
+ */
 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 {
 	struct tree_buffer *b = root->node;
@@ -120,12 +151,18 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 	return -1;
 }
 
+/*
+ * adjust the pointers going up the tree, starting at level,
+ * making sure the right key of each node points to 'key'.
+ * This is used after shifting pointers to the left, so it stops
+ * fixing up pointers when a given leaf/node is not in slot 0 of the
+ * higher levels
+ */
 static void fixup_low_keys(struct ctree_root *root,
 			   struct ctree_path *path, struct key *key,
 			   int level)
 {
 	int i;
-	/* adjust the pointers going up the tree */
 	for (i = level; i < MAX_LEVEL; i++) {
 		struct node *t;
 		int tslot = path->slots[i];
@@ -139,64 +176,16 @@ static void fixup_low_keys(struct ctree_root *root,
 	}
 }
 
-int __insert_ptr(struct ctree_root *root,
-		struct ctree_path *path, struct key *key,
-		u64 blocknr, int slot, int level)
-{
-	struct node *c;
-	struct node *lower;
-	struct key *lower_key;
-	int nritems;
-	/* need a new root */
-	if (!path->nodes[level]) {
-		struct tree_buffer *t;
-		t = alloc_free_block(root);
-		c = &t->node;
-		memset(c, 0, sizeof(c));
-		c->header.nritems = 2;
-		c->header.flags = node_level(level);
-		c->header.blocknr = t->blocknr;
-		lower = &path->nodes[level-1]->node;
-		if (is_leaf(lower->header.flags))
-			lower_key = &((struct leaf *)lower)->items[0].key;
-		else
-			lower_key = lower->keys;
-		memcpy(c->keys, lower_key, sizeof(struct key));
-		memcpy(c->keys + 1, key, sizeof(struct key));
-		c->blockptrs[0] = path->nodes[level-1]->blocknr;
-		c->blockptrs[1] = blocknr;
-		/* the path has an extra ref to root->node */
-		tree_block_release(root, root->node);
-		root->node = t;
-		t->count++;
-		write_tree_block(root, t);
-		path->nodes[level] = t;
-		path->slots[level] = 0;
-		if (c->keys[1].objectid == 0)
-			BUG();
-		return 0;
-	}
-	lower = &path->nodes[level]->node;
-	nritems = lower->header.nritems;
-	if (slot > nritems)
-		BUG();
-	if (nritems == NODEPTRS_PER_BLOCK)
-		BUG();
-	if (slot != nritems) {
-		memmove(lower->keys + slot + 1, lower->keys + slot,
-			(nritems - slot) * sizeof(struct key));
-		memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
-			(nritems - slot) * sizeof(u64));
-	}
-	memcpy(lower->keys + slot, key, sizeof(struct key));
-	lower->blockptrs[slot] = blocknr;
-	lower->header.nritems++;
-	if (lower->keys[1].objectid == 0)
-			BUG();
-	write_tree_block(root, path->nodes[level]);
-	return 0;
-}
-
+/*
+ * try to push data from one node into the next node left in the
+ * tree.  The src node is found at specified level in the path.
+ * If some bytes were pushed, return 0, otherwise return 1.
+ *
+ * Lower nodes/leaves in the path are not touched, higher nodes may
+ * be modified to reflect the push.
+ *
+ * The path is altered to reflect the push.
+ */
 int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
@@ -259,6 +248,16 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
+/*
+ * try to push data from one node into the next node right in the
+ * tree.  The src node is found at specified level in the path.
+ * If some bytes were pushed, return 0, otherwise return 1.
+ *
+ * Lower nodes/leaves in the path are not touched, higher nodes may
+ * be modified to reflect the push.
+ *
+ * The path is altered to reflect the push.
+ */
 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
@@ -270,8 +269,11 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	int dst_nritems;
 	int src_nritems;
 
+	/* can't push from the root */
 	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
 		return 1;
+
+	/* only try to push inside the node higher up */
 	slot = path->slots[level + 1];
 	if (slot == NODEPTRS_PER_BLOCK - 1)
 		return 1;
@@ -315,7 +317,7 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	write_tree_block(root, t);
 	write_tree_block(root, src_buffer);
 
-	/* then fixup the leaf pointer in the path */
+	/* then fixup the pointers in the path */
 	if (path->slots[level] >= src->header.nritems) {
 		path->slots[level] -= src->header.nritems;
 		tree_block_release(root, path->nodes[level]);
@@ -327,6 +329,76 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
+/*
+ * worker function to insert a single pointer in a node.
+ * the node should have enough room for the pointer already
+ * slot and level indicate where you want the key to go, and
+ * blocknr is the block the key points to.
+ *
+ * If the path has no node at 'level', a new root is allocated instead.
+ * It gets two pointers: slot 0 refers to the block at level - 1 in the
+ * path (keyed by that block's first key) and slot 1 holds key/blocknr.
+ *
+ * Always returns 0; violated preconditions trigger BUG().
+ */
+int __insert_ptr(struct ctree_root *root,
+		struct ctree_path *path, struct key *key,
+		u64 blocknr, int slot, int level)
+{
+	struct node *c;
+	struct node *lower;
+	struct key *lower_key;
+	int nritems;
+	/* need a new root */
+	if (!path->nodes[level]) {
+		struct tree_buffer *t;
+		t = alloc_free_block(root);
+		c = &t->node;
+		/* clear the whole node: sizeof(c) would only cover the pointer */
+		memset(c, 0, sizeof(*c));
+		c->header.nritems = 2;
+		c->header.flags = node_level(level);
+		c->header.blocknr = t->blocknr;
+		lower = &path->nodes[level-1]->node;
+		/* the first key of the old top block becomes key 0 of the new root */
+		if (is_leaf(lower->header.flags))
+			lower_key = &((struct leaf *)lower)->items[0].key;
+		else
+			lower_key = lower->keys;
+		memcpy(c->keys, lower_key, sizeof(struct key));
+		memcpy(c->keys + 1, key, sizeof(struct key));
+		c->blockptrs[0] = path->nodes[level-1]->blocknr;
+		c->blockptrs[1] = blocknr;
+		/* the path has an extra ref to root->node */
+		tree_block_release(root, root->node);
+		root->node = t;
+		t->count++;
+		write_tree_block(root, t);
+		path->nodes[level] = t;
+		path->slots[level] = 0;
+		if (c->keys[1].objectid == 0)
+			BUG();
+		return 0;
+	}
+	lower = &path->nodes[level]->node;
+	nritems = lower->header.nritems;
+	if (slot > nritems)
+		BUG();
+	if (nritems == NODEPTRS_PER_BLOCK)
+		BUG();
+	/* open a gap at 'slot' unless the pointer is appended at the end */
+	if (slot != nritems) {
+		memmove(lower->keys + slot + 1, lower->keys + slot,
+			(nritems - slot) * sizeof(struct key));
+		memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
+			(nritems - slot) * sizeof(u64));
+	}
+	memcpy(lower->keys + slot, key, sizeof(struct key));
+	lower->blockptrs[slot] = blocknr;
+	lower->header.nritems++;
+	if (lower->keys[1].objectid == 0)
+		BUG();
+	write_tree_block(root, path->nodes[level]);
+	return 0;
+}
+
+
+/*
+ * insert a key,blocknr pair into the tree at a given level
+ * If the node at that level in the path doesn't have room,
+ * it is split or shifted as appropriate.
+ */
 int insert_ptr(struct ctree_root *root,
 		struct ctree_path *path, struct key *key,
 		u64 blocknr, int level)
@@ -340,6 +412,15 @@ int insert_ptr(struct ctree_root *root,
 	int mid;
 	int bal_start = -1;
 
+	/*
+	 * check to see if we need to make room in the node for this
+	 * pointer.  If we do, keep walking the tree, making sure there
+	 * is enough room in each level for the required insertions.
+	 *
+	 * The bal array is filled in with any nodes to be inserted
+	 * due to splitting.  Once we've done all the splitting required
+	 * do the inserts based on the data in the bal array.
+	 */
 	memset(bal, 0, ARRAY_SIZE(bal));
 	while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
 		c = &t->node;
@@ -373,6 +454,11 @@ int insert_ptr(struct ctree_root *root,
 		bal_level += 1;
 		t = path->nodes[bal_level];
 	}
+	/*
+	 * bal_start tells us the first level in the tree that needed to
+	 * be split.  Go through the bal array inserting the new nodes
+	 * as needed.  The path is fixed as we go.
+	 */
 	while(bal_start > 0) {
 		b_buffer = bal[bal_start];
 		c = &path->nodes[bal_start]->node;
@@ -390,10 +476,16 @@ int insert_ptr(struct ctree_root *root,
 		if (!bal[bal_start])
 			break;
 	}
+	/* Now that the tree has room, insert the requested pointer */
 	return __insert_ptr(root, path, key, blocknr, path->slots[level] + 1,
 			    level);
 }
 
+/*
+ * how many bytes are required to store the items in a leaf.  start
+ * and nr indicate which items in the leaf to check.  This totals up the
+ * space used both by the item structs and the item data
+ */
 int leaf_space_used(struct leaf *l, int start, int nr)
 {
 	int data_len;
@@ -407,6 +499,10 @@ int leaf_space_used(struct leaf *l, int start, int nr)
 	return data_len;
 }
 
+/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ */
 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		   int data_size)
 {
@@ -498,6 +594,10 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	return 0;
 }
 
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ */
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 {
 	struct tree_buffer *l_buf = path->nodes[0];
@@ -548,9 +648,10 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	       l->data + leaf_data_end(l), data_copy_size);
 	rt_data_off = LEAF_DATA_SIZE -
 		     (l->items[mid].offset + l->items[mid].size);
-	for (i = 0; i < right->header.nritems; i++) {
+
+	for (i = 0; i < right->header.nritems; i++)
 		right->items[i].offset += rt_data_off;
-	}
+
 	l->header.nritems = mid;
 	ret = insert_ptr(root, path, &right->items[0].key,
 			  right_buffer->blocknr, 1);
@@ -570,6 +671,10 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	return ret;
 }
 
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
 int insert_item(struct ctree_root *root, struct key *key,
 			  void *data, int data_size)
 {
@@ -582,6 +687,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	unsigned int data_end;
 	struct ctree_path path;
 
+	/* create a root if there isn't one */
 	if (!root->node) {
 		struct tree_buffer *t;
 		t = alloc_free_block(root);
@@ -602,6 +708,8 @@ int insert_item(struct ctree_root *root, struct key *key,
 	slot_orig = path.slots[0];
 	leaf_buf = path.nodes[0];
 	leaf = &leaf_buf->leaf;
+
+	/* make room if needed */
 	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
 		split_leaf(root, &path, data_size);
 		leaf_buf = path.nodes[0];
@@ -638,6 +746,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 		        data_end, old_data - data_end);
 		data_end = old_data;
 	}
+	/* copy the new data in */
 	memcpy(&leaf->items[slot].key, key, sizeof(struct key));
 	leaf->items[slot].offset = data_end - data_size;
 	leaf->items[slot].size = data_size;
@@ -650,6 +759,14 @@ int insert_item(struct ctree_root *root, struct key *key,
 	return 0;
 }
 
+/*
+ * delete the pointer from a given level in the path.  The path is not
+ * fixed up, so after calling this it is not valid at that level.
+ *
+ * If the delete empties a node, the node is removed from the tree,
+ * continuing all the way to the root if required.  The root is converted into
+ * a leaf if all the nodes are emptied.
+ */
 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
@@ -705,6 +822,10 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
+/*
+ * delete the item at the leaf level in path.  If that empties
+ * the leaf, remove it from the tree
+ */
 int del_item(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
@@ -732,6 +853,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			(leaf->header.nritems - slot - 1));
 	}
 	leaf->header.nritems -= 1;
+	/* delete the leaf if we've emptied it */
 	if (leaf->header.nritems == 0) {
 		if (leaf_buf == root->node) {
 			leaf->header.flags = node_level(0);
@@ -742,6 +864,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 		if (slot == 0)
 			fixup_low_keys(root, path, &leaf->items[0].key, 1);
 		write_tree_block(root, leaf_buf);
+		/* delete the leaf if it is mostly empty */
 		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
 		    LEAF_DATA_SIZE / 4) {
 			/* push_leaf_left fixes the path.
@@ -837,7 +960,7 @@ int main() {
 	int i;
 	int num;
 	int ret;
-	int run_size = 1000000;
+	int run_size = 25000;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
-- 
cgit v1.2.3


From d97e63b69ef21c02b67e20e41d9968b0e503572e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Feb 2007 16:40:44 -0500
Subject: Btrfs: early extent mapping support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile  |   4 +-
 fs/btrfs/ctree.c   | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ctree.h   |  34 ++++++++++--
 fs/btrfs/disk-io.c |  76 +++++++++++++++++++--------
 fs/btrfs/disk-io.h |   3 ++
 5 files changed, 236 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 63360212a07..df065dd2dce 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,8 +4,8 @@ CFLAGS= -g -Wall
 .c.o:
 	$(CC) $(CFLAGS) -c $<
 
-ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h
-	gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o
+ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h mkfs.o
+	gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o mkfs.o
 
 clean:
 	rm ctree *.o
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6b64f49a027..2177744dedd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5,6 +5,8 @@
 #include "ctree.h"
 #include "disk-io.h"
 
+static int refill_alloc_extent(struct ctree_root *root);
+
 static inline void init_path(struct ctree_path *p)
 {
 	memset(p, 0, sizeof(*p));
@@ -29,7 +31,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
 	unsigned int nr = leaf->header.nritems;
 	if (nr == 0)
-		return ARRAY_SIZE(leaf->data);
+		return sizeof(leaf->data);
 	return leaf->items[nr-1].offset;
 }
 
@@ -421,7 +423,7 @@ int insert_ptr(struct ctree_root *root,
 	 * due to splitting.  Once we've done all the splitting required
 	 * do the inserts based on the data in the bal array.
 	 */
-	memset(bal, 0, ARRAY_SIZE(bal));
+	memset(bal, 0, sizeof(bal));
 	while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
 		c = &t->node;
 		if (push_node_left(root, path,
@@ -756,6 +758,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	release_path(root, &path);
+	refill_alloc_extent(root);
 	return 0;
 }
 
@@ -884,6 +887,135 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
+int next_leaf(struct ctree_root *root, struct ctree_path *path)
+{	/* move path to slot 0 of the leaf to the right; 0 on success, -1 if there is none */
+	int slot;	/* candidate slot one to the right of the path's slot at 'level' */
+	int level = 1;
+	u64 blocknr;
+	struct tree_buffer *c;
+	struct tree_buffer *next;
+
+	while(level < MAX_LEVEL) {
+		if (!path->nodes[level])
+			return -1;	/* ran out of parent levels without finding a right sibling */
+		slot = path->slots[level] + 1;
+		c = path->nodes[level];
+		if (slot >= c->node.header.nritems) {	/* no pointer to the right here; try one level up */
+			level++;
+			continue;
+		}
+		blocknr = c->node.blockptrs[slot];
+		next = read_tree_block(root, blocknr);
+		break;
+	}
+	path->slots[level] = slot;	/* NOTE(review): assumes the loop above exited via break - confirm level < MAX_LEVEL here */
+	while(1) {	/* walk back down, replacing each path entry with the leftmost block below 'next' */
+		level--;
+		c = path->nodes[level];
+		tree_block_release(root, c);	/* drop the path's old block at this level */
+		path->nodes[level] = next;
+		path->slots[level] = 0;
+		if (!level)	/* level 0 is the leaf: done */
+			break;
+		next = read_tree_block(root, next->node.blockptrs[0]);	/* result is not checked for NULL */
+	}
+	return 0;
+}
+
+int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+		 u64 search_end, u64 owner, struct key *ins)
+{
+	struct ctree_path path;
+	struct key *key;	/* key of the item currently being examined */
+	int ret;
+	u64 hole_size = 0;	/* gap under consideration; only compared to num_blocks between two items */
+	int slot = 0;
+	u64 last_block;		/* objectid + offset of the previously examined item */
+	int start_found = 0;
+	struct leaf *l;
+	struct extent_item extent_item;
+
+	init_path(&path);
+	ins->objectid = search_start;
+	ins->offset = 0;
+	ins->flags = 0;
+
+	ret = search_slot(root, ins, &path);	/* position the path at key (search_start, 0, 0) */
+	while (1) {	/* scan items in order for a gap of more than num_blocks; result goes in *ins */
+		l = &path.nodes[0]->leaf;
+		slot = path.slots[0];
+		if (!l) {	/* NOTE(review): l is the address of a member, so it may never be NULL - confirm */
+			// FIXME allocate root
+		}
+		if (slot >= l->header.nritems) {	/* ran off the end of this leaf */
+			ret = next_leaf(root, &path);
+			if (ret == 0)
+				continue;	/* moved to the next leaf; reload l and slot */
+			if (!start_found) {	/* no item was examined at all: allocate at search_start */
+				ins->objectid = search_start;
+				ins->offset = num_blocks;
+				hole_size = search_end - search_start;
+				goto insert;
+			}
+			ins->objectid = last_block;	/* no more leaves: allocate after the last item seen */
+			ins->offset = num_blocks;
+			hole_size = search_end - last_block;
+			goto insert;
+		}
+		key = &l->items[slot].key;
+		if (start_found) {
+			hole_size = key->objectid - last_block;	/* gap between the previous item's end and this item */
+			if (hole_size > num_blocks) {	/* NOTE(review): strict '>' skips a gap of exactly num_blocks */
+				ins->objectid = last_block;
+				ins->offset = num_blocks;
+				goto insert;
+			}
+		} else
+			start_found = 1;
+		last_block = key->objectid + key->offset;
+		path.slots[0]++;
+		printf("last block is not %lu\n", last_block);
+	}
+	// FIXME -ENOSPC
+insert:
+	extent_item.refs = 1;
+	extent_item.owner = owner;
+	ret = insert_item(root, ins, &extent_item, sizeof(extent_item));
+	return ret;
+}
+
+static int refill_alloc_extent(struct ctree_root *root)
+{	/* allocate a new reserve extent when both of the root's extents are running low */
+	struct alloc_extent *ae = root->alloc_extent;	/* re-pointed at the reserve extent below */
+	struct key key;		/* filled in by alloc_extent: objectid = start block, offset = length */
+	int ret;
+	int min_blocks = MAX_LEVEL * 2;
+
+	printf("refill alloc root %p, numused %lu total %lu\n", root, ae->num_used, ae->num_blocks);
+	if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used >
+	    min_blocks)
+		return 0;
+	ae = root->reserve_extent;	/* active extent is low: look at the reserve */
+	if (ae->num_blocks > ae->num_used) {	/* reserve still has unused blocks: nothing to allocate */
+		if (root->alloc_extent->num_blocks == 0) {
+			/* we should swap reserve/alloc_extent when alloc
+			 * fills up
+			 */
+			BUG();
+		}
+		if (ae->num_blocks - ae->num_used < min_blocks)	/* reserve too small to be useful */
+			BUG();
+		return 0;
+	}
+	// FIXME, this recurses
+	ret = alloc_extent(root->extent_root,
+			   min_blocks * 2, 0, (unsigned long)-1, 0, &key);
+	ae->blocknr = key.objectid;
+	ae->num_blocks = key.offset;
+	ae->num_used = 0;
+	return ret;	/* NOTE(review): key is copied into the reserve extent before ret is looked at */
+}
+
 void print_leaf(struct leaf *l)
 {
 	int i;
@@ -948,8 +1080,8 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 
 /* for testing only */
 int next_key(int i, int max_key) {
-	return rand() % max_key;
-	// return i;
+	// return rand() % max_key;
+	return i;
 }
 
 int main() {
@@ -960,7 +1092,7 @@ int main() {
 	int i;
 	int num;
 	int ret;
-	int run_size = 25000;
+	int run_size = 256;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
@@ -980,10 +1112,20 @@ int main() {
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
+		printf("insert %d\n", i);
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
+		printf("done insert %d\n", i);
 	}
+	printf("root used: %lu\n", root->alloc_extent->num_used);
+	printf("root tree\n");
+	print_tree(root, root->node);
+	printf("map tree\n");
+	printf("map used: %lu\n", root->extent_root->alloc_extent->num_used);
+	print_tree(root->extent_root, root->extent_root->node);
+	exit(1);
+
 	close_ctree(root);
 	root = open_ctree("dbfile");
 	printf("starting search\n");
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 586bf186604..b737925be31 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,7 +1,7 @@
 #ifndef __CTREE__
 #define __CTREE__
 
-#define CTREE_BLOCKSIZE 4096
+#define CTREE_BLOCKSIZE 256
 
 struct key {
 	u64 objectid;
@@ -22,18 +22,41 @@ struct header {
 #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
 			    (sizeof(struct key) + sizeof(u64)))
 
-#define LEVEL_BITS 3
-#define MAX_LEVEL (1 << LEVEL_BITS)
+#define MAX_LEVEL 8
 #define node_level(f) ((f) & (MAX_LEVEL-1))
 #define is_leaf(f) (node_level(f) == 0)
 
 struct tree_buffer;
+
+struct alloc_extent {
+	u64 blocknr;	/* first block number of the extent */
+	u64 num_blocks;	/* total number of blocks in the extent */
+	u64 num_used;	/* blocks already handed out; the next free block is blocknr + num_used */
+} __attribute__ ((__packed__));
+
 struct ctree_root {
 	struct tree_buffer *node;
+	struct ctree_root *extent_root;
+	struct alloc_extent *alloc_extent;
+	struct alloc_extent *reserve_extent;
 	int fp;
 	struct radix_tree_root cache_radix;
+	struct alloc_extent ai1;
+	struct alloc_extent ai2;
 };
 
+struct ctree_root_info {
+	u64 fsid[2]; /* FS specific uuid */
+	u64 blocknr; /* blocknr of this block */
+	u64 objectid; /* inode number of this root */
+	u64 tree_root; /* the tree root */
+	u32 csum;
+	u32 ham;
+	struct alloc_extent alloc_extent;
+	struct alloc_extent reserve_extent;
+	u64 snapuuid[2]; /* root specific uuid */
+} __attribute__ ((__packed__));
+
 struct item {
 	struct key key;
 	u16 offset;
@@ -55,6 +78,11 @@ struct node {
 	u64 blockptrs[NODEPTRS_PER_BLOCK];
 } __attribute__ ((__packed__));
 
+struct extent_item {
+	u32 refs;
+	u64 owner;
+} __attribute__ ((__packed__));
+
 struct ctree_path {
 	struct tree_buffer *nodes[MAX_LEVEL];
 	int slots[MAX_LEVEL];
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d51a07051d..653f18aab33 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -12,25 +12,27 @@
 
 static int allocated_blocks = 0;
 
-struct ctree_header {
-	u64 root_block;
-} __attribute__ ((__packed__));
-
 static int get_free_block(struct ctree_root *root, u64 *block)
 {
 	struct stat st;
 	int ret;
 
+	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks)
+		return -1;
+
+	*block = root->alloc_extent->blocknr + root->alloc_extent->num_used;
+	root->alloc_extent->num_used += 1;
+	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks) {
+		struct alloc_extent *ae = root->alloc_extent;
+		root->alloc_extent = root->reserve_extent;
+		root->reserve_extent = ae;
+		ae->num_blocks = 0;
+	}
 	st.st_size = 0;
 	ret = fstat(root->fp, &st);
-	if (st.st_size > sizeof(struct ctree_header)) {
-		*block = (st.st_size -
-			sizeof(struct ctree_header)) / CTREE_BLOCKSIZE;
-	} else {
-		*block = 0;
-	}
-	ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) *
-			CTREE_BLOCKSIZE);
+	if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE)
+		ret = ftruncate(root->fp,
+				(*block + 1) * CTREE_BLOCKSIZE);
 	return ret;
 }
 
@@ -72,7 +74,7 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 {
-	loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+	loff_t offset = blocknr * CTREE_BLOCKSIZE;
 	struct tree_buffer *buf;
 	int ret;
 
@@ -101,7 +103,7 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
 	u64 blocknr = buf->blocknr;
-	loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+	loff_t offset = blocknr * CTREE_BLOCKSIZE;
 	int ret;
 
 	if (buf->blocknr != buf->node.header.blocknr)
@@ -114,11 +116,32 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
+struct ctree_super_block {
+	struct ctree_root_info root_info;
+	struct ctree_root_info extent_info;
+} __attribute__ ((__packed__));
+
+static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
+			struct ctree_root_info *info, int fp)
+{
+	root->fp = fp;
+	root->node = read_tree_block(root, info->tree_root);
+	root->extent_root = extent_root;
+	memcpy(&root->ai1, &info->alloc_extent, sizeof(info->alloc_extent));
+	memcpy(&root->ai2, &info->reserve_extent, sizeof(info->reserve_extent));
+	root->alloc_extent = &root->ai1;
+	root->reserve_extent = &root->ai2;
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	printf("setup done reading root %p, used %lu\n", root, root->alloc_extent->num_used);
+	return 0;
+}
+
 struct ctree_root *open_ctree(char *filename)
 {
 	struct ctree_root *root = malloc(sizeof(struct ctree_root));
+	struct ctree_root *extent_root = malloc(sizeof(struct ctree_root));
+	struct ctree_super_block super;
 	int fp;
-	u64 root_block;
 	int ret;
 
 	fp = open(filename, O_CREAT | O_RDWR);
@@ -126,14 +149,20 @@ struct ctree_root *open_ctree(char *filename)
 		free(root);
 		return NULL;
 	}
-	root->fp = fp;
-	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
-	ret = pread(fp, &root_block, sizeof(u64), 0);
-	if (ret == sizeof(u64)) {
-		printf("reading root node at block %lu\n", root_block);
-		root->node = read_tree_block(root, root_block);
-	} else
-		root->node = NULL;
+	ret = pread(fp, &super, sizeof(struct ctree_super_block),
+		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+	if (ret == 0) {
+		ret = mkfs(fp);
+		if (ret)
+			return NULL;
+		ret = pread(fp, &super, sizeof(struct ctree_super_block),
+			     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+		if (ret != sizeof(struct ctree_super_block))
+			return NULL;
+	}
+	BUG_ON(ret < 0);
+	__setup_root(root, extent_root, &super.root_info, fp);
+	__setup_root(extent_root, extent_root, &super.extent_info, fp);
 	return root;
 }
 
@@ -160,6 +189,7 @@ int update_root_block(struct ctree_root *root)
 
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 {
+	return;
 	buf->count--;
 	if (buf->count == 0) {
 		if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ee95fa05c4a..41acb65eee1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -17,5 +17,8 @@ int close_ctree(struct ctree_root *root);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
 struct tree_buffer *alloc_free_block(struct ctree_root *root);
 int update_root_block(struct ctree_root *root);
+int mkfs(int fd);
+
+#define CTREE_SUPER_INFO_OFFSET(bs) (16 * (bs))
 
 #endif
-- 
cgit v1.2.3


From 06ed4b316e8e24b6899ece7186c6a7a0129326ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Feb 2007 16:41:09 -0500
Subject: Btrfs: add mkfs.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/mkfs.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 fs/btrfs/mkfs.c

(limited to 'fs')

diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
new file mode 100644
index 00000000000..bab98c6d6fa
--- /dev/null
+++ b/fs/btrfs/mkfs.c
@@ -0,0 +1,69 @@
+#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+int mkfs(int fd)
+{	/* write the initial root infos and two leaves to fd; 0 on success, -1 on a short write */
+	struct ctree_root_info info[2];	/* [0] has tree_root 17 (objectid 1), [1] has tree_root 64 (objectid 2) */
+	struct leaf empty_leaf;
+	struct item item;
+	struct extent_item extent_item;
+	int ret;
+
+	memset(info, 0, sizeof(info));
+	info[0].blocknr = 16;
+	info[0].objectid = 1;
+	info[0].tree_root = 17;
+	info[0].alloc_extent.blocknr = 0;
+	info[0].alloc_extent.num_blocks = 20;
+	/* 0-17 are used (inclusive) */
+	info[0].alloc_extent.num_used = 18;
+
+	info[1].blocknr = 16;
+	info[1].objectid = 2;
+	info[1].tree_root = 64;
+	info[1].alloc_extent.blocknr = 64;
+	info[1].alloc_extent.num_blocks = 8;
+	info[1].alloc_extent.num_used = 1;
+	ret = pwrite(fd, info, sizeof(info),
+		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+	if (ret != sizeof(info))
+		return -1;
+	memset(&empty_leaf, 0, sizeof(empty_leaf));	/* first leaf: no items, written at block 17 */
+	empty_leaf.header.parentid = 1;
+	empty_leaf.header.blocknr = 17;
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * CTREE_BLOCKSIZE);
+	if (ret != sizeof(empty_leaf))
+		return -1;
+
+	empty_leaf.header.parentid = 2;	/* reuse the buffer for the second leaf, written at block 64 */
+	empty_leaf.header.blocknr = 64;
+	empty_leaf.header.nritems = 2;
+	item.key.objectid = 0;
+	item.key.offset = 64;
+	item.key.flags = 0;
+	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item);	/* first item's data sits at the end of the data area */
+	item.size = sizeof(struct extent_item);
+	extent_item.refs = 1;
+	extent_item.owner = 1;
+	memcpy(empty_leaf.items, &item, sizeof(item));
+	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	item.key.objectid = 64;
+	item.key.offset = 64;
+	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 2;	/* second item's data goes just below the first */
+	extent_item.owner = 2;
+	memcpy(empty_leaf.items + 1, &item, sizeof(item));
+	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 64 * CTREE_BLOCKSIZE);
+	if (ret != sizeof(empty_leaf))
+		return -1;
+	return 0;
+}
-- 
cgit v1.2.3


From cfaa72952fa7b44aa5d967cbc266110900552aef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Feb 2007 17:04:57 -0500
Subject: Btrfs: extent fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 69 +++++++++++++++++++++++++++++++----------------
 fs/btrfs/ctree.h   |  5 ++++
 fs/btrfs/disk-io.c | 79 +++++++++++++++++++++++++++++++-----------------------
 fs/btrfs/disk-io.h |  4 +--
 fs/btrfs/mkfs.c    |  7 +++--
 5 files changed, 103 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2177744dedd..2891b582e26 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -354,6 +354,7 @@ int __insert_ptr(struct ctree_root *root,
 		c->header.nritems = 2;
 		c->header.flags = node_level(level);
 		c->header.blocknr = t->blocknr;
+		c->header.parentid = root->node->node.header.parentid;
 		lower = &path->nodes[level-1]->node;
 		if (is_leaf(lower->header.flags))
 			lower_key = &((struct leaf *)lower)->items[0].key;
@@ -363,7 +364,7 @@ int __insert_ptr(struct ctree_root *root,
 		memcpy(c->keys + 1, key, sizeof(struct key));
 		c->blockptrs[0] = path->nodes[level-1]->blocknr;
 		c->blockptrs[1] = blocknr;
-		/* the path has an extra ref to root->node */
+		/* the super has an extra ref to root->node */
 		tree_block_release(root, root->node);
 		root->node = t;
 		t->count++;
@@ -439,6 +440,7 @@ int insert_ptr(struct ctree_root *root,
 		b = &b_buffer->node;
 		b->header.flags = c->header.flags;
 		b->header.blocknr = b_buffer->blocknr;
+		b->header.parentid = root->node->node.header.parentid;
 		mid = (c->header.nritems + 1) / 2;
 		memcpy(b->keys, c->keys + mid,
 			(c->header.nritems - mid) * sizeof(struct key));
@@ -642,6 +644,7 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	right->header.nritems = nritems - mid;
 	right->header.blocknr = right_buffer->blocknr;
 	right->header.flags = node_level(0);
+	right->header.parentid = root->node->node.header.parentid;
 	data_copy_size = l->items[mid].offset + l->items[mid].size -
 			 leaf_data_end(l);
 	memcpy(right->items, l->items + mid,
@@ -689,8 +692,12 @@ int insert_item(struct ctree_root *root, struct key *key,
 	unsigned int data_end;
 	struct ctree_path path;
 
+	refill_alloc_extent(root);
+
 	/* create a root if there isn't one */
 	if (!root->node) {
+		BUG();
+#if 0
 		struct tree_buffer *t;
 		t = alloc_free_block(root);
 		BUG_ON(!t);
@@ -699,6 +706,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 		t->node.header.blocknr = t->blocknr;
 		root->node = t;
 		write_tree_block(root, t);
+#endif
 	}
 	init_path(&path);
 	ret = search_slot(root, key, &path);
@@ -758,7 +766,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	release_path(root, &path);
-	refill_alloc_extent(root);
 	return 0;
 }
 
@@ -893,7 +900,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	int level = 1;
 	u64 blocknr;
 	struct tree_buffer *c;
-	struct tree_buffer *next;
+	struct tree_buffer *next = NULL;
 
 	while(level < MAX_LEVEL) {
 		if (!path->nodes[level])
@@ -905,6 +912,8 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 			continue;
 		}
 		blocknr = c->node.blockptrs[slot];
+		if (next)
+			tree_block_release(root, next);
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -922,7 +931,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
-int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
 		 u64 search_end, u64 owner, struct key *ins)
 {
 	struct ctree_path path;
@@ -934,6 +943,7 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 	int start_found = 0;
 	struct leaf *l;
 	struct extent_item extent_item;
+	struct ctree_root * root = orig_root->extent_root;
 
 	init_path(&path);
 	ins->objectid = search_start;
@@ -974,13 +984,18 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 			start_found = 1;
 		last_block = key->objectid + key->offset;
 		path.slots[0]++;
-		printf("last block is not %lu\n", last_block);
 	}
 	// FIXME -ENOSPC
 insert:
+	release_path(root, &path);
 	extent_item.refs = 1;
 	extent_item.owner = owner;
-	ret = insert_item(root, ins, &extent_item, sizeof(extent_item));
+	if (root == orig_root && root->reserve_extent->num_blocks == 0) {
+		root->reserve_extent->blocknr = ins->objectid;
+		root->reserve_extent->num_blocks = ins->offset;
+		root->reserve_extent->num_used = 0;
+	}
+	ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
 	return ret;
 }
 
@@ -991,7 +1006,6 @@ static int refill_alloc_extent(struct ctree_root *root)
 	int ret;
 	int min_blocks = MAX_LEVEL * 2;
 
-	printf("refill alloc root %p, numused %lu total %lu\n", root, ae->num_used, ae->num_blocks);
 	if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used >
 	    min_blocks)
 		return 0;
@@ -1007,9 +1021,9 @@ static int refill_alloc_extent(struct ctree_root *root)
 			BUG();
 		return 0;
 	}
-	// FIXME, this recurses
-	ret = alloc_extent(root->extent_root,
-			   min_blocks * 2, 0, (unsigned long)-1, 0, &key);
+	ret = alloc_extent(root,
+			   min_blocks * 2, 0, (unsigned long)-1,
+			   root->node->node.header.parentid, &key);
 	ae->blocknr = key.objectid;
 	ae->num_blocks = key.offset;
 	ae->num_used = 0;
@@ -1021,6 +1035,7 @@ void print_leaf(struct leaf *l)
 	int i;
 	int nr = l->header.nritems;
 	struct item *item;
+	struct extent_item *ei;
 	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
 	       leaf_free_space(l));
 	fflush(stdout);
@@ -1032,6 +1047,8 @@ void print_leaf(struct leaf *l)
 			item->offset, item->size);
 		fflush(stdout);
 		printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
+		ei = (struct extent_item *)(l->data + item->offset);
+		printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
 		fflush(stdout);
 	}
 }
@@ -1080,8 +1097,8 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 
 /* for testing only */
 int next_key(int i, int max_key) {
-	// return rand() % max_key;
-	return i;
+	return rand() % max_key;
+	// return i;
 }
 
 int main() {
@@ -1092,15 +1109,20 @@ int main() {
 	int i;
 	int num;
 	int ret;
-	int run_size = 256;
+	int run_size = 10000;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
+	struct ctree_super_block super;
 
 	radix_tree_init();
 
 
-	root = open_ctree("dbfile");
+	root = open_ctree("dbfile", &super);
+	printf("root tree\n");
+	print_tree(root, root->node);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
 
 	srand(55);
 	for (i = 0; i < run_size; i++) {
@@ -1112,22 +1134,20 @@ int main() {
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
-		printf("insert %d\n", i);
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
-		printf("done insert %d\n", i);
 	}
 	printf("root used: %lu\n", root->alloc_extent->num_used);
 	printf("root tree\n");
-	print_tree(root, root->node);
+	// print_tree(root, root->node);
 	printf("map tree\n");
 	printf("map used: %lu\n", root->extent_root->alloc_extent->num_used);
-	print_tree(root->extent_root, root->extent_root->node);
-	exit(1);
-
+	// print_tree(root->extent_root, root->extent_root->node);
+	write_ctree_super(root, &super);
 	close_ctree(root);
-	root = open_ctree("dbfile");
+
+	root = open_ctree("dbfile", &super);
 	printf("starting search\n");
 	srand(55);
 	for (i = 0; i < run_size; i++) {
@@ -1142,8 +1162,9 @@ int main() {
 		}
 		release_path(root, &path);
 	}
+	write_ctree_super(root, &super);
 	close_ctree(root);
-	root = open_ctree("dbfile");
+	root = open_ctree("dbfile", &super);
 	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
 	        node_level(root->node->node.header.flags),
 		root->node->node.header.nritems,
@@ -1174,8 +1195,9 @@ int main() {
 		if (!ret)
 			tree_size++;
 	}
+	write_ctree_super(root, &super);
 	close_ctree(root);
-	root = open_ctree("dbfile");
+	root = open_ctree("dbfile", &super);
 	printf("starting search2\n");
 	srand(128);
 	for (i = 0; i < run_size; i++) {
@@ -1221,6 +1243,7 @@ int main() {
 		}
 		release_path(root, &path);
 	}
+	write_ctree_super(root, &super);
 	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
 	return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b737925be31..78407d32db7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -57,6 +57,11 @@ struct ctree_root_info {
 	u64 snapuuid[2]; /* root specific uuid */
 } __attribute__ ((__packed__));
 
+struct ctree_super_block {
+	struct ctree_root_info root_info;
+	struct ctree_root_info extent_info;
+} __attribute__ ((__packed__));
+
 struct item {
 	struct key key;
 	u16 offset;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 653f18aab33..2f71ccf1c3c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -15,7 +15,7 @@ static int allocated_blocks = 0;
 static int get_free_block(struct ctree_root *root, u64 *block)
 {
 	struct stat st;
-	int ret;
+	int ret = 0;
 
 	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks)
 		return -1;
@@ -30,9 +30,14 @@ static int get_free_block(struct ctree_root *root, u64 *block)
 	}
 	st.st_size = 0;
 	ret = fstat(root->fp, &st);
-	if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE)
+	if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE) {
 		ret = ftruncate(root->fp,
 				(*block + 1) * CTREE_BLOCKSIZE);
+		if (ret) {
+			perror("ftruncate");
+			exit(1);
+		}
+	}
 	return ret;
 }
 
@@ -81,11 +86,7 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 	buf = radix_tree_lookup(&root->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
-		if (buf->blocknr != blocknr)
-			BUG();
-		if (buf->blocknr != buf->node.header.blocknr)
-			BUG();
-		return buf;
+		goto test;
 	}
 	buf = alloc_tree_block(root, blocknr);
 	if (!buf)
@@ -95,8 +96,11 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 		free(buf);
 		return NULL;
 	}
+test:
 	if (buf->blocknr != buf->node.header.blocknr)
 		BUG();
+	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+		BUG();
 	return buf;
 }
 
@@ -111,36 +115,30 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
 	if (ret != CTREE_BLOCKSIZE)
 		return ret;
-	if (buf == root->node)
-		return update_root_block(root);
 	return 0;
 }
 
-struct ctree_super_block {
-	struct ctree_root_info root_info;
-	struct ctree_root_info extent_info;
-} __attribute__ ((__packed__));
-
 static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 			struct ctree_root_info *info, int fp)
 {
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
 	root->fp = fp;
+	root->node = NULL;
 	root->node = read_tree_block(root, info->tree_root);
 	root->extent_root = extent_root;
 	memcpy(&root->ai1, &info->alloc_extent, sizeof(info->alloc_extent));
 	memcpy(&root->ai2, &info->reserve_extent, sizeof(info->reserve_extent));
 	root->alloc_extent = &root->ai1;
 	root->reserve_extent = &root->ai2;
-	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
-	printf("setup done reading root %p, used %lu\n", root, root->alloc_extent->num_used);
+	printf("setup done reading root %p, used %lu available %lu\n", root, root->alloc_extent->num_used, root->alloc_extent->num_blocks);
+	printf("setup done reading root %p, reserve used %lu available %lu\n", root, root->reserve_extent->num_used, root->reserve_extent->num_blocks);
 	return 0;
 }
 
-struct ctree_root *open_ctree(char *filename)
+struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 {
 	struct ctree_root *root = malloc(sizeof(struct ctree_root));
 	struct ctree_root *extent_root = malloc(sizeof(struct ctree_root));
-	struct ctree_super_block super;
 	int fp;
 	int ret;
 
@@ -149,48 +147,61 @@ struct ctree_root *open_ctree(char *filename)
 		free(root);
 		return NULL;
 	}
-	ret = pread(fp, &super, sizeof(struct ctree_super_block),
+	ret = pread(fp, super, sizeof(struct ctree_super_block),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret == 0) {
 		ret = mkfs(fp);
 		if (ret)
 			return NULL;
-		ret = pread(fp, &super, sizeof(struct ctree_super_block),
+		ret = pread(fp, super, sizeof(struct ctree_super_block),
 			     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 		if (ret != sizeof(struct ctree_super_block))
 			return NULL;
 	}
 	BUG_ON(ret < 0);
-	__setup_root(root, extent_root, &super.root_info, fp);
-	__setup_root(extent_root, extent_root, &super.extent_info, fp);
+	__setup_root(root, extent_root, &super->root_info, fp);
+	__setup_root(extent_root, extent_root, &super->extent_info, fp);
 	return root;
 }
 
-int close_ctree(struct ctree_root *root)
+static int __update_root(struct ctree_root *root, struct ctree_root_info *info)
 {
-	close(root->fp);
-	if (root->node)
-		tree_block_release(root, root->node);
-	free(root);
-	printf("on close %d blocks are allocated\n", allocated_blocks);
+	info->tree_root = root->node->blocknr;
+	memcpy(&info->alloc_extent, root->alloc_extent, sizeof(struct alloc_extent));
+	memcpy(&info->reserve_extent, root->reserve_extent, sizeof(struct alloc_extent));
 	return 0;
 }
 
-int update_root_block(struct ctree_root *root)
+int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s)
 {
 	int ret;
-	u64 root_block = root->node->blocknr;
-
-	ret = pwrite(root->fp, &root_block, sizeof(u64), 0);
-	if (ret != sizeof(u64))
+	__update_root(root, &s->root_info);
+	__update_root(root->extent_root, &s->extent_info);
+	ret = pwrite(root->fp, s, sizeof(*s), CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+	if (ret != sizeof(*s)) {
+		fprintf(stderr, "failed to write new super block err %d\n", ret);
 		return ret;
+	}
+	return 0;
+}
+
+int close_ctree(struct ctree_root *root)
+{
+	close(root->fp);
+	if (root->node)
+		tree_block_release(root, root->node);
+	if (root->extent_root->node)
+		tree_block_release(root->extent_root, root->extent_root->node);
+	free(root);
+	printf("on close %d blocks are allocated\n", allocated_blocks);
 	return 0;
 }
 
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 {
-	return;
 	buf->count--;
+	if (buf->count < 0)
+		BUG();
 	if (buf->count == 0) {
 		if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
 			BUG();
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 41acb65eee1..e288fe883b9 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -12,11 +12,11 @@ struct tree_buffer {
 
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
-struct ctree_root *open_ctree(char *filename);
+struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
 int close_ctree(struct ctree_root *root);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
 struct tree_buffer *alloc_free_block(struct ctree_root *root);
-int update_root_block(struct ctree_root *root);
+int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s);
 int mkfs(int fd);
 
 #define CTREE_SUPER_INFO_OFFSET(bs) (16 * (bs))
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index bab98c6d6fa..584aba44e80 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -18,12 +18,13 @@ int mkfs(int fd)
 	struct extent_item extent_item;
 	int ret;
 
+	/* setup the super block area */
 	memset(info, 0, sizeof(info));
 	info[0].blocknr = 16;
 	info[0].objectid = 1;
 	info[0].tree_root = 17;
 	info[0].alloc_extent.blocknr = 0;
-	info[0].alloc_extent.num_blocks = 20;
+	info[0].alloc_extent.num_blocks = 64;
 	/* 0-17 are used (inclusive) */
 	info[0].alloc_extent.num_used = 18;
 
@@ -31,12 +32,14 @@ int mkfs(int fd)
 	info[1].objectid = 2;
 	info[1].tree_root = 64;
 	info[1].alloc_extent.blocknr = 64;
-	info[1].alloc_extent.num_blocks = 8;
+	info[1].alloc_extent.num_blocks = 64;
 	info[1].alloc_extent.num_used = 1;
 	ret = pwrite(fd, info, sizeof(info),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret != sizeof(info))
 		return -1;
+
+	/* create leaves for the tree root and extent root */
 	memset(&empty_leaf, 0, sizeof(empty_leaf));
 	empty_leaf.header.parentid = 1;
 	empty_leaf.header.blocknr = 17;
-- 
cgit v1.2.3


From 5c680ed620c2b69cf751aecf1a5e03ce2c89c7f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Feb 2007 11:39:13 -0500
Subject: Btrfs: switch to early splits

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 263 ++++++++++++++++++++++++-----------------------------
 fs/btrfs/disk-io.c |   3 +-
 2 files changed, 120 insertions(+), 146 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2891b582e26..1b4e82d8074 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -5,7 +5,12 @@
 #include "ctree.h"
 #include "disk-io.h"
 
+#define SEARCH_READ 0
+#define SEARCH_WRITE 1
+
 static int refill_alloc_extent(struct ctree_root *root);
+int split_node(struct ctree_root *root, struct ctree_path *path, int level);
+int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
 
 static inline void init_path(struct ctree_path *p)
 {
@@ -125,14 +130,14 @@ int bin_search(struct node *c, struct key *key, int *slot)
  * If the key isn't found, the path points to the slot where it should
  * be inserted.
  */
-int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
+int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len)
 {
 	struct tree_buffer *b = root->node;
 	struct node *c;
-
 	int slot;
 	int ret;
 	int level;
+
 	b->count++;
 	while (b) {
 		c = &b->node;
@@ -143,10 +148,26 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
+			if (ins_len && c->header.nritems == NODEPTRS_PER_BLOCK) {
+				int sret = split_node(root, p, level);
+				BUG_ON(sret > 0);
+				if (sret)
+					return sret;
+				b = p->nodes[level];
+				c = &b->node;
+				slot = p->slots[level];
+			}
 			b = read_tree_block(root, c->blockptrs[slot]);
 			continue;
 		} else {
+			struct leaf *l = (struct leaf *)c;
 			p->slots[level] = slot;
+			if (ins_len && leaf_free_space(l) <  sizeof(struct item) + ins_len) {
+				int sret = split_leaf(root, p, ins_len);
+				BUG_ON(sret > 0);
+				if (sret)
+					return sret;
+			}
 			return ret;
 		}
 	}
@@ -331,50 +352,67 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
+/*
+ * Add a level to the tree: allocate a new root node for 'level' whose only
+ * pointer refers to the current root, then make it the root.
+ *
+ * path->nodes[level] must be empty and path->nodes[level-1] must be the
+ * current root->node (both checked with BUG_ON).  On return the new node is
+ * installed in root->node and in path->nodes[level], with path->slots[level]
+ * set to 0.
+ *
+ * Always returns 0.
+ */
+static int insert_new_root(struct ctree_root *root, struct ctree_path *path, int level)
+{
+	struct tree_buffer *t;
+	struct node *lower;
+	struct node *c;
+	struct key *lower_key;
+
+	BUG_ON(path->nodes[level]);
+	BUG_ON(path->nodes[level-1] != root->node);
+
+	t = alloc_free_block(root);
+	c = &t->node;
+	/* clear the whole node, not just sizeof(pointer) bytes of it */
+	memset(c, 0, sizeof(*c));
+	c->header.nritems = 1;
+	c->header.flags = node_level(level);
+	c->header.blocknr = t->blocknr;
+	c->header.parentid = root->node->node.header.parentid;
+	/* the new root's only key is the first key of the old root */
+	lower = &path->nodes[level-1]->node;
+	if (is_leaf(lower->header.flags))
+		lower_key = &((struct leaf *)lower)->items[0].key;
+	else
+		lower_key = lower->keys;
+	memcpy(c->keys, lower_key, sizeof(struct key));
+	c->blockptrs[0] = path->nodes[level-1]->blocknr;
+	/* the super has an extra ref to root->node */
+	tree_block_release(root, root->node);
+	root->node = t;
+	t->count++;
+	write_tree_block(root, t);
+	path->nodes[level] = t;
+	path->slots[level] = 0;
+	return 0;
+}
+
 /*
  * worker function to insert a single pointer in a node.
  * the node should have enough room for the pointer already
  * slot and level indicate where you want the key to go, and
  * blocknr is the block the key points to.
  */
-int __insert_ptr(struct ctree_root *root,
+int insert_ptr(struct ctree_root *root,
 		struct ctree_path *path, struct key *key,
 		u64 blocknr, int slot, int level)
 {
-	struct node *c;
 	struct node *lower;
-	struct key *lower_key;
 	int nritems;
-	/* need a new root */
-	if (!path->nodes[level]) {
-		struct tree_buffer *t;
-		t = alloc_free_block(root);
-		c = &t->node;
-		memset(c, 0, sizeof(c));
-		c->header.nritems = 2;
-		c->header.flags = node_level(level);
-		c->header.blocknr = t->blocknr;
-		c->header.parentid = root->node->node.header.parentid;
-		lower = &path->nodes[level-1]->node;
-		if (is_leaf(lower->header.flags))
-			lower_key = &((struct leaf *)lower)->items[0].key;
-		else
-			lower_key = lower->keys;
-		memcpy(c->keys, lower_key, sizeof(struct key));
-		memcpy(c->keys + 1, key, sizeof(struct key));
-		c->blockptrs[0] = path->nodes[level-1]->blocknr;
-		c->blockptrs[1] = blocknr;
-		/* the super has an extra ref to root->node */
-		tree_block_release(root, root->node);
-		root->node = t;
-		t->count++;
-		write_tree_block(root, t);
-		path->nodes[level] = t;
-		path->slots[level] = 0;
-		if (c->keys[1].objectid == 0)
-			BUG();
-		return 0;
-	}
+
+	BUG_ON(!path->nodes[level]);
 	lower = &path->nodes[level]->node;
 	nritems = lower->header.nritems;
 	if (slot > nritems)
@@ -396,93 +421,54 @@ int __insert_ptr(struct ctree_root *root,
 	return 0;
 }
 
-
-/*
- * insert a key,blocknr pair into the tree at a given level
- * If the node at that level in the path doesn't have room,
- * it is split or shifted as appropriate.
- */
-int insert_ptr(struct ctree_root *root,
-		struct ctree_path *path, struct key *key,
-		u64 blocknr, int level)
+int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 {
-	struct tree_buffer *t = path->nodes[level];
-	struct node *c = &path->nodes[level]->node;
-	struct node *b;
-	struct tree_buffer *b_buffer;
-	struct tree_buffer *bal[MAX_LEVEL];
-	int bal_level = level;
+	struct tree_buffer *t;
+	struct node *c;
+	struct tree_buffer *split_buffer;
+	struct node *split;
 	int mid;
-	int bal_start = -1;
-
-	/*
-	 * check to see if we need to make room in the node for this
-	 * pointer.  If we do, keep walking the tree, making sure there
-	 * is enough room in each level for the required insertions.
-	 *
-	 * The bal array is filled in with any nodes to be inserted
-	 * due to splitting.  Once we've done all the splitting required
-	 * do the inserts based on the data in the bal array.
-	 */
-	memset(bal, 0, sizeof(bal));
-	while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
-		c = &t->node;
-		if (push_node_left(root, path,
-		   node_level(c->header.flags)) == 0)
-			break;
-		if (push_node_right(root, path,
-		   node_level(c->header.flags)) == 0)
-			break;
-		bal_start = bal_level;
-		if (bal_level == MAX_LEVEL - 1)
-			BUG();
-		b_buffer = alloc_free_block(root);
-		b = &b_buffer->node;
-		b->header.flags = c->header.flags;
-		b->header.blocknr = b_buffer->blocknr;
-		b->header.parentid = root->node->node.header.parentid;
-		mid = (c->header.nritems + 1) / 2;
-		memcpy(b->keys, c->keys + mid,
-			(c->header.nritems - mid) * sizeof(struct key));
-		memcpy(b->blockptrs, c->blockptrs + mid,
-			(c->header.nritems - mid) * sizeof(u64));
-		b->header.nritems = c->header.nritems - mid;
-		c->header.nritems = mid;
-
-		write_tree_block(root, t);
-		write_tree_block(root, b_buffer);
+	int ret;
 
-		bal[bal_level] = b_buffer;
-		if (bal_level == MAX_LEVEL - 1)
-			break;
-		bal_level += 1;
-		t = path->nodes[bal_level];
+	ret = push_node_left(root, path, level);
+	if (!ret)
+		return 0;
+	ret = push_node_right(root, path, level);
+	if (!ret)
+		return 0;
+	t = path->nodes[level];
+	c = &t->node;
+	if (t == root->node) {
+		/* trying to split the root, lets make a new one */
+		ret = insert_new_root(root, path, level + 1);
+		if (ret)
+			return ret;
 	}
-	/*
-	 * bal_start tells us the first level in the tree that needed to
-	 * be split.  Go through the bal array inserting the new nodes
-	 * as needed.  The path is fixed as we go.
-	 */
-	while(bal_start > 0) {
-		b_buffer = bal[bal_start];
-		c = &path->nodes[bal_start]->node;
-		__insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
-				path->slots[bal_start + 1] + 1, bal_start + 1);
-		if (path->slots[bal_start] >= c->header.nritems) {
-			path->slots[bal_start] -= c->header.nritems;
-			tree_block_release(root, path->nodes[bal_start]);
-			path->nodes[bal_start] = b_buffer;
-			path->slots[bal_start + 1] += 1;
-		} else {
-			tree_block_release(root, b_buffer);
-		}
-		bal_start--;
-		if (!bal[bal_start])
-			break;
+	split_buffer = alloc_free_block(root);
+	split = &split_buffer->node;
+	split->header.flags = c->header.flags;
+	split->header.blocknr = split_buffer->blocknr;
+	split->header.parentid = root->node->node.header.parentid;
+	mid = (c->header.nritems + 1) / 2;
+	memcpy(split->keys, c->keys + mid,
+		(c->header.nritems - mid) * sizeof(struct key));
+	memcpy(split->blockptrs, c->blockptrs + mid,
+		(c->header.nritems - mid) * sizeof(u64));
+	split->header.nritems = c->header.nritems - mid;
+	c->header.nritems = mid;
+	write_tree_block(root, t);
+	write_tree_block(root, split_buffer);
+	insert_ptr(root, path, split->keys, split_buffer->blocknr,
+		     path->slots[level + 1] + 1, level + 1);
+	if (path->slots[level] > mid) {
+		path->slots[level] -= mid;
+		tree_block_release(root, t);
+		path->nodes[level] = split_buffer;
+		path->slots[level + 1] += 1;
+	} else {
+		tree_block_release(root, split_buffer);
 	}
-	/* Now that the tree has room, insert the requested pointer */
-	return __insert_ptr(root, path, key, blocknr, path->slots[level] + 1,
-			    level);
+	return 0;
 }
 
 /*
@@ -623,6 +609,11 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 		if (leaf_free_space(l) >= sizeof(struct item) + data_size)
 			return 0;
 	}
+	if (!path->nodes[1]) {
+		ret = insert_new_root(root, path, 1);
+		if (ret)
+			return ret;
+	}
 	slot = path->slots[0];
 	nritems = l->header.nritems;
 	mid = (nritems + 1)/ 2;
@@ -659,8 +650,7 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 
 	l->header.nritems = mid;
 	ret = insert_ptr(root, path, &right->items[0].key,
-			  right_buffer->blocknr, 1);
-
+			  right_buffer->blocknr, path->slots[1] + 1, 1);
 	write_tree_block(root, right_buffer);
 	write_tree_block(root, l_buf);
 
@@ -695,21 +685,10 @@ int insert_item(struct ctree_root *root, struct key *key,
 	refill_alloc_extent(root);
 
 	/* create a root if there isn't one */
-	if (!root->node) {
+	if (!root->node)
 		BUG();
-#if 0
-		struct tree_buffer *t;
-		t = alloc_free_block(root);
-		BUG_ON(!t);
-		t->node.header.nritems = 0;
-		t->node.header.flags = node_level(0);
-		t->node.header.blocknr = t->blocknr;
-		root->node = t;
-		write_tree_block(root, t);
-#endif
-	}
 	init_path(&path);
-	ret = search_slot(root, key, &path);
+	ret = search_slot(root, key, &path, data_size);
 	if (ret == 0) {
 		release_path(root, &path);
 		return -EEXIST;
@@ -719,12 +698,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 	leaf_buf = path.nodes[0];
 	leaf = &leaf_buf->leaf;
 
-	/* make room if needed */
-	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
-		split_leaf(root, &path, data_size);
-		leaf_buf = path.nodes[0];
-		leaf = &path.nodes[0]->leaf;
-	}
 	nritems = leaf->header.nritems;
 	data_end = leaf_data_end(leaf);
 
@@ -950,7 +923,7 @@ int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
 	ins->offset = 0;
 	ins->flags = 0;
 
-	ret = search_slot(root, ins, &path);
+	ret = search_slot(root, ins, &path, sizeof(struct extent_item));
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
@@ -1097,8 +1070,8 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 
 /* for testing only */
 int next_key(int i, int max_key) {
-	return rand() % max_key;
-	// return i;
+	// return rand() % max_key;
+	return i;
 }
 
 int main() {
@@ -1154,7 +1127,7 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path);
+		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -1176,7 +1149,7 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path);
+		ret = search_slot(root, &ins, &path, 0);
 		if (ret)
 			continue;
 		ret = del_item(root, &path);
@@ -1204,7 +1177,7 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path);
+		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -1218,7 +1191,7 @@ int main() {
 		int slot;
 		ins.objectid = (u64)-1;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path);
+		ret = search_slot(root, &ins, &path, 0);
 		if (ret == 0)
 			BUG();
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f71ccf1c3c..a696a4278ac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -149,7 +149,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 	}
 	ret = pread(fp, super, sizeof(struct ctree_super_block),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
-	if (ret == 0) {
+	if (ret == 0 || super->root_info.tree_root == 0) {
+		printf("making new FS!\n");
 		ret = mkfs(fp);
 		if (ret)
 			return NULL;
-- 
cgit v1.2.3


From 9a8dd1502de6aa683ae46cf0397e9b6e636416fb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 23 Feb 2007 08:38:36 -0500
Subject: Btrfs: Block sized tree extents and extent deletion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 245 +++++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/ctree.h   |  13 +--
 fs/btrfs/disk-io.c |  90 +++++++-------------
 fs/btrfs/disk-io.h |   2 +-
 fs/btrfs/mkfs.c    |  37 ++++----
 5 files changed, 252 insertions(+), 135 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1b4e82d8074..f0abcf1f393 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -8,9 +8,12 @@
 #define SEARCH_READ 0
 #define SEARCH_WRITE 1
 
-static int refill_alloc_extent(struct ctree_root *root);
+#define CTREE_EXTENT_PENDING 0
+
 int split_node(struct ctree_root *root, struct ctree_path *path, int level);
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
 
 static inline void init_path(struct ctree_path *p)
 {
@@ -682,8 +685,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 	unsigned int data_end;
 	struct ctree_path path;
 
-	refill_alloc_extent(root);
-
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
@@ -756,6 +757,7 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 	struct tree_buffer *t;
 	struct node *node;
 	int nritems;
+	u64 blocknr;
 
 	while(1) {
 		t = path->nodes[level];
@@ -774,6 +776,7 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 		}
 		node->header.nritems--;
 		write_tree_block(root, t);
+		blocknr = t->blocknr;
 		if (node->header.nritems != 0) {
 			int tslot;
 			if (slot == 0)
@@ -799,6 +802,7 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 			break;
 		}
 		level++;
+		free_extent(root, blocknr, 1);
 		if (!path->nodes[level])
 			BUG();
 	}
@@ -841,8 +845,10 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 		if (leaf_buf == root->node) {
 			leaf->header.flags = node_level(0);
 			write_tree_block(root, leaf_buf);
-		} else
+		} else {
 			del_ptr(root, path, 1);
+			free_extent(root, leaf_buf->blocknr, 1);
+		}
 	} else {
 		if (slot == 0)
 			fixup_low_keys(root, path, &leaf->items[0].key, 1);
@@ -867,6 +873,104 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
+/*
+ * Remove from the extent tree the items for every block tagged
+ * CTREE_EXTENT_PENDING in extent_root->cache_radix.
+ *
+ * Each tagged buffer's item is looked up as (blocknr, flags 0, offset 1)
+ * and deleted; the tag is then cleared and the buffer released.  A failed
+ * lookup or delete hits BUG().
+ *
+ * Returns 0 once no tagged blocks remain.
+ */
+static int del_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	int nr;
+	struct key key;
+	struct tree_buffer *gang[4];
+	int i;
+	struct ctree_path path;
+
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0, ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		/*
+		 * keep the batch size in nr: ret is overwritten by the calls
+		 * below and must not be used as the loop bound
+		 */
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			init_path(&path);
+			ret = search_slot(extent_root, &key, &path, 0);
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			ret = del_item(extent_root, &path);
+			if (ret) {
+				BUG();
+				return ret;
+			}
+			release_path(extent_root, &path);
+			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
+						CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Free the extent starting at blocknr that covers num_blocks blocks.
+ *
+ * When root is the extent root itself, the extent item is not deleted
+ * here: the block is only tagged CTREE_EXTENT_PENDING in the cache radix
+ * and 0 is returned.  Otherwise the item is deleted from the extent tree
+ * right away and del_pending_extents() is then run on the extent root to
+ * process the tagged blocks.
+ *
+ * Returns the del_item() result if non-zero, else the result of
+ * del_pending_extents().
+ */
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+{
+	struct ctree_path path;
+	struct key key;
+	struct ctree_root *extent_root = root->extent_root;
+	struct tree_buffer *t;
+	int pending_ret;
+	int ret;
+
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = num_blocks;
+	if (root == extent_root) {
+		/*
+		 * the buffer returned here is not released in this function;
+		 * del_pending_extents() releases each tagged buffer it handles
+		 */
+		t = read_tree_block(root, key.objectid);
+		BUG_ON(!t);
+		radix_tree_tag_set(&root->cache_radix, key.objectid, CTREE_EXTENT_PENDING);
+		return 0;
+	}
+	init_path(&path);
+	ret = search_slot(extent_root, &key, &path, 0);
+	if (ret)
+		BUG();
+	ret = del_item(extent_root, &path);
+	release_path(extent_root, &path);
+	pending_ret = del_pending_extents(root->extent_root);
+	return ret ? ret : pending_ret;
+}
+
 int next_leaf(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
@@ -904,8 +976,8 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
-int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
-		 u64 search_end, u64 owner, struct key *ins)
+int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
+			 u64 search_end, struct key *ins)
 {
 	struct ctree_path path;
 	struct key *key;
@@ -915,15 +987,13 @@ int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
 	u64 last_block;
 	int start_found = 0;
 	struct leaf *l;
-	struct extent_item extent_item;
 	struct ctree_root * root = orig_root->extent_root;
 
 	init_path(&path);
 	ins->objectid = search_start;
 	ins->offset = 0;
 	ins->flags = 0;
-
-	ret = search_slot(root, ins, &path, sizeof(struct extent_item));
+	ret = search_slot(root, ins, &path, 0);
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
@@ -938,6 +1008,7 @@ int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
 				ins->objectid = search_start;
 				ins->offset = num_blocks;
 				hole_size = search_end - search_start;
+				start_found = 1;
 				goto insert;
 			}
 			ins->objectid = last_block;
@@ -956,51 +1027,134 @@ int alloc_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
 		} else
 			start_found = 1;
 		last_block = key->objectid + key->offset;
+insert_failed:
 		path.slots[0]++;
 	}
 	// FIXME -ENOSPC
 insert:
+	if (orig_root->extent_root == orig_root) {
+		BUG_ON(num_blocks != 1);
+		if ((root->current_insert.objectid <= ins->objectid &&
+		    root->current_insert.objectid + root->current_insert.offset >
+		    ins->objectid) ||
+		   (root->current_insert.objectid > ins->objectid &&
+		    root->current_insert.objectid <= ins->objectid + ins->offset) ||
+		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
+				      CTREE_EXTENT_PENDING)) {
+			last_block = ins->objectid + 1;
+			search_start = last_block;
+			goto insert_failed;
+		}
+	}
 	release_path(root, &path);
+	if (ins->offset != 1)
+		BUG();
+	return 0;
+}
+
+/*
+ * Insert an extent item into the extent tree for every block tagged
+ * CTREE_EXTENT_PENDING in extent_root->cache_radix.
+ *
+ * Each item is keyed (blocknr, flags 0, offset 1) with refs = 1 and owner
+ * set to the extent root node's parentid; the tag is then cleared and the
+ * buffer released.  A failed insert hits BUG().
+ *
+ * Returns 0 once no tagged blocks remain.
+ */
+static int insert_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	int nr;
+	struct key key;
+	struct extent_item item;
+	struct tree_buffer *gang[4];
+	int i;
+
+	// FIXME -ENOSPC
+	item.refs = 1;
+	item.owner = extent_root->node->node.header.parentid;
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0, ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		/*
+		 * keep the batch size in nr: ret is overwritten by
+		 * insert_item() and must not be used as the loop bound
+		 */
+		for (i = 0; i < nr; i++) {
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			ret = insert_item(extent_root, &key, &item, sizeof(item));
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
+						CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+			 u64 search_end, u64 owner, struct key *ins, struct tree_buffer **buf)
+{
+	int ret;
+	int pending_ret;
+	struct extent_item extent_item;
+
 	extent_item.refs = 1;
 	extent_item.owner = owner;
-	if (root == orig_root && root->reserve_extent->num_blocks == 0) {
-		root->reserve_extent->blocknr = ins->objectid;
-		root->reserve_extent->num_blocks = ins->offset;
-		root->reserve_extent->num_used = 0;
+
+	ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
+	if (ret)
+		return ret;
+
+	if (root != root->extent_root) {
+		memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));
+		ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
+		memset(&root->extent_root->current_insert, 0, sizeof(struct key));
+		pending_ret = insert_pending_extents(root->extent_root);
+		if (ret)
+			return ret;
+		if (pending_ret)
+			return pending_ret;
+		*buf = find_tree_block(root, ins->objectid);
+		return 0;
 	}
-	ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
-	return ret;
+	/* we're allocating an extent for the extent tree, don't recurse */
+	BUG_ON(ins->offset != 1);
+	*buf = find_tree_block(root, ins->objectid);
+	BUG_ON(!*buf);
+	radix_tree_tag_set(&root->cache_radix, ins->objectid, CTREE_EXTENT_PENDING);
+	(*buf)->count++;
+	return 0;
+
 }
 
-static int refill_alloc_extent(struct ctree_root *root)
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
 {
-	struct alloc_extent *ae = root->alloc_extent;
-	struct key key;
+	struct key ins;
 	int ret;
-	int min_blocks = MAX_LEVEL * 2;
+	struct tree_buffer *buf = NULL;
 
-	if (ae->num_blocks > ae->num_used && ae->num_blocks - ae->num_used >
-	    min_blocks)
-		return 0;
-	ae = root->reserve_extent;
-	if (ae->num_blocks > ae->num_used) {
-		if (root->alloc_extent->num_blocks == 0) {
-			/* we should swap reserve/alloc_extent when alloc
-			 * fills up
-			 */
-			BUG();
-		}
-		if (ae->num_blocks - ae->num_used < min_blocks)
-			BUG();
-		return 0;
+	ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid,
+			   &ins, &buf);
+
+	if (ret) {
+		BUG();
+		return NULL;
 	}
-	ret = alloc_extent(root,
-			   min_blocks * 2, 0, (unsigned long)-1,
-			   root->node->node.header.parentid, &key);
-	ae->blocknr = key.objectid;
-	ae->num_blocks = key.offset;
-	ae->num_used = 0;
-	return ret;
+	if (root != root->extent_root)
+		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, buf->blocknr,
+					  CTREE_EXTENT_PENDING));
+	return buf;
 }
 
 void print_leaf(struct leaf *l)
@@ -1096,6 +1235,7 @@ int main() {
 	print_tree(root, root->node);
 	printf("map tree\n");
 	print_tree(root->extent_root, root->extent_root->node);
+	fflush(stdout);
 
 	srand(55);
 	for (i = 0; i < run_size; i++) {
@@ -1111,12 +1251,6 @@ int main() {
 		if (!ret)
 			tree_size++;
 	}
-	printf("root used: %lu\n", root->alloc_extent->num_used);
-	printf("root tree\n");
-	// print_tree(root, root->node);
-	printf("map tree\n");
-	printf("map used: %lu\n", root->extent_root->alloc_extent->num_used);
-	// print_tree(root->extent_root, root->extent_root->node);
 	write_ctree_super(root, &super);
 	close_ctree(root);
 
@@ -1167,12 +1301,27 @@ int main() {
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
+		if (i >= 5) {
+			struct key ugh;
+			ugh.objectid = 5;
+			ugh.flags = 0;
+			ugh.offset = 0;
+			init_path(&path);
+			ret = search_slot(root, &ugh, &path, 0);
+			if (ret) {
+				print_tree(root, root->node);
+				printf("unable to find 5 %d\n", num);
+				exit(1);
+			}
+			release_path(root, &path);
+
+		}
 	}
 	write_ctree_super(root, &super);
 	close_ctree(root);
 	root = open_ctree("dbfile", &super);
-	printf("starting search2\n");
 	srand(128);
+	printf("starting search2\n");
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
@@ -1219,5 +1368,7 @@ int main() {
 	write_ctree_super(root, &super);
 	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78407d32db7..8c32c0e9267 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,21 +28,12 @@ struct header {
 
 struct tree_buffer;
 
-struct alloc_extent {
-	u64 blocknr;
-	u64 num_blocks;
-	u64 num_used;
-} __attribute__ ((__packed__));
-
 struct ctree_root {
 	struct tree_buffer *node;
 	struct ctree_root *extent_root;
-	struct alloc_extent *alloc_extent;
-	struct alloc_extent *reserve_extent;
+	struct key current_insert;
 	int fp;
 	struct radix_tree_root cache_radix;
-	struct alloc_extent ai1;
-	struct alloc_extent ai2;
 };
 
 struct ctree_root_info {
@@ -52,8 +43,6 @@ struct ctree_root_info {
 	u64 tree_root; /* the tree root */
 	u32 csum;
 	u32 ham;
-	struct alloc_extent alloc_extent;
-	struct alloc_extent reserve_extent;
 	u64 snapuuid[2]; /* root specific uuid */
 } __attribute__ ((__packed__));
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a696a4278ac..14955e44077 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -12,33 +12,13 @@
 
 static int allocated_blocks = 0;
 
-static int get_free_block(struct ctree_root *root, u64 *block)
+static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
-	struct stat st;
-	int ret = 0;
-
-	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks)
-		return -1;
-
-	*block = root->alloc_extent->blocknr + root->alloc_extent->num_used;
-	root->alloc_extent->num_used += 1;
-	if (root->alloc_extent->num_used >= root->alloc_extent->num_blocks) {
-		struct alloc_extent *ae = root->alloc_extent;
-		root->alloc_extent = root->reserve_extent;
-		root->reserve_extent = ae;
-		ae->num_blocks = 0;
-	}
-	st.st_size = 0;
-	ret = fstat(root->fp, &st);
-	if (st.st_size < (*block + 1) * CTREE_BLOCKSIZE) {
-		ret = ftruncate(root->fp,
-				(*block + 1) * CTREE_BLOCKSIZE);
-		if (ret) {
-			perror("ftruncate");
-			exit(1);
-		}
-	}
-	return ret;
+	if (buf->blocknr != buf->node.header.blocknr)
+		BUG();
+	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+		BUG();
+	return 0;
 }
 
 struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
@@ -61,22 +41,23 @@ struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
-struct tree_buffer *alloc_free_block(struct ctree_root *root)
+struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr)
 {
-	u64 free_block;
-	int ret;
-	struct tree_buffer * buf;
-	ret = get_free_block(root, &free_block);
-	if (ret) {
-		BUG();
-		return NULL;
+	struct tree_buffer *buf;
+	buf = radix_tree_lookup(&root->cache_radix, blocknr);
+	if (buf) {
+		buf->count++;
+	} else {
+		buf = alloc_tree_block(root, blocknr);
+		if (!buf) {
+			BUG();
+			return NULL;
+		}
 	}
-	buf = alloc_tree_block(root, free_block);
-	if (!buf)
-		BUG();
 	return buf;
 }
 
+
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 {
 	loff_t offset = blocknr * CTREE_BLOCKSIZE;
@@ -86,20 +67,17 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 	buf = radix_tree_lookup(&root->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
-		goto test;
-	}
-	buf = alloc_tree_block(root, blocknr);
-	if (!buf)
-		return NULL;
-	ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
-	if (ret != CTREE_BLOCKSIZE) {
-		free(buf);
-		return NULL;
+	} else {
+		buf = alloc_tree_block(root, blocknr);
+		if (!buf)
+			return NULL;
+		ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+		if (ret != CTREE_BLOCKSIZE) {
+			free(buf);
+			return NULL;
+		}
 	}
-test:
-	if (buf->blocknr != buf->node.header.blocknr)
-		BUG();
-	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+	if (check_tree_block(root, buf))
 		BUG();
 	return buf;
 }
@@ -121,17 +99,10 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 			struct ctree_root_info *info, int fp)
 {
-	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
 	root->fp = fp;
 	root->node = NULL;
 	root->node = read_tree_block(root, info->tree_root);
 	root->extent_root = extent_root;
-	memcpy(&root->ai1, &info->alloc_extent, sizeof(info->alloc_extent));
-	memcpy(&root->ai2, &info->reserve_extent, sizeof(info->reserve_extent));
-	root->alloc_extent = &root->ai1;
-	root->reserve_extent = &root->ai2;
-	printf("setup done reading root %p, used %lu available %lu\n", root, root->alloc_extent->num_used, root->alloc_extent->num_blocks);
-	printf("setup done reading root %p, reserve used %lu available %lu\n", root, root->reserve_extent->num_used, root->reserve_extent->num_blocks);
 	return 0;
 }
 
@@ -147,6 +118,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 		free(root);
 		return NULL;
 	}
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
 	ret = pread(fp, super, sizeof(struct ctree_super_block),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret == 0 || super->root_info.tree_root == 0) {
@@ -168,8 +141,6 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 static int __update_root(struct ctree_root *root, struct ctree_root_info *info)
 {
 	info->tree_root = root->node->blocknr;
-	memcpy(&info->alloc_extent, root->alloc_extent, sizeof(struct alloc_extent));
-	memcpy(&info->reserve_extent, root->reserve_extent, sizeof(struct alloc_extent));
 	return 0;
 }
 
@@ -201,6 +172,7 @@ int close_ctree(struct ctree_root *root)
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 {
 	buf->count--;
+	write_tree_block(root, buf);
 	if (buf->count < 0)
 		BUG();
 	if (buf->count == 0) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e288fe883b9..2729b757ddc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -11,11 +11,11 @@ struct tree_buffer {
 };
 
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr);
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
 struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
 int close_ctree(struct ctree_root *root);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
-struct tree_buffer *alloc_free_block(struct ctree_root *root);
 int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s);
 int mkfs(int fd);
 
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 584aba44e80..fd4e5dea7e1 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -23,17 +23,10 @@ int mkfs(int fd)
 	info[0].blocknr = 16;
 	info[0].objectid = 1;
 	info[0].tree_root = 17;
-	info[0].alloc_extent.blocknr = 0;
-	info[0].alloc_extent.num_blocks = 64;
-	/* 0-17 are used (inclusive) */
-	info[0].alloc_extent.num_used = 18;
 
 	info[1].blocknr = 16;
 	info[1].objectid = 2;
-	info[1].tree_root = 64;
-	info[1].alloc_extent.blocknr = 64;
-	info[1].alloc_extent.num_blocks = 64;
-	info[1].alloc_extent.num_used = 1;
+	info[1].tree_root = 18;
 	ret = pwrite(fd, info, sizeof(info),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
 	if (ret != sizeof(info))
@@ -48,24 +41,36 @@ int mkfs(int fd)
 		return -1;
 
 	empty_leaf.header.parentid = 2;
-	empty_leaf.header.blocknr = 64;
-	empty_leaf.header.nritems = 2;
+	empty_leaf.header.blocknr = 18;
+	empty_leaf.header.nritems = 3;
+
+	/* item1, reserve blocks 0-16 */
 	item.key.objectid = 0;
-	item.key.offset = 64;
+	item.key.offset = 17;
 	item.key.flags = 0;
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item);
 	item.size = sizeof(struct extent_item);
 	extent_item.refs = 1;
-	extent_item.owner = 1;
+	extent_item.owner = 0;
 	memcpy(empty_leaf.items, &item, sizeof(item));
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
-	item.key.objectid = 64;
-	item.key.offset = 64;
+
+	/* item2, give block 17 to the root */
+	item.key.objectid = 17;
+	item.key.offset = 1;
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 2;
-	extent_item.owner = 2;
+	extent_item.owner = 1;
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 64 * CTREE_BLOCKSIZE);
+
+	/* item3, give block 18 for the extent root */
+	item.key.objectid = 18;
+	item.key.offset = 1;
+	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 3;
+	extent_item.owner = 2;
+	memcpy(empty_leaf.items + 2, &item, sizeof(item));
+	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * CTREE_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
 	return 0;
-- 
cgit v1.2.3


From 5de08d7d50ba535b968d97101ed2ab07c2b8eb7d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 Feb 2007 06:24:44 -0500
Subject: Btrfs: Break up ctree.c a little Extent fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |  16 +-
 fs/btrfs/ctree.c      | 446 +++++++++-----------------------------------------
 fs/btrfs/ctree.h      |  12 +-
 fs/btrfs/disk-io.c    |   1 -
 fs/btrfs/print-tree.c |  72 ++++++++
 fs/btrfs/print-tree.h |   3 +
 6 files changed, 176 insertions(+), 374 deletions(-)
 create mode 100644 fs/btrfs/print-tree.c
 create mode 100644 fs/btrfs/print-tree.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index df065dd2dce..fe73ab9d81d 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,12 +1,16 @@
 
-CFLAGS= -g -Wall
+CFLAGS = -g -Wall
+headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h
+objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
-.c.o:
-	$(CC) $(CFLAGS) -c $<
+#.c.o:
+#	$(CC) $(CFLAGS) -c $<
 
-ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h mkfs.o
-	gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o mkfs.o
+ctree : $(objects)
+	gcc $(CFLAGS) -o ctree $(objects)
 
-clean:
+$(objects) : $(headers)
+
+clean :
 	rm ctree *.o
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f0abcf1f393..e497fd96311 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -4,23 +4,21 @@
 #include "radix-tree.h"
 #include "ctree.h"
 #include "disk-io.h"
-
-#define SEARCH_READ 0
-#define SEARCH_WRITE 1
-
-#define CTREE_EXTENT_PENDING 0
+#include "print-tree.h"
 
 int split_node(struct ctree_root *root, struct ctree_path *path, int level);
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
-struct tree_buffer *alloc_free_block(struct ctree_root *root);
-int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
+int push_node_left(struct ctree_root *root, struct ctree_path *path, int level);
+int push_node_right(struct ctree_root *root,
+		    struct ctree_path *path, int level);
+int del_ptr(struct ctree_root *root, struct ctree_path *path, int level);
 
-static inline void init_path(struct ctree_path *p)
+inline void init_path(struct ctree_path *p)
 {
 	memset(p, 0, sizeof(*p));
 }
 
-static void release_path(struct ctree_root *root, struct ctree_path *p)
+void release_path(struct ctree_root *root, struct ctree_path *p)
 {
 	int i;
 	for (i = 0; i < MAX_LEVEL; i++) {
@@ -48,7 +46,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-static inline int leaf_free_space(struct leaf *leaf)
+int leaf_free_space(struct leaf *leaf)
 {
 	int data_end = leaf_data_end(leaf);
 	int nritems = leaf->header.nritems;
@@ -133,7 +131,8 @@ int bin_search(struct node *c, struct key *key, int *slot)
  * If the key isn't found, the path points to the slot where it should
  * be inserted.
  */
-int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len)
+int search_slot(struct ctree_root *root, struct key *key,
+		struct ctree_path *p, int ins_len)
 {
 	struct tree_buffer *b = root->node;
 	struct node *c;
@@ -151,7 +150,8 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p,
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if (ins_len && c->header.nritems == NODEPTRS_PER_BLOCK) {
+			if (ins_len > 0 &&
+			    c->header.nritems == NODEPTRS_PER_BLOCK) {
 				int sret = split_node(root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -159,13 +159,37 @@ int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p,
 				b = p->nodes[level];
 				c = &b->node;
 				slot = p->slots[level];
+			} else if (ins_len < 0 &&
+				   c->header.nritems <= NODEPTRS_PER_BLOCK/4) {
+				u64 blocknr = b->blocknr;
+				slot = p->slots[level +1];
+				b->count++;
+				if (push_node_left(root, p, level))
+					push_node_right(root, p, level);
+				if (c->header.nritems == 0 &&
+				    level < MAX_LEVEL - 1 &&
+				    p->nodes[level + 1]) {
+					int tslot = p->slots[level + 1];
+
+					p->slots[level + 1] = slot;
+					del_ptr(root, p, level + 1);
+					p->slots[level + 1] = tslot;
+					tree_block_release(root, b);
+					free_extent(root, blocknr, 1);
+				} else {
+					tree_block_release(root, b);
+				}
+				b = p->nodes[level];
+				c = &b->node;
+				slot = p->slots[level];
 			}
 			b = read_tree_block(root, c->blockptrs[slot]);
 			continue;
 		} else {
 			struct leaf *l = (struct leaf *)c;
 			p->slots[level] = slot;
-			if (ins_len && leaf_free_space(l) <  sizeof(struct item) + ins_len) {
+			if (ins_len > 0 && leaf_free_space(l) <
+			    sizeof(struct item) + ins_len) {
 				int sret = split_leaf(root, p, ins_len);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -355,7 +379,8 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
-static int insert_new_root(struct ctree_root *root, struct ctree_path *path, int level)
+static int insert_new_root(struct ctree_root *root,
+			   struct ctree_path *path, int level)
 {
 	struct tree_buffer *t;
 	struct node *lower;
@@ -463,7 +488,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 	write_tree_block(root, split_buffer);
 	insert_ptr(root, path, split->keys, split_buffer->blocknr,
 		     path->slots[level + 1] + 1, level + 1);
-	if (path->slots[level] > mid) {
+	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		tree_block_release(root, t);
 		path->nodes[level] = split_buffer;
@@ -744,8 +769,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 }
 
 /*
- * delete the pointer from a given level in the path.  The path is not
- * fixed up, so after calling this it is not valid at that level.
+ * delete the pointer from a given node.
  *
  * If the delete empties a node, the node is removed from the tree,
  * continuing all the way the root if required.  The root is converted into
@@ -778,22 +802,10 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 		write_tree_block(root, t);
 		blocknr = t->blocknr;
 		if (node->header.nritems != 0) {
-			int tslot;
 			if (slot == 0)
 				fixup_low_keys(root, path, node->keys,
 					       level + 1);
-			tslot = path->slots[level+1];
-			t->count++;
-			push_node_left(root, path, level);
-			if (node->header.nritems) {
-				push_node_right(root, path, level);
-			}
-			if (node->header.nritems) {
-				tree_block_release(root, t);
-				break;
-			}
-			tree_block_release(root, t);
-			path->slots[level+1] = tslot;
+			break;
 		}
 		if (t == root->node) {
 			/* just turn the root into a leaf and break */
@@ -850,12 +862,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			free_extent(root, leaf_buf->blocknr, 1);
 		}
 	} else {
+		int used = leaf_space_used(leaf, 0, leaf->header.nritems);
 		if (slot == 0)
 			fixup_low_keys(root, path, &leaf->items[0].key, 1);
 		write_tree_block(root, leaf_buf);
 		/* delete the leaf if it is mostly empty */
-		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
-		    LEAF_DATA_SIZE / 4) {
+		if (used < LEAF_DATA_SIZE / 3) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
@@ -864,81 +876,19 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			leaf_buf->count++;
 			push_leaf_left(root, path, 1);
 			if (leaf->header.nritems == 0) {
+				u64 blocknr = leaf_buf->blocknr;
 				path->slots[1] = slot;
 				del_ptr(root, path, 1);
+				tree_block_release(root, leaf_buf);
+				free_extent(root, blocknr, 1);
+			} else {
+				tree_block_release(root, leaf_buf);
 			}
-			tree_block_release(root, leaf_buf);
 		}
 	}
 	return 0;
 }
 
-static int del_pending_extents(struct ctree_root *extent_root)
-{
-	int ret;
-	struct key key;
-	struct tree_buffer *gang[4];
-	int i;
-	struct ctree_path path;
-
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
-						 (void **)gang, 0, ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING);
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			key.objectid = gang[i]->blocknr;
-			key.flags = 0;
-			key.offset = 1;
-			init_path(&path);
-			ret = search_slot(extent_root, &key, &path, 0);
-			if (ret) {
-				BUG();
-				// FIXME undo it and return sane
-				return ret;
-			}
-			ret = del_item(extent_root, &path);
-			if (ret) {
-				BUG();
-				return ret;
-			}
-			release_path(extent_root, &path);
-			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
-						CTREE_EXTENT_PENDING);
-			tree_block_release(extent_root, gang[i]);
-		}
-	}
-	return 0;
-}
-
-int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
-{
-	struct ctree_path path;
-	struct key key;
-	struct ctree_root *extent_root = root->extent_root;
-	struct tree_buffer *t;
-	int pending_ret;
-	int ret;
-
-	key.objectid = blocknr;
-	key.flags = 0;
-	key.offset = num_blocks;
-	if (root == extent_root) {
-		t = read_tree_block(root, key.objectid);
-		radix_tree_tag_set(&root->cache_radix, key.objectid, CTREE_EXTENT_PENDING);
-		return 0;
-	}
-	init_path(&path);
-	ret = search_slot(extent_root, &key, &path, 0);
-	if (ret)
-		BUG();
-	ret = del_item(extent_root, &path);
-	release_path(extent_root, &path);
-	pending_ret = del_pending_extents(root->extent_root);
-	return ret ? ret : pending_ret;
-}
-
 int next_leaf(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
@@ -976,241 +926,10 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
-int find_free_extent(struct ctree_root *orig_root, u64 num_blocks, u64 search_start,
-			 u64 search_end, struct key *ins)
-{
-	struct ctree_path path;
-	struct key *key;
-	int ret;
-	u64 hole_size = 0;
-	int slot = 0;
-	u64 last_block;
-	int start_found = 0;
-	struct leaf *l;
-	struct ctree_root * root = orig_root->extent_root;
-
-	init_path(&path);
-	ins->objectid = search_start;
-	ins->offset = 0;
-	ins->flags = 0;
-	ret = search_slot(root, ins, &path, 0);
-	while (1) {
-		l = &path.nodes[0]->leaf;
-		slot = path.slots[0];
-		if (!l) {
-			// FIXME allocate root
-		}
-		if (slot >= l->header.nritems) {
-			ret = next_leaf(root, &path);
-			if (ret == 0)
-				continue;
-			if (!start_found) {
-				ins->objectid = search_start;
-				ins->offset = num_blocks;
-				hole_size = search_end - search_start;
-				start_found = 1;
-				goto insert;
-			}
-			ins->objectid = last_block;
-			ins->offset = num_blocks;
-			hole_size = search_end - last_block;
-			goto insert;
-		}
-		key = &l->items[slot].key;
-		if (start_found) {
-			hole_size = key->objectid - last_block;
-			if (hole_size > num_blocks) {
-				ins->objectid = last_block;
-				ins->offset = num_blocks;
-				goto insert;
-			}
-		} else
-			start_found = 1;
-		last_block = key->objectid + key->offset;
-insert_failed:
-		path.slots[0]++;
-	}
-	// FIXME -ENOSPC
-insert:
-	if (orig_root->extent_root == orig_root) {
-		BUG_ON(num_blocks != 1);
-		if ((root->current_insert.objectid <= ins->objectid &&
-		    root->current_insert.objectid + root->current_insert.offset >
-		    ins->objectid) ||
-		   (root->current_insert.objectid > ins->objectid &&
-		    root->current_insert.objectid <= ins->objectid + ins->offset) ||
-		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
-				      CTREE_EXTENT_PENDING)) {
-			last_block = ins->objectid + 1;
-			search_start = last_block;
-			goto insert_failed;
-		}
-	}
-	release_path(root, &path);
-	if (ins->offset != 1)
-		BUG();
-	return 0;
-}
-
-static int insert_pending_extents(struct ctree_root *extent_root)
-{
-	int ret;
-	struct key key;
-	struct extent_item item;
-	struct tree_buffer *gang[4];
-	int i;
-
-	// FIXME -ENOSPC
-	item.refs = 1;
-	item.owner = extent_root->node->node.header.parentid;
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
-						 (void **)gang, 0, ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING);
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			key.objectid = gang[i]->blocknr;
-			key.flags = 0;
-			key.offset = 1;
-			ret = insert_item(extent_root, &key, &item, sizeof(item));
-			if (ret) {
-				BUG();
-				// FIXME undo it and return sane
-				return ret;
-			}
-			radix_tree_tag_clear(&extent_root->cache_radix, gang[i]->blocknr,
-						CTREE_EXTENT_PENDING);
-			tree_block_release(extent_root, gang[i]);
-		}
-	}
-	return 0;
-}
-
-int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
-			 u64 search_end, u64 owner, struct key *ins, struct tree_buffer **buf)
-{
-	int ret;
-	int pending_ret;
-	struct extent_item extent_item;
-
-	extent_item.refs = 1;
-	extent_item.owner = owner;
-
-	ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
-	if (ret)
-		return ret;
-
-	if (root != root->extent_root) {
-		memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));
-		ret = insert_item(root->extent_root, ins, &extent_item, sizeof(extent_item));
-		memset(&root->extent_root->current_insert, 0, sizeof(struct key));
-		pending_ret = insert_pending_extents(root->extent_root);
-		if (ret)
-			return ret;
-		if (pending_ret)
-			return pending_ret;
-		*buf = find_tree_block(root, ins->objectid);
-		return 0;
-	}
-	/* we're allocating an extent for the extent tree, don't recurse */
-	BUG_ON(ins->offset != 1);
-	*buf = find_tree_block(root, ins->objectid);
-	BUG_ON(!*buf);
-	radix_tree_tag_set(&root->cache_radix, ins->objectid, CTREE_EXTENT_PENDING);
-	(*buf)->count++;
-	return 0;
-
-}
-
-struct tree_buffer *alloc_free_block(struct ctree_root *root)
-{
-	struct key ins;
-	int ret;
-	struct tree_buffer *buf = NULL;
-
-	ret = alloc_extent(root, 1, 0, (unsigned long)-1, root->node->node.header.parentid,
-			   &ins, &buf);
-
-	if (ret) {
-		BUG();
-		return NULL;
-	}
-	if (root != root->extent_root)
-		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix, buf->blocknr,
-					  CTREE_EXTENT_PENDING));
-	return buf;
-}
-
-void print_leaf(struct leaf *l)
-{
-	int i;
-	int nr = l->header.nritems;
-	struct item *item;
-	struct extent_item *ei;
-	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
-	       leaf_free_space(l));
-	fflush(stdout);
-	for (i = 0 ; i < nr ; i++) {
-		item = l->items + i;
-		printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
-			i,
-			item->key.objectid, item->key.flags, item->key.offset,
-			item->offset, item->size);
-		fflush(stdout);
-		printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
-		ei = (struct extent_item *)(l->data + item->offset);
-		printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
-		fflush(stdout);
-	}
-}
-void print_tree(struct ctree_root *root, struct tree_buffer *t)
-{
-	int i;
-	int nr;
-	struct node *c;
-
-	if (!t)
-		return;
-	c = &t->node;
-	nr = c->header.nritems;
-	if (c->header.blocknr != t->blocknr)
-		BUG();
-	if (is_leaf(c->header.flags)) {
-		print_leaf((struct leaf *)c);
-		return;
-	}
-	printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
-	        node_level(c->header.flags), c->header.nritems,
-		NODEPTRS_PER_BLOCK - c->header.nritems);
-	fflush(stdout);
-	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%lu %u %lu) block %lu\n",
-		       i,
-		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
-		       c->blockptrs[i]);
-		fflush(stdout);
-	}
-	for (i = 0; i < nr; i++) {
-		struct tree_buffer *next_buf = read_tree_block(root,
-							    c->blockptrs[i]);
-		struct node *next = &next_buf->node;
-		if (is_leaf(next->header.flags) &&
-		    node_level(c->header.flags) != 1)
-			BUG();
-		if (node_level(next->header.flags) !=
-			node_level(c->header.flags) - 1)
-			BUG();
-		print_tree(root, next_buf);
-		tree_block_release(root, next_buf);
-	}
-
-}
-
 /* for testing only */
 int next_key(int i, int max_key) {
-	// return rand() % max_key;
-	return i;
+	return rand() % max_key;
+	// return i;
 }
 
 int main() {
@@ -1221,8 +940,8 @@ int main() {
 	int i;
 	int num;
 	int ret;
-	int run_size = 10000;
-	int max_key = 100000000;
+	int run_size = 20000000;
+	int max_key =  100000000;
 	int tree_size = 0;
 	struct ctree_path path;
 	struct ctree_super_block super;
@@ -1231,11 +950,6 @@ int main() {
 
 
 	root = open_ctree("dbfile", &super);
-	printf("root tree\n");
-	print_tree(root, root->node);
-	printf("map tree\n");
-	print_tree(root->extent_root, root->extent_root->node);
-	fflush(stdout);
 
 	srand(55);
 	for (i = 0; i < run_size; i++) {
@@ -1243,13 +957,15 @@ int main() {
 		num = next_key(i, max_key);
 		// num = i;
 		sprintf(buf, "string-%d", num);
-		// printf("insert %d\n", num);
+		if (i % 10000 == 0)
+			printf("insert %d:%d\n", num, i);
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
+		free(buf);
 	}
 	write_ctree_super(root, &super);
 	close_ctree(root);
@@ -1261,6 +977,8 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
+		if (i % 10000 == 0)
+			printf("search %d:%d\n", num, i);
 		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
@@ -1283,39 +1001,32 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path, 0);
-		if (ret)
-			continue;
-		ret = del_item(root, &path);
-		if (ret != 0)
-			BUG();
+		ret = search_slot(root, &ins, &path, -1);
+		if (!ret) {
+			if (i % 10000 == 0)
+				printf("del %d:%d\n", num, i);
+			ret = del_item(root, &path);
+			if (ret != 0)
+				BUG();
+			tree_size--;
+		}
 		release_path(root, &path);
-		tree_size--;
 	}
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	root = open_ctree("dbfile", &super);
 	srand(128);
 	for (i = 0; i < run_size; i++) {
 		buf = malloc(64);
 		num = next_key(i, max_key);
 		sprintf(buf, "string-%d", num);
 		ins.objectid = num;
+		if (i % 10000 == 0)
+			printf("insert %d:%d\n", num, i);
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
-		if (i >= 5) {
-			struct key ugh;
-			ugh.objectid = 5;
-			ugh.flags = 0;
-			ugh.offset = 0;
-			init_path(&path);
-			ret = search_slot(root, &ugh, &path, 0);
-			if (ret) {
-				print_tree(root, root->node);
-				printf("unable to find 5 %d\n", num);
-				exit(1);
-			}
-			release_path(root, &path);
-
-		}
+		free(buf);
 	}
 	write_ctree_super(root, &super);
 	close_ctree(root);
@@ -1326,6 +1037,8 @@ int main() {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
+		if (i % 10000 == 0)
+			printf("search %d:%d\n", num, i);
 		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
@@ -1340,7 +1053,7 @@ int main() {
 		int slot;
 		ins.objectid = (u64)-1;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path, 0);
+		ret = search_slot(root, &ins, &path, -1);
 		if (ret == 0)
 			BUG();
 
@@ -1356,6 +1069,8 @@ int main() {
 			if (comp_keys(&last, &leaf->items[slot].key) <= 0)
 				BUG();
 			memcpy(&last, &leaf->items[slot].key, sizeof(last));
+			if (tree_size % 10000 == 0)
+				printf("big del %d:%d\n", tree_size, i);
 			ret = del_item(root, &path);
 			if (ret != 0) {
 				printf("del_item returned %d\n", ret);
@@ -1365,10 +1080,9 @@ int main() {
 		}
 		release_path(root, &path);
 	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
 	printf("map tree\n");
-	print_tree(root->extent_root, root->extent_root->node);
+	write_ctree_super(root, &super);
+	close_ctree(root);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8c32c0e9267..b92fbbb5ecd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,7 +1,7 @@
 #ifndef __CTREE__
 #define __CTREE__
 
-#define CTREE_BLOCKSIZE 256
+#define CTREE_BLOCKSIZE 4096
 
 struct key {
 	u64 objectid;
@@ -81,4 +81,14 @@ struct ctree_path {
 	struct tree_buffer *nodes[MAX_LEVEL];
 	int slots[MAX_LEVEL];
 };
+
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
+int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len);
+void release_path(struct ctree_root *root, struct ctree_path *p);
+void init_path(struct ctree_path *p);
+int del_item(struct ctree_root *root, struct ctree_path *path);
+int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size);
+int next_leaf(struct ctree_root *root, struct ctree_path *path);
+int leaf_free_space(struct leaf *leaf);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 14955e44077..f4c6ff202ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -172,7 +172,6 @@ int close_ctree(struct ctree_root *root)
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 {
 	buf->count--;
-	write_tree_block(root, buf);
 	if (buf->count < 0)
 		BUG();
 	if (buf->count == 0) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
new file mode 100644
index 00000000000..594d23b5b24
--- /dev/null
+++ b/fs/btrfs/print-tree.c
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+void print_leaf(struct leaf *l)
+{
+	int i;
+	int nr = l->header.nritems;
+	struct item *item;
+	struct extent_item *ei;
+	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
+	       leaf_free_space(l));
+	fflush(stdout);
+	for (i = 0 ; i < nr ; i++) {
+		item = l->items + i;
+		printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
+			i,
+			item->key.objectid, item->key.flags, item->key.offset,
+			item->offset, item->size);
+		fflush(stdout);
+		printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
+		ei = (struct extent_item *)(l->data + item->offset);
+		printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
+		fflush(stdout);
+	}
+}
+void print_tree(struct ctree_root *root, struct tree_buffer *t)
+{
+	int i;
+	int nr;
+	struct node *c;
+
+	if (!t)
+		return;
+	c = &t->node;
+	nr = c->header.nritems;
+	if (c->header.blocknr != t->blocknr)
+		BUG();
+	if (is_leaf(c->header.flags)) {
+		print_leaf((struct leaf *)c);
+		return;
+	}
+	printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
+	        node_level(c->header.flags), c->header.nritems,
+		NODEPTRS_PER_BLOCK - c->header.nritems);
+	fflush(stdout);
+	for (i = 0; i < nr; i++) {
+		printf("\tkey %d (%lu %u %lu) block %lu\n",
+		       i,
+		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
+		       c->blockptrs[i]);
+		fflush(stdout);
+	}
+	for (i = 0; i < nr; i++) {
+		struct tree_buffer *next_buf = read_tree_block(root,
+							    c->blockptrs[i]);
+		struct node *next = &next_buf->node;
+		if (is_leaf(next->header.flags) &&
+		    node_level(c->header.flags) != 1)
+			BUG();
+		if (node_level(next->header.flags) !=
+			node_level(c->header.flags) - 1)
+			BUG();
+		print_tree(root, next_buf);
+		tree_block_release(root, next_buf);
+	}
+
+}
+
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
new file mode 100644
index 00000000000..3c1e9a3e026
--- /dev/null
+++ b/fs/btrfs/print-tree.h
@@ -0,0 +1,3 @@
+
+void print_leaf(struct leaf *l);
+void print_tree(struct ctree_root *root, struct tree_buffer *t);
-- 
cgit v1.2.3


From 00ec4c5161e0adcf8be3cd844cb40239dc393d70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 Feb 2007 12:47:20 -0500
Subject: Btrfs: push_leaf_right

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 102 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e497fd96311..ef8bfa83753 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -517,6 +517,97 @@ int leaf_space_used(struct leaf *l, int start, int nr)
 	return data_len;
 }
 
+/*
+ * push items from the tail of the path leaf into its right sibling, as many
+ * as fit.  returns 0 if anything moved (the path may then point into the
+ * right leaf), 1 if nothing could be pushed
+ */
+int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
+		   int data_size)
+{
+	struct tree_buffer *left_buf = path->nodes[0];
+	struct leaf *left = &left_buf->leaf;
+	struct leaf *right;
+	struct tree_buffer *right_buf;
+	struct tree_buffer *upper;
+	int slot;
+	int i;
+	int free_space;
+	int push_space = 0;
+	int push_items = 0;
+	struct item *item;
+
+	slot = path->slots[1];
+	if (!path->nodes[1]) {	/* no parent node in the path */
+		return 1;
+	}
+	upper = path->nodes[1];
+	if (slot >= upper->node.header.nritems - 1) {	/* no pointer after ours */
+		return 1;
+	}
+	right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]);
+	right = &right_buf->leaf;
+	free_space = leaf_free_space(right);
+	if (free_space < data_size + sizeof(struct item)) {
+		tree_block_release(root, right_buf);
+		return 1;
+	}
+	for (i = left->header.nritems - 1; i >= 0; i--) {	/* last item first */
+		item = left->items + i;
+		if (path->slots[0] == i)	/* also reserve room for the new item */
+			push_space += data_size + sizeof(*item);
+		if (item->size + sizeof(*item) + push_space > free_space)
+			break;
+		push_items++;
+		push_space += item->size + sizeof(*item);
+	}
+	if (push_items == 0) {
+		tree_block_release(root, right_buf);
+		return 1;
+	}
+	/* push left to right: push_space = data bytes owned by the moved items */
+	push_space = left->items[left->header.nritems - push_items].offset +
+		     left->items[left->header.nritems - push_items].size;
+	push_space -= leaf_data_end(left);
+	/* make room in the right data area by moving its data push_space lower */
+	memmove(right->data + leaf_data_end(right) - push_space,
+		right->data + leaf_data_end(right),
+		LEAF_DATA_SIZE - leaf_data_end(right));
+	/* copy from the left data area into the end of the right data area */
+	memcpy(right->data + LEAF_DATA_SIZE - push_space,
+		left->data + leaf_data_end(left),
+		push_space);
+	memmove(right->items + push_items, right->items,
+		right->header.nritems * sizeof(struct item));	/* open item slots */
+	/* copy the items from left to right */
+	memcpy(right->items, left->items + left->header.nritems - push_items,
+		push_items * sizeof(struct item));
+
+	/* update the item pointers: recompute every right offset from the end */
+	right->header.nritems += push_items;
+	push_space = LEAF_DATA_SIZE;
+	for (i = 0; i < right->header.nritems; i++) {
+		right->items[i].offset = push_space - right->items[i].size;
+		push_space = right->items[i].offset;
+	}
+	left->header.nritems -= push_items;
+
+	write_tree_block(root, left_buf);
+	write_tree_block(root, right_buf);
+	memcpy(upper->node.keys + slot + 1,
+		&right->items[0].key, sizeof(struct key));	/* new first key */
+	write_tree_block(root, upper);
+	/* then fixup the leaf pointer in the path */
+	// FIXME use nritems in here somehow
+	if (path->slots[0] >= left->header.nritems) {
+		path->slots[0] -= left->header.nritems;
+		tree_block_release(root, path->nodes[0]);
+		path->nodes[0] = right_buf;	/* the path keeps the right_buf ref */
+		path->slots[1] += 1;
+	} else {
+		tree_block_release(root, right_buf);
+	}
+	return 0;
+}
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
@@ -631,7 +722,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	int i;
 	int ret;
 
-	if (push_leaf_left(root, path, data_size) == 0) {
+	if (push_leaf_left(root, path, data_size) == 0 ||
+	    push_leaf_right(root, path, data_size) == 0) {
 		l_buf = path->nodes[0];
 		l = &l_buf->leaf;
 		if (leaf_free_space(l) >= sizeof(struct item) + data_size)
@@ -875,6 +967,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			slot = path->slots[1];
 			leaf_buf->count++;
 			push_leaf_left(root, path, 1);
+			if (leaf->header.nritems)
+				push_leaf_right(root, path, 1);
 			if (leaf->header.nritems == 0) {
 				u64 blocknr = leaf_buf->blocknr;
 				path->slots[1] = slot;
@@ -929,7 +1023,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 /* for testing only */
 int next_key(int i, int max_key) {
 	return rand() % max_key;
-	// return i;
+	//return i;
 }
 
 int main() {
@@ -958,7 +1052,7 @@ int main() {
 		// num = i;
 		sprintf(buf, "string-%d", num);
 		if (i % 10000 == 0)
-			printf("insert %d:%d\n", num, i);
+			fprintf(stderr, "insert %d:%d\n", num, i);
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
@@ -978,7 +1072,7 @@ int main() {
 		ins.objectid = num;
 		init_path(&path);
 		if (i % 10000 == 0)
-			printf("search %d:%d\n", num, i);
+			fprintf(stderr, "search %d:%d\n", num, i);
 		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
@@ -1004,7 +1098,7 @@ int main() {
 		ret = search_slot(root, &ins, &path, -1);
 		if (!ret) {
 			if (i % 10000 == 0)
-				printf("del %d:%d\n", num, i);
+				fprintf(stderr, "del %d:%d\n", num, i);
 			ret = del_item(root, &path);
 			if (ret != 0)
 				BUG();
@@ -1022,7 +1116,7 @@ int main() {
 		sprintf(buf, "string-%d", num);
 		ins.objectid = num;
 		if (i % 10000 == 0)
-			printf("insert %d:%d\n", num, i);
+			fprintf(stderr, "insert %d:%d\n", num, i);
 		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
@@ -1038,7 +1132,7 @@ int main() {
 		ins.objectid = num;
 		init_path(&path);
 		if (i % 10000 == 0)
-			printf("search %d:%d\n", num, i);
+			fprintf(stderr, "search %d:%d\n", num, i);
 		ret = search_slot(root, &ins, &path, 0);
 		if (ret) {
 			print_tree(root, root->node);
@@ -1082,6 +1176,7 @@ int main() {
 	}
 	printf("tree size is now %d\n", tree_size);
 	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
 	write_ctree_super(root, &super);
 	close_ctree(root);
 	return 0;
-- 
cgit v1.2.3


From 97571fd0c939be8ae9cb57a8c57430a244ce13ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 Feb 2007 13:39:08 -0500
Subject: Btrfs: cleanup & comment

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ef8bfa83753..7645ab3259e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -110,6 +110,10 @@ int generic_bin_search(char *p, int item_size, struct key *key,
 	return 1;
 }
 
+/*
+ * simple bin_search frontend that does the right thing for
+ * leaves vs nodes
+ */
 int bin_search(struct node *c, struct key *key, int *slot)
 {
 	if (is_leaf(c->header.flags)) {
@@ -130,6 +134,10 @@ int bin_search(struct node *c, struct key *key, int *slot)
  *
  * If the key isn't found, the path points to the slot where it should
  * be inserted.
+ *
+ * if ins_len > 0, nodes and leaves will be split as we walk down the
+ * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
+ * possible)
  */
 int search_slot(struct ctree_root *root, struct key *key,
 		struct ctree_path *p, int ins_len)
@@ -379,6 +387,11 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 	return 0;
 }
 
+/*
+ * helper function to insert a new root level in the tree.
+ * A new node is allocated, and a single item is inserted to
+ * point to the existing root
+ */
 static int insert_new_root(struct ctree_root *root,
 			   struct ctree_path *path, int level)
 {
@@ -417,6 +430,7 @@ static int insert_new_root(struct ctree_root *root,
 /*
  * worker function to insert a single pointer in a node.
  * the node should have enough room for the pointer already
+ *
  * slot and level indicate where you want the key to go, and
  * blocknr is the block the key points to.
  */
@@ -449,6 +463,13 @@ int insert_ptr(struct ctree_root *root,
 	return 0;
 }
 
+/*
+ * split the node at the specified level in path in two.
+ * The path is corrected to point to the appropriate node after the split
+ *
+ * Before splitting this tries to make some room in the node by pushing
+ * left and right, if either one works, it returns right away.
+ */
 int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	struct tree_buffer *t;
@@ -744,10 +765,12 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	right = &right_buffer->leaf;
 	memset(right, 0, sizeof(*right));
 	if (mid <= slot) {
+		/* FIXME, just alloc a new leaf here */
 		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
 			LEAF_DATA_SIZE)
 			BUG();
 	} else {
+		/* FIXME, just alloc a new leaf here */
 		if (leaf_space_used(l, 0, mid + 1) + space_needed >
 			LEAF_DATA_SIZE)
 			BUG();
@@ -983,6 +1006,10 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
+/*
+ * walk up the tree as far as required to find the next leaf.
+ * returns 0 if it found something or -1 if there are no greater leaves.
+ */
 int next_leaf(struct ctree_root *root, struct ctree_path *path)
 {
 	int slot;
@@ -1044,7 +1071,6 @@ int main() {
 
 
 	root = open_ctree("dbfile", &super);
-
 	srand(55);
 	for (i = 0; i < run_size; i++) {
 		buf = malloc(64);
-- 
cgit v1.2.3


From fec577fb7f516e0d12ff821b1af272fd754e120a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Feb 2007 10:40:21 -0500
Subject: Btrfs: Add fsx-style randomized tree tester Add debug-tree command to
 print the tree Add extent-tree.c to the repo Comment ctree.h

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |  10 +-
 fs/btrfs/ctree.c       |   6 +-
 fs/btrfs/ctree.h       |  62 ++++++++++-
 fs/btrfs/debug-tree.c  |  19 ++++
 fs/btrfs/extent-tree.c | 296 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/random-test.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 680 insertions(+), 8 deletions(-)
 create mode 100644 fs/btrfs/debug-tree.c
 create mode 100644 fs/btrfs/extent-tree.c
 create mode 100644 fs/btrfs/random-test.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index fe73ab9d81d..855e8f499e3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,11 +6,17 @@ objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 #.c.o:
 #	$(CC) $(CFLAGS) -c $<
 
-ctree : $(objects)
-	gcc $(CFLAGS) -o ctree $(objects)
+all: tester debug-tree
+
+debug-tree: $(objects) debug-tree.o
+	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
+
+tester: $(objects) random-test.o
+	gcc $(CFLAGS) -o tester $(objects) random-test.o
 
 $(objects) : $(headers)
 
 clean :
 	rm ctree *.o
 
+
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7645ab3259e..25dc7b2f742 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1047,14 +1047,14 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
+/* some sample code to insert,search & delete items */
+#if 0
 /* for testing only */
 int next_key(int i, int max_key) {
 	return rand() % max_key;
 	//return i;
 }
-
 int main() {
-	struct ctree_root *root;
 	struct key ins;
 	struct key last = { (u64)-1, 0, 0};
 	char *buf;
@@ -1066,6 +1066,7 @@ int main() {
 	int tree_size = 0;
 	struct ctree_path path;
 	struct ctree_super_block super;
+	struct ctree_root *root;
 
 	radix_tree_init();
 
@@ -1207,3 +1208,4 @@ int main() {
 	close_ctree(root);
 	return 0;
 }
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b92fbbb5ecd..18daccd8453 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,22 +1,36 @@
 #ifndef __CTREE__
 #define __CTREE__
 
-#define CTREE_BLOCKSIZE 4096
+#define CTREE_BLOCKSIZE 1024
 
+/*
+ * the key defines the order in the tree, and so it also defines (optimal)
+ * block layout.  objectid corresponds to the inode number.  The flags
+ * tells us things about the object, and is a kind of stream selector.
+ * so for a given inode, keys with flags of 1 might refer to the inode
+ * data, flags of 2 may point to file data in the btree and flags == 3
+ * may point to extents.
+ *
+ * offset is the starting byte offset for this key in the stream.
+ */
 struct key {
 	u64 objectid;
 	u32 flags;
 	u64 offset;
 } __attribute__ ((__packed__));
 
+/*
+ * every tree block (leaf or node) starts with this header.
+ */
 struct header {
 	u64 fsid[2]; /* FS specific uuid */
-	u64 blocknr;
-	u64 parentid;
+	u64 blocknr; /* which block this node is supposed to live in */
+	u64 parentid; /* objectid of the tree root */
 	u32 csum;
 	u32 ham;
 	u16 nritems;
 	u16 flags;
+	/* generation flags to be added */
 } __attribute__ ((__packed__));
 
 #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
@@ -28,6 +42,11 @@ struct header {
 
 struct tree_buffer;
 
+/*
+ * in ram representation of the tree.  extent_root is used for all allocations;
+ * for the extent tree itself, extent_root is that same root.  current_insert
+ * is used only for the extent tree.
+ */
 struct ctree_root {
 	struct tree_buffer *node;
 	struct ctree_root *extent_root;
@@ -36,27 +55,46 @@ struct ctree_root {
 	struct radix_tree_root cache_radix;
 };
 
+/*
+ * describes a tree on disk
+ */
 struct ctree_root_info {
 	u64 fsid[2]; /* FS specific uuid */
 	u64 blocknr; /* blocknr of this block */
 	u64 objectid; /* inode number of this root */
-	u64 tree_root; /* the tree root */
+	u64 tree_root; /* the tree root block */
 	u32 csum;
 	u32 ham;
 	u64 snapuuid[2]; /* root specific uuid */
 } __attribute__ ((__packed__));
 
+/*
+ * the super block basically lists the main trees of the FS
+ * it currently lacks any block count etc etc
+ */
 struct ctree_super_block {
 	struct ctree_root_info root_info;
 	struct ctree_root_info extent_info;
 } __attribute__ ((__packed__));
 
+/*
+ * A leaf is full of items.  The exact type of item is defined by
+ * the key flags parameter.  offset and size tell us where to find
+ * the item in the leaf (relative to the start of the data area)
+ */
 struct item {
 	struct key key;
 	u16 offset;
 	u16 size;
 } __attribute__ ((__packed__));
 
+/*
+ * leaves have an item area and a data area:
+ * [item0, item1....itemN] [free space] [dataN...data1, data0]
+ *
+ * The data is separate from the items to get the keys closer together
+ * during searches.
+ */
 #define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
 struct leaf {
 	struct header header;
@@ -66,17 +104,33 @@ struct leaf {
 	};
 } __attribute__ ((__packed__));
 
+/*
+ * all non-leaf blocks are nodes, they hold only keys and pointers to
+ * other blocks
+ */
 struct node {
 	struct header header;
 	struct key keys[NODEPTRS_PER_BLOCK];
 	u64 blockptrs[NODEPTRS_PER_BLOCK];
 } __attribute__ ((__packed__));
 
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
 struct extent_item {
 	u32 refs;
 	u64 owner;
 } __attribute__ ((__packed__));
 
+/*
+ * ctree_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...MAX_LEVEL] will point
+ * to any other levels that are present.
+ *
+ * The slots array records the index of the item or block pointer
+ * used while walking the tree.
+ */
 struct ctree_path {
 	struct tree_buffer *nodes[MAX_LEVEL];
 	int slots[MAX_LEVEL];
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
new file mode 100644
index 00000000000..b2c5abfdc74
--- /dev/null
+++ b/fs/btrfs/debug-tree.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+int main() {	/* print the root tree and the extent tree stored in dbfile */
+	struct ctree_super_block super;
+	struct ctree_root *root;
+	radix_tree_init();
+	root = open_ctree("dbfile", &super);	/* NOTE(review): result not checked */
+	printf("root tree\n");
+	print_tree(root, root->node);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
+	return 0;
+}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
new file mode 100644
index 00000000000..2b663ef8449
--- /dev/null
+++ b/fs/btrfs/extent-tree.c
@@ -0,0 +1,296 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * pending extents are blocks that we're trying to allocate in the extent
+ * map while trying to grow the map because of other allocations.  To avoid
+ * recursing, they are tagged in the radix tree and cleaned up after
+ * other allocations are done.  The pending tag is also used in the same
+ * manner for deletes.
+ */
+#define CTREE_EXTENT_PENDING 0
+
+/*
+ * find all the blocks marked as pending in the radix tree, remove them from
+ * the extent map, clear the tag and release the buffer.  returns 0
+ */
+static int del_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	struct key key;
+	struct tree_buffer *gang[4];
+	int i, nr;	/* nr: buffers returned by the current gang lookup */
+	struct ctree_path path;
+
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0,
+						ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {	/* ret is reused below, so bound on nr */
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			init_path(&path);
+			ret = search_slot(extent_root, &key, &path, 0);
+			if (ret) {
+				print_tree(extent_root, extent_root->node);
+				printf("unable to find %lu\n", key.objectid);
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			ret = del_item(extent_root, &path);
+			if (ret) {
+				BUG();
+				return ret;
+			}
+			release_path(extent_root, &path);
+			radix_tree_tag_clear(&extent_root->cache_radix,
+						gang[i]->blocknr,
+						CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * drop the extent item keyed (blocknr, 0, num_blocks); returns 0 on success
+ */
+int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+{
+	struct ctree_path path;
+	struct key key;
+	struct ctree_root *extent_root = root->extent_root;
+	struct tree_buffer *t;	/* assigned below, never read afterwards */
+	int pending_ret;
+	int ret;
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = num_blocks;
+	if (root == extent_root) {	/* freeing a block of the extent tree itself */
+		t = read_tree_block(root, key.objectid);	/* not released here */
+		radix_tree_tag_set(&root->cache_radix, key.objectid,
+				   CTREE_EXTENT_PENDING);	/* del_pending_extents finishes it */
+		return 0;
+	}
+	init_path(&path);
+	ret = search_slot(extent_root, &key, &path, 0);
+	if (ret) {
+		print_tree(extent_root, extent_root->node);
+		printf("failed to find %lu\n", key.objectid);
+		BUG();
+	}
+	ret = del_item(extent_root, &path);
+	if (ret)
+		BUG();
+	release_path(extent_root, &path);
+	pending_ret = del_pending_extents(root->extent_root);	/* deferred frees */
+	return ret ? ret : pending_ret;
+}
+
+/*
+ * walks the btree of allocated extents and finds a hole of a given size.
+ * The key ins is changed to record the hole:
+ * ins->objectid == block start
+ * ins->flags = 0
+ * ins->offset == number of blocks
+ * Any available blocks before search_start are skipped; search_end is unused.
+ */
+int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+		     u64 search_start, u64 search_end, struct key *ins)
+{
+	struct ctree_path path;
+	struct key *key;
+	int ret;
+	u64 hole_size = 0;
+	int slot = 0;
+	u64 last_block;	/* only read once start_found has been set */
+	int start_found;
+	struct leaf *l;
+	struct ctree_root * root = orig_root->extent_root;
+
+check_failed:
+	init_path(&path);
+	ins->objectid = search_start;
+	ins->offset = 0;
+	ins->flags = 0;
+	start_found = 0;
+	ret = search_slot(root, ins, &path, 0);	/* return value is not examined */
+	while (1) {
+		l = &path.nodes[0]->leaf;
+		slot = path.slots[0];
+		if (slot >= l->header.nritems) {	/* ran off the end of this leaf */
+			ret = next_leaf(root, &path);
+			if (ret == 0)
+				continue;
+			if (!start_found) {	/* no extent at or past search_start */
+				ins->objectid = search_start;
+				ins->offset = num_blocks;
+				start_found = 1;
+				goto check_pending;
+			}
+			ins->objectid = last_block > search_start ?
+					last_block : search_start;
+			ins->offset = num_blocks;
+			goto check_pending;
+		}
+		key = &l->items[slot].key;
+		if (key->objectid >= search_start) {
+			if (start_found) {
+				hole_size = key->objectid - last_block;
+				if (hole_size > num_blocks) {	/* gap is big enough */
+					ins->objectid = last_block;
+					ins->offset = num_blocks;
+					goto check_pending;
+				}
+			} else
+				start_found = 1;
+			last_block = key->objectid + key->offset;	/* end of extent */
+		}
+		path.slots[0]++;
+	}
+	// FIXME -ENOSPC
+check_pending:
+	/* we have to make sure we didn't find an extent that has already
+	 * been allocated by the map tree or the original allocation
+	 */
+	release_path(root, &path);
+	BUG_ON(ins->objectid < search_start);
+	if (orig_root->extent_root == orig_root) {	/* allocating for the map tree */
+		BUG_ON(num_blocks != 1);
+		if ((root->current_insert.objectid <= ins->objectid &&
+		    root->current_insert.objectid +
+		    root->current_insert.offset > ins->objectid) ||
+		   (root->current_insert.objectid > ins->objectid &&
+		    root->current_insert.objectid <= ins->objectid +
+		    ins->offset) ||
+		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
+				      CTREE_EXTENT_PENDING)) {
+			search_start = ins->objectid + 1;	/* retry one block later */
+			goto check_failed;
+		}
+	}
+	if (ins->offset != 1)	/* anything but a single block result aborts */
+		BUG();
+	return 0;
+}
+
+/*
+ * insert an extent item for every block tagged CTREE_EXTENT_PENDING during
+ * the original allocation, then untag and release it.  Returns zero
+ */
+static int insert_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	struct key key;
+	struct extent_item item;
+	struct tree_buffer *gang[4];
+	int i, nr;	/* nr: buffers returned by the current gang lookup */
+
+	// FIXME -ENOSPC
+	item.refs = 1;
+	item.owner = extent_root->node->node.header.parentid;
+	while(1) {
+		nr = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						(void **)gang, 0,
+						ARRAY_SIZE(gang),
+						CTREE_EXTENT_PENDING);
+		if (!nr)
+			break;
+		for (i = 0; i < nr; i++) {	/* ret is reused below, so bound on nr */
+			key.objectid = gang[i]->blocknr;
+			key.flags = 0;
+			key.offset = 1;
+			ret = insert_item(extent_root, &key, &item,
+					  sizeof(item));
+			if (ret) {
+				BUG();
+				// FIXME undo it and return sane
+				return ret;
+			}
+			radix_tree_tag_clear(&extent_root->cache_radix,
+					     gang[i]->blocknr,
+					     CTREE_EXTENT_PENDING);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * finds a free extent and does all the dirty work required for allocation.
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.  Allocations for the extent
+ * tree itself are only tagged pending here instead of being inserted.
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+			 u64 search_end, u64 owner, struct key *ins,
+			 struct tree_buffer **buf)
+{
+	int ret;
+	int pending_ret;
+	struct extent_item extent_item;
+	extent_item.refs = 1;
+	extent_item.owner = owner;
+
+	ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
+	if (ret)
+		return ret;
+	if (root != root->extent_root) {
+		/* publish the range so find_free_extent skips it meanwhile */
+		memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));
+		ret = insert_item(root->extent_root, ins, &extent_item,
+				  sizeof(extent_item));
+		memset(&root->extent_root->current_insert, 0,
+		       sizeof(struct key));
+		pending_ret = insert_pending_extents(root->extent_root);
+		if (ret)	/* the insert_item failure wins over pending_ret */
+			return ret;
+		if (pending_ret)
+			return pending_ret;
+		*buf = find_tree_block(root, ins->objectid);	/* not NULL checked */
+		return 0;
+	}
+	/* we're allocating an extent for the extent tree, don't recurse */
+	BUG_ON(ins->offset != 1);
+	*buf = find_tree_block(root, ins->objectid);
+	BUG_ON(!*buf);
+	radix_tree_tag_set(&root->cache_radix, ins->objectid,
+			   CTREE_EXTENT_PENDING);	/* insert_pending_extents adds it */
+	(*buf)->count++;	/* presumably the ref insert_pending_extents releases */
+	return 0;
+}
+
+/*
+ * helper function to allocate a single block for a given tree, owned by the
+ * parentid found in the root node's header.  returns the tree buffer or NULL.
+ */
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+	struct key ins;
+	int ret;
+	struct tree_buffer *buf = NULL;
+
+	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
+			   root->node->node.header.parentid,
+			   &ins, &buf);
+
+	if (ret) {
+		BUG();	/* abort() via kerncompat.h, so the return is not reached */
+		return NULL;
+	}
+	if (root != root->extent_root)	/* pending tags must be flushed by now */
+		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix,
+					  buf->blocknr, CTREE_EXTENT_PENDING));
+	return buf;
+}
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
new file mode 100644
index 00000000000..3c8c68d55d2
--- /dev/null
+++ b/fs/btrfs/random-test.c
@@ -0,0 +1,295 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+volatile sig_atomic_t keep_running = 1;	/* cleared by sigstopper, polled by main */
+
+/* fill key with a random objectid that is (exists) or is not in the radix */
+static int setup_key(struct radix_tree_root *root, struct key *key, int exists)
+{
+	int num = rand();
+	unsigned long res[2];	/* up to two radix entries found from num on */
+	int ret;
+	key->flags = 0;
+	key->offset = 0;
+again:
+	ret = radix_tree_gang_lookup(root, (void **)res, num, 2);
+	if (exists) {
+		if (ret == 0)	/* lookup found nothing, caller must skip the op */
+			return -1;
+		num = res[0];	/* use the first entry the lookup returned */
+	} else if (ret != 0 && num == res[0]) {	/* num is taken, try num + 1 */
+		num++;
+		if (ret > 1 && num == res[1]) {	/* also taken, look again past it */
+			num++;
+			goto again;
+		}
+	}
+	key->objectid = num;
+	return 0;
+}
+
+static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* insert one new random key into the tree, then mirror it in radix */
+	struct ctree_path path;	/* initialized below but otherwise unused */
+	struct key key;
+	int ret;
+	char buf[128];
+	init_path(&path);
+	ret = setup_key(radix, &key, 0);
+	sprintf(buf, "str-%lu\n", key.objectid);
+	ret = insert_item(root, &key, buf, strlen(buf));
+	if (ret)
+		goto error;
+	radix_tree_preload(GFP_KERNEL);
+	ret = radix_tree_insert(radix, key.objectid,	/* value is the key itself */
+					(void *)key.objectid);
+	radix_tree_preload_end();
+	if (ret)	/* tree insert worked but the radix insert did not */
+		goto error;
+	return ret;
+error:
+	printf("failed to insert %lu\n", key.objectid);
+	return -1;
+}
+
+static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* re-insert a key the radix already holds; only -EEXIST is a pass */
+	struct ctree_path path;	/* initialized below but otherwise unused */
+	struct key key;
+	int ret;
+	char buf[128];
+	init_path(&path);
+	ret = setup_key(radix, &key, 1);
+	if (ret < 0)
+		return 0;
+	sprintf(buf, "str-%lu\n", key.objectid);
+	ret = insert_item(root, &key, buf, strlen(buf));
+	if (ret != -EEXIST) {
+		printf("insert on %lu gave us %d\n", key.objectid, ret);
+		return 1;
+	}
+	return 0;
+}
+
+static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* delete one key known to the radix from both the tree and the radix */
+	struct ctree_path path;
+	struct key key;
+	int ret;
+	unsigned long *ptr;
+	init_path(&path);
+	ret = setup_key(radix, &key, 1);
+	if (ret < 0)	/* radix had no key to offer, nothing to delete */
+		return 0;
+	ret = search_slot(root, &key, &path, -1);
+	/* release the path on every outcome, as the lookup helpers do */
+	if (ret == 0)
+		ret = del_item(root, &path);
+	release_path(root, &path);
+	if (ret != 0)
+		goto error;
+	ptr = radix_tree_delete(radix, key.objectid);
+	if (!ptr)
+		goto error;
+	return 0;
+error:
+	printf("failed to delete %lu\n", key.objectid);
+	return -1;
+}
+
+static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* a key taken from the radix must be found in the tree */
+	struct ctree_path path;
+	struct key key;
+	int ret;
+	init_path(&path);
+	ret = setup_key(radix, &key, 1);
+	if (ret < 0)	/* radix had no key to offer, nothing to check */
+		return 0;
+	ret = search_slot(root, &key, &path, 0);
+	release_path(root, &path);	/* released whether or not the key was found */
+	if (ret)
+		goto error;
+	return 0;
+error:
+	printf("unable to find key %lu\n", key.objectid);
+	return -1;
+}
+
+static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* a key missing from the radix must not be found in the tree */
+	struct ctree_path path;
+	struct key key;
+	int ret;
+	init_path(&path);
+	ret = setup_key(radix, &key, 0);
+	if (ret < 0)
+		return ret;
+	ret = search_slot(root, &key, &path, 0);
+	release_path(root, &path);	/* released whether or not the key was found */
+	if (ret == 0)	/* zero means search_slot found the key */
+		goto error;
+	return 0;
+error:
+	printf("able to find key that should not exist %lu\n", key.objectid);
+	return -1;
+}
+
+int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
+{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent };	/* main picks one at random */
+
+static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
+{	/* copy every objectid already in the tree into radix, highest first */
+	struct ctree_path path;
+	struct key key;
+	u64 found;
+	int ret;
+	int slot;
+	int i;
+	key.offset = 0;
+	key.flags = 0;
+	key.objectid = (unsigned long)-1;	/* start from the largest objectid */
+	while(1) {
+		init_path(&path);
+		ret = search_slot(root, &key, &path, 0);
+		slot = path.slots[0];
+		if (ret != 0) {	/* not found: slot is where the key would go */
+			if (slot == 0) {	/* no item in front of it, finished */
+				release_path(root, &path);
+				break;
+			}
+			slot -= 1;	/* step back onto the last existing item */
+		}
+		for (i = slot; i >= 0; i--) {	/* this item and all before it */
+			found = path.nodes[0]->leaf.items[i].key.objectid;
+			radix_tree_preload(GFP_KERNEL);
+			ret = radix_tree_insert(radix, found, (void *)found);
+			if (ret) {
+				fprintf(stderr,
+					"failed to insert %lu into radix\n",
+					found);
+				exit(1);
+			}
+
+			radix_tree_preload_end();
+		}
+		release_path(root, &path);
+		key.objectid = found - 1;	/* continue below the smallest seen */
+		if (key.objectid > found)	/* found was 0 and the subtraction wrapped */
+			break;
+	}
+	return 0;
+}
+
+void sigstopper(int ignored)	/* SIGTERM/SIGINT handler installed by main */
+{
+	keep_running = 0;	/* main polls this and exits through its out: label */
+	fprintf(stderr, "caught exit signal, stopping\n");	/* NOTE(review): fprintf is not async-signal-safe */
+}
+
+int print_usage(void)	/* prints the option summary and exits, never returns */
+{
+	printf("usage: tester [-ih] [-c count] [-f count]\n");
+	printf("\t -c count -- iteration count after filling\n");
+	printf("\t -f count -- run this many random inserts before starting\n");
+	printf("\t -i       -- only do initial fill\n");
+	printf("\t -h       -- this help text\n");
+	exit(1);	/* exit status is 1 even when reached through -h */
+}
+/* tester: fill dbfile with random keys, then run random ops checked by radix */
+int main(int ac, char **av)
+{
+	RADIX_TREE(radix, GFP_KERNEL);
+	struct ctree_super_block super;
+	struct ctree_root *root;
+	int i;
+	int ret;
+	int count;
+	int op;
+	int iterations = 20000;
+	int init_fill_count = 800000;
+	int err = 0;
+	int initial_only = 0;
+	radix_tree_init();
+	root = open_ctree("dbfile", &super);	/* NOTE(review): result not checked */
+	fill_radix(root, &radix);
+
+	signal(SIGTERM, sigstopper);
+	signal(SIGINT, sigstopper);
+
+	for (i = 1 ; i < ac ; i++) {
+		if (strcmp(av[i], "-i") == 0) {
+			initial_only = 1;
+		} else if (strcmp(av[i], "-c") == 0 && i + 1 < ac) {
+			iterations = atoi(av[i+1]);
+			i++;
+		} else if (strcmp(av[i], "-f") == 0 && i + 1 < ac) {
+			init_fill_count = atoi(av[i+1]);
+			i++;
+		} else {	/* unknown option, -h, or -c/-f without a count */
+			print_usage();
+		}
+	}
+	for (i = 0; i < init_fill_count; i++) {
+		ret = ins_one(root, &radix);
+		if (ret) {
+			printf("initial fill failed\n");
+			err = ret;
+			goto out;
+		}
+		if (i % 10000 == 0) {
+			printf("initial fill %d level %d count %d\n", i,
+				node_level(root->node->node.header.flags),
+				root->node->node.header.nritems);
+		}
+		if (keep_running == 0) {
+			err = 0;
+			goto out;
+		}
+	}
+	if (initial_only == 1) {
+		goto out;
+	}
+	for (i = 0; i < iterations; i++) {
+		op = rand() % ARRAY_SIZE(ops);
+		count = rand() % 128;	/* repeat the chosen op this many times */
+		if (i % 2000 == 0) {
+			printf("%d\n", i);
+			fflush(stdout);
+		}
+		if (i && i % 5000 == 0) {	/* periodically reopen the tree file */
+			printf("open & close, root level %d nritems %d\n",
+				node_level(root->node->node.header.flags),
+				root->node->node.header.nritems);
+			write_ctree_super(root, &super);
+			close_ctree(root);
+			root = open_ctree("dbfile", &super);
+		}
+		while(count--) {
+			ret = ops[op](root, &radix);
+			if (ret) {
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				print_tree(root, root->node);
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				err = ret;
+				goto out;
+			}
+			if (keep_running == 0) {
+				err = 0;
+				goto out;
+			}
+		}
+	}
+out:
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	return err;
+}
-- 
cgit v1.2.3


From c673024aba596e57c07196cb3400cdcc9d28f3aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Feb 2007 10:46:55 -0500
Subject: Btrfs: fixup dbfile perms

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f4c6ff202ba..c42dc72706b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -113,7 +113,7 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 	int fp;
 	int ret;
 
-	fp = open(filename, O_CREAT | O_RDWR);
+	fp = open(filename, O_CREAT | O_RDWR, 0600);
 	if (fp < 0) {
 		free(root);
 		return NULL;
-- 
cgit v1.2.3


From 7cf75962acd27ed0f819df791486e7b803c7109c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Feb 2007 10:55:01 -0500
Subject: Btrfs: u64 cleanups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |  2 +-
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/kerncompat.h  |  2 +-
 fs/btrfs/print-tree.c  | 10 +++++-----
 fs/btrfs/random-test.c | 16 ++++++++--------
 5 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 855e8f499e3..ea9b6665da3 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -17,6 +17,6 @@ tester: $(objects) random-test.o
 $(objects) : $(headers)
 
 clean :
-	rm ctree *.o
+	rm debug-tree tester *.o
 
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2b663ef8449..26321524c18 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,7 +42,7 @@ static int del_pending_extents(struct ctree_root *extent_root)
 			ret = search_slot(extent_root, &key, &path, 0);
 			if (ret) {
 				print_tree(extent_root, extent_root->node);
-				printf("unable to find %lu\n", key.objectid);
+				printf("unable to find %Lu\n", key.objectid);
 				BUG();
 				// FIXME undo it and return sane
 				return ret;
@@ -86,7 +86,7 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	ret = search_slot(extent_root, &key, &path, 0);
 	if (ret) {
 		print_tree(extent_root, extent_root->node);
-		printf("failed to find %lu\n", key.objectid);
+		printf("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
 	ret = del_item(extent_root, &path);
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index 347ca06205c..f5efc5f0fff 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -15,7 +15,7 @@
 #define BUG() abort()
 
 typedef unsigned int u32;
-typedef unsigned long u64;
+typedef unsigned long long u64;
 typedef unsigned char u8;
 typedef unsigned short u16;
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 594d23b5b24..1d591270f4c 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -11,19 +11,19 @@ void print_leaf(struct leaf *l)
 	int nr = l->header.nritems;
 	struct item *item;
 	struct extent_item *ei;
-	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
+	printf("leaf %Lu total ptrs %d free space %d\n", l->header.blocknr, nr,
 	       leaf_free_space(l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
-		printf("\titem %d key (%lu %u %lu) itemoff %d itemsize %d\n",
+		printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
 			i,
 			item->key.objectid, item->key.flags, item->key.offset,
 			item->offset, item->size);
 		fflush(stdout);
 		printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
 		ei = (struct extent_item *)(l->data + item->offset);
-		printf("\t\textent data %u %lu\n", ei->refs, ei->owner);
+		printf("\t\textent data %u %Lu\n", ei->refs, ei->owner);
 		fflush(stdout);
 	}
 }
@@ -43,12 +43,12 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 		print_leaf((struct leaf *)c);
 		return;
 	}
-	printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
+	printf("node %Lu level %d total ptrs %d free spc %lu\n", t->blocknr,
 	        node_level(c->header.flags), c->header.nritems,
 		NODEPTRS_PER_BLOCK - c->header.nritems);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%lu %u %lu) block %lu\n",
+		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
 		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
 		       c->blockptrs[i]);
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 3c8c68d55d2..111a248bd5c 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -42,7 +42,7 @@ static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
 	char buf[128];
 	init_path(&path);
 	ret = setup_key(radix, &key, 0);
-	sprintf(buf, "str-%lu\n", key.objectid);
+	sprintf(buf, "str-%Lu\n", key.objectid);
 	ret = insert_item(root, &key, buf, strlen(buf));
 	if (ret)
 		goto error;
@@ -54,7 +54,7 @@ static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
 		goto error;
 	return ret;
 error:
-	printf("failed to insert %lu\n", key.objectid);
+	printf("failed to insert %Lu\n", key.objectid);
 	return -1;
 }
 
@@ -68,10 +68,10 @@ static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	sprintf(buf, "str-%lu\n", key.objectid);
+	sprintf(buf, "str-%Lu\n", key.objectid);
 	ret = insert_item(root, &key, buf, strlen(buf));
 	if (ret != -EEXIST) {
-		printf("insert on %lu gave us %d\n", key.objectid, ret);
+		printf("insert on %Lu gave us %d\n", key.objectid, ret);
 		return 1;
 	}
 	return 0;
@@ -99,7 +99,7 @@ static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
 		goto error;
 	return 0;
 error:
-	printf("failed to delete %lu\n", key.objectid);
+	printf("failed to delete %Lu\n", key.objectid);
 	return -1;
 }
 
@@ -118,7 +118,7 @@ static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
 		goto error;
 	return 0;
 error:
-	printf("unable to find key %lu\n", key.objectid);
+	printf("unable to find key %Lu\n", key.objectid);
 	return -1;
 }
 
@@ -137,7 +137,7 @@ static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
 		goto error;
 	return 0;
 error:
-	printf("able to find key that should not exist %lu\n", key.objectid);
+	printf("able to find key that should not exist %Lu\n", key.objectid);
 	return -1;
 }
 
@@ -148,7 +148,7 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
 	struct key key;
-	u64 found;
+	unsigned long found;
 	int ret;
 	int slot;
 	int i;
-- 
cgit v1.2.3


From 41903fe666ff8e7b1d47ac7b2f5f9a91c7749498 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Feb 2007 10:55:42 -0500
Subject: Btrfs: 32bit cleanups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/print-tree.c  | 2 +-
 fs/btrfs/random-test.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1d591270f4c..a9a14223e18 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -43,7 +43,7 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 		print_leaf((struct leaf *)c);
 		return;
 	}
-	printf("node %Lu level %d total ptrs %d free spc %lu\n", t->blocknr,
+	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
 	        node_level(c->header.flags), c->header.nritems,
 		NODEPTRS_PER_BLOCK - c->header.nritems);
 	fflush(stdout);
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 111a248bd5c..7078263acc3 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -40,15 +40,16 @@ static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
 	struct key key;
 	int ret;
 	char buf[128];
+	unsigned long oid;
 	init_path(&path);
 	ret = setup_key(radix, &key, 0);
 	sprintf(buf, "str-%Lu\n", key.objectid);
 	ret = insert_item(root, &key, buf, strlen(buf));
 	if (ret)
 		goto error;
+	oid = (unsigned long)key.objectid;
 	radix_tree_preload(GFP_KERNEL);
-	ret = radix_tree_insert(radix, key.objectid,
-					(void *)key.objectid);
+	ret = radix_tree_insert(radix, oid, (void *)oid);
 	radix_tree_preload_end();
 	if (ret)
 		goto error;
-- 
cgit v1.2.3


From d16302ab0ed26e236e9bde8cd54fd604d7d97297 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Feb 2007 14:26:58 -0500
Subject: Btrfs: more 32 bit cleanups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/print-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index a9a14223e18..dda08f32c15 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,7 +45,7 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 	}
 	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
 	        node_level(c->header.flags), c->header.nritems,
-		NODEPTRS_PER_BLOCK - c->header.nritems);
+		(u32)NODEPTRS_PER_BLOCK - c->header.nritems);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
 		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
-- 
cgit v1.2.3


From 8e19f2cd452853d1ca0895c2488c24d40de3d255 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Feb 2007 09:27:02 -0500
Subject: Btrfs: Take out the merge-during-search-on-delete code, it is buggy.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO    | 25 +++++++++++++++++++++++++
 fs/btrfs/ctree.c | 42 ++++++++++++++----------------------------
 2 files changed, 39 insertions(+), 28 deletions(-)
 create mode 100644 fs/btrfs/TODO

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
new file mode 100644
index 00000000000..5c4395c3784
--- /dev/null
+++ b/fs/btrfs/TODO
@@ -0,0 +1,25 @@
+* cleanup, add more error checking, get rid of BUG_ONs
+* Make IO functions look more like the page cache
+* Fix ENOSPC handling
+* make blocksize a mkfs parameter instead of #define
+* make a real mkfs and superblock
+* Add shadowing and transactions
+* Do checksumming
+* Define FS objects in terms of different item types
+* Add block mapping tree (simple dm layer)
+* Add simple tree locking (semaphore per tree)
+* Make allocator smarter
+* Port into the kernel
+* Add virtual filesystems, mountable snapshots
+* Get rid of struct ctree_path, limiting tree levels held at one time
+* Release
+* Do real tree locking
+* Add extent mirroring (backup copies of blocks)
+* Add fancy interface to get access to incremental backups
+* Add fancy striped extents to make big reads faster
+* Use relocation to try and fix write errors
+* Make allocator much smarter
+* xattrs (directory streams for regular files)
+* fsck
+* Scrub & defrag
+
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 25dc7b2f742..0aea94224ba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -167,29 +167,6 @@ int search_slot(struct ctree_root *root, struct key *key,
 				b = p->nodes[level];
 				c = &b->node;
 				slot = p->slots[level];
-			} else if (ins_len < 0 &&
-				   c->header.nritems <= NODEPTRS_PER_BLOCK/4) {
-				u64 blocknr = b->blocknr;
-				slot = p->slots[level +1];
-				b->count++;
-				if (push_node_left(root, p, level))
-					push_node_right(root, p, level);
-				if (c->header.nritems == 0 &&
-				    level < MAX_LEVEL - 1 &&
-				    p->nodes[level + 1]) {
-					int tslot = p->slots[level + 1];
-
-					p->slots[level + 1] = slot;
-					del_ptr(root, p, level + 1);
-					p->slots[level + 1] = tslot;
-					tree_block_release(root, b);
-					free_extent(root, blocknr, 1);
-				} else {
-					tree_block_release(root, b);
-				}
-				b = p->nodes[level];
-				c = &b->node;
-				slot = p->slots[level];
 			}
 			b = read_tree_block(root, c->blockptrs[slot]);
 			continue;
@@ -618,7 +595,6 @@ int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		&right->items[0].key, sizeof(struct key));
 	write_tree_block(root, upper);
 	/* then fixup the leaf pointer in the path */
-	// FIXME use nritems in here somehow
 	if (path->slots[0] >= left->header.nritems) {
 		path->slots[0] -= left->header.nritems;
 		tree_block_release(root, path->nodes[0]);
@@ -847,8 +823,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 
 	slot = path.slots[0];
 	BUG_ON(slot < 0);
-	if (slot == 0)
-		fixup_low_keys(root, &path, key, 1);
 	if (slot != nritems) {
 		int i;
 		unsigned int old_data = leaf->items[slot].offset +
@@ -877,6 +851,8 @@ int insert_item(struct ctree_root *root, struct key *key,
 	memcpy(leaf->data + data_end - data_size, data, data_size);
 	leaf->header.nritems += 1;
 	write_tree_block(root, leaf_buf);
+	if (slot == 0)
+		fixup_low_keys(root, &path, key, 1);
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	release_path(root, &path);
@@ -914,13 +890,23 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 				sizeof(u64) * (nritems - slot - 1));
 		}
 		node->header.nritems--;
-		write_tree_block(root, t);
 		blocknr = t->blocknr;
+		write_tree_block(root, t);
 		if (node->header.nritems != 0) {
+			int tslot;
 			if (slot == 0)
 				fixup_low_keys(root, path, node->keys,
 					       level + 1);
-			break;
+			tslot = path->slots[level + 1];
+			t->count++;
+			if (push_node_left(root, path, level))
+				push_node_right(root, path, level);
+			path->slots[level + 1] = tslot;
+			if (node->header.nritems != 0) {
+				tree_block_release(root, t);
+				break;
+			}
+			tree_block_release(root, t);
 		}
 		if (t == root->node) {
 			/* just turn the root into a leaf and break */
-- 
cgit v1.2.3


From af86d07efe83e2f2da3658702a720bc48b1a6d78 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Feb 2007 09:40:58 -0500
Subject: Btrfs: Add sparse checking to Makefile

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     | 9 +++++++--
 fs/btrfs/debug-tree.c | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ea9b6665da3..078061c321b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,8 +3,13 @@ CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
-#.c.o:
-#	$(CC) $(CFLAGS) -c $<
+# if you don't have sparse installed, use ls instead
+check=sparse
+#check=ls
+
+.c.o:
+	$(check) $<
+	$(CC) $(CFLAGS) -c $<
 
 all: tester debug-tree
 
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index b2c5abfdc74..21f607d8b05 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -6,7 +6,7 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-int main() {
+int main(int ac, char **av) {
 	struct ctree_super_block super;
 	struct ctree_root *root;
 	radix_tree_init();
-- 
cgit v1.2.3


From aa5d6bed255d7f8c655a8f10d760f4247bc8385c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Feb 2007 16:35:06 -0500
Subject: Btrfs: return code checking

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   5 +-
 fs/btrfs/ctree.c       | 480 ++++++++++++++++++++++++++-----------------------
 fs/btrfs/extent-tree.c |   5 +
 fs/btrfs/random-test.c |   7 +-
 4 files changed, 269 insertions(+), 228 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 078061c321b..58d4260a04b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,10 +1,13 @@
 
+CC=gcc
 CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
 # if you don't have sparse installed, use ls instead
-check=sparse
+CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
+		-Wcontext -Wcast-truncate -Wuninitialized -Wshadow -Wundef
+check=sparse $(CHECKFLAGS)
 #check=ls
 
 .c.o:
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0aea94224ba..be2be027251 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -6,12 +6,15 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-int split_node(struct ctree_root *root, struct ctree_path *path, int level);
-int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size);
-int push_node_left(struct ctree_root *root, struct ctree_path *path, int level);
-int push_node_right(struct ctree_root *root,
+static int split_node(struct ctree_root *root, struct ctree_path *path,
+		      int level);
+static int split_leaf(struct ctree_root *root, struct ctree_path *path,
+		      int data_size);
+static int push_node_left(struct ctree_root *root, struct ctree_path *path,
+			  int level);
+static int push_node_right(struct ctree_root *root,
 		    struct ctree_path *path, int level);
-int del_ptr(struct ctree_root *root, struct ctree_path *path, int level);
+static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level);
 
 inline void init_path(struct ctree_path *p)
 {
@@ -26,6 +29,7 @@ void release_path(struct ctree_root *root, struct ctree_path *p)
 			break;
 		tree_block_release(root, p->nodes[i]);
 	}
+	memset(p, 0, sizeof(*p));
 }
 
 /*
@@ -74,6 +78,96 @@ int comp_keys(struct key *k1, struct key *k2)
 	return 0;
 }
 
+/*
+ * sanity check the interior node at path->nodes[level].  If the path holds
+ * a parent at level + 1, the parent's key and block pointer in
+ * path->slots[level + 1] must match this node's first key and block number.
+ * The keys inside the node must be in strictly increasing order.
+ *
+ * Any inconsistency trips a BUG_ON; returns 0 otherwise.
+ */
+int check_node(struct ctree_path *path, int level)
+{
+	int i;
+	struct node *parent = NULL;
+	struct node *node = &path->nodes[level]->node;
+	int nritems = node->header.nritems;
+	int parent_slot = 0;
+
+	/* nodes[] and slots[] only have MAX_LEVEL entries */
+	if (level < MAX_LEVEL - 1 && path->nodes[level + 1]) {
+		parent = &path->nodes[level + 1]->node;
+		parent_slot = path->slots[level + 1];
+	}
+	if (parent && nritems > 0) {
+		struct key *parent_key;
+		parent_key = &parent->keys[parent_slot];
+		BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key)));
+		BUG_ON(parent->blockptrs[parent_slot] != node->header.blocknr);
+	}
+	BUG_ON(node->header.nritems > NODEPTRS_PER_BLOCK);
+	/* nritems - 1 adjacent pairs; the bound is negative when empty */
+	for (i = 0; i < nritems - 1; i++) {
+		BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0);
+	}
+	return 0;
+}
+
+/*
+ * sanity check the leaf at path->nodes[level].  The parent (if the path
+ * has one) must point at this leaf with the leaf's first key, item keys
+ * must be strictly increasing, and item data must be packed back to back
+ * with the first item's data ending at LEAF_DATA_SIZE.
+ *
+ * Any inconsistency trips a BUG_ON; returns 0 otherwise.
+ */
+int check_leaf(struct ctree_path *path, int level)
+{
+	int i;
+	struct leaf *leaf = &path->nodes[level]->leaf;
+	int nritems = leaf->header.nritems;
+	struct node *parent = NULL;
+	int parent_slot = 0;
+
+	/* nodes[] and slots[] only have MAX_LEVEL entries */
+	if (level < MAX_LEVEL - 1 && path->nodes[level + 1]) {
+		parent = &path->nodes[level + 1]->node;
+		parent_slot = path->slots[level + 1];
+	}
+	if (parent && nritems > 0) {
+		struct key *parent_key;
+		parent_key = &parent->keys[parent_slot];
+		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
+		       sizeof(struct key)));
+		BUG_ON(parent->blockptrs[parent_slot] != leaf->header.blocknr);
+	}
+	/* the first item's data sits at the very end of the data area */
+	if (nritems > 0) {
+		BUG_ON(leaf->items[0].offset + leaf->items[0].size !=
+			LEAF_DATA_SIZE);
+	}
+	/* each later item's data ends where the previous item's begins */
+	for (i = 0; i < nritems - 1; i++) {
+		BUG_ON(comp_keys(&leaf->items[i].key,
+		                 &leaf->items[i+1].key) >= 0);
+		BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset +
+		    leaf->items[i + 1].size);
+	}
+	BUG_ON(leaf_free_space(leaf) < 0);
+	return 0;
+}
+
+/*
+ * run the sanity checks for path->nodes[level]: level 0 is checked as a
+ * leaf, every other level as an interior node.
+ */
+int check_block(struct ctree_path *path, int level)
+{
+	if (level == 0)
+		return check_leaf(path, level);
+	return check_node(path, level);
+}
+
 /*
  * search for key in the array p.  items p are item_size apart
  * and there are 'max' items in p
@@ -133,7 +198,8 @@ int bin_search(struct node *c, struct key *key, int *slot)
  * level of the path (level 0)
  *
  * If the key isn't found, the path points to the slot where it should
- * be inserted.
+ * be inserted, and 1 is returned.  If there are other errors during the
+ * search a negative error number is returned.
  *
  * if ins_len > 0, nodes and leaves will be split as we walk down the
  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
@@ -153,6 +219,9 @@ int search_slot(struct ctree_root *root, struct key *key,
 		c = &b->node;
 		level = node_level(c->header.flags);
 		p->nodes[level] = b;
+		ret = check_block(p, level);
+		if (ret)
+			return -1;
 		ret = bin_search(c, key, &slot);
 		if (!is_leaf(c->header.flags)) {
 			if (ret && slot > 0)
@@ -183,7 +252,7 @@ int search_slot(struct ctree_root *root, struct key *key,
 			return ret;
 		}
 	}
-	return -1;
+	return 1;
 }
 
 /*
@@ -192,12 +261,17 @@ int search_slot(struct ctree_root *root, struct key *key,
  * This is used after shifting pointers to the left, so it stops
  * fixing up pointers when a given leaf/node is not in slot 0 of the
  * higher levels
+ *
+ * If this fails to write a tree block, it returns -1, but continues
+ * fixing up the blocks in ram so the tree is consistent.
  */
-static void fixup_low_keys(struct ctree_root *root,
+static int fixup_low_keys(struct ctree_root *root,
 			   struct ctree_path *path, struct key *key,
 			   int level)
 {
 	int i;
+	int ret = 0;
+	int wret;
 	for (i = level; i < MAX_LEVEL; i++) {
 		struct node *t;
 		int tslot = path->slots[i];
@@ -205,10 +279,13 @@ static void fixup_low_keys(struct ctree_root *root,
 			break;
 		t = &path->nodes[i]->node;
 		memcpy(t->keys + tslot, key, sizeof(*key));
-		write_tree_block(root, path->nodes[i]);
+		wret = write_tree_block(root, path->nodes[i]);
+		if (wret)
+			ret = wret;
 		if (tslot != 0)
 			break;
 	}
+	return ret;
 }
 
 /*
@@ -220,8 +297,12 @@ static void fixup_low_keys(struct ctree_root *root,
  * be modified to reflect the push.
  *
  * The path is altered to reflect the push.
+ *
+ * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the left hand block.
  */
-int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
+static int push_node_left(struct ctree_root *root, struct ctree_path *path,
+			  int level)
 {
 	int slot;
 	struct node *left;
@@ -231,6 +312,8 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	int right_nritems;
 	struct tree_buffer *t;
 	struct tree_buffer *right_buf;
+	int ret = 0;
+	int wret;
 
 	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
 		return 1;
@@ -265,10 +348,17 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 	left->header.nritems += push_items;
 
 	/* adjust the pointers going up the tree */
-	fixup_low_keys(root, path, right->keys, level + 1);
+	wret = fixup_low_keys(root, path, right->keys, level + 1);
+	if (wret < 0)
+		ret = wret;
 
-	write_tree_block(root, t);
-	write_tree_block(root, right_buf);
+	wret = write_tree_block(root, t);
+	if (wret < 0)
+		ret = wret;
+
+	wret = write_tree_block(root, right_buf);
+	if (wret < 0)
+		ret = wret;
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[level] < push_items) {
@@ -280,7 +370,7 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
 		path->slots[level] -= push_items;
 		tree_block_release(root, t);
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -292,8 +382,12 @@ int push_node_left(struct ctree_root *root, struct ctree_path *path, int level)
  * be modified to reflect the push.
  *
  * The path is altered to reflect the push.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if there was no room in the right hand block.
  */
-int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
+static int push_node_right(struct ctree_root *root, struct ctree_path *path,
+			   int level)
 {
 	int slot;
 	struct tree_buffer *t;
@@ -368,6 +462,8 @@ int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
  * helper function to insert a new root level in the tree.
  * A new node is allocated, and a single item is inserted to
  * point to the existing root
+ *
+ * returns zero on success or < 0 on failure.
  */
 static int insert_new_root(struct ctree_root *root,
 			   struct ctree_path *path, int level)
@@ -410,8 +506,10 @@ static int insert_new_root(struct ctree_root *root,
  *
  * slot and level indicate where you want the key to go, and
  * blocknr is the block the key points to.
+ *
+ * returns zero on success and < 0 on any error
  */
-int insert_ptr(struct ctree_root *root,
+static int insert_ptr(struct ctree_root *root,
 		struct ctree_path *path, struct key *key,
 		u64 blocknr, int slot, int level)
 {
@@ -446,8 +544,11 @@ int insert_ptr(struct ctree_root *root,
  *
  * Before splitting this tries to make some room in the node by pushing
  * left and right, if either one works, it returns right away.
+ *
+ * returns 0 on success and < 0 on failure
  */
-int split_node(struct ctree_root *root, struct ctree_path *path, int level)
+static int split_node(struct ctree_root *root, struct ctree_path *path,
+		      int level)
 {
 	struct tree_buffer *t;
 	struct node *c;
@@ -455,13 +556,18 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 	struct node *split;
 	int mid;
 	int ret;
+	int wret;
 
 	ret = push_node_left(root, path, level);
 	if (!ret)
 		return 0;
+	if (ret < 0)
+		return ret;
 	ret = push_node_right(root, path, level);
 	if (!ret)
 		return 0;
+	if (ret < 0)
+		return ret;
 	t = path->nodes[level];
 	c = &t->node;
 	if (t == root->node) {
@@ -482,10 +588,19 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 		(c->header.nritems - mid) * sizeof(u64));
 	split->header.nritems = c->header.nritems - mid;
 	c->header.nritems = mid;
-	write_tree_block(root, t);
-	write_tree_block(root, split_buffer);
-	insert_ptr(root, path, split->keys, split_buffer->blocknr,
-		     path->slots[level + 1] + 1, level + 1);
+	ret = 0;
+
+	wret = write_tree_block(root, t);
+	if (wret)
+		ret = wret;
+	wret = write_tree_block(root, split_buffer);
+	if (wret)
+		ret = wret;
+	wret = insert_ptr(root, path, split->keys, split_buffer->blocknr,
+			  path->slots[level + 1] + 1, level + 1);
+	if (wret)
+		ret = wret;
+
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		tree_block_release(root, t);
@@ -494,7 +609,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level)
 	} else {
 		tree_block_release(root, split_buffer);
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -502,7 +617,7 @@ int split_node(struct ctree_root *root, struct ctree_path *path, int level)
  * and nr indicate which items in the leaf to check.  This totals up the
  * space used both by the item structs and the item data
  */
-int leaf_space_used(struct leaf *l, int start, int nr)
+static int leaf_space_used(struct leaf *l, int start, int nr)
 {
 	int data_len;
 	int end = start + nr - 1;
@@ -518,9 +633,12 @@ int leaf_space_used(struct leaf *l, int start, int nr)
 /*
  * push some data in the path leaf to the right, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
  */
-int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
-		   int data_size)
+static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
+			   int data_size)
 {
 	struct tree_buffer *left_buf = path->nodes[0];
 	struct leaf *left = &left_buf->leaf;
@@ -609,8 +727,8 @@ int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
-		   int data_size)
+static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
+			  int data_size)
 {
 	struct tree_buffer *right_buf = path->nodes[0];
 	struct leaf *right = &right_buf->leaf;
@@ -623,6 +741,8 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	int push_items = 0;
 	struct item *item;
 	int old_left_nritems;
+	int ret = 0;
+	int wret;
 
 	slot = path->slots[1];
 	if (slot == 0) {
@@ -681,10 +801,16 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		push_space = right->items[i].offset;
 	}
 
-	write_tree_block(root, t);
-	write_tree_block(root, right_buf);
+	wret = write_tree_block(root, t);
+	if (wret)
+		ret = wret;
+	wret = write_tree_block(root, right_buf);
+	if (wret)
+		ret = wret;
 
-	fixup_low_keys(root, path, &right->items[0].key, 1);
+	wret = fixup_low_keys(root, path, &right->items[0].key, 1);
+	if (wret)
+		ret = wret;
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -697,17 +823,20 @@ int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
-	return 0;
+	return ret;
 }
 
 /*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
  */
-int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
+static int split_leaf(struct ctree_root *root, struct ctree_path *path,
+		      int data_size)
 {
-	struct tree_buffer *l_buf = path->nodes[0];
-	struct leaf *l = &l_buf->leaf;
+	struct tree_buffer *l_buf;
+	struct leaf *l;
 	int nritems;
 	int mid;
 	int slot;
@@ -718,14 +847,23 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 	int rt_data_off;
 	int i;
 	int ret;
-
-	if (push_leaf_left(root, path, data_size) == 0 ||
-	    push_leaf_right(root, path, data_size) == 0) {
-		l_buf = path->nodes[0];
-		l = &l_buf->leaf;
-		if (leaf_free_space(l) >= sizeof(struct item) + data_size)
-			return 0;
+	int wret;
+
+	wret = push_leaf_left(root, path, data_size);
+	if (wret < 0)
+		return wret;
+	if (wret) {
+		wret = push_leaf_right(root, path, data_size);
+		if (wret < 0)
+			return wret;
 	}
+	l_buf = path->nodes[0];
+	l = &l_buf->leaf;
+
+	/* did the pushes work? */
+	if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+		return 0;
+
 	if (!path->nodes[1]) {
 		ret = insert_new_root(root, path, 1);
 		if (ret)
@@ -768,10 +906,17 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 		right->items[i].offset += rt_data_off;
 
 	l->header.nritems = mid;
-	ret = insert_ptr(root, path, &right->items[0].key,
+	ret = 0;
+	wret = insert_ptr(root, path, &right->items[0].key,
 			  right_buffer->blocknr, path->slots[1] + 1, 1);
-	write_tree_block(root, right_buffer);
-	write_tree_block(root, l_buf);
+	if (wret)
+		ret = wret;
+	wret = write_tree_block(root, right_buffer);
+	if (wret)
+		ret = wret;
+	wret = write_tree_block(root, l_buf);
+	if (wret)
+		ret = wret;
 
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
@@ -792,7 +937,8 @@ int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 int insert_item(struct ctree_root *root, struct key *key,
 			  void *data, int data_size)
 {
-	int ret;
+	int ret = 0;
+	int wret;
 	int slot;
 	int slot_orig;
 	struct leaf *leaf;
@@ -810,6 +956,10 @@ int insert_item(struct ctree_root *root, struct key *key,
 		release_path(root, &path);
 		return -EEXIST;
 	}
+	if (ret < 0) {
+		release_path(root, &path);
+		return ret;
+	}
 
 	slot_orig = path.slots[0];
 	leaf_buf = path.nodes[0];
@@ -850,13 +1000,19 @@ int insert_item(struct ctree_root *root, struct key *key,
 	leaf->items[slot].size = data_size;
 	memcpy(leaf->data + data_end - data_size, data, data_size);
 	leaf->header.nritems += 1;
-	write_tree_block(root, leaf_buf);
+
+	ret = 0;
 	if (slot == 0)
-		fixup_low_keys(root, &path, key, 1);
+		ret = fixup_low_keys(root, &path, key, 1);
+
+	wret = write_tree_block(root, leaf_buf);
+	if (wret)
+		ret = wret;
+
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	release_path(root, &path);
-	return 0;
+	return ret;
 }
 
 /*
@@ -866,13 +1022,15 @@ int insert_item(struct ctree_root *root, struct key *key,
  * continuing all the way the root if required.  The root is converted into
  * a leaf if all the nodes are emptied.
  */
-int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
+static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
 	struct tree_buffer *t;
 	struct node *node;
 	int nritems;
 	u64 blocknr;
+	int wret;
+	int ret = 0;
 
 	while(1) {
 		t = path->nodes[level];
@@ -894,13 +1052,27 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 		write_tree_block(root, t);
 		if (node->header.nritems != 0) {
 			int tslot;
-			if (slot == 0)
-				fixup_low_keys(root, path, node->keys,
-					       level + 1);
+			if (slot == 0) {
+				wret = fixup_low_keys(root, path,
+							   node->keys,
+							   level + 1);
+				if (wret)
+					ret = wret;
+			}
 			tslot = path->slots[level + 1];
 			t->count++;
-			if (push_node_left(root, path, level))
-				push_node_right(root, path, level);
+			wret = push_node_left(root, path, level);
+			if (wret < 0) {
+				ret = wret;
+				break;
+			}
+			if (node->header.nritems != 0) {
+				wret = push_node_right(root, path, level);
+				if (wret < 0) {
+					ret = wret;
+					break;
+				}
+			}
 			path->slots[level + 1] = tslot;
 			if (node->header.nritems != 0) {
 				tree_block_release(root, t);
@@ -919,7 +1091,7 @@ int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 		if (!path->nodes[level])
 			BUG();
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -933,6 +1105,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	struct tree_buffer *leaf_buf;
 	int doff;
 	int dsize;
+	int ret = 0;
+	int wret;
 
 	leaf_buf = path->nodes[0];
 	leaf = &leaf_buf->leaf;
@@ -959,14 +1133,23 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			leaf->header.flags = node_level(0);
 			write_tree_block(root, leaf_buf);
 		} else {
-			del_ptr(root, path, 1);
+			wret = del_ptr(root, path, 1);
+			if (wret)
+				ret = wret;
 			free_extent(root, leaf_buf->blocknr, 1);
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, leaf->header.nritems);
-		if (slot == 0)
-			fixup_low_keys(root, path, &leaf->items[0].key, 1);
-		write_tree_block(root, leaf_buf);
+		if (slot == 0) {
+			wret = fixup_low_keys(root, path,
+						   &leaf->items[0].key, 1);
+			if (wret)
+				ret = wret;
+		}
+		wret = write_tree_block(root, leaf_buf);
+		if (wret)
+			ret = wret;
+
 		/* delete the leaf if it is mostly empty */
 		if (used < LEAF_DATA_SIZE / 3) {
 			/* push_leaf_left fixes the path.
@@ -975,13 +1158,20 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			 */
 			slot = path->slots[1];
 			leaf_buf->count++;
-			push_leaf_left(root, path, 1);
-			if (leaf->header.nritems)
-				push_leaf_right(root, path, 1);
+			wret = push_leaf_left(root, path, 1);
+			if (wret < 0)
+				ret = wret;
+			if (leaf->header.nritems) {
+				wret = push_leaf_right(root, path, 1);
+				if (wret < 0)
+					ret = wret;
+			}
 			if (leaf->header.nritems == 0) {
 				u64 blocknr = leaf_buf->blocknr;
 				path->slots[1] = slot;
-				del_ptr(root, path, 1);
+				wret = del_ptr(root, path, 1);
+				if (wret)
+					ret = wret;
 				tree_block_release(root, leaf_buf);
 				free_extent(root, blocknr, 1);
 			} else {
@@ -989,7 +1179,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			}
 		}
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -1033,165 +1223,3 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
-/* some sample code to insert,search & delete items */
-#if 0
-/* for testing only */
-int next_key(int i, int max_key) {
-	return rand() % max_key;
-	//return i;
-}
-int main() {
-	struct key ins;
-	struct key last = { (u64)-1, 0, 0};
-	char *buf;
-	int i;
-	int num;
-	int ret;
-	int run_size = 20000000;
-	int max_key =  100000000;
-	int tree_size = 0;
-	struct ctree_path path;
-	struct ctree_super_block super;
-	struct ctree_root *root;
-
-	radix_tree_init();
-
-
-	root = open_ctree("dbfile", &super);
-	srand(55);
-	for (i = 0; i < run_size; i++) {
-		buf = malloc(64);
-		num = next_key(i, max_key);
-		// num = i;
-		sprintf(buf, "string-%d", num);
-		if (i % 10000 == 0)
-			fprintf(stderr, "insert %d:%d\n", num, i);
-		ins.objectid = num;
-		ins.offset = 0;
-		ins.flags = 0;
-		ret = insert_item(root, &ins, buf, strlen(buf));
-		if (!ret)
-			tree_size++;
-		free(buf);
-	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
-
-	root = open_ctree("dbfile", &super);
-	printf("starting search\n");
-	srand(55);
-	for (i = 0; i < run_size; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		init_path(&path);
-		if (i % 10000 == 0)
-			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0);
-		if (ret) {
-			print_tree(root, root->node);
-			printf("unable to find %d\n", num);
-			exit(1);
-		}
-		release_path(root, &path);
-	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
-	root = open_ctree("dbfile", &super);
-	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
-	        node_level(root->node->node.header.flags),
-		root->node->node.header.nritems,
-		NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
-	printf("all searches good, deleting some items\n");
-	i = 0;
-	srand(55);
-	for (i = 0 ; i < run_size/4; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1);
-		if (!ret) {
-			if (i % 10000 == 0)
-				fprintf(stderr, "del %d:%d\n", num, i);
-			ret = del_item(root, &path);
-			if (ret != 0)
-				BUG();
-			tree_size--;
-		}
-		release_path(root, &path);
-	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
-	root = open_ctree("dbfile", &super);
-	srand(128);
-	for (i = 0; i < run_size; i++) {
-		buf = malloc(64);
-		num = next_key(i, max_key);
-		sprintf(buf, "string-%d", num);
-		ins.objectid = num;
-		if (i % 10000 == 0)
-			fprintf(stderr, "insert %d:%d\n", num, i);
-		ret = insert_item(root, &ins, buf, strlen(buf));
-		if (!ret)
-			tree_size++;
-		free(buf);
-	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
-	root = open_ctree("dbfile", &super);
-	srand(128);
-	printf("starting search2\n");
-	for (i = 0; i < run_size; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		init_path(&path);
-		if (i % 10000 == 0)
-			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0);
-		if (ret) {
-			print_tree(root, root->node);
-			printf("unable to find %d\n", num);
-			exit(1);
-		}
-		release_path(root, &path);
-	}
-	printf("starting big long delete run\n");
-	while(root->node && root->node->node.header.nritems > 0) {
-		struct leaf *leaf;
-		int slot;
-		ins.objectid = (u64)-1;
-		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1);
-		if (ret == 0)
-			BUG();
-
-		leaf = &path.nodes[0]->leaf;
-		slot = path.slots[0];
-		if (slot != leaf->header.nritems)
-			BUG();
-		while(path.slots[0] > 0) {
-			path.slots[0] -= 1;
-			slot = path.slots[0];
-			leaf = &path.nodes[0]->leaf;
-
-			if (comp_keys(&last, &leaf->items[slot].key) <= 0)
-				BUG();
-			memcpy(&last, &leaf->items[slot].key, sizeof(last));
-			if (tree_size % 10000 == 0)
-				printf("big del %d:%d\n", tree_size, i);
-			ret = del_item(root, &path);
-			if (ret != 0) {
-				printf("del_item returned %d\n", ret);
-				BUG();
-			}
-			tree_size--;
-		}
-		release_path(root, &path);
-	}
-	printf("tree size is now %d\n", tree_size);
-	printf("map tree\n");
-	print_tree(root->extent_root, root->extent_root->node);
-	write_ctree_super(root, &super);
-	close_ctree(root);
-	return 0;
-}
-#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 26321524c18..97715bae784 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -125,6 +125,11 @@ check_failed:
 	ins->flags = 0;
 	start_found = 0;
 	ret = search_slot(root, ins, &path, 0);
+	if (ret < 0) {
+		release_path(root, &path);
+		return ret;
+	}
+
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 7078263acc3..cebaf648392 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -134,7 +134,7 @@ static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
 		return ret;
 	ret = search_slot(root, &key, &path, 0);
 	release_path(root, &path);
-	if (ret == 0)
+	if (ret <= 0)
 		goto error;
 	return 0;
 error:
@@ -153,12 +153,17 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 	int ret;
 	int slot;
 	int i;
+
 	key.offset = 0;
 	key.flags = 0;
 	key.objectid = (unsigned long)-1;
 	while(1) {
 		init_path(&path);
 		ret = search_slot(root, &key, &path, 0);
+		if (ret < 0) {
+			release_path(root, &path);
+			return ret;
+		}
 		slot = path.slots[0];
 		if (ret != 0) {
 			if (slot == 0) {
-- 
cgit v1.2.3


From 0f70abe2b39d19171d4133d2ffdf77fb9113106a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Feb 2007 16:46:22 -0500
Subject: Btrfs: more return code checking

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 17 ++++++++++++-----
 fs/btrfs/extent-tree.c | 15 +++++++++------
 2 files changed, 21 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index be2be027251..27323993225 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1087,7 +1087,9 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 			break;
 		}
 		level++;
-		free_extent(root, blocknr, 1);
+		wret = free_extent(root, blocknr, 1);
+		if (wret)
+			ret = wret;
 		if (!path->nodes[level])
 			BUG();
 	}
@@ -1136,7 +1138,9 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			wret = del_ptr(root, path, 1);
 			if (wret)
 				ret = wret;
-			free_extent(root, leaf_buf->blocknr, 1);
+			wret = free_extent(root, leaf_buf->blocknr, 1);
+			if (wret)
+				ret = wret;
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, leaf->header.nritems);
@@ -1173,7 +1177,9 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 				if (wret)
 					ret = wret;
 				tree_block_release(root, leaf_buf);
-				free_extent(root, blocknr, 1);
+				wret = free_extent(root, blocknr, 1);
+				if (wret)
+					ret = wret;
 			} else {
 				tree_block_release(root, leaf_buf);
 			}
@@ -1184,7 +1190,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 
 /*
  * walk up the tree as far as required to find the next leaf.
- * returns 0 if it found something or -1 if there are no greater leaves.
+ * returns 0 if it found something or 1 if there are no greater leaves.
+ * returns < 0 on io errors.
  */
 int next_leaf(struct ctree_root *root, struct ctree_path *path)
 {
@@ -1196,7 +1203,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 
 	while(level < MAX_LEVEL) {
 		if (!path->nodes[level])
-			return -1;
+			return 1;
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
 		if (slot >= c->node.header.nritems) {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 97715bae784..074f4b182f1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -105,8 +105,8 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
-		     u64 search_start, u64 search_end, struct key *ins)
+static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+			    u64 search_start, u64 search_end, struct key *ins)
 {
 	struct ctree_path path;
 	struct key *key;
@@ -125,10 +125,8 @@ check_failed:
 	ins->flags = 0;
 	start_found = 0;
 	ret = search_slot(root, ins, &path, 0);
-	if (ret < 0) {
-		release_path(root, &path);
-		return ret;
-	}
+	if (ret < 0)
+		goto error;
 
 	while (1) {
 		l = &path.nodes[0]->leaf;
@@ -137,6 +135,8 @@ check_failed:
 			ret = next_leaf(root, &path);
 			if (ret == 0)
 				continue;
+			if (ret < 0)
+				goto error;
 			if (!start_found) {
 				ins->objectid = search_start;
 				ins->offset = num_blocks;
@@ -187,6 +187,9 @@ check_pending:
 	if (ins->offset != 1)
 		BUG();
 	return 0;
+error:
+	release_path(root, &path);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From bb8039515d7c1b521ea22f095b43618ccc771885 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Mar 2007 12:04:21 -0500
Subject: Btrfs: merge on the way down during deletes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 420 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 200 insertions(+), 220 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 27323993225..df4a19d6520 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -10,11 +10,10 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 		      int level);
 static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 		      int data_size);
-static int push_node_left(struct ctree_root *root, struct ctree_path *path,
-			  int level);
-static int push_node_right(struct ctree_root *root,
-		    struct ctree_path *path, int level);
-static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level);
+static int push_node_left(struct ctree_root *root, struct tree_buffer *dst,
+			  struct tree_buffer *src);
+static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
+		   int slot);
 
 inline void init_path(struct ctree_path *p)
 {
@@ -192,6 +191,138 @@ int bin_search(struct node *c, struct key *key, int *slot)
 	return -1;
 }
 
+struct tree_buffer *read_node_slot(struct ctree_root *root,
+				   struct tree_buffer *parent_buf,
+				   int slot)
+{
+	struct node *node = &parent_buf->node;
+	if (slot < 0)
+		return NULL;
+	if (slot >= node->header.nritems)
+		return NULL;
+	return read_tree_block(root, node->blockptrs[slot]);
+}
+
+static int balance_level(struct ctree_root *root, struct ctree_path *path,
+			int level)
+{
+	struct tree_buffer *right_buf;
+	struct tree_buffer *mid_buf;
+	struct tree_buffer *left_buf;
+	struct tree_buffer *parent_buf = NULL;
+	struct node *right = NULL;
+	struct node *mid;
+	struct node *left = NULL;
+	struct node *parent = NULL;
+	int ret = 0;
+	int wret;
+	int pslot;
+	int used = 0;
+	int count;
+	int orig_slot = path->slots[level];
+
+	if (level == 0)
+		return 0;
+
+	mid_buf = path->nodes[level];
+	mid = &mid_buf->node;
+	if (level < MAX_LEVEL - 1)
+		parent_buf = path->nodes[level + 1];
+	pslot = path->slots[level + 1];
+
+	if (!parent_buf) {
+		struct tree_buffer *child;
+		u64 blocknr = mid_buf->blocknr;
+
+		if (mid->header.nritems != 1)
+			return 0;
+
+		/* promote the child to a root */
+		child = read_node_slot(root, mid_buf, 0);
+		BUG_ON(!child);
+		root->node = child;
+		path->nodes[level] = NULL;
+		/* once for the path */
+		tree_block_release(root, mid_buf);
+		/* once for the root ptr */
+		tree_block_release(root, mid_buf);
+		return free_extent(root, blocknr, 1);
+	}
+	parent = &parent_buf->node;
+
+	if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4)
+		return 0;
+
+	// print_tree(root, root->node);
+	left_buf = read_node_slot(root, parent_buf, pslot - 1);
+	right_buf = read_node_slot(root, parent_buf, pslot + 1);
+	if (right_buf) {
+		right = &right_buf->node;
+		used = right->header.nritems;
+		count = 1;
+	}
+	if (left_buf) {
+		left = &left_buf->node;
+		used += left->header.nritems;
+		orig_slot += left->header.nritems;
+		count++;
+	}
+	if (left_buf)
+		push_node_left(root, left_buf, mid_buf);
+	if (right_buf) {
+		push_node_left(root, mid_buf, right_buf);
+		if (right->header.nritems == 0) {
+			u64 blocknr = right_buf->blocknr;
+			tree_block_release(root, right_buf);
+			right_buf = NULL;
+			right = NULL;
+			wret = del_ptr(root, path, level + 1, pslot + 1);
+			if (wret)
+				ret = wret;
+			wret = free_extent(root, blocknr, 1);
+			if (wret)
+				ret = wret;
+		} else {
+			memcpy(parent->keys + pslot + 1, right->keys,
+				sizeof(struct key));
+		}
+	}
+	if (mid->header.nritems == 0) {
+		u64 blocknr = mid_buf->blocknr;
+		tree_block_release(root, mid_buf);
+		mid_buf = NULL;
+		mid = NULL;
+		wret = del_ptr(root, path, level + 1, pslot);
+		if (wret)
+			ret = wret;
+		wret = free_extent(root, blocknr, 1);
+		if (wret)
+			ret = wret;
+	} else
+		memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
+
+	if (left_buf) {
+		if (left->header.nritems >= orig_slot) {
+			left_buf->count++; // released below
+			path->nodes[level] = left_buf;
+			path->slots[level + 1] -= 1;
+			path->slots[level] = orig_slot;
+			if (mid_buf)
+				tree_block_release(root, mid_buf);
+		} else {
+			orig_slot -= left->header.nritems;
+			path->slots[level] = orig_slot;
+		}
+	}
+
+	if (right_buf)
+		tree_block_release(root, right_buf);
+	if (left_buf)
+		tree_block_release(root, left_buf);
+
+	return ret;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -208,12 +339,14 @@ int bin_search(struct node *c, struct key *key, int *slot)
 int search_slot(struct ctree_root *root, struct key *key,
 		struct ctree_path *p, int ins_len)
 {
-	struct tree_buffer *b = root->node;
+	struct tree_buffer *b;
 	struct node *c;
 	int slot;
 	int ret;
 	int level;
 
+again:
+	b = root->node;
 	b->count++;
 	while (b) {
 		c = &b->node;
@@ -236,9 +369,17 @@ int search_slot(struct ctree_root *root, struct key *key,
 				b = p->nodes[level];
 				c = &b->node;
 				slot = p->slots[level];
+			} else if (ins_len < 0) {
+				int sret = balance_level(root, p, level);
+				if (sret)
+					return sret;
+				b = p->nodes[level];
+				if (!b)
+					goto again;
+				c = &b->node;
+				slot = p->slots[level];
 			}
 			b = read_tree_block(root, c->blockptrs[slot]);
-			continue;
 		} else {
 			struct leaf *l = (struct leaf *)c;
 			p->slots[level] = slot;
@@ -249,9 +390,11 @@ int search_slot(struct ctree_root *root, struct key *key,
 				if (sret)
 					return sret;
 			}
+			BUG_ON(root->node->count == 1);
 			return ret;
 		}
 	}
+	BUG_ON(root->node->count == 1);
 	return 1;
 }
 
@@ -301,163 +444,49 @@ static int fixup_low_keys(struct ctree_root *root,
  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  * error, and > 0 if there was no room in the left hand block.
  */
-static int push_node_left(struct ctree_root *root, struct ctree_path *path,
-			  int level)
+static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
+			  struct tree_buffer *src_buf)
 {
-	int slot;
-	struct node *left;
-	struct node *right;
+	struct node *src = &src_buf->node;
+	struct node *dst = &dst_buf->node;
 	int push_items = 0;
-	int left_nritems;
-	int right_nritems;
-	struct tree_buffer *t;
-	struct tree_buffer *right_buf;
+	int src_nritems;
+	int dst_nritems;
 	int ret = 0;
 	int wret;
 
-	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
-		return 1;
-	slot = path->slots[level + 1];
-	if (slot == 0)
-		return 1;
-
-	t = read_tree_block(root,
-		            path->nodes[level + 1]->node.blockptrs[slot - 1]);
-	left = &t->node;
-	right_buf = path->nodes[level];
-	right = &right_buf->node;
-	left_nritems = left->header.nritems;
-	right_nritems = right->header.nritems;
-	push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
+	src_nritems = src->header.nritems;
+	dst_nritems = dst->header.nritems;
+	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
 	if (push_items <= 0) {
-		tree_block_release(root, t);
 		return 1;
 	}
 
-	if (right_nritems < push_items)
-		push_items = right_nritems;
-	memcpy(left->keys + left_nritems, right->keys,
+	if (src_nritems < push_items)
+		push_items =src_nritems;
+	memcpy(dst->keys + dst_nritems, src->keys,
 		push_items * sizeof(struct key));
-	memcpy(left->blockptrs + left_nritems, right->blockptrs,
+	memcpy(dst->blockptrs + dst_nritems, src->blockptrs,
 		push_items * sizeof(u64));
-	memmove(right->keys, right->keys + push_items,
-		(right_nritems - push_items) * sizeof(struct key));
-	memmove(right->blockptrs, right->blockptrs + push_items,
-		(right_nritems - push_items) * sizeof(u64));
-	right->header.nritems -= push_items;
-	left->header.nritems += push_items;
-
-	/* adjust the pointers going up the tree */
-	wret = fixup_low_keys(root, path, right->keys, level + 1);
-	if (wret < 0)
-		ret = wret;
+	if (push_items < src_nritems) {
+		memmove(src->keys, src->keys + push_items,
+			(src_nritems - push_items) * sizeof(struct key));
+		memmove(src->blockptrs, src->blockptrs + push_items,
+			(src_nritems - push_items) * sizeof(u64));
+	}
+	src->header.nritems -= push_items;
+	dst->header.nritems += push_items;
 
-	wret = write_tree_block(root, t);
+	wret = write_tree_block(root, src_buf);
 	if (wret < 0)
 		ret = wret;
 
-	wret = write_tree_block(root, right_buf);
+	wret = write_tree_block(root, dst_buf);
 	if (wret < 0)
 		ret = wret;
-
-	/* then fixup the leaf pointer in the path */
-	if (path->slots[level] < push_items) {
-		path->slots[level] += left_nritems;
-		tree_block_release(root, path->nodes[level]);
-		path->nodes[level] = t;
-		path->slots[level + 1] -= 1;
-	} else {
-		path->slots[level] -= push_items;
-		tree_block_release(root, t);
-	}
 	return ret;
 }
 
-/*
- * try to push data from one node into the next node right in the
- * tree.  The src node is found at specified level in the path.
- * If some bytes were pushed, return 0, otherwise return 1.
- *
- * Lower nodes/leaves in the path are not touched, higher nodes may
- * be modified to reflect the push.
- *
- * The path is altered to reflect the push.
- *
- * returns 0 if some ptrs were pushed, < 0 if there was some horrible
- * error, and > 0 if there was no room in the right hand block.
- */
-static int push_node_right(struct ctree_root *root, struct ctree_path *path,
-			   int level)
-{
-	int slot;
-	struct tree_buffer *t;
-	struct tree_buffer *src_buffer;
-	struct node *dst;
-	struct node *src;
-	int push_items = 0;
-	int dst_nritems;
-	int src_nritems;
-
-	/* can't push from the root */
-	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
-		return 1;
-
-	/* only try to push inside the node higher up */
-	slot = path->slots[level + 1];
-	if (slot == NODEPTRS_PER_BLOCK - 1)
-		return 1;
-
-	if (slot >= path->nodes[level + 1]->node.header.nritems -1)
-		return 1;
-
-	t = read_tree_block(root,
-			    path->nodes[level + 1]->node.blockptrs[slot + 1]);
-	dst = &t->node;
-	src_buffer = path->nodes[level];
-	src = &src_buffer->node;
-	dst_nritems = dst->header.nritems;
-	src_nritems = src->header.nritems;
-	push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
-	if (push_items <= 0) {
-		tree_block_release(root, t);
-		return 1;
-	}
-
-	if (src_nritems < push_items)
-		push_items = src_nritems;
-	memmove(dst->keys + push_items, dst->keys,
-		dst_nritems * sizeof(struct key));
-	memcpy(dst->keys, src->keys + src_nritems - push_items,
-		push_items * sizeof(struct key));
-
-	memmove(dst->blockptrs + push_items, dst->blockptrs,
-		dst_nritems * sizeof(u64));
-	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
-		push_items * sizeof(u64));
-
-	src->header.nritems -= push_items;
-	dst->header.nritems += push_items;
-
-	/* adjust the pointers going up the tree */
-	memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
-		dst->keys, sizeof(struct key));
-
-	write_tree_block(root, path->nodes[level + 1]);
-	write_tree_block(root, t);
-	write_tree_block(root, src_buffer);
-
-	/* then fixup the pointers in the path */
-	if (path->slots[level] >= src->header.nritems) {
-		path->slots[level] -= src->header.nritems;
-		tree_block_release(root, path->nodes[level]);
-		path->nodes[level] = t;
-		path->slots[level + 1] += 1;
-	} else {
-		tree_block_release(root, t);
-	}
-	return 0;
-}
-
 /*
  * helper function to insert a new root level in the tree.
  * A new node is allocated, and a single item is inserted to
@@ -558,16 +587,6 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 	int ret;
 	int wret;
 
-	ret = push_node_left(root, path, level);
-	if (!ret)
-		return 0;
-	if (ret < 0)
-		return ret;
-	ret = push_node_right(root, path, level);
-	if (!ret)
-		return 0;
-	if (ret < 0)
-		return ret;
 	t = path->nodes[level];
 	c = &t->node;
 	if (t == root->node) {
@@ -1011,6 +1030,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 
 	if (leaf_free_space(leaf) < 0)
 		BUG();
+	check_leaf(&path, 0);
 	release_path(root, &path);
 	return ret;
 }
@@ -1022,77 +1042,38 @@ int insert_item(struct ctree_root *root, struct key *key,
  * continuing all the way the root if required.  The root is converted into
  * a leaf if all the nodes are emptied.
  */
-static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
+static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
+		   int slot)
 {
-	int slot;
-	struct tree_buffer *t;
 	struct node *node;
+	struct tree_buffer *parent = path->nodes[level];
 	int nritems;
-	u64 blocknr;
-	int wret;
 	int ret = 0;
+	int wret;
 
-	while(1) {
-		t = path->nodes[level];
-		if (!t)
-			break;
-		node = &t->node;
-		slot = path->slots[level];
-		nritems = node->header.nritems;
-
-		if (slot != nritems -1) {
-			memmove(node->keys + slot, node->keys + slot + 1,
-				sizeof(struct key) * (nritems - slot - 1));
-			memmove(node->blockptrs + slot,
-				node->blockptrs + slot + 1,
-				sizeof(u64) * (nritems - slot - 1));
-		}
-		node->header.nritems--;
-		blocknr = t->blocknr;
-		write_tree_block(root, t);
-		if (node->header.nritems != 0) {
-			int tslot;
-			if (slot == 0) {
-				wret = fixup_low_keys(root, path,
-							   node->keys,
-							   level + 1);
-				if (wret)
-					ret = wret;
-			}
-			tslot = path->slots[level + 1];
-			t->count++;
-			wret = push_node_left(root, path, level);
-			if (wret < 0) {
-				ret = wret;
-				break;
-			}
-			if (node->header.nritems != 0) {
-				wret = push_node_right(root, path, level);
-				if (wret < 0) {
-					ret = wret;
-					break;
-				}
-			}
-			path->slots[level + 1] = tslot;
-			if (node->header.nritems != 0) {
-				tree_block_release(root, t);
-				break;
-			}
-			tree_block_release(root, t);
-		}
-		if (t == root->node) {
-			/* just turn the root into a leaf and break */
-			root->node->node.header.flags = node_level(0);
-			write_tree_block(root, t);
-			break;
-		}
-		level++;
-		wret = free_extent(root, blocknr, 1);
+	node = &parent->node;
+	nritems = node->header.nritems;
+
+	if (slot != nritems -1) {
+		memmove(node->keys + slot, node->keys + slot + 1,
+			sizeof(struct key) * (nritems - slot - 1));
+		memmove(node->blockptrs + slot,
+			node->blockptrs + slot + 1,
+			sizeof(u64) * (nritems - slot - 1));
+	}
+	node->header.nritems--;
+	if (node->header.nritems == 0 && parent == root->node) {
+		BUG_ON(node_level(root->node->node.header.flags) != 1);
+		/* just turn the root into a leaf and break */
+		root->node->node.header.flags = node_level(0);
+	} else if (slot == 0) {
+		wret = fixup_low_keys(root, path, node->keys, level + 1);
 		if (wret)
 			ret = wret;
-		if (!path->nodes[level])
-			BUG();
 	}
+	wret = write_tree_block(root, parent);
+	if (wret)
+		ret = wret;
 	return ret;
 }
 
@@ -1135,7 +1116,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			leaf->header.flags = node_level(0);
 			write_tree_block(root, leaf_buf);
 		} else {
-			wret = del_ptr(root, path, 1);
+			wret = del_ptr(root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
 			wret = free_extent(root, leaf_buf->blocknr, 1);
@@ -1172,8 +1153,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			}
 			if (leaf->header.nritems == 0) {
 				u64 blocknr = leaf_buf->blocknr;
-				path->slots[1] = slot;
-				wret = del_ptr(root, path, 1);
+				wret = del_ptr(root, path, 1, slot);
 				if (wret)
 					ret = wret;
 				tree_block_release(root, leaf_buf);
-- 
cgit v1.2.3


From 79f95c82dca7665f32bafd68b7cdf4a01fab0840 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Mar 2007 15:16:26 -0500
Subject: Btrfs: Fixup the code to merge during path walks Add a bulk
 insert/remove test to random-test Add the quick-test code back as another
 regression test

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   5 +-
 fs/btrfs/ctree.c       | 137 +++++++++++++++++++++++++++++++++-------
 fs/btrfs/quick-test.c  | 165 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/random-test.c | 117 +++++++++++++++++++++++++++++------
 4 files changed, 381 insertions(+), 43 deletions(-)
 create mode 100644 fs/btrfs/quick-test.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 58d4260a04b..756b1cd4cf4 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -14,7 +14,7 @@ check=sparse $(CHECKFLAGS)
 	$(check) $<
 	$(CC) $(CFLAGS) -c $<
 
-all: tester debug-tree
+all: tester debug-tree quick-test
 
 debug-tree: $(objects) debug-tree.o
 	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
@@ -22,6 +22,9 @@ debug-tree: $(objects) debug-tree.o
 tester: $(objects) random-test.o
 	gcc $(CFLAGS) -o tester $(objects) random-test.o
 
+quick-test: $(objects) quick-test.o
+	gcc $(CFLAGS) -o quick-test $(objects) quick-test.o
+
 $(objects) : $(headers)
 
 clean :
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index df4a19d6520..afa5bc5c7c1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -12,6 +12,9 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 		      int data_size);
 static int push_node_left(struct ctree_root *root, struct tree_buffer *dst,
 			  struct tree_buffer *src);
+static int balance_node_right(struct ctree_root *root,
+			      struct tree_buffer *dst_buf,
+			      struct tree_buffer *src_buf);
 static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 		   int slot);
 
@@ -217,15 +220,16 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	int ret = 0;
 	int wret;
 	int pslot;
-	int used = 0;
-	int count;
 	int orig_slot = path->slots[level];
+	u64 orig_ptr;
 
 	if (level == 0)
 		return 0;
 
 	mid_buf = path->nodes[level];
 	mid = &mid_buf->node;
+	orig_ptr = mid->blockptrs[orig_slot];
+
 	if (level < MAX_LEVEL - 1)
 		parent_buf = path->nodes[level + 1];
 	pslot = path->slots[level + 1];
@@ -253,24 +257,26 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4)
 		return 0;
 
-	// print_tree(root, root->node);
 	left_buf = read_node_slot(root, parent_buf, pslot - 1);
 	right_buf = read_node_slot(root, parent_buf, pslot + 1);
-	if (right_buf) {
-		right = &right_buf->node;
-		used = right->header.nritems;
-		count = 1;
-	}
+
+	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
 		left = &left_buf->node;
-		used += left->header.nritems;
 		orig_slot += left->header.nritems;
-		count++;
+		wret = push_node_left(root, left_buf, mid_buf);
+		if (wret < 0)
+			ret = wret;
 	}
-	if (left_buf)
-		push_node_left(root, left_buf, mid_buf);
+
+	/*
+	 * then try to empty the right most buffer into the middle
+	 */
 	if (right_buf) {
-		push_node_left(root, mid_buf, right_buf);
+		right = &right_buf->node;
+		wret = push_node_left(root, mid_buf, right_buf);
+		if (wret < 0)
+			ret = wret;
 		if (right->header.nritems == 0) {
 			u64 blocknr = right_buf->blocknr;
 			tree_block_release(root, right_buf);
@@ -285,9 +291,29 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		} else {
 			memcpy(parent->keys + pslot + 1, right->keys,
 				sizeof(struct key));
+			wret = write_tree_block(root, parent_buf);
+			if (wret)
+				ret = wret;
 		}
 	}
+	if (mid->header.nritems == 1) {
+		/*
+		 * we're not allowed to leave a node with one item in the
+		 * tree during a delete.  A deletion from lower in the tree
+		 * could try to delete the only pointer in this node.
+		 * So, pull some keys from the left.
+		 * There has to be a left pointer at this point because
+		 * otherwise we would have pulled some pointers from the
+		 * right
+		 */
+		BUG_ON(!left_buf);
+		wret = balance_node_right(root, mid_buf, left_buf);
+		if (wret < 0)
+			ret = wret;
+		BUG_ON(wret == 1);
+	}
 	if (mid->header.nritems == 0) {
+		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->blocknr;
 		tree_block_release(root, mid_buf);
 		mid_buf = NULL;
@@ -298,11 +324,17 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		wret = free_extent(root, blocknr, 1);
 		if (wret)
 			ret = wret;
-	} else
+	} else {
+		/* update the parent key to reflect our changes */
 		memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
+		wret = write_tree_block(root, parent_buf);
+		if (wret)
+			ret = wret;
+	}
 
+	/* update the path */
 	if (left_buf) {
-		if (left->header.nritems >= orig_slot) {
+		if (left->header.nritems > orig_slot) {
 			left_buf->count++; // released below
 			path->nodes[level] = left_buf;
 			path->slots[level + 1] -= 1;
@@ -314,12 +346,15 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			path->slots[level] = orig_slot;
 		}
 	}
+	/* double check we haven't messed things up */
+	check_block(path, level);
+	if (orig_ptr != path->nodes[level]->node.blockptrs[path->slots[level]])
+		BUG();
 
 	if (right_buf)
 		tree_block_release(root, right_buf);
 	if (left_buf)
 		tree_block_release(root, left_buf);
-
 	return ret;
 }
 
@@ -378,6 +413,7 @@ again:
 					goto again;
 				c = &b->node;
 				slot = p->slots[level];
+				BUG_ON(c->header.nritems == 1);
 			}
 			b = read_tree_block(root, c->blockptrs[slot]);
 		} else {
@@ -433,13 +469,7 @@ static int fixup_low_keys(struct ctree_root *root,
 
 /*
  * try to push data from one node into the next node left in the
- * tree.  The src node is found at specified level in the path.
- * If some bytes were pushed, return 0, otherwise return 1.
- *
- * Lower nodes/leaves in the path are not touched, higher nodes may
- * be modified to reflect the push.
- *
- * The path is altered to reflect the push.
+ * tree.
  *
  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  * error, and > 0 if there was no room in the left hand block.
@@ -463,7 +493,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	}
 
 	if (src_nritems < push_items)
-		push_items =src_nritems;
+		push_items = src_nritems;
+
 	memcpy(dst->keys + dst_nritems, src->keys,
 		push_items * sizeof(struct key));
 	memcpy(dst->blockptrs + dst_nritems, src->blockptrs,
@@ -487,6 +518,64 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	return ret;
 }
 
+/*
+ * try to push key/block pointer pairs from one node into the next node
+ * right in the tree.  dst_buf is the right hand node: its existing entries
+ * are shifted up and the pushed entries are placed in front of them.
+ * src_buf is the left hand node: the entries are taken from its tail.
+ *
+ * returns 0 if some ptrs were pushed, < 0 if there was some horrible
+ * error, and > 0 if nothing was pushed (no room in the right hand block,
+ * or the left hand node has no items).
+ *
+ * this pushes at most src_nritems / 2 + 1 entries, further limited by the
+ * number of free slots in the right hand node.
+ *
+ * NOTE(review): the "don't try to empty the node" test below only fires
+ * when the left node has no items at all.  A left node holding 1 or 2
+ * items can still be drained completely when the right node has room --
+ * confirm whether the comparison was meant to be >=.
+ */
+static int balance_node_right(struct ctree_root *root,
+			      struct tree_buffer *dst_buf,
+			      struct tree_buffer *src_buf)
+{
+	struct node *src = &src_buf->node;
+	struct node *dst = &dst_buf->node;
+	int push_items = 0;
+	int max_push;
+	int src_nritems;
+	int dst_nritems;
+	int ret = 0;
+	int wret;
+
+	src_nritems = src->header.nritems;
+	dst_nritems = dst->header.nritems;
+	/* start with the number of free slots in the destination */
+	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
+	if (push_items <= 0) {
+		return 1;
+	}
+
+	max_push = src_nritems / 2 + 1;
+	/* don't try to empty the node */
+	if (max_push > src_nritems)
+		return 1;
+	if (max_push < push_items)
+		push_items = max_push;
+
+	/* open a hole of push_items entries at the front of dst */
+	memmove(dst->keys + push_items, dst->keys,
+		dst_nritems * sizeof(struct key));
+	memmove(dst->blockptrs + push_items, dst->blockptrs,
+		dst_nritems * sizeof(u64));
+	/* fill the hole with the last push_items entries of src */
+	memcpy(dst->keys, src->keys + src_nritems - push_items,
+		push_items * sizeof(struct key));
+	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
+		push_items * sizeof(u64));
+
+	src->header.nritems -= push_items;
+	dst->header.nritems += push_items;
+
+	/* both blocks changed; a negative write result is what gets returned */
+	wret = write_tree_block(root, src_buf);
+	if (wret < 0)
+		ret = wret;
+
+	wret = write_tree_block(root, dst_buf);
+	if (wret < 0)
+		ret = wret;
+	return ret;
+}
+
 /*
  * helper function to insert a new root level in the tree.
  * A new node is allocated, and a single item is inserted to
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
new file mode 100644
index 00000000000..dbd00c3b7ab
--- /dev/null
+++ b/fs/btrfs/quick-test.c
@@ -0,0 +1,165 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+/*
+ * for testing only: the next key to use is a pseudo random value taken
+ * from rand() modulo max_key.  i (the caller's loop counter) is not used
+ * at the moment; return it instead to get sequential keys.
+ */
+int next_key(int i, int max_key)
+{
+	int key = rand();
+
+	return key % max_key;
+}
+
+/*
+ * quick smoke test for the tree code.  Works on the file "dbfile" and runs
+ * insert, search and delete passes over keys produced by next_key(),
+ * closing and reopening the tree between passes.  tree_size tracks the
+ * number of items the test believes are in the tree.
+ */
+int main(int ac, char **av) {
+	struct key ins;
+	struct key last = { (u64)-1, 0, 0};
+	char *buf;
+	int i;
+	int num;
+	int ret;
+	int run_size = 100000;
+	int max_key =  100000000;
+	int tree_size = 0;
+	struct ctree_path path;
+	struct ctree_super_block super;
+	struct ctree_root *root;
+
+	radix_tree_init();
+
+	/* pass 1: insert run_size keys from the srand(55) sequence */
+	root = open_ctree("dbfile", &super);
+	srand(55);
+	for (i = 0; i < run_size; i++) {
+		buf = malloc(64);
+		num = next_key(i, max_key);
+		// num = i;
+		sprintf(buf, "string-%d", num);
+		if (i % 10000 == 0)
+			fprintf(stderr, "insert %d:%d\n", num, i);
+		ins.objectid = num;
+		ins.offset = 0;
+		ins.flags = 0;
+		ret = insert_item(root, &ins, buf, strlen(buf));
+		/* only inserts that returned 0 are counted */
+		if (!ret)
+			tree_size++;
+		free(buf);
+	}
+	write_ctree_super(root, &super);
+	close_ctree(root);
+
+	/* pass 2: replay the same sequence, every key must be found */
+	root = open_ctree("dbfile", &super);
+	printf("starting search\n");
+	srand(55);
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		if (i % 10000 == 0)
+			fprintf(stderr, "search %d:%d\n", num, i);
+		ret = search_slot(root, &ins, &path, 0);
+		if (ret) {
+			print_tree(root, root->node);
+			printf("unable to find %d\n", num);
+			exit(1);
+		}
+		release_path(root, &path);
+	}
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	root = open_ctree("dbfile", &super);
+	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
+	        node_level(root->node->node.header.flags),
+		root->node->node.header.nritems,
+		NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+	/* pass 3: delete the first run_size/4 keys of the srand(55) sequence */
+	printf("all searches good, deleting some items\n");
+	i = 0;
+	srand(55);
+	for (i = 0 ; i < run_size/4; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		ret = search_slot(root, &ins, &path, -1);
+		/* a key that is not found is skipped without complaint */
+		if (!ret) {
+			if (i % 10000 == 0)
+				fprintf(stderr, "del %d:%d\n", num, i);
+			ret = del_item(root, &path);
+			if (ret != 0)
+				BUG();
+			tree_size--;
+		}
+		release_path(root, &path);
+	}
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	/*
+	 * pass 4: insert run_size keys from the srand(128) sequence.
+	 * ins.offset and ins.flags still hold the zeros stored in pass 1.
+	 */
+	root = open_ctree("dbfile", &super);
+	srand(128);
+	for (i = 0; i < run_size; i++) {
+		buf = malloc(64);
+		num = next_key(i, max_key);
+		sprintf(buf, "string-%d", num);
+		ins.objectid = num;
+		if (i % 10000 == 0)
+			fprintf(stderr, "insert %d:%d\n", num, i);
+		ret = insert_item(root, &ins, buf, strlen(buf));
+		if (!ret)
+			tree_size++;
+		free(buf);
+	}
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	/* pass 5: every key of the srand(128) sequence must be found */
+	root = open_ctree("dbfile", &super);
+	srand(128);
+	printf("starting search2\n");
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		if (i % 10000 == 0)
+			fprintf(stderr, "search %d:%d\n", num, i);
+		ret = search_slot(root, &ins, &path, 0);
+		if (ret) {
+			print_tree(root, root->node);
+			printf("unable to find %d\n", num);
+			exit(1);
+		}
+		release_path(root, &path);
+	}
+	/*
+	 * pass 6: empty the tree.  Each round searches for objectid (u64)-1,
+	 * which must not be found and must land one slot past the last item
+	 * of the leaf, then deletes that leaf's items from the highest slot
+	 * down to slot 0.
+	 */
+	printf("starting big long delete run\n");
+	while(root->node && root->node->node.header.nritems > 0) {
+		struct leaf *leaf;
+		int slot;
+		ins.objectid = (u64)-1;
+		init_path(&path);
+		ret = search_slot(root, &ins, &path, -1);
+		if (ret == 0)
+			BUG();
+
+		leaf = &path.nodes[0]->leaf;
+		slot = path.slots[0];
+		if (slot != leaf->header.nritems)
+			BUG();
+		while(path.slots[0] > 0) {
+			path.slots[0] -= 1;
+			slot = path.slots[0];
+			/* re-read the leaf from the path on every iteration */
+			leaf = &path.nodes[0]->leaf;
+
+			/* last records the key being deleted; nothing below reads it */
+			memcpy(&last, &leaf->items[slot].key, sizeof(last));
+			/* NOTE(review): i is left over from pass 5 (== run_size) */
+			if (tree_size % 10000 == 0)
+				printf("big del %d:%d\n", tree_size, i);
+			ret = del_item(root, &path);
+			if (ret != 0) {
+				printf("del_item returned %d\n", ret);
+				BUG();
+			}
+			tree_size--;
+		}
+		release_path(root, &path);
+	}
+	printf("tree size is now %d\n", tree_size);
+	printf("map tree\n");
+	print_tree(root->extent_root, root->extent_root->node);
+	write_ctree_super(root, &super);
+	close_ctree(root);
+	return 0;
+}
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index cebaf648392..bbd554e88db 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -142,8 +142,98 @@ error:
 	return -1;
 }
 
+/*
+ * bulk delete helper: search for the largest possible objectid and delete
+ * the item at the slot search_slot returned (on an exact match) or the one
+ * in the slot just before it (otherwise), then remove the same objectid
+ * from the radix.  The loop runs at most nr + 1 times and stops early when
+ * the search lands on slot 0 without a match or keep_running is cleared.
+ *
+ * returns 0 on success, the search_slot error if the search fails, and -1
+ * if an item can't be removed from the tree or is missing from the radix.
+ * The path is released on every way out of the loop body.
+ */
+static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
+		      int nr)
+{
+	struct ctree_path path;
+	struct key key;
+	unsigned long found = 0;
+	int ret;
+	int slot;
+	int *ptr;
+
+	key.offset = 0;
+	key.flags = 0;
+	key.objectid = (unsigned long)-1;
+	while(nr-- >= 0) {
+		init_path(&path);
+		ret = search_slot(root, &key, &path, -1);
+		if (ret < 0) {
+			release_path(root, &path);
+			return ret;
+		}
+		if (ret != 0) {
+			/* no exact match and nothing in front of the slot */
+			if (path.slots[0] == 0) {
+				release_path(root, &path);
+				break;
+			}
+			path.slots[0] -= 1;
+		}
+		slot = path.slots[0];
+		/* remember the objectid before del_item changes the leaf */
+		found = path.nodes[0]->leaf.items[slot].key.objectid;
+		ret = del_item(root, &path);
+		if (ret) {
+			fprintf(stderr,
+				"failed to remove %lu from tree\n",
+				found);
+			/* this return used to skip the release */
+			release_path(root, &path);
+			return -1;
+		}
+		release_path(root, &path);
+		ptr = radix_tree_delete(radix, found);
+		if (!ptr)
+			goto error;
+		if (!keep_running)
+			break;
+	}
+	return 0;
+error:
+	fprintf(stderr, "failed to delete from the radix %lu\n", found);
+	return -1;
+}
+
+/*
+ * insert up to count items with ins_one.  Stops at the first failing
+ * insert, or early (without error) once keep_running has been cleared.
+ *
+ * returns 0, or the non-zero value ins_one failed with.
+ */
+static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
+		     int count)
+{
+	int i;
+	int ret = 0;
+
+	for (i = 0; i < count; i++) {
+		ret = ins_one(root, radix);
+		if (ret) {
+			printf("fill failed\n");
+			break;
+		}
+		if (!keep_running)
+			break;
+	}
+	return ret;
+}
+
+/*
+ * bulk operation for the random test: run empty_tree and then fill_tree
+ * with the same randomly chosen count.  Only one call in every 100 does
+ * the work (beginning with the first), the rest return 0 right away.
+ * A non-zero result from either helper is returned unchanged.
+ */
+static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	static int run_nr = 0;
+	int nr = rand() % 20000;
+	int ret;
+
+	/* rand() has already been consumed above, even on skipped calls */
+	if (run_nr++ % 100)
+		return 0;
+
+	ret = empty_tree(root, radix, nr);
+	if (ret)
+		return ret;
+	return fill_tree(root, radix, nr);
+}
+
+
 int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
-{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent };
+{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent, bulk_op };
 
 static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 {
@@ -192,7 +282,6 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 	}
 	return 0;
 }
-
 void sigstopper(int ignored)
 {
 	keep_running = 0;
@@ -241,22 +330,12 @@ int main(int ac, char **av)
 			print_usage();
 		}
 	}
-	for (i = 0; i < init_fill_count; i++) {
-		ret = ins_one(root, &radix);
-		if (ret) {
-			printf("initial fill failed\n");
-			err = ret;
-			goto out;
-		}
-		if (i % 10000 == 0) {
-			printf("initial fill %d level %d count %d\n", i,
-				node_level(root->node->node.header.flags),
-				root->node->node.header.nritems);
-		}
-		if (keep_running == 0) {
-			err = 0;
-			goto out;
-		}
+	printf("initial fill\n");
+	ret = fill_tree(root, &radix, init_fill_count);
+	printf("starting run\n");
+	if (ret) {
+		err = ret;
+		goto out;
 	}
 	if (initial_only == 1) {
 		goto out;
@@ -287,6 +366,8 @@ int main(int ac, char **av)
 				err = ret;
 				goto out;
 			}
+			if (ops[op] == bulk_op)
+				break;
 			if (keep_running == 0) {
 				err = 0;
 				goto out;
-- 
cgit v1.2.3


From ed2ff2cba766dfe7976a0113f667c9a0a50dff02 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Mar 2007 18:59:40 -0500
Subject: Btrfs: pretend page cache & commit code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   |  66 +++++----
 fs/btrfs/ctree.h   |   5 +
 fs/btrfs/disk-io.c |  91 +++++++++++-
 fs/btrfs/disk-io.h |   6 +
 fs/btrfs/list.h    | 418 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 558 insertions(+), 28 deletions(-)
 create mode 100644 fs/btrfs/list.h

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index afa5bc5c7c1..57fa505fb9f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -250,6 +250,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		tree_block_release(root, mid_buf);
 		/* once for the root ptr */
 		tree_block_release(root, mid_buf);
+		clean_tree_block(root, mid_buf);
 		return free_extent(root, blocknr, 1);
 	}
 	parent = &parent_buf->node;
@@ -280,6 +281,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		if (right->header.nritems == 0) {
 			u64 blocknr = right_buf->blocknr;
 			tree_block_release(root, right_buf);
+			clean_tree_block(root, right_buf);
 			right_buf = NULL;
 			right = NULL;
 			wret = del_ptr(root, path, level + 1, pslot + 1);
@@ -291,7 +293,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		} else {
 			memcpy(parent->keys + pslot + 1, right->keys,
 				sizeof(struct key));
-			wret = write_tree_block(root, parent_buf);
+			wret = dirty_tree_block(root, parent_buf);
 			if (wret)
 				ret = wret;
 		}
@@ -316,6 +318,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->blocknr;
 		tree_block_release(root, mid_buf);
+		clean_tree_block(root, mid_buf);
 		mid_buf = NULL;
 		mid = NULL;
 		wret = del_ptr(root, path, level + 1, pslot);
@@ -327,7 +330,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	} else {
 		/* update the parent key to reflect our changes */
 		memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
-		wret = write_tree_block(root, parent_buf);
+		wret = dirty_tree_block(root, parent_buf);
 		if (wret)
 			ret = wret;
 	}
@@ -458,7 +461,7 @@ static int fixup_low_keys(struct ctree_root *root,
 			break;
 		t = &path->nodes[i]->node;
 		memcpy(t->keys + tslot, key, sizeof(*key));
-		wret = write_tree_block(root, path->nodes[i]);
+		wret = dirty_tree_block(root, path->nodes[i]);
 		if (wret)
 			ret = wret;
 		if (tslot != 0)
@@ -508,11 +511,11 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	src->header.nritems -= push_items;
 	dst->header.nritems += push_items;
 
-	wret = write_tree_block(root, src_buf);
+	wret = dirty_tree_block(root, src_buf);
 	if (wret < 0)
 		ret = wret;
 
-	wret = write_tree_block(root, dst_buf);
+	wret = dirty_tree_block(root, dst_buf);
 	if (wret < 0)
 		ret = wret;
 	return ret;
@@ -566,11 +569,11 @@ static int balance_node_right(struct ctree_root *root,
 	src->header.nritems -= push_items;
 	dst->header.nritems += push_items;
 
-	wret = write_tree_block(root, src_buf);
+	wret = dirty_tree_block(root, src_buf);
 	if (wret < 0)
 		ret = wret;
 
-	wret = write_tree_block(root, dst_buf);
+	wret = dirty_tree_block(root, dst_buf);
 	if (wret < 0)
 		ret = wret;
 	return ret;
@@ -612,7 +615,7 @@ static int insert_new_root(struct ctree_root *root,
 	tree_block_release(root, root->node);
 	root->node = t;
 	t->count++;
-	write_tree_block(root, t);
+	dirty_tree_block(root, t);
 	path->nodes[level] = t;
 	path->slots[level] = 0;
 	return 0;
@@ -652,7 +655,7 @@ static int insert_ptr(struct ctree_root *root,
 	lower->header.nritems++;
 	if (lower->keys[1].objectid == 0)
 			BUG();
-	write_tree_block(root, path->nodes[level]);
+	dirty_tree_block(root, path->nodes[level]);
 	return 0;
 }
 
@@ -698,10 +701,10 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 	c->header.nritems = mid;
 	ret = 0;
 
-	wret = write_tree_block(root, t);
+	wret = dirty_tree_block(root, t);
 	if (wret)
 		ret = wret;
-	wret = write_tree_block(root, split_buffer);
+	wret = dirty_tree_block(root, split_buffer);
 	if (wret)
 		ret = wret;
 	wret = insert_ptr(root, path, split->keys, split_buffer->blocknr,
@@ -815,11 +818,11 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	}
 	left->header.nritems -= push_items;
 
-	write_tree_block(root, left_buf);
-	write_tree_block(root, right_buf);
+	dirty_tree_block(root, left_buf);
+	dirty_tree_block(root, right_buf);
 	memcpy(upper->node.keys + slot + 1,
 		&right->items[0].key, sizeof(struct key));
-	write_tree_block(root, upper);
+	dirty_tree_block(root, upper);
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left->header.nritems) {
 		path->slots[0] -= left->header.nritems;
@@ -909,10 +912,10 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		push_space = right->items[i].offset;
 	}
 
-	wret = write_tree_block(root, t);
+	wret = dirty_tree_block(root, t);
 	if (wret)
 		ret = wret;
-	wret = write_tree_block(root, right_buf);
+	wret = dirty_tree_block(root, right_buf);
 	if (wret)
 		ret = wret;
 
@@ -1019,10 +1022,10 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 			  right_buffer->blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	wret = write_tree_block(root, right_buffer);
+	wret = dirty_tree_block(root, right_buffer);
 	if (wret)
 		ret = wret;
-	wret = write_tree_block(root, l_buf);
+	wret = dirty_tree_block(root, l_buf);
 	if (wret)
 		ret = wret;
 
@@ -1062,12 +1065,14 @@ int insert_item(struct ctree_root *root, struct key *key,
 	ret = search_slot(root, key, &path, data_size);
 	if (ret == 0) {
 		release_path(root, &path);
-		return -EEXIST;
-	}
-	if (ret < 0) {
-		release_path(root, &path);
+		ret = -EEXIST;
+		wret = commit_transaction(root);
+		if (wret)
+			ret = wret;
 		return ret;
 	}
+	if (ret < 0)
+		goto out;
 
 	slot_orig = path.slots[0];
 	leaf_buf = path.nodes[0];
@@ -1113,14 +1118,18 @@ int insert_item(struct ctree_root *root, struct key *key,
 	if (slot == 0)
 		ret = fixup_low_keys(root, &path, key, 1);
 
-	wret = write_tree_block(root, leaf_buf);
+	wret = dirty_tree_block(root, leaf_buf);
 	if (wret)
 		ret = wret;
 
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	check_leaf(&path, 0);
+out:
 	release_path(root, &path);
+	wret = commit_transaction(root);
+	if (wret)
+		ret = wret;
 	return ret;
 }
 
@@ -1160,7 +1169,7 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 		if (wret)
 			ret = wret;
 	}
-	wret = write_tree_block(root, parent);
+	wret = dirty_tree_block(root, parent);
 	if (wret)
 		ret = wret;
 	return ret;
@@ -1203,8 +1212,9 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	if (leaf->header.nritems == 0) {
 		if (leaf_buf == root->node) {
 			leaf->header.flags = node_level(0);
-			write_tree_block(root, leaf_buf);
+			dirty_tree_block(root, leaf_buf);
 		} else {
+			clean_tree_block(root, leaf_buf);
 			wret = del_ptr(root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
@@ -1220,7 +1230,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			if (wret)
 				ret = wret;
 		}
-		wret = write_tree_block(root, leaf_buf);
+		wret = dirty_tree_block(root, leaf_buf);
 		if (wret)
 			ret = wret;
 
@@ -1242,6 +1252,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			}
 			if (leaf->header.nritems == 0) {
 				u64 blocknr = leaf_buf->blocknr;
+				clean_tree_block(root, leaf_buf);
 				wret = del_ptr(root, path, 1, slot);
 				if (wret)
 					ret = wret;
@@ -1254,6 +1265,9 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			}
 		}
 	}
+	wret = commit_transaction(root);
+	if (wret)
+		ret = wret;
 	return ret;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 18daccd8453..6b4dabd4769 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,6 +1,8 @@
 #ifndef __CTREE__
 #define __CTREE__
 
+#include "list.h"
+
 #define CTREE_BLOCKSIZE 1024
 
 /*
@@ -53,6 +55,9 @@ struct ctree_root {
 	struct key current_insert;
 	int fp;
 	struct radix_tree_root cache_radix;
+	struct list_head trans;
+	struct list_head cache;
+	int cache_size;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c42dc72706b..656ace6147a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -11,6 +11,8 @@
 #include "disk-io.h"
 
 static int allocated_blocks = 0;
+int cache_size = 0;
+int cache_max = 10000;
 
 static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
@@ -21,6 +23,25 @@ static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
+/*
+ * once the root's cache has reached cache_max buffers, walk the cache list
+ * from its head and release every buffer whose count is 1, stopping as
+ * soon as cache_size is back under cache_max.  Buffers with a higher count
+ * are skipped.  A buffer being released here must not be on a dirty list.
+ * Always returns 0.
+ */
+static int free_some_buffers(struct ctree_root *root)
+{
+	struct list_head *pos, *tmp;
+	struct tree_buffer *buf;
+
+	list_for_each_safe(pos, tmp, &root->cache) {
+		/* cache_size only moves when a buffer is released below */
+		if (root->cache_size < cache_max)
+			break;
+		buf = list_entry(pos, struct tree_buffer, cache);
+		if (buf->count != 1)
+			continue;
+		BUG_ON(!list_empty(&buf->dirty));
+		list_del_init(&buf->cache);
+		tree_block_release(root, buf);
+	}
+	return 0;
+}
+
 struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
 {
 	struct tree_buffer *buf;
@@ -30,10 +51,14 @@ struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
 		return buf;
 	allocated_blocks++;
 	buf->blocknr = blocknr;
-	buf->count = 1;
+	buf->count = 2;
+	INIT_LIST_HEAD(&buf->dirty);
+	free_some_buffers(root);
 	radix_tree_preload(GFP_KERNEL);
 	ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
 	radix_tree_preload_end();
+	list_add_tail(&buf->cache, &root->cache);
+	root->cache_size++;
 	if (ret) {
 		free(buf);
 		return NULL;
@@ -57,7 +82,6 @@ struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
-
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 {
 	loff_t offset = blocknr * CTREE_BLOCKSIZE;
@@ -82,6 +106,24 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
+/*
+ * queue buf at the tail of the root's trans list and bump its count for
+ * the list's reference.  A buffer whose dirty entry is already linked
+ * somewhere is left untouched.  Always returns 0.
+ */
+int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+	if (list_empty(&buf->dirty)) {
+		list_add_tail(&buf->dirty, &root->trans);
+		buf->count++;
+	}
+	return 0;
+}
+
+/*
+ * undo dirty_tree_block: if buf is linked on a dirty list, unlink it and
+ * drop one reference with tree_block_release.  A buffer that is not dirty
+ * is left alone.  Always returns 0.
+ */
+int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+	if (list_empty(&buf->dirty))
+		return 0;
+
+	list_del_init(&buf->dirty);
+	tree_block_release(root, buf);
+	return 0;
+}
+
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
 	u64 blocknr = buf->blocknr;
@@ -96,9 +138,37 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
+/*
+ * write out every buffer queued on this root's trans list.  Each buffer is
+ * unlinked, written with write_tree_block and then released.  The list is
+ * always drained completely; the return value is 0 or the last non-zero
+ * write_tree_block result.
+ */
+static int __commit_transaction(struct ctree_root *root)
+{
+	struct list_head *dirty_list = &root->trans;
+	struct tree_buffer *buf;
+	int err = 0;
+
+	while (!list_empty(dirty_list)) {
+		int wret;
+
+		buf = list_entry(dirty_list->next, struct tree_buffer, dirty);
+		list_del_init(&buf->dirty);
+		wret = write_tree_block(root, buf);
+		if (wret)
+			err = wret;
+		tree_block_release(root, buf);
+	}
+	return err;
+}
+
+/*
+ * commit the dirty buffers of root and then, when root is not itself the
+ * extent root and the first commit succeeded, those of root->extent_root.
+ * Any failure trips BUG_ON, so a normal return is always 0.
+ */
+int commit_transaction(struct ctree_root *root)
+{
+	int ret = __commit_transaction(root);
+
+	if (ret == 0 && root->extent_root != root)
+		ret = __commit_transaction(root->extent_root);
+	BUG_ON(ret);
+	return ret;
+}
+
 static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 			struct ctree_root_info *info, int fp)
 {
+	INIT_LIST_HEAD(&root->trans);
+	INIT_LIST_HEAD(&root->cache);
 	root->fp = fp;
 	root->node = NULL;
 	root->node = read_tree_block(root, info->tree_root);
@@ -157,8 +227,23 @@ int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s)
 	return 0;
 }
 
+/*
+ * empty the root's cache list: every buffer on it is unlinked and one
+ * reference is dropped with tree_block_release.  Always returns 0.
+ */
+static int drop_cache(struct ctree_root *root)
+{
+	struct list_head *head = &root->cache;
+	struct tree_buffer *buf;
+
+	while (!list_empty(head)) {
+		buf = list_entry(head->next, struct tree_buffer, cache);
+		list_del_init(&buf->cache);
+		tree_block_release(root, buf);
+	}
+	return 0;
+}
 int close_ctree(struct ctree_root *root)
 {
+	drop_cache(root->extent_root);
+	drop_cache(root);
+	BUG_ON(!list_empty(&root->trans));
+	BUG_ON(!list_empty(&root->extent_root->trans));
+
 	close(root->fp);
 	if (root->node)
 		tree_block_release(root, root->node);
@@ -182,6 +267,8 @@ void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 		free(buf);
 		BUG_ON(allocated_blocks == 0);
 		allocated_blocks--;
+		BUG_ON(root->cache_size == 0);
+		root->cache_size--;
 	}
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2729b757ddc..b5dee2fae4d 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -1,5 +1,6 @@
 #ifndef __DISKIO__
 #define __DISKIO__
+#include "list.h"
 
 struct tree_buffer {
 	u64 blocknr;
@@ -8,11 +9,16 @@ struct tree_buffer {
 		struct node node;
 		struct leaf leaf;
 	};
+	struct list_head dirty;
+	struct list_head cache;
 };
 
 struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
 struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr);
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+int commit_transaction(struct ctree_root *root);
 struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
 int close_ctree(struct ctree_root *root);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
diff --git a/fs/btrfs/list.h b/fs/btrfs/list.h
new file mode 100644
index 00000000000..1aafafb1337
--- /dev/null
+++ b/fs/btrfs/list.h
@@ -0,0 +1,418 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+#define LIST_POISON1  ((void *) 0x00100100)
+#define LIST_POISON2  ((void *) 0x00200200)
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+/**
+ * INIT_LIST_HEAD - initialize a list head at run time
+ * @list: the list head to initialize
+ *
+ * Points both next and prev back at @list itself, which is the state
+ * list_empty() reports as empty.
+ */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+#else
+extern void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next);
+#endif
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+#else
+extern void list_add(struct list_head *new, struct list_head *head);
+#endif
+
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+#ifndef CONFIG_DEBUG_LIST
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+#else
+extern void list_del(struct list_head *entry);
+#endif
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+/**
+ * list_replace_init - replace old entry by new one and reinitialize old
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ *
+ * Same as list_replace(), but @old is reset with INIT_LIST_HEAD()
+ * afterwards so it no longer points into the list.
+ */
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *next = head->next;
+	return (next == head) && (next == head->prev);
+}
+
+/*
+ * Link the entries of @list (first through last) in between @head and
+ * the entry that used to follow @head.
+ *
+ * @list itself is not modified and still points at the moved entries.
+ * @list must not be empty: with an empty list, first and last are @list
+ * itself and it would get linked in as an entry.  list_splice() and
+ * list_splice_init() check list_empty() before calling this.
+ */
+static inline void __list_splice(struct list_head *list,
+				 struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	first->prev = head;
+	head->next = first;
+
+	last->next = at;
+	at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; prefetch(pos->next), pos != (head); \
+        	pos = pos->next)
+
+/**
+ * __list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
+        	pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_entry((head)->next, typeof(*pos), member);	\
+	     prefetch(pos->member.next), &pos->member != (head); 	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
+	     prefetch(pos->member.prev), &pos->member != (head); 	\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos:	the type * to use as a start point
+ * @head:	the head of the list
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+	((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
+	     prefetch(pos->member.next), &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; prefetch(pos->member.next), &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->next, typeof(*pos), member),	\
+		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
+		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+#endif
-- 
cgit v1.2.3


From f0930a37f1c096c3a8f6a17b1e251c7fdf4d4457 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Mar 2007 09:47:58 -0500
Subject: Btrfs: Fix extent code to use merge during delete; remove implicit
 commit in del_item and insert_item; add implicit commit to close(); add
 commit op in random-test

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 15 +++------------
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/random-test.c | 10 ++++++++--
 4 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 57fa505fb9f..3c5f4c2dd52 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1065,11 +1065,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	ret = search_slot(root, key, &path, data_size);
 	if (ret == 0) {
 		release_path(root, &path);
-		ret = -EEXIST;
-		wret = commit_transaction(root);
-		if (wret)
-			ret = wret;
-		return ret;
+		return -EEXIST;
 	}
 	if (ret < 0)
 		goto out;
@@ -1127,9 +1123,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 	check_leaf(&path, 0);
 out:
 	release_path(root, &path);
-	wret = commit_transaction(root);
-	if (wret)
-		ret = wret;
 	return ret;
 }
 
@@ -1245,7 +1238,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			wret = push_leaf_left(root, path, 1);
 			if (wret < 0)
 				ret = wret;
-			if (leaf->header.nritems) {
+			if (path->nodes[0] == leaf_buf &&
+			    leaf->header.nritems) {
 				wret = push_leaf_right(root, path, 1);
 				if (wret < 0)
 					ret = wret;
@@ -1265,9 +1259,6 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			}
 		}
 	}
-	wret = commit_transaction(root);
-	if (wret)
-		ret = wret;
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 656ace6147a..f7ca5362291 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -239,6 +239,7 @@ static int drop_cache(struct ctree_root *root)
 }
 int close_ctree(struct ctree_root *root)
 {
+	commit_transaction(root);
 	drop_cache(root->extent_root);
 	drop_cache(root);
 	BUG_ON(!list_empty(&root->trans));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 074f4b182f1..fb214449621 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -39,7 +39,7 @@ static int del_pending_extents(struct ctree_root *extent_root)
 			key.flags = 0;
 			key.offset = 1;
 			init_path(&path);
-			ret = search_slot(extent_root, &key, &path, 0);
+			ret = search_slot(extent_root, &key, &path, -1);
 			if (ret) {
 				print_tree(extent_root, extent_root->node);
 				printf("unable to find %Lu\n", key.objectid);
@@ -83,7 +83,7 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 		return 0;
 	}
 	init_path(&path);
-	ret = search_slot(extent_root, &key, &path, 0);
+	ret = search_slot(extent_root, &key, &path, -1);
 	if (ret) {
 		print_tree(extent_root, extent_root->node);
 		printf("failed to find %Lu\n", key.objectid);
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index bbd554e88db..22955753c3a 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -59,6 +59,11 @@ error:
 	return -1;
 }
 
+static int run_commit(struct ctree_root *root, struct radix_tree_root *radix)
+{
+	return commit_transaction(root);
+}
+
 static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
@@ -233,7 +238,8 @@ static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
 
 
 int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
-{ ins_one, insert_dup, del_one, lookup_item, lookup_enoent, bulk_op };
+	{ ins_one, insert_dup, del_one, lookup_item,
+	  lookup_enoent, bulk_op, run_commit };
 
 static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 {
@@ -366,7 +372,7 @@ int main(int ac, char **av)
 				err = ret;
 				goto out;
 			}
-			if (ops[op] == bulk_op)
+			if (ops[op] == bulk_op || ops[op] == run_commit)
 				break;
 			if (keep_running == 0) {
 				err = 0;
-- 
cgit v1.2.3


From 77ce6846c40e83193df01295e5af437f8b6c7a2d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Mar 2007 10:06:43 -0500
Subject: Btrfs: periodic commit during initial fill in the random tester

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  3 +--
 fs/btrfs/random-test.c | 14 +++++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f7ca5362291..b1a8149bbc8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -11,7 +11,6 @@
 #include "disk-io.h"
 
 static int allocated_blocks = 0;
-int cache_size = 0;
 int cache_max = 10000;
 
 static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
@@ -36,7 +35,7 @@ static int free_some_buffers(struct ctree_root *root)
 			list_del_init(&b->cache);
 			tree_block_release(root, b);
 			if (root->cache_size < cache_max)
-				return 0;
+				break;
 		}
 	}
 	return 0;
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 22955753c3a..53245c5039d 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -202,15 +202,23 @@ static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
 		     int count)
 {
 	int i;
-	int err;
 	int ret = 0;
 	for (i = 0; i < count; i++) {
 		ret = ins_one(root, radix);
 		if (ret) {
-			printf("fill failed\n");
-			err = ret;
+			fprintf(stderr, "fill failed\n");
 			goto out;
 		}
+		if (i % 1000 == 0) {
+			ret = commit_transaction(root);
+			if (ret) {
+				fprintf(stderr, "fill commit failed\n");
+				return ret;
+			}
+		}
+		if (i % 10000 == 0) {
+			printf("bigfill %d\n", i);
+		}
 		if (!keep_running)
 			break;
 	}
-- 
cgit v1.2.3


From 02217ed299c6340a35696e0610047eb96826de2d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Mar 2007 16:08:05 -0500
Subject: Btrfs: early reference counting

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 153 ++++++++++++++++++++++++++++---------------------
 fs/btrfs/ctree.h       |   3 +-
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/extent-tree.c |  45 ++++++++++++++-
 fs/btrfs/quick-test.c  |  10 ++--
 fs/btrfs/random-test.c |  12 ++--
 6 files changed, 145 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3c5f4c2dd52..1118986d756 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -34,6 +34,37 @@ void release_path(struct ctree_root *root, struct ctree_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
+int btrfs_cow_block(struct ctree_root *root,
+		    struct tree_buffer *buf,
+		    struct tree_buffer *parent,
+		    int parent_slot,
+		    struct tree_buffer **cow_ret)
+{
+	struct tree_buffer *cow;
+
+	if (!list_empty(&buf->dirty)) {
+		*cow_ret = buf;
+		return 0;
+	}
+	cow = alloc_free_block(root);
+	memcpy(&cow->node, &buf->node, sizeof(buf->node));
+	cow->node.header.blocknr = cow->blocknr;
+	*cow_ret = cow;
+	if (buf == root->node) {
+		root->node = cow;
+		cow->count++;
+		tree_block_release(root, buf);
+	} else {
+		parent->node.blockptrs[parent_slot] = cow->blocknr;
+		BUG_ON(list_empty(&parent->dirty));
+	}
+	if (0 && root != root->extent_root && !is_leaf(cow->node.header.flags)) {
+		btrfs_inc_ref(root, cow);
+	}
+	tree_block_release(root, buf);
+	return 0;
+}
+
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
@@ -263,6 +294,8 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 
 	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
+		btrfs_cow_block(root, left_buf, parent_buf,
+				pslot - 1, &left_buf);
 		left = &left_buf->node;
 		orig_slot += left->header.nritems;
 		wret = push_node_left(root, left_buf, mid_buf);
@@ -274,6 +307,8 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right_buf) {
+		btrfs_cow_block(root, right_buf, parent_buf,
+				pslot + 1, &right_buf);
 		right = &right_buf->node;
 		wret = push_node_left(root, mid_buf, right_buf);
 		if (wret < 0)
@@ -293,9 +328,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		} else {
 			memcpy(parent->keys + pslot + 1, right->keys,
 				sizeof(struct key));
-			wret = dirty_tree_block(root, parent_buf);
-			if (wret)
-				ret = wret;
+			BUG_ON(list_empty(&parent_buf->dirty));
 		}
 	}
 	if (mid->header.nritems == 1) {
@@ -330,9 +363,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	} else {
 		/* update the parent key to reflect our changes */
 		memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
-		wret = dirty_tree_block(root, parent_buf);
-		if (wret)
-			ret = wret;
+		BUG_ON(list_empty(&parent_buf->dirty));
 	}
 
 	/* update the path */
@@ -375,9 +406,10 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
  * possible)
  */
 int search_slot(struct ctree_root *root, struct key *key,
-		struct ctree_path *p, int ins_len)
+		struct ctree_path *p, int ins_len, int cow)
 {
 	struct tree_buffer *b;
+	struct tree_buffer *cow_buf;
 	struct node *c;
 	int slot;
 	int ret;
@@ -387,8 +419,15 @@ again:
 	b = root->node;
 	b->count++;
 	while (b) {
+		level = node_level(b->node.header.flags);
+		if (cow) {
+			int wret;
+			wret = btrfs_cow_block(root, b, p->nodes[level + 1],
+					       p->slots[level + 1], &cow_buf);
+			b = cow_buf;
+		}
+		BUG_ON(!cow && ins_len);
 		c = &b->node;
-		level = node_level(c->header.flags);
 		p->nodes[level] = b;
 		ret = check_block(p, level);
 		if (ret)
@@ -453,7 +492,6 @@ static int fixup_low_keys(struct ctree_root *root,
 {
 	int i;
 	int ret = 0;
-	int wret;
 	for (i = level; i < MAX_LEVEL; i++) {
 		struct node *t;
 		int tslot = path->slots[i];
@@ -461,9 +499,7 @@ static int fixup_low_keys(struct ctree_root *root,
 			break;
 		t = &path->nodes[i]->node;
 		memcpy(t->keys + tslot, key, sizeof(*key));
-		wret = dirty_tree_block(root, path->nodes[i]);
-		if (wret)
-			ret = wret;
+		BUG_ON(list_empty(&path->nodes[i]->dirty));
 		if (tslot != 0)
 			break;
 	}
@@ -486,7 +522,6 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	int src_nritems;
 	int dst_nritems;
 	int ret = 0;
-	int wret;
 
 	src_nritems = src->header.nritems;
 	dst_nritems = dst->header.nritems;
@@ -511,13 +546,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	src->header.nritems -= push_items;
 	dst->header.nritems += push_items;
 
-	wret = dirty_tree_block(root, src_buf);
-	if (wret < 0)
-		ret = wret;
-
-	wret = dirty_tree_block(root, dst_buf);
-	if (wret < 0)
-		ret = wret;
+	BUG_ON(list_empty(&src_buf->dirty));
+	BUG_ON(list_empty(&dst_buf->dirty));
 	return ret;
 }
 
@@ -541,7 +571,6 @@ static int balance_node_right(struct ctree_root *root,
 	int src_nritems;
 	int dst_nritems;
 	int ret = 0;
-	int wret;
 
 	src_nritems = src->header.nritems;
 	dst_nritems = dst->header.nritems;
@@ -569,13 +598,8 @@ static int balance_node_right(struct ctree_root *root,
 	src->header.nritems -= push_items;
 	dst->header.nritems += push_items;
 
-	wret = dirty_tree_block(root, src_buf);
-	if (wret < 0)
-		ret = wret;
-
-	wret = dirty_tree_block(root, dst_buf);
-	if (wret < 0)
-		ret = wret;
+	BUG_ON(list_empty(&src_buf->dirty));
+	BUG_ON(list_empty(&dst_buf->dirty));
 	return ret;
 }
 
@@ -615,7 +639,6 @@ static int insert_new_root(struct ctree_root *root,
 	tree_block_release(root, root->node);
 	root->node = t;
 	t->count++;
-	dirty_tree_block(root, t);
 	path->nodes[level] = t;
 	path->slots[level] = 0;
 	return 0;
@@ -655,7 +678,7 @@ static int insert_ptr(struct ctree_root *root,
 	lower->header.nritems++;
 	if (lower->keys[1].objectid == 0)
 			BUG();
-	dirty_tree_block(root, path->nodes[level]);
+	BUG_ON(list_empty(&path->nodes[level]->dirty));
 	return 0;
 }
 
@@ -701,12 +724,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 	c->header.nritems = mid;
 	ret = 0;
 
-	wret = dirty_tree_block(root, t);
-	if (wret)
-		ret = wret;
-	wret = dirty_tree_block(root, split_buffer);
-	if (wret)
-		ret = wret;
+	BUG_ON(list_empty(&t->dirty));
 	wret = insert_ptr(root, path, split->keys, split_buffer->blocknr,
 			  path->slots[level + 1] + 1, level + 1);
 	if (wret)
@@ -778,6 +796,15 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		tree_block_release(root, right_buf);
 		return 1;
 	}
+	/* cow and double check */
+	btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf);
+	right = &right_buf->leaf;
+	free_space = leaf_free_space(right);
+	if (free_space < data_size + sizeof(struct item)) {
+		tree_block_release(root, right_buf);
+		return 1;
+	}
+
 	for (i = left->header.nritems - 1; i >= 0; i--) {
 		item = left->items + i;
 		if (path->slots[0] == i)
@@ -818,11 +845,12 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	}
 	left->header.nritems -= push_items;
 
-	dirty_tree_block(root, left_buf);
-	dirty_tree_block(root, right_buf);
+	BUG_ON(list_empty(&left_buf->dirty));
+	BUG_ON(list_empty(&right_buf->dirty));
 	memcpy(upper->node.keys + slot + 1,
 		&right->items[0].key, sizeof(struct key));
-	dirty_tree_block(root, upper);
+	BUG_ON(list_empty(&upper->dirty));
+
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left->header.nritems) {
 		path->slots[0] -= left->header.nritems;
@@ -869,6 +897,16 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		tree_block_release(root, t);
 		return 1;
 	}
+
+	/* cow and double check */
+	btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t);
+	left = &t->leaf;
+	free_space = leaf_free_space(left);
+	if (free_space < data_size + sizeof(struct item)) {
+		tree_block_release(root, t);
+		return 1;
+	}
+
 	for (i = 0; i < right->header.nritems; i++) {
 		item = right->items + i;
 		if (path->slots[0] == i)
@@ -912,12 +950,8 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		push_space = right->items[i].offset;
 	}
 
-	wret = dirty_tree_block(root, t);
-	if (wret)
-		ret = wret;
-	wret = dirty_tree_block(root, right_buf);
-	if (wret)
-		ret = wret;
+	BUG_ON(list_empty(&t->dirty));
+	BUG_ON(list_empty(&right_buf->dirty));
 
 	wret = fixup_low_keys(root, path, &right->items[0].key, 1);
 	if (wret)
@@ -968,6 +1002,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 		if (wret < 0)
 			return wret;
 	}
+
 	l_buf = path->nodes[0];
 	l = &l_buf->leaf;
 
@@ -1022,13 +1057,8 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 			  right_buffer->blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	wret = dirty_tree_block(root, right_buffer);
-	if (wret)
-		ret = wret;
-	wret = dirty_tree_block(root, l_buf);
-	if (wret)
-		ret = wret;
-
+	BUG_ON(list_empty(&right_buffer->dirty));
+	BUG_ON(list_empty(&l_buf->dirty));
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
 		tree_block_release(root, path->nodes[0]);
@@ -1049,7 +1079,6 @@ int insert_item(struct ctree_root *root, struct key *key,
 			  void *data, int data_size)
 {
 	int ret = 0;
-	int wret;
 	int slot;
 	int slot_orig;
 	struct leaf *leaf;
@@ -1062,7 +1091,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	if (!root->node)
 		BUG();
 	init_path(&path);
-	ret = search_slot(root, key, &path, data_size);
+	ret = search_slot(root, key, &path, data_size, 1);
 	if (ret == 0) {
 		release_path(root, &path);
 		return -EEXIST;
@@ -1114,10 +1143,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	if (slot == 0)
 		ret = fixup_low_keys(root, &path, key, 1);
 
-	wret = dirty_tree_block(root, leaf_buf);
-	if (wret)
-		ret = wret;
-
+	BUG_ON(list_empty(&leaf_buf->dirty));
 	if (leaf_free_space(leaf) < 0)
 		BUG();
 	check_leaf(&path, 0);
@@ -1162,9 +1188,7 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 		if (wret)
 			ret = wret;
 	}
-	wret = dirty_tree_block(root, parent);
-	if (wret)
-		ret = wret;
+	BUG_ON(list_empty(&parent->dirty));
 	return ret;
 }
 
@@ -1205,7 +1229,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	if (leaf->header.nritems == 0) {
 		if (leaf_buf == root->node) {
 			leaf->header.flags = node_level(0);
-			dirty_tree_block(root, leaf_buf);
+			BUG_ON(list_empty(&leaf_buf->dirty));
 		} else {
 			clean_tree_block(root, leaf_buf);
 			wret = del_ptr(root, path, 1, path->slots[1]);
@@ -1223,9 +1247,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			if (wret)
 				ret = wret;
 		}
-		wret = dirty_tree_block(root, leaf_buf);
-		if (wret)
-			ret = wret;
+		BUG_ON(list_empty(&leaf_buf->dirty));
 
 		/* delete the leaf if it is mostly empty */
 		if (used < LEAF_DATA_SIZE / 3) {
@@ -1304,3 +1326,4 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	return 0;
 }
 
+
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6b4dabd4769..9fe8ba6e25c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -142,8 +142,9 @@ struct ctree_path {
 };
 
 struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf);
 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
-int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len);
+int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len, int cow);
 void release_path(struct ctree_root *root, struct ctree_path *p);
 void init_path(struct ctree_path *p);
 int del_item(struct ctree_root *root, struct ctree_path *path);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b1a8149bbc8..0e1c31e682f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -260,6 +260,8 @@ void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
 	if (buf->count < 0)
 		BUG();
 	if (buf->count == 0) {
+		BUG_ON(!list_empty(&buf->cache));
+		BUG_ON(!list_empty(&buf->dirty));
 		if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
 			BUG();
 		radix_tree_delete(&root->cache_radix, buf->blocknr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fb214449621..25d9cd16920 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -15,6 +15,39 @@
  */
 #define CTREE_EXTENT_PENDING 0
 
+static int inc_block_ref(struct ctree_root *root, u64 blocknr)
+{
+	struct ctree_path path;
+	int ret;
+	struct key key;
+	struct leaf *l;
+	struct extent_item *item;
+	init_path(&path);
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = 1;
+	ret = search_slot(root->extent_root, &key, &path, 0, 1);
+	BUG_ON(ret != 0);
+	l = &path.nodes[0]->leaf;
+	item = (struct extent_item *)(l->data +
+				      l->items[path.slots[0]].offset);
+	item->refs++;
+	BUG_ON(list_empty(&path.nodes[0]->dirty));
+	release_path(root->extent_root, &path);
+	return 0;
+}
+
+int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
+{
+	u64 blocknr;
+	int i;
+	for (i = 0; i < buf->node.header.nritems; i++) {
+		blocknr = buf->node.blockptrs[i];
+		inc_block_ref(root, blocknr);
+	}
+	return 0;
+}
+
 /*
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
@@ -39,7 +72,7 @@ static int del_pending_extents(struct ctree_root *extent_root)
 			key.flags = 0;
 			key.offset = 1;
 			init_path(&path);
-			ret = search_slot(extent_root, &key, &path, -1);
+			ret = search_slot(extent_root, &key, &path, -1, 1);
 			if (ret) {
 				print_tree(extent_root, extent_root->node);
 				printf("unable to find %Lu\n", key.objectid);
@@ -83,7 +116,7 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 		return 0;
 	}
 	init_path(&path);
-	ret = search_slot(extent_root, &key, &path, -1);
+	ret = search_slot(extent_root, &key, &path, -1, 1);
 	if (ret) {
 		print_tree(extent_root, extent_root->node);
 		printf("failed to find %Lu\n", key.objectid);
@@ -124,7 +157,7 @@ check_failed:
 	ins->offset = 0;
 	ins->flags = 0;
 	start_found = 0;
-	ret = search_slot(root, ins, &path, 0);
+	ret = search_slot(root, ins, &path, 0, 0);
 	if (ret < 0)
 		goto error;
 
@@ -221,6 +254,8 @@ static int insert_pending_extents(struct ctree_root *extent_root)
 			ret = insert_item(extent_root, &key, &item,
 					  sizeof(item));
 			if (ret) {
+				printf("%Lu already in tree\n", key.objectid);
+				print_tree(extent_root, extent_root->node);
 				BUG();
 				// FIXME undo it and return sane
 				return ret;
@@ -228,6 +263,7 @@ static int insert_pending_extents(struct ctree_root *extent_root)
 			radix_tree_tag_clear(&extent_root->cache_radix,
 					     gang[i]->blocknr,
 					     CTREE_EXTENT_PENDING);
+			printf("%Lu is not pending\n", gang[i]->blocknr);
 			tree_block_release(extent_root, gang[i]);
 		}
 	}
@@ -266,15 +302,18 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 		if (pending_ret)
 			return pending_ret;
 		*buf = find_tree_block(root, ins->objectid);
+		dirty_tree_block(root, *buf);
 		return 0;
 	}
 	/* we're allocating an extent for the extent tree, don't recurse */
 	BUG_ON(ins->offset != 1);
 	*buf = find_tree_block(root, ins->objectid);
 	BUG_ON(!*buf);
+	printf("%Lu is pending\n", ins->objectid);
 	radix_tree_tag_set(&root->cache_radix, ins->objectid,
 			   CTREE_EXTENT_PENDING);
 	(*buf)->count++;
+	dirty_tree_block(root, *buf);
 	return 0;
 
 }
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index dbd00c3b7ab..8255f79ceca 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -19,7 +19,7 @@ int main(int ac, char **av) {
 	int i;
 	int num;
 	int ret;
-	int run_size = 100000;
+	int run_size = 1024;
 	int max_key =  100000000;
 	int tree_size = 0;
 	struct ctree_path path;
@@ -57,7 +57,7 @@ int main(int ac, char **av) {
 		init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0);
+		ret = search_slot(root, &ins, &path, 0, 0);
 		if (ret) {
 			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -79,7 +79,7 @@ int main(int ac, char **av) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1);
+		ret = search_slot(root, &ins, &path, -1, 1);
 		if (!ret) {
 			if (i % 10000 == 0)
 				fprintf(stderr, "del %d:%d\n", num, i);
@@ -117,7 +117,7 @@ int main(int ac, char **av) {
 		init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0);
+		ret = search_slot(root, &ins, &path, 0, 0);
 		if (ret) {
 			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -131,7 +131,7 @@ int main(int ac, char **av) {
 		int slot;
 		ins.objectid = (u64)-1;
 		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1);
+		ret = search_slot(root, &ins, &path, -1, 1);
 		if (ret == 0)
 			BUG();
 
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 53245c5039d..dcc852ad673 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -93,7 +93,7 @@ static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = search_slot(root, &key, &path, -1);
+	ret = search_slot(root, &key, &path, -1, 1);
 	if (ret)
 		goto error;
 	ret = del_item(root, &path);
@@ -118,7 +118,7 @@ static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = search_slot(root, &key, &path, 0);
+	ret = search_slot(root, &key, &path, 0, 1);
 	release_path(root, &path);
 	if (ret)
 		goto error;
@@ -137,7 +137,7 @@ static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 0);
 	if (ret < 0)
 		return ret;
-	ret = search_slot(root, &key, &path, 0);
+	ret = search_slot(root, &key, &path, 0, 0);
 	release_path(root, &path);
 	if (ret <= 0)
 		goto error;
@@ -163,7 +163,7 @@ static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
 	key.objectid = (unsigned long)-1;
 	while(nr-- >= 0) {
 		init_path(&path);
-		ret = search_slot(root, &key, &path, -1);
+		ret = search_slot(root, &key, &path, -1, 1);
 		if (ret < 0) {
 			release_path(root, &path);
 			return ret;
@@ -216,7 +216,7 @@ static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
 				return ret;
 			}
 		}
-		if (i % 10000 == 0) {
+		if (i && i % 10000 == 0) {
 			printf("bigfill %d\n", i);
 		}
 		if (!keep_running)
@@ -263,7 +263,7 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 	key.objectid = (unsigned long)-1;
 	while(1) {
 		init_path(&path);
-		ret = search_slot(root, &key, &path, 0);
+		ret = search_slot(root, &key, &path, 0, 0);
 		if (ret < 0) {
 			release_path(root, &path);
 			return ret;
-- 
cgit v1.2.3


From a28ec19775d62d673b034082128aca95780d3737 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Mar 2007 20:08:01 -0500
Subject: Btrfs: Fixup reference counting on cows

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.c       |   9 +-
 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/disk-io.c     |  31 +++++-
 fs/btrfs/disk-io.h     |   4 +-
 fs/btrfs/extent-tree.c | 291 ++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/print-tree.c  |   6 +-
 fs/btrfs/quick-test.c  |  26 +++--
 fs/btrfs/random-test.c |  21 ++--
 9 files changed, 267 insertions(+), 127 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 756b1cd4cf4..ae7f4c00c39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,7 +1,7 @@
 
 CC=gcc
 CFLAGS = -g -Wall
-headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h
+headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
 # if you don't have sparse installed, use ls instead
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1118986d756..72816381d20 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -50,16 +50,17 @@ int btrfs_cow_block(struct ctree_root *root,
 	memcpy(&cow->node, &buf->node, sizeof(buf->node));
 	cow->node.header.blocknr = cow->blocknr;
 	*cow_ret = cow;
+	btrfs_inc_ref(root, buf);
 	if (buf == root->node) {
 		root->node = cow;
 		cow->count++;
+		if (buf != root->commit_root)
+			free_extent(root, buf->blocknr, 1);
 		tree_block_release(root, buf);
 	} else {
 		parent->node.blockptrs[parent_slot] = cow->blocknr;
 		BUG_ON(list_empty(&parent->dirty));
-	}
-	if (0 && root != root->extent_root && !is_leaf(cow->node.header.flags)) {
-		btrfs_inc_ref(root, cow);
+		free_extent(root, buf->blocknr, 1);
 	}
 	tree_block_release(root, buf);
 	return 0;
@@ -1018,7 +1019,6 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	slot = path->slots[0];
 	nritems = l->header.nritems;
 	mid = (nritems + 1)/ 2;
-
 	right_buffer = alloc_free_block(root);
 	BUG_ON(!right_buffer);
 	BUG_ON(mid == nritems);
@@ -1170,7 +1170,6 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 
 	node = &parent->node;
 	nritems = node->header.nritems;
-
 	if (slot != nritems -1) {
 		memmove(node->keys + slot, node->keys + slot + 1,
 			sizeof(struct key) * (nritems - slot - 1));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9fe8ba6e25c..4a7bc4e6e74 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -51,10 +51,12 @@ struct tree_buffer;
  */
 struct ctree_root {
 	struct tree_buffer *node;
+	struct tree_buffer *commit_root;
 	struct ctree_root *extent_root;
 	struct key current_insert;
 	int fp;
 	struct radix_tree_root cache_radix;
+	struct radix_tree_root pinned_radix;
 	struct list_head trans;
 	struct list_head cache;
 	int cache_size;
@@ -151,4 +153,6 @@ int del_item(struct ctree_root *root, struct ctree_path *path);
 int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size);
 int next_leaf(struct ctree_root *root, struct ctree_path *path);
 int leaf_free_space(struct leaf *leaf);
+int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap);
+int btrfs_finish_extent_commit(struct ctree_root *root);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0e1c31e682f..2fe31c3508c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -153,13 +153,24 @@ static int __commit_transaction(struct ctree_root *root)
 	return ret;
 }
 
-int commit_transaction(struct ctree_root *root)
+int commit_transaction(struct ctree_root *root, struct ctree_super_block *s)
 {
-	int ret;
+	int ret = 0;
+
 	ret = __commit_transaction(root);
 	if (!ret && root != root->extent_root)
 		ret = __commit_transaction(root->extent_root);
 	BUG_ON(ret);
+	if (root->commit_root != root->node) {
+		struct tree_buffer *snap = root->commit_root;
+		root->commit_root = root->node;
+		root->node->count++;
+		ret = btrfs_drop_snapshot(root, snap);
+		BUG_ON(ret);
+		tree_block_release(root, snap);
+	}
+        write_ctree_super(root, s);
+	btrfs_finish_extent_commit(root);
 	return ret;
 }
 
@@ -168,10 +179,13 @@ static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 {
 	INIT_LIST_HEAD(&root->trans);
 	INIT_LIST_HEAD(&root->cache);
+	root->cache_size = 0;
 	root->fp = fp;
 	root->node = NULL;
-	root->node = read_tree_block(root, info->tree_root);
 	root->extent_root = extent_root;
+	root->commit_root = NULL;
+	root->node = read_tree_block(root, info->tree_root);
+	memset(&root->current_insert, 0, sizeof(root->current_insert));
 	return 0;
 }
 
@@ -188,6 +202,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 		return NULL;
 	}
 	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
 	ret = pread(fp, super, sizeof(struct ctree_super_block),
 		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
@@ -204,6 +220,8 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 	BUG_ON(ret < 0);
 	__setup_root(root, extent_root, &super->root_info, fp);
 	__setup_root(extent_root, extent_root, &super->extent_info, fp);
+	root->commit_root = root->node;
+	root->node->count++;
 	return root;
 }
 
@@ -236,9 +254,11 @@ static int drop_cache(struct ctree_root *root)
 	}
 	return 0;
 }
-int close_ctree(struct ctree_root *root)
+int close_ctree(struct ctree_root *root, struct ctree_super_block *s)
 {
-	commit_transaction(root);
+	commit_transaction(root, s);
+	__commit_transaction(root->extent_root);
+	write_ctree_super(root, s);
 	drop_cache(root->extent_root);
 	drop_cache(root);
 	BUG_ON(!list_empty(&root->trans));
@@ -249,6 +269,7 @@ int close_ctree(struct ctree_root *root)
 		tree_block_release(root, root->node);
 	if (root->extent_root->node)
 		tree_block_release(root->extent_root, root->extent_root->node);
+	tree_block_release(root, root->commit_root);
 	free(root);
 	printf("on close %d blocks are allocated\n", allocated_blocks);
 	return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b5dee2fae4d..1c0af7c56c2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -18,9 +18,9 @@ struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr);
 int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
 int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf);
 int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf);
-int commit_transaction(struct ctree_root *root);
+int commit_transaction(struct ctree_root *root, struct ctree_super_block *s);
 struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
-int close_ctree(struct ctree_root *root);
+int close_ctree(struct ctree_root *root, struct ctree_super_block *s);
 void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
 int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s);
 int mkfs(int fd);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 25d9cd16920..0723b7f3f0c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -13,7 +13,8 @@
  * other allocations are done.  The pending tag is also used in the same
  * manner for deletes.
  */
-#define CTREE_EXTENT_PENDING 0
+#define CTREE_EXTENT_PENDING_ADD 0
+#define CTREE_EXTENT_PENDING_DEL 1
 
 static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 {
@@ -27,20 +28,51 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 	key.flags = 0;
 	key.offset = 1;
 	ret = search_slot(root->extent_root, &key, &path, 0, 1);
+	if (ret != 0)
+		BUG();
 	BUG_ON(ret != 0);
 	l = &path.nodes[0]->leaf;
 	item = (struct extent_item *)(l->data +
 				      l->items[path.slots[0]].offset);
 	item->refs++;
+
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
 	release_path(root->extent_root, &path);
 	return 0;
 }
 
+static int lookup_block_ref(struct ctree_root *root, u64 blocknr, int *refs)
+{
+	struct ctree_path path;
+	int ret;
+	struct key key;
+	struct leaf *l;
+	struct extent_item *item;
+	init_path(&path);
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = 1;
+	ret = search_slot(root->extent_root, &key, &path, 0, 0);
+	if (ret != 0)
+		BUG();
+	l = &path.nodes[0]->leaf;
+	item = (struct extent_item *)(l->data +
+				      l->items[path.slots[0]].offset);
+	*refs = item->refs;
+	release_path(root->extent_root, &path);
+	return 0;
+}
+
 int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
 {
 	u64 blocknr;
 	int i;
+
+	if (root == root->extent_root)
+		return 0;
+	if (is_leaf(buf->node.header.flags))
+		return 0;
+
 	for (i = 0; i < buf->node.header.nritems; i++) {
 		blocknr = buf->node.blockptrs[i];
 		inc_block_ref(root, blocknr);
@@ -48,85 +80,187 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
+int btrfs_finish_extent_commit(struct ctree_root *root)
+{
+	struct ctree_root *extent_root = root->extent_root;
+	unsigned long gang[8];
+	int ret;
+	int i;
+
+	while(1) {
+		ret = radix_tree_gang_lookup(&extent_root->pinned_radix,
+						 (void **)gang, 0,
+						 ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++)
+			radix_tree_delete(&extent_root->pinned_radix, gang[i]);
+	}
+	return 0;
+}
+
 /*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
  */
-static int del_pending_extents(struct ctree_root *extent_root)
+int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+{
+	struct ctree_path path;
+	struct key key;
+	struct ctree_root *extent_root = root->extent_root;
+	int ret;
+	struct item *item;
+	struct extent_item *ei;
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = num_blocks;
+
+	init_path(&path);
+	ret = search_slot(extent_root, &key, &path, -1, 1);
+	if (ret) {
+		printf("failed to find %Lu\n", key.objectid);
+		print_tree(extent_root, extent_root->node);
+		printf("failed to find %Lu\n", key.objectid);
+		BUG();
+	}
+	item = path.nodes[0]->leaf.items + path.slots[0];
+	ei = (struct extent_item *)(path.nodes[0]->leaf.data + item->offset);
+	BUG_ON(ei->refs == 0);
+	ei->refs--;
+	if (ei->refs == 0) {
+		if (root == extent_root) {
+			int err;
+			radix_tree_preload(GFP_KERNEL);
+			err = radix_tree_insert(&extent_root->pinned_radix,
+					  blocknr, (void *)blocknr);
+			BUG_ON(err);
+			radix_tree_preload_end();
+		}
+		ret = del_item(extent_root, &path);
+		if (ret)
+			BUG();
+	}
+	release_path(extent_root, &path);
+	return ret;
+}
+
+/*
+ * insert all of the pending extents reserved during the original
+ * allocation.  (CTREE_EXTENT_PENDING).  Returns zero if it all worked out
+ */
+static int insert_pending_extents(struct ctree_root *extent_root)
 {
 	int ret;
 	struct key key;
+	struct extent_item item;
 	struct tree_buffer *gang[4];
 	int i;
-	struct ctree_path path;
 
+	// FIXME -ENOSPC
+	item.owner = extent_root->node->node.header.parentid;
+	item.refs = 1;
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
 						 (void **)gang, 0,
 						 ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING);
+						 CTREE_EXTENT_PENDING_ADD);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
 			key.objectid = gang[i]->blocknr;
 			key.flags = 0;
 			key.offset = 1;
-			init_path(&path);
-			ret = search_slot(extent_root, &key, &path, -1, 1);
+			ret = insert_item(extent_root, &key, &item,
+					  sizeof(item));
 			if (ret) {
+				printf("%Lu already in tree\n", key.objectid);
 				print_tree(extent_root, extent_root->node);
-				printf("unable to find %Lu\n", key.objectid);
 				BUG();
 				// FIXME undo it and return sane
 				return ret;
 			}
-			ret = del_item(extent_root, &path);
-			if (ret) {
-				BUG();
-				return ret;
-			}
-			release_path(extent_root, &path);
+			radix_tree_tag_clear(&extent_root->cache_radix,
+					     gang[i]->blocknr,
+					     CTREE_EXTENT_PENDING_ADD);
+			tree_block_release(extent_root, gang[i]);
+		}
+	}
+	return 0;
+}
+
+/*
+ * find all the blocks marked as pending in the radix tree and remove
+ * them from the extent map
+ */
+static int del_pending_extents(struct ctree_root *extent_root)
+{
+	int ret;
+	struct tree_buffer *gang[4];
+	int i;
+
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
+						 (void **)gang, 0,
+						 ARRAY_SIZE(gang),
+						 CTREE_EXTENT_PENDING_DEL);
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			ret = __free_extent(extent_root, gang[i]->blocknr, 1);
 			radix_tree_tag_clear(&extent_root->cache_radix,
 						gang[i]->blocknr,
-						CTREE_EXTENT_PENDING);
+						CTREE_EXTENT_PENDING_DEL);
 			tree_block_release(extent_root, gang[i]);
 		}
 	}
 	return 0;
 }
 
+static int run_pending(struct ctree_root *extent_root)
+{
+	while(radix_tree_tagged(&extent_root->cache_radix,
+			        CTREE_EXTENT_PENDING_DEL) ||
+	      radix_tree_tagged(&extent_root->cache_radix,
+				CTREE_EXTENT_PENDING_ADD)) {
+		insert_pending_extents(extent_root);
+		del_pending_extents(extent_root);
+	}
+	return 0;
+}
+
+
 /*
  * remove an extent from the root, returns 0 on success
  */
 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 {
-	struct ctree_path path;
 	struct key key;
 	struct ctree_root *extent_root = root->extent_root;
 	struct tree_buffer *t;
 	int pending_ret;
 	int ret;
-	key.objectid = blocknr;
-	key.flags = 0;
-	key.offset = num_blocks;
+
 	if (root == extent_root) {
-		t = read_tree_block(root, key.objectid);
-		radix_tree_tag_set(&root->cache_radix, key.objectid,
-				   CTREE_EXTENT_PENDING);
+		t = find_tree_block(root, blocknr);
+		if (radix_tree_tag_get(&root->cache_radix, blocknr,
+				      CTREE_EXTENT_PENDING_ADD)) {
+			radix_tree_tag_clear(&root->cache_radix,
+					     blocknr,
+					     CTREE_EXTENT_PENDING_ADD);
+			/* once for us */
+			tree_block_release(root, t);
+			/* once for the pending add */
+			tree_block_release(root, t);
+		} else {
+			radix_tree_tag_set(&root->cache_radix, blocknr,
+				   CTREE_EXTENT_PENDING_DEL);
+		}
 		return 0;
 	}
-	init_path(&path);
-	ret = search_slot(extent_root, &key, &path, -1, 1);
-	if (ret) {
-		print_tree(extent_root, extent_root->node);
-		printf("failed to find %Lu\n", key.objectid);
-		BUG();
-	}
-	ret = del_item(extent_root, &path);
-	if (ret)
-		BUG();
-	release_path(extent_root, &path);
-	pending_ret = del_pending_extents(root->extent_root);
+	key.objectid = blocknr;
+	key.flags = 0;
+	key.offset = num_blocks;
+	ret = __free_extent(root, blocknr, num_blocks);
+	pending_ret = run_pending(root->extent_root);
 	return ret ? ret : pending_ret;
 }
 
@@ -203,7 +337,7 @@ check_pending:
 	 */
 	release_path(root, &path);
 	BUG_ON(ins->objectid < search_start);
-	if (orig_root->extent_root == orig_root) {
+	if (1 || orig_root->extent_root == orig_root) {
 		BUG_ON(num_blocks != 1);
 		if ((root->current_insert.objectid <= ins->objectid &&
 		    root->current_insert.objectid +
@@ -211,8 +345,9 @@ check_pending:
 		   (root->current_insert.objectid > ins->objectid &&
 		    root->current_insert.objectid <= ins->objectid +
 		    ins->offset) ||
+		   radix_tree_lookup(&root->pinned_radix, ins->objectid) ||
 		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
-				      CTREE_EXTENT_PENDING)) {
+				      CTREE_EXTENT_PENDING_ADD)) {
 			search_start = ins->objectid + 1;
 			goto check_failed;
 		}
@@ -225,51 +360,6 @@ error:
 	return ret;
 }
 
-/*
- * insert all of the pending extents reserved during the original
- * allocation.  (CTREE_EXTENT_PENDING).  Returns zero if it all worked out
- */
-static int insert_pending_extents(struct ctree_root *extent_root)
-{
-	int ret;
-	struct key key;
-	struct extent_item item;
-	struct tree_buffer *gang[4];
-	int i;
-
-	// FIXME -ENOSPC
-	item.refs = 1;
-	item.owner = extent_root->node->node.header.parentid;
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
-						 (void **)gang, 0,
-						 ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING);
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			key.objectid = gang[i]->blocknr;
-			key.flags = 0;
-			key.offset = 1;
-			ret = insert_item(extent_root, &key, &item,
-					  sizeof(item));
-			if (ret) {
-				printf("%Lu already in tree\n", key.objectid);
-				print_tree(extent_root, extent_root->node);
-				BUG();
-				// FIXME undo it and return sane
-				return ret;
-			}
-			radix_tree_tag_clear(&extent_root->cache_radix,
-					     gang[i]->blocknr,
-					     CTREE_EXTENT_PENDING);
-			printf("%Lu is not pending\n", gang[i]->blocknr);
-			tree_block_release(extent_root, gang[i]);
-		}
-	}
-	return 0;
-}
-
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -296,7 +386,7 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 				  sizeof(extent_item));
 		memset(&root->extent_root->current_insert, 0,
 		       sizeof(struct key));
-		pending_ret = insert_pending_extents(root->extent_root);
+		pending_ret = run_pending(root->extent_root);
 		if (ret)
 			return ret;
 		if (pending_ret)
@@ -309,9 +399,8 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 	BUG_ON(ins->offset != 1);
 	*buf = find_tree_block(root, ins->objectid);
 	BUG_ON(!*buf);
-	printf("%Lu is pending\n", ins->objectid);
 	radix_tree_tag_set(&root->cache_radix, ins->objectid,
-			   CTREE_EXTENT_PENDING);
+			   CTREE_EXTENT_PENDING_ADD);
 	(*buf)->count++;
 	dirty_tree_block(root, *buf);
 	return 0;
@@ -331,13 +420,41 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
 			   root->node->node.header.parentid,
 			   &ins, &buf);
-
 	if (ret) {
 		BUG();
 		return NULL;
 	}
 	if (root != root->extent_root)
 		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix,
-					  buf->blocknr, CTREE_EXTENT_PENDING));
+					  buf->blocknr,
+					  CTREE_EXTENT_PENDING_ADD));
 	return buf;
 }
+
+int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
+{
+	int ret;
+	int level;
+	int refs;
+	u64 blocknr = snap->blocknr;
+
+	level = node_level(snap->node.header.flags);
+	ret = lookup_block_ref(root, snap->blocknr, &refs);
+	BUG_ON(ret);
+	if (refs == 1 && level != 0) {
+		struct node *n = &snap->node;
+		struct tree_buffer *b;
+		int i;
+		for (i = 0; i < n->header.nritems; i++) {
+			b = read_tree_block(root, n->blockptrs[i]);
+			/* FIXME, don't recurse here */
+			ret = btrfs_drop_snapshot(root, b);
+			BUG_ON(ret);
+			tree_block_release(root, b);
+		}
+	}
+	ret = free_extent(root, blocknr, 1);
+	BUG_ON(ret);
+	return 0;
+}
+
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index dda08f32c15..e32a959dd3e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -21,9 +21,11 @@ void print_leaf(struct leaf *l)
 			item->key.objectid, item->key.flags, item->key.offset,
 			item->offset, item->size);
 		fflush(stdout);
-		printf("\t\titem data %.*s\n", item->size, l->data+item->offset);
+		printf("\t\titem data %.*s\n", item->size,
+			l->data+item->offset);
 		ei = (struct extent_item *)(l->data + item->offset);
-		printf("\t\textent data %u %Lu\n", ei->refs, ei->owner);
+		printf("\t\textent data refs %u owner %Lu\n", ei->refs,
+			ei->owner);
 		fflush(stdout);
 	}
 }
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index 8255f79ceca..6400c7100a6 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -19,7 +19,7 @@ int main(int ac, char **av) {
 	int i;
 	int num;
 	int ret;
-	int run_size = 1024;
+	int run_size = 100000;
 	int max_key =  100000000;
 	int tree_size = 0;
 	struct ctree_path path;
@@ -44,9 +44,9 @@ int main(int ac, char **av) {
 		if (!ret)
 			tree_size++;
 		free(buf);
+
 	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 
 	root = open_ctree("dbfile", &super);
 	printf("starting search\n");
@@ -65,8 +65,7 @@ int main(int ac, char **av) {
 		}
 		release_path(root, &path);
 	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
 	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
 	        node_level(root->node->node.header.flags),
@@ -90,8 +89,7 @@ int main(int ac, char **av) {
 		}
 		release_path(root, &path);
 	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
 	srand(128);
 	for (i = 0; i < run_size; i++) {
@@ -106,8 +104,7 @@ int main(int ac, char **av) {
 			tree_size++;
 		free(buf);
 	}
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
 	srand(128);
 	printf("starting search2\n");
@@ -156,10 +153,17 @@ int main(int ac, char **av) {
 		}
 		release_path(root, &path);
 	}
+	/*
+	printf("previous tree:\n");
+	print_tree(root, root->commit_root);
+	printf("map before commit\n");
+	print_tree(root->extent_root, root->extent_root->node);
+	*/
+	commit_transaction(root, &super);
 	printf("tree size is now %d\n", tree_size);
+	printf("root %p commit root %p\n", root->node, root->commit_root);
 	printf("map tree\n");
 	print_tree(root->extent_root, root->extent_root->node);
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 	return 0;
 }
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index dcc852ad673..7b37b6bae10 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -8,6 +8,7 @@
 #include "print-tree.h"
 
 int keep_running = 1;
+struct ctree_super_block super;
 
 static int setup_key(struct radix_tree_root *root, struct key *key, int exists)
 {
@@ -59,11 +60,6 @@ error:
 	return -1;
 }
 
-static int run_commit(struct ctree_root *root, struct radix_tree_root *radix)
-{
-	return commit_transaction(root);
-}
-
 static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
@@ -210,7 +206,7 @@ static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
 			goto out;
 		}
 		if (i % 1000 == 0) {
-			ret = commit_transaction(root);
+			ret = commit_transaction(root, &super);
 			if (ret) {
 				fprintf(stderr, "fill commit failed\n");
 				return ret;
@@ -229,7 +225,7 @@ out:
 static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	int ret;
-	int nr = rand() % 20000;
+	int nr = rand() % 5000;
 	static int run_nr = 0;
 
 	/* do the bulk op much less frequently */
@@ -247,7 +243,7 @@ static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
 
 int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
 	{ ins_one, insert_dup, del_one, lookup_item,
-	  lookup_enoent, bulk_op, run_commit };
+	  lookup_enoent, bulk_op };
 
 static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 {
@@ -314,7 +310,6 @@ int print_usage(void)
 int main(int ac, char **av)
 {
 	RADIX_TREE(radix, GFP_KERNEL);
-	struct ctree_super_block super;
 	struct ctree_root *root;
 	int i;
 	int ret;
@@ -365,8 +360,7 @@ int main(int ac, char **av)
 			printf("open & close, root level %d nritems %d\n",
 				node_level(root->node->node.header.flags),
 				root->node->node.header.nritems);
-			write_ctree_super(root, &super);
-			close_ctree(root);
+			close_ctree(root, &super);
 			root = open_ctree("dbfile", &super);
 		}
 		while(count--) {
@@ -380,7 +374,7 @@ int main(int ac, char **av)
 				err = ret;
 				goto out;
 			}
-			if (ops[op] == bulk_op || ops[op] == run_commit)
+			if (ops[op] == bulk_op)
 				break;
 			if (keep_running == 0) {
 				err = 0;
@@ -389,8 +383,7 @@ int main(int ac, char **av)
 		}
 	}
 out:
-	write_ctree_super(root, &super);
-	close_ctree(root);
+	close_ctree(root, &super);
 	return err;
 }
 
-- 
cgit v1.2.3


From 037e6390488af8ab96137e1e5cccc15ad14ef887 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Mar 2007 11:50:24 -0500
Subject: Btrfs: get rid of add recursion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.c       |   9 ---
 fs/btrfs/extent-tree.c | 209 ++++++++++++++++++++++---------------------------
 3 files changed, 96 insertions(+), 124 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ae7f4c00c39..d92d08dde0f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,6 +1,6 @@
 
 CC=gcc
-CFLAGS = -g -Wall
+CFLAGS = -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 72816381d20..729d4ddb374 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -995,15 +995,6 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	int ret;
 	int wret;
 
-	wret = push_leaf_left(root, path, data_size);
-	if (wret < 0)
-		return wret;
-	if (wret) {
-		wret = push_leaf_right(root, path, data_size);
-		if (wret < 0)
-			return wret;
-	}
-
 	l_buf = path->nodes[0];
 	l = &l_buf->leaf;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0723b7f3f0c..8a2b8aaf9b8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6,6 +6,11 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
+static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+			    u64 search_start, u64 search_end, struct key *ins);
+static int finish_current_insert(struct ctree_root *extent_root);
+static int run_pending(struct ctree_root *extent_root);
+
 /*
  * pending extents are blocks that we're trying to allocate in the extent
  * map while trying to grow the map because of other allocations.  To avoid
@@ -13,8 +18,7 @@
  * other allocations are done.  The pending tag is also used in the same
  * manner for deletes.
  */
-#define CTREE_EXTENT_PENDING_ADD 0
-#define CTREE_EXTENT_PENDING_DEL 1
+#define CTREE_EXTENT_PENDING_DEL 0
 
 static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 {
@@ -23,6 +27,9 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 	struct key key;
 	struct leaf *l;
 	struct extent_item *item;
+	struct key ins;
+
+	find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins);
 	init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
@@ -38,6 +45,8 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
 	release_path(root->extent_root, &path);
+	finish_current_insert(root->extent_root);
+	run_pending(root->extent_root);
 	return 0;
 }
 
@@ -99,6 +108,28 @@ int btrfs_finish_extent_commit(struct ctree_root *root)
 	return 0;
 }
 
+static int finish_current_insert(struct ctree_root *extent_root)
+{
+	struct key ins;
+	struct extent_item extent_item;
+	int i;
+	int ret;
+
+	extent_item.refs = 1;
+	extent_item.owner = extent_root->node->node.header.parentid;
+	ins.offset = 1;
+	ins.flags = 0;
+
+	for (i = 0; i < extent_root->current_insert.flags; i++) {
+		ins.objectid = extent_root->current_insert.objectid + i;
+		ret = insert_item(extent_root, &ins, &extent_item,
+				  sizeof(extent_item));
+		BUG_ON(ret);
+	}
+	extent_root->current_insert.offset = 0;
+	return 0;
+}
+
 /*
  * remove an extent from the root, returns 0 on success
  */
@@ -110,10 +141,13 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	int ret;
 	struct item *item;
 	struct extent_item *ei;
+	struct key ins;
+
 	key.objectid = blocknr;
 	key.flags = 0;
 	key.offset = num_blocks;
 
+	find_free_extent(root, 0, 0, (u64)-1, &ins);
 	init_path(&path);
 	ret = search_slot(extent_root, &key, &path, -1, 1);
 	if (ret) {
@@ -140,53 +174,10 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 			BUG();
 	}
 	release_path(extent_root, &path);
+	finish_current_insert(extent_root);
 	return ret;
 }
 
-/*
- * insert all of the pending extents reserved during the original
- * allocation.  (CTREE_EXTENT_PENDING).  Returns zero if it all worked out
- */
-static int insert_pending_extents(struct ctree_root *extent_root)
-{
-	int ret;
-	struct key key;
-	struct extent_item item;
-	struct tree_buffer *gang[4];
-	int i;
-
-	// FIXME -ENOSPC
-	item.owner = extent_root->node->node.header.parentid;
-	item.refs = 1;
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
-						 (void **)gang, 0,
-						 ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING_ADD);
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			key.objectid = gang[i]->blocknr;
-			key.flags = 0;
-			key.offset = 1;
-			ret = insert_item(extent_root, &key, &item,
-					  sizeof(item));
-			if (ret) {
-				printf("%Lu already in tree\n", key.objectid);
-				print_tree(extent_root, extent_root->node);
-				BUG();
-				// FIXME undo it and return sane
-				return ret;
-			}
-			radix_tree_tag_clear(&extent_root->cache_radix,
-					     gang[i]->blocknr,
-					     CTREE_EXTENT_PENDING_ADD);
-			tree_block_release(extent_root, gang[i]);
-		}
-	}
-	return 0;
-}
-
 /*
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
@@ -218,12 +209,8 @@ static int del_pending_extents(struct ctree_root *extent_root)
 static int run_pending(struct ctree_root *extent_root)
 {
 	while(radix_tree_tagged(&extent_root->cache_radix,
-			        CTREE_EXTENT_PENDING_DEL) ||
-	      radix_tree_tagged(&extent_root->cache_radix,
-				CTREE_EXTENT_PENDING_ADD)) {
-		insert_pending_extents(extent_root);
+			        CTREE_EXTENT_PENDING_DEL))
 		del_pending_extents(extent_root);
-	}
 	return 0;
 }
 
@@ -241,19 +228,8 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 
 	if (root == extent_root) {
 		t = find_tree_block(root, blocknr);
-		if (radix_tree_tag_get(&root->cache_radix, blocknr,
-				      CTREE_EXTENT_PENDING_ADD)) {
-			radix_tree_tag_clear(&root->cache_radix,
-					     blocknr,
-					     CTREE_EXTENT_PENDING_ADD);
-			/* once for us */
-			tree_block_release(root, t);
-			/* once for the pending add */
-			tree_block_release(root, t);
-		} else {
-			radix_tree_tag_set(&root->cache_radix, blocknr,
+		radix_tree_tag_set(&root->cache_radix, blocknr,
 				   CTREE_EXTENT_PENDING_DEL);
-		}
 		return 0;
 	}
 	key.objectid = blocknr;
@@ -281,9 +257,11 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
 	u64 hole_size = 0;
 	int slot = 0;
 	u64 last_block;
+	u64 test_block;
 	int start_found;
 	struct leaf *l;
 	struct ctree_root * root = orig_root->extent_root;
+	int total_needed = num_blocks + MAX_LEVEL * 3;
 
 check_failed:
 	init_path(&path);
@@ -306,22 +284,34 @@ check_failed:
 				goto error;
 			if (!start_found) {
 				ins->objectid = search_start;
-				ins->offset = num_blocks;
+				ins->offset = (u64)-1;
 				start_found = 1;
 				goto check_pending;
 			}
 			ins->objectid = last_block > search_start ?
 					last_block : search_start;
-			ins->offset = num_blocks;
+			ins->offset = (u64)-1;
 			goto check_pending;
 		}
+		if (slot == 0) {
+			int last_slot = l->header.nritems - 1;
+			u64 span = l->items[last_slot].key.objectid;
+			span -= l->items[slot].key.objectid;
+			if (span + total_needed > last_slot - slot) {
+				path.slots[0] = last_slot + 1;
+				key = &l->items[last_slot].key;
+				last_block = key->objectid + key->offset;
+				start_found = 1;
+				continue;
+			}
+		}
 		key = &l->items[slot].key;
 		if (key->objectid >= search_start) {
 			if (start_found) {
 				hole_size = key->objectid - last_block;
-				if (hole_size > num_blocks) {
+				if (hole_size > total_needed) {
 					ins->objectid = last_block;
-					ins->offset = num_blocks;
+					ins->offset = hole_size;
 					goto check_pending;
 				}
 			} else
@@ -337,23 +327,18 @@ check_pending:
 	 */
 	release_path(root, &path);
 	BUG_ON(ins->objectid < search_start);
-	if (1 || orig_root->extent_root == orig_root) {
-		BUG_ON(num_blocks != 1);
-		if ((root->current_insert.objectid <= ins->objectid &&
-		    root->current_insert.objectid +
-		    root->current_insert.offset > ins->objectid) ||
-		   (root->current_insert.objectid > ins->objectid &&
-		    root->current_insert.objectid <= ins->objectid +
-		    ins->offset) ||
-		   radix_tree_lookup(&root->pinned_radix, ins->objectid) ||
-		   radix_tree_tag_get(&root->cache_radix, ins->objectid,
-				      CTREE_EXTENT_PENDING_ADD)) {
-			search_start = ins->objectid + 1;
+	for (test_block = ins->objectid;
+	     test_block < ins->objectid + total_needed; test_block++) {
+		if (radix_tree_lookup(&root->pinned_radix, test_block)) {
+			search_start = test_block + 1;
 			goto check_failed;
 		}
 	}
-	if (ins->offset != 1)
-		BUG();
+	BUG_ON(root->current_insert.offset);
+	root->current_insert.offset = total_needed;
+	root->current_insert.objectid = ins->objectid + num_blocks;
+	root->current_insert.flags = 0;
+	ins->offset = num_blocks;
 	return 0;
 error:
 	release_path(root, &path);
@@ -368,43 +353,41 @@ error:
  * returns 0 if everything worked, non-zero otherwise.
  */
 int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
-			 u64 search_end, u64 owner, struct key *ins,
-			 struct tree_buffer **buf)
+			 u64 search_end, u64 owner, struct key *ins)
 {
 	int ret;
 	int pending_ret;
+	struct ctree_root *extent_root = root->extent_root;
 	struct extent_item extent_item;
+
 	extent_item.refs = 1;
 	extent_item.owner = owner;
 
-	ret = find_free_extent(root, num_blocks, search_start, search_end, ins);
-	if (ret)
-		return ret;
-	if (root != root->extent_root) {
-		memcpy(&root->extent_root->current_insert, ins, sizeof(*ins));
-		ret = insert_item(root->extent_root, ins, &extent_item,
-				  sizeof(extent_item));
-		memset(&root->extent_root->current_insert, 0,
-		       sizeof(struct key));
-		pending_ret = run_pending(root->extent_root);
-		if (ret)
-			return ret;
-		if (pending_ret)
-			return pending_ret;
-		*buf = find_tree_block(root, ins->objectid);
-		dirty_tree_block(root, *buf);
+	if (root == extent_root) {
+		BUG_ON(extent_root->current_insert.offset == 0);
+		BUG_ON(num_blocks != 1);
+		BUG_ON(extent_root->current_insert.flags ==
+		       extent_root->current_insert.offset);
+		ins->offset = 1;
+		ins->objectid = extent_root->current_insert.objectid +
+				extent_root->current_insert.flags++;
 		return 0;
 	}
-	/* we're allocating an extent for the extent tree, don't recurse */
-	BUG_ON(ins->offset != 1);
-	*buf = find_tree_block(root, ins->objectid);
-	BUG_ON(!*buf);
-	radix_tree_tag_set(&root->cache_radix, ins->objectid,
-			   CTREE_EXTENT_PENDING_ADD);
-	(*buf)->count++;
-	dirty_tree_block(root, *buf);
-	return 0;
+	ret = find_free_extent(root, num_blocks, search_start,
+			       search_end, ins);
+	if (ret)
+		return ret;
 
+	ret = insert_item(extent_root, ins, &extent_item,
+			  sizeof(extent_item));
+
+	finish_current_insert(extent_root);
+	pending_ret = run_pending(extent_root);
+	if (ret)
+		return ret;
+	if (pending_ret)
+		return pending_ret;
+	return 0;
 }
 
 /*
@@ -415,19 +398,17 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 {
 	struct key ins;
 	int ret;
-	struct tree_buffer *buf = NULL;
+	struct tree_buffer *buf;
 
 	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
 			   root->node->node.header.parentid,
-			   &ins, &buf);
+			   &ins);
 	if (ret) {
 		BUG();
 		return NULL;
 	}
-	if (root != root->extent_root)
-		BUG_ON(radix_tree_tag_get(&root->extent_root->cache_radix,
-					  buf->blocknr,
-					  CTREE_EXTENT_PENDING_ADD));
+	buf = find_tree_block(root, ins.objectid);
+	dirty_tree_block(root, buf);
 	return buf;
 }
 
-- 
cgit v1.2.3


From 0579da4280812f34f382fb0f8004d7b0219e7a33 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Mar 2007 16:15:30 -0500
Subject: Btrfs: Fixup last found extent caching

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |  2 +-
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 39 +++++++++++++++++++++------------------
 4 files changed, 24 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d92d08dde0f..ae7f4c00c39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,6 +1,6 @@
 
 CC=gcc
-CFLAGS = -Wall
+CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4a7bc4e6e74..518326fa369 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -54,6 +54,7 @@ struct ctree_root {
 	struct tree_buffer *commit_root;
 	struct ctree_root *extent_root;
 	struct key current_insert;
+	struct key last_insert;
 	int fp;
 	struct radix_tree_root cache_radix;
 	struct radix_tree_root pinned_radix;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2fe31c3508c..997cc578a18 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -186,6 +186,7 @@ static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 	root->commit_root = NULL;
 	root->node = read_tree_block(root, info->tree_root);
 	memset(&root->current_insert, 0, sizeof(root->current_insert));
+	memset(&root->last_insert, 0, sizeof(root->last_insert));
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8a2b8aaf9b8..dd11532cb2f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -102,9 +102,12 @@ int btrfs_finish_extent_commit(struct ctree_root *root)
 						 ARRAY_SIZE(gang));
 		if (!ret)
 			break;
-		for (i = 0; i < ret; i++)
+		for (i = 0; i < ret; i++) {
 			radix_tree_delete(&extent_root->pinned_radix, gang[i]);
+		}
 	}
+	extent_root->last_insert.objectid = 0;
+	extent_root->last_insert.offset = 0;
 	return 0;
 }
 
@@ -170,6 +173,9 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 			radix_tree_preload_end();
 		}
 		ret = del_item(extent_root, &path);
+		if (root != extent_root &&
+		    extent_root->last_insert.objectid < blocknr)
+			extent_root->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
@@ -261,8 +267,11 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
 	int start_found;
 	struct leaf *l;
 	struct ctree_root * root = orig_root->extent_root;
-	int total_needed = num_blocks + MAX_LEVEL * 3;
+	int total_needed = num_blocks;
 
+	total_needed += (node_level(root->node->node.header.flags) + 1) * 3;
+	if (root->last_insert.objectid > search_start)
+		search_start = root->last_insert.objectid;
 check_failed:
 	init_path(&path);
 	ins->objectid = search_start;
@@ -273,6 +282,9 @@ check_failed:
 	if (ret < 0)
 		goto error;
 
+	if (path.slots[0] > 0)
+		path.slots[0]--;
+
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
@@ -293,31 +305,21 @@ check_failed:
 			ins->offset = (u64)-1;
 			goto check_pending;
 		}
-		if (slot == 0) {
-			int last_slot = l->header.nritems - 1;
-			u64 span = l->items[last_slot].key.objectid;
-			span -= l->items[slot].key.objectid;
-			if (span + total_needed > last_slot - slot) {
-				path.slots[0] = last_slot + 1;
-				key = &l->items[last_slot].key;
-				last_block = key->objectid + key->offset;
-				start_found = 1;
-				continue;
-			}
-		}
 		key = &l->items[slot].key;
 		if (key->objectid >= search_start) {
 			if (start_found) {
+				if (last_block < search_start)
+					last_block = search_start;
 				hole_size = key->objectid - last_block;
 				if (hole_size > total_needed) {
 					ins->objectid = last_block;
 					ins->offset = hole_size;
 					goto check_pending;
 				}
-			} else
-				start_found = 1;
-			last_block = key->objectid + key->offset;
+			}
 		}
+		start_found = 1;
+		last_block = key->objectid + key->offset;
 		path.slots[0]++;
 	}
 	// FIXME -ENOSPC
@@ -335,9 +337,10 @@ check_pending:
 		}
 	}
 	BUG_ON(root->current_insert.offset);
-	root->current_insert.offset = total_needed;
+	root->current_insert.offset = total_needed - num_blocks;
 	root->current_insert.objectid = ins->objectid + num_blocks;
 	root->current_insert.flags = 0;
+	root->last_insert.objectid = ins->objectid;
 	ins->offset = num_blocks;
 	return 0;
 error:
-- 
cgit v1.2.3


From 20524f02260910db1e67bd5335d3854e5e555efc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 10 Mar 2007 06:35:47 -0500
Subject: Btrfs: recursion-free first pass

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/quick-test.c  |  5 ++-
 2 files changed, 100 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dd11532cb2f..6fbaece43ff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -50,7 +50,7 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 	return 0;
 }
 
-static int lookup_block_ref(struct ctree_root *root, u64 blocknr, int *refs)
+static int lookup_block_ref(struct ctree_root *root, u64 blocknr, u32 *refs)
 {
 	struct ctree_path path;
 	int ret;
@@ -415,6 +415,100 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 	return buf;
 }
 
+int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
+{
+	struct tree_buffer *next;
+	struct tree_buffer *cur;
+	u64 blocknr;
+	int ret;
+	u32 refs;
+
+	ret = lookup_block_ref(root, path->nodes[*level]->blocknr, &refs);
+	BUG_ON(ret);
+	if (refs > 1)
+		goto out;
+	while(*level > 0) {
+		cur = path->nodes[*level];
+		if (path->slots[*level] >= cur->node.header.nritems)
+			break;
+		blocknr = cur->node.blockptrs[path->slots[*level]];
+		ret = lookup_block_ref(root, blocknr, &refs);
+		if (refs != 1 || *level == 1) {
+			path->slots[*level]++;
+			ret = free_extent(root, blocknr, 1);
+			BUG_ON(ret);
+			continue;
+		}
+		BUG_ON(ret);
+		next = read_tree_block(root, blocknr);
+		if (path->nodes[*level-1]) {
+			tree_block_release(root, path->nodes[*level-1]);
+		}
+		path->nodes[*level-1] = next;
+		*level = node_level(next->node.header.flags);
+		path->slots[*level] = 0;
+	}
+out:
+	ret = free_extent(root, path->nodes[*level]->blocknr, 1);
+	path->nodes[*level] = NULL;
+	*level += 1;
+	BUG_ON(ret);
+	return 0;
+}
+
+int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
+{
+	int i;
+	int slot;
+	int ret;
+	for(i = *level; i < MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < path->nodes[i]->node.header.nritems - 1) {
+			path->slots[i]++;
+			*level = i;
+			return 0;
+		} else {
+			ret = free_extent(root,
+					  path->nodes[*level]->blocknr, 1);
+			*level = i + 1;
+			BUG_ON(ret);
+		}
+	}
+	return 1;
+}
+
+int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
+{
+	int ret;
+	int level;
+	struct ctree_path path;
+	int i;
+	int orig_level;
+
+	init_path(&path);
+
+	level = node_level(snap->node.header.flags);
+	orig_level = level;
+	path.nodes[level] = snap;
+	path.slots[level] = 0;
+	while(1) {
+		ret = walk_down_tree(root, &path, &level);
+		if (ret > 0)
+			break;
+		ret = walk_up_tree(root, &path, &level);
+		if (ret > 0)
+			break;
+	}
+	for (i = 0; i < orig_level; i++) {
+		if (path.nodes[i])
+			tree_block_release(root, path.nodes[i]);
+	}
+
+	return 0;
+}
+
+
+#if 0
 int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 {
 	int ret;
@@ -441,4 +535,4 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 	BUG_ON(ret);
 	return 0;
 }
-
+#endif
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index 6400c7100a6..ccca9b2c14a 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -9,7 +9,7 @@
 /* for testing only */
 int next_key(int i, int max_key) {
 	return rand() % max_key;
-	//return i;
+	// return i;
 }
 
 int main(int ac, char **av) {
@@ -44,6 +44,9 @@ int main(int ac, char **av) {
 		if (!ret)
 			tree_size++;
 		free(buf);
+		if (i == run_size - 5) {
+			commit_transaction(root, &super);
+		}
 
 	}
 	close_ctree(root, &super);
-- 
cgit v1.2.3


From 83e15a28e046dbb4534dd263d1d3dc867a8994a2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 Mar 2007 09:03:27 -0400
Subject: fix leak in btrfs_drop_snapshot

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c | 11 +++++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 997cc578a18..c9201ec81fe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -167,7 +167,7 @@ int commit_transaction(struct ctree_root *root, struct ctree_super_block *s)
 		root->node->count++;
 		ret = btrfs_drop_snapshot(root, snap);
 		BUG_ON(ret);
-		tree_block_release(root, snap);
+		// tree_block_release(root, snap);
 	}
         write_ctree_super(root, s);
 	btrfs_finish_extent_commit(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6fbaece43ff..e7b8a70b5e5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -441,15 +441,15 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		}
 		BUG_ON(ret);
 		next = read_tree_block(root, blocknr);
-		if (path->nodes[*level-1]) {
+		if (path->nodes[*level-1])
 			tree_block_release(root, path->nodes[*level-1]);
-		}
 		path->nodes[*level-1] = next;
 		*level = node_level(next->node.header.flags);
 		path->slots[*level] = 0;
 	}
 out:
 	ret = free_extent(root, path->nodes[*level]->blocknr, 1);
+	tree_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
@@ -470,6 +470,8 @@ int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		} else {
 			ret = free_extent(root,
 					  path->nodes[*level]->blocknr, 1);
+			tree_block_release(root, path->nodes[*level]);
+			path->nodes[*level] = NULL;
 			*level = i + 1;
 			BUG_ON(ret);
 		}
@@ -499,9 +501,10 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 		if (ret > 0)
 			break;
 	}
-	for (i = 0; i < orig_level; i++) {
-		if (path.nodes[i])
+	for (i = 0; i <= orig_level; i++) {
+		if (path.nodes[i]) {
 			tree_block_release(root, path.nodes[i]);
+		}
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 7518a238ea0152dc849d1ed76d3cae8b44e12f46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 Mar 2007 12:01:18 -0400
Subject: Btrfs: get/set for struct header fields

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/TODO          |   1 +
 fs/btrfs/ctree.c       | 224 +++++++++++++++++++++++++++----------------------
 fs/btrfs/ctree.h       |  63 +++++++++++++-
 fs/btrfs/disk-io.c     |   7 +-
 fs/btrfs/extent-tree.c |  53 +++---------
 fs/btrfs/mkfs.c        |  10 +--
 fs/btrfs/print-tree.c  |  26 +++---
 fs/btrfs/quick-test.c  |  12 +--
 fs/btrfs/random-test.c |   4 +-
 10 files changed, 230 insertions(+), 172 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ae7f4c00c39..f8532200e9a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -25,7 +25,7 @@ tester: $(objects) random-test.o
 quick-test: $(objects) quick-test.o
 	gcc $(CFLAGS) -o quick-test $(objects) quick-test.o
 
-$(objects) : $(headers)
+$(objects): $(headers)
 
 clean :
 	rm debug-tree tester *.o
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 5c4395c3784..52d2a7a071f 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,4 +1,5 @@
 * cleanup, add more error checking, get rid of BUG_ONs
+* endian fixes
 * Make IO functions look more like the page cache
 * Fix ENOSPC handling
 * make blocksize a mkfs parameter instead of #define
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 729d4ddb374..e43c827e0df 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -48,7 +48,7 @@ int btrfs_cow_block(struct ctree_root *root,
 	}
 	cow = alloc_free_block(root);
 	memcpy(&cow->node, &buf->node, sizeof(buf->node));
-	cow->node.header.blocknr = cow->blocknr;
+	btrfs_set_header_blocknr(&cow->node.header, cow->blocknr);
 	*cow_ret = cow;
 	btrfs_inc_ref(root, buf);
 	if (buf == root->node) {
@@ -73,7 +73,7 @@ int btrfs_cow_block(struct ctree_root *root,
  */
 static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
-	unsigned int nr = leaf->header.nritems;
+	u32 nr = btrfs_header_nritems(&leaf->header);
 	if (nr == 0)
 		return sizeof(leaf->data);
 	return leaf->items[nr-1].offset;
@@ -87,7 +87,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
 int leaf_free_space(struct leaf *leaf)
 {
 	int data_end = leaf_data_end(leaf);
-	int nritems = leaf->header.nritems;
+	int nritems = btrfs_header_nritems(&leaf->header);
 	char *items_end = (char *)(leaf->items + nritems + 1);
 	return (char *)(leaf->data + data_end) - (char *)items_end;
 }
@@ -118,18 +118,21 @@ int check_node(struct ctree_path *path, int level)
 	struct node *parent = NULL;
 	struct node *node = &path->nodes[level]->node;
 	int parent_slot;
+	u32 nritems = btrfs_header_nritems(&node->header);
 
 	if (path->nodes[level + 1])
 		parent = &path->nodes[level + 1]->node;
 	parent_slot = path->slots[level + 1];
-	if (parent && node->header.nritems > 0) {
+	BUG_ON(nritems == 0);
+	if (parent) {
 		struct key *parent_key;
 		parent_key = &parent->keys[parent_slot];
 		BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key)));
-		BUG_ON(parent->blockptrs[parent_slot] != node->header.blocknr);
+		BUG_ON(parent->blockptrs[parent_slot] !=
+		       btrfs_header_blocknr(&node->header));
 	}
-	BUG_ON(node->header.nritems > NODEPTRS_PER_BLOCK);
-	for (i = 0; i < node->header.nritems - 2; i++) {
+	BUG_ON(nritems > NODEPTRS_PER_BLOCK);
+	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
 		BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0);
 	}
 	return 0;
@@ -141,18 +144,25 @@ int check_leaf(struct ctree_path *path, int level)
 	struct leaf *leaf = &path->nodes[level]->leaf;
 	struct node *parent = NULL;
 	int parent_slot;
+	u32 nritems = btrfs_header_nritems(&leaf->header);
 
 	if (path->nodes[level + 1])
 		parent = &path->nodes[level + 1]->node;
 	parent_slot = path->slots[level + 1];
-	if (parent && leaf->header.nritems > 0) {
+	BUG_ON(leaf_free_space(leaf) < 0);
+
+	if (nritems == 0)
+		return 0;
+
+	if (parent) {
 		struct key *parent_key;
 		parent_key = &parent->keys[parent_slot];
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct key)));
-		BUG_ON(parent->blockptrs[parent_slot] != leaf->header.blocknr);
+		BUG_ON(parent->blockptrs[parent_slot] !=
+		       btrfs_header_blocknr(&leaf->header));
 	}
-	for (i = 0; i < leaf->header.nritems - 2; i++) {
+	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
 		BUG_ON(comp_keys(&leaf->items[i].key,
 		                 &leaf->items[i+1].key) >= 0);
 		BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset +
@@ -162,7 +172,6 @@ int check_leaf(struct ctree_path *path, int level)
 				LEAF_DATA_SIZE);
 		}
 	}
-	BUG_ON(leaf_free_space(leaf) < 0);
 	return 0;
 }
 
@@ -215,13 +224,15 @@ int generic_bin_search(char *p, int item_size, struct key *key,
  */
 int bin_search(struct node *c, struct key *key, int *slot)
 {
-	if (is_leaf(c->header.flags)) {
+	if (btrfs_is_leaf(c)) {
 		struct leaf *l = (struct leaf *)c;
 		return generic_bin_search((void *)l->items, sizeof(struct item),
-					  key, c->header.nritems, slot);
+					  key, btrfs_header_nritems(&c->header),
+					  slot);
 	} else {
 		return generic_bin_search((void *)c->keys, sizeof(struct key),
-					  key, c->header.nritems, slot);
+					  key, btrfs_header_nritems(&c->header),
+					  slot);
 	}
 	return -1;
 }
@@ -233,7 +244,7 @@ struct tree_buffer *read_node_slot(struct ctree_root *root,
 	struct node *node = &parent_buf->node;
 	if (slot < 0)
 		return NULL;
-	if (slot >= node->header.nritems)
+	if (slot >= btrfs_header_nritems(&node->header))
 		return NULL;
 	return read_tree_block(root, node->blockptrs[slot]);
 }
@@ -270,7 +281,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		struct tree_buffer *child;
 		u64 blocknr = mid_buf->blocknr;
 
-		if (mid->header.nritems != 1)
+		if (btrfs_header_nritems(&mid->header) != 1)
 			return 0;
 
 		/* promote the child to a root */
@@ -287,7 +298,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	}
 	parent = &parent_buf->node;
 
-	if (mid->header.nritems > NODEPTRS_PER_BLOCK / 4)
+	if (btrfs_header_nritems(&mid->header) > NODEPTRS_PER_BLOCK / 4)
 		return 0;
 
 	left_buf = read_node_slot(root, parent_buf, pslot - 1);
@@ -298,7 +309,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		btrfs_cow_block(root, left_buf, parent_buf,
 				pslot - 1, &left_buf);
 		left = &left_buf->node;
-		orig_slot += left->header.nritems;
+		orig_slot += btrfs_header_nritems(&left->header);
 		wret = push_node_left(root, left_buf, mid_buf);
 		if (wret < 0)
 			ret = wret;
@@ -314,7 +325,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		wret = push_node_left(root, mid_buf, right_buf);
 		if (wret < 0)
 			ret = wret;
-		if (right->header.nritems == 0) {
+		if (btrfs_header_nritems(&right->header) == 0) {
 			u64 blocknr = right_buf->blocknr;
 			tree_block_release(root, right_buf);
 			clean_tree_block(root, right_buf);
@@ -332,7 +343,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			BUG_ON(list_empty(&parent_buf->dirty));
 		}
 	}
-	if (mid->header.nritems == 1) {
+	if (btrfs_header_nritems(&mid->header) == 1) {
 		/*
 		 * we're not allowed to leave a node with one item in the
 		 * tree during a delete.  A deletion from lower in the tree
@@ -348,7 +359,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			ret = wret;
 		BUG_ON(wret == 1);
 	}
-	if (mid->header.nritems == 0) {
+	if (btrfs_header_nritems(&mid->header) == 0) {
 		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->blocknr;
 		tree_block_release(root, mid_buf);
@@ -369,7 +380,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 
 	/* update the path */
 	if (left_buf) {
-		if (left->header.nritems > orig_slot) {
+		if (btrfs_header_nritems(&left->header) > orig_slot) {
 			left_buf->count++; // released below
 			path->nodes[level] = left_buf;
 			path->slots[level + 1] -= 1;
@@ -377,7 +388,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			if (mid_buf)
 				tree_block_release(root, mid_buf);
 		} else {
-			orig_slot -= left->header.nritems;
+			orig_slot -= btrfs_header_nritems(&left->header);
 			path->slots[level] = orig_slot;
 		}
 	}
@@ -420,7 +431,7 @@ again:
 	b = root->node;
 	b->count++;
 	while (b) {
-		level = node_level(b->node.header.flags);
+		level = btrfs_header_level(&b->node.header);
 		if (cow) {
 			int wret;
 			wret = btrfs_cow_block(root, b, p->nodes[level + 1],
@@ -434,12 +445,12 @@ again:
 		if (ret)
 			return -1;
 		ret = bin_search(c, key, &slot);
-		if (!is_leaf(c->header.flags)) {
+		if (!btrfs_is_leaf(c)) {
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if (ins_len > 0 &&
-			    c->header.nritems == NODEPTRS_PER_BLOCK) {
+			if (ins_len > 0 && btrfs_header_nritems(&c->header) ==
+			    NODEPTRS_PER_BLOCK) {
 				int sret = split_node(root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -456,7 +467,7 @@ again:
 					goto again;
 				c = &b->node;
 				slot = p->slots[level];
-				BUG_ON(c->header.nritems == 1);
+				BUG_ON(btrfs_header_nritems(&c->header) == 1);
 			}
 			b = read_tree_block(root, c->blockptrs[slot]);
 		} else {
@@ -524,8 +535,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 	int dst_nritems;
 	int ret = 0;
 
-	src_nritems = src->header.nritems;
-	dst_nritems = dst->header.nritems;
+	src_nritems = btrfs_header_nritems(&src->header);
+	dst_nritems = btrfs_header_nritems(&dst->header);
 	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
 	if (push_items <= 0) {
 		return 1;
@@ -544,9 +555,8 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 		memmove(src->blockptrs, src->blockptrs + push_items,
 			(src_nritems - push_items) * sizeof(u64));
 	}
-	src->header.nritems -= push_items;
-	dst->header.nritems += push_items;
-
+	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
+	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
 	BUG_ON(list_empty(&src_buf->dirty));
 	BUG_ON(list_empty(&dst_buf->dirty));
 	return ret;
@@ -573,8 +583,8 @@ static int balance_node_right(struct ctree_root *root,
 	int dst_nritems;
 	int ret = 0;
 
-	src_nritems = src->header.nritems;
-	dst_nritems = dst->header.nritems;
+	src_nritems = btrfs_header_nritems(&src->header);
+	dst_nritems = btrfs_header_nritems(&dst->header);
 	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
 	if (push_items <= 0) {
 		return 1;
@@ -596,8 +606,8 @@ static int balance_node_right(struct ctree_root *root,
 	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
 		push_items * sizeof(u64));
 
-	src->header.nritems -= push_items;
-	dst->header.nritems += push_items;
+	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
+	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
 
 	BUG_ON(list_empty(&src_buf->dirty));
 	BUG_ON(list_empty(&dst_buf->dirty));
@@ -625,12 +635,13 @@ static int insert_new_root(struct ctree_root *root,
 	t = alloc_free_block(root);
 	c = &t->node;
 	memset(c, 0, sizeof(c));
-	c->header.nritems = 1;
-	c->header.flags = node_level(level);
-	c->header.blocknr = t->blocknr;
-	c->header.parentid = root->node->node.header.parentid;
+	btrfs_set_header_nritems(&c->header, 1);
+	btrfs_set_header_level(&c->header, level);
+	btrfs_set_header_blocknr(&c->header, t->blocknr);
+	btrfs_set_header_parentid(&c->header,
+	                       btrfs_header_parentid(&root->node->node.header));
 	lower = &path->nodes[level-1]->node;
-	if (is_leaf(lower->header.flags))
+	if (btrfs_is_leaf(lower))
 		lower_key = &((struct leaf *)lower)->items[0].key;
 	else
 		lower_key = lower->keys;
@@ -663,7 +674,7 @@ static int insert_ptr(struct ctree_root *root,
 
 	BUG_ON(!path->nodes[level]);
 	lower = &path->nodes[level]->node;
-	nritems = lower->header.nritems;
+	nritems = btrfs_header_nritems(&lower->header);
 	if (slot > nritems)
 		BUG();
 	if (nritems == NODEPTRS_PER_BLOCK)
@@ -676,7 +687,7 @@ static int insert_ptr(struct ctree_root *root,
 	}
 	memcpy(lower->keys + slot, key, sizeof(struct key));
 	lower->blockptrs[slot] = blocknr;
-	lower->header.nritems++;
+	btrfs_set_header_nritems(&lower->header, nritems + 1);
 	if (lower->keys[1].objectid == 0)
 			BUG();
 	BUG_ON(list_empty(&path->nodes[level]->dirty));
@@ -702,6 +713,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 	int mid;
 	int ret;
 	int wret;
+	u32 c_nritems;
 
 	t = path->nodes[level];
 	c = &t->node;
@@ -711,18 +723,20 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 		if (ret)
 			return ret;
 	}
+	c_nritems = btrfs_header_nritems(&c->header);
 	split_buffer = alloc_free_block(root);
 	split = &split_buffer->node;
-	split->header.flags = c->header.flags;
-	split->header.blocknr = split_buffer->blocknr;
-	split->header.parentid = root->node->node.header.parentid;
-	mid = (c->header.nritems + 1) / 2;
+	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
+	btrfs_set_header_blocknr(&split->header, split_buffer->blocknr);
+	btrfs_set_header_parentid(&split->header,
+	                       btrfs_header_parentid(&root->node->node.header));
+	mid = (c_nritems + 1) / 2;
 	memcpy(split->keys, c->keys + mid,
-		(c->header.nritems - mid) * sizeof(struct key));
+		(c_nritems - mid) * sizeof(struct key));
 	memcpy(split->blockptrs, c->blockptrs + mid,
-		(c->header.nritems - mid) * sizeof(u64));
-	split->header.nritems = c->header.nritems - mid;
-	c->header.nritems = mid;
+		(c_nritems - mid) * sizeof(u64));
+	btrfs_set_header_nritems(&split->header, c_nritems - mid);
+	btrfs_set_header_nritems(&c->header, mid);
 	ret = 0;
 
 	BUG_ON(list_empty(&t->dirty));
@@ -781,13 +795,15 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	int push_space = 0;
 	int push_items = 0;
 	struct item *item;
+	u32 left_nritems;
+	u32 right_nritems;
 
 	slot = path->slots[1];
 	if (!path->nodes[1]) {
 		return 1;
 	}
 	upper = path->nodes[1];
-	if (slot >= upper->node.header.nritems - 1) {
+	if (slot >= btrfs_header_nritems(&upper->node.header) - 1) {
 		return 1;
 	}
 	right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]);
@@ -806,7 +822,8 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		return 1;
 	}
 
-	for (i = left->header.nritems - 1; i >= 0; i--) {
+	left_nritems = btrfs_header_nritems(&left->header);
+	for (i = left_nritems - 1; i >= 0; i--) {
 		item = left->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
@@ -819,9 +836,10 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		tree_block_release(root, right_buf);
 		return 1;
 	}
+	right_nritems = btrfs_header_nritems(&right->header);
 	/* push left to right */
-	push_space = left->items[left->header.nritems - push_items].offset +
-		     left->items[left->header.nritems - push_items].size;
+	push_space = left->items[left_nritems - push_items].offset +
+		     left->items[left_nritems - push_items].size;
 	push_space -= leaf_data_end(left);
 	/* make room in the right data area */
 	memmove(right->data + leaf_data_end(right) - push_space,
@@ -832,19 +850,21 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		left->data + leaf_data_end(left),
 		push_space);
 	memmove(right->items + push_items, right->items,
-		right->header.nritems * sizeof(struct item));
+		right_nritems * sizeof(struct item));
 	/* copy the items from left to right */
-	memcpy(right->items, left->items + left->header.nritems - push_items,
+	memcpy(right->items, left->items + left_nritems - push_items,
 		push_items * sizeof(struct item));
 
 	/* update the item pointers */
-	right->header.nritems += push_items;
+	right_nritems += push_items;
+	btrfs_set_header_nritems(&right->header, right_nritems);
 	push_space = LEAF_DATA_SIZE;
-	for (i = 0; i < right->header.nritems; i++) {
+	for (i = 0; i < right_nritems; i++) {
 		right->items[i].offset = push_space - right->items[i].size;
 		push_space = right->items[i].offset;
 	}
-	left->header.nritems -= push_items;
+	left_nritems -= push_items;
+	btrfs_set_header_nritems(&left->header, left_nritems);
 
 	BUG_ON(list_empty(&left_buf->dirty));
 	BUG_ON(list_empty(&right_buf->dirty));
@@ -853,8 +873,8 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	BUG_ON(list_empty(&upper->dirty));
 
 	/* then fixup the leaf pointer in the path */
-	if (path->slots[0] >= left->header.nritems) {
-		path->slots[0] -= left->header.nritems;
+	if (path->slots[0] >= left_nritems) {
+		path->slots[0] -= left_nritems;
 		tree_block_release(root, path->nodes[0]);
 		path->nodes[0] = right_buf;
 		path->slots[1] += 1;
@@ -880,7 +900,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	int push_space = 0;
 	int push_items = 0;
 	struct item *item;
-	int old_left_nritems;
+	u32 old_left_nritems;
 	int ret = 0;
 	int wret;
 
@@ -908,7 +928,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		return 1;
 	}
 
-	for (i = 0; i < right->header.nritems; i++) {
+	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
 		item = right->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
@@ -922,31 +942,34 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		return 1;
 	}
 	/* push data from right to left */
-	memcpy(left->items + left->header.nritems,
+	memcpy(left->items + btrfs_header_nritems(&left->header),
 		right->items, push_items * sizeof(struct item));
 	push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
 	memcpy(left->data + leaf_data_end(left) - push_space,
 		right->data + right->items[push_items - 1].offset,
 		push_space);
-	old_left_nritems = left->header.nritems;
+	old_left_nritems = btrfs_header_nritems(&left->header);
 	BUG_ON(old_left_nritems < 0);
 
 	for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		left->items[i].offset -= LEAF_DATA_SIZE -
 			left->items[old_left_nritems -1].offset;
 	}
-	left->header.nritems += push_items;
+	btrfs_set_header_nritems(&left->header, old_left_nritems + push_items);
 
 	/* fixup right node */
 	push_space = right->items[push_items-1].offset - leaf_data_end(right);
 	memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
 		leaf_data_end(right), push_space);
 	memmove(right->items, right->items + push_items,
-		(right->header.nritems - push_items) * sizeof(struct item));
-	right->header.nritems -= push_items;
+		(btrfs_header_nritems(&right->header) - push_items) *
+		sizeof(struct item));
+	btrfs_set_header_nritems(&right->header,
+				 btrfs_header_nritems(&right->header) -
+				 push_items);
 	push_space = LEAF_DATA_SIZE;
 
-	for (i = 0; i < right->header.nritems; i++) {
+	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
 		right->items[i].offset = push_space - right->items[i].size;
 		push_space = right->items[i].offset;
 	}
@@ -983,7 +1006,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 {
 	struct tree_buffer *l_buf;
 	struct leaf *l;
-	int nritems;
+	u32 nritems;
 	int mid;
 	int slot;
 	struct leaf *right;
@@ -1008,7 +1031,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 			return ret;
 	}
 	slot = path->slots[0];
-	nritems = l->header.nritems;
+	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
 	right_buffer = alloc_free_block(root);
 	BUG_ON(!right_buffer);
@@ -1026,10 +1049,11 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 			LEAF_DATA_SIZE)
 			BUG();
 	}
-	right->header.nritems = nritems - mid;
-	right->header.blocknr = right_buffer->blocknr;
-	right->header.flags = node_level(0);
-	right->header.parentid = root->node->node.header.parentid;
+	btrfs_set_header_nritems(&right->header, nritems - mid);
+	btrfs_set_header_blocknr(&right->header, right_buffer->blocknr);
+	btrfs_set_header_level(&right->header, 0);
+	btrfs_set_header_parentid(&right->header,
+	                       btrfs_header_parentid(&root->node->node.header));
 	data_copy_size = l->items[mid].offset + l->items[mid].size -
 			 leaf_data_end(l);
 	memcpy(right->items, l->items + mid,
@@ -1039,10 +1063,10 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	rt_data_off = LEAF_DATA_SIZE -
 		     (l->items[mid].offset + l->items[mid].size);
 
-	for (i = 0; i < right->header.nritems; i++)
+	for (i = 0; i < btrfs_header_nritems(&right->header); i++)
 		right->items[i].offset += rt_data_off;
 
-	l->header.nritems = mid;
+	btrfs_set_header_nritems(&l->header, mid);
 	ret = 0;
 	wret = insert_ptr(root, path, &right->items[0].key,
 			  right_buffer->blocknr, path->slots[1] + 1, 1);
@@ -1074,7 +1098,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	int slot_orig;
 	struct leaf *leaf;
 	struct tree_buffer *leaf_buf;
-	unsigned int nritems;
+	u32 nritems;
 	unsigned int data_end;
 	struct ctree_path path;
 
@@ -1094,7 +1118,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	leaf_buf = path.nodes[0];
 	leaf = &leaf_buf->leaf;
 
-	nritems = leaf->header.nritems;
+	nritems = btrfs_header_nritems(&leaf->header);
 	data_end = leaf_data_end(leaf);
 
 	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
@@ -1128,7 +1152,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 	leaf->items[slot].offset = data_end - data_size;
 	leaf->items[slot].size = data_size;
 	memcpy(leaf->data + data_end - data_size, data, data_size);
-	leaf->header.nritems += 1;
+	btrfs_set_header_nritems(&leaf->header, nritems + 1);
 
 	ret = 0;
 	if (slot == 0)
@@ -1155,12 +1179,12 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 {
 	struct node *node;
 	struct tree_buffer *parent = path->nodes[level];
-	int nritems;
+	u32 nritems;
 	int ret = 0;
 	int wret;
 
 	node = &parent->node;
-	nritems = node->header.nritems;
+	nritems = btrfs_header_nritems(&node->header);
 	if (slot != nritems -1) {
 		memmove(node->keys + slot, node->keys + slot + 1,
 			sizeof(struct key) * (nritems - slot - 1));
@@ -1168,11 +1192,12 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 			node->blockptrs + slot + 1,
 			sizeof(u64) * (nritems - slot - 1));
 	}
-	node->header.nritems--;
-	if (node->header.nritems == 0 && parent == root->node) {
-		BUG_ON(node_level(root->node->node.header.flags) != 1);
+	nritems--;
+	btrfs_set_header_nritems(&node->header, nritems);
+	if (nritems == 0 && parent == root->node) {
+		BUG_ON(btrfs_header_level(&root->node->node.header) != 1);
 		/* just turn the root into a leaf and break */
-		root->node->node.header.flags = node_level(0);
+		btrfs_set_header_level(&root->node->node.header, 0);
 	} else if (slot == 0) {
 		wret = fixup_low_keys(root, path, node->keys, level + 1);
 		if (wret)
@@ -1195,30 +1220,33 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	int dsize;
 	int ret = 0;
 	int wret;
+	u32 nritems;
 
 	leaf_buf = path->nodes[0];
 	leaf = &leaf_buf->leaf;
 	slot = path->slots[0];
 	doff = leaf->items[slot].offset;
 	dsize = leaf->items[slot].size;
+	nritems = btrfs_header_nritems(&leaf->header);
 
-	if (slot != leaf->header.nritems - 1) {
+	if (slot != nritems - 1) {
 		int i;
 		int data_end = leaf_data_end(leaf);
 		memmove(leaf->data + data_end + dsize,
 			leaf->data + data_end,
 			doff - data_end);
-		for (i = slot + 1; i < leaf->header.nritems; i++)
+		for (i = slot + 1; i < nritems; i++)
 			leaf->items[i].offset += dsize;
 		memmove(leaf->items + slot, leaf->items + slot + 1,
 			sizeof(struct item) *
-			(leaf->header.nritems - slot - 1));
+			(nritems - slot - 1));
 	}
-	leaf->header.nritems -= 1;
+	btrfs_set_header_nritems(&leaf->header, nritems - 1);
+	nritems--;
 	/* delete the leaf if we've emptied it */
-	if (leaf->header.nritems == 0) {
+	if (nritems == 0) {
 		if (leaf_buf == root->node) {
-			leaf->header.flags = node_level(0);
+			btrfs_set_header_level(&leaf->header, 0);
 			BUG_ON(list_empty(&leaf_buf->dirty));
 		} else {
 			clean_tree_block(root, leaf_buf);
@@ -1230,7 +1258,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 				ret = wret;
 		}
 	} else {
-		int used = leaf_space_used(leaf, 0, leaf->header.nritems);
+		int used = leaf_space_used(leaf, 0, nritems);
 		if (slot == 0) {
 			wret = fixup_low_keys(root, path,
 						   &leaf->items[0].key, 1);
@@ -1251,12 +1279,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			if (wret < 0)
 				ret = wret;
 			if (path->nodes[0] == leaf_buf &&
-			    leaf->header.nritems) {
+			    btrfs_header_nritems(&leaf->header)) {
 				wret = push_leaf_right(root, path, 1);
 				if (wret < 0)
 					ret = wret;
 			}
-			if (leaf->header.nritems == 0) {
+			if (btrfs_header_nritems(&leaf->header) == 0) {
 				u64 blocknr = leaf_buf->blocknr;
 				clean_tree_block(root, leaf_buf);
 				wret = del_ptr(root, path, 1, slot);
@@ -1292,7 +1320,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 			return 1;
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
-		if (slot >= c->node.header.nritems) {
+		if (slot >= btrfs_header_nritems(&c->node.header)) {
 			level++;
 			continue;
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 518326fa369..60c21fe1566 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,13 +35,10 @@ struct header {
 	/* generation flags to be added */
 } __attribute__ ((__packed__));
 
+#define MAX_LEVEL 8
 #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
 			    (sizeof(struct key) + sizeof(u64)))
 
-#define MAX_LEVEL 8
-#define node_level(f) ((f) & (MAX_LEVEL-1))
-#define is_leaf(f) (node_level(f) == 0)
-
 struct tree_buffer;
 
 /*
@@ -144,6 +141,64 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
+static inline u64 btrfs_header_blocknr(struct header *h)
+{
+	return h->blocknr;
+}
+
+static inline void btrfs_set_header_blocknr(struct header *h, u64 blocknr)
+{
+	h->blocknr = blocknr;
+}
+
+static inline u64 btrfs_header_parentid(struct header *h)
+{
+	return h->parentid;
+}
+
+static inline void btrfs_set_header_parentid(struct header *h, u64 parentid)
+{
+	h->parentid = parentid;
+}
+
+static inline u32 btrfs_header_nritems(struct header *h)
+{
+	return h->nritems;
+}
+
+static inline void btrfs_set_header_nritems(struct header *h, u32 val)
+{
+	h->nritems = val;
+}
+
+static inline u32 btrfs_header_flags(struct header *h)
+{
+	return h->flags;
+}
+
+static inline void btrfs_set_header_flags(struct header *h, u32 val)
+{
+	h->flags = val;
+}
+
+static inline int btrfs_header_level(struct header *h)
+{
+	return btrfs_header_flags(h) & (MAX_LEVEL - 1);
+}
+
+static inline void btrfs_set_header_level(struct header *h, int level)
+{
+	u32 flags;
+	BUG_ON(level > MAX_LEVEL);
+	flags = btrfs_header_flags(h) & ~(MAX_LEVEL - 1);
+	btrfs_set_header_flags(h, flags | level);
+}
+
+static inline int btrfs_is_leaf(struct node *n)
+{
+	return (btrfs_header_level(&n->header) == 0);
+}
+
 struct tree_buffer *alloc_free_block(struct ctree_root *root);
 int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf);
 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c9201ec81fe..065e888d2c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -15,9 +15,10 @@ int cache_max = 10000;
 
 static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 {
-	if (buf->blocknr != buf->node.header.blocknr)
+	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
-	if (root->node && buf->node.header.parentid != root->node->node.header.parentid)
+	if (root->node && btrfs_header_parentid(&buf->node.header) !=
+	    btrfs_header_parentid(&root->node->node.header))
 		BUG();
 	return 0;
 }
@@ -129,7 +130,7 @@ int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	loff_t offset = blocknr * CTREE_BLOCKSIZE;
 	int ret;
 
-	if (buf->blocknr != buf->node.header.blocknr)
+	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
 	ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
 	if (ret != CTREE_BLOCKSIZE)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e7b8a70b5e5..524c11fea24 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,10 +79,10 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
 
 	if (root == root->extent_root)
 		return 0;
-	if (is_leaf(buf->node.header.flags))
+	if (btrfs_is_leaf(&buf->node))
 		return 0;
 
-	for (i = 0; i < buf->node.header.nritems; i++) {
+	for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) {
 		blocknr = buf->node.blockptrs[i];
 		inc_block_ref(root, blocknr);
 	}
@@ -119,7 +119,8 @@ static int finish_current_insert(struct ctree_root *extent_root)
 	int ret;
 
 	extent_item.refs = 1;
-	extent_item.owner = extent_root->node->node.header.parentid;
+	extent_item.owner =
+		btrfs_header_parentid(&extent_root->node->node.header);
 	ins.offset = 1;
 	ins.flags = 0;
 
@@ -269,7 +270,7 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
 	struct ctree_root * root = orig_root->extent_root;
 	int total_needed = num_blocks;
 
-	total_needed += (node_level(root->node->node.header.flags) + 1) * 3;
+	total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3;
 	if (root->last_insert.objectid > search_start)
 		search_start = root->last_insert.objectid;
 check_failed:
@@ -288,7 +289,7 @@ check_failed:
 	while (1) {
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
-		if (slot >= l->header.nritems) {
+		if (slot >= btrfs_header_nritems(&l->header)) {
 			ret = next_leaf(root, &path);
 			if (ret == 0)
 				continue;
@@ -404,7 +405,7 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 	struct tree_buffer *buf;
 
 	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
-			   root->node->node.header.parentid,
+			   btrfs_header_parentid(&root->node->node.header),
 			   &ins);
 	if (ret) {
 		BUG();
@@ -429,7 +430,8 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		goto out;
 	while(*level > 0) {
 		cur = path->nodes[*level];
-		if (path->slots[*level] >= cur->node.header.nritems)
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(&cur->node.header))
 			break;
 		blocknr = cur->node.blockptrs[path->slots[*level]];
 		ret = lookup_block_ref(root, blocknr, &refs);
@@ -444,7 +446,7 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		if (path->nodes[*level-1])
 			tree_block_release(root, path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
-		*level = node_level(next->node.header.flags);
+		*level = btrfs_header_level(&next->node.header);
 		path->slots[*level] = 0;
 	}
 out:
@@ -463,7 +465,8 @@ int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 	int ret;
 	for(i = *level; i < MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < path->nodes[i]->node.header.nritems - 1) {
+		if (slot <
+		    btrfs_header_nritems(&path->nodes[i]->node.header)- 1) {
 			path->slots[i]++;
 			*level = i;
 			return 0;
@@ -489,7 +492,7 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 
 	init_path(&path);
 
-	level = node_level(snap->node.header.flags);
+	level = btrfs_header_level(&snap->node.header);
 	orig_level = level;
 	path.nodes[level] = snap;
 	path.slots[level] = 0;
@@ -509,33 +512,3 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 
 	return 0;
 }
-
-
-#if 0
-int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
-{
-	int ret;
-	int level;
-	int refs;
-	u64 blocknr = snap->blocknr;
-
-	level = node_level(snap->node.header.flags);
-	ret = lookup_block_ref(root, snap->blocknr, &refs);
-	BUG_ON(ret);
-	if (refs == 1 && level != 0) {
-		struct node *n = &snap->node;
-		struct tree_buffer *b;
-		int i;
-		for (i = 0; i < n->header.nritems; i++) {
-			b = read_tree_block(root, n->blockptrs[i]);
-			/* FIXME, don't recurse here */
-			ret = btrfs_drop_snapshot(root, b);
-			BUG_ON(ret);
-			tree_block_release(root, b);
-		}
-	}
-	ret = free_extent(root, blocknr, 1);
-	BUG_ON(ret);
-	return 0;
-}
-#endif
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index fd4e5dea7e1..b2ba9469629 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -34,15 +34,15 @@ int mkfs(int fd)
 
 	/* create leaves for the tree root and extent root */
 	memset(&empty_leaf, 0, sizeof(empty_leaf));
-	empty_leaf.header.parentid = 1;
-	empty_leaf.header.blocknr = 17;
+	btrfs_set_header_parentid(&empty_leaf.header, 1);
+	btrfs_set_header_blocknr(&empty_leaf.header, 17);
 	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * CTREE_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
 
-	empty_leaf.header.parentid = 2;
-	empty_leaf.header.blocknr = 18;
-	empty_leaf.header.nritems = 3;
+	btrfs_set_header_parentid(&empty_leaf.header, 2);
+	btrfs_set_header_blocknr(&empty_leaf.header, 18);
+	btrfs_set_header_nritems(&empty_leaf.header, 3);
 
 	/* item1, reserve blocks 0-16 */
 	item.key.objectid = 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index e32a959dd3e..7df16b1e473 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -8,11 +8,11 @@
 void print_leaf(struct leaf *l)
 {
 	int i;
-	int nr = l->header.nritems;
+	u32 nr = btrfs_header_nritems(&l->header);
 	struct item *item;
 	struct extent_item *ei;
-	printf("leaf %Lu total ptrs %d free space %d\n", l->header.blocknr, nr,
-	       leaf_free_space(l));
+	printf("leaf %Lu total ptrs %d free space %d\n",
+		btrfs_header_blocknr(&l->header), nr, leaf_free_space(l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
@@ -32,22 +32,20 @@ void print_leaf(struct leaf *l)
 void print_tree(struct ctree_root *root, struct tree_buffer *t)
 {
 	int i;
-	int nr;
+	u32 nr;
 	struct node *c;
 
 	if (!t)
 		return;
 	c = &t->node;
-	nr = c->header.nritems;
-	if (c->header.blocknr != t->blocknr)
-		BUG();
-	if (is_leaf(c->header.flags)) {
+	nr = btrfs_header_nritems(&c->header);
+	if (btrfs_is_leaf(c)) {
 		print_leaf((struct leaf *)c);
 		return;
 	}
 	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
-	        node_level(c->header.flags), c->header.nritems,
-		(u32)NODEPTRS_PER_BLOCK - c->header.nritems);
+	        btrfs_header_level(&c->header), nr,
+		(u32)NODEPTRS_PER_BLOCK - nr);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
 		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
@@ -60,11 +58,11 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 		struct tree_buffer *next_buf = read_tree_block(root,
 							    c->blockptrs[i]);
 		struct node *next = &next_buf->node;
-		if (is_leaf(next->header.flags) &&
-		    node_level(c->header.flags) != 1)
+		if (btrfs_is_leaf(next) &&
+		    btrfs_header_level(&c->header) != 1)
 			BUG();
-		if (node_level(next->header.flags) !=
-			node_level(c->header.flags) - 1)
+		if (btrfs_header_level(&next->header) !=
+			btrfs_header_level(&c->header) - 1)
 			BUG();
 		print_tree(root, next_buf);
 		tree_block_release(root, next_buf);
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index ccca9b2c14a..07fd71b7762 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -71,9 +71,10 @@ int main(int ac, char **av) {
 	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
 	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
-	        node_level(root->node->node.header.flags),
-		root->node->node.header.nritems,
-		NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+	        btrfs_header_level(&root->node->node.header),
+		btrfs_header_nritems(&root->node->node.header),
+		NODEPTRS_PER_BLOCK -
+		btrfs_header_nritems(&root->node->node.header));
 	printf("all searches good, deleting some items\n");
 	i = 0;
 	srand(55);
@@ -126,7 +127,8 @@ int main(int ac, char **av) {
 		release_path(root, &path);
 	}
 	printf("starting big long delete run\n");
-	while(root->node && root->node->node.header.nritems > 0) {
+	while(root->node &&
+	      btrfs_header_nritems(&root->node->node.header) > 0) {
 		struct leaf *leaf;
 		int slot;
 		ins.objectid = (u64)-1;
@@ -137,7 +139,7 @@ int main(int ac, char **av) {
 
 		leaf = &path.nodes[0]->leaf;
 		slot = path.slots[0];
-		if (slot != leaf->header.nritems)
+		if (slot != btrfs_header_nritems(&leaf->header))
 			BUG();
 		while(path.slots[0] > 0) {
 			path.slots[0] -= 1;
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 7b37b6bae10..5d99b730a69 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -358,8 +358,8 @@ int main(int ac, char **av)
 		}
 		if (i && i % 5000 == 0) {
 			printf("open & close, root level %d nritems %d\n",
-				node_level(root->node->node.header.flags),
-				root->node->node.header.nritems);
+				btrfs_header_level(&root->node->node.header),
+				btrfs_header_nritems(&root->node->node.header));
 			close_ctree(root, &super);
 			root = open_ctree("dbfile", &super);
 		}
-- 
cgit v1.2.3


From bb492bb0a5453222109a72859353728959c2539d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 Mar 2007 12:29:44 -0400
Subject: Btrfs: Add sparse endian annotations to struct header; rename struct
 header to btrfs_header

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      | 65 ++++++++++++++++++++++++++-------------------------
 fs/btrfs/kerncompat.h | 29 ++++++++++++++++++++++-
 2 files changed, 61 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 60c21fe1566..e1aaca66d59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -24,19 +24,19 @@ struct key {
 /*
  * every tree block (leaf or node) starts with this header.
  */
-struct header {
-	u64 fsid[2]; /* FS specific uuid */
-	u64 blocknr; /* which block this node is supposed to live in */
-	u64 parentid; /* objectid of the tree root */
-	u32 csum;
-	u32 ham;
-	u16 nritems;
-	u16 flags;
+struct btrfs_header {
+	__le64 fsid[2]; /* FS specific uuid */
+	__le64 blocknr; /* which block this node is supposed to live in */
+	__le64 parentid; /* objectid of the tree root */
+	__le32 csum;
+	__le32 ham;
+	__le16 nritems;
+	__le16 flags;
 	/* generation flags to be added */
 } __attribute__ ((__packed__));
 
 #define MAX_LEVEL 8
-#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
+#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct btrfs_header)) / \
 			    (sizeof(struct key) + sizeof(u64)))
 
 struct tree_buffer;
@@ -100,12 +100,12 @@ struct item {
  * The data is separate from the items to get the keys closer together
  * during searches.
  */
-#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
+#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct btrfs_header))
 struct leaf {
-	struct header header;
+	struct btrfs_header header;
 	union {
 		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
-		u8 data[CTREE_BLOCKSIZE-sizeof(struct header)];
+		u8 data[CTREE_BLOCKSIZE-sizeof(struct btrfs_header)];
 	};
 } __attribute__ ((__packed__));
 
@@ -114,7 +114,7 @@ struct leaf {
  * other blocks
  */
 struct node {
-	struct header header;
+	struct btrfs_header header;
 	struct key keys[NODEPTRS_PER_BLOCK];
 	u64 blockptrs[NODEPTRS_PER_BLOCK];
 } __attribute__ ((__packed__));
@@ -141,54 +141,55 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
-static inline u64 btrfs_header_blocknr(struct header *h)
+static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
-	return h->blocknr;
+	return le64_to_cpu(h->blocknr);
 }
 
-static inline void btrfs_set_header_blocknr(struct header *h, u64 blocknr)
+static inline void btrfs_set_header_blocknr(struct btrfs_header *h, u64 blocknr)
 {
-	h->blocknr = blocknr;
+	h->blocknr = cpu_to_le64(blocknr);
 }
 
-static inline u64 btrfs_header_parentid(struct header *h)
+static inline u64 btrfs_header_parentid(struct btrfs_header *h)
 {
-	return h->parentid;
+	return le64_to_cpu(h->parentid);
 }
 
-static inline void btrfs_set_header_parentid(struct header *h, u64 parentid)
+static inline void btrfs_set_header_parentid(struct btrfs_header *h,
+					     u64 parentid)
 {
-	h->parentid = parentid;
+	h->parentid = cpu_to_le64(parentid);
 }
 
-static inline u32 btrfs_header_nritems(struct header *h)
+static inline u16 btrfs_header_nritems(struct btrfs_header *h)
 {
-	return h->nritems;
+	return le16_to_cpu(h->nritems);
 }
 
-static inline void btrfs_set_header_nritems(struct header *h, u32 val)
+static inline void btrfs_set_header_nritems(struct btrfs_header *h, u16 val)
 {
-	h->nritems = val;
+	h->nritems = cpu_to_le16(val);
 }
 
-static inline u32 btrfs_header_flags(struct header *h)
+static inline u16 btrfs_header_flags(struct btrfs_header *h)
 {
-	return h->flags;
+	return le16_to_cpu(h->flags);
 }
 
-static inline void btrfs_set_header_flags(struct header *h, u32 val)
+static inline void btrfs_set_header_flags(struct btrfs_header *h, u16 val)
 {
-	h->flags = val;
+	h->flags = cpu_to_le16(val);
 }
 
-static inline int btrfs_header_level(struct header *h)
+static inline int btrfs_header_level(struct btrfs_header *h)
 {
 	return btrfs_header_flags(h) & (MAX_LEVEL - 1);
 }
 
-static inline void btrfs_set_header_level(struct header *h, int level)
+static inline void btrfs_set_header_level(struct btrfs_header *h, int level)
 {
-	u32 flags;
+	u16 flags;
 	BUG_ON(level > MAX_LEVEL);
 	flags = btrfs_header_flags(h) & ~(MAX_LEVEL - 1);
 	btrfs_set_header_flags(h, flags | level);
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index f5efc5f0fff..54cf70eaa01 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -9,10 +9,16 @@
 #define GFP_KERNEL 0
 #define __read_mostly
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#define __force
 #define PAGE_SHIFT 12
 #define ULONG_MAX       (~0UL)
 #define BUG() abort()
+#ifdef __CHECKER__
+#define __force    __attribute__((force))
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __force
+#define __bitwise__
+#endif
 
 typedef unsigned int u32;
 typedef unsigned long long u64;
@@ -67,3 +73,24 @@ static inline void __clear_bit(int bit, unsigned long *map) {
 
 #define ENOMEM 5
 #define EEXIST 6
+
+#define __CHECK_ENDIAN__
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+typedef u16 __bitwise __le16;
+typedef u16 __bitwise __be16;
+typedef u32 __bitwise __le32;
+typedef u32 __bitwise __be32;
+typedef u64 __bitwise __le64;
+typedef u64 __bitwise __be64;
+
+#define cpu_to_le64(x) ((__force __le64)(u64)(x))
+#define le64_to_cpu(x) ((__force u64)(__le64)(x))
+#define cpu_to_le32(x) ((__force __le32)(u32)(x))
+#define le32_to_cpu(x) ((__force u32)(__le32)(x))
+#define cpu_to_le16(x) ((__force __le16)(u16)(x))
+#define le16_to_cpu(x) ((__force u16)(__le16)(x))
-- 
cgit v1.2.3


From e2fa7227cdf132d72e7410dd0679dc573a1c2618 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 Mar 2007 16:22:34 -0400
Subject: Btrfs: struct key endian fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 92 +++++++++++++++++++++++++++++---------------------
 fs/btrfs/ctree.h       | 78 +++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/extent-tree.c | 34 ++++++++++---------
 fs/btrfs/kerncompat.h  |  3 +-
 fs/btrfs/mkfs.c        | 14 ++++----
 fs/btrfs/quick-test.c  |  6 ++--
 fs/btrfs/random-test.c | 17 +++++-----
 7 files changed, 162 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e43c827e0df..489019ac04b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -95,19 +95,23 @@ int leaf_free_space(struct leaf *leaf)
 /*
  * compare two keys in a memcmp fashion
  */
-int comp_keys(struct key *k1, struct key *k2)
+int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 {
-	if (k1->objectid > k2->objectid)
+	struct btrfs_key k1;
+
+	btrfs_disk_key_to_cpu(&k1, disk);
+
+	if (k1.objectid > k2->objectid)
 		return 1;
-	if (k1->objectid < k2->objectid)
+	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1->flags > k2->flags)
+	if (k1.flags > k2->flags)
 		return 1;
-	if (k1->flags < k2->flags)
+	if (k1.flags < k2->flags)
 		return -1;
-	if (k1->offset > k2->offset)
+	if (k1.offset > k2->offset)
 		return 1;
-	if (k1->offset < k2->offset)
+	if (k1.offset < k2->offset)
 		return -1;
 	return 0;
 }
@@ -125,15 +129,18 @@ int check_node(struct ctree_path *path, int level)
 	parent_slot = path->slots[level + 1];
 	BUG_ON(nritems == 0);
 	if (parent) {
-		struct key *parent_key;
+		struct btrfs_disk_key *parent_key;
 		parent_key = &parent->keys[parent_slot];
-		BUG_ON(memcmp(parent_key, node->keys, sizeof(struct key)));
+		BUG_ON(memcmp(parent_key, node->keys,
+			      sizeof(struct btrfs_disk_key)));
 		BUG_ON(parent->blockptrs[parent_slot] !=
 		       btrfs_header_blocknr(&node->header));
 	}
 	BUG_ON(nritems > NODEPTRS_PER_BLOCK);
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		BUG_ON(comp_keys(&node->keys[i], &node->keys[i+1]) >= 0);
+		struct btrfs_key cpukey;
+		btrfs_disk_key_to_cpu(&cpukey, &node->keys[i + 1]);
+		BUG_ON(comp_keys(&node->keys[i], &cpukey) >= 0);
 	}
 	return 0;
 }
@@ -155,16 +162,18 @@ int check_leaf(struct ctree_path *path, int level)
 		return 0;
 
 	if (parent) {
-		struct key *parent_key;
+		struct btrfs_disk_key *parent_key;
 		parent_key = &parent->keys[parent_slot];
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
-		       sizeof(struct key)));
+		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(parent->blockptrs[parent_slot] !=
 		       btrfs_header_blocknr(&leaf->header));
 	}
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
+		struct btrfs_key cpukey;
+		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[i + 1].key);
 		BUG_ON(comp_keys(&leaf->items[i].key,
-		                 &leaf->items[i+1].key) >= 0);
+		                 &cpukey) >= 0);
 		BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset +
 		    leaf->items[i + 1].size);
 		if (i == 0) {
@@ -191,18 +200,18 @@ int check_block(struct ctree_path *path, int level)
  *
  * slot may point to max if the key is bigger than all of the keys
  */
-int generic_bin_search(char *p, int item_size, struct key *key,
+int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
 		       int max, int *slot)
 {
 	int low = 0;
 	int high = max;
 	int mid;
 	int ret;
-	struct key *tmp;
+	struct btrfs_disk_key *tmp;
 
 	while(low < high) {
 		mid = (low + high) / 2;
-		tmp = (struct key *)(p + mid * item_size);
+		tmp = (struct btrfs_disk_key *)(p + mid * item_size);
 		ret = comp_keys(tmp, key);
 
 		if (ret < 0)
@@ -222,7 +231,7 @@ int generic_bin_search(char *p, int item_size, struct key *key,
  * simple bin_search frontend that does the right thing for
  * leaves vs nodes
  */
-int bin_search(struct node *c, struct key *key, int *slot)
+int bin_search(struct node *c, struct btrfs_key *key, int *slot)
 {
 	if (btrfs_is_leaf(c)) {
 		struct leaf *l = (struct leaf *)c;
@@ -230,7 +239,8 @@ int bin_search(struct node *c, struct key *key, int *slot)
 					  key, btrfs_header_nritems(&c->header),
 					  slot);
 	} else {
-		return generic_bin_search((void *)c->keys, sizeof(struct key),
+		return generic_bin_search((void *)c->keys,
+					  sizeof(struct btrfs_disk_key),
 					  key, btrfs_header_nritems(&c->header),
 					  slot);
 	}
@@ -339,7 +349,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 				ret = wret;
 		} else {
 			memcpy(parent->keys + pslot + 1, right->keys,
-				sizeof(struct key));
+				sizeof(struct btrfs_disk_key));
 			BUG_ON(list_empty(&parent_buf->dirty));
 		}
 	}
@@ -374,7 +384,8 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			ret = wret;
 	} else {
 		/* update the parent key to reflect our changes */
-		memcpy(parent->keys + pslot, mid->keys, sizeof(struct key));
+		memcpy(parent->keys + pslot, mid->keys,
+		       sizeof(struct btrfs_disk_key));
 		BUG_ON(list_empty(&parent_buf->dirty));
 	}
 
@@ -417,7 +428,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
  * possible)
  */
-int search_slot(struct ctree_root *root, struct key *key,
+int search_slot(struct ctree_root *root, struct btrfs_key *key,
 		struct ctree_path *p, int ins_len, int cow)
 {
 	struct tree_buffer *b;
@@ -499,7 +510,7 @@ again:
  * fixing up the blocks in ram so the tree is consistent.
  */
 static int fixup_low_keys(struct ctree_root *root,
-			   struct ctree_path *path, struct key *key,
+			   struct ctree_path *path, struct btrfs_disk_key *key,
 			   int level)
 {
 	int i;
@@ -546,12 +557,13 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
 		push_items = src_nritems;
 
 	memcpy(dst->keys + dst_nritems, src->keys,
-		push_items * sizeof(struct key));
+		push_items * sizeof(struct btrfs_disk_key));
 	memcpy(dst->blockptrs + dst_nritems, src->blockptrs,
 		push_items * sizeof(u64));
 	if (push_items < src_nritems) {
 		memmove(src->keys, src->keys + push_items,
-			(src_nritems - push_items) * sizeof(struct key));
+			(src_nritems - push_items) *
+			sizeof(struct btrfs_disk_key));
 		memmove(src->blockptrs, src->blockptrs + push_items,
 			(src_nritems - push_items) * sizeof(u64));
 	}
@@ -598,11 +610,11 @@ static int balance_node_right(struct ctree_root *root,
 		push_items = max_push;
 
 	memmove(dst->keys + push_items, dst->keys,
-		dst_nritems * sizeof(struct key));
+		dst_nritems * sizeof(struct btrfs_disk_key));
 	memmove(dst->blockptrs + push_items, dst->blockptrs,
 		dst_nritems * sizeof(u64));
 	memcpy(dst->keys, src->keys + src_nritems - push_items,
-		push_items * sizeof(struct key));
+		push_items * sizeof(struct btrfs_disk_key));
 	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
 		push_items * sizeof(u64));
 
@@ -627,7 +639,7 @@ static int insert_new_root(struct ctree_root *root,
 	struct tree_buffer *t;
 	struct node *lower;
 	struct node *c;
-	struct key *lower_key;
+	struct btrfs_disk_key *lower_key;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
@@ -645,7 +657,7 @@ static int insert_new_root(struct ctree_root *root,
 		lower_key = &((struct leaf *)lower)->items[0].key;
 	else
 		lower_key = lower->keys;
-	memcpy(c->keys, lower_key, sizeof(struct key));
+	memcpy(c->keys, lower_key, sizeof(struct btrfs_disk_key));
 	c->blockptrs[0] = path->nodes[level-1]->blocknr;
 	/* the super has an extra ref to root->node */
 	tree_block_release(root, root->node);
@@ -666,7 +678,7 @@ static int insert_new_root(struct ctree_root *root,
  * returns zero on success and < 0 on any error
  */
 static int insert_ptr(struct ctree_root *root,
-		struct ctree_path *path, struct key *key,
+		struct ctree_path *path, struct btrfs_disk_key *key,
 		u64 blocknr, int slot, int level)
 {
 	struct node *lower;
@@ -681,11 +693,11 @@ static int insert_ptr(struct ctree_root *root,
 		BUG();
 	if (slot != nritems) {
 		memmove(lower->keys + slot + 1, lower->keys + slot,
-			(nritems - slot) * sizeof(struct key));
+			(nritems - slot) * sizeof(struct btrfs_disk_key));
 		memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
 			(nritems - slot) * sizeof(u64));
 	}
-	memcpy(lower->keys + slot, key, sizeof(struct key));
+	memcpy(lower->keys + slot, key, sizeof(struct btrfs_disk_key));
 	lower->blockptrs[slot] = blocknr;
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
 	if (lower->keys[1].objectid == 0)
@@ -732,7 +744,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 	                       btrfs_header_parentid(&root->node->node.header));
 	mid = (c_nritems + 1) / 2;
 	memcpy(split->keys, c->keys + mid,
-		(c_nritems - mid) * sizeof(struct key));
+		(c_nritems - mid) * sizeof(struct btrfs_disk_key));
 	memcpy(split->blockptrs, c->blockptrs + mid,
 		(c_nritems - mid) * sizeof(u64));
 	btrfs_set_header_nritems(&split->header, c_nritems - mid);
@@ -869,7 +881,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	BUG_ON(list_empty(&left_buf->dirty));
 	BUG_ON(list_empty(&right_buf->dirty));
 	memcpy(upper->node.keys + slot + 1,
-		&right->items[0].key, sizeof(struct key));
+		&right->items[0].key, sizeof(struct btrfs_disk_key));
 	BUG_ON(list_empty(&upper->dirty));
 
 	/* then fixup the leaf pointer in the path */
@@ -1090,7 +1102,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int insert_item(struct ctree_root *root, struct key *key,
+int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 			  void *data, int data_size)
 {
 	int ret = 0;
@@ -1101,12 +1113,15 @@ int insert_item(struct ctree_root *root, struct key *key,
 	u32 nritems;
 	unsigned int data_end;
 	struct ctree_path path;
+	struct btrfs_disk_key disk_key;
+
+	btrfs_cpu_key_to_disk(&disk_key, cpu_key);
 
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
 	init_path(&path);
-	ret = search_slot(root, key, &path, data_size, 1);
+	ret = search_slot(root, cpu_key, &path, data_size, 1);
 	if (ret == 0) {
 		release_path(root, &path);
 		return -EEXIST;
@@ -1148,7 +1163,8 @@ int insert_item(struct ctree_root *root, struct key *key,
 		data_end = old_data;
 	}
 	/* copy the new data in */
-	memcpy(&leaf->items[slot].key, key, sizeof(struct key));
+	memcpy(&leaf->items[slot].key, &disk_key,
+		sizeof(struct btrfs_disk_key));
 	leaf->items[slot].offset = data_end - data_size;
 	leaf->items[slot].size = data_size;
 	memcpy(leaf->data + data_end - data_size, data, data_size);
@@ -1156,7 +1172,7 @@ int insert_item(struct ctree_root *root, struct key *key,
 
 	ret = 0;
 	if (slot == 0)
-		ret = fixup_low_keys(root, &path, key, 1);
+		ret = fixup_low_keys(root, &path, &disk_key, 1);
 
 	BUG_ON(list_empty(&leaf_buf->dirty));
 	if (leaf_free_space(leaf) < 0)
@@ -1187,7 +1203,7 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
 	nritems = btrfs_header_nritems(&node->header);
 	if (slot != nritems -1) {
 		memmove(node->keys + slot, node->keys + slot + 1,
-			sizeof(struct key) * (nritems - slot - 1));
+			sizeof(struct btrfs_disk_key) * (nritems - slot - 1));
 		memmove(node->blockptrs + slot,
 			node->blockptrs + slot + 1,
 			sizeof(u64) * (nritems - slot - 1));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e1aaca66d59..58e03e90f23 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2,6 +2,7 @@
 #define __CTREE__
 
 #include "list.h"
+#include "kerncompat.h"
 
 #define CTREE_BLOCKSIZE 1024
 
@@ -14,8 +15,18 @@
  * may point to extents.
  *
  * offset is the starting byte offset for this key in the stream.
+ *
+ * struct btrfs_disk_key is in disk (little-endian) byte order.  struct
+ * btrfs_key is always in cpu native order.  Otherwise they are identical
+ * and their sizes should be the same (i.e. both are packed).
  */
-struct key {
+struct btrfs_disk_key {
+	__le64 objectid;
+	__le32 flags;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_key {
 	u64 objectid;
 	u32 flags;
 	u64 offset;
@@ -37,7 +48,7 @@ struct btrfs_header {
 
 #define MAX_LEVEL 8
 #define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct btrfs_header)) / \
-			    (sizeof(struct key) + sizeof(u64)))
+			    (sizeof(struct btrfs_disk_key) + sizeof(u64)))
 
 struct tree_buffer;
 
@@ -50,8 +61,8 @@ struct ctree_root {
 	struct tree_buffer *node;
 	struct tree_buffer *commit_root;
 	struct ctree_root *extent_root;
-	struct key current_insert;
-	struct key last_insert;
+	struct btrfs_key current_insert;
+	struct btrfs_key last_insert;
 	int fp;
 	struct radix_tree_root cache_radix;
 	struct radix_tree_root pinned_radix;
@@ -88,7 +99,7 @@ struct ctree_super_block {
  * the item in the leaf (relative to the start of the data area)
  */
 struct item {
-	struct key key;
+	struct btrfs_disk_key key;
 	u16 offset;
 	u16 size;
 } __attribute__ ((__packed__));
@@ -115,7 +126,7 @@ struct leaf {
  */
 struct node {
 	struct btrfs_header header;
-	struct key keys[NODEPTRS_PER_BLOCK];
+	struct btrfs_disk_key keys[NODEPTRS_PER_BLOCK];
 	u64 blockptrs[NODEPTRS_PER_BLOCK];
 } __attribute__ ((__packed__));
 
@@ -141,6 +152,55 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
+static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
+					 struct btrfs_disk_key *disk)
+{
+	cpu->offset = le64_to_cpu(disk->offset);
+	cpu->flags = le32_to_cpu(disk->flags);
+	cpu->objectid = le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
+					 struct btrfs_key *cpu)
+{
+	disk->offset = cpu_to_le64(cpu->offset);
+	disk->flags = cpu_to_le32(cpu->flags);
+	disk->objectid = cpu_to_le64(cpu->objectid);
+}
+
+static inline u64 btrfs_key_objectid(struct btrfs_disk_key *disk)
+{
+	return le64_to_cpu(disk->objectid);
+}
+
+static inline void btrfs_set_key_objectid(struct btrfs_disk_key *disk,
+					  u64 val)
+{
+	disk->objectid = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_key_offset(struct btrfs_disk_key *disk)
+{
+	return le64_to_cpu(disk->offset);
+}
+
+static inline void btrfs_set_key_offset(struct btrfs_disk_key *disk,
+					  u64 val)
+{
+	disk->offset = cpu_to_le64(val);
+}
+
+static inline u32 btrfs_key_flags(struct btrfs_disk_key *disk)
+{
+	return le32_to_cpu(disk->flags);
+}
+
+static inline void btrfs_set_key_flags(struct btrfs_disk_key *disk,
+					  u32 val)
+{
+	disk->flags = cpu_to_le32(val);
+}
+
 static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->blocknr);
@@ -203,11 +263,13 @@ static inline int btrfs_is_leaf(struct node *n)
 struct tree_buffer *alloc_free_block(struct ctree_root *root);
 int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf);
 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
-int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p, int ins_len, int cow);
+int search_slot(struct ctree_root *root, struct btrfs_key *key,
+		struct ctree_path *p, int ins_len, int cow);
 void release_path(struct ctree_root *root, struct ctree_path *p);
 void init_path(struct ctree_path *p);
 int del_item(struct ctree_root *root, struct ctree_path *path);
-int insert_item(struct ctree_root *root, struct key *key, void *data, int data_size);
+int insert_item(struct ctree_root *root, struct btrfs_key *key,
+		void *data, int data_size);
 int next_leaf(struct ctree_root *root, struct ctree_path *path);
 int leaf_free_space(struct leaf *leaf);
 int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 524c11fea24..475e7221596 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7,7 +7,8 @@
 #include "print-tree.h"
 
 static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
-			    u64 search_start, u64 search_end, struct key *ins);
+			    u64 search_start, u64 search_end,
+			    struct btrfs_key *ins);
 static int finish_current_insert(struct ctree_root *extent_root);
 static int run_pending(struct ctree_root *extent_root);
 
@@ -24,10 +25,10 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 {
 	struct ctree_path path;
 	int ret;
-	struct key key;
+	struct btrfs_key key;
 	struct leaf *l;
 	struct extent_item *item;
-	struct key ins;
+	struct btrfs_key ins;
 
 	find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins);
 	init_path(&path);
@@ -54,7 +55,7 @@ static int lookup_block_ref(struct ctree_root *root, u64 blocknr, u32 *refs)
 {
 	struct ctree_path path;
 	int ret;
-	struct key key;
+	struct btrfs_key key;
 	struct leaf *l;
 	struct extent_item *item;
 	init_path(&path);
@@ -113,7 +114,7 @@ int btrfs_finish_extent_commit(struct ctree_root *root)
 
 static int finish_current_insert(struct ctree_root *extent_root)
 {
-	struct key ins;
+	struct btrfs_key ins;
 	struct extent_item extent_item;
 	int i;
 	int ret;
@@ -140,12 +141,12 @@ static int finish_current_insert(struct ctree_root *extent_root)
 int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	struct ctree_root *extent_root = root->extent_root;
 	int ret;
 	struct item *item;
 	struct extent_item *ei;
-	struct key ins;
+	struct btrfs_key ins;
 
 	key.objectid = blocknr;
 	key.flags = 0;
@@ -227,7 +228,7 @@ static int run_pending(struct ctree_root *extent_root)
  */
 int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 {
-	struct key key;
+	struct btrfs_key key;
 	struct ctree_root *extent_root = root->extent_root;
 	struct tree_buffer *t;
 	int pending_ret;
@@ -256,10 +257,11 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
-			    u64 search_start, u64 search_end, struct key *ins)
+			    u64 search_start, u64 search_end,
+			    struct btrfs_key *ins)
 {
 	struct ctree_path path;
-	struct key *key;
+	struct btrfs_key key;
 	int ret;
 	u64 hole_size = 0;
 	int slot = 0;
@@ -306,12 +308,12 @@ check_failed:
 			ins->offset = (u64)-1;
 			goto check_pending;
 		}
-		key = &l->items[slot].key;
-		if (key->objectid >= search_start) {
+		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		if (key.objectid >= search_start) {
 			if (start_found) {
 				if (last_block < search_start)
 					last_block = search_start;
-				hole_size = key->objectid - last_block;
+				hole_size = key.objectid - last_block;
 				if (hole_size > total_needed) {
 					ins->objectid = last_block;
 					ins->offset = hole_size;
@@ -320,7 +322,7 @@ check_failed:
 			}
 		}
 		start_found = 1;
-		last_block = key->objectid + key->offset;
+		last_block = key.objectid + key.offset;
 		path.slots[0]++;
 	}
 	// FIXME -ENOSPC
@@ -357,7 +359,7 @@ error:
  * returns 0 if everything worked, non-zero otherwise.
  */
 int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
-			 u64 search_end, u64 owner, struct key *ins)
+			 u64 search_end, u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -400,7 +402,7 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
  */
 struct tree_buffer *alloc_free_block(struct ctree_root *root)
 {
-	struct key ins;
+	struct btrfs_key ins;
 	int ret;
 	struct tree_buffer *buf;
 
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index 54cf70eaa01..56239b922ba 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -69,8 +69,6 @@ static inline void __clear_bit(int bit, unsigned long *map) {
         const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
 	        (type *)( (char *)__mptr - __builtin_offsetof(type,member) );})
 
-#endif
-
 #define ENOMEM 5
 #define EEXIST 6
 
@@ -94,3 +92,4 @@ typedef u64 __bitwise __be64;
 #define le32_to_cpu(x) ((__force u32)(__le32)(x))
 #define cpu_to_le16(x) ((__force __le16)(u16)(x))
 #define le16_to_cpu(x) ((__force u16)(__le16)(x))
+#endif
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index b2ba9469629..5290423343f 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -45,9 +45,9 @@ int mkfs(int fd)
 	btrfs_set_header_nritems(&empty_leaf.header, 3);
 
 	/* item1, reserve blocks 0-16 */
-	item.key.objectid = 0;
-	item.key.offset = 17;
-	item.key.flags = 0;
+	item.key.objectid = cpu_to_le64(0);
+	item.key.offset = cpu_to_le64(17);
+	item.key.flags = cpu_to_le32(0);
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item);
 	item.size = sizeof(struct extent_item);
 	extent_item.refs = 1;
@@ -56,16 +56,16 @@ int mkfs(int fd)
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
 
 	/* item2, give block 17 to the root */
-	item.key.objectid = 17;
-	item.key.offset = 1;
+	item.key.objectid = cpu_to_le64(17);
+	item.key.offset = cpu_to_le64(1);
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 2;
 	extent_item.owner = 1;
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
 	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
 
 	/* item3, give block 18 for the extent root */
-	item.key.objectid = 18;
-	item.key.offset = 1;
+	item.key.objectid = cpu_to_le64(18);
+	item.key.offset = cpu_to_le64(1);
 	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 3;
 	extent_item.owner = 2;
 	memcpy(empty_leaf.items + 2, &item, sizeof(item));
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index 07fd71b7762..ab3bda53a2f 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -13,8 +13,8 @@ int next_key(int i, int max_key) {
 }
 
 int main(int ac, char **av) {
-	struct key ins;
-	struct key last = { (u64)-1, 0, 0};
+	struct btrfs_key ins;
+	struct btrfs_key last = { (u64)-1, 0, 0};
 	char *buf;
 	int i;
 	int num;
@@ -146,7 +146,7 @@ int main(int ac, char **av) {
 			slot = path.slots[0];
 			leaf = &path.nodes[0]->leaf;
 
-			memcpy(&last, &leaf->items[slot].key, sizeof(last));
+			btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key);
 			if (tree_size % 10000 == 0)
 				printf("big del %d:%d\n", tree_size, i);
 			ret = del_item(root, &path);
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 5d99b730a69..34a15841ebd 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -10,7 +10,8 @@
 int keep_running = 1;
 struct ctree_super_block super;
 
-static int setup_key(struct radix_tree_root *root, struct key *key, int exists)
+static int setup_key(struct radix_tree_root *root, struct btrfs_key *key,
+		     int exists)
 {
 	int num = rand();
 	unsigned long res[2];
@@ -38,7 +39,7 @@ again:
 static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	int ret;
 	char buf[128];
 	unsigned long oid;
@@ -63,7 +64,7 @@ error:
 static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	int ret;
 	char buf[128];
 	init_path(&path);
@@ -82,7 +83,7 @@ static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	int ret;
 	unsigned long *ptr;
 	init_path(&path);
@@ -108,7 +109,7 @@ error:
 static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	int ret;
 	init_path(&path);
 	ret = setup_key(radix, &key, 1);
@@ -127,7 +128,7 @@ error:
 static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	int ret;
 	init_path(&path);
 	ret = setup_key(radix, &key, 0);
@@ -147,7 +148,7 @@ static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
 		      int nr)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	unsigned long found = 0;
 	int ret;
 	int slot;
@@ -248,7 +249,7 @@ int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
 static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 {
 	struct ctree_path path;
-	struct key key;
+	struct btrfs_key key;
 	unsigned long found;
 	int ret;
 	int slot;
-- 
cgit v1.2.3


From 0783fcfc4dc19b8bb99bd51b7afa669ba4cbd377 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 Mar 2007 20:12:07 -0400
Subject: Btrfs: struct item endian fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 129 +++++++++++++++++++++++++++----------------------
 fs/btrfs/ctree.h       |  34 +++++++++++--
 fs/btrfs/extent-tree.c |  12 +++--
 fs/btrfs/mkfs.c        |  36 ++++++++------
 fs/btrfs/print-tree.c  |  15 +++---
 fs/btrfs/random-test.c |   5 +-
 6 files changed, 141 insertions(+), 90 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 489019ac04b..10555647005 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -76,7 +76,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
 	u32 nr = btrfs_header_nritems(&leaf->header);
 	if (nr == 0)
 		return sizeof(leaf->data);
-	return leaf->items[nr-1].offset;
+	return btrfs_item_offset(leaf->items + nr - 1);
 }
 
 /*
@@ -174,11 +174,12 @@ int check_leaf(struct ctree_path *path, int level)
 		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[i + 1].key);
 		BUG_ON(comp_keys(&leaf->items[i].key,
 		                 &cpukey) >= 0);
-		BUG_ON(leaf->items[i].offset != leaf->items[i + 1].offset +
-		    leaf->items[i + 1].size);
+		BUG_ON(btrfs_item_offset(leaf->items + i) !=
+			btrfs_item_end(leaf->items + i + 1));
 		if (i == 0) {
-			BUG_ON(leaf->items[i].offset + leaf->items[i].size !=
-				LEAF_DATA_SIZE);
+			BUG_ON(btrfs_item_offset(leaf->items + i) +
+			       btrfs_item_size(leaf->items + i) !=
+			       LEAF_DATA_SIZE);
 		}
 	}
 	return 0;
@@ -235,7 +236,8 @@ int bin_search(struct node *c, struct btrfs_key *key, int *slot)
 {
 	if (btrfs_is_leaf(c)) {
 		struct leaf *l = (struct leaf *)c;
-		return generic_bin_search((void *)l->items, sizeof(struct item),
+		return generic_bin_search((void *)l->items,
+					  sizeof(struct btrfs_item),
 					  key, btrfs_header_nritems(&c->header),
 					  slot);
 	} else {
@@ -485,7 +487,7 @@ again:
 			struct leaf *l = (struct leaf *)c;
 			p->slots[level] = slot;
 			if (ins_len > 0 && leaf_free_space(l) <
-			    sizeof(struct item) + ins_len) {
+			    sizeof(struct btrfs_item) + ins_len) {
 				int sret = split_leaf(root, p, ins_len);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -780,9 +782,9 @@ static int leaf_space_used(struct leaf *l, int start, int nr)
 
 	if (!nr)
 		return 0;
-	data_len = l->items[start].offset + l->items[start].size;
-	data_len = data_len - l->items[end].offset;
-	data_len += sizeof(struct item) * nr;
+	data_len = btrfs_item_end(l->items + start);
+	data_len = data_len - btrfs_item_offset(l->items + end);
+	data_len += sizeof(struct btrfs_item) * nr;
 	return data_len;
 }
 
@@ -806,7 +808,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	int free_space;
 	int push_space = 0;
 	int push_items = 0;
-	struct item *item;
+	struct btrfs_item *item;
 	u32 left_nritems;
 	u32 right_nritems;
 
@@ -821,7 +823,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]);
 	right = &right_buf->leaf;
 	free_space = leaf_free_space(right);
-	if (free_space < data_size + sizeof(struct item)) {
+	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		tree_block_release(root, right_buf);
 		return 1;
 	}
@@ -829,7 +831,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf);
 	right = &right_buf->leaf;
 	free_space = leaf_free_space(right);
-	if (free_space < data_size + sizeof(struct item)) {
+	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		tree_block_release(root, right_buf);
 		return 1;
 	}
@@ -839,10 +841,11 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		item = left->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (item->size + sizeof(*item) + push_space > free_space)
+		if (btrfs_item_size(item) + sizeof(*item) + push_space >
+		    free_space)
 			break;
 		push_items++;
-		push_space += item->size + sizeof(*item);
+		push_space += btrfs_item_size(item) + sizeof(*item);
 	}
 	if (push_items == 0) {
 		tree_block_release(root, right_buf);
@@ -850,8 +853,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	}
 	right_nritems = btrfs_header_nritems(&right->header);
 	/* push left to right */
-	push_space = left->items[left_nritems - push_items].offset +
-		     left->items[left_nritems - push_items].size;
+	push_space = btrfs_item_end(left->items + left_nritems - push_items);
 	push_space -= leaf_data_end(left);
 	/* make room in the right data area */
 	memmove(right->data + leaf_data_end(right) - push_space,
@@ -862,18 +864,19 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		left->data + leaf_data_end(left),
 		push_space);
 	memmove(right->items + push_items, right->items,
-		right_nritems * sizeof(struct item));
+		right_nritems * sizeof(struct btrfs_item));
 	/* copy the items from left to right */
 	memcpy(right->items, left->items + left_nritems - push_items,
-		push_items * sizeof(struct item));
+		push_items * sizeof(struct btrfs_item));
 
 	/* update the item pointers */
 	right_nritems += push_items;
 	btrfs_set_header_nritems(&right->header, right_nritems);
 	push_space = LEAF_DATA_SIZE;
 	for (i = 0; i < right_nritems; i++) {
-		right->items[i].offset = push_space - right->items[i].size;
-		push_space = right->items[i].offset;
+		btrfs_set_item_offset(right->items + i, push_space -
+				      btrfs_item_size(right->items + i));
+		push_space = btrfs_item_offset(right->items + i);
 	}
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(&left->header, left_nritems);
@@ -911,7 +914,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	int free_space;
 	int push_space = 0;
 	int push_items = 0;
-	struct item *item;
+	struct btrfs_item *item;
 	u32 old_left_nritems;
 	int ret = 0;
 	int wret;
@@ -926,7 +929,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
 	left = &t->leaf;
 	free_space = leaf_free_space(left);
-	if (free_space < data_size + sizeof(struct item)) {
+	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		tree_block_release(root, t);
 		return 1;
 	}
@@ -935,7 +938,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t);
 	left = &t->leaf;
 	free_space = leaf_free_space(left);
-	if (free_space < data_size + sizeof(struct item)) {
+	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		tree_block_release(root, t);
 		return 1;
 	}
@@ -944,10 +947,11 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		item = right->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (item->size + sizeof(*item) + push_space > free_space)
+		if (btrfs_item_size(item) + sizeof(*item) + push_space >
+		    free_space)
 			break;
 		push_items++;
-		push_space += item->size + sizeof(*item);
+		push_space += btrfs_item_size(item) + sizeof(*item);
 	}
 	if (push_items == 0) {
 		tree_block_release(root, t);
@@ -955,35 +959,40 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	}
 	/* push data from right to left */
 	memcpy(left->items + btrfs_header_nritems(&left->header),
-		right->items, push_items * sizeof(struct item));
-	push_space = LEAF_DATA_SIZE - right->items[push_items -1].offset;
+		right->items, push_items * sizeof(struct btrfs_item));
+	push_space = LEAF_DATA_SIZE -
+		     btrfs_item_offset(right->items + push_items -1);
 	memcpy(left->data + leaf_data_end(left) - push_space,
-		right->data + right->items[push_items - 1].offset,
+		right->data + btrfs_item_offset(right->items + push_items - 1),
 		push_space);
 	old_left_nritems = btrfs_header_nritems(&left->header);
 	BUG_ON(old_left_nritems < 0);
 
-	for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
-		left->items[i].offset -= LEAF_DATA_SIZE -
-			left->items[old_left_nritems -1].offset;
+	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
+		u16 ioff = btrfs_item_offset(left->items + i);
+		btrfs_set_item_offset(left->items + i, ioff - (LEAF_DATA_SIZE -
+				      btrfs_item_offset(left->items +
+						        old_left_nritems - 1)));
 	}
 	btrfs_set_header_nritems(&left->header, old_left_nritems + push_items);
 
 	/* fixup right node */
-	push_space = right->items[push_items-1].offset - leaf_data_end(right);
+	push_space = btrfs_item_offset(right->items + push_items - 1) -
+		     leaf_data_end(right);
 	memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
 		leaf_data_end(right), push_space);
 	memmove(right->items, right->items + push_items,
 		(btrfs_header_nritems(&right->header) - push_items) *
-		sizeof(struct item));
+		sizeof(struct btrfs_item));
 	btrfs_set_header_nritems(&right->header,
 				 btrfs_header_nritems(&right->header) -
 				 push_items);
 	push_space = LEAF_DATA_SIZE;
 
 	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
-		right->items[i].offset = push_space - right->items[i].size;
-		push_space = right->items[i].offset;
+		btrfs_set_item_offset(right->items + i, push_space -
+				      btrfs_item_size(right->items + i));
+		push_space = btrfs_item_offset(right->items + i);
 	}
 
 	BUG_ON(list_empty(&t->dirty));
@@ -1023,7 +1032,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	int slot;
 	struct leaf *right;
 	struct tree_buffer *right_buffer;
-	int space_needed = data_size + sizeof(struct item);
+	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
 	int i;
@@ -1034,7 +1043,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	l = &l_buf->leaf;
 
 	/* did the pushes work? */
-	if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+	if (leaf_free_space(l) >= sizeof(struct btrfs_item) + data_size)
 		return 0;
 
 	if (!path->nodes[1]) {
@@ -1066,17 +1075,17 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	                       btrfs_header_parentid(&root->node->node.header));
-	data_copy_size = l->items[mid].offset + l->items[mid].size -
-			 leaf_data_end(l);
+	data_copy_size = btrfs_item_end(l->items + mid) - leaf_data_end(l);
 	memcpy(right->items, l->items + mid,
-	       (nritems - mid) * sizeof(struct item));
+	       (nritems - mid) * sizeof(struct btrfs_item));
 	memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
 	       l->data + leaf_data_end(l), data_copy_size);
-	rt_data_off = LEAF_DATA_SIZE -
-		     (l->items[mid].offset + l->items[mid].size);
+	rt_data_off = LEAF_DATA_SIZE - btrfs_item_end(l->items + mid);
 
-	for (i = 0; i < btrfs_header_nritems(&right->header); i++)
-		right->items[i].offset += rt_data_off;
+	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
+		u16 ioff = btrfs_item_offset(right->items + i);
+		btrfs_set_item_offset(right->items + i, ioff + rt_data_off);
+	}
 
 	btrfs_set_header_nritems(&l->header, mid);
 	ret = 0;
@@ -1136,26 +1145,28 @@ int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 	nritems = btrfs_header_nritems(&leaf->header);
 	data_end = leaf_data_end(leaf);
 
-	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
+	if (leaf_free_space(leaf) <  sizeof(struct btrfs_item) + data_size)
 		BUG();
 
 	slot = path.slots[0];
 	BUG_ON(slot < 0);
 	if (slot != nritems) {
 		int i;
-		unsigned int old_data = leaf->items[slot].offset +
-					leaf->items[slot].size;
+		unsigned int old_data = btrfs_item_end(leaf->items + slot);
 
 		/*
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
-		for (i = slot; i < nritems; i++)
-			leaf->items[i].offset -= data_size;
+		for (i = slot; i < nritems; i++) {
+			u16 ioff = btrfs_item_offset(leaf->items + i);
+			btrfs_set_item_offset(leaf->items + i,
+					      ioff - data_size);
+		}
 
 		/* shift the items */
 		memmove(leaf->items + slot + 1, leaf->items + slot,
-		        (nritems - slot) * sizeof(struct item));
+		        (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
 		memmove(leaf->data + data_end - data_size, leaf->data +
@@ -1165,8 +1176,8 @@ int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 	/* copy the new data in */
 	memcpy(&leaf->items[slot].key, &disk_key,
 		sizeof(struct btrfs_disk_key));
-	leaf->items[slot].offset = data_end - data_size;
-	leaf->items[slot].size = data_size;
+	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
+	btrfs_set_item_size(leaf->items + slot, data_size);
 	memcpy(leaf->data + data_end - data_size, data, data_size);
 	btrfs_set_header_nritems(&leaf->header, nritems + 1);
 
@@ -1241,8 +1252,8 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 	leaf_buf = path->nodes[0];
 	leaf = &leaf_buf->leaf;
 	slot = path->slots[0];
-	doff = leaf->items[slot].offset;
-	dsize = leaf->items[slot].size;
+	doff = btrfs_item_offset(leaf->items + slot);
+	dsize = btrfs_item_size(leaf->items + slot);
 	nritems = btrfs_header_nritems(&leaf->header);
 
 	if (slot != nritems - 1) {
@@ -1251,10 +1262,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 		memmove(leaf->data + data_end + dsize,
 			leaf->data + data_end,
 			doff - data_end);
-		for (i = slot + 1; i < nritems; i++)
-			leaf->items[i].offset += dsize;
+		for (i = slot + 1; i < nritems; i++) {
+			u16 ioff = btrfs_item_offset(leaf->items + i);
+			btrfs_set_item_offset(leaf->items + i, ioff + dsize);
+		}
 		memmove(leaf->items + slot, leaf->items + slot + 1,
-			sizeof(struct item) *
+			sizeof(struct btrfs_item) *
 			(nritems - slot - 1));
 	}
 	btrfs_set_header_nritems(&leaf->header, nritems - 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 58e03e90f23..b03df154dcd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -98,10 +98,10 @@ struct ctree_super_block {
  * the key flags parameter.  offset and size tell us where to find
  * the item in the leaf (relative to the start of the data area)
  */
-struct item {
+struct btrfs_item {
 	struct btrfs_disk_key key;
-	u16 offset;
-	u16 size;
+	__le16 offset;
+	__le16 size;
 } __attribute__ ((__packed__));
 
 /*
@@ -115,7 +115,8 @@ struct item {
 struct leaf {
 	struct btrfs_header header;
 	union {
-		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+		struct btrfs_item items[LEAF_DATA_SIZE/
+				        sizeof(struct btrfs_item)];
 		u8 data[CTREE_BLOCKSIZE-sizeof(struct btrfs_header)];
 	};
 } __attribute__ ((__packed__));
@@ -152,6 +153,31 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
+static inline u16 btrfs_item_offset(struct btrfs_item *item)
+{
+	return le16_to_cpu(item->offset);
+}
+
+static inline void btrfs_set_item_offset(struct btrfs_item *item, u16 val)
+{
+	item->offset = cpu_to_le16(val);
+}
+
+static inline u16 btrfs_item_end(struct btrfs_item *item)
+{
+	return le16_to_cpu(item->offset) + le16_to_cpu(item->size);
+}
+
+static inline u16 btrfs_item_size(struct btrfs_item *item)
+{
+	return le16_to_cpu(item->size);
+}
+
+static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val)
+{
+	item->size = cpu_to_le16(val);
+}
+
 static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
 					 struct btrfs_disk_key *disk)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 475e7221596..fdf95bd07f9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -40,8 +40,8 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 		BUG();
 	BUG_ON(ret != 0);
 	l = &path.nodes[0]->leaf;
-	item = (struct extent_item *)(l->data +
-				      l->items[path.slots[0]].offset);
+	item = (struct extent_item *)(l->data + btrfs_item_offset(l->items +
+								path.slots[0]));
 	item->refs++;
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
@@ -67,7 +67,8 @@ static int lookup_block_ref(struct ctree_root *root, u64 blocknr, u32 *refs)
 		BUG();
 	l = &path.nodes[0]->leaf;
 	item = (struct extent_item *)(l->data +
-				      l->items[path.slots[0]].offset);
+				      btrfs_item_offset(l->items +
+							path.slots[0]));
 	*refs = item->refs;
 	release_path(root->extent_root, &path);
 	return 0;
@@ -144,7 +145,7 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	struct btrfs_key key;
 	struct ctree_root *extent_root = root->extent_root;
 	int ret;
-	struct item *item;
+	struct btrfs_item *item;
 	struct extent_item *ei;
 	struct btrfs_key ins;
 
@@ -162,7 +163,8 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 		BUG();
 	}
 	item = path.nodes[0]->leaf.items + path.slots[0];
-	ei = (struct extent_item *)(path.nodes[0]->leaf.data + item->offset);
+	ei = (struct extent_item *)(path.nodes[0]->leaf.data +
+				    btrfs_item_offset(item));
 	BUG_ON(ei->refs == 0);
 	ei->refs--;
 	if (ei->refs == 0) {
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 5290423343f..0f77babcd30 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -14,7 +14,7 @@ int mkfs(int fd)
 {
 	struct ctree_root_info info[2];
 	struct leaf empty_leaf;
-	struct item item;
+	struct btrfs_item item;
 	struct extent_item extent_item;
 	int ret;
 
@@ -45,31 +45,37 @@ int mkfs(int fd)
 	btrfs_set_header_nritems(&empty_leaf.header, 3);
 
 	/* item1, reserve blocks 0-16 */
-	item.key.objectid = cpu_to_le64(0);
-	item.key.offset = cpu_to_le64(17);
-	item.key.flags = cpu_to_le32(0);
-	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item);
-	item.size = sizeof(struct extent_item);
+	btrfs_set_key_objectid(&item.key, 0);
+	btrfs_set_key_offset(&item.key, 17);
+	btrfs_set_key_flags(&item.key, 0);
+	btrfs_set_item_offset(&item,
+			      LEAF_DATA_SIZE - sizeof(struct extent_item));
+	btrfs_set_item_size(&item, sizeof(struct extent_item));
 	extent_item.refs = 1;
 	extent_item.owner = 0;
 	memcpy(empty_leaf.items, &item, sizeof(item));
-	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
 
 	/* item2, give block 17 to the root */
-	item.key.objectid = cpu_to_le64(17);
-	item.key.offset = cpu_to_le64(1);
-	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 2;
+	btrfs_set_key_objectid(&item.key, 17);
+	btrfs_set_key_offset(&item.key, 1);
+	btrfs_set_item_offset(&item,
+			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 2);
 	extent_item.owner = 1;
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
-	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
 
 	/* item3, give block 18 for the extent root */
-	item.key.objectid = cpu_to_le64(18);
-	item.key.offset = cpu_to_le64(1);
-	item.offset = LEAF_DATA_SIZE - sizeof(struct extent_item) * 3;
+	btrfs_set_key_objectid(&item.key, 18);
+	btrfs_set_key_offset(&item.key, 1);
+	btrfs_set_item_offset(&item,
+			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 3);
 	extent_item.owner = 2;
 	memcpy(empty_leaf.items + 2, &item, sizeof(item));
-	memcpy(empty_leaf.data + item.offset, &extent_item, item.size);
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
 	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * CTREE_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7df16b1e473..33f5ee4052c 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -9,7 +9,7 @@ void print_leaf(struct leaf *l)
 {
 	int i;
 	u32 nr = btrfs_header_nritems(&l->header);
-	struct item *item;
+	struct btrfs_item *item;
 	struct extent_item *ei;
 	printf("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr, leaf_free_space(l));
@@ -18,12 +18,15 @@ void print_leaf(struct leaf *l)
 		item = l->items + i;
 		printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
 			i,
-			item->key.objectid, item->key.flags, item->key.offset,
-			item->offset, item->size);
+			btrfs_key_objectid(&item->key),
+			btrfs_key_flags(&item->key),
+			btrfs_key_offset(&item->key),
+			btrfs_item_offset(item),
+			btrfs_item_size(item));
 		fflush(stdout);
-		printf("\t\titem data %.*s\n", item->size,
-			l->data+item->offset);
-		ei = (struct extent_item *)(l->data + item->offset);
+		printf("\t\titem data %.*s\n", btrfs_item_size(item),
+			l->data + btrfs_item_offset(item));
+		ei = (struct extent_item *)(l->data + btrfs_item_offset(item));
 		printf("\t\textent data refs %u owner %Lu\n", ei->refs,
 			ei->owner);
 		fflush(stdout);
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index 34a15841ebd..e767528bc52 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -173,7 +173,7 @@ static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
 			path.slots[0] -= 1;
 		}
 		slot = path.slots[0];
-		found = path.nodes[0]->leaf.items[slot].key.objectid;
+		found=btrfs_key_objectid(&path.nodes[0]->leaf.items[slot].key);
 		ret = del_item(root, &path);
 		count++;
 		if (ret) {
@@ -274,7 +274,8 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 			slot -= 1;
 		}
 		for (i = slot; i >= 0; i--) {
-			found = path.nodes[0]->leaf.items[i].key.objectid;
+			found = btrfs_key_objectid(&path.nodes[0]->
+						   leaf.items[i].key);
 			radix_tree_preload(GFP_KERNEL);
 			ret = radix_tree_insert(radix, found, (void *)found);
 			if (ret) {
-- 
cgit v1.2.3


From 1d4f8a0c1eca5586134b56a4114a7cd5e85e3560 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 09:28:32 -0400
Subject: Btrfs: node->blockptrs endian fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 31 ++++++++++++++++++-------------
 fs/btrfs/ctree.h       | 12 +++++++++++-
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/print-tree.c  |  4 ++--
 4 files changed, 33 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 10555647005..76281073100 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -58,7 +58,8 @@ int btrfs_cow_block(struct ctree_root *root,
 			free_extent(root, buf->blocknr, 1);
 		tree_block_release(root, buf);
 	} else {
-		parent->node.blockptrs[parent_slot] = cow->blocknr;
+		btrfs_set_node_blockptr(&parent->node, parent_slot,
+					cow->blocknr);
 		BUG_ON(list_empty(&parent->dirty));
 		free_extent(root, buf->blocknr, 1);
 	}
@@ -133,7 +134,7 @@ int check_node(struct ctree_path *path, int level)
 		parent_key = &parent->keys[parent_slot];
 		BUG_ON(memcmp(parent_key, node->keys,
 			      sizeof(struct btrfs_disk_key)));
-		BUG_ON(parent->blockptrs[parent_slot] !=
+		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_blocknr(&node->header));
 	}
 	BUG_ON(nritems > NODEPTRS_PER_BLOCK);
@@ -166,7 +167,7 @@ int check_leaf(struct ctree_path *path, int level)
 		parent_key = &parent->keys[parent_slot];
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct btrfs_disk_key)));
-		BUG_ON(parent->blockptrs[parent_slot] !=
+		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_blocknr(&leaf->header));
 	}
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
@@ -258,7 +259,7 @@ struct tree_buffer *read_node_slot(struct ctree_root *root,
 		return NULL;
 	if (slot >= btrfs_header_nritems(&node->header))
 		return NULL;
-	return read_tree_block(root, node->blockptrs[slot]);
+	return read_tree_block(root, btrfs_node_blockptr(node, slot));
 }
 
 static int balance_level(struct ctree_root *root, struct ctree_path *path,
@@ -283,7 +284,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 
 	mid_buf = path->nodes[level];
 	mid = &mid_buf->node;
-	orig_ptr = mid->blockptrs[orig_slot];
+	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < MAX_LEVEL - 1)
 		parent_buf = path->nodes[level + 1];
@@ -407,7 +408,8 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	}
 	/* double check we haven't messed things up */
 	check_block(path, level);
-	if (orig_ptr != path->nodes[level]->node.blockptrs[path->slots[level]])
+	if (orig_ptr != btrfs_node_blockptr(&path->nodes[level]->node,
+					    path->slots[level]))
 		BUG();
 
 	if (right_buf)
@@ -482,7 +484,7 @@ again:
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(&c->header) == 1);
 			}
-			b = read_tree_block(root, c->blockptrs[slot]);
+			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 		} else {
 			struct leaf *l = (struct leaf *)c;
 			p->slots[level] = slot;
@@ -660,7 +662,7 @@ static int insert_new_root(struct ctree_root *root,
 	else
 		lower_key = lower->keys;
 	memcpy(c->keys, lower_key, sizeof(struct btrfs_disk_key));
-	c->blockptrs[0] = path->nodes[level-1]->blocknr;
+	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->blocknr);
 	/* the super has an extra ref to root->node */
 	tree_block_release(root, root->node);
 	root->node = t;
@@ -700,7 +702,7 @@ static int insert_ptr(struct ctree_root *root,
 			(nritems - slot) * sizeof(u64));
 	}
 	memcpy(lower->keys + slot, key, sizeof(struct btrfs_disk_key));
-	lower->blockptrs[slot] = blocknr;
+	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
 	if (lower->keys[1].objectid == 0)
 			BUG();
@@ -820,7 +822,8 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	if (slot >= btrfs_header_nritems(&upper->node.header) - 1) {
 		return 1;
 	}
-	right_buf = read_tree_block(root, upper->node.blockptrs[slot + 1]);
+	right_buf = read_tree_block(root, btrfs_node_blockptr(&upper->node,
+							      slot + 1));
 	right = &right_buf->leaf;
 	free_space = leaf_free_space(right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -926,7 +929,8 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	if (!path->nodes[1]) {
 		return 1;
 	}
-	t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
+	t = read_tree_block(root, btrfs_node_blockptr(&path->nodes[1]->node,
+						      slot - 1));
 	left = &t->leaf;
 	free_space = leaf_free_space(left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -1353,7 +1357,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 			level++;
 			continue;
 		}
-		blocknr = c->node.blockptrs[slot];
+		blocknr = btrfs_node_blockptr(&c->node, slot);
 		if (next)
 			tree_block_release(root, next);
 		next = read_tree_block(root, blocknr);
@@ -1368,7 +1372,8 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 		path->slots[level] = 0;
 		if (!level)
 			break;
-		next = read_tree_block(root, next->node.blockptrs[0]);
+		next = read_tree_block(root,
+				       btrfs_node_blockptr(&next->node, 0));
 	}
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b03df154dcd..a8454c401ce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -128,7 +128,7 @@ struct leaf {
 struct node {
 	struct btrfs_header header;
 	struct btrfs_disk_key keys[NODEPTRS_PER_BLOCK];
-	u64 blockptrs[NODEPTRS_PER_BLOCK];
+	__le64 blockptrs[NODEPTRS_PER_BLOCK];
 } __attribute__ ((__packed__));
 
 /*
@@ -153,6 +153,16 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
+static inline u64 btrfs_node_blockptr(struct node *n, int nr)
+{
+	return le64_to_cpu(n->blockptrs[nr]);
+}
+
+static inline void btrfs_set_node_blockptr(struct node *n, int nr, u64 val)
+{
+	n->blockptrs[nr] = cpu_to_le64(val);
+}
+
 static inline u16 btrfs_item_offset(struct btrfs_item *item)
 {
 	return le16_to_cpu(item->offset);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fdf95bd07f9..e511f48eb48 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -85,7 +85,7 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
 		return 0;
 
 	for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) {
-		blocknr = buf->node.blockptrs[i];
+		blocknr = btrfs_node_blockptr(&buf->node, i);
 		inc_block_ref(root, blocknr);
 	}
 	return 0;
@@ -437,7 +437,7 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(&cur->node.header))
 			break;
-		blocknr = cur->node.blockptrs[path->slots[*level]];
+		blocknr = btrfs_node_blockptr(&cur->node, path->slots[*level]);
 		ret = lookup_block_ref(root, blocknr, &refs);
 		if (refs != 1 || *level == 1) {
 			path->slots[*level]++;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 33f5ee4052c..101278e1139 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,12 +54,12 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
 		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
-		       c->blockptrs[i]);
+		       btrfs_node_blockptr(c, i));
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
 		struct tree_buffer *next_buf = read_tree_block(root,
-							    c->blockptrs[i]);
+						btrfs_node_blockptr(c, i));
 		struct node *next = &next_buf->node;
 		if (btrfs_is_leaf(next) &&
 		    btrfs_header_level(&c->header) != 1)
-- 
cgit v1.2.3


From cf27e1eec063fa68a89c57ae0a83f93aa38851d6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 09:49:06 -0400
Subject: Btrfs: struct extent_item endian

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 24 ++++++++++++++++++++++--
 fs/btrfs/extent-tree.c | 22 +++++++++++++---------
 fs/btrfs/mkfs.c        |  8 ++++----
 3 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a8454c401ce..90cc2c9bd79 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -136,8 +136,8 @@ struct node {
  * owner of the block and the number of references
  */
 struct extent_item {
-	u32 refs;
-	u64 owner;
+	__le32 refs;
+	__le64 owner;
 } __attribute__ ((__packed__));
 
 /*
@@ -153,6 +153,26 @@ struct ctree_path {
 	int slots[MAX_LEVEL];
 };
 
+static inline u64 btrfs_extent_owner(struct extent_item *ei)
+{
+	return le64_to_cpu(ei->owner);
+}
+
+static inline void btrfs_set_extent_owner(struct extent_item *ei, u64 val)
+{
+	ei->owner = cpu_to_le64(val);
+}
+
+static inline u32 btrfs_extent_refs(struct extent_item *ei)
+{
+	return le32_to_cpu(ei->refs);
+}
+
+static inline void btrfs_set_extent_refs(struct extent_item *ei, u32 val)
+{
+	ei->refs = cpu_to_le32(val);
+}
+
 static inline u64 btrfs_node_blockptr(struct node *n, int nr)
 {
 	return le64_to_cpu(n->blockptrs[nr]);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e511f48eb48..1676a6595cc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,6 +29,7 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 	struct leaf *l;
 	struct extent_item *item;
 	struct btrfs_key ins;
+	u32 refs;
 
 	find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins);
 	init_path(&path);
@@ -42,7 +43,8 @@ static int inc_block_ref(struct ctree_root *root, u64 blocknr)
 	l = &path.nodes[0]->leaf;
 	item = (struct extent_item *)(l->data + btrfs_item_offset(l->items +
 								path.slots[0]));
-	item->refs++;
+	refs = btrfs_extent_refs(item);
+	btrfs_set_extent_refs(item, refs + 1);
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
 	release_path(root->extent_root, &path);
@@ -69,7 +71,7 @@ static int lookup_block_ref(struct ctree_root *root, u64 blocknr, u32 *refs)
 	item = (struct extent_item *)(l->data +
 				      btrfs_item_offset(l->items +
 							path.slots[0]));
-	*refs = item->refs;
+	*refs = btrfs_extent_refs(item);
 	release_path(root->extent_root, &path);
 	return 0;
 }
@@ -120,9 +122,9 @@ static int finish_current_insert(struct ctree_root *extent_root)
 	int i;
 	int ret;
 
-	extent_item.refs = 1;
-	extent_item.owner =
-		btrfs_header_parentid(&extent_root->node->node.header);
+	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_extent_owner(&extent_item,
+		btrfs_header_parentid(&extent_root->node->node.header));
 	ins.offset = 1;
 	ins.flags = 0;
 
@@ -148,6 +150,7 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	struct btrfs_item *item;
 	struct extent_item *ei;
 	struct btrfs_key ins;
+	u32 refs;
 
 	key.objectid = blocknr;
 	key.flags = 0;
@@ -166,8 +169,9 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	ei = (struct extent_item *)(path.nodes[0]->leaf.data +
 				    btrfs_item_offset(item));
 	BUG_ON(ei->refs == 0);
-	ei->refs--;
-	if (ei->refs == 0) {
+	refs = btrfs_extent_refs(ei) - 1;
+	btrfs_set_extent_refs(ei, refs);
+	if (refs == 0) {
 		if (root == extent_root) {
 			int err;
 			radix_tree_preload(GFP_KERNEL);
@@ -368,8 +372,8 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 	struct ctree_root *extent_root = root->extent_root;
 	struct extent_item extent_item;
 
-	extent_item.refs = 1;
-	extent_item.owner = owner;
+	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_extent_owner(&extent_item, owner);
 
 	if (root == extent_root) {
 		BUG_ON(extent_root->current_insert.offset == 0);
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 0f77babcd30..e0d3bc1d025 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -51,8 +51,8 @@ int mkfs(int fd)
 	btrfs_set_item_offset(&item,
 			      LEAF_DATA_SIZE - sizeof(struct extent_item));
 	btrfs_set_item_size(&item, sizeof(struct extent_item));
-	extent_item.refs = 1;
-	extent_item.owner = 0;
+	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_extent_owner(&extent_item, 0);
 	memcpy(empty_leaf.items, &item, sizeof(item));
 	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
 		btrfs_item_size(&item));
@@ -62,7 +62,7 @@ int mkfs(int fd)
 	btrfs_set_key_offset(&item.key, 1);
 	btrfs_set_item_offset(&item,
 			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 2);
-	extent_item.owner = 1;
+	btrfs_set_extent_owner(&extent_item, 1);
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
 	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
 		btrfs_item_size(&item));
@@ -72,7 +72,7 @@ int mkfs(int fd)
 	btrfs_set_key_offset(&item.key, 1);
 	btrfs_set_item_offset(&item,
 			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 3);
-	extent_item.owner = 2;
+	btrfs_set_extent_owner(&extent_item, 2);
 	memcpy(empty_leaf.items + 2, &item, sizeof(item));
 	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
 		btrfs_item_size(&item));
-- 
cgit v1.2.3


From 234b63a091e1df6bd4261dd7b3a7490074830628 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 10:46:10 -0400
Subject: rename funcs and structs to btrfs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 317 +++++++++++++++++++++++++------------------------
 fs/btrfs/ctree.h       |  95 +++++++--------
 fs/btrfs/debug-tree.c  |   8 +-
 fs/btrfs/disk-io.c     | 106 +++++++++--------
 fs/btrfs/disk-io.h     |  29 ++---
 fs/btrfs/extent-tree.c | 147 +++++++++++------------
 fs/btrfs/mkfs.c        |  26 ++--
 fs/btrfs/print-tree.c  |  23 ++--
 fs/btrfs/print-tree.h  |   4 +-
 fs/btrfs/quick-test.c  |  54 ++++-----
 fs/btrfs/random-test.c |  92 +++++++-------
 11 files changed, 454 insertions(+), 447 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 76281073100..c8529b394d5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -6,47 +6,47 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-static int split_node(struct ctree_root *root, struct ctree_path *path,
+static int split_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level);
-static int split_leaf(struct ctree_root *root, struct ctree_path *path,
+static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int data_size);
-static int push_node_left(struct ctree_root *root, struct tree_buffer *dst,
-			  struct tree_buffer *src);
-static int balance_node_right(struct ctree_root *root,
-			      struct tree_buffer *dst_buf,
-			      struct tree_buffer *src_buf);
-static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
+static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst,
+			  struct btrfs_buffer *src);
+static int balance_node_right(struct btrfs_root *root,
+			      struct btrfs_buffer *dst_buf,
+			      struct btrfs_buffer *src_buf);
+static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
 		   int slot);
 
-inline void init_path(struct ctree_path *p)
+inline void btrfs_init_path(struct btrfs_path *p)
 {
 	memset(p, 0, sizeof(*p));
 }
 
-void release_path(struct ctree_root *root, struct ctree_path *p)
+void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
-	for (i = 0; i < MAX_LEVEL; i++) {
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
 			break;
-		tree_block_release(root, p->nodes[i]);
+		btrfs_block_release(root, p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
 }
 
-int btrfs_cow_block(struct ctree_root *root,
-		    struct tree_buffer *buf,
-		    struct tree_buffer *parent,
+int btrfs_cow_block(struct btrfs_root *root,
+		    struct btrfs_buffer *buf,
+		    struct btrfs_buffer *parent,
 		    int parent_slot,
-		    struct tree_buffer **cow_ret)
+		    struct btrfs_buffer **cow_ret)
 {
-	struct tree_buffer *cow;
+	struct btrfs_buffer *cow;
 
 	if (!list_empty(&buf->dirty)) {
 		*cow_ret = buf;
 		return 0;
 	}
-	cow = alloc_free_block(root);
+	cow = btrfs_alloc_free_block(root);
 	memcpy(&cow->node, &buf->node, sizeof(buf->node));
 	btrfs_set_header_blocknr(&cow->node.header, cow->blocknr);
 	*cow_ret = cow;
@@ -55,15 +55,15 @@ int btrfs_cow_block(struct ctree_root *root,
 		root->node = cow;
 		cow->count++;
 		if (buf != root->commit_root)
-			free_extent(root, buf->blocknr, 1);
-		tree_block_release(root, buf);
+			btrfs_free_extent(root, buf->blocknr, 1);
+		btrfs_block_release(root, buf);
 	} else {
 		btrfs_set_node_blockptr(&parent->node, parent_slot,
 					cow->blocknr);
 		BUG_ON(list_empty(&parent->dirty));
-		free_extent(root, buf->blocknr, 1);
+		btrfs_free_extent(root, buf->blocknr, 1);
 	}
-	tree_block_release(root, buf);
+	btrfs_block_release(root, buf);
 	return 0;
 }
 
@@ -72,7 +72,7 @@ int btrfs_cow_block(struct ctree_root *root,
  * this returns the address of the start of the last item,
  * which is the stop of the leaf data stack
  */
-static inline unsigned int leaf_data_end(struct leaf *leaf)
+static inline unsigned int leaf_data_end(struct btrfs_leaf *leaf)
 {
 	u32 nr = btrfs_header_nritems(&leaf->header);
 	if (nr == 0)
@@ -85,7 +85,7 @@ static inline unsigned int leaf_data_end(struct leaf *leaf)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int leaf_free_space(struct leaf *leaf)
+int btrfs_leaf_free_space(struct btrfs_leaf *leaf)
 {
 	int data_end = leaf_data_end(leaf);
 	int nritems = btrfs_header_nritems(&leaf->header);
@@ -117,11 +117,11 @@ int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 	return 0;
 }
 
-int check_node(struct ctree_path *path, int level)
+int check_node(struct btrfs_path *path, int level)
 {
 	int i;
-	struct node *parent = NULL;
-	struct node *node = &path->nodes[level]->node;
+	struct btrfs_node *parent = NULL;
+	struct btrfs_node *node = &path->nodes[level]->node;
 	int parent_slot;
 	u32 nritems = btrfs_header_nritems(&node->header);
 
@@ -146,18 +146,18 @@ int check_node(struct ctree_path *path, int level)
 	return 0;
 }
 
-int check_leaf(struct ctree_path *path, int level)
+int check_leaf(struct btrfs_path *path, int level)
 {
 	int i;
-	struct leaf *leaf = &path->nodes[level]->leaf;
-	struct node *parent = NULL;
+	struct btrfs_leaf *leaf = &path->nodes[level]->leaf;
+	struct btrfs_node *parent = NULL;
 	int parent_slot;
 	u32 nritems = btrfs_header_nritems(&leaf->header);
 
 	if (path->nodes[level + 1])
 		parent = &path->nodes[level + 1]->node;
 	parent_slot = path->slots[level + 1];
-	BUG_ON(leaf_free_space(leaf) < 0);
+	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
 
 	if (nritems == 0)
 		return 0;
@@ -186,7 +186,7 @@ int check_leaf(struct ctree_path *path, int level)
 	return 0;
 }
 
-int check_block(struct ctree_path *path, int level)
+int check_block(struct btrfs_path *path, int level)
 {
 	if (level == 0)
 		return check_leaf(path, level);
@@ -233,10 +233,10 @@ int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
  * simple bin_search frontend that does the right thing for
  * leaves vs nodes
  */
-int bin_search(struct node *c, struct btrfs_key *key, int *slot)
+int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
 {
 	if (btrfs_is_leaf(c)) {
-		struct leaf *l = (struct leaf *)c;
+		struct btrfs_leaf *l = (struct btrfs_leaf *)c;
 		return generic_bin_search((void *)l->items,
 					  sizeof(struct btrfs_item),
 					  key, btrfs_header_nritems(&c->header),
@@ -250,11 +250,11 @@ int bin_search(struct node *c, struct btrfs_key *key, int *slot)
 	return -1;
 }
 
-struct tree_buffer *read_node_slot(struct ctree_root *root,
-				   struct tree_buffer *parent_buf,
+struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
+				   struct btrfs_buffer *parent_buf,
 				   int slot)
 {
-	struct node *node = &parent_buf->node;
+	struct btrfs_node *node = &parent_buf->node;
 	if (slot < 0)
 		return NULL;
 	if (slot >= btrfs_header_nritems(&node->header))
@@ -262,17 +262,17 @@ struct tree_buffer *read_node_slot(struct ctree_root *root,
 	return read_tree_block(root, btrfs_node_blockptr(node, slot));
 }
 
-static int balance_level(struct ctree_root *root, struct ctree_path *path,
+static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
-	struct tree_buffer *right_buf;
-	struct tree_buffer *mid_buf;
-	struct tree_buffer *left_buf;
-	struct tree_buffer *parent_buf = NULL;
-	struct node *right = NULL;
-	struct node *mid;
-	struct node *left = NULL;
-	struct node *parent = NULL;
+	struct btrfs_buffer *right_buf;
+	struct btrfs_buffer *mid_buf;
+	struct btrfs_buffer *left_buf;
+	struct btrfs_buffer *parent_buf = NULL;
+	struct btrfs_node *right = NULL;
+	struct btrfs_node *mid;
+	struct btrfs_node *left = NULL;
+	struct btrfs_node *parent = NULL;
 	int ret = 0;
 	int wret;
 	int pslot;
@@ -286,12 +286,12 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	mid = &mid_buf->node;
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1)
 		parent_buf = path->nodes[level + 1];
 	pslot = path->slots[level + 1];
 
 	if (!parent_buf) {
-		struct tree_buffer *child;
+		struct btrfs_buffer *child;
 		u64 blocknr = mid_buf->blocknr;
 
 		if (btrfs_header_nritems(&mid->header) != 1)
@@ -303,11 +303,11 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		root->node = child;
 		path->nodes[level] = NULL;
 		/* once for the path */
-		tree_block_release(root, mid_buf);
+		btrfs_block_release(root, mid_buf);
 		/* once for the root ptr */
-		tree_block_release(root, mid_buf);
+		btrfs_block_release(root, mid_buf);
 		clean_tree_block(root, mid_buf);
-		return free_extent(root, blocknr, 1);
+		return btrfs_free_extent(root, blocknr, 1);
 	}
 	parent = &parent_buf->node;
 
@@ -340,14 +340,14 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
 			u64 blocknr = right_buf->blocknr;
-			tree_block_release(root, right_buf);
+			btrfs_block_release(root, right_buf);
 			clean_tree_block(root, right_buf);
 			right_buf = NULL;
 			right = NULL;
 			wret = del_ptr(root, path, level + 1, pslot + 1);
 			if (wret)
 				ret = wret;
-			wret = free_extent(root, blocknr, 1);
+			wret = btrfs_free_extent(root, blocknr, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -375,14 +375,14 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 	if (btrfs_header_nritems(&mid->header) == 0) {
 		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->blocknr;
-		tree_block_release(root, mid_buf);
+		btrfs_block_release(root, mid_buf);
 		clean_tree_block(root, mid_buf);
 		mid_buf = NULL;
 		mid = NULL;
 		wret = del_ptr(root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = free_extent(root, blocknr, 1);
+		wret = btrfs_free_extent(root, blocknr, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -400,7 +400,7 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 			path->slots[level + 1] -= 1;
 			path->slots[level] = orig_slot;
 			if (mid_buf)
-				tree_block_release(root, mid_buf);
+				btrfs_block_release(root, mid_buf);
 		} else {
 			orig_slot -= btrfs_header_nritems(&left->header);
 			path->slots[level] = orig_slot;
@@ -413,9 +413,9 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
 		BUG();
 
 	if (right_buf)
-		tree_block_release(root, right_buf);
+		btrfs_block_release(root, right_buf);
 	if (left_buf)
-		tree_block_release(root, left_buf);
+		btrfs_block_release(root, left_buf);
 	return ret;
 }
 
@@ -432,12 +432,12 @@ static int balance_level(struct ctree_root *root, struct ctree_path *path,
  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
  * possible)
  */
-int search_slot(struct ctree_root *root, struct btrfs_key *key,
-		struct ctree_path *p, int ins_len, int cow)
+int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key,
+		struct btrfs_path *p, int ins_len, int cow)
 {
-	struct tree_buffer *b;
-	struct tree_buffer *cow_buf;
-	struct node *c;
+	struct btrfs_buffer *b;
+	struct btrfs_buffer *cow_buf;
+	struct btrfs_node *c;
 	int slot;
 	int ret;
 	int level;
@@ -486,9 +486,9 @@ again:
 			}
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 		} else {
-			struct leaf *l = (struct leaf *)c;
+			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
 			p->slots[level] = slot;
-			if (ins_len > 0 && leaf_free_space(l) <
+			if (ins_len > 0 && btrfs_leaf_free_space(l) <
 			    sizeof(struct btrfs_item) + ins_len) {
 				int sret = split_leaf(root, p, ins_len);
 				BUG_ON(sret > 0);
@@ -513,14 +513,14 @@ again:
  * If this fails to write a tree block, it returns -1, but continues
  * fixing up the blocks in ram so the tree is consistent.
  */
-static int fixup_low_keys(struct ctree_root *root,
-			   struct ctree_path *path, struct btrfs_disk_key *key,
+static int fixup_low_keys(struct btrfs_root *root,
+			   struct btrfs_path *path, struct btrfs_disk_key *key,
 			   int level)
 {
 	int i;
 	int ret = 0;
-	for (i = level; i < MAX_LEVEL; i++) {
-		struct node *t;
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		struct btrfs_node *t;
 		int tslot = path->slots[i];
 		if (!path->nodes[i])
 			break;
@@ -540,11 +540,11 @@ static int fixup_low_keys(struct ctree_root *root,
  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  * error, and > 0 if there was no room in the left hand block.
  */
-static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
-			  struct tree_buffer *src_buf)
+static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf,
+			  struct btrfs_buffer *src_buf)
 {
-	struct node *src = &src_buf->node;
-	struct node *dst = &dst_buf->node;
+	struct btrfs_node *src = &src_buf->node;
+	struct btrfs_node *dst = &dst_buf->node;
 	int push_items = 0;
 	int src_nritems;
 	int dst_nritems;
@@ -587,12 +587,12 @@ static int push_node_left(struct ctree_root *root, struct tree_buffer *dst_buf,
  *
  * this will  only push up to 1/2 the contents of the left node over
  */
-static int balance_node_right(struct ctree_root *root,
-			      struct tree_buffer *dst_buf,
-			      struct tree_buffer *src_buf)
+static int balance_node_right(struct btrfs_root *root,
+			      struct btrfs_buffer *dst_buf,
+			      struct btrfs_buffer *src_buf)
 {
-	struct node *src = &src_buf->node;
-	struct node *dst = &dst_buf->node;
+	struct btrfs_node *src = &src_buf->node;
+	struct btrfs_node *dst = &dst_buf->node;
 	int push_items = 0;
 	int max_push;
 	int src_nritems;
@@ -637,18 +637,18 @@ static int balance_node_right(struct ctree_root *root,
  *
  * returns zero on success or < 0 on failure.
  */
-static int insert_new_root(struct ctree_root *root,
-			   struct ctree_path *path, int level)
+static int insert_new_root(struct btrfs_root *root,
+			   struct btrfs_path *path, int level)
 {
-	struct tree_buffer *t;
-	struct node *lower;
-	struct node *c;
+	struct btrfs_buffer *t;
+	struct btrfs_node *lower;
+	struct btrfs_node *c;
 	struct btrfs_disk_key *lower_key;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = alloc_free_block(root);
+	t = btrfs_alloc_free_block(root);
 	c = &t->node;
 	memset(c, 0, sizeof(c));
 	btrfs_set_header_nritems(&c->header, 1);
@@ -658,13 +658,13 @@ static int insert_new_root(struct ctree_root *root,
 	                       btrfs_header_parentid(&root->node->node.header));
 	lower = &path->nodes[level-1]->node;
 	if (btrfs_is_leaf(lower))
-		lower_key = &((struct leaf *)lower)->items[0].key;
+		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
 	else
 		lower_key = lower->keys;
 	memcpy(c->keys, lower_key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->blocknr);
 	/* the super has an extra ref to root->node */
-	tree_block_release(root, root->node);
+	btrfs_block_release(root, root->node);
 	root->node = t;
 	t->count++;
 	path->nodes[level] = t;
@@ -681,11 +681,11 @@ static int insert_new_root(struct ctree_root *root,
  *
  * returns zero on success and < 0 on any error
  */
-static int insert_ptr(struct ctree_root *root,
-		struct ctree_path *path, struct btrfs_disk_key *key,
+static int insert_ptr(struct btrfs_root *root,
+		struct btrfs_path *path, struct btrfs_disk_key *key,
 		u64 blocknr, int slot, int level)
 {
-	struct node *lower;
+	struct btrfs_node *lower;
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
@@ -719,13 +719,13 @@ static int insert_ptr(struct ctree_root *root,
  *
  * returns 0 on success and < 0 on failure
  */
-static int split_node(struct ctree_root *root, struct ctree_path *path,
+static int split_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
-	struct tree_buffer *t;
-	struct node *c;
-	struct tree_buffer *split_buffer;
-	struct node *split;
+	struct btrfs_buffer *t;
+	struct btrfs_node *c;
+	struct btrfs_buffer *split_buffer;
+	struct btrfs_node *split;
 	int mid;
 	int ret;
 	int wret;
@@ -740,7 +740,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 			return ret;
 	}
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = alloc_free_block(root);
+	split_buffer = btrfs_alloc_free_block(root);
 	split = &split_buffer->node;
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_blocknr(&split->header, split_buffer->blocknr);
@@ -763,11 +763,11 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
 
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
-		tree_block_release(root, t);
+		btrfs_block_release(root, t);
 		path->nodes[level] = split_buffer;
 		path->slots[level + 1] += 1;
 	} else {
-		tree_block_release(root, split_buffer);
+		btrfs_block_release(root, split_buffer);
 	}
 	return ret;
 }
@@ -777,7 +777,7 @@ static int split_node(struct ctree_root *root, struct ctree_path *path,
  * and nr indicate which items in the leaf to check.  This totals up the
  * space used both by the item structs and the item data
  */
-static int leaf_space_used(struct leaf *l, int start, int nr)
+static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
 {
 	int data_len;
 	int end = start + nr - 1;
@@ -797,14 +797,14 @@ static int leaf_space_used(struct leaf *l, int start, int nr)
  * returns 1 if the push failed because the other node didn't have enough
  * room, 0 if everything worked out and < 0 if there were major errors.
  */
-static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
+static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 			   int data_size)
 {
-	struct tree_buffer *left_buf = path->nodes[0];
-	struct leaf *left = &left_buf->leaf;
-	struct leaf *right;
-	struct tree_buffer *right_buf;
-	struct tree_buffer *upper;
+	struct btrfs_buffer *left_buf = path->nodes[0];
+	struct btrfs_leaf *left = &left_buf->leaf;
+	struct btrfs_leaf *right;
+	struct btrfs_buffer *right_buf;
+	struct btrfs_buffer *upper;
 	int slot;
 	int i;
 	int free_space;
@@ -825,17 +825,17 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	right_buf = read_tree_block(root, btrfs_node_blockptr(&upper->node,
 							      slot + 1));
 	right = &right_buf->leaf;
-	free_space = leaf_free_space(right);
+	free_space = btrfs_leaf_free_space(right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		tree_block_release(root, right_buf);
+		btrfs_block_release(root, right_buf);
 		return 1;
 	}
 	/* cow and double check */
 	btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf);
 	right = &right_buf->leaf;
-	free_space = leaf_free_space(right);
+	free_space = btrfs_leaf_free_space(right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		tree_block_release(root, right_buf);
+		btrfs_block_release(root, right_buf);
 		return 1;
 	}
 
@@ -851,7 +851,7 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 		push_space += btrfs_item_size(item) + sizeof(*item);
 	}
 	if (push_items == 0) {
-		tree_block_release(root, right_buf);
+		btrfs_block_release(root, right_buf);
 		return 1;
 	}
 	right_nritems = btrfs_header_nritems(&right->header);
@@ -893,11 +893,11 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
-		tree_block_release(root, path->nodes[0]);
+		btrfs_block_release(root, path->nodes[0]);
 		path->nodes[0] = right_buf;
 		path->slots[1] += 1;
 	} else {
-		tree_block_release(root, right_buf);
+		btrfs_block_release(root, right_buf);
 	}
 	return 0;
 }
@@ -905,13 +905,13 @@ static int push_leaf_right(struct ctree_root *root, struct ctree_path *path,
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
+static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 			  int data_size)
 {
-	struct tree_buffer *right_buf = path->nodes[0];
-	struct leaf *right = &right_buf->leaf;
-	struct tree_buffer *t;
-	struct leaf *left;
+	struct btrfs_buffer *right_buf = path->nodes[0];
+	struct btrfs_leaf *right = &right_buf->leaf;
+	struct btrfs_buffer *t;
+	struct btrfs_leaf *left;
 	int slot;
 	int i;
 	int free_space;
@@ -932,18 +932,18 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	t = read_tree_block(root, btrfs_node_blockptr(&path->nodes[1]->node,
 						      slot - 1));
 	left = &t->leaf;
-	free_space = leaf_free_space(left);
+	free_space = btrfs_leaf_free_space(left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		tree_block_release(root, t);
+		btrfs_block_release(root, t);
 		return 1;
 	}
 
 	/* cow and double check */
 	btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t);
 	left = &t->leaf;
-	free_space = leaf_free_space(left);
+	free_space = btrfs_leaf_free_space(left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		tree_block_release(root, t);
+		btrfs_block_release(root, t);
 		return 1;
 	}
 
@@ -958,7 +958,7 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		push_space += btrfs_item_size(item) + sizeof(*item);
 	}
 	if (push_items == 0) {
-		tree_block_release(root, t);
+		btrfs_block_release(root, t);
 		return 1;
 	}
 	/* push data from right to left */
@@ -1009,11 +1009,11 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		tree_block_release(root, path->nodes[0]);
+		btrfs_block_release(root, path->nodes[0]);
 		path->nodes[0] = t;
 		path->slots[1] -= 1;
 	} else {
-		tree_block_release(root, t);
+		btrfs_block_release(root, t);
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
@@ -1026,16 +1026,16 @@ static int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
  *
  * returns 0 if all went well and < 0 on failure.
  */
-static int split_leaf(struct ctree_root *root, struct ctree_path *path,
+static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int data_size)
 {
-	struct tree_buffer *l_buf;
-	struct leaf *l;
+	struct btrfs_buffer *l_buf;
+	struct btrfs_leaf *l;
 	u32 nritems;
 	int mid;
 	int slot;
-	struct leaf *right;
-	struct tree_buffer *right_buffer;
+	struct btrfs_leaf *right;
+	struct btrfs_buffer *right_buffer;
 	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
@@ -1047,7 +1047,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	l = &l_buf->leaf;
 
 	/* did the pushes work? */
-	if (leaf_free_space(l) >= sizeof(struct btrfs_item) + data_size)
+	if (btrfs_leaf_free_space(l) >= sizeof(struct btrfs_item) + data_size)
 		return 0;
 
 	if (!path->nodes[1]) {
@@ -1058,7 +1058,7 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
-	right_buffer = alloc_free_block(root);
+	right_buffer = btrfs_alloc_free_block(root);
 	BUG_ON(!right_buffer);
 	BUG_ON(mid == nritems);
 	right = &right_buffer->leaf;
@@ -1101,12 +1101,12 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
 	BUG_ON(list_empty(&l_buf->dirty));
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
-		tree_block_release(root, path->nodes[0]);
+		btrfs_block_release(root, path->nodes[0]);
 		path->nodes[0] = right_buffer;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
 	} else
-		tree_block_release(root, right_buffer);
+		btrfs_block_release(root, right_buffer);
 	BUG_ON(path->slots[0] < 0);
 	return ret;
 }
@@ -1115,17 +1115,17 @@ static int split_leaf(struct ctree_root *root, struct ctree_path *path,
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
+int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 			  void *data, int data_size)
 {
 	int ret = 0;
 	int slot;
 	int slot_orig;
-	struct leaf *leaf;
-	struct tree_buffer *leaf_buf;
+	struct btrfs_leaf *leaf;
+	struct btrfs_buffer *leaf_buf;
 	u32 nritems;
 	unsigned int data_end;
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_disk_key disk_key;
 
 	btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -1133,10 +1133,10 @@ int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
-	init_path(&path);
-	ret = search_slot(root, cpu_key, &path, data_size, 1);
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, cpu_key, &path, data_size, 1);
 	if (ret == 0) {
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 		return -EEXIST;
 	}
 	if (ret < 0)
@@ -1149,7 +1149,8 @@ int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 	nritems = btrfs_header_nritems(&leaf->header);
 	data_end = leaf_data_end(leaf);
 
-	if (leaf_free_space(leaf) <  sizeof(struct btrfs_item) + data_size)
+	if (btrfs_leaf_free_space(leaf) <
+	    sizeof(struct btrfs_item) + data_size)
 		BUG();
 
 	slot = path.slots[0];
@@ -1190,11 +1191,11 @@ int insert_item(struct ctree_root *root, struct btrfs_key *cpu_key,
 		ret = fixup_low_keys(root, &path, &disk_key, 1);
 
 	BUG_ON(list_empty(&leaf_buf->dirty));
-	if (leaf_free_space(leaf) < 0)
+	if (btrfs_leaf_free_space(leaf) < 0)
 		BUG();
 	check_leaf(&path, 0);
 out:
-	release_path(root, &path);
+	btrfs_release_path(root, &path);
 	return ret;
 }
 
@@ -1205,11 +1206,11 @@ out:
  * continuing all the way the root if required.  The root is converted into
  * a leaf if all the nodes are emptied.
  */
-static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
+static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
 		   int slot)
 {
-	struct node *node;
-	struct tree_buffer *parent = path->nodes[level];
+	struct btrfs_node *node;
+	struct btrfs_buffer *parent = path->nodes[level];
 	u32 nritems;
 	int ret = 0;
 	int wret;
@@ -1242,11 +1243,11 @@ static int del_ptr(struct ctree_root *root, struct ctree_path *path, int level,
  * delete the item at the leaf level in path.  If that empties
  * the leaf, remove it from the tree
  */
-int del_item(struct ctree_root *root, struct ctree_path *path)
+int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
-	struct leaf *leaf;
-	struct tree_buffer *leaf_buf;
+	struct btrfs_leaf *leaf;
+	struct btrfs_buffer *leaf_buf;
 	int doff;
 	int dsize;
 	int ret = 0;
@@ -1286,7 +1287,7 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 			wret = del_ptr(root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
-			wret = free_extent(root, leaf_buf->blocknr, 1);
+			wret = btrfs_free_extent(root, leaf_buf->blocknr, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -1323,12 +1324,12 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
 				wret = del_ptr(root, path, 1, slot);
 				if (wret)
 					ret = wret;
-				tree_block_release(root, leaf_buf);
-				wret = free_extent(root, blocknr, 1);
+				btrfs_block_release(root, leaf_buf);
+				wret = btrfs_free_extent(root, blocknr, 1);
 				if (wret)
 					ret = wret;
 			} else {
-				tree_block_release(root, leaf_buf);
+				btrfs_block_release(root, leaf_buf);
 			}
 		}
 	}
@@ -1340,15 +1341,15 @@ int del_item(struct ctree_root *root, struct ctree_path *path)
  * returns 0 if it found something or 1 if there are no greater leaves.
  * returns < 0 on io errors.
  */
-int next_leaf(struct ctree_root *root, struct ctree_path *path)
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
 	int level = 1;
 	u64 blocknr;
-	struct tree_buffer *c;
-	struct tree_buffer *next = NULL;
+	struct btrfs_buffer *c;
+	struct btrfs_buffer *next = NULL;
 
-	while(level < MAX_LEVEL) {
+	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 		slot = path->slots[level] + 1;
@@ -1359,7 +1360,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 		}
 		blocknr = btrfs_node_blockptr(&c->node, slot);
 		if (next)
-			tree_block_release(root, next);
+			btrfs_block_release(root, next);
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -1367,7 +1368,7 @@ int next_leaf(struct ctree_root *root, struct ctree_path *path)
 	while(1) {
 		level--;
 		c = path->nodes[level];
-		tree_block_release(root, c);
+		btrfs_block_release(root, c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!level)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 90cc2c9bd79..5dfcc90812c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,10 +1,10 @@
-#ifndef __CTREE__
-#define __CTREE__
+#ifndef __BTRFS__
+#define __BTRFS__
 
 #include "list.h"
 #include "kerncompat.h"
 
-#define CTREE_BLOCKSIZE 1024
+#define BTRFS_BLOCKSIZE 1024
 
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
@@ -46,21 +46,21 @@ struct btrfs_header {
 	/* generation flags to be added */
 } __attribute__ ((__packed__));
 
-#define MAX_LEVEL 8
-#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct btrfs_header)) / \
+#define BTRFS_MAX_LEVEL 8
+#define NODEPTRS_PER_BLOCK ((BTRFS_BLOCKSIZE - sizeof(struct btrfs_header)) / \
 			    (sizeof(struct btrfs_disk_key) + sizeof(u64)))
 
-struct tree_buffer;
+struct btrfs_buffer;
 
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.  current_insert is used
  * only for the extent tree.
  */
-struct ctree_root {
-	struct tree_buffer *node;
-	struct tree_buffer *commit_root;
-	struct ctree_root *extent_root;
+struct btrfs_root {
+	struct btrfs_buffer *node;
+	struct btrfs_buffer *commit_root;
+	struct btrfs_root *extent_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	int fp;
@@ -74,7 +74,7 @@ struct ctree_root {
 /*
  * describes a tree on disk
  */
-struct ctree_root_info {
+struct btrfs_root_info {
 	u64 fsid[2]; /* FS specific uuid */
 	u64 blocknr; /* blocknr of this block */
 	u64 objectid; /* inode number of this root */
@@ -88,9 +88,9 @@ struct ctree_root_info {
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
-struct ctree_super_block {
-	struct ctree_root_info root_info;
-	struct ctree_root_info extent_info;
+struct btrfs_super_block {
+	struct btrfs_root_info root_info;
+	struct btrfs_root_info extent_info;
 } __attribute__ ((__packed__));
 
 /*
@@ -111,13 +111,13 @@ struct btrfs_item {
  * The data is separate from the items to get the keys closer together
  * during searches.
  */
-#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct btrfs_header))
-struct leaf {
+#define LEAF_DATA_SIZE (BTRFS_BLOCKSIZE - sizeof(struct btrfs_header))
+struct btrfs_leaf {
 	struct btrfs_header header;
 	union {
 		struct btrfs_item items[LEAF_DATA_SIZE/
 				        sizeof(struct btrfs_item)];
-		u8 data[CTREE_BLOCKSIZE-sizeof(struct btrfs_header)];
+		u8 data[BTRFS_BLOCKSIZE - sizeof(struct btrfs_header)];
 	};
 } __attribute__ ((__packed__));
 
@@ -125,7 +125,7 @@ struct leaf {
  * all non-leaf blocks are nodes, they hold only keys and pointers to
  * other blocks
  */
-struct node {
+struct btrfs_node {
 	struct btrfs_header header;
 	struct btrfs_disk_key keys[NODEPTRS_PER_BLOCK];
 	__le64 blockptrs[NODEPTRS_PER_BLOCK];
@@ -135,50 +135,51 @@ struct node {
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
-struct extent_item {
+struct btrfs_extent_item {
 	__le32 refs;
 	__le64 owner;
 } __attribute__ ((__packed__));
 
 /*
- * ctree_paths remember the path taken from the root down to the leaf.
- * level 0 is always the leaf, and nodes[1...MAX_LEVEL] will point
+ * btrfs_paths remember the path taken from the root down to the leaf.
+ * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL - 1] will point
  * to any other levels that are present.
  *
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
-struct ctree_path {
-	struct tree_buffer *nodes[MAX_LEVEL];
-	int slots[MAX_LEVEL];
+struct btrfs_path {
+	struct btrfs_buffer *nodes[BTRFS_MAX_LEVEL];
+	int slots[BTRFS_MAX_LEVEL];
 };
 
-static inline u64 btrfs_extent_owner(struct extent_item *ei)
+static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
 	return le64_to_cpu(ei->owner);
 }
 
-static inline void btrfs_set_extent_owner(struct extent_item *ei, u64 val)
+static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val)
 {
 	ei->owner = cpu_to_le64(val);
 }
 
-static inline u32 btrfs_extent_refs(struct extent_item *ei)
+static inline u32 btrfs_extent_refs(struct btrfs_extent_item *ei)
 {
 	return le32_to_cpu(ei->refs);
 }
 
-static inline void btrfs_set_extent_refs(struct extent_item *ei, u32 val)
+static inline void btrfs_set_extent_refs(struct btrfs_extent_item *ei, u32 val)
 {
 	ei->refs = cpu_to_le32(val);
 }
 
-static inline u64 btrfs_node_blockptr(struct node *n, int nr)
+static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr)
 {
 	return le64_to_cpu(n->blockptrs[nr]);
 }
 
-static inline void btrfs_set_node_blockptr(struct node *n, int nr, u64 val)
+static inline void btrfs_set_node_blockptr(struct btrfs_node *n, int nr,
+					   u64 val)
 {
 	n->blockptrs[nr] = cpu_to_le64(val);
 }
@@ -300,34 +301,34 @@ static inline void btrfs_set_header_flags(struct btrfs_header *h, u16 val)
 
 static inline int btrfs_header_level(struct btrfs_header *h)
 {
-	return btrfs_header_flags(h) & (MAX_LEVEL - 1);
+	return btrfs_header_flags(h) & (BTRFS_MAX_LEVEL - 1);
 }
 
 static inline void btrfs_set_header_level(struct btrfs_header *h, int level)
 {
 	u16 flags;
-	BUG_ON(level > MAX_LEVEL);
-	flags = btrfs_header_flags(h) & ~(MAX_LEVEL - 1);
+	BUG_ON(level < 0 || level >= BTRFS_MAX_LEVEL); /* must fit the mask */
+	flags = btrfs_header_flags(h) & ~(BTRFS_MAX_LEVEL - 1);
 	btrfs_set_header_flags(h, flags | level);
 }
 
-static inline int btrfs_is_leaf(struct node *n)
+static inline int btrfs_is_leaf(struct btrfs_node *n)
 {
 	return (btrfs_header_level(&n->header) == 0);
 }
 
-struct tree_buffer *alloc_free_block(struct ctree_root *root);
-int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf);
-int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks);
-int search_slot(struct ctree_root *root, struct btrfs_key *key,
-		struct ctree_path *p, int ins_len, int cow);
-void release_path(struct ctree_root *root, struct ctree_path *p);
-void init_path(struct ctree_path *p);
-int del_item(struct ctree_root *root, struct ctree_path *path);
-int insert_item(struct ctree_root *root, struct btrfs_key *key,
+struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
+int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
+int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks);
+int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key,
+		struct btrfs_path *p, int ins_len, int cow);
+void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+void btrfs_init_path(struct btrfs_path *p);
+int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key,
 		void *data, int data_size);
-int next_leaf(struct ctree_root *root, struct ctree_path *path);
-int leaf_free_space(struct leaf *leaf);
-int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap);
-int btrfs_finish_extent_commit(struct ctree_root *root);
+int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_leaf_free_space(struct btrfs_leaf *leaf);
+int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap);
+int btrfs_finish_extent_commit(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index 21f607d8b05..6da0a7aa0f4 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -7,13 +7,13 @@
 #include "print-tree.h"
 
 int main(int ac, char **av) {
-	struct ctree_super_block super;
-	struct ctree_root *root;
+	struct btrfs_super_block super;
+	struct btrfs_root *root;
 	radix_tree_init();
 	root = open_ctree("dbfile", &super);
 	printf("root tree\n");
-	print_tree(root, root->node);
+	btrfs_print_tree(root, root->node);
 	printf("map tree\n");
-	print_tree(root->extent_root, root->extent_root->node);
+	btrfs_print_tree(root->extent_root, root->extent_root->node);
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 065e888d2c0..c34c0c60935 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -13,7 +13,7 @@
 static int allocated_blocks = 0;
 int cache_max = 10000;
 
-static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+static int check_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
@@ -23,18 +23,18 @@ static int check_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
-static int free_some_buffers(struct ctree_root *root)
+static int free_some_buffers(struct btrfs_root *root)
 {
 	struct list_head *node, *next;
-	struct tree_buffer *b;
+	struct btrfs_buffer *b;
 	if (root->cache_size < cache_max)
 		return 0;
 	list_for_each_safe(node, next, &root->cache) {
-		b = list_entry(node, struct tree_buffer, cache);
+		b = list_entry(node, struct btrfs_buffer, cache);
 		if (b->count == 1) {
 			BUG_ON(!list_empty(&b->dirty));
 			list_del_init(&b->cache);
-			tree_block_release(root, b);
+			btrfs_block_release(root, b);
 			if (root->cache_size < cache_max)
 				break;
 		}
@@ -42,11 +42,11 @@ static int free_some_buffers(struct ctree_root *root)
 	return 0;
 }
 
-struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
+struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct tree_buffer *buf;
+	struct btrfs_buffer *buf;
 	int ret;
-	buf = malloc(sizeof(struct tree_buffer));
+	buf = malloc(sizeof(struct btrfs_buffer));
 	if (!buf)
 		return buf;
 	allocated_blocks++;
@@ -66,9 +66,9 @@ struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
-struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr)
+struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct tree_buffer *buf;
+	struct btrfs_buffer *buf;
 	buf = radix_tree_lookup(&root->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
@@ -82,10 +82,10 @@ struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
-struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
+struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	loff_t offset = blocknr * CTREE_BLOCKSIZE;
-	struct tree_buffer *buf;
+	loff_t offset = blocknr * BTRFS_BLOCKSIZE;
+	struct btrfs_buffer *buf;
 	int ret;
 
 	buf = radix_tree_lookup(&root->cache_radix, blocknr);
@@ -95,8 +95,8 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 		buf = alloc_tree_block(root, blocknr);
 		if (!buf)
 			return NULL;
-		ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
-		if (ret != CTREE_BLOCKSIZE) {
+		ret = pread(root->fp, &buf->node, BTRFS_BLOCKSIZE, offset);
+		if (ret != BTRFS_BLOCKSIZE) {
 			free(buf);
 			return NULL;
 		}
@@ -106,7 +106,7 @@ struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
 	return buf;
 }
 
-int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	if (!list_empty(&buf->dirty))
 		return 0;
@@ -115,46 +115,47 @@ int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
-int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	if (!list_empty(&buf->dirty)) {
 		list_del_init(&buf->dirty);
-		tree_block_release(root, buf);
+		btrfs_block_release(root, buf);
 	}
 	return 0;
 }
 
-int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	u64 blocknr = buf->blocknr;
-	loff_t offset = blocknr * CTREE_BLOCKSIZE;
+	loff_t offset = blocknr * BTRFS_BLOCKSIZE;
 	int ret;
 
 	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
-	ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
-	if (ret != CTREE_BLOCKSIZE)
+	ret = pwrite(root->fp, &buf->node, BTRFS_BLOCKSIZE, offset);
+	if (ret != BTRFS_BLOCKSIZE)
 		return ret;
 	return 0;
 }
 
-static int __commit_transaction(struct ctree_root *root)
+static int __commit_transaction(struct btrfs_root *root)
 {
-	struct tree_buffer *b;
+	struct btrfs_buffer *b;
 	int ret = 0;
 	int wret;
 	while(!list_empty(&root->trans)) {
-		b = list_entry(root->trans.next, struct tree_buffer, dirty);
+		b = list_entry(root->trans.next, struct btrfs_buffer, dirty);
 		list_del_init(&b->dirty);
 		wret = write_tree_block(root, b);
 		if (wret)
 			ret = wret;
-		tree_block_release(root, b);
+		btrfs_block_release(root, b);
 	}
 	return ret;
 }
 
-int commit_transaction(struct ctree_root *root, struct ctree_super_block *s)
+int btrfs_commit_transaction(struct btrfs_root *root,
+			     struct btrfs_super_block *s)
 {
 	int ret = 0;
 
@@ -163,20 +164,20 @@ int commit_transaction(struct ctree_root *root, struct ctree_super_block *s)
 		ret = __commit_transaction(root->extent_root);
 	BUG_ON(ret);
 	if (root->commit_root != root->node) {
-		struct tree_buffer *snap = root->commit_root;
+		struct btrfs_buffer *snap = root->commit_root;
 		root->commit_root = root->node;
 		root->node->count++;
 		ret = btrfs_drop_snapshot(root, snap);
 		BUG_ON(ret);
-		// tree_block_release(root, snap);
+		// btrfs_block_release(root, snap);
 	}
         write_ctree_super(root, s);
 	btrfs_finish_extent_commit(root);
 	return ret;
 }
 
-static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
-			struct ctree_root_info *info, int fp)
+static int __setup_root(struct btrfs_root *root, struct btrfs_root *extent_root,
+			struct btrfs_root_info *info, int fp)
 {
 	INIT_LIST_HEAD(&root->trans);
 	INIT_LIST_HEAD(&root->cache);
@@ -191,10 +192,10 @@ static int __setup_root(struct ctree_root *root, struct ctree_root *extent_root,
 	return 0;
 }
 
-struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
+struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 {
-	struct ctree_root *root = malloc(sizeof(struct ctree_root));
-	struct ctree_root *extent_root = malloc(sizeof(struct ctree_root));
+	struct btrfs_root *root = malloc(sizeof(struct btrfs_root));
+	struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
 	int fp;
 	int ret;
 
@@ -207,16 +208,16 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 	INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
-	ret = pread(fp, super, sizeof(struct ctree_super_block),
-		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+	ret = pread(fp, super, sizeof(struct btrfs_super_block),
+		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
 	if (ret == 0 || super->root_info.tree_root == 0) {
 		printf("making new FS!\n");
 		ret = mkfs(fp);
 		if (ret)
 			return NULL;
-		ret = pread(fp, super, sizeof(struct ctree_super_block),
-			     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
-		if (ret != sizeof(struct ctree_super_block))
+		ret = pread(fp, super, sizeof(struct btrfs_super_block),
+			     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
+		if (ret != sizeof(struct btrfs_super_block))
 			return NULL;
 	}
 	BUG_ON(ret < 0);
@@ -227,18 +228,19 @@ struct ctree_root *open_ctree(char *filename, struct ctree_super_block *super)
 	return root;
 }
 
-static int __update_root(struct ctree_root *root, struct ctree_root_info *info)
+static int __update_root(struct btrfs_root *root, struct btrfs_root_info *info)
 {
 	info->tree_root = root->node->blocknr;
 	return 0;
 }
 
-int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s)
+int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s)
 {
 	int ret;
 	__update_root(root, &s->root_info);
 	__update_root(root->extent_root, &s->extent_info);
-	ret = pwrite(root->fp, s, sizeof(*s), CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+	ret = pwrite(root->fp, s, sizeof(*s),
+		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
 	if (ret != sizeof(*s)) {
 		fprintf(stderr, "failed to write new super block err %d\n", ret);
 		return ret;
@@ -246,19 +248,19 @@ int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s)
 	return 0;
 }
 
-static int drop_cache(struct ctree_root *root)
+static int drop_cache(struct btrfs_root *root)
 {
 	while(!list_empty(&root->cache)) {
-		struct tree_buffer *b = list_entry(root->cache.next,
-						   struct tree_buffer, cache);
+		struct btrfs_buffer *b = list_entry(root->cache.next,
+						   struct btrfs_buffer, cache);
 		list_del_init(&b->cache);
-		tree_block_release(root, b);
+		btrfs_block_release(root, b);
 	}
 	return 0;
 }
-int close_ctree(struct ctree_root *root, struct ctree_super_block *s)
+int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
 {
-	commit_transaction(root, s);
+	btrfs_commit_transaction(root, s);
 	__commit_transaction(root->extent_root);
 	write_ctree_super(root, s);
 	drop_cache(root->extent_root);
@@ -268,16 +270,16 @@ int close_ctree(struct ctree_root *root, struct ctree_super_block *s)
 
 	close(root->fp);
 	if (root->node)
-		tree_block_release(root, root->node);
+		btrfs_block_release(root, root->node);
 	if (root->extent_root->node)
-		tree_block_release(root->extent_root, root->extent_root->node);
-	tree_block_release(root, root->commit_root);
+		btrfs_block_release(root->extent_root, root->extent_root->node);
+	btrfs_block_release(root, root->commit_root);
 	free(root);
 	printf("on close %d blocks are allocated\n", allocated_blocks);
 	return 0;
 }
 
-void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
+void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	buf->count--;
 	if (buf->count < 0)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 1c0af7c56c2..b391335864b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -2,29 +2,30 @@
 #define __DISKIO__
 #include "list.h"
 
-struct tree_buffer {
+struct btrfs_buffer {
 	u64 blocknr;
 	int count;
 	union {
-		struct node node;
-		struct leaf leaf;
+		struct btrfs_node node;
+		struct btrfs_leaf leaf;
 	};
 	struct list_head dirty;
 	struct list_head cache;
 };
 
-struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
-struct tree_buffer *find_tree_block(struct ctree_root *root, u64 blocknr);
-int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
-int dirty_tree_block(struct ctree_root *root, struct tree_buffer *buf);
-int clean_tree_block(struct ctree_root *root, struct tree_buffer *buf);
-int commit_transaction(struct ctree_root *root, struct ctree_super_block *s);
-struct ctree_root *open_ctree(char *filename, struct ctree_super_block *s);
-int close_ctree(struct ctree_root *root, struct ctree_super_block *s);
-void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
-int write_ctree_super(struct ctree_root *root, struct ctree_super_block *s);
+struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
+struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr);
+int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
+int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
+int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
+int btrfs_commit_transaction(struct btrfs_root *root,
+			     struct btrfs_super_block *s);
+struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
+int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
+void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
+int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s);
 int mkfs(int fd);
 
-#define CTREE_SUPER_INFO_OFFSET(bs) (16 * (bs))
+#define BTRFS_SUPER_INFO_OFFSET(bs) (16 * (bs))
 
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1676a6595cc..4a4f2d810d9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6,11 +6,11 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks,
 			    u64 search_start, u64 search_end,
 			    struct btrfs_key *ins);
-static int finish_current_insert(struct ctree_root *extent_root);
-static int run_pending(struct ctree_root *extent_root);
+static int finish_current_insert(struct btrfs_root *extent_root);
+static int run_pending(struct btrfs_root *extent_root);
 
 /*
  * pending extents are blocks that we're trying to allocate in the extent
@@ -21,62 +21,63 @@ static int run_pending(struct ctree_root *extent_root);
  */
 #define CTREE_EXTENT_PENDING_DEL 0
 
-static int inc_block_ref(struct ctree_root *root, u64 blocknr)
+static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	int ret;
 	struct btrfs_key key;
-	struct leaf *l;
-	struct extent_item *item;
+	struct btrfs_leaf *l;
+	struct btrfs_extent_item *item;
 	struct btrfs_key ins;
 	u32 refs;
 
 	find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins);
-	init_path(&path);
+	btrfs_init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	key.offset = 1;
-	ret = search_slot(root->extent_root, &key, &path, 0, 1);
+	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 1);
 	if (ret != 0)
 		BUG();
 	BUG_ON(ret != 0);
 	l = &path.nodes[0]->leaf;
-	item = (struct extent_item *)(l->data + btrfs_item_offset(l->items +
-								path.slots[0]));
+	item = (struct btrfs_extent_item *)(l->data +
+					    btrfs_item_offset(l->items +
+							      path.slots[0]));
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
-	release_path(root->extent_root, &path);
+	btrfs_release_path(root->extent_root, &path);
 	finish_current_insert(root->extent_root);
 	run_pending(root->extent_root);
 	return 0;
 }
 
-static int lookup_block_ref(struct ctree_root *root, u64 blocknr, u32 *refs)
+static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	int ret;
 	struct btrfs_key key;
-	struct leaf *l;
-	struct extent_item *item;
-	init_path(&path);
+	struct btrfs_leaf *l;
+	struct btrfs_extent_item *item;
+	btrfs_init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	key.offset = 1;
-	ret = search_slot(root->extent_root, &key, &path, 0, 0);
+	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 0);
 	if (ret != 0)
 		BUG();
 	l = &path.nodes[0]->leaf;
-	item = (struct extent_item *)(l->data +
+	item = (struct btrfs_extent_item *)(l->data +
 				      btrfs_item_offset(l->items +
 							path.slots[0]));
 	*refs = btrfs_extent_refs(item);
-	release_path(root->extent_root, &path);
+	btrfs_release_path(root->extent_root, &path);
 	return 0;
 }
 
-int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
+int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	u64 blocknr;
 	int i;
@@ -93,9 +94,9 @@ int btrfs_inc_ref(struct ctree_root *root, struct tree_buffer *buf)
 	return 0;
 }
 
-int btrfs_finish_extent_commit(struct ctree_root *root)
+int btrfs_finish_extent_commit(struct btrfs_root *root)
 {
-	struct ctree_root *extent_root = root->extent_root;
+	struct btrfs_root *extent_root = root->extent_root;
 	unsigned long gang[8];
 	int ret;
 	int i;
@@ -115,10 +116,10 @@ int btrfs_finish_extent_commit(struct ctree_root *root)
 	return 0;
 }
 
-static int finish_current_insert(struct ctree_root *extent_root)
+static int finish_current_insert(struct btrfs_root *extent_root)
 {
 	struct btrfs_key ins;
-	struct extent_item extent_item;
+	struct btrfs_extent_item extent_item;
 	int i;
 	int ret;
 
@@ -130,7 +131,7 @@ static int finish_current_insert(struct ctree_root *extent_root)
 
 	for (i = 0; i < extent_root->current_insert.flags; i++) {
 		ins.objectid = extent_root->current_insert.objectid + i;
-		ret = insert_item(extent_root, &ins, &extent_item,
+		ret = btrfs_insert_item(extent_root, &ins, &extent_item,
 				  sizeof(extent_item));
 		BUG_ON(ret);
 	}
@@ -141,14 +142,14 @@ static int finish_current_insert(struct ctree_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
-	struct ctree_root *extent_root = root->extent_root;
+	struct btrfs_root *extent_root = root->extent_root;
 	int ret;
 	struct btrfs_item *item;
-	struct extent_item *ei;
+	struct btrfs_extent_item *ei;
 	struct btrfs_key ins;
 	u32 refs;
 
@@ -157,16 +158,16 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 	key.offset = num_blocks;
 
 	find_free_extent(root, 0, 0, (u64)-1, &ins);
-	init_path(&path);
-	ret = search_slot(extent_root, &key, &path, -1, 1);
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(extent_root, &key, &path, -1, 1);
 	if (ret) {
 		printf("failed to find %Lu\n", key.objectid);
-		print_tree(extent_root, extent_root->node);
+		btrfs_print_tree(extent_root, extent_root->node);
 		printf("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
 	item = path.nodes[0]->leaf.items + path.slots[0];
-	ei = (struct extent_item *)(path.nodes[0]->leaf.data +
+	ei = (struct btrfs_extent_item *)(path.nodes[0]->leaf.data +
 				    btrfs_item_offset(item));
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
@@ -180,14 +181,14 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
 			BUG_ON(err);
 			radix_tree_preload_end();
 		}
-		ret = del_item(extent_root, &path);
+		ret = btrfs_del_item(extent_root, &path);
 		if (root != extent_root &&
 		    extent_root->last_insert.objectid < blocknr)
 			extent_root->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
-	release_path(extent_root, &path);
+	btrfs_release_path(extent_root, &path);
 	finish_current_insert(extent_root);
 	return ret;
 }
@@ -196,10 +197,10 @@ int __free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
  */
-static int del_pending_extents(struct ctree_root *extent_root)
+static int del_pending_extents(struct btrfs_root *extent_root)
 {
 	int ret;
-	struct tree_buffer *gang[4];
+	struct btrfs_buffer *gang[4];
 	int i;
 
 	while(1) {
@@ -214,13 +215,13 @@ static int del_pending_extents(struct ctree_root *extent_root)
 			radix_tree_tag_clear(&extent_root->cache_radix,
 						gang[i]->blocknr,
 						CTREE_EXTENT_PENDING_DEL);
-			tree_block_release(extent_root, gang[i]);
+			btrfs_block_release(extent_root, gang[i]);
 		}
 	}
 	return 0;
 }
 
-static int run_pending(struct ctree_root *extent_root)
+static int run_pending(struct btrfs_root *extent_root)
 {
 	while(radix_tree_tagged(&extent_root->cache_radix,
 			        CTREE_EXTENT_PENDING_DEL))
@@ -232,11 +233,11 @@ static int run_pending(struct ctree_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
+int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 {
 	struct btrfs_key key;
-	struct ctree_root *extent_root = root->extent_root;
-	struct tree_buffer *t;
+	struct btrfs_root *extent_root = root->extent_root;
+	struct btrfs_buffer *t;
 	int pending_ret;
 	int ret;
 
@@ -262,11 +263,11 @@ int free_extent(struct ctree_root *root, u64 blocknr, u64 num_blocks)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
+static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks,
 			    u64 search_start, u64 search_end,
 			    struct btrfs_key *ins)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
 	u64 hole_size = 0;
@@ -274,20 +275,20 @@ static int find_free_extent(struct ctree_root *orig_root, u64 num_blocks,
 	u64 last_block;
 	u64 test_block;
 	int start_found;
-	struct leaf *l;
-	struct ctree_root * root = orig_root->extent_root;
+	struct btrfs_leaf *l;
+	struct btrfs_root * root = orig_root->extent_root;
 	int total_needed = num_blocks;
 
 	total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3;
 	if (root->last_insert.objectid > search_start)
 		search_start = root->last_insert.objectid;
 check_failed:
-	init_path(&path);
+	btrfs_init_path(&path);
 	ins->objectid = search_start;
 	ins->offset = 0;
 	ins->flags = 0;
 	start_found = 0;
-	ret = search_slot(root, ins, &path, 0, 0);
+	ret = btrfs_search_slot(root, ins, &path, 0, 0);
 	if (ret < 0)
 		goto error;
 
@@ -298,7 +299,7 @@ check_failed:
 		l = &path.nodes[0]->leaf;
 		slot = path.slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
-			ret = next_leaf(root, &path);
+			ret = btrfs_next_leaf(root, &path);
 			if (ret == 0)
 				continue;
 			if (ret < 0)
@@ -336,7 +337,7 @@ check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
 	 */
-	release_path(root, &path);
+	btrfs_release_path(root, &path);
 	BUG_ON(ins->objectid < search_start);
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + total_needed; test_block++) {
@@ -353,7 +354,7 @@ check_pending:
 	ins->offset = num_blocks;
 	return 0;
 error:
-	release_path(root, &path);
+	btrfs_release_path(root, &path);
 	return ret;
 }
 
@@ -364,13 +365,13 @@ error:
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
+int alloc_extent(struct btrfs_root *root, u64 num_blocks, u64 search_start,
 			 u64 search_end, u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
-	struct ctree_root *extent_root = root->extent_root;
-	struct extent_item extent_item;
+	struct btrfs_root *extent_root = root->extent_root;
+	struct btrfs_extent_item extent_item;
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, owner);
@@ -390,7 +391,7 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
 	if (ret)
 		return ret;
 
-	ret = insert_item(extent_root, ins, &extent_item,
+	ret = btrfs_insert_item(extent_root, ins, &extent_item,
 			  sizeof(extent_item));
 
 	finish_current_insert(extent_root);
@@ -406,11 +407,11 @@ int alloc_extent(struct ctree_root *root, u64 num_blocks, u64 search_start,
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
  */
-struct tree_buffer *alloc_free_block(struct ctree_root *root)
+struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root)
 {
 	struct btrfs_key ins;
 	int ret;
-	struct tree_buffer *buf;
+	struct btrfs_buffer *buf;
 
 	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
 			   btrfs_header_parentid(&root->node->node.header),
@@ -424,10 +425,10 @@ struct tree_buffer *alloc_free_block(struct ctree_root *root)
 	return buf;
 }
 
-int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
+int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
 {
-	struct tree_buffer *next;
-	struct tree_buffer *cur;
+	struct btrfs_buffer *next;
+	struct btrfs_buffer *cur;
 	u64 blocknr;
 	int ret;
 	u32 refs;
@@ -445,33 +446,33 @@ int walk_down_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 		ret = lookup_block_ref(root, blocknr, &refs);
 		if (refs != 1 || *level == 1) {
 			path->slots[*level]++;
-			ret = free_extent(root, blocknr, 1);
+			ret = btrfs_free_extent(root, blocknr, 1);
 			BUG_ON(ret);
 			continue;
 		}
 		BUG_ON(ret);
 		next = read_tree_block(root, blocknr);
 		if (path->nodes[*level-1])
-			tree_block_release(root, path->nodes[*level-1]);
+			btrfs_block_release(root, path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
 		*level = btrfs_header_level(&next->node.header);
 		path->slots[*level] = 0;
 	}
 out:
-	ret = free_extent(root, path->nodes[*level]->blocknr, 1);
-	tree_block_release(root, path->nodes[*level]);
+	ret = btrfs_free_extent(root, path->nodes[*level]->blocknr, 1);
+	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
 	return 0;
 }
 
-int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
+int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
 {
 	int i;
 	int slot;
 	int ret;
-	for(i = *level; i < MAX_LEVEL - 1 && path->nodes[i]; i++) {
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot <
 		    btrfs_header_nritems(&path->nodes[i]->node.header)- 1) {
@@ -479,9 +480,9 @@ int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 			*level = i;
 			return 0;
 		} else {
-			ret = free_extent(root,
+			ret = btrfs_free_extent(root,
 					  path->nodes[*level]->blocknr, 1);
-			tree_block_release(root, path->nodes[*level]);
+			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
 			BUG_ON(ret);
@@ -490,15 +491,15 @@ int walk_up_tree(struct ctree_root *root, struct ctree_path *path, int *level)
 	return 1;
 }
 
-int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
+int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
 {
 	int ret;
 	int level;
-	struct ctree_path path;
+	struct btrfs_path path;
 	int i;
 	int orig_level;
 
-	init_path(&path);
+	btrfs_init_path(&path);
 
 	level = btrfs_header_level(&snap->node.header);
 	orig_level = level;
@@ -514,7 +515,7 @@ int btrfs_drop_snapshot(struct ctree_root *root, struct tree_buffer *snap)
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path.nodes[i]) {
-			tree_block_release(root, path.nodes[i]);
+			btrfs_block_release(root, path.nodes[i]);
 		}
 	}
 
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index e0d3bc1d025..fc1923320de 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -12,10 +12,10 @@
 
 int mkfs(int fd)
 {
-	struct ctree_root_info info[2];
-	struct leaf empty_leaf;
+	struct btrfs_root_info info[2];
+	struct btrfs_leaf empty_leaf;
 	struct btrfs_item item;
-	struct extent_item extent_item;
+	struct btrfs_extent_item extent_item;
 	int ret;
 
 	/* setup the super block area */
@@ -28,7 +28,7 @@ int mkfs(int fd)
 	info[1].objectid = 2;
 	info[1].tree_root = 18;
 	ret = pwrite(fd, info, sizeof(info),
-		     CTREE_SUPER_INFO_OFFSET(CTREE_BLOCKSIZE));
+		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
 	if (ret != sizeof(info))
 		return -1;
 
@@ -36,7 +36,7 @@ int mkfs(int fd)
 	memset(&empty_leaf, 0, sizeof(empty_leaf));
 	btrfs_set_header_parentid(&empty_leaf.header, 1);
 	btrfs_set_header_blocknr(&empty_leaf.header, 17);
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * CTREE_BLOCKSIZE);
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * BTRFS_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
 
@@ -48,9 +48,9 @@ int mkfs(int fd)
 	btrfs_set_key_objectid(&item.key, 0);
 	btrfs_set_key_offset(&item.key, 17);
 	btrfs_set_key_flags(&item.key, 0);
-	btrfs_set_item_offset(&item,
-			      LEAF_DATA_SIZE - sizeof(struct extent_item));
-	btrfs_set_item_size(&item, sizeof(struct extent_item));
+	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
+			      sizeof(struct btrfs_extent_item));
+	btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item));
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, 0);
 	memcpy(empty_leaf.items, &item, sizeof(item));
@@ -60,8 +60,8 @@ int mkfs(int fd)
 	/* item2, give block 17 to the root */
 	btrfs_set_key_objectid(&item.key, 17);
 	btrfs_set_key_offset(&item.key, 1);
-	btrfs_set_item_offset(&item,
-			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 2);
+	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
+			      sizeof(struct btrfs_extent_item) * 2);
 	btrfs_set_extent_owner(&extent_item, 1);
 	memcpy(empty_leaf.items + 1, &item, sizeof(item));
 	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
@@ -70,13 +70,13 @@ int mkfs(int fd)
 	/* item3, give block 18 for the extent root */
 	btrfs_set_key_objectid(&item.key, 18);
 	btrfs_set_key_offset(&item.key, 1);
-	btrfs_set_item_offset(&item,
-			      LEAF_DATA_SIZE - sizeof(struct extent_item) * 3);
+	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
+			      sizeof(struct btrfs_extent_item) * 3);
 	btrfs_set_extent_owner(&extent_item, 2);
 	memcpy(empty_leaf.items + 2, &item, sizeof(item));
 	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
 		btrfs_item_size(&item));
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * CTREE_BLOCKSIZE);
+	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * BTRFS_BLOCKSIZE);
 	if (ret != sizeof(empty_leaf))
 		return -1;
 	return 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 101278e1139..c95c85640aa 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -5,14 +5,14 @@
 #include "ctree.h"
 #include "disk-io.h"
 
-void print_leaf(struct leaf *l)
+void btrfs_print_leaf(struct btrfs_leaf *l)
 {
 	int i;
 	u32 nr = btrfs_header_nritems(&l->header);
 	struct btrfs_item *item;
-	struct extent_item *ei;
+	struct btrfs_extent_item *ei;
 	printf("leaf %Lu total ptrs %d free space %d\n",
-		btrfs_header_blocknr(&l->header), nr, leaf_free_space(l));
+		btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
@@ -26,24 +26,25 @@ void print_leaf(struct leaf *l)
 		fflush(stdout);
 		printf("\t\titem data %.*s\n", btrfs_item_size(item),
 			l->data + btrfs_item_offset(item));
-		ei = (struct extent_item *)(l->data + btrfs_item_offset(item));
+		ei = (struct btrfs_extent_item *)(l->data +
+						  btrfs_item_offset(item));
 		printf("\t\textent data refs %u owner %Lu\n", ei->refs,
 			ei->owner);
 		fflush(stdout);
 	}
 }
-void print_tree(struct ctree_root *root, struct tree_buffer *t)
+void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 {
 	int i;
 	u32 nr;
-	struct node *c;
+	struct btrfs_node *c;
 
 	if (!t)
 		return;
 	c = &t->node;
 	nr = btrfs_header_nritems(&c->header);
 	if (btrfs_is_leaf(c)) {
-		print_leaf((struct leaf *)c);
+		btrfs_print_leaf((struct btrfs_leaf *)c);
 		return;
 	}
 	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
@@ -58,17 +59,17 @@ void print_tree(struct ctree_root *root, struct tree_buffer *t)
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
-		struct tree_buffer *next_buf = read_tree_block(root,
+		struct btrfs_buffer *next_buf = read_tree_block(root,
 						btrfs_node_blockptr(c, i));
-		struct node *next = &next_buf->node;
+		struct btrfs_node *next = &next_buf->node;
 		if (btrfs_is_leaf(next) &&
 		    btrfs_header_level(&c->header) != 1)
 			BUG();
 		if (btrfs_header_level(&next->header) !=
 			btrfs_header_level(&c->header) - 1)
 			BUG();
-		print_tree(root, next_buf);
-		tree_block_release(root, next_buf);
+		btrfs_print_tree(root, next_buf);
+		btrfs_block_release(root, next_buf);
 	}
 
 }
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 3c1e9a3e026..e8d0b847c02 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,3 +1,3 @@
 
-void print_leaf(struct leaf *l);
-void print_tree(struct ctree_root *root, struct tree_buffer *t);
+void btrfs_print_leaf(struct btrfs_leaf *l);
+void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t);
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index ab3bda53a2f..66bdc57905d 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -22,9 +22,9 @@ int main(int ac, char **av) {
 	int run_size = 100000;
 	int max_key =  100000000;
 	int tree_size = 0;
-	struct ctree_path path;
-	struct ctree_super_block super;
-	struct ctree_root *root;
+	struct btrfs_path path;
+	struct btrfs_super_block super;
+	struct btrfs_root *root;
 
 	radix_tree_init();
 
@@ -40,12 +40,12 @@ int main(int ac, char **av) {
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
-		ret = insert_item(root, &ins, buf, strlen(buf));
+		ret = btrfs_insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 		free(buf);
 		if (i == run_size - 5) {
-			commit_transaction(root, &super);
+			btrfs_commit_transaction(root, &super);
 		}
 
 	}
@@ -57,16 +57,16 @@ int main(int ac, char **av) {
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
-		init_path(&path);
+		btrfs_init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0, 0);
+		ret = btrfs_search_slot(root, &ins, &path, 0, 0);
 		if (ret) {
-			print_tree(root, root->node);
+			btrfs_print_tree(root, root->node);
 			printf("unable to find %d\n", num);
 			exit(1);
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 	}
 	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
@@ -81,17 +81,17 @@ int main(int ac, char **av) {
 	for (i = 0 ; i < run_size/4; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
-		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1, 1);
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(root, &ins, &path, -1, 1);
 		if (!ret) {
 			if (i % 10000 == 0)
 				fprintf(stderr, "del %d:%d\n", num, i);
-			ret = del_item(root, &path);
+			ret = btrfs_del_item(root, &path);
 			if (ret != 0)
 				BUG();
 			tree_size--;
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 	}
 	close_ctree(root, &super);
 	root = open_ctree("dbfile", &super);
@@ -103,7 +103,7 @@ int main(int ac, char **av) {
 		ins.objectid = num;
 		if (i % 10000 == 0)
 			fprintf(stderr, "insert %d:%d\n", num, i);
-		ret = insert_item(root, &ins, buf, strlen(buf));
+		ret = btrfs_insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 		free(buf);
@@ -115,25 +115,25 @@ int main(int ac, char **av) {
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
-		init_path(&path);
+		btrfs_init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = search_slot(root, &ins, &path, 0, 0);
+		ret = btrfs_search_slot(root, &ins, &path, 0, 0);
 		if (ret) {
-			print_tree(root, root->node);
+			btrfs_print_tree(root, root->node);
 			printf("unable to find %d\n", num);
 			exit(1);
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 	}
 	printf("starting big long delete run\n");
 	while(root->node &&
 	      btrfs_header_nritems(&root->node->node.header) > 0) {
-		struct leaf *leaf;
+		struct btrfs_leaf *leaf;
 		int slot;
 		ins.objectid = (u64)-1;
-		init_path(&path);
-		ret = search_slot(root, &ins, &path, -1, 1);
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(root, &ins, &path, -1, 1);
 		if (ret == 0)
 			BUG();
 
@@ -149,26 +149,26 @@ int main(int ac, char **av) {
 			btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key);
 			if (tree_size % 10000 == 0)
 				printf("big del %d:%d\n", tree_size, i);
-			ret = del_item(root, &path);
+			ret = btrfs_del_item(root, &path);
 			if (ret != 0) {
 				printf("del_item returned %d\n", ret);
 				BUG();
 			}
 			tree_size--;
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 	}
 	/*
 	printf("previous tree:\n");
-	print_tree(root, root->commit_root);
+	btrfs_print_tree(root, root->commit_root);
 	printf("map before commit\n");
-	print_tree(root->extent_root, root->extent_root->node);
+	btrfs_print_tree(root->extent_root, root->extent_root->node);
 	*/
-	commit_transaction(root, &super);
+	btrfs_commit_transaction(root, &super);
 	printf("tree size is now %d\n", tree_size);
 	printf("root %p commit root %p\n", root->node, root->commit_root);
 	printf("map tree\n");
-	print_tree(root->extent_root, root->extent_root->node);
+	btrfs_print_tree(root->extent_root, root->extent_root->node);
 	close_ctree(root, &super);
 	return 0;
 }
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index e767528bc52..a8ef0478991 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -8,7 +8,7 @@
 #include "print-tree.h"
 
 int keep_running = 1;
-struct ctree_super_block super;
+struct btrfs_super_block super;
 
 static int setup_key(struct radix_tree_root *root, struct btrfs_key *key,
 		     int exists)
@@ -36,17 +36,17 @@ again:
 	return 0;
 }
 
-static int ins_one(struct ctree_root *root, struct radix_tree_root *radix)
+static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
 	char buf[128];
 	unsigned long oid;
-	init_path(&path);
+	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 0);
 	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = insert_item(root, &key, buf, strlen(buf));
+	ret = btrfs_insert_item(root, &key, buf, strlen(buf));
 	if (ret)
 		goto error;
 	oid = (unsigned long)key.objectid;
@@ -61,18 +61,18 @@ error:
 	return -1;
 }
 
-static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
+static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
 	char buf[128];
-	init_path(&path);
+	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
 	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = insert_item(root, &key, buf, strlen(buf));
+	ret = btrfs_insert_item(root, &key, buf, strlen(buf));
 	if (ret != -EEXIST) {
 		printf("insert on %Lu gave us %d\n", key.objectid, ret);
 		return 1;
@@ -80,21 +80,21 @@ static int insert_dup(struct ctree_root *root, struct radix_tree_root *radix)
 	return 0;
 }
 
-static int del_one(struct ctree_root *root, struct radix_tree_root *radix)
+static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
 	unsigned long *ptr;
-	init_path(&path);
+	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = search_slot(root, &key, &path, -1, 1);
+	ret = btrfs_search_slot(root, &key, &path, -1, 1);
 	if (ret)
 		goto error;
-	ret = del_item(root, &path);
-	release_path(root, &path);
+	ret = btrfs_del_item(root, &path);
+	btrfs_release_path(root, &path);
 	if (ret != 0)
 		goto error;
 	ptr = radix_tree_delete(radix, key.objectid);
@@ -106,17 +106,17 @@ error:
 	return -1;
 }
 
-static int lookup_item(struct ctree_root *root, struct radix_tree_root *radix)
+static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
-	init_path(&path);
+	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = search_slot(root, &key, &path, 0, 1);
-	release_path(root, &path);
+	ret = btrfs_search_slot(root, &key, &path, 0, 1);
+	btrfs_release_path(root, &path);
 	if (ret)
 		goto error;
 	return 0;
@@ -125,17 +125,17 @@ error:
 	return -1;
 }
 
-static int lookup_enoent(struct ctree_root *root, struct radix_tree_root *radix)
+static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	int ret;
-	init_path(&path);
+	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 0);
 	if (ret < 0)
 		return ret;
-	ret = search_slot(root, &key, &path, 0, 0);
-	release_path(root, &path);
+	ret = btrfs_search_slot(root, &key, &path, 0, 0);
+	btrfs_release_path(root, &path);
 	if (ret <= 0)
 		goto error;
 	return 0;
@@ -144,10 +144,10 @@ error:
 	return -1;
 }
 
-static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
+static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 		      int nr)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	unsigned long found = 0;
 	int ret;
@@ -159,22 +159,22 @@ static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
 	key.flags = 0;
 	key.objectid = (unsigned long)-1;
 	while(nr-- >= 0) {
-		init_path(&path);
-		ret = search_slot(root, &key, &path, -1, 1);
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(root, &key, &path, -1, 1);
 		if (ret < 0) {
-			release_path(root, &path);
+			btrfs_release_path(root, &path);
 			return ret;
 		}
 		if (ret != 0) {
 			if (path.slots[0] == 0) {
-				release_path(root, &path);
+				btrfs_release_path(root, &path);
 				break;
 			}
 			path.slots[0] -= 1;
 		}
 		slot = path.slots[0];
 		found=btrfs_key_objectid(&path.nodes[0]->leaf.items[slot].key);
-		ret = del_item(root, &path);
+		ret = btrfs_del_item(root, &path);
 		count++;
 		if (ret) {
 			fprintf(stderr,
@@ -182,7 +182,7 @@ static int empty_tree(struct ctree_root *root, struct radix_tree_root *radix,
 				found);
 			return -1;
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 		ptr = radix_tree_delete(radix, found);
 		if (!ptr)
 			goto error;
@@ -195,7 +195,7 @@ error:
 	return -1;
 }
 
-static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
+static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 		     int count)
 {
 	int i;
@@ -207,7 +207,7 @@ static int fill_tree(struct ctree_root *root, struct radix_tree_root *radix,
 			goto out;
 		}
 		if (i % 1000 == 0) {
-			ret = commit_transaction(root, &super);
+			ret = btrfs_commit_transaction(root, &super);
 			if (ret) {
 				fprintf(stderr, "fill commit failed\n");
 				return ret;
@@ -223,7 +223,7 @@ out:
 	return ret;
 }
 
-static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
+static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
 {
 	int ret;
 	int nr = rand() % 5000;
@@ -242,13 +242,13 @@ static int bulk_op(struct ctree_root *root, struct radix_tree_root *radix)
 }
 
 
-int (*ops[])(struct ctree_root *root, struct radix_tree_root *radix) =
+int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) =
 	{ ins_one, insert_dup, del_one, lookup_item,
 	  lookup_enoent, bulk_op };
 
-static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
+static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix)
 {
-	struct ctree_path path;
+	struct btrfs_path path;
 	struct btrfs_key key;
 	unsigned long found;
 	int ret;
@@ -259,16 +259,16 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 	key.flags = 0;
 	key.objectid = (unsigned long)-1;
 	while(1) {
-		init_path(&path);
-		ret = search_slot(root, &key, &path, 0, 0);
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(root, &key, &path, 0, 0);
 		if (ret < 0) {
-			release_path(root, &path);
+			btrfs_release_path(root, &path);
 			return ret;
 		}
 		slot = path.slots[0];
 		if (ret != 0) {
 			if (slot == 0) {
-				release_path(root, &path);
+				btrfs_release_path(root, &path);
 				break;
 			}
 			slot -= 1;
@@ -287,7 +287,7 @@ static int fill_radix(struct ctree_root *root, struct radix_tree_root *radix)
 
 			radix_tree_preload_end();
 		}
-		release_path(root, &path);
+		btrfs_release_path(root, &path);
 		key.objectid = found - 1;
 		if (key.objectid > found)
 			break;
@@ -312,7 +312,7 @@ int print_usage(void)
 int main(int ac, char **av)
 {
 	RADIX_TREE(radix, GFP_KERNEL);
-	struct ctree_root *root;
+	struct btrfs_root *root;
 	int i;
 	int ret;
 	int count;
@@ -370,7 +370,7 @@ int main(int ac, char **av)
 			if (ret) {
 				fprintf(stderr, "op %d failed %d:%d\n",
 					op, i, iterations);
-				print_tree(root, root->node);
+				btrfs_print_tree(root, root->node);
 				fprintf(stderr, "op %d failed %d:%d\n",
 					op, i, iterations);
 				err = ret;
-- 
cgit v1.2.3


From 9aca1d51323c0291a7358e0728b64ec4f17d1a77 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 11:09:37 -0400
Subject: Btrfs: make some funcs static

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 16 ++++++++--------
 fs/btrfs/extent-tree.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 44 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c8529b394d5..bbb63229170 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -34,7 +34,7 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
-int btrfs_cow_block(struct btrfs_root *root,
+static int btrfs_cow_block(struct btrfs_root *root,
 		    struct btrfs_buffer *buf,
 		    struct btrfs_buffer *parent,
 		    int parent_slot,
@@ -96,7 +96,7 @@ int btrfs_leaf_free_space(struct btrfs_leaf *leaf)
 /*
  * compare two keys in a memcmp fashion
  */
-int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 {
 	struct btrfs_key k1;
 
@@ -117,7 +117,7 @@ int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 	return 0;
 }
 
-int check_node(struct btrfs_path *path, int level)
+static int check_node(struct btrfs_path *path, int level)
 {
 	int i;
 	struct btrfs_node *parent = NULL;
@@ -146,7 +146,7 @@ int check_node(struct btrfs_path *path, int level)
 	return 0;
 }
 
-int check_leaf(struct btrfs_path *path, int level)
+static int check_leaf(struct btrfs_path *path, int level)
 {
 	int i;
 	struct btrfs_leaf *leaf = &path->nodes[level]->leaf;
@@ -186,7 +186,7 @@ int check_leaf(struct btrfs_path *path, int level)
 	return 0;
 }
 
-int check_block(struct btrfs_path *path, int level)
+static int check_block(struct btrfs_path *path, int level)
 {
 	if (level == 0)
 		return check_leaf(path, level);
@@ -202,7 +202,7 @@ int check_block(struct btrfs_path *path, int level)
  *
  * slot may point to max if the key is bigger than all of the keys
  */
-int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
+static int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
 		       int max, int *slot)
 {
 	int low = 0;
@@ -233,7 +233,7 @@ int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
  * simple bin_search frontend that does the right thing for
  * leaves vs nodes
  */
-int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
+static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
 {
 	if (btrfs_is_leaf(c)) {
 		struct btrfs_leaf *l = (struct btrfs_leaf *)c;
@@ -250,7 +250,7 @@ int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
 	return -1;
 }
 
-struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
+static struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
 				   struct btrfs_buffer *parent_buf,
 				   int slot)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4a4f2d810d9..a6969538bca 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -365,8 +365,9 @@ error:
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-int alloc_extent(struct btrfs_root *root, u64 num_blocks, u64 search_start,
-			 u64 search_end, u64 owner, struct btrfs_key *ins)
+static int alloc_extent(struct btrfs_root *root, u64 num_blocks,
+			u64 search_start, u64 search_end, u64 owner,
+			struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -425,7 +426,12 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root)
 	return buf;
 }
 
-int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
+/*
+ * helper function for drop_snapshot, this walks down the tree dropping ref
+ * counts as it goes.
+ */
+static int walk_down_tree(struct btrfs_root *root,
+			  struct btrfs_path *path, int *level)
 {
 	struct btrfs_buffer *next;
 	struct btrfs_buffer *cur;
@@ -437,6 +443,9 @@ int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
+	/*
+	 * walk down to the last node level and free all the leaves
+	 */
 	while(*level > 0) {
 		cur = path->nodes[*level];
 		if (path->slots[*level] >=
@@ -467,7 +476,13 @@ out:
 	return 0;
 }
 
-int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
+/*
+ * helper for dropping snapshots.  This walks back up the tree in the path
+ * to find the first node higher up where we haven't yet gone through
+ * all the slots
+ */
+static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
+			int *level)
 {
 	int i;
 	int slot;
@@ -491,9 +506,15 @@ int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, int *level)
 	return 1;
 }
 
+/*
+ * drop the reference count on the tree rooted at 'snap'.  This traverses
+ * the tree freeing any blocks that have a ref count of zero after being
+ * decremented.
+ */
 int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
 {
-	int ret;
+	int ret = 0;;
+	int wret;
 	int level;
 	struct btrfs_path path;
 	int i;
@@ -506,18 +527,22 @@ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
 	path.nodes[level] = snap;
 	path.slots[level] = 0;
 	while(1) {
-		ret = walk_down_tree(root, &path, &level);
-		if (ret > 0)
+		wret = walk_down_tree(root, &path, &level);
+		if (wret > 0)
 			break;
-		ret = walk_up_tree(root, &path, &level);
-		if (ret > 0)
+		if (wret < 0)
+			ret = wret;
+
+		wret = walk_up_tree(root, &path, &level);
+		if (wret > 0)
 			break;
+		if (wret < 0)
+			ret = wret;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path.nodes[i]) {
 			btrfs_block_release(root, path.nodes[i]);
 		}
 	}
-
-	return 0;
+	return ret;
 }
-- 
cgit v1.2.3


From eaee50e881b88766d9a781c54c4142456cf5e527 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 11:17:52 -0400
Subject: Btrfs: merge leaves before split

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO    | 2 --
 fs/btrfs/ctree.c | 9 +++++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 52d2a7a071f..ad29267b873 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,10 +1,8 @@
 * cleanup, add more error checking, get rid of BUG_ONs
-* endian fixes
 * Make IO functions look more like the page cache
 * Fix ENOSPC handling
 * make blocksize a mkfs parameter instead of #define
 * make a real mkfs and superblock
-* Add shadowing and transactions
 * Do checksumming
 * Define FS objects in terms of different item types
 * Add block mapping tree (simple dm layer)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bbb63229170..dc620186e50 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1043,6 +1043,15 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	int ret;
 	int wret;
 
+	wret = push_leaf_left(root, path, data_size);
+	if (wret < 0)
+		return wret;
+	if (wret) {
+		wret = push_leaf_right(root, path, data_size);
+		if (wret < 0)
+			return wret;
+	}
+
 	l_buf = path->nodes[0];
 	l = &l_buf->leaf;
 
-- 
cgit v1.2.3


From 3768f3689fc76ecea17414936dff7a02746a4355 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Mar 2007 16:47:54 -0400
Subject: Btrfs: Change the super to point to a tree of trees to enable
 persistent snapshots

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   3 +-
 fs/btrfs/TODO          |   1 +
 fs/btrfs/ctree.h       | 123 ++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/debug-tree.c  |   4 +-
 fs/btrfs/disk-io.c     | 135 +++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/disk-io.h     |   3 +-
 fs/btrfs/extent-tree.c |  15 +++---
 fs/btrfs/mkfs.c        | 128 +++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/print-tree.c  |  10 ++--
 fs/btrfs/root-tree.c   |  88 ++++++++++++++++++++++++++++++++
 10 files changed, 445 insertions(+), 65 deletions(-)
 create mode 100644 fs/btrfs/root-tree.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f8532200e9a..13ed6313ac8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -2,7 +2,8 @@
 CC=gcc
 CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
-objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o
+objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
+	  root-tree.o
 
 # if you don't have sparse installed, use ls instead
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index ad29267b873..ff90e5eaf43 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -5,6 +5,7 @@
 * make a real mkfs and superblock
 * Do checksumming
 * Define FS objects in terms of different item types
+* add inode tree
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5dfcc90812c..0968899fb7f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4,8 +4,13 @@
 #include "list.h"
 #include "kerncompat.h"
 
+#define BTRFS_MAGIC "_BtRfS_M"
 #define BTRFS_BLOCKSIZE 1024
 
+#define BTRFS_ROOT_TREE_OBJECTID 1
+#define BTRFS_EXTENT_TREE_OBJECTID 2
+#define BTRFS_FS_TREE_OBJECTID 3
+
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
  * block layout.  objectid corresonds to the inode number.  The flags
@@ -36,7 +41,7 @@ struct btrfs_key {
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
-	__le64 fsid[2]; /* FS specific uuid */
+	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 parentid; /* objectid of the tree root */
 	__le32 csum;
@@ -52,6 +57,14 @@ struct btrfs_header {
 
 struct btrfs_buffer;
 
+struct btrfs_root_item {
+	__le64 blocknr;
+	__le32 flags;
+	__le64 block_limit;
+	__le64 blocks_used;
+	__le32 refs;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.  current_insert is used
@@ -61,6 +74,7 @@ struct btrfs_root {
 	struct btrfs_buffer *node;
 	struct btrfs_buffer *commit_root;
 	struct btrfs_root *extent_root;
+	struct btrfs_root *tree_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	int fp;
@@ -69,28 +83,25 @@ struct btrfs_root {
 	struct list_head trans;
 	struct list_head cache;
 	int cache_size;
+	int ref_cows;
+	struct btrfs_root_item root_item;
+	struct btrfs_key root_key;
 };
 
-/*
- * describes a tree on disk
- */
-struct btrfs_root_info {
-	u64 fsid[2]; /* FS specific uuid */
-	u64 blocknr; /* blocknr of this block */
-	u64 objectid; /* inode number of this root */
-	u64 tree_root; /* the tree root block */
-	u32 csum;
-	u32 ham;
-	u64 snapuuid[2]; /* root specific uuid */
-} __attribute__ ((__packed__));
-
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
-	struct btrfs_root_info root_info;
-	struct btrfs_root_info extent_info;
+	u8 fsid[16];    /* FS specific uuid */
+	__le64 blocknr; /* this block number */
+	__le32 csum;
+	__le64 magic;
+	__le16 blocksize;
+	__le64 generation;
+	__le64 root;
+	__le64 total_blocks;
+	__le64 blocks_used;
 } __attribute__ ((__packed__));
 
 /*
@@ -317,6 +328,79 @@ static inline int btrfs_is_leaf(struct btrfs_node *n)
 	return (btrfs_header_level(&n->header) == 0);
 }
 
+static inline u64 btrfs_root_blocknr(struct btrfs_root_item *item)
+{
+	return le64_to_cpu(item->blocknr);
+}
+
+static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val)
+{
+	item->blocknr = cpu_to_le64(val);
+}
+
+static inline u32 btrfs_root_refs(struct btrfs_root_item *item)
+{
+	return le32_to_cpu(item->refs);
+}
+
+static inline void btrfs_set_root_refs(struct btrfs_root_item *item, u32 val)
+{
+	item->refs = cpu_to_le32(val);
+}
+
+static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->blocknr);
+}
+
+static inline void btrfs_set_super_blocknr(struct btrfs_super_block *s, u64 val)
+{
+	s->blocknr = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_root(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->root);
+}
+
+static inline void btrfs_set_super_root(struct btrfs_super_block *s, u64 val)
+{
+	s->root = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_total_blocks(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->total_blocks);
+}
+
+static inline void btrfs_set_super_total_blocks(struct btrfs_super_block *s,
+						u64 val)
+{
+	s->total_blocks = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_blocks_used(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->blocks_used);
+}
+
+static inline void btrfs_set_super_blocks_used(struct btrfs_super_block *s,
+						u64 val)
+{
+	s->blocks_used = cpu_to_le64(val);
+}
+
+static inline u16 btrfs_super_blocksize(struct btrfs_super_block *s)
+{
+	return le16_to_cpu(s->blocksize);
+}
+
+static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s,
+						u16 val)
+{
+	s->blocksize = cpu_to_le16(val);
+}
+
 struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
 int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
 int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks);
@@ -331,4 +415,11 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap);
 int btrfs_finish_extent_commit(struct btrfs_root *root);
+int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key,
+		      struct btrfs_root_item *item);
+int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
+		      struct btrfs_root_item *item);
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+			struct btrfs_root_item *item, struct btrfs_key *key);
 #endif
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index 6da0a7aa0f4..de45fb4dfdd 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -11,9 +11,11 @@ int main(int ac, char **av) {
 	struct btrfs_root *root;
 	radix_tree_init();
 	root = open_ctree("dbfile", &super);
-	printf("root tree\n");
+	printf("fs tree\n");
 	btrfs_print_tree(root, root->node);
 	printf("map tree\n");
 	btrfs_print_tree(root->extent_root, root->extent_root->node);
+	printf("root tree\n");
+	btrfs_print_tree(root->tree_root, root->tree_root->node);
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c34c0c60935..3d4bf6833f2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -154,41 +154,96 @@ static int __commit_transaction(struct btrfs_root *root)
 	return ret;
 }
 
+static int commit_extent_and_tree_roots(struct btrfs_root *tree_root,
+					struct btrfs_root *extent_root)
+{
+	int ret;
+	u64 old_extent_block;
+
+	while(1) {
+		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
+		if (old_extent_block == extent_root->node->blocknr)
+			break;
+		btrfs_set_root_blocknr(&extent_root->root_item,
+				       extent_root->node->blocknr);
+		ret = btrfs_update_root(tree_root,
+					&extent_root->root_key,
+					&extent_root->root_item);
+		BUG_ON(ret);
+	}
+	__commit_transaction(extent_root);
+	__commit_transaction(tree_root);
+	return 0;
+}
+
 int btrfs_commit_transaction(struct btrfs_root *root,
 			     struct btrfs_super_block *s)
 {
 	int ret = 0;
+	struct btrfs_buffer *snap = root->commit_root;
+	struct btrfs_key snap_key;
 
 	ret = __commit_transaction(root);
-	if (!ret && root != root->extent_root)
-		ret = __commit_transaction(root->extent_root);
 	BUG_ON(ret);
-	if (root->commit_root != root->node) {
-		struct btrfs_buffer *snap = root->commit_root;
-		root->commit_root = root->node;
-		root->node->count++;
-		ret = btrfs_drop_snapshot(root, snap);
-		BUG_ON(ret);
-		// btrfs_block_release(root, snap);
-	}
+
+	if (root->commit_root == root->node)
+		return 0;
+
+	memcpy(&snap_key, &root->root_key, sizeof(snap_key));
+	root->root_key.offset++;
+
+	btrfs_set_root_blocknr(&root->root_item, root->node->blocknr);
+	ret = btrfs_insert_root(root->tree_root, &root->root_key,
+				&root->root_item);
+	BUG_ON(ret);
+
+	ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root);
+	BUG_ON(ret);
+
         write_ctree_super(root, s);
-	btrfs_finish_extent_commit(root);
+	btrfs_finish_extent_commit(root->extent_root);
+	btrfs_finish_extent_commit(root->tree_root);
+
+	root->commit_root = root->node;
+	root->node->count++;
+	ret = btrfs_drop_snapshot(root, snap);
+	BUG_ON(ret);
+
+	ret = btrfs_del_root(root->tree_root, &snap_key);
+	BUG_ON(ret);
+
 	return ret;
 }
 
-static int __setup_root(struct btrfs_root *root, struct btrfs_root *extent_root,
-			struct btrfs_root_info *info, int fp)
+static int __setup_root(struct btrfs_root *root, u64 objectid, int fp)
 {
 	INIT_LIST_HEAD(&root->trans);
 	INIT_LIST_HEAD(&root->cache);
 	root->cache_size = 0;
 	root->fp = fp;
 	root->node = NULL;
-	root->extent_root = extent_root;
 	root->commit_root = NULL;
-	root->node = read_tree_block(root, info->tree_root);
 	memset(&root->current_insert, 0, sizeof(root->current_insert));
 	memset(&root->last_insert, 0, sizeof(root->last_insert));
+	memset(&root->root_key, 0, sizeof(root->root_key));
+	memset(&root->root_item, 0, sizeof(root->root_item));
+	return 0;
+}
+
+static int find_and_setup_root(struct btrfs_root *tree_root, u64 objectid,
+			struct btrfs_root *root, int fp)
+{
+	int ret;
+
+	__setup_root(root, objectid, fp);
+	ret = btrfs_find_last_root(tree_root, objectid,
+				   &root->root_item, &root->root_key);
+	BUG_ON(ret);
+
+	root->node = read_tree_block(root,
+				     btrfs_root_blocknr(&root->root_item));
+	root->ref_cows = 0;
+	BUG_ON(!root->node);
 	return 0;
 }
 
@@ -196,9 +251,19 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 {
 	struct btrfs_root *root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
+	struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root));
 	int fp;
 	int ret;
 
+	root->extent_root = extent_root;
+	root->tree_root = tree_root;
+
+	extent_root->extent_root = extent_root;
+	extent_root->tree_root = tree_root;
+
+	tree_root->extent_root = extent_root;
+	tree_root->tree_root = tree_root;
+
 	fp = open(filename, O_CREAT | O_RDWR, 0600);
 	if (fp < 0) {
 		free(root);
@@ -208,11 +273,14 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&tree_root->pinned_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&tree_root->cache_radix, GFP_KERNEL);
+
 	ret = pread(fp, super, sizeof(struct btrfs_super_block),
 		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
-	if (ret == 0 || super->root_info.tree_root == 0) {
+	if (ret == 0 || btrfs_super_root(super) == 0) {
 		printf("making new FS!\n");
-		ret = mkfs(fp);
+		ret = mkfs(fp, 0, BTRFS_BLOCKSIZE);
 		if (ret)
 			return NULL;
 		ret = pread(fp, super, sizeof(struct btrfs_super_block),
@@ -221,24 +289,29 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 			return NULL;
 	}
 	BUG_ON(ret < 0);
-	__setup_root(root, extent_root, &super->root_info, fp);
-	__setup_root(extent_root, extent_root, &super->extent_info, fp);
+
+	__setup_root(tree_root, BTRFS_ROOT_TREE_OBJECTID, fp);
+	tree_root->node = read_tree_block(tree_root, btrfs_super_root(super));
+	BUG_ON(!tree_root->node);
+
+	ret = find_and_setup_root(tree_root, BTRFS_EXTENT_TREE_OBJECTID,
+				  extent_root, fp);
+	BUG_ON(ret);
+
+	ret = find_and_setup_root(tree_root, BTRFS_FS_TREE_OBJECTID,
+				  root, fp);
+	BUG_ON(ret);
+
 	root->commit_root = root->node;
 	root->node->count++;
+	root->ref_cows = 1;
 	return root;
 }
 
-static int __update_root(struct btrfs_root *root, struct btrfs_root_info *info)
-{
-	info->tree_root = root->node->blocknr;
-	return 0;
-}
-
 int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s)
 {
 	int ret;
-	__update_root(root, &s->root_info);
-	__update_root(root->extent_root, &s->extent_info);
+	btrfs_set_super_root(s, root->tree_root->node->blocknr);
 	ret = pwrite(root->fp, s, sizeof(*s),
 		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
 	if (ret != sizeof(*s)) {
@@ -260,19 +333,25 @@ static int drop_cache(struct btrfs_root *root)
 }
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
 {
+	int ret;
 	btrfs_commit_transaction(root, s);
-	__commit_transaction(root->extent_root);
+	ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root);
+	BUG_ON(ret);
 	write_ctree_super(root, s);
 	drop_cache(root->extent_root);
+	drop_cache(root->tree_root);
 	drop_cache(root);
 	BUG_ON(!list_empty(&root->trans));
 	BUG_ON(!list_empty(&root->extent_root->trans));
+	BUG_ON(!list_empty(&root->tree_root->trans));
 
 	close(root->fp);
 	if (root->node)
 		btrfs_block_release(root, root->node);
 	if (root->extent_root->node)
 		btrfs_block_release(root->extent_root, root->extent_root->node);
+	if (root->tree_root->node)
+		btrfs_block_release(root->tree_root, root->tree_root->node);
 	btrfs_block_release(root, root->commit_root);
 	free(root);
 	printf("on close %d blocks are allocated\n", allocated_blocks);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b391335864b..c22a61f9233 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -24,7 +24,8 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
 void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
 int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s);
-int mkfs(int fd);
+int mkfs(int fd, u64 num_blocks, u16 blocksize);
+
 
 #define BTRFS_SUPER_INFO_OFFSET(bs) (16 * (bs))
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a6969538bca..21f39b40469 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -82,7 +82,7 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
 	u64 blocknr;
 	int i;
 
-	if (root == root->extent_root)
+	if (!root->ref_cows)
 		return 0;
 	if (btrfs_is_leaf(&buf->node))
 		return 0;
@@ -96,23 +96,22 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
 
 int btrfs_finish_extent_commit(struct btrfs_root *root)
 {
-	struct btrfs_root *extent_root = root->extent_root;
 	unsigned long gang[8];
 	int ret;
 	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup(&extent_root->pinned_radix,
+		ret = radix_tree_gang_lookup(&root->pinned_radix,
 						 (void **)gang, 0,
 						 ARRAY_SIZE(gang));
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			radix_tree_delete(&extent_root->pinned_radix, gang[i]);
+			radix_tree_delete(&root->pinned_radix, gang[i]);
 		}
 	}
-	extent_root->last_insert.objectid = 0;
-	extent_root->last_insert.offset = 0;
+	root->last_insert.objectid = 0;
+	root->last_insert.offset = 0;
 	return 0;
 }
 
@@ -173,7 +172,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
 	if (refs == 0) {
-		if (root == extent_root) {
+		if (!root->ref_cows) {
 			int err;
 			radix_tree_preload(GFP_KERNEL);
 			err = radix_tree_insert(&extent_root->pinned_radix,
@@ -513,7 +512,7 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
  */
 int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
 {
-	int ret = 0;;
+	int ret = 0;
 	int wret;
 	int level;
 	struct btrfs_path path;
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index fc1923320de..dd14ed4fea6 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -10,6 +10,120 @@
 #include "ctree.h"
 #include "disk-io.h"
 
+int mkfs(int fd, u64 num_blocks, u16 blocksize)
+{
+	struct btrfs_super_block super;
+	struct btrfs_leaf empty_leaf;
+	struct btrfs_root_item root_item;
+	struct btrfs_item item;
+	struct btrfs_extent_item extent_item;
+	char *block;
+	int ret;
+	u16 itemoff;
+
+	btrfs_set_super_blocknr(&super, 16);
+	btrfs_set_super_root(&super, 17);
+	strcpy((char *)(&super.magic), BTRFS_MAGIC);
+	btrfs_set_super_blocksize(&super, blocksize);
+	btrfs_set_super_total_blocks(&super, num_blocks);
+	btrfs_set_super_blocks_used(&super, 0);
+
+	block = malloc(blocksize);
+	memset(block, 0, blocksize);
+	BUG_ON(sizeof(super) > blocksize);
+	memcpy(block, &super, sizeof(super));
+	ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET(blocksize));
+	BUG_ON(ret != blocksize);
+
+	/* create the tree of root objects */
+	memset(&empty_leaf, 0, sizeof(empty_leaf));
+	btrfs_set_header_parentid(&empty_leaf.header, BTRFS_ROOT_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf.header, 17);
+	btrfs_set_header_nritems(&empty_leaf.header, 2);
+
+	/* create the items for the root tree */
+	btrfs_set_root_blocknr(&root_item, 18);
+	btrfs_set_root_refs(&root_item, 1);
+	itemoff = LEAF_DATA_SIZE - sizeof(root_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_item_size(&item, sizeof(root_item));
+	btrfs_set_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID);
+	btrfs_set_key_offset(&item.key, 0);
+	btrfs_set_key_flags(&item.key, 0);
+	memcpy(empty_leaf.items, &item, sizeof(item));
+	memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item));
+
+	btrfs_set_root_blocknr(&root_item, 19);
+	itemoff = itemoff - sizeof(root_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
+	memcpy(empty_leaf.items + 1, &item, sizeof(item));
+	memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item));
+	ret = pwrite(fd, &empty_leaf, blocksize, 17 * blocksize);
+
+	/* create the items for the extent tree */
+	btrfs_set_header_parentid(&empty_leaf.header,
+				  BTRFS_EXTENT_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf.header, 18);
+	btrfs_set_header_nritems(&empty_leaf.header, 4);
+
+	/* item1, reserve blocks 0-16 */
+	btrfs_set_key_objectid(&item.key, 0);
+	btrfs_set_key_offset(&item.key, 17);
+	btrfs_set_key_flags(&item.key, 0);
+	itemoff = LEAF_DATA_SIZE - sizeof(struct btrfs_extent_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item));
+	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_extent_owner(&extent_item, 0);
+	memcpy(empty_leaf.items, &item, sizeof(item));
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
+
+	/* item2, give block 17 to the root */
+	btrfs_set_key_objectid(&item.key, 17);
+	btrfs_set_key_offset(&item.key, 1);
+	itemoff = itemoff - sizeof(struct btrfs_extent_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID);
+	memcpy(empty_leaf.items + 1, &item, sizeof(item));
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
+
+	/* item3, give block 18 to the extent root */
+	btrfs_set_key_objectid(&item.key, 18);
+	btrfs_set_key_offset(&item.key, 1);
+	itemoff = itemoff - sizeof(struct btrfs_extent_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID);
+	memcpy(empty_leaf.items + 2, &item, sizeof(item));
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
+
+	/* item4, give block 19 to the FS root */
+	btrfs_set_key_objectid(&item.key, 19);
+	btrfs_set_key_offset(&item.key, 1);
+	itemoff = itemoff - sizeof(struct btrfs_extent_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
+	memcpy(empty_leaf.items + 3, &item, sizeof(item));
+	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
+		btrfs_item_size(&item));
+	ret = pwrite(fd, &empty_leaf, blocksize, 18 * blocksize);
+	if (ret != sizeof(empty_leaf))
+		return -1;
+
+	/* finally create the FS root */
+	btrfs_set_header_parentid(&empty_leaf.header, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf.header, 19);
+	btrfs_set_header_nritems(&empty_leaf.header, 0);
+	ret = pwrite(fd, &empty_leaf, blocksize, 19 * blocksize);
+	if (ret != sizeof(empty_leaf))
+		return -1;
+	return 0;
+}
+
+#if 0
 int mkfs(int fd)
 {
 	struct btrfs_root_info info[2];
@@ -20,13 +134,14 @@ int mkfs(int fd)
 
 	/* setup the super block area */
 	memset(info, 0, sizeof(info));
-	info[0].blocknr = 16;
-	info[0].objectid = 1;
-	info[0].tree_root = 17;
+	btrfs_set_root_blocknr(info, 16);
+	btrfs_set_root_objectid(info, 1);
+	btrfs_set_root_tree_root(info, 17);
+
+	btrfs_set_root_blocknr(info + 1, 16);
+	btrfs_set_root_objectid(info + 1, 2);
+	btrfs_set_root_tree_root(info + 1, 18);
 
-	info[1].blocknr = 16;
-	info[1].objectid = 2;
-	info[1].tree_root = 18;
 	ret = pwrite(fd, info, sizeof(info),
 		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
 	if (ret != sizeof(info))
@@ -81,3 +196,4 @@ int mkfs(int fd)
 		return -1;
 	return 0;
 }
+#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index c95c85640aa..64829b6b90f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -11,6 +11,7 @@ void btrfs_print_leaf(struct btrfs_leaf *l)
 	u32 nr = btrfs_header_nritems(&l->header);
 	struct btrfs_item *item;
 	struct btrfs_extent_item *ei;
+	struct btrfs_root_item *ri;
 	printf("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(l));
 	fflush(stdout);
@@ -23,13 +24,15 @@ void btrfs_print_leaf(struct btrfs_leaf *l)
 			btrfs_key_offset(&item->key),
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
-		fflush(stdout);
 		printf("\t\titem data %.*s\n", btrfs_item_size(item),
 			l->data + btrfs_item_offset(item));
 		ei = (struct btrfs_extent_item *)(l->data +
 						  btrfs_item_offset(item));
-		printf("\t\textent data refs %u owner %Lu\n", ei->refs,
-			ei->owner);
+		printf("\t\textent data refs %u owner %Lu\n",
+			btrfs_extent_refs(ei), btrfs_extent_owner(ei));
+		ri = (struct btrfs_root_item *)ei;
+		printf("\t\troot data blocknr %Lu refs %u\n",
+			btrfs_root_blocknr(ri), btrfs_root_refs(ri));
 		fflush(stdout);
 	}
 }
@@ -71,6 +74,5 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 		btrfs_print_tree(root, next_buf);
 		btrfs_block_release(root, next_buf);
 	}
-
 }
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
new file mode 100644
index 00000000000..0ab90cfea98
--- /dev/null
+++ b/fs/btrfs/root-tree.c
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
+			struct btrfs_root_item *item, struct btrfs_key *key)
+{
+	struct btrfs_path path;
+	struct btrfs_key search_key;
+	struct btrfs_leaf *l;
+	int ret;
+	int slot;
+
+	search_key.objectid = objectid;
+	search_key.flags = (u32)-1;
+	search_key.offset = (u32)-1;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, &search_key, &path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+	l = &path.nodes[0]->leaf;
+	BUG_ON(path.slots[0] == 0);
+	slot = path.slots[0] - 1;
+	if (btrfs_key_objectid(&l->items[slot].key) != objectid) {
+		ret = 1;
+		goto out;
+	}
+	memcpy(item, l->data + btrfs_item_offset(l->items + slot),
+		sizeof(*item));
+	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
+	btrfs_release_path(root, &path);
+	ret = 0;
+out:
+	return ret;
+}
+
+int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
+		      struct btrfs_root_item *item)
+{
+	struct btrfs_path path;
+	struct btrfs_leaf *l;
+	int ret;
+	int slot;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, key, &path, 0, 1);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret != 0);
+	l = &path.nodes[0]->leaf;
+	slot = path.slots[0];
+	memcpy(l->data + btrfs_item_offset(l->items + slot), item,
+		sizeof(*item));
+out:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key,
+		      struct btrfs_root_item *item)
+{
+	int ret;
+	ret = btrfs_insert_item(root, key, item, sizeof(*item));
+	BUG_ON(ret);
+	return ret;
+}
+
+int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key)
+{
+	struct btrfs_path path;
+	int ret;
+
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, key, &path, -1, 1);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret != 0);
+	ret = btrfs_del_item(root, &path);
+out:
+	btrfs_release_path(root, &path);
+	return ret;
+}
-- 
cgit v1.2.3


From 710874947aa0f81c3b348b052e81cb9075113c49 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Mar 2007 09:20:39 -0400
Subject: Btrfs: properly reset block cache on free

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 21f39b40469..7f156d9b708 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -182,7 +182,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 		}
 		ret = btrfs_del_item(extent_root, &path);
 		if (root != extent_root &&
-		    extent_root->last_insert.objectid < blocknr)
+		    extent_root->last_insert.objectid > blocknr)
 			extent_root->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
-- 
cgit v1.2.3


From 4beb1b8b75a86373f6020103ab840448d14c8880 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Mar 2007 10:31:29 -0400
Subject: Btrfs: add leaf data casting helper

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 4 ++++
 fs/btrfs/extent-tree.c | 8 ++------
 fs/btrfs/print-tree.c  | 5 ++---
 3 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0968899fb7f..05c7707263f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -401,6 +401,10 @@ static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s,
 	s->blocksize = cpu_to_le16(val);
 }
 
+/* helper function to cast into the data area of the leaf. */
+#define btrfs_item_ptr(leaf, slot, type) \
+	((type *)((leaf)->data + btrfs_item_offset((leaf)->items + (slot))))
+
 struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
 int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
 int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7f156d9b708..3767744f659 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -41,9 +41,7 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
 		BUG();
 	BUG_ON(ret != 0);
 	l = &path.nodes[0]->leaf;
-	item = (struct btrfs_extent_item *)(l->data +
-					    btrfs_item_offset(l->items +
-							      path.slots[0]));
+	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
 
@@ -69,9 +67,7 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
 	if (ret != 0)
 		BUG();
 	l = &path.nodes[0]->leaf;
-	item = (struct btrfs_extent_item *)(l->data +
-				      btrfs_item_offset(l->items +
-							path.slots[0]));
+	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	*refs = btrfs_extent_refs(item);
 	btrfs_release_path(root->extent_root, &path);
 	return 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 64829b6b90f..e769f36cf05 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -26,11 +26,10 @@ void btrfs_print_leaf(struct btrfs_leaf *l)
 			btrfs_item_size(item));
 		printf("\t\titem data %.*s\n", btrfs_item_size(item),
 			l->data + btrfs_item_offset(item));
-		ei = (struct btrfs_extent_item *)(l->data +
-						  btrfs_item_offset(item));
+		ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
 		printf("\t\textent data refs %u owner %Lu\n",
 			btrfs_extent_refs(ei), btrfs_extent_owner(ei));
-		ri = (struct btrfs_root_item *)ei;
+		ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
 		printf("\t\troot data blocknr %Lu refs %u\n",
 			btrfs_root_blocknr(ri), btrfs_root_refs(ri));
 		fflush(stdout);
-- 
cgit v1.2.3


From 123abc88c9087b9c5605566ee3491aaef17fd837 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Mar 2007 14:14:43 -0400
Subject: Btrfs: variable block size support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 225 ++++++++++++++++++++++++-------------------------
 fs/btrfs/ctree.h       |  62 ++++++++------
 fs/btrfs/disk-io.c     |  42 ++++-----
 fs/btrfs/disk-io.h     |   9 +-
 fs/btrfs/extent-tree.c |   6 +-
 fs/btrfs/mkfs.c        | 171 +++++++++++--------------------------
 fs/btrfs/print-tree.c  |  15 ++--
 fs/btrfs/print-tree.h  |   2 +-
 fs/btrfs/quick-test.c  |   2 +-
 fs/btrfs/root-tree.c   |   4 +-
 10 files changed, 238 insertions(+), 300 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dc620186e50..ece8de7f38e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -47,7 +47,7 @@ static int btrfs_cow_block(struct btrfs_root *root,
 		return 0;
 	}
 	cow = btrfs_alloc_free_block(root);
-	memcpy(&cow->node, &buf->node, sizeof(buf->node));
+	memcpy(&cow->node, &buf->node, root->blocksize);
 	btrfs_set_header_blocknr(&cow->node.header, cow->blocknr);
 	*cow_ret = cow;
 	btrfs_inc_ref(root, buf);
@@ -72,11 +72,12 @@ static int btrfs_cow_block(struct btrfs_root *root,
  * this returns the address of the start of the last item,
  * which is the stop of the leaf data stack
  */
-static inline unsigned int leaf_data_end(struct btrfs_leaf *leaf)
+static inline unsigned int leaf_data_end(struct btrfs_root *root,
+					 struct btrfs_leaf *leaf)
 {
 	u32 nr = btrfs_header_nritems(&leaf->header);
 	if (nr == 0)
-		return sizeof(leaf->data);
+		return BTRFS_LEAF_DATA_SIZE(root);
 	return btrfs_item_offset(leaf->items + nr - 1);
 }
 
@@ -85,12 +86,12 @@ static inline unsigned int leaf_data_end(struct btrfs_leaf *leaf)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int btrfs_leaf_free_space(struct btrfs_leaf *leaf)
+int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf)
 {
-	int data_end = leaf_data_end(leaf);
+	int data_end = leaf_data_end(root, leaf);
 	int nritems = btrfs_header_nritems(&leaf->header);
 	char *items_end = (char *)(leaf->items + nritems + 1);
-	return (char *)(leaf->data + data_end) - (char *)items_end;
+	return (char *)(btrfs_leaf_data(leaf) + data_end) - (char *)items_end;
 }
 
 /*
@@ -117,7 +118,8 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 	return 0;
 }
 
-static int check_node(struct btrfs_path *path, int level)
+static int check_node(struct btrfs_root *root, struct btrfs_path *path,
+		      int level)
 {
 	int i;
 	struct btrfs_node *parent = NULL;
@@ -131,22 +133,23 @@ static int check_node(struct btrfs_path *path, int level)
 	BUG_ON(nritems == 0);
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
-		parent_key = &parent->keys[parent_slot];
-		BUG_ON(memcmp(parent_key, node->keys,
+		parent_key = &parent->ptrs[parent_slot].key;
+		BUG_ON(memcmp(parent_key, &node->ptrs[0].key,
 			      sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_blocknr(&node->header));
 	}
-	BUG_ON(nritems > NODEPTRS_PER_BLOCK);
+	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
 		struct btrfs_key cpukey;
-		btrfs_disk_key_to_cpu(&cpukey, &node->keys[i + 1]);
-		BUG_ON(comp_keys(&node->keys[i], &cpukey) >= 0);
+		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[i + 1].key);
+		BUG_ON(comp_keys(&node->ptrs[i].key, &cpukey) >= 0);
 	}
 	return 0;
 }
 
-static int check_leaf(struct btrfs_path *path, int level)
+static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
+		      int level)
 {
 	int i;
 	struct btrfs_leaf *leaf = &path->nodes[level]->leaf;
@@ -157,14 +160,14 @@ static int check_leaf(struct btrfs_path *path, int level)
 	if (path->nodes[level + 1])
 		parent = &path->nodes[level + 1]->node;
 	parent_slot = path->slots[level + 1];
-	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
 
 	if (nritems == 0)
 		return 0;
 
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
-		parent_key = &parent->keys[parent_slot];
+		parent_key = &parent->ptrs[parent_slot].key;
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
@@ -180,17 +183,18 @@ static int check_leaf(struct btrfs_path *path, int level)
 		if (i == 0) {
 			BUG_ON(btrfs_item_offset(leaf->items + i) +
 			       btrfs_item_size(leaf->items + i) !=
-			       LEAF_DATA_SIZE);
+			       BTRFS_LEAF_DATA_SIZE(root));
 		}
 	}
 	return 0;
 }
 
-static int check_block(struct btrfs_path *path, int level)
+static int check_block(struct btrfs_root *root, struct btrfs_path *path,
+			int level)
 {
 	if (level == 0)
-		return check_leaf(path, level);
-	return check_node(path, level);
+		return check_leaf(root, path, level);
+	return check_node(root, path, level);
 }
 
 /*
@@ -242,8 +246,8 @@ static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
 					  key, btrfs_header_nritems(&c->header),
 					  slot);
 	} else {
-		return generic_bin_search((void *)c->keys,
-					  sizeof(struct btrfs_disk_key),
+		return generic_bin_search((void *)c->ptrs,
+					  sizeof(struct btrfs_key_ptr),
 					  key, btrfs_header_nritems(&c->header),
 					  slot);
 	}
@@ -311,7 +315,8 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 	}
 	parent = &parent_buf->node;
 
-	if (btrfs_header_nritems(&mid->header) > NODEPTRS_PER_BLOCK / 4)
+	if (btrfs_header_nritems(&mid->header) >
+	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
 	left_buf = read_node_slot(root, parent_buf, pslot - 1);
@@ -351,7 +356,8 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 			if (wret)
 				ret = wret;
 		} else {
-			memcpy(parent->keys + pslot + 1, right->keys,
+			memcpy(&parent->ptrs[pslot + 1].key,
+				&right->ptrs[0].key,
 				sizeof(struct btrfs_disk_key));
 			BUG_ON(list_empty(&parent_buf->dirty));
 		}
@@ -387,7 +393,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 			ret = wret;
 	} else {
 		/* update the parent key to reflect our changes */
-		memcpy(parent->keys + pslot, mid->keys,
+		memcpy(&parent->ptrs[pslot].key, &mid->ptrs[0].key,
 		       sizeof(struct btrfs_disk_key));
 		BUG_ON(list_empty(&parent_buf->dirty));
 	}
@@ -407,7 +413,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		}
 	}
 	/* double check we haven't messed things up */
-	check_block(path, level);
+	check_block(root, path, level);
 	if (orig_ptr != btrfs_node_blockptr(&path->nodes[level]->node,
 					    path->slots[level]))
 		BUG();
@@ -456,7 +462,7 @@ again:
 		BUG_ON(!cow && ins_len);
 		c = &b->node;
 		p->nodes[level] = b;
-		ret = check_block(p, level);
+		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
 		ret = bin_search(c, key, &slot);
@@ -465,7 +471,7 @@ again:
 				slot -= 1;
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_header_nritems(&c->header) ==
-			    NODEPTRS_PER_BLOCK) {
+			    BTRFS_NODEPTRS_PER_BLOCK(root)) {
 				int sret = split_node(root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -488,7 +494,7 @@ again:
 		} else {
 			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_leaf_free_space(l) <
+			if (ins_len > 0 && btrfs_leaf_free_space(root, l) <
 			    sizeof(struct btrfs_item) + ins_len) {
 				int sret = split_leaf(root, p, ins_len);
 				BUG_ON(sret > 0);
@@ -525,7 +531,7 @@ static int fixup_low_keys(struct btrfs_root *root,
 		if (!path->nodes[i])
 			break;
 		t = &path->nodes[i]->node;
-		memcpy(t->keys + tslot, key, sizeof(*key));
+		memcpy(&t->ptrs[tslot].key, key, sizeof(*key));
 		BUG_ON(list_empty(&path->nodes[i]->dirty));
 		if (tslot != 0)
 			break;
@@ -552,7 +558,7 @@ static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf,
 
 	src_nritems = btrfs_header_nritems(&src->header);
 	dst_nritems = btrfs_header_nritems(&dst->header);
-	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
+	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
 	if (push_items <= 0) {
 		return 1;
 	}
@@ -560,16 +566,12 @@ static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf,
 	if (src_nritems < push_items)
 		push_items = src_nritems;
 
-	memcpy(dst->keys + dst_nritems, src->keys,
-		push_items * sizeof(struct btrfs_disk_key));
-	memcpy(dst->blockptrs + dst_nritems, src->blockptrs,
-		push_items * sizeof(u64));
+	memcpy(dst->ptrs + dst_nritems, src->ptrs,
+		push_items * sizeof(struct btrfs_key_ptr));
 	if (push_items < src_nritems) {
-		memmove(src->keys, src->keys + push_items,
+		memmove(src->ptrs, src->ptrs + push_items,
 			(src_nritems - push_items) *
-			sizeof(struct btrfs_disk_key));
-		memmove(src->blockptrs, src->blockptrs + push_items,
-			(src_nritems - push_items) * sizeof(u64));
+			sizeof(struct btrfs_key_ptr));
 	}
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
@@ -601,7 +603,7 @@ static int balance_node_right(struct btrfs_root *root,
 
 	src_nritems = btrfs_header_nritems(&src->header);
 	dst_nritems = btrfs_header_nritems(&dst->header);
-	push_items = NODEPTRS_PER_BLOCK - dst_nritems;
+	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
 	if (push_items <= 0) {
 		return 1;
 	}
@@ -613,14 +615,10 @@ static int balance_node_right(struct btrfs_root *root,
 	if (max_push < push_items)
 		push_items = max_push;
 
-	memmove(dst->keys + push_items, dst->keys,
-		dst_nritems * sizeof(struct btrfs_disk_key));
-	memmove(dst->blockptrs + push_items, dst->blockptrs,
-		dst_nritems * sizeof(u64));
-	memcpy(dst->keys, src->keys + src_nritems - push_items,
-		push_items * sizeof(struct btrfs_disk_key));
-	memcpy(dst->blockptrs, src->blockptrs + src_nritems - push_items,
-		push_items * sizeof(u64));
+	memmove(dst->ptrs + push_items, dst->ptrs,
+		dst_nritems * sizeof(struct btrfs_key_ptr));
+	memcpy(dst->ptrs, src->ptrs + src_nritems - push_items,
+		push_items * sizeof(struct btrfs_key_ptr));
 
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
@@ -650,7 +648,7 @@ static int insert_new_root(struct btrfs_root *root,
 
 	t = btrfs_alloc_free_block(root);
 	c = &t->node;
-	memset(c, 0, sizeof(c));
+	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
 	btrfs_set_header_level(&c->header, level);
 	btrfs_set_header_blocknr(&c->header, t->blocknr);
@@ -660,8 +658,8 @@ static int insert_new_root(struct btrfs_root *root,
 	if (btrfs_is_leaf(lower))
 		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
 	else
-		lower_key = lower->keys;
-	memcpy(c->keys, lower_key, sizeof(struct btrfs_disk_key));
+		lower_key = &lower->ptrs[0].key;
+	memcpy(&c->ptrs[0].key, lower_key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->blocknr);
 	/* the super has an extra ref to root->node */
 	btrfs_block_release(root, root->node);
@@ -693,19 +691,15 @@ static int insert_ptr(struct btrfs_root *root,
 	nritems = btrfs_header_nritems(&lower->header);
 	if (slot > nritems)
 		BUG();
-	if (nritems == NODEPTRS_PER_BLOCK)
+	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
-		memmove(lower->keys + slot + 1, lower->keys + slot,
-			(nritems - slot) * sizeof(struct btrfs_disk_key));
-		memmove(lower->blockptrs + slot + 1, lower->blockptrs + slot,
-			(nritems - slot) * sizeof(u64));
+		memmove(lower->ptrs + slot + 1, lower->ptrs + slot,
+			(nritems - slot) * sizeof(struct btrfs_key_ptr));
 	}
-	memcpy(lower->keys + slot, key, sizeof(struct btrfs_disk_key));
+	memcpy(&lower->ptrs[slot].key, key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
-	if (lower->keys[1].objectid == 0)
-			BUG();
 	BUG_ON(list_empty(&path->nodes[level]->dirty));
 	return 0;
 }
@@ -747,17 +741,16 @@ static int split_node(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_header_parentid(&split->header,
 	                       btrfs_header_parentid(&root->node->node.header));
 	mid = (c_nritems + 1) / 2;
-	memcpy(split->keys, c->keys + mid,
-		(c_nritems - mid) * sizeof(struct btrfs_disk_key));
-	memcpy(split->blockptrs, c->blockptrs + mid,
-		(c_nritems - mid) * sizeof(u64));
+	memcpy(split->ptrs, c->ptrs + mid,
+		(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
 	btrfs_set_header_nritems(&split->header, c_nritems - mid);
 	btrfs_set_header_nritems(&c->header, mid);
 	ret = 0;
 
 	BUG_ON(list_empty(&t->dirty));
-	wret = insert_ptr(root, path, split->keys, split_buffer->blocknr,
-			  path->slots[level + 1] + 1, level + 1);
+	wret = insert_ptr(root, path, &split->ptrs[0].key,
+			  split_buffer->blocknr, path->slots[level + 1] + 1,
+			  level + 1);
 	if (wret)
 		ret = wret;
 
@@ -825,7 +818,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 	right_buf = read_tree_block(root, btrfs_node_blockptr(&upper->node,
 							      slot + 1));
 	right = &right_buf->leaf;
-	free_space = btrfs_leaf_free_space(right);
+	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, right_buf);
 		return 1;
@@ -833,7 +826,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 	/* cow and double check */
 	btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf);
 	right = &right_buf->leaf;
-	free_space = btrfs_leaf_free_space(right);
+	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, right_buf);
 		return 1;
@@ -857,15 +850,14 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 	right_nritems = btrfs_header_nritems(&right->header);
 	/* push left to right */
 	push_space = btrfs_item_end(left->items + left_nritems - push_items);
-	push_space -= leaf_data_end(left);
+	push_space -= leaf_data_end(root, left);
 	/* make room in the right data area */
-	memmove(right->data + leaf_data_end(right) - push_space,
-		right->data + leaf_data_end(right),
-		LEAF_DATA_SIZE - leaf_data_end(right));
+	memmove(btrfs_leaf_data(right) + leaf_data_end(root, right) -
+		push_space, btrfs_leaf_data(right) + leaf_data_end(root, right),
+		BTRFS_LEAF_DATA_SIZE(root) - leaf_data_end(root, right));
 	/* copy from the left data area */
-	memcpy(right->data + LEAF_DATA_SIZE - push_space,
-		left->data + leaf_data_end(left),
-		push_space);
+	memcpy(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - push_space,
+		btrfs_leaf_data(left) + leaf_data_end(root, left), push_space);
 	memmove(right->items + push_items, right->items,
 		right_nritems * sizeof(struct btrfs_item));
 	/* copy the items from left to right */
@@ -875,7 +867,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 	/* update the item pointers */
 	right_nritems += push_items;
 	btrfs_set_header_nritems(&right->header, right_nritems);
-	push_space = LEAF_DATA_SIZE;
+	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
 		btrfs_set_item_offset(right->items + i, push_space -
 				      btrfs_item_size(right->items + i));
@@ -886,7 +878,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 
 	BUG_ON(list_empty(&left_buf->dirty));
 	BUG_ON(list_empty(&right_buf->dirty));
-	memcpy(upper->node.keys + slot + 1,
+	memcpy(&upper->node.ptrs[slot + 1].key,
 		&right->items[0].key, sizeof(struct btrfs_disk_key));
 	BUG_ON(list_empty(&upper->dirty));
 
@@ -932,7 +924,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 	t = read_tree_block(root, btrfs_node_blockptr(&path->nodes[1]->node,
 						      slot - 1));
 	left = &t->leaf;
-	free_space = btrfs_leaf_free_space(left);
+	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, t);
 		return 1;
@@ -941,7 +933,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 	/* cow and double check */
 	btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t);
 	left = &t->leaf;
-	free_space = btrfs_leaf_free_space(left);
+	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, t);
 		return 1;
@@ -964,17 +956,19 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 	/* push data from right to left */
 	memcpy(left->items + btrfs_header_nritems(&left->header),
 		right->items, push_items * sizeof(struct btrfs_item));
-	push_space = LEAF_DATA_SIZE -
+	push_space = BTRFS_LEAF_DATA_SIZE(root) -
 		     btrfs_item_offset(right->items + push_items -1);
-	memcpy(left->data + leaf_data_end(left) - push_space,
-		right->data + btrfs_item_offset(right->items + push_items - 1),
+	memcpy(btrfs_leaf_data(left) + leaf_data_end(root, left) - push_space,
+		btrfs_leaf_data(right) +
+		btrfs_item_offset(right->items + push_items - 1),
 		push_space);
 	old_left_nritems = btrfs_header_nritems(&left->header);
 	BUG_ON(old_left_nritems < 0);
 
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
-		u16 ioff = btrfs_item_offset(left->items + i);
-		btrfs_set_item_offset(left->items + i, ioff - (LEAF_DATA_SIZE -
+		u32 ioff = btrfs_item_offset(left->items + i);
+		btrfs_set_item_offset(left->items + i, ioff -
+				     (BTRFS_LEAF_DATA_SIZE(root) -
 				      btrfs_item_offset(left->items +
 						        old_left_nritems - 1)));
 	}
@@ -982,16 +976,17 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 
 	/* fixup right node */
 	push_space = btrfs_item_offset(right->items + push_items - 1) -
-		     leaf_data_end(right);
-	memmove(right->data + LEAF_DATA_SIZE - push_space, right->data +
-		leaf_data_end(right), push_space);
+		     leaf_data_end(root, right);
+	memmove(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		push_space, btrfs_leaf_data(right) +
+		leaf_data_end(root, right), push_space);
 	memmove(right->items, right->items + push_items,
 		(btrfs_header_nritems(&right->header) - push_items) *
 		sizeof(struct btrfs_item));
 	btrfs_set_header_nritems(&right->header,
 				 btrfs_header_nritems(&right->header) -
 				 push_items);
-	push_space = LEAF_DATA_SIZE;
+	push_space = BTRFS_LEAF_DATA_SIZE(root);
 
 	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
 		btrfs_set_item_offset(right->items + i, push_space -
@@ -1051,12 +1046,12 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		if (wret < 0)
 			return wret;
 	}
-
 	l_buf = path->nodes[0];
 	l = &l_buf->leaf;
 
 	/* did the pushes work? */
-	if (btrfs_leaf_free_space(l) >= sizeof(struct btrfs_item) + data_size)
+	if (btrfs_leaf_free_space(root, l) >=
+	    sizeof(struct btrfs_item) + data_size)
 		return 0;
 
 	if (!path->nodes[1]) {
@@ -1071,16 +1066,16 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	BUG_ON(!right_buffer);
 	BUG_ON(mid == nritems);
 	right = &right_buffer->leaf;
-	memset(right, 0, sizeof(*right));
+	memset(&right->header, 0, sizeof(right->header));
 	if (mid <= slot) {
 		/* FIXME, just alloc a new leaf here */
 		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
-			LEAF_DATA_SIZE)
+			BTRFS_LEAF_DATA_SIZE(root))
 			BUG();
 	} else {
 		/* FIXME, just alloc a new leaf here */
 		if (leaf_space_used(l, 0, mid + 1) + space_needed >
-			LEAF_DATA_SIZE)
+			BTRFS_LEAF_DATA_SIZE(root))
 			BUG();
 	}
 	btrfs_set_header_nritems(&right->header, nritems - mid);
@@ -1088,15 +1083,18 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	                       btrfs_header_parentid(&root->node->node.header));
-	data_copy_size = btrfs_item_end(l->items + mid) - leaf_data_end(l);
+	data_copy_size = btrfs_item_end(l->items + mid) -
+			 leaf_data_end(root, l);
 	memcpy(right->items, l->items + mid,
 	       (nritems - mid) * sizeof(struct btrfs_item));
-	memcpy(right->data + LEAF_DATA_SIZE - data_copy_size,
-	       l->data + leaf_data_end(l), data_copy_size);
-	rt_data_off = LEAF_DATA_SIZE - btrfs_item_end(l->items + mid);
+	memcpy(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		data_copy_size, btrfs_leaf_data(l) +
+		leaf_data_end(root, l), data_copy_size);
+	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+		      btrfs_item_end(l->items + mid);
 
 	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
-		u16 ioff = btrfs_item_offset(right->items + i);
+		u32 ioff = btrfs_item_offset(right->items + i);
 		btrfs_set_item_offset(right->items + i, ioff + rt_data_off);
 	}
 
@@ -1156,9 +1154,9 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 	leaf = &leaf_buf->leaf;
 
 	nritems = btrfs_header_nritems(&leaf->header);
-	data_end = leaf_data_end(leaf);
+	data_end = leaf_data_end(root, leaf);
 
-	if (btrfs_leaf_free_space(leaf) <
+	if (btrfs_leaf_free_space(root, leaf) <
 	    sizeof(struct btrfs_item) + data_size)
 		BUG();
 
@@ -1173,7 +1171,7 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 		 */
 		/* first correct the data pointers */
 		for (i = slot; i < nritems; i++) {
-			u16 ioff = btrfs_item_offset(leaf->items + i);
+			u32 ioff = btrfs_item_offset(leaf->items + i);
 			btrfs_set_item_offset(leaf->items + i,
 					      ioff - data_size);
 		}
@@ -1183,7 +1181,8 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 		        (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
-		memmove(leaf->data + data_end - data_size, leaf->data +
+		memmove(btrfs_leaf_data(leaf) + data_end - data_size,
+			btrfs_leaf_data(leaf) +
 		        data_end, old_data - data_end);
 		data_end = old_data;
 	}
@@ -1192,7 +1191,7 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 		sizeof(struct btrfs_disk_key));
 	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
 	btrfs_set_item_size(leaf->items + slot, data_size);
-	memcpy(leaf->data + data_end - data_size, data, data_size);
+	memcpy(btrfs_leaf_data(leaf) + data_end - data_size, data, data_size);
 	btrfs_set_header_nritems(&leaf->header, nritems + 1);
 
 	ret = 0;
@@ -1200,9 +1199,9 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 		ret = fixup_low_keys(root, &path, &disk_key, 1);
 
 	BUG_ON(list_empty(&leaf_buf->dirty));
-	if (btrfs_leaf_free_space(leaf) < 0)
+	if (btrfs_leaf_free_space(root, leaf) < 0)
 		BUG();
-	check_leaf(&path, 0);
+	check_leaf(root, &path, 0);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
@@ -1227,11 +1226,8 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
 	node = &parent->node;
 	nritems = btrfs_header_nritems(&node->header);
 	if (slot != nritems -1) {
-		memmove(node->keys + slot, node->keys + slot + 1,
-			sizeof(struct btrfs_disk_key) * (nritems - slot - 1));
-		memmove(node->blockptrs + slot,
-			node->blockptrs + slot + 1,
-			sizeof(u64) * (nritems - slot - 1));
+		memmove(node->ptrs + slot, node->ptrs + slot + 1,
+			sizeof(struct btrfs_key_ptr) * (nritems - slot - 1));
 	}
 	nritems--;
 	btrfs_set_header_nritems(&node->header, nritems);
@@ -1240,7 +1236,8 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
 		/* just turn the root into a leaf and break */
 		btrfs_set_header_level(&root->node->node.header, 0);
 	} else if (slot == 0) {
-		wret = fixup_low_keys(root, path, node->keys, level + 1);
+		wret = fixup_low_keys(root, path, &node->ptrs[0].key,
+				      level + 1);
 		if (wret)
 			ret = wret;
 	}
@@ -1272,12 +1269,12 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 
 	if (slot != nritems - 1) {
 		int i;
-		int data_end = leaf_data_end(leaf);
-		memmove(leaf->data + data_end + dsize,
-			leaf->data + data_end,
+		int data_end = leaf_data_end(root, leaf);
+		memmove(btrfs_leaf_data(leaf) + data_end + dsize,
+			btrfs_leaf_data(leaf) + data_end,
 			doff - data_end);
 		for (i = slot + 1; i < nritems; i++) {
-			u16 ioff = btrfs_item_offset(leaf->items + i);
+			u32 ioff = btrfs_item_offset(leaf->items + i);
 			btrfs_set_item_offset(leaf->items + i, ioff + dsize);
 		}
 		memmove(leaf->items + slot, leaf->items + slot + 1,
@@ -1311,7 +1308,7 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 		BUG_ON(list_empty(&leaf_buf->dirty));
 
 		/* delete the leaf if it is mostly empty */
-		if (used < LEAF_DATA_SIZE / 3) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 05c7707263f..c61ad0f69be 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -5,7 +5,6 @@
 #include "kerncompat.h"
 
 #define BTRFS_MAGIC "_BtRfS_M"
-#define BTRFS_BLOCKSIZE 1024
 
 #define BTRFS_ROOT_TREE_OBJECTID 1
 #define BTRFS_EXTENT_TREE_OBJECTID 2
@@ -52,8 +51,11 @@ struct btrfs_header {
 } __attribute__ ((__packed__));
 
 #define BTRFS_MAX_LEVEL 8
-#define NODEPTRS_PER_BLOCK ((BTRFS_BLOCKSIZE - sizeof(struct btrfs_header)) / \
-			    (sizeof(struct btrfs_disk_key) + sizeof(u64)))
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->blocksize - \
+			        sizeof(struct btrfs_header)) / \
+			       (sizeof(struct btrfs_disk_key) + sizeof(u64)))
+#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize))
 
 struct btrfs_buffer;
 
@@ -86,6 +88,7 @@ struct btrfs_root {
 	int ref_cows;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
+	u32 blocksize;
 };
 
 /*
@@ -97,7 +100,7 @@ struct btrfs_super_block {
 	__le64 blocknr; /* this block number */
 	__le32 csum;
 	__le64 magic;
-	__le16 blocksize;
+	__le32 blocksize;
 	__le64 generation;
 	__le64 root;
 	__le64 total_blocks;
@@ -111,7 +114,7 @@ struct btrfs_super_block {
  */
 struct btrfs_item {
 	struct btrfs_disk_key key;
-	__le16 offset;
+	__le32 offset;
 	__le16 size;
 } __attribute__ ((__packed__));
 
@@ -122,24 +125,23 @@ struct btrfs_item {
  * The data is separate from the items to get the keys closer together
  * during searches.
  */
-#define LEAF_DATA_SIZE (BTRFS_BLOCKSIZE - sizeof(struct btrfs_header))
 struct btrfs_leaf {
 	struct btrfs_header header;
-	union {
-		struct btrfs_item items[LEAF_DATA_SIZE/
-				        sizeof(struct btrfs_item)];
-		u8 data[BTRFS_BLOCKSIZE - sizeof(struct btrfs_header)];
-	};
+	struct btrfs_item items[];
 } __attribute__ ((__packed__));
 
 /*
  * all non-leaf blocks are nodes, they hold only keys and pointers to
  * other blocks
  */
+struct btrfs_key_ptr {
+	struct btrfs_disk_key key;
+	__le64 blockptr;
+} __attribute__ ((__packed__));
+
 struct btrfs_node {
 	struct btrfs_header header;
-	struct btrfs_disk_key keys[NODEPTRS_PER_BLOCK];
-	__le64 blockptrs[NODEPTRS_PER_BLOCK];
+	struct btrfs_key_ptr ptrs[];
 } __attribute__ ((__packed__));
 
 /*
@@ -186,28 +188,28 @@ static inline void btrfs_set_extent_refs(struct btrfs_extent_item *ei, u32 val)
 
 static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr)
 {
-	return le64_to_cpu(n->blockptrs[nr]);
+	return le64_to_cpu(n->ptrs[nr].blockptr);
 }
 
 static inline void btrfs_set_node_blockptr(struct btrfs_node *n, int nr,
 					   u64 val)
 {
-	n->blockptrs[nr] = cpu_to_le64(val);
+	n->ptrs[nr].blockptr = cpu_to_le64(val);
 }
 
-static inline u16 btrfs_item_offset(struct btrfs_item *item)
+static inline u32 btrfs_item_offset(struct btrfs_item *item)
 {
-	return le16_to_cpu(item->offset);
+	return le32_to_cpu(item->offset);
 }
 
-static inline void btrfs_set_item_offset(struct btrfs_item *item, u16 val)
+static inline void btrfs_set_item_offset(struct btrfs_item *item, u32 val)
 {
-	item->offset = cpu_to_le16(val);
+	item->offset = cpu_to_le32(val);
 }
 
-static inline u16 btrfs_item_end(struct btrfs_item *item)
+static inline u32 btrfs_item_end(struct btrfs_item *item)
 {
-	return le16_to_cpu(item->offset) + le16_to_cpu(item->size);
+	return le32_to_cpu(item->offset) + le16_to_cpu(item->size);
 }
 
 static inline u16 btrfs_item_size(struct btrfs_item *item)
@@ -390,20 +392,26 @@ static inline void btrfs_set_super_blocks_used(struct btrfs_super_block *s,
 	s->blocks_used = cpu_to_le64(val);
 }
 
-static inline u16 btrfs_super_blocksize(struct btrfs_super_block *s)
+static inline u32 btrfs_super_blocksize(struct btrfs_super_block *s)
 {
-	return le16_to_cpu(s->blocksize);
+	return le32_to_cpu(s->blocksize);
 }
 
 static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s,
-						u16 val)
+						u32 val)
+{
+	s->blocksize = cpu_to_le32(val);
+}
+
+static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
-	s->blocksize = cpu_to_le16(val);
+	return (u8 *)l->items;
 }
 
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
-	((type *)((leaf)->data + btrfs_item_offset((leaf)->items + (slot))))
+	((type *)(btrfs_leaf_data(leaf) + \
+	btrfs_item_offset((leaf)->items + (slot))))
 
 struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
 int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
@@ -416,7 +424,7 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key,
 		void *data, int data_size);
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_leaf_free_space(struct btrfs_leaf *leaf);
+int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap);
 int btrfs_finish_extent_commit(struct btrfs_root *root);
 int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3d4bf6833f2..8d9457b5aef 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,7 +46,8 @@ struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
 {
 	struct btrfs_buffer *buf;
 	int ret;
-	buf = malloc(sizeof(struct btrfs_buffer));
+
+	buf = malloc(sizeof(struct btrfs_buffer) + root->blocksize);
 	if (!buf)
 		return buf;
 	allocated_blocks++;
@@ -84,7 +85,7 @@ struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr)
 
 struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	loff_t offset = blocknr * BTRFS_BLOCKSIZE;
+	loff_t offset = blocknr * root->blocksize;
 	struct btrfs_buffer *buf;
 	int ret;
 
@@ -95,8 +96,8 @@ struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		buf = alloc_tree_block(root, blocknr);
 		if (!buf)
 			return NULL;
-		ret = pread(root->fp, &buf->node, BTRFS_BLOCKSIZE, offset);
-		if (ret != BTRFS_BLOCKSIZE) {
+		ret = pread(root->fp, &buf->node, root->blocksize, offset);
+		if (ret != root->blocksize) {
 			free(buf);
 			return NULL;
 		}
@@ -127,13 +128,13 @@ int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 {
 	u64 blocknr = buf->blocknr;
-	loff_t offset = blocknr * BTRFS_BLOCKSIZE;
+	loff_t offset = blocknr * root->blocksize;
 	int ret;
 
 	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
-	ret = pwrite(root->fp, &buf->node, BTRFS_BLOCKSIZE, offset);
-	if (ret != BTRFS_BLOCKSIZE)
+	ret = pwrite(root->fp, &buf->node, root->blocksize, offset);
+	if (ret != root->blocksize)
 		return ret;
 	return 0;
 }
@@ -215,7 +216,8 @@ int btrfs_commit_transaction(struct btrfs_root *root,
 	return ret;
 }
 
-static int __setup_root(struct btrfs_root *root, u64 objectid, int fp)
+static int __setup_root(struct btrfs_super_block *super,
+			struct btrfs_root *root, u64 objectid, int fp)
 {
 	INIT_LIST_HEAD(&root->trans);
 	INIT_LIST_HEAD(&root->cache);
@@ -223,6 +225,8 @@ static int __setup_root(struct btrfs_root *root, u64 objectid, int fp)
 	root->fp = fp;
 	root->node = NULL;
 	root->commit_root = NULL;
+	root->blocksize = btrfs_super_blocksize(super);
+	root->ref_cows = 0;
 	memset(&root->current_insert, 0, sizeof(root->current_insert));
 	memset(&root->last_insert, 0, sizeof(root->last_insert));
 	memset(&root->root_key, 0, sizeof(root->root_key));
@@ -230,19 +234,19 @@ static int __setup_root(struct btrfs_root *root, u64 objectid, int fp)
 	return 0;
 }
 
-static int find_and_setup_root(struct btrfs_root *tree_root, u64 objectid,
-			struct btrfs_root *root, int fp)
+static int find_and_setup_root(struct btrfs_super_block *super,
+			       struct btrfs_root *tree_root, u64 objectid,
+			       struct btrfs_root *root, int fp)
 {
 	int ret;
 
-	__setup_root(root, objectid, fp);
+	__setup_root(super, root, objectid, fp);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
 
 	root->node = read_tree_block(root,
 				     btrfs_root_blocknr(&root->root_item));
-	root->ref_cows = 0;
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -277,28 +281,28 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	INIT_RADIX_TREE(&tree_root->cache_radix, GFP_KERNEL);
 
 	ret = pread(fp, super, sizeof(struct btrfs_super_block),
-		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
+		     BTRFS_SUPER_INFO_OFFSET);
 	if (ret == 0 || btrfs_super_root(super) == 0) {
 		printf("making new FS!\n");
-		ret = mkfs(fp, 0, BTRFS_BLOCKSIZE);
+		ret = mkfs(fp, 0, 1024);
 		if (ret)
 			return NULL;
 		ret = pread(fp, super, sizeof(struct btrfs_super_block),
-			     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
+			     BTRFS_SUPER_INFO_OFFSET);
 		if (ret != sizeof(struct btrfs_super_block))
 			return NULL;
 	}
 	BUG_ON(ret < 0);
 
-	__setup_root(tree_root, BTRFS_ROOT_TREE_OBJECTID, fp);
+	__setup_root(super, tree_root, BTRFS_ROOT_TREE_OBJECTID, fp);
 	tree_root->node = read_tree_block(tree_root, btrfs_super_root(super));
 	BUG_ON(!tree_root->node);
 
-	ret = find_and_setup_root(tree_root, BTRFS_EXTENT_TREE_OBJECTID,
+	ret = find_and_setup_root(super, tree_root, BTRFS_EXTENT_TREE_OBJECTID,
 				  extent_root, fp);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(tree_root, BTRFS_FS_TREE_OBJECTID,
+	ret = find_and_setup_root(super, tree_root, BTRFS_FS_TREE_OBJECTID,
 				  root, fp);
 	BUG_ON(ret);
 
@@ -313,7 +317,7 @@ int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s)
 	int ret;
 	btrfs_set_super_root(s, root->tree_root->node->blocknr);
 	ret = pwrite(root->fp, s, sizeof(*s),
-		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
+		     BTRFS_SUPER_INFO_OFFSET);
 	if (ret != sizeof(*s)) {
 		fprintf(stderr, "failed to write new super block err %d\n", ret);
 		return ret;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c22a61f9233..5771bb90acb 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -5,12 +5,12 @@
 struct btrfs_buffer {
 	u64 blocknr;
 	int count;
+	struct list_head dirty;
+	struct list_head cache;
 	union {
 		struct btrfs_node node;
 		struct btrfs_leaf leaf;
 	};
-	struct list_head dirty;
-	struct list_head cache;
 };
 
 struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
@@ -24,9 +24,8 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
 void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
 int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s);
-int mkfs(int fd, u64 num_blocks, u16 blocksize);
-
+int mkfs(int fd, u64 num_blocks, u32 blocksize);
 
-#define BTRFS_SUPER_INFO_OFFSET(bs) (16 * (bs))
+#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3767744f659..d4f1ec32839 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -143,7 +143,6 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 	struct btrfs_key key;
 	struct btrfs_root *extent_root = root->extent_root;
 	int ret;
-	struct btrfs_item *item;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key ins;
 	u32 refs;
@@ -161,9 +160,8 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 		printf("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
-	item = path.nodes[0]->leaf.items + path.slots[0];
-	ei = (struct btrfs_extent_item *)(path.nodes[0]->leaf.data +
-				    btrfs_item_offset(item));
+	ei = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+			    struct btrfs_extent_item);
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index dd14ed4fea6..317d20ce759 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -10,19 +10,20 @@
 #include "ctree.h"
 #include "disk-io.h"
 
-int mkfs(int fd, u64 num_blocks, u16 blocksize)
+int mkfs(int fd, u64 num_blocks, u32 blocksize)
 {
 	struct btrfs_super_block super;
-	struct btrfs_leaf empty_leaf;
+	struct btrfs_leaf *empty_leaf;
 	struct btrfs_root_item root_item;
 	struct btrfs_item item;
 	struct btrfs_extent_item extent_item;
 	char *block;
 	int ret;
-	u16 itemoff;
+	u32 itemoff;
+	u32 start_block = BTRFS_SUPER_INFO_OFFSET / blocksize;
 
-	btrfs_set_super_blocknr(&super, 16);
-	btrfs_set_super_root(&super, 17);
+	btrfs_set_super_blocknr(&super, start_block);
+	btrfs_set_super_root(&super, start_block + 1);
 	strcpy((char *)(&super.magic), BTRFS_MAGIC);
 	btrfs_set_super_blocksize(&super, blocksize);
 	btrfs_set_super_total_blocks(&super, num_blocks);
@@ -32,168 +33,98 @@ int mkfs(int fd, u64 num_blocks, u16 blocksize)
 	memset(block, 0, blocksize);
 	BUG_ON(sizeof(super) > blocksize);
 	memcpy(block, &super, sizeof(super));
-	ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET(blocksize));
+	ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET);
 	BUG_ON(ret != blocksize);
 
 	/* create the tree of root objects */
-	memset(&empty_leaf, 0, sizeof(empty_leaf));
-	btrfs_set_header_parentid(&empty_leaf.header, BTRFS_ROOT_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf.header, 17);
-	btrfs_set_header_nritems(&empty_leaf.header, 2);
+	empty_leaf = malloc(blocksize);
+	memset(empty_leaf, 0, blocksize);
+	btrfs_set_header_parentid(&empty_leaf->header,
+				  BTRFS_ROOT_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 1);
+	btrfs_set_header_nritems(&empty_leaf->header, 2);
 
 	/* create the items for the root tree */
-	btrfs_set_root_blocknr(&root_item, 18);
+	btrfs_set_root_blocknr(&root_item, start_block + 2);
 	btrfs_set_root_refs(&root_item, 1);
-	itemoff = LEAF_DATA_SIZE - sizeof(root_item);
+	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(root_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_item_size(&item, sizeof(root_item));
 	btrfs_set_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID);
 	btrfs_set_key_offset(&item.key, 0);
 	btrfs_set_key_flags(&item.key, 0);
-	memcpy(empty_leaf.items, &item, sizeof(item));
-	memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item));
+	memcpy(empty_leaf->items, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
+		&root_item, sizeof(root_item));
 
-	btrfs_set_root_blocknr(&root_item, 19);
+	btrfs_set_root_blocknr(&root_item, start_block + 3);
 	itemoff = itemoff - sizeof(root_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
-	memcpy(empty_leaf.items + 1, &item, sizeof(item));
-	memcpy(empty_leaf.data + itemoff, &root_item, sizeof(root_item));
-	ret = pwrite(fd, &empty_leaf, blocksize, 17 * blocksize);
+	memcpy(empty_leaf->items + 1, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
+		&root_item, sizeof(root_item));
+	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 1) * blocksize);
 
 	/* create the items for the extent tree */
-	btrfs_set_header_parentid(&empty_leaf.header,
+	btrfs_set_header_parentid(&empty_leaf->header,
 				  BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf.header, 18);
-	btrfs_set_header_nritems(&empty_leaf.header, 4);
+	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 2);
+	btrfs_set_header_nritems(&empty_leaf->header, 4);
 
 	/* item1, reserve blocks 0-16 */
 	btrfs_set_key_objectid(&item.key, 0);
-	btrfs_set_key_offset(&item.key, 17);
+	btrfs_set_key_offset(&item.key, start_block + 1);
 	btrfs_set_key_flags(&item.key, 0);
-	itemoff = LEAF_DATA_SIZE - sizeof(struct btrfs_extent_item);
+	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) -
+			sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item));
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, 0);
-	memcpy(empty_leaf.items, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
+	memcpy(empty_leaf->items, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
+		&extent_item, btrfs_item_size(&item));
 
 	/* item2, give block 17 to the root */
-	btrfs_set_key_objectid(&item.key, 17);
+	btrfs_set_key_objectid(&item.key, start_block + 1);
 	btrfs_set_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID);
-	memcpy(empty_leaf.items + 1, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
+	memcpy(empty_leaf->items + 1, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
+		&extent_item, btrfs_item_size(&item));
 
 	/* item3, give block 18 to the extent root */
-	btrfs_set_key_objectid(&item.key, 18);
+	btrfs_set_key_objectid(&item.key, start_block + 2);
 	btrfs_set_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID);
-	memcpy(empty_leaf.items + 2, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
+	memcpy(empty_leaf->items + 2, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
+		&extent_item, btrfs_item_size(&item));
 
 	/* item4, give block 19 to the FS root */
-	btrfs_set_key_objectid(&item.key, 19);
+	btrfs_set_key_objectid(&item.key, start_block + 3);
 	btrfs_set_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
-	memcpy(empty_leaf.items + 3, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
-	ret = pwrite(fd, &empty_leaf, blocksize, 18 * blocksize);
-	if (ret != sizeof(empty_leaf))
+	memcpy(empty_leaf->items + 3, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
+		&extent_item, btrfs_item_size(&item));
+	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize);
+	if (ret != blocksize)
 		return -1;
 
 	/* finally create the FS root */
-	btrfs_set_header_parentid(&empty_leaf.header, BTRFS_FS_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf.header, 19);
-	btrfs_set_header_nritems(&empty_leaf.header, 0);
-	ret = pwrite(fd, &empty_leaf, blocksize, 19 * blocksize);
-	if (ret != sizeof(empty_leaf))
+	btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 3);
+	btrfs_set_header_nritems(&empty_leaf->header, 0);
+	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 3) * blocksize);
+	if (ret != blocksize)
 		return -1;
 	return 0;
 }
-
-#if 0
-int mkfs(int fd)
-{
-	struct btrfs_root_info info[2];
-	struct btrfs_leaf empty_leaf;
-	struct btrfs_item item;
-	struct btrfs_extent_item extent_item;
-	int ret;
-
-	/* setup the super block area */
-	memset(info, 0, sizeof(info));
-	btrfs_set_root_blocknr(info, 16);
-	btrfs_set_root_objectid(info, 1);
-	btrfs_set_root_tree_root(info, 17);
-
-	btrfs_set_root_blocknr(info + 1, 16);
-	btrfs_set_root_objectid(info + 1, 2);
-	btrfs_set_root_tree_root(info + 1, 18);
-
-	ret = pwrite(fd, info, sizeof(info),
-		     BTRFS_SUPER_INFO_OFFSET(BTRFS_BLOCKSIZE));
-	if (ret != sizeof(info))
-		return -1;
-
-	/* create leaves for the tree root and extent root */
-	memset(&empty_leaf, 0, sizeof(empty_leaf));
-	btrfs_set_header_parentid(&empty_leaf.header, 1);
-	btrfs_set_header_blocknr(&empty_leaf.header, 17);
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 17 * BTRFS_BLOCKSIZE);
-	if (ret != sizeof(empty_leaf))
-		return -1;
-
-	btrfs_set_header_parentid(&empty_leaf.header, 2);
-	btrfs_set_header_blocknr(&empty_leaf.header, 18);
-	btrfs_set_header_nritems(&empty_leaf.header, 3);
-
-	/* item1, reserve blocks 0-16 */
-	btrfs_set_key_objectid(&item.key, 0);
-	btrfs_set_key_offset(&item.key, 17);
-	btrfs_set_key_flags(&item.key, 0);
-	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
-			      sizeof(struct btrfs_extent_item));
-	btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item));
-	btrfs_set_extent_refs(&extent_item, 1);
-	btrfs_set_extent_owner(&extent_item, 0);
-	memcpy(empty_leaf.items, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
-
-	/* item2, give block 17 to the root */
-	btrfs_set_key_objectid(&item.key, 17);
-	btrfs_set_key_offset(&item.key, 1);
-	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
-			      sizeof(struct btrfs_extent_item) * 2);
-	btrfs_set_extent_owner(&extent_item, 1);
-	memcpy(empty_leaf.items + 1, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
-
-	/* item3, give block 18 for the extent root */
-	btrfs_set_key_objectid(&item.key, 18);
-	btrfs_set_key_offset(&item.key, 1);
-	btrfs_set_item_offset(&item, LEAF_DATA_SIZE -
-			      sizeof(struct btrfs_extent_item) * 3);
-	btrfs_set_extent_owner(&extent_item, 2);
-	memcpy(empty_leaf.items + 2, &item, sizeof(item));
-	memcpy(empty_leaf.data + btrfs_item_offset(&item), &extent_item,
-		btrfs_item_size(&item));
-	ret = pwrite(fd, &empty_leaf, sizeof(empty_leaf), 18 * BTRFS_BLOCKSIZE);
-	if (ret != sizeof(empty_leaf))
-		return -1;
-	return 0;
-}
-#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index e769f36cf05..7a189eaa589 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -5,7 +5,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 
-void btrfs_print_leaf(struct btrfs_leaf *l)
+void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 {
 	int i;
 	u32 nr = btrfs_header_nritems(&l->header);
@@ -13,7 +13,8 @@ void btrfs_print_leaf(struct btrfs_leaf *l)
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	printf("leaf %Lu total ptrs %d free space %d\n",
-		btrfs_header_blocknr(&l->header), nr, btrfs_leaf_free_space(l));
+		btrfs_header_blocknr(&l->header), nr,
+		btrfs_leaf_free_space(root, l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
@@ -25,7 +26,7 @@ void btrfs_print_leaf(struct btrfs_leaf *l)
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
 		printf("\t\titem data %.*s\n", btrfs_item_size(item),
-			l->data + btrfs_item_offset(item));
+			btrfs_leaf_data(l) + btrfs_item_offset(item));
 		ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
 		printf("\t\textent data refs %u owner %Lu\n",
 			btrfs_extent_refs(ei), btrfs_extent_owner(ei));
@@ -46,18 +47,18 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 	c = &t->node;
 	nr = btrfs_header_nritems(&c->header);
 	if (btrfs_is_leaf(c)) {
-		btrfs_print_leaf((struct btrfs_leaf *)c);
+		btrfs_print_leaf(root, (struct btrfs_leaf *)c);
 		return;
 	}
 	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
 	        btrfs_header_level(&c->header), nr,
-		(u32)NODEPTRS_PER_BLOCK - nr);
+		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
 		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
-		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
-		       btrfs_node_blockptr(c, i));
+		       c->ptrs[i].key.objectid, c->ptrs[i].key.flags,
+		       c->ptrs[i].key.offset, btrfs_node_blockptr(c, i));
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e8d0b847c02..9236abda2a7 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,3 +1,3 @@
 
-void btrfs_print_leaf(struct btrfs_leaf *l);
+void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
 void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t);
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index 66bdc57905d..d9287cad35c 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -73,7 +73,7 @@ int main(int ac, char **av) {
 	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
 	        btrfs_header_level(&root->node->node.header),
 		btrfs_header_nritems(&root->node->node.header),
-		NODEPTRS_PER_BLOCK -
+		BTRFS_NODEPTRS_PER_BLOCK(root) -
 		btrfs_header_nritems(&root->node->node.header));
 	printf("all searches good, deleting some items\n");
 	i = 0;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ab90cfea98..03aa7c2c633 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -31,7 +31,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 		ret = 1;
 		goto out;
 	}
-	memcpy(item, l->data + btrfs_item_offset(l->items + slot),
+	memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item),
 		sizeof(*item));
 	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
 	btrfs_release_path(root, &path);
@@ -55,7 +55,7 @@ int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
 	BUG_ON(ret != 0);
 	l = &path.nodes[0]->leaf;
 	slot = path.slots[0];
-	memcpy(l->data + btrfs_item_offset(l->items + slot), item,
+	memcpy(btrfs_item_ptr(l, slot, struct btrfs_root_item), item,
 		sizeof(*item));
 out:
 	btrfs_release_path(root, &path);
-- 
cgit v1.2.3


From a1516c8921ce2c148b69493e858d95bc79c92f01 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Mar 2007 14:26:53 -0400
Subject: Btrfs: reorder key offset and flags

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO         |  1 -
 fs/btrfs/ctree.h      |  4 ++--
 fs/btrfs/print-tree.c | 12 +++++++-----
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index ff90e5eaf43..8142be88d71 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,7 +1,6 @@
 * cleanup, add more error checking, get rid of BUG_ONs
 * Make IO functions look more like the page cache
 * Fix ENOSPC handling
-* make blocksize a mkfs parameter instead of #define
 * make a real mkfs and superblock
 * Do checksumming
 * Define FS objects in terms of different item types
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c61ad0f69be..73ebc824924 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,14 +26,14 @@
  */
 struct btrfs_disk_key {
 	__le64 objectid;
-	__le32 flags;
 	__le64 offset;
+	__le32 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
 	u64 objectid;
-	u32 flags;
 	u64 offset;
+	u32 flags;
 } __attribute__ ((__packed__));
 
 /*
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 7a189eaa589..4c710190343 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -18,11 +18,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
-		printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
+		printf("\titem %d key (%Lu %Lu %u) itemoff %d itemsize %d\n",
 			i,
 			btrfs_key_objectid(&item->key),
-			btrfs_key_flags(&item->key),
 			btrfs_key_offset(&item->key),
+			btrfs_key_flags(&item->key),
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
 		printf("\t\titem data %.*s\n", btrfs_item_size(item),
@@ -55,10 +55,12 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
+		printf("\tkey %d (%Lu %Lu %u) block %Lu\n",
 		       i,
-		       c->ptrs[i].key.objectid, c->ptrs[i].key.flags,
-		       c->ptrs[i].key.offset, btrfs_node_blockptr(c, i));
+		       c->ptrs[i].key.objectid,
+		       c->ptrs[i].key.offset,
+		       c->ptrs[i].key.flags,
+		       btrfs_node_blockptr(c, i));
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
-- 
cgit v1.2.3


From 62e2749e03a855d98855f9ce032dbe72d5fad148 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Mar 2007 12:56:47 -0400
Subject: Btrfs: Use a chunk of the key flags to record the item type. Add
 (untested and simple) directory item code. Fix comp_keys to use the new key
 ordering. Add btrfs_insert_empty_item.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/TODO          |   1 +
 fs/btrfs/ctree.c       |  51 +++++++++-----
 fs/btrfs/ctree.h       | 182 ++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/dir-item.c    | 102 +++++++++++++++++++++++++++
 fs/btrfs/extent-tree.c |  17 +++--
 fs/btrfs/hash.c        |  86 +++++++++++++++++++++++
 fs/btrfs/hasher.c      |  23 +++++++
 fs/btrfs/kerncompat.h  |   1 +
 fs/btrfs/mkfs.c        |  28 ++++----
 fs/btrfs/print-tree.c  |  36 +++++++---
 fs/btrfs/quick-test.c  |   3 +-
 fs/btrfs/random-test.c |  10 ++-
 fs/btrfs/root-tree.c   |   2 +-
 14 files changed, 435 insertions(+), 109 deletions(-)
 create mode 100644 fs/btrfs/dir-item.c
 create mode 100644 fs/btrfs/hash.c
 create mode 100644 fs/btrfs/hasher.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 13ed6313ac8..ee78cdd08c0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,7 @@ CC=gcc
 CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
-	  root-tree.o
+	  root-tree.o dir-item.o hash.o
 
 # if you don't have sparse installed, use ls instead
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 8142be88d71..249e2e90bea 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,6 +8,7 @@
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
+* Do actual block accounting
 * Port into the kernel
 * Add virtual filesystems, mountable snapshots
 * Get rid of struct ctree_path, limiting tree levels held at one time
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ece8de7f38e..7edfbd46804 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -107,14 +107,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 		return 1;
 	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1.flags > k2->flags)
-		return 1;
-	if (k1.flags < k2->flags)
-		return -1;
 	if (k1.offset > k2->offset)
 		return 1;
 	if (k1.offset < k2->offset)
 		return -1;
+	if (k1.flags > k2->flags)
+		return 1;
+	if (k1.flags < k2->flags)
+		return -1;
 	return 0;
 }
 
@@ -1122,8 +1122,8 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
-			  void *data, int data_size)
+int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 data_size)
 {
 	int ret = 0;
 	int slot;
@@ -1132,7 +1132,6 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 	struct btrfs_buffer *leaf_buf;
 	u32 nritems;
 	unsigned int data_end;
-	struct btrfs_path path;
 	struct btrfs_disk_key disk_key;
 
 	btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -1140,17 +1139,16 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, cpu_key, &path, data_size, 1);
+	ret = btrfs_search_slot(root, cpu_key, path, data_size, 1);
 	if (ret == 0) {
-		btrfs_release_path(root, &path);
+		btrfs_release_path(root, path);
 		return -EEXIST;
 	}
 	if (ret < 0)
 		goto out;
 
-	slot_orig = path.slots[0];
-	leaf_buf = path.nodes[0];
+	slot_orig = path->slots[0];
+	leaf_buf = path->nodes[0];
 	leaf = &leaf_buf->leaf;
 
 	nritems = btrfs_header_nritems(&leaf->header);
@@ -1160,7 +1158,7 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 	    sizeof(struct btrfs_item) + data_size)
 		BUG();
 
-	slot = path.slots[0];
+	slot = path->slots[0];
 	BUG_ON(slot < 0);
 	if (slot != nritems) {
 		int i;
@@ -1186,23 +1184,42 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
 		        data_end, old_data - data_end);
 		data_end = old_data;
 	}
-	/* copy the new data in */
+	/* setup the item for the new data */
 	memcpy(&leaf->items[slot].key, &disk_key,
 		sizeof(struct btrfs_disk_key));
 	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
 	btrfs_set_item_size(leaf->items + slot, data_size);
-	memcpy(btrfs_leaf_data(leaf) + data_end - data_size, data, data_size);
 	btrfs_set_header_nritems(&leaf->header, nritems + 1);
 
 	ret = 0;
 	if (slot == 0)
-		ret = fixup_low_keys(root, &path, &disk_key, 1);
+		ret = fixup_low_keys(root, path, &disk_key, 1);
 
 	BUG_ON(list_empty(&leaf_buf->dirty));
 	if (btrfs_leaf_free_space(root, leaf) < 0)
 		BUG();
-	check_leaf(root, &path, 0);
+	check_leaf(root, path, 0);
 out:
+	return ret;
+}
+
+/*
+ * Given a key and some data, insert an item into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
+			  void *data, u32 data_size)
+{
+	int ret = 0;
+	struct btrfs_path path;
+	u8 *ptr;
+
+	btrfs_init_path(&path);
+	ret = btrfs_insert_empty_item(root, &path, cpu_key, data_size);
+	if (!ret) {
+		ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8);
+		memcpy(ptr, data, data_size);
+	}
 	btrfs_release_path(root, &path);
 	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 73ebc824924..e8a26fd8ea9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -58,39 +58,6 @@ struct btrfs_header {
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize))
 
 struct btrfs_buffer;
-
-struct btrfs_root_item {
-	__le64 blocknr;
-	__le32 flags;
-	__le64 block_limit;
-	__le64 blocks_used;
-	__le32 refs;
-};
-
-/*
- * in ram representation of the tree.  extent_root is used for all allocations
- * and for the extent tree extent_root root.  current_insert is used
- * only for the extent tree.
- */
-struct btrfs_root {
-	struct btrfs_buffer *node;
-	struct btrfs_buffer *commit_root;
-	struct btrfs_root *extent_root;
-	struct btrfs_root *tree_root;
-	struct btrfs_key current_insert;
-	struct btrfs_key last_insert;
-	int fp;
-	struct radix_tree_root cache_radix;
-	struct radix_tree_root pinned_radix;
-	struct list_head trans;
-	struct list_head cache;
-	int cache_size;
-	int ref_cows;
-	struct btrfs_root_item root_item;
-	struct btrfs_key root_key;
-	u32 blocksize;
-};
-
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -108,8 +75,7 @@ struct btrfs_super_block {
 } __attribute__ ((__packed__));
 
 /*
- * A leaf is full of items.  The exact type of item is defined by
- * the key flags parameter.  offset and size tell us where to find
+ * A leaf is full of items. offset and size tell us where to find
  * the item in the leaf (relative to the start of the data area)
  */
 struct btrfs_item {
@@ -144,15 +110,6 @@ struct btrfs_node {
 	struct btrfs_key_ptr ptrs[];
 } __attribute__ ((__packed__));
 
-/*
- * items in the extent btree are used to record the objectid of the
- * owner of the block and the number of references
- */
-struct btrfs_extent_item {
-	__le32 refs;
-	__le64 owner;
-} __attribute__ ((__packed__));
-
 /*
  * btrfs_paths remember the path taken from the root down to the leaf.
  * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
@@ -166,6 +123,94 @@ struct btrfs_path {
 	int slots[BTRFS_MAX_LEVEL];
 };
 
+/*
+ * items in the extent btree are used to record the objectid of the
+ * owner of the block and the number of references
+ */
+struct btrfs_extent_item {
+	__le32 refs;
+	__le64 owner;
+} __attribute__ ((__packed__));
+
+struct btrfs_dir_item {
+	__le64 objectid;
+	__le16 flags;
+	u8 type;
+} __attribute__ ((__packed__));
+
+struct btrfs_root_item {
+	__le64 blocknr;
+	__le32 flags;
+	__le64 block_limit;
+	__le64 blocks_used;
+	__le32 refs;
+};
+
+/*
+ * in ram representation of the tree.  extent_root is used for all allocations
+ * and for the extent tree extent_root root.  current_insert is used
+ * only for the extent tree.
+ */
+struct btrfs_root {
+	struct btrfs_buffer *node;
+	struct btrfs_buffer *commit_root;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *tree_root;
+	struct btrfs_key current_insert;
+	struct btrfs_key last_insert;
+	int fp;
+	struct radix_tree_root cache_radix;
+	struct radix_tree_root pinned_radix;
+	struct list_head trans;
+	struct list_head cache;
+	int cache_size;
+	int ref_cows;
+	struct btrfs_root_item root_item;
+	struct btrfs_key root_key;
+	u32 blocksize;
+};
+
+
+/* the lower bits in the key flags define the item type */
+#define BTRFS_KEY_TYPE_MAX	256
+#define BTRFS_KEY_TYPE_MASK	(BTRFS_KEY_TYPE_MAX - 1)
+#define BTRFS_INODE_ITEM_KEY	1
+#define BTRFS_DIR_ITEM_KEY	2
+#define BTRFS_ROOT_ITEM_KEY	3
+#define BTRFS_EXTENT_ITEM_KEY	4
+#define BTRFS_STRING_ITEM_KEY	5
+
+static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d)
+{
+	return le64_to_cpu(d->objectid);
+}
+
+static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val)
+{
+	d->objectid = cpu_to_le64(val);
+}
+
+static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
+{
+	return le16_to_cpu(d->flags);
+}
+
+static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val)
+{
+	d->flags = cpu_to_le16(val);
+}
+
+static inline u8 btrfs_dir_type(struct btrfs_dir_item *d)
+{
+	return d->type;
+}
+
+static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val)
+{
+	d->type = val;
+}
+
+
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
 	return le64_to_cpu(ei->owner);
@@ -238,39 +283,65 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
 	disk->objectid = cpu_to_le64(cpu->objectid);
 }
 
-static inline u64 btrfs_key_objectid(struct btrfs_disk_key *disk)
+static inline u64 btrfs_disk_key_objectid(struct btrfs_disk_key *disk)
 {
 	return le64_to_cpu(disk->objectid);
 }
 
-static inline void btrfs_set_key_objectid(struct btrfs_disk_key *disk,
-					  u64 val)
+static inline void btrfs_set_disk_key_objectid(struct btrfs_disk_key *disk,
+					       u64 val)
 {
 	disk->objectid = cpu_to_le64(val);
 }
 
-static inline u64 btrfs_key_offset(struct btrfs_disk_key *disk)
+static inline u64 btrfs_disk_key_offset(struct btrfs_disk_key *disk)
 {
 	return le64_to_cpu(disk->offset);
 }
 
-static inline void btrfs_set_key_offset(struct btrfs_disk_key *disk,
-					  u64 val)
+static inline void btrfs_set_disk_key_offset(struct btrfs_disk_key *disk,
+					     u64 val)
 {
 	disk->offset = cpu_to_le64(val);
 }
 
-static inline u32 btrfs_key_flags(struct btrfs_disk_key *disk)
+static inline u32 btrfs_disk_key_flags(struct btrfs_disk_key *disk)
 {
 	return le32_to_cpu(disk->flags);
 }
 
-static inline void btrfs_set_key_flags(struct btrfs_disk_key *disk,
-					  u32 val)
+static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk,
+					    u32 val)
 {
 	disk->flags = cpu_to_le32(val);
 }
 
+static inline u32 btrfs_key_type(struct btrfs_key *key)
+{
+	return key->flags & BTRFS_KEY_TYPE_MASK;
+}
+
+static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key)
+{
+	return le32_to_cpu(key->flags) & BTRFS_KEY_TYPE_MASK;
+}
+
+static inline void btrfs_set_key_type(struct btrfs_key *key, u32 type)
+{
+	BUG_ON(type >= BTRFS_KEY_TYPE_MAX);
+	key->flags = (key->flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | type;
+}
+
+static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type)
+{
+	u32 flags = btrfs_disk_key_flags(key);
+	BUG_ON(type >= BTRFS_KEY_TYPE_MAX);
+	flags = (flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | type;
+	btrfs_set_disk_key_flags(key, flags);
+}
+
+
+
 static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->blocknr);
@@ -407,7 +478,6 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
 	return (u8 *)l->items;
 }
-
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
@@ -422,7 +492,9 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
 int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key,
-		void *data, int data_size);
+		void *data, u32 data_size);
+int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 data_size);
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
new file mode 100644
index 00000000000..2a888e97e1a
--- /dev/null
+++ b/fs/btrfs/dir-item.c
@@ -0,0 +1,102 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "hash.h"
+
+/* add a name -> objectid entry to dir; the item is keyed by the name hash */
+int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
+			  u64 dir, u64 objectid, u8 type)
+{
+	int ret = 0;
+	struct btrfs_path path;
+	struct btrfs_dir_item *dir_item;
+	char *name_ptr;
+	struct btrfs_key key;
+	u32 data_size;
+
+	key.objectid = dir;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	ret = btrfs_name_hash(name, name_len, &key.offset);
+	BUG_ON(ret);
+	btrfs_init_path(&path);
+	/* the name bytes are stored directly after the fixed size dir item */
+	data_size = sizeof(*dir_item) + name_len;
+	ret = btrfs_insert_empty_item(root, &path, &key, data_size);
+	if (ret)
+		goto out;
+
+	dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+				  struct btrfs_dir_item);
+	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_set_dir_type(dir_item, type);
+	btrfs_set_dir_flags(dir_item, 0);
+	name_ptr = (char *)(dir_item + 1);
+	memcpy(name_ptr, name, name_len);
+out:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+/* delete the item keyed by (dir, hash of name); the name is not compared */
+int btrfs_del_dir_item(struct btrfs_root *root, u64 dir, char *name,
+		       int name_len)
+{
+	int ret = 0;
+	struct btrfs_path path;
+	struct btrfs_key key;
+
+	key.objectid = dir;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	ret = btrfs_name_hash(name, name_len, &key.offset);
+	BUG_ON(ret);
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, &key, &path, 0, 1);
+	if (!ret)
+		ret = btrfs_del_item(root, &path);
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+/*
+ * find name in dir and store the objectid it refers to in *objectid.
+ * Returns 0 on a match and 1 when the slot found holds a different name
+ * (a hash collision); otherwise the btrfs_search_slot result is returned.
+ */
+int btrfs_lookup_dir_item(struct btrfs_root *root, u64 dir, char *name,
+			  int name_len, u64 *objectid)
+{
+	int ret = 0;
+	struct btrfs_path path;
+	struct btrfs_dir_item *dir_item;
+	struct btrfs_key key;
+	struct btrfs_item *item;
+
+	key.objectid = dir;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	ret = btrfs_name_hash(name, name_len, &key.offset);
+	BUG_ON(ret);
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(root, &key, &path, 0, 0);
+	if (ret)
+		goto out;
+
+	dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+				  struct btrfs_dir_item);
+	item = path.nodes[0]->leaf.items + path.slots[0];
+	/* a colliding name is a miss for the caller, not a reason to abort */
+	if (btrfs_item_size(item) != name_len + sizeof(*dir_item) ||
+	    memcmp(dir_item + 1, name, name_len)) {
+		ret = 1;
+		goto out;
+	}
+	*objectid = btrfs_dir_objectid(dir_item);
+out:
+	btrfs_release_path(root, &path);
+	return ret;
+}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d4f1ec32839..c81e14162ef 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,6 +35,7 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
 	btrfs_init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = 1;
 	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 1);
 	if (ret != 0)
@@ -61,8 +62,9 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
 	struct btrfs_extent_item *item;
 	btrfs_init_path(&path);
 	key.objectid = blocknr;
-	key.flags = 0;
 	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 0);
 	if (ret != 0)
 		BUG();
@@ -123,6 +125,7 @@ static int finish_current_insert(struct btrfs_root *extent_root)
 		btrfs_header_parentid(&extent_root->node->node.header));
 	ins.offset = 1;
 	ins.flags = 0;
+	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 
 	for (i = 0; i < extent_root->current_insert.flags; i++) {
 		ins.objectid = extent_root->current_insert.objectid + i;
@@ -149,6 +152,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 
 	key.objectid = blocknr;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
 	find_free_extent(root, 0, 0, (u64)-1, &ins);
@@ -228,7 +232,6 @@ static int run_pending(struct btrfs_root *extent_root)
  */
 int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 {
-	struct btrfs_key key;
 	struct btrfs_root *extent_root = root->extent_root;
 	struct btrfs_buffer *t;
 	int pending_ret;
@@ -240,9 +243,6 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 				   CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
-	key.objectid = blocknr;
-	key.flags = 0;
-	key.offset = num_blocks;
 	ret = __free_extent(root, blocknr, num_blocks);
 	pending_ret = run_pending(root->extent_root);
 	return ret ? ret : pending_ret;
@@ -252,7 +252,7 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
  * ins->objectid == block start
- * ins->flags = 0
+ * ins->flags = BTRFS_EXTENT_ITEM_KEY
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
@@ -275,11 +275,14 @@ static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks,
 	total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3;
 	if (root->last_insert.objectid > search_start)
 		search_start = root->last_insert.objectid;
+
+	ins->flags = 0;
+	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+
 check_failed:
 	btrfs_init_path(&path);
 	ins->objectid = search_start;
 	ins->offset = 0;
-	ins->flags = 0;
 	start_found = 0;
 	ret = btrfs_search_slot(root, ins, &path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 00000000000..92c37eece1f
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,86 @@
+/*
+ *  Original copy from:
+ *  linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This file is released under the GPL v2.
+ *
+ * This file may be redistributed under the terms of the GNU Public
+ * License.
+ */
+
+#include "kerncompat.h"
+#define DELTA 0x9E3779B9
+
+/* mix the four key words in[0..3] into the two state words buf[0..1] */
+static void TEA_transform(__u32 buf[2], __u32 const in[])
+{
+	__u32 x = buf[0], y = buf[1];
+	__u32 acc = 0;
+	int round;
+
+	/* 16 rounds; acc advances by DELTA before each one */
+	for (round = 0; round < 16; round++) {
+		acc += DELTA;
+		x += ((y << 4) + in[0]) ^ (y + acc) ^ ((y >> 5) + in[1]);
+		y += ((x << 4) + in[2]) ^ (x + acc) ^ ((x >> 5) + in[3]);
+	}
+	buf[0] += x;
+	buf[1] += y;
+}
+
+/* pack up to num*4 bytes of msg into num words of buf, padding the rest */
+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32 fill, word;
+	int pos;
+
+	fill = (__u32)len | ((__u32)len << 8);
+	fill |= fill << 16;
+
+	word = fill;
+	if (len > num * 4)
+		len = num * 4;
+	for (pos = 0; pos < len; pos++) {
+		/* NOTE(review): plain char; bytes >= 0x80 vary with signedness */
+		word = msg[pos] + (word << 8);
+		if ((pos & 3) != 3)
+			continue;
+		*buf++ = word;
+		word = fill;
+		num--;
+	}
+	if (--num >= 0)
+		*buf++ = word;
+	while (--num >= 0)
+		*buf++ = fill;
+}
+
+/*
+ * hash name[0..len) into *hash_result: buf[0] ends up in the high 32 bits
+ * and buf[1] in the low 32 bits.  Always returns 0.
+ */
+int btrfs_name_hash(const char *name, int len, u64 *hash_result)
+{
+	const char	*p = name;
+	__u32		in[8];
+	/*
+	 * Default seed.  TEA_transform() takes a two word state, so only
+	 * two seed words are kept; storing to buf[2] and buf[3] as well
+	 * would write past the end of this array.
+	 */
+	__u32		buf[2] = { 0x67452301, 0xefcdab89 };
+
+	/* feed the name through the transform 16 bytes at a time */
+	while (len > 0) {
+		str2hashbuf(p, len, in, 4);
+		TEA_transform(buf, in);
+		len -= 16;
+		p += 16;
+	}
+	*hash_result = buf[0];
+	*hash_result <<= 32;
+	*hash_result |= buf[1];
+	return 0;
+}
diff --git a/fs/btrfs/hasher.c b/fs/btrfs/hasher.c
new file mode 100644
index 00000000000..96702da4329
--- /dev/null
+++ b/fs/btrfs/hasher.c
@@ -0,0 +1,23 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kerncompat.h"
+#include "hash.h"
+
+/* read names from stdin, one per line, and print btrfs_name_hash of each */
+int main() {
+	u64 result;
+	int ret;
+	int len;
+	char line[255];
+	while (fgets(line, sizeof(line), stdin)) {
+		/* hash the name only, not the newline fgets leaves in line */
+		len = (int)strcspn(line, "\n");
+		if (len == 0)
+			continue;
+		ret = btrfs_name_hash(line, len, &result);
+		BUG_ON(ret);
+		printf("hash returns %Lu\n", result);
+	}
+	return 0;
+}
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index 56239b922ba..105d3f58408 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -21,6 +21,7 @@
 #endif
 
 typedef unsigned int u32;
+typedef u32 __u32;
 typedef unsigned long long u64;
 typedef unsigned char u8;
 typedef unsigned short u16;
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 317d20ce759..9aa900811c3 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -50,9 +50,10 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(root_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_item_size(&item, sizeof(root_item));
-	btrfs_set_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_key_offset(&item.key, 0);
-	btrfs_set_key_flags(&item.key, 0);
+	btrfs_set_disk_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID);
+	btrfs_set_disk_key_offset(&item.key, 0);
+	btrfs_set_disk_key_flags(&item.key, 0);
+	btrfs_set_disk_key_type(&item.key, BTRFS_ROOT_ITEM_KEY);
 	memcpy(empty_leaf->items, &item, sizeof(item));
 	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
 		&root_item, sizeof(root_item));
@@ -60,7 +61,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	btrfs_set_root_blocknr(&root_item, start_block + 3);
 	itemoff = itemoff - sizeof(root_item);
 	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
 	memcpy(empty_leaf->items + 1, &item, sizeof(item));
 	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
 		&root_item, sizeof(root_item));
@@ -73,9 +74,10 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	btrfs_set_header_nritems(&empty_leaf->header, 4);
 
 	/* item1, reserve blocks 0-16 */
-	btrfs_set_key_objectid(&item.key, 0);
-	btrfs_set_key_offset(&item.key, start_block + 1);
-	btrfs_set_key_flags(&item.key, 0);
+	btrfs_set_disk_key_objectid(&item.key, 0);
+	btrfs_set_disk_key_offset(&item.key, start_block + 1);
+	btrfs_set_disk_key_flags(&item.key, 0);
+	btrfs_set_disk_key_type(&item.key, BTRFS_EXTENT_ITEM_KEY);
 	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) -
 			sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
@@ -87,8 +89,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 		&extent_item, btrfs_item_size(&item));
 
 	/* item2, give block 17 to the root */
-	btrfs_set_key_objectid(&item.key, start_block + 1);
-	btrfs_set_key_offset(&item.key, 1);
+	btrfs_set_disk_key_objectid(&item.key, start_block + 1);
+	btrfs_set_disk_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID);
@@ -97,8 +99,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 		&extent_item, btrfs_item_size(&item));
 
 	/* item3, give block 18 to the extent root */
-	btrfs_set_key_objectid(&item.key, start_block + 2);
-	btrfs_set_key_offset(&item.key, 1);
+	btrfs_set_disk_key_objectid(&item.key, start_block + 2);
+	btrfs_set_disk_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID);
@@ -107,8 +109,8 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 		&extent_item, btrfs_item_size(&item));
 
 	/* item4, give block 19 to the FS root */
-	btrfs_set_key_objectid(&item.key, start_block + 3);
-	btrfs_set_key_offset(&item.key, 1);
+	btrfs_set_disk_key_objectid(&item.key, start_block + 3);
+	btrfs_set_disk_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
 	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 4c710190343..f2745b24747 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -12,27 +12,41 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_item *item;
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
+	u32 type;
 	printf("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr,
 		btrfs_leaf_free_space(root, l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
+		type = btrfs_disk_key_type(&item->key);
 		printf("\titem %d key (%Lu %Lu %u) itemoff %d itemsize %d\n",
 			i,
-			btrfs_key_objectid(&item->key),
-			btrfs_key_offset(&item->key),
-			btrfs_key_flags(&item->key),
+			btrfs_disk_key_objectid(&item->key),
+			btrfs_disk_key_offset(&item->key),
+			btrfs_disk_key_flags(&item->key),
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
-		printf("\t\titem data %.*s\n", btrfs_item_size(item),
-			btrfs_leaf_data(l) + btrfs_item_offset(item));
-		ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-		printf("\t\textent data refs %u owner %Lu\n",
-			btrfs_extent_refs(ei), btrfs_extent_owner(ei));
-		ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-		printf("\t\troot data blocknr %Lu refs %u\n",
-			btrfs_root_blocknr(ri), btrfs_root_refs(ri));
+		switch (type) {
+		case BTRFS_INODE_ITEM_KEY:
+			break;
+		case BTRFS_DIR_ITEM_KEY:
+			break;
+		case BTRFS_ROOT_ITEM_KEY:
+			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
+			printf("\t\troot data blocknr %Lu refs %u\n",
+				btrfs_root_blocknr(ri), btrfs_root_refs(ri));
+			break;
+		case BTRFS_EXTENT_ITEM_KEY:
+			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
+			printf("\t\textent data refs %u owner %Lu\n",
+				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
+			break;
+		case BTRFS_STRING_ITEM_KEY:
+			printf("\t\titem data %.*s\n", btrfs_item_size(item),
+				btrfs_leaf_data(l) + btrfs_item_offset(item));
+			break;
+		};
 		fflush(stdout);
 	}
 }
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index d9287cad35c..3a257161712 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -30,6 +30,8 @@ int main(int ac, char **av) {
 
 	root = open_ctree("dbfile", &super);
 	srand(55);
+	ins.flags = 0;
+	btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY);
 	for (i = 0; i < run_size; i++) {
 		buf = malloc(64);
 		num = next_key(i, max_key);
@@ -39,7 +41,6 @@ int main(int ac, char **av) {
 			fprintf(stderr, "insert %d:%d\n", num, i);
 		ins.objectid = num;
 		ins.offset = 0;
-		ins.flags = 0;
 		ret = btrfs_insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index a8ef0478991..f05135f13f9 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -18,6 +18,7 @@ static int setup_key(struct radix_tree_root *root, struct btrfs_key *key,
 	int ret;
 
 	key->flags = 0;
+	btrfs_set_key_type(key, BTRFS_STRING_ITEM_KEY);
 	key->offset = 0;
 again:
 	ret = radix_tree_gang_lookup(root, (void **)res, num, 2);
@@ -157,6 +158,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 
 	key.offset = 0;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY);
 	key.objectid = (unsigned long)-1;
 	while(nr-- >= 0) {
 		btrfs_init_path(&path);
@@ -173,7 +175,8 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 			path.slots[0] -= 1;
 		}
 		slot = path.slots[0];
-		found=btrfs_key_objectid(&path.nodes[0]->leaf.items[slot].key);
+		found = btrfs_disk_key_objectid(
+					&path.nodes[0]->leaf.items[slot].key);
 		ret = btrfs_del_item(root, &path);
 		count++;
 		if (ret) {
@@ -257,6 +260,7 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix)
 
 	key.offset = 0;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY);
 	key.objectid = (unsigned long)-1;
 	while(1) {
 		btrfs_init_path(&path);
@@ -274,8 +278,8 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix)
 			slot -= 1;
 		}
 		for (i = slot; i >= 0; i--) {
-			found = btrfs_key_objectid(&path.nodes[0]->
-						   leaf.items[i].key);
+			found = btrfs_disk_key_objectid(&path.nodes[0]->
+							leaf.items[i].key);
 			radix_tree_preload(GFP_KERNEL);
 			ret = radix_tree_insert(radix, found, (void *)found);
 			if (ret) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 03aa7c2c633..a24b4727403 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -27,7 +27,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	l = &path.nodes[0]->leaf;
 	BUG_ON(path.slots[0] == 0);
 	slot = path.slots[0] - 1;
-	if (btrfs_key_objectid(&l->items[slot].key) != objectid) {
+	if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) {
 		ret = 1;
 		goto out;
 	}
-- 
cgit v1.2.3


From 24af03ea5e765d57860c975efcdcd41ec7f63fd9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Mar 2007 12:57:43 -0400
Subject: Btrfs: add hash.h

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/hash.h | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 fs/btrfs/hash.h

(limited to 'fs')

diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
new file mode 100644
index 00000000000..e56c4670093
--- /dev/null
+++ b/fs/btrfs/hash.h
@@ -0,0 +1,4 @@
+#ifndef __HASH__
+#define __HASH__
+int btrfs_name_hash(const char *name, int len, u64 *hash_result);
+#endif
-- 
cgit v1.2.3


From 1d4f6404de26df49eb8452f8fdf7672b59f407fc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Mar 2007 15:18:43 -0400
Subject: Btrfs: directory testing code and dir item fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |   5 +-
 fs/btrfs/ctree.h      |  72 +++++----
 fs/btrfs/dir-item.c   |  58 +++-----
 fs/btrfs/dir-test.c   | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/print-tree.c |  12 ++
 5 files changed, 477 insertions(+), 74 deletions(-)
 create mode 100644 fs/btrfs/dir-test.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ee78cdd08c0..ea3e1493baa 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,4 +1,3 @@
-
 CC=gcc
 CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
@@ -15,7 +14,7 @@ check=sparse $(CHECKFLAGS)
 	$(check) $<
 	$(CC) $(CFLAGS) -c $<
 
-all: tester debug-tree quick-test
+all: tester debug-tree quick-test dir-test
 
 debug-tree: $(objects) debug-tree.o
 	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
@@ -23,6 +22,8 @@ debug-tree: $(objects) debug-tree.o
 tester: $(objects) random-test.o
 	gcc $(CFLAGS) -o tester $(objects) random-test.o
 
+dir-test: $(objects) dir-test.o
+	gcc $(CFLAGS) -o dir-test $(objects) dir-test.o
 quick-test: $(objects) quick-test.o
 	gcc $(CFLAGS) -o quick-test $(objects) quick-test.o
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e8a26fd8ea9..7a3492d5888 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -180,37 +180,6 @@ struct btrfs_root {
 #define BTRFS_EXTENT_ITEM_KEY	4
 #define BTRFS_STRING_ITEM_KEY	5
 
-static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d)
-{
-	return le64_to_cpu(d->objectid);
-}
-
-static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val)
-{
-	d->objectid = cpu_to_le64(val);
-}
-
-static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
-{
-	return le16_to_cpu(d->flags);
-}
-
-static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val)
-{
-	d->flags = cpu_to_le16(val);
-}
-
-static inline u8 btrfs_dir_type(struct btrfs_dir_item *d)
-{
-	return d->type;
-}
-
-static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val)
-{
-	d->type = val;
-}
-
-
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
 	return le64_to_cpu(ei->owner);
@@ -267,6 +236,41 @@ static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val)
 	item->size = cpu_to_le16(val);
 }
 
+static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d)
+{
+	return le64_to_cpu(d->objectid);
+}
+
+static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val)
+{
+	d->objectid = cpu_to_le64(val);
+}
+
+static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
+{
+	return le16_to_cpu(d->flags);
+}
+
+static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val)
+{
+	d->flags = cpu_to_le16(val);
+}
+
+static inline u8 btrfs_dir_type(struct btrfs_dir_item *d)
+{
+	return d->type;
+}
+
+static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val)
+{
+	d->type = val;
+}
+
+static inline u32 btrfs_dir_name_len(struct btrfs_item *i)
+{
+	return btrfs_item_size(i) - sizeof(struct btrfs_dir_item);
+}
+
 static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
 					 struct btrfs_disk_key *disk)
 {
@@ -506,4 +510,10 @@ int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
 		      struct btrfs_root_item *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 			struct btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
+			  u64 dir, u64 objectid, u8 type);
+int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+			  u64 dir, char *name, int name_len, int mod);
+int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
+			      char *name, int name_len);
 #endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 2a888e97e1a..8043b2ef10d 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -18,6 +18,7 @@ int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
 
 	key.objectid = dir;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	btrfs_init_path(&path);
@@ -38,65 +39,40 @@ out:
 	return ret;
 }
 
-int btrfs_del_dir_item(struct btrfs_root *root, u64 dir, char *name,
-		       int name_len)
+int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
+			  u64 dir, char *name, int name_len, int mod)
 {
-	int ret = 0;
-	struct btrfs_path path;
+	int ret;
 	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
 
 	key.objectid = dir;
 	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, &key, &path, 0, 1);
-	if (ret)
-		goto out;
-	ret = btrfs_del_item(root, &path);
-out:
-	btrfs_release_path(root, &path);
+	ret = btrfs_search_slot(root, &key, path, ins_len, cow);
 	return ret;
 }
 
-int btrfs_lookup_dir_item(struct btrfs_root *root, u64 dir, char *name,
-			  int name_len, u64 *objectid)
+int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
+			      char *name, int name_len)
 {
-	int ret = 0;
-	struct btrfs_path path;
+	struct btrfs_item *item;
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
-	struct btrfs_key key;
 	u32 item_len;
-	struct btrfs_item *item;
-
-	key.objectid = dir;
-	key.flags = 0;
-	ret = btrfs_name_hash(name, name_len, &key.offset);
-	BUG_ON(ret);
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, &key, &path, 0, 0);
-	if (ret)
-		goto out;
-
-	dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
-				  struct btrfs_dir_item);
-
-	item = path.nodes[0]->leaf.items + path.slots[0];
+	item = path->nodes[0]->leaf.items + path->slots[0];
 	item_len = btrfs_item_size(item);
 	if (item_len != name_len + sizeof(struct btrfs_dir_item)) {
-		BUG();
-		ret = 1;
-		goto out;
+		return 0;
 	}
+	dir_item = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
+				  struct btrfs_dir_item);
 	name_ptr = (char *)(dir_item + 1);
 	if (memcmp(name_ptr, name, name_len)) {
-		BUG();
-		ret = 1;
-		goto out;
+		return 0;
 	}
-	*objectid = btrfs_dir_objectid(dir_item);
-out:
-	btrfs_release_path(root, &path);
-	return ret;
+	return 1;
 }
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
new file mode 100644
index 00000000000..b482b8f49f8
--- /dev/null
+++ b/fs/btrfs/dir-test.c
@@ -0,0 +1,404 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "hash.h"
+
+int keep_running = 1;
+struct btrfs_super_block super;
+static u64 dir_oid = 44556;
+static u64 file_oid = 33778;
+
+static int find_num(struct radix_tree_root *root, unsigned long *num_ret,
+		     int exists)
+{
+	unsigned long num = rand();
+	unsigned long res[2];
+	int ret;
+
+again:
+	ret = radix_tree_gang_lookup(root, (void **)res, num, 2);
+	if (exists) {
+		if (ret == 0)
+			return -1;
+		num = res[0];
+	} else if (ret != 0 && num == res[0]) {
+		num++;
+		if (ret > 1 && num == res[1]) {
+			num++;
+			goto again;
+		}
+	}
+	*num_ret = num;
+	return 0;
+}
+
+static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	int ret;
+	char buf[128];
+	unsigned long oid;
+	struct btrfs_path path;
+
+	find_num(radix, &oid, 0);
+	sprintf(buf, "str-%lu", oid);
+
+	ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid,
+				    1);
+	if (ret)
+		goto error;
+
+	radix_tree_preload(GFP_KERNEL);
+	ret = radix_tree_insert(radix, oid, (void *)oid);
+	radix_tree_preload_end();
+	if (ret)
+		goto error;
+	return ret;
+error:
+	if (ret != -EEXIST)
+		goto fatal;
+
+	/*
+	 * if we got an EEXIST, it may be due to hash collision, double
+	 * check
+	 */
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	if (ret)
+		goto fatal_release;
+	if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) {
+		struct btrfs_dir_item *di;
+		char *found;
+		u32 found_len;
+		u64 myhash;
+		u64 foundhash;
+
+		di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+				    struct btrfs_dir_item);
+		found = (char *)(di + 1);
+		found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items +
+						path.slots[0]);
+		btrfs_name_hash(buf, strlen(buf), &myhash);
+		btrfs_name_hash(found, found_len, &foundhash);
+		if (myhash != foundhash)
+			goto fatal_release;
+		btrfs_release_path(root, &path);
+		return 0;
+	}
+fatal_release:
+	btrfs_release_path(root, &path);
+fatal:
+	printf("failed to insert %lu ret %d\n", oid, ret);
+	return -1;
+}
+
+static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	int ret;
+	char buf[128];
+	unsigned long oid;
+
+	ret = find_num(radix, &oid, 1);
+	if (ret < 0)
+		return 0;
+	sprintf(buf, "str-%lu", oid);
+
+	ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid,
+				    1);
+	if (ret != -EEXIST) {
+		printf("insert on %s gave us %d\n", buf, ret);
+		return 1;
+	}
+	return 0;
+}
+
+static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	int ret;
+	char buf[128];
+	unsigned long oid;
+	struct btrfs_path path;
+	unsigned long *ptr;
+
+	ret = find_num(radix, &oid, 1);
+	if (ret < 0)
+		return 0;
+	sprintf(buf, "str-%lu", oid);
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), -1);
+	if (ret)
+		goto out_release;
+	ret = btrfs_del_item(root, &path);
+	if (ret)
+		goto out_release;
+	btrfs_release_path(root, &path);
+	ptr = radix_tree_delete(radix, oid);
+	if (!ptr) {
+		ret = -5555;
+		goto out;
+	}
+	return 0;
+out_release:
+	btrfs_release_path(root, &path);
+out:
+	printf("failed to delete %lu %d\n", oid, ret);
+	return -1;
+}
+
+static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	struct btrfs_path path;
+	char buf[128];
+	int ret;
+	unsigned long oid;
+
+	ret = find_num(radix, &oid, 1);
+	if (ret < 0)
+		return 0;
+	sprintf(buf, "str-%lu", oid);
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	btrfs_release_path(root, &path);
+	if (ret) {
+		printf("unable to find key %lu\n", oid);
+		return -1;
+	}
+	return 0;
+}
+
+static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	struct btrfs_path path;
+	char buf[128];
+	int ret;
+	unsigned long oid;
+
+	ret = find_num(radix, &oid, 0);
+	if (ret < 0)
+		return 0;
+	sprintf(buf, "str-%lu", oid);
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	btrfs_release_path(root, &path);
+	if (!ret) {
+		printf("able to find key that should not exist %lu\n", oid);
+		return -1;
+	}
+	return 0;
+}
+
+static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
+		      int nr)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	unsigned long found = 0;
+	u32 found_len;
+	int ret;
+	int slot;
+	int *ptr;
+	int count = 0;
+	char buf[128];
+	struct btrfs_dir_item *di;
+
+	key.offset = (u64)-1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	key.objectid = dir_oid;
+	while(nr-- >= 0) {
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(root, &key, &path, -1, 1);
+		if (ret < 0) {
+			btrfs_release_path(root, &path);
+			return ret;
+		}
+		if (ret != 0) {
+			if (path.slots[0] == 0) {
+				btrfs_release_path(root, &path);
+				break;
+			}
+			path.slots[0] -= 1;
+		}
+		slot = path.slots[0];
+		di = btrfs_item_ptr(&path.nodes[0]->leaf, slot,
+				    struct btrfs_dir_item);
+		found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items +
+						slot);
+		memcpy(buf, (char *)(di + 1), found_len);
+		BUG_ON(found_len > 128);
+		buf[found_len] = '\0';
+		found = atoi(buf + 4);
+		ret = btrfs_del_item(root, &path);
+		count++;
+		if (ret) {
+			fprintf(stderr,
+				"failed to remove %lu from tree\n",
+				found);
+			return -1;
+		}
+		btrfs_release_path(root, &path);
+		ptr = radix_tree_delete(radix, found);
+		if (!ptr)
+			goto error;
+		if (!keep_running)
+			break;
+	}
+	return 0;
+error:
+	fprintf(stderr, "failed to delete from the radix %lu\n", found);
+	return -1;
+}
+
+static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix,
+		     int count)
+{
+	int i;
+	int ret = 0;
+	for (i = 0; i < count; i++) {
+		ret = ins_one(root, radix);
+		if (ret) {
+			fprintf(stderr, "fill failed\n");
+			goto out;
+		}
+		if (i % 1000 == 0) {
+			ret = btrfs_commit_transaction(root, &super);
+			if (ret) {
+				fprintf(stderr, "fill commit failed\n");
+				return ret;
+			}
+		}
+		if (i && i % 10000 == 0) {
+			printf("bigfill %d\n", i);
+		}
+		if (!keep_running)
+			break;
+	}
+out:
+	return ret;
+}
+
+static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
+{
+	int ret;
+	int nr = rand() % 5000;
+	static int run_nr = 0;
+
+	/* do the bulk op much less frequently */
+	if (run_nr++ % 100)
+		return 0;
+	ret = empty_tree(root, radix, nr);
+	if (ret)
+		return ret;
+	ret = fill_tree(root, radix, nr);
+	if (ret)
+		return ret;
+	return 0;
+}
+
+
+int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) =
+	{ ins_one, insert_dup, del_one, lookup_item,
+	  lookup_enoent, bulk_op };
+
+void sigstopper(int ignored)
+{
+	keep_running = 0;
+	fprintf(stderr, "caught exit signal, stopping\n");
+}
+
+int print_usage(void)
+{
+	printf("usage: tester [-ih] [-c count] [-f count]\n");
+	printf("\t -c count -- iteration count after filling\n");
+	printf("\t -f count -- run this many random inserts before starting\n");
+	printf("\t -i       -- only do initial fill\n");
+	printf("\t -h       -- this help text\n");
+	exit(1);
+}
+int main(int ac, char **av)
+{
+	RADIX_TREE(radix, GFP_KERNEL);
+	struct btrfs_root *root;
+	int i;
+	int ret;
+	int count;
+	int op;
+	int iterations = 20000;
+	int init_fill_count = 800000;
+	int err = 0;
+	int initial_only = 0;
+	radix_tree_init();
+
+	printf("removing old tree\n");
+	unlink("dbfile");
+	root = open_ctree("dbfile", &super);
+
+	signal(SIGTERM, sigstopper);
+	signal(SIGINT, sigstopper);
+
+	for (i = 1 ; i < ac ; i++) {
+		if (strcmp(av[i], "-i") == 0) {
+			initial_only = 1;
+		} else if (strcmp(av[i], "-c") == 0) {
+			iterations = atoi(av[i+1]);
+			i++;
+		} else if (strcmp(av[i], "-f") == 0) {
+			init_fill_count = atoi(av[i+1]);
+			i++;
+		} else {
+			print_usage();
+		}
+	}
+	printf("initial fill\n");
+	ret = fill_tree(root, &radix, init_fill_count);
+	printf("starting run\n");
+	if (ret) {
+		err = ret;
+		goto out;
+	}
+	if (initial_only == 1) {
+		goto out;
+	}
+	for (i = 0; i < iterations; i++) {
+		op = rand() % ARRAY_SIZE(ops);
+		count = rand() % 128;
+		if (i % 2000 == 0) {
+			printf("%d\n", i);
+			fflush(stdout);
+		}
+		if (i && i % 5000 == 0) {
+			printf("open & close, root level %d nritems %d\n",
+				btrfs_header_level(&root->node->node.header),
+				btrfs_header_nritems(&root->node->node.header));
+			close_ctree(root, &super);
+			root = open_ctree("dbfile", &super);
+		}
+		while(count--) {
+			ret = ops[op](root, &radix);
+			if (ret) {
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				btrfs_print_tree(root, root->node);
+				fprintf(stderr, "op %d failed %d:%d\n",
+					op, i, iterations);
+				err = ret;
+				goto out;
+			}
+			if (ops[op] == bulk_op)
+				break;
+			if (keep_running == 0) {
+				err = 0;
+				goto out;
+			}
+		}
+	}
+out:
+	close_ctree(root, &super);
+	return err;
+}
+
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f2745b24747..ad244d65811 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -12,7 +12,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_item *item;
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
+	struct btrfs_dir_item *di;
 	u32 type;
+	u32 namelen;
+
 	printf("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr,
 		btrfs_leaf_free_space(root, l));
@@ -31,6 +34,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		case BTRFS_INODE_ITEM_KEY:
 			break;
 		case BTRFS_DIR_ITEM_KEY:
+			namelen = btrfs_item_size(l->items + i) - sizeof(*di);
+			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+			printf("\t\tdir oid %Lu flags %u type %u\n",
+				btrfs_dir_objectid(di),
+				btrfs_dir_flags(di),
+				btrfs_dir_type(di));
+			printf("\t\tname %.*s\n",
+				namelen, (char *)(di + 1));
+
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-- 
cgit v1.2.3


From 1e1d27017c5986c1ea81181506042cf9cba3f6ea Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 Mar 2007 19:03:33 -0400
Subject: Btrfs: add inode item

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |   2 +-
 fs/btrfs/TODO         |   1 +
 fs/btrfs/ctree.h      | 175 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/file-item.c  |   7 ++
 fs/btrfs/inode-item.c |  37 +++++++++++
 5 files changed, 215 insertions(+), 7 deletions(-)
 create mode 100644 fs/btrfs/file-item.c
 create mode 100644 fs/btrfs/inode-item.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ea3e1493baa..36964f54bb4 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -2,7 +2,7 @@ CC=gcc
 CFLAGS = -g -Wall
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
-	  root-tree.o dir-item.o hash.o
+	  root-tree.o dir-item.o hash.o file-item.o inode-item.o
 
 # if you don't have sparse installed, use ls instead
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 249e2e90bea..2ae4b3aae1e 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -9,6 +9,7 @@
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
 * Do actual block accounting
+* Check compat and incompat flags on the inode
 * Port into the kernel
 * Add virtual filesystems, mountable snapshots
 * Get rid of struct ctree_path, limiting tree levels held at one time
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7a3492d5888..dbf3917833f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -132,6 +132,37 @@ struct btrfs_extent_item {
 	__le64 owner;
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_timespec {
+	__le32 sec;
+	__le32 nsec;
+} __attribute__ ((__packed__));
+
+/*
+ * there is no padding here on purpose.  If you want to extend the inode,
+ * make a new item type
+ */
+struct btrfs_inode_item {
+	__le64 generation;
+	__le64 size;
+	__le64 nblocks;
+	__le32 nlink;
+	__le32 uid;
+	__le32 gid;
+	__le32 mode;
+	__le32 rdev;
+	__le16 flags;
+	__le16 compat_flags;
+	struct btrfs_inode_timespec atime;
+	struct btrfs_inode_timespec ctime;
+	struct btrfs_inode_timespec mtime;
+	struct btrfs_inode_timespec otime;
+} __attribute__ ((__packed__));
+
+/* inline data is just a blob of bytes */
+struct btrfs_inline_data_item {
+	u8 data;
+} __attribute__ ((__packed__));
+
 struct btrfs_dir_item {
 	__le64 objectid;
 	__le16 flags;
@@ -170,15 +201,149 @@ struct btrfs_root {
 	u32 blocksize;
 };
 
-
 /* the lower bits in the key flags defines the item type */
 #define BTRFS_KEY_TYPE_MAX	256
 #define BTRFS_KEY_TYPE_MASK	(BTRFS_KEY_TYPE_MAX - 1)
+
+/*
+ * inode items have the data typically returned from stat and store other
+ * info about object characteristics.  There is one for every file and dir in
+ * the FS
+ */
 #define BTRFS_INODE_ITEM_KEY	1
+
+/*
+ * dir items are the name -> inode pointers in a directory.  There is one
+ * for every name in a directory.
+ */
 #define BTRFS_DIR_ITEM_KEY	2
-#define BTRFS_ROOT_ITEM_KEY	3
-#define BTRFS_EXTENT_ITEM_KEY	4
-#define BTRFS_STRING_ITEM_KEY	5
+/*
+ * inline data is file data that fits in the btree.
+ */
+#define BTRFS_INLINE_DATA_KEY	3
+/*
+ * extent data is for data that can't fit in the btree.  It points to
+ * a (hopefully) huge chunk of disk
+ */
+#define BTRFS_EXTENT_DATA_KEY	4
+/*
+ * root items point to tree roots.  These are typically in the root
+ * tree used by the super block to find all the other trees
+ */
+#define BTRFS_ROOT_ITEM_KEY	5
+/*
+ * extent items are in the extent map tree.  These record which blocks
+ * are used, and how many references there are to each block
+ */
+#define BTRFS_EXTENT_ITEM_KEY	6
+/*
+ * string items are for debugging.  They just store a short string of
+ * data in the FS
+ */
+#define BTRFS_STRING_ITEM_KEY	7
+
+static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
+{
+	return le64_to_cpu(i->generation);
+}
+
+static inline void btrfs_set_inode_generation(struct btrfs_inode_item *i,
+					      u64 val)
+{
+	i->generation = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_inode_size(struct btrfs_inode_item *i)
+{
+	return le64_to_cpu(i->size);
+}
+
+static inline void btrfs_set_inode_size(struct btrfs_inode_item *i, u64 val)
+{
+	i->size = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_inode_nblocks(struct btrfs_inode_item *i)
+{
+	return le64_to_cpu(i->nblocks);
+}
+
+static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val)
+{
+	i->nblocks = cpu_to_le64(val);
+}
+
+static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i)
+{
+	return le32_to_cpu(i->nlink);
+}
+
+static inline void btrfs_set_inode_nlink(struct btrfs_inode_item *i, u32 val)
+{
+	i->nlink = cpu_to_le32(val);
+}
+
+static inline u32 btrfs_inode_uid(struct btrfs_inode_item *i)
+{
+	return le32_to_cpu(i->uid);
+}
+
+static inline void btrfs_set_inode_uid(struct btrfs_inode_item *i, u32 val)
+{
+	i->uid = cpu_to_le32(val);
+}
+
+static inline u32 btrfs_inode_gid(struct btrfs_inode_item *i)
+{
+	return le32_to_cpu(i->gid);
+}
+
+static inline void btrfs_set_inode_gid(struct btrfs_inode_item *i, u32 val)
+{
+	i->gid = cpu_to_le32(val);
+}
+
+static inline u32 btrfs_inode_mode(struct btrfs_inode_item *i)
+{
+	return le32_to_cpu(i->mode);
+}
+
+static inline void btrfs_set_inode_mode(struct btrfs_inode_item *i, u32 val)
+{
+	i->mode = cpu_to_le32(val);
+}
+
+static inline u32 btrfs_inode_rdev(struct btrfs_inode_item *i)
+{
+	return le32_to_cpu(i->rdev);
+}
+
+static inline void btrfs_set_inode_rdev(struct btrfs_inode_item *i, u32 val)
+{
+	i->rdev = cpu_to_le32(val);
+}
+
+static inline u16 btrfs_inode_flags(struct btrfs_inode_item *i)
+{
+	return le16_to_cpu(i->flags);
+}
+
+static inline void btrfs_set_inode_flags(struct btrfs_inode_item *i, u16 val)
+{
+	i->flags = cpu_to_le16(val);
+}
+
+static inline u16 btrfs_inode_compat_flags(struct btrfs_inode_item *i)
+{
+	return le16_to_cpu(i->compat_flags);
+}
+
+static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i,
+						u16 val)
+{
+	i->compat_flags = cpu_to_le16(val);
+}
+
 
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
@@ -344,8 +509,6 @@ static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type)
 	btrfs_set_disk_key_flags(key, flags);
 }
 
-
-
 static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->blocknr);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
new file mode 100644
index 00000000000..f5b36c732c5
--- /dev/null
+++ b/fs/btrfs/file-item.c
@@ -0,0 +1,7 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
new file mode 100644
index 00000000000..4908f9afdf8
--- /dev/null
+++ b/fs/btrfs/inode-item.c
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_inode(struct btrfs_root *root, u64 objectid,
+		       struct btrfs_inode_item *inode_item)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	int ret;
+	key.objectid = objectid;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	btrfs_init_path(&path);
+	ret = btrfs_insert_item(root, &key, inode_item, sizeof(*inode_item));
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path,
+			u64 objectid, int mod)
+{
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	key.objectid = objectid;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+	return btrfs_search_slot(root, &key, path, ins_len, cow);
+}
-- 
cgit v1.2.3


From a8a2ee0c600a213d13170c2f4d7bd0b304bbec19 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Mar 2007 08:46:49 -0400
Subject: Btrfs: add a name_len to dir items, reorder key

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      |  8 ++++----
 fs/btrfs/ctree.h      | 14 ++++++++++----
 fs/btrfs/dir-item.c   | 14 +++++---------
 fs/btrfs/dir-test.c   |  6 ++----
 fs/btrfs/print-tree.c |  8 ++++----
 5 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7edfbd46804..68f66b320a8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -107,14 +107,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 		return 1;
 	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
 	if (k1.flags > k2->flags)
 		return 1;
 	if (k1.flags < k2->flags)
 		return -1;
+	if (k1.offset > k2->offset)
+		return 1;
+	if (k1.offset < k2->offset)
+		return -1;
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dbf3917833f..7c66b647ea2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,14 +26,14 @@
  */
 struct btrfs_disk_key {
 	__le64 objectid;
-	__le64 offset;
 	__le32 flags;
+	__le64 offset;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
 	u64 objectid;
-	u64 offset;
 	u32 flags;
+	u64 offset;
 } __attribute__ ((__packed__));
 
 /*
@@ -166,6 +166,7 @@ struct btrfs_inline_data_item {
 struct btrfs_dir_item {
 	__le64 objectid;
 	__le16 flags;
+	__le16 name_len;
 	u8 type;
 } __attribute__ ((__packed__));
 
@@ -431,9 +432,14 @@ static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val)
 	d->type = val;
 }
 
-static inline u32 btrfs_dir_name_len(struct btrfs_item *i)
+static inline u16 btrfs_dir_name_len(struct btrfs_dir_item *d)
+{
+	return le16_to_cpu(d->name_len);
+}
+
+static inline void btrfs_set_dir_name_len(struct btrfs_dir_item *d, u16 val)
 {
-	return btrfs_item_size(i) - sizeof(struct btrfs_dir_item);
+	d->name_len = cpu_to_le16(val);
 }
 
 static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 8043b2ef10d..a42a67b9975 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -32,6 +32,7 @@ int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
 	btrfs_set_dir_objectid(dir_item, objectid);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
+	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
 	memcpy(name_ptr, name, name_len);
 out:
@@ -59,20 +60,15 @@ int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      char *name, int name_len)
 {
-	struct btrfs_item *item;
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
-	u32 item_len;
-	item = path->nodes[0]->leaf.items + path->slots[0];
-	item_len = btrfs_item_size(item);
-	if (item_len != name_len + sizeof(struct btrfs_dir_item)) {
-		return 0;
-	}
+
 	dir_item = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
 				  struct btrfs_dir_item);
+	if (btrfs_dir_name_len(dir_item) != name_len)
+		return 0;
 	name_ptr = (char *)(dir_item + 1);
-	if (memcmp(name_ptr, name, name_len)) {
+	if (memcmp(name_ptr, name, name_len))
 		return 0;
-	}
 	return 1;
 }
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
index b482b8f49f8..f73aa762339 100644
--- a/fs/btrfs/dir-test.c
+++ b/fs/btrfs/dir-test.c
@@ -81,8 +81,7 @@ error:
 		di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
 				    struct btrfs_dir_item);
 		found = (char *)(di + 1);
-		found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items +
-						path.slots[0]);
+		found_len = btrfs_dir_name_len(di);
 		btrfs_name_hash(buf, strlen(buf), &myhash);
 		btrfs_name_hash(found, found_len, &foundhash);
 		if (myhash != foundhash)
@@ -227,8 +226,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 		slot = path.slots[0];
 		di = btrfs_item_ptr(&path.nodes[0]->leaf, slot,
 				    struct btrfs_dir_item);
-		found_len = btrfs_dir_name_len(path.nodes[0]->leaf.items +
-						slot);
+		found_len = btrfs_dir_name_len(di);
 		memcpy(buf, (char *)(di + 1), found_len);
 		BUG_ON(found_len > 128);
 		buf[found_len] = '\0';
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index ad244d65811..0bb5c38427c 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -23,11 +23,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
 		type = btrfs_disk_key_type(&item->key);
-		printf("\titem %d key (%Lu %Lu %u) itemoff %d itemsize %d\n",
+		printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
 			i,
 			btrfs_disk_key_objectid(&item->key),
-			btrfs_disk_key_offset(&item->key),
 			btrfs_disk_key_flags(&item->key),
+			btrfs_disk_key_offset(&item->key),
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
 		switch (type) {
@@ -81,11 +81,11 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%Lu %Lu %u) block %Lu\n",
+		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
 		       c->ptrs[i].key.objectid,
-		       c->ptrs[i].key.offset,
 		       c->ptrs[i].key.flags,
+		       c->ptrs[i].key.offset,
 		       btrfs_node_blockptr(c, i));
 		fflush(stdout);
 	}
-- 
cgit v1.2.3


From 88fd146c27da0f34c512f47e2b3776a0762ecd81 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Mar 2007 08:56:18 -0400
Subject: Btrfs: pin freed blocks from the FS tree too

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 14 +++++++-------
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/extent-tree.c | 28 +++++++++++++++++-----------
 3 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 68f66b320a8..13128b5ed65 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -55,13 +55,13 @@ static int btrfs_cow_block(struct btrfs_root *root,
 		root->node = cow;
 		cow->count++;
 		if (buf != root->commit_root)
-			btrfs_free_extent(root, buf->blocknr, 1);
+			btrfs_free_extent(root, buf->blocknr, 1, 1);
 		btrfs_block_release(root, buf);
 	} else {
 		btrfs_set_node_blockptr(&parent->node, parent_slot,
 					cow->blocknr);
 		BUG_ON(list_empty(&parent->dirty));
-		btrfs_free_extent(root, buf->blocknr, 1);
+		btrfs_free_extent(root, buf->blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
 	return 0;
@@ -311,7 +311,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		/* once for the root ptr */
 		btrfs_block_release(root, mid_buf);
 		clean_tree_block(root, mid_buf);
-		return btrfs_free_extent(root, blocknr, 1);
+		return btrfs_free_extent(root, blocknr, 1, 1);
 	}
 	parent = &parent_buf->node;
 
@@ -352,7 +352,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 			wret = del_ptr(root, path, level + 1, pslot + 1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(root, blocknr, 1);
+			wret = btrfs_free_extent(root, blocknr, 1, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -388,7 +388,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		wret = del_ptr(root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_extent(root, blocknr, 1);
+		wret = btrfs_free_extent(root, blocknr, 1, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -1310,7 +1310,7 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 			wret = del_ptr(root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(root, leaf_buf->blocknr, 1);
+			wret = btrfs_free_extent(root, leaf_buf->blocknr, 1, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -1348,7 +1348,7 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 				if (wret)
 					ret = wret;
 				btrfs_block_release(root, leaf_buf);
-				wret = btrfs_free_extent(root, blocknr, 1);
+				wret = btrfs_free_extent(root, blocknr, 1, 1);
 				if (wret)
 					ret = wret;
 			} else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7c66b647ea2..d15a2ed9507 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -658,7 +658,8 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 
 struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
 int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
-int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks);
+int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
+		      int pin);
 int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key,
 		struct btrfs_path *p, int ins_len, int cow);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c81e14162ef..4a40282b45f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -95,6 +95,7 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
 int btrfs_finish_extent_commit(struct btrfs_root *root)
 {
 	unsigned long gang[8];
+	u64 first = 0;
 	int ret;
 	int i;
 
@@ -104,11 +105,13 @@ int btrfs_finish_extent_commit(struct btrfs_root *root)
 						 ARRAY_SIZE(gang));
 		if (!ret)
 			break;
+		if (!first)
+			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			radix_tree_delete(&root->pinned_radix, gang[i]);
 		}
 	}
-	root->last_insert.objectid = 0;
+	root->last_insert.objectid = first;
 	root->last_insert.offset = 0;
 	return 0;
 }
@@ -140,7 +143,8 @@ static int finish_current_insert(struct btrfs_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
+static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
+			 int pin)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -150,6 +154,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 	struct btrfs_key ins;
 	u32 refs;
 
+	BUG_ON(pin && num_blocks != 1);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -170,7 +175,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
 	if (refs == 0) {
-		if (!root->ref_cows) {
+		if (pin) {
 			int err;
 			radix_tree_preload(GFP_KERNEL);
 			err = radix_tree_insert(&extent_root->pinned_radix,
@@ -179,8 +184,7 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 			radix_tree_preload_end();
 		}
 		ret = btrfs_del_item(extent_root, &path);
-		if (root != extent_root &&
-		    extent_root->last_insert.objectid > blocknr)
+		if (!pin && extent_root->last_insert.objectid > blocknr)
 			extent_root->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
@@ -208,7 +212,8 @@ static int del_pending_extents(struct btrfs_root *extent_root)
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			ret = __free_extent(extent_root, gang[i]->blocknr, 1);
+			ret = __free_extent(extent_root,
+					    gang[i]->blocknr, 1, 1);
 			radix_tree_tag_clear(&extent_root->cache_radix,
 						gang[i]->blocknr,
 						CTREE_EXTENT_PENDING_DEL);
@@ -230,7 +235,8 @@ static int run_pending(struct btrfs_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
+int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
+		      int pin)
 {
 	struct btrfs_root *extent_root = root->extent_root;
 	struct btrfs_buffer *t;
@@ -243,7 +249,7 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks)
 				   CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
-	ret = __free_extent(root, blocknr, num_blocks);
+	ret = __free_extent(root, blocknr, num_blocks, pin);
 	pending_ret = run_pending(root->extent_root);
 	return ret ? ret : pending_ret;
 }
@@ -451,7 +457,7 @@ static int walk_down_tree(struct btrfs_root *root,
 		ret = lookup_block_ref(root, blocknr, &refs);
 		if (refs != 1 || *level == 1) {
 			path->slots[*level]++;
-			ret = btrfs_free_extent(root, blocknr, 1);
+			ret = btrfs_free_extent(root, blocknr, 1, 1);
 			BUG_ON(ret);
 			continue;
 		}
@@ -464,7 +470,7 @@ static int walk_down_tree(struct btrfs_root *root,
 		path->slots[*level] = 0;
 	}
 out:
-	ret = btrfs_free_extent(root, path->nodes[*level]->blocknr, 1);
+	ret = btrfs_free_extent(root, path->nodes[*level]->blocknr, 1, 1);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -492,7 +498,7 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
 			return 0;
 		} else {
 			ret = btrfs_free_extent(root,
-					  path->nodes[*level]->blocknr, 1);
+					  path->nodes[*level]->blocknr, 1, 1);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-- 
cgit v1.2.3


From e089f05c18ab36ed5fa7e2319052e03ab800d518 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Mar 2007 16:20:31 -0400
Subject: Btrfs: transaction handles everywhere

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   4 +-
 fs/btrfs/TODO          |   2 +
 fs/btrfs/ctree.c       | 204 +++++++++++++++++++++++++------------------------
 fs/btrfs/ctree.h       |  63 +++++++++------
 fs/btrfs/debug-tree.c  |   1 +
 fs/btrfs/dir-item.c    |  20 +++--
 fs/btrfs/dir-test.c    |  70 ++++++++++-------
 fs/btrfs/disk-io.c     |  60 +++++++++------
 fs/btrfs/disk-io.h     |  16 ++--
 fs/btrfs/extent-tree.c | 135 +++++++++++++++++---------------
 fs/btrfs/inode-item.c  |  15 ++--
 fs/btrfs/print-tree.h  |   4 +-
 fs/btrfs/quick-test.c  |  23 +++---
 fs/btrfs/random-test.c |  62 ++++++++-------
 fs/btrfs/root-tree.c   |  23 +++---
 fs/btrfs/transaction.h |  27 +++++++
 16 files changed, 426 insertions(+), 303 deletions(-)
 create mode 100644 fs/btrfs/transaction.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 36964f54bb4..76bcd87d857 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS = -g -Wall
+CFLAGS = -g -Wall -Werror
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
 	  root-tree.o dir-item.o hash.o file-item.o inode-item.o
@@ -14,7 +14,7 @@ check=sparse $(CHECKFLAGS)
 	$(check) $<
 	$(CC) $(CFLAGS) -c $<
 
-all: tester debug-tree quick-test dir-test
+all: tester debug-tree quick-test dir-test tags
 
 debug-tree: $(objects) debug-tree.o
 	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 2ae4b3aae1e..e9894999322 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,6 +8,8 @@
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
+* Only pin blocks allocated in this transaction
+* Add transaction handles
 * Do actual block accounting
 * Check compat and incompat flags on the inode
 * Port into the kernel
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 13128b5ed65..17a3ff2f182 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -6,17 +6,18 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-static int split_node(struct btrfs_root *root, struct btrfs_path *path,
-		      int level);
-static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
-		      int data_size);
-static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst,
-			  struct btrfs_buffer *src);
-static int balance_node_right(struct btrfs_root *root,
-			      struct btrfs_buffer *dst_buf,
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, int level);
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, int data_size);
+static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_buffer *dst, struct btrfs_buffer
+			  *src);
+static int balance_node_right(struct btrfs_trans_handle *trans, struct
+			      btrfs_root *root, struct btrfs_buffer *dst_buf,
 			      struct btrfs_buffer *src_buf);
-static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
-		   int slot);
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path, int level, int slot);
 
 inline void btrfs_init_path(struct btrfs_path *p)
 {
@@ -34,11 +35,10 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
-static int btrfs_cow_block(struct btrfs_root *root,
-		    struct btrfs_buffer *buf,
-		    struct btrfs_buffer *parent,
-		    int parent_slot,
-		    struct btrfs_buffer **cow_ret)
+static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_buffer *buf, struct btrfs_buffer
+			   *parent, int parent_slot, struct btrfs_buffer
+			   **cow_ret)
 {
 	struct btrfs_buffer *cow;
 
@@ -46,22 +46,22 @@ static int btrfs_cow_block(struct btrfs_root *root,
 		*cow_ret = buf;
 		return 0;
 	}
-	cow = btrfs_alloc_free_block(root);
+	cow = btrfs_alloc_free_block(trans, root);
 	memcpy(&cow->node, &buf->node, root->blocksize);
 	btrfs_set_header_blocknr(&cow->node.header, cow->blocknr);
 	*cow_ret = cow;
-	btrfs_inc_ref(root, buf);
+	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
 		cow->count++;
 		if (buf != root->commit_root)
-			btrfs_free_extent(root, buf->blocknr, 1, 1);
+			btrfs_free_extent(trans, root, buf->blocknr, 1, 1);
 		btrfs_block_release(root, buf);
 	} else {
 		btrfs_set_node_blockptr(&parent->node, parent_slot,
 					cow->blocknr);
 		BUG_ON(list_empty(&parent->dirty));
-		btrfs_free_extent(root, buf->blocknr, 1, 1);
+		btrfs_free_extent(trans, root, buf->blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
 	return 0;
@@ -266,8 +266,8 @@ static struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
 	return read_tree_block(root, btrfs_node_blockptr(node, slot));
 }
 
-static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
-			int level)
+static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
+			 *root, struct btrfs_path *path, int level)
 {
 	struct btrfs_buffer *right_buf;
 	struct btrfs_buffer *mid_buf;
@@ -310,8 +310,8 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		btrfs_block_release(root, mid_buf);
 		/* once for the root ptr */
 		btrfs_block_release(root, mid_buf);
-		clean_tree_block(root, mid_buf);
-		return btrfs_free_extent(root, blocknr, 1, 1);
+		clean_tree_block(trans, root, mid_buf);
+		return btrfs_free_extent(trans, root, blocknr, 1, 1);
 	}
 	parent = &parent_buf->node;
 
@@ -324,11 +324,11 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 
 	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
-		btrfs_cow_block(root, left_buf, parent_buf,
-				pslot - 1, &left_buf);
+		btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1,
+				&left_buf);
 		left = &left_buf->node;
 		orig_slot += btrfs_header_nritems(&left->header);
-		wret = push_node_left(root, left_buf, mid_buf);
+		wret = push_node_left(trans, root, left_buf, mid_buf);
 		if (wret < 0)
 			ret = wret;
 	}
@@ -337,22 +337,23 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right_buf) {
-		btrfs_cow_block(root, right_buf, parent_buf,
-				pslot + 1, &right_buf);
+		btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1,
+				&right_buf);
 		right = &right_buf->node;
-		wret = push_node_left(root, mid_buf, right_buf);
+		wret = push_node_left(trans, root, mid_buf, right_buf);
 		if (wret < 0)
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
 			u64 blocknr = right_buf->blocknr;
 			btrfs_block_release(root, right_buf);
-			clean_tree_block(root, right_buf);
+			clean_tree_block(trans, root, right_buf);
 			right_buf = NULL;
 			right = NULL;
-			wret = del_ptr(root, path, level + 1, pslot + 1);
+			wret = del_ptr(trans, root, path, level + 1, pslot +
+				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(root, blocknr, 1, 1);
+			wret = btrfs_free_extent(trans, root, blocknr, 1, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -373,7 +374,7 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		 * right
 		 */
 		BUG_ON(!left_buf);
-		wret = balance_node_right(root, mid_buf, left_buf);
+		wret = balance_node_right(trans, root, mid_buf, left_buf);
 		if (wret < 0)
 			ret = wret;
 		BUG_ON(wret == 1);
@@ -382,13 +383,13 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
 		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->blocknr;
 		btrfs_block_release(root, mid_buf);
-		clean_tree_block(root, mid_buf);
+		clean_tree_block(trans, root, mid_buf);
 		mid_buf = NULL;
 		mid = NULL;
-		wret = del_ptr(root, path, level + 1, pslot);
+		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_extent(root, blocknr, 1, 1);
+		wret = btrfs_free_extent(trans, root, blocknr, 1, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -438,8 +439,9 @@ static int balance_level(struct btrfs_root *root, struct btrfs_path *path,
  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
  * possible)
  */
-int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key,
-		struct btrfs_path *p, int ins_len, int cow)
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_path *p, int
+		      ins_len, int cow)
 {
 	struct btrfs_buffer *b;
 	struct btrfs_buffer *cow_buf;
@@ -455,8 +457,9 @@ again:
 		level = btrfs_header_level(&b->node.header);
 		if (cow) {
 			int wret;
-			wret = btrfs_cow_block(root, b, p->nodes[level + 1],
-					       p->slots[level + 1], &cow_buf);
+			wret = btrfs_cow_block(trans, root, b, p->nodes[level +
+					       1], p->slots[level + 1],
+					       &cow_buf);
 			b = cow_buf;
 		}
 		BUG_ON(!cow && ins_len);
@@ -472,7 +475,7 @@ again:
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_header_nritems(&c->header) ==
 			    BTRFS_NODEPTRS_PER_BLOCK(root)) {
-				int sret = split_node(root, p, level);
+				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
 					return sret;
@@ -480,7 +483,8 @@ again:
 				c = &b->node;
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
-				int sret = balance_level(root, p, level);
+				int sret = balance_level(trans, root, p,
+							 level);
 				if (sret)
 					return sret;
 				b = p->nodes[level];
@@ -496,7 +500,7 @@ again:
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, l) <
 			    sizeof(struct btrfs_item) + ins_len) {
-				int sret = split_leaf(root, p, ins_len);
+				int sret = split_leaf(trans, root, p, ins_len);
 				BUG_ON(sret > 0);
 				if (sret)
 					return sret;
@@ -519,9 +523,9 @@ again:
  * If this fails to write a tree block, it returns -1, but continues
  * fixing up the blocks in ram so the tree is consistent.
  */
-static int fixup_low_keys(struct btrfs_root *root,
-			   struct btrfs_path *path, struct btrfs_disk_key *key,
-			   int level)
+static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, struct btrfs_disk_key
+			  *key, int level)
 {
 	int i;
 	int ret = 0;
@@ -546,8 +550,9 @@ static int fixup_low_keys(struct btrfs_root *root,
  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  * error, and > 0 if there was no room in the left hand block.
  */
-static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf,
-			  struct btrfs_buffer *src_buf)
+static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_buffer *dst_buf, struct
+			  btrfs_buffer *src_buf)
 {
 	struct btrfs_node *src = &src_buf->node;
 	struct btrfs_node *dst = &dst_buf->node;
@@ -589,8 +594,8 @@ static int push_node_left(struct btrfs_root *root, struct btrfs_buffer *dst_buf,
  *
  * this will  only push up to 1/2 the contents of the left node over
  */
-static int balance_node_right(struct btrfs_root *root,
-			      struct btrfs_buffer *dst_buf,
+static int balance_node_right(struct btrfs_trans_handle *trans, struct
+			      btrfs_root *root, struct btrfs_buffer *dst_buf,
 			      struct btrfs_buffer *src_buf)
 {
 	struct btrfs_node *src = &src_buf->node;
@@ -635,8 +640,8 @@ static int balance_node_right(struct btrfs_root *root,
  *
  * returns zero on success or < 0 on failure.
  */
-static int insert_new_root(struct btrfs_root *root,
-			   struct btrfs_path *path, int level)
+static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_path *path, int level)
 {
 	struct btrfs_buffer *t;
 	struct btrfs_node *lower;
@@ -646,7 +651,7 @@ static int insert_new_root(struct btrfs_root *root,
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(root);
+	t = btrfs_alloc_free_block(trans, root);
 	c = &t->node;
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
@@ -679,9 +684,9 @@ static int insert_new_root(struct btrfs_root *root,
  *
  * returns zero on success and < 0 on any error
  */
-static int insert_ptr(struct btrfs_root *root,
-		struct btrfs_path *path, struct btrfs_disk_key *key,
-		u64 blocknr, int slot, int level)
+static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, struct btrfs_disk_key
+		      *key, u64 blocknr, int slot, int level)
 {
 	struct btrfs_node *lower;
 	int nritems;
@@ -713,8 +718,8 @@ static int insert_ptr(struct btrfs_root *root,
  *
  * returns 0 on success and < 0 on failure
  */
-static int split_node(struct btrfs_root *root, struct btrfs_path *path,
-		      int level)
+static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, int level)
 {
 	struct btrfs_buffer *t;
 	struct btrfs_node *c;
@@ -729,12 +734,12 @@ static int split_node(struct btrfs_root *root, struct btrfs_path *path,
 	c = &t->node;
 	if (t == root->node) {
 		/* trying to split the root, lets make a new one */
-		ret = insert_new_root(root, path, level + 1);
+		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
 	}
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(root);
+	split_buffer = btrfs_alloc_free_block(trans, root);
 	split = &split_buffer->node;
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_blocknr(&split->header, split_buffer->blocknr);
@@ -748,7 +753,7 @@ static int split_node(struct btrfs_root *root, struct btrfs_path *path,
 	ret = 0;
 
 	BUG_ON(list_empty(&t->dirty));
-	wret = insert_ptr(root, path, &split->ptrs[0].key,
+	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
 			  split_buffer->blocknr, path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
@@ -790,8 +795,8 @@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
  * returns 1 if the push failed because the other node didn't have enough
  * room, 0 if everything worked out and < 0 if there were major errors.
  */
-static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
-			   int data_size)
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct btrfs_path *path, int data_size)
 {
 	struct btrfs_buffer *left_buf = path->nodes[0];
 	struct btrfs_leaf *left = &left_buf->leaf;
@@ -824,7 +829,7 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
 		return 1;
 	}
 	/* cow and double check */
-	btrfs_cow_block(root, right_buf, upper, slot + 1, &right_buf);
+	btrfs_cow_block(trans, root, right_buf, upper, slot + 1, &right_buf);
 	right = &right_buf->leaf;
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -897,8 +902,8 @@ static int push_leaf_right(struct btrfs_root *root, struct btrfs_path *path,
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
-static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
-			  int data_size)
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, int data_size)
 {
 	struct btrfs_buffer *right_buf = path->nodes[0];
 	struct btrfs_leaf *right = &right_buf->leaf;
@@ -931,7 +936,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 	}
 
 	/* cow and double check */
-	btrfs_cow_block(root, t, path->nodes[1], slot - 1, &t);
+	btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
 	left = &t->leaf;
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -997,7 +1002,7 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
 	BUG_ON(list_empty(&t->dirty));
 	BUG_ON(list_empty(&right_buf->dirty));
 
-	wret = fixup_low_keys(root, path, &right->items[0].key, 1);
+	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
 		ret = wret;
 
@@ -1021,8 +1026,8 @@ static int push_leaf_left(struct btrfs_root *root, struct btrfs_path *path,
  *
  * returns 0 if all went well and < 0 on failure.
  */
-static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
-		      int data_size)
+static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, int data_size)
 {
 	struct btrfs_buffer *l_buf;
 	struct btrfs_leaf *l;
@@ -1038,11 +1043,11 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	int ret;
 	int wret;
 
-	wret = push_leaf_left(root, path, data_size);
+	wret = push_leaf_left(trans, root, path, data_size);
 	if (wret < 0)
 		return wret;
 	if (wret) {
-		wret = push_leaf_right(root, path, data_size);
+		wret = push_leaf_right(trans, root, path, data_size);
 		if (wret < 0)
 			return wret;
 	}
@@ -1055,14 +1060,14 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		return 0;
 
 	if (!path->nodes[1]) {
-		ret = insert_new_root(root, path, 1);
+		ret = insert_new_root(trans, root, path, 1);
 		if (ret)
 			return ret;
 	}
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
-	right_buffer = btrfs_alloc_free_block(root);
+	right_buffer = btrfs_alloc_free_block(trans, root);
 	BUG_ON(!right_buffer);
 	BUG_ON(mid == nritems);
 	right = &right_buffer->leaf;
@@ -1100,7 +1105,7 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
 
 	btrfs_set_header_nritems(&l->header, mid);
 	ret = 0;
-	wret = insert_ptr(root, path, &right->items[0].key,
+	wret = insert_ptr(trans, root, path, &right->items[0].key,
 			  right_buffer->blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
@@ -1122,8 +1127,9 @@ static int split_leaf(struct btrfs_root *root, struct btrfs_path *path,
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 data_size)
+int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *root, struct btrfs_path *path, struct btrfs_key
+			    *cpu_key, u32 data_size)
 {
 	int ret = 0;
 	int slot;
@@ -1139,7 +1145,7 @@ int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
-	ret = btrfs_search_slot(root, cpu_key, path, data_size, 1);
+	ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1);
 	if (ret == 0) {
 		btrfs_release_path(root, path);
 		return -EEXIST;
@@ -1193,7 +1199,7 @@ int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
 
 	ret = 0;
 	if (slot == 0)
-		ret = fixup_low_keys(root, path, &disk_key, 1);
+		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 
 	BUG_ON(list_empty(&leaf_buf->dirty));
 	if (btrfs_leaf_free_space(root, leaf) < 0)
@@ -1207,15 +1213,16 @@ out:
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
-			  void *data, u32 data_size)
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *cpu_key, void *data, u32
+		      data_size)
 {
 	int ret = 0;
 	struct btrfs_path path;
 	u8 *ptr;
 
 	btrfs_init_path(&path);
-	ret = btrfs_insert_empty_item(root, &path, cpu_key, data_size);
+	ret = btrfs_insert_empty_item(trans, root, &path, cpu_key, data_size);
 	if (!ret) {
 		ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8);
 		memcpy(ptr, data, data_size);
@@ -1231,8 +1238,8 @@ int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *cpu_key,
  * continuing all the way the root if required.  The root is converted into
  * a leaf if all the nodes are emptied.
  */
-static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
-		   int slot)
+static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path, int level, int slot)
 {
 	struct btrfs_node *node;
 	struct btrfs_buffer *parent = path->nodes[level];
@@ -1253,7 +1260,7 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
 		/* just turn the root into a leaf and break */
 		btrfs_set_header_level(&root->node->node.header, 0);
 	} else if (slot == 0) {
-		wret = fixup_low_keys(root, path, &node->ptrs[0].key,
+		wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key,
 				      level + 1);
 		if (wret)
 			ret = wret;
@@ -1266,7 +1273,8 @@ static int del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level,
  * delete the item at the leaf level in path.  If that empties
  * the leaf, remove it from the tree
  */
-int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
+int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path)
 {
 	int slot;
 	struct btrfs_leaf *leaf;
@@ -1306,19 +1314,20 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 			btrfs_set_header_level(&leaf->header, 0);
 			BUG_ON(list_empty(&leaf_buf->dirty));
 		} else {
-			clean_tree_block(root, leaf_buf);
-			wret = del_ptr(root, path, 1, path->slots[1]);
+			clean_tree_block(trans, root, leaf_buf);
+			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(root, leaf_buf->blocknr, 1, 1);
+			wret = btrfs_free_extent(trans, root,
+						 leaf_buf->blocknr, 1, 1);
 			if (wret)
 				ret = wret;
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, nritems);
 		if (slot == 0) {
-			wret = fixup_low_keys(root, path,
-						   &leaf->items[0].key, 1);
+			wret = fixup_low_keys(trans, root, path,
+					      &leaf->items[0].key, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -1332,23 +1341,24 @@ int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path)
 			 */
 			slot = path->slots[1];
 			leaf_buf->count++;
-			wret = push_leaf_left(root, path, 1);
+			wret = push_leaf_left(trans, root, path, 1);
 			if (wret < 0)
 				ret = wret;
 			if (path->nodes[0] == leaf_buf &&
 			    btrfs_header_nritems(&leaf->header)) {
-				wret = push_leaf_right(root, path, 1);
+				wret = push_leaf_right(trans, root, path, 1);
 				if (wret < 0)
 					ret = wret;
 			}
 			if (btrfs_header_nritems(&leaf->header) == 0) {
 				u64 blocknr = leaf_buf->blocknr;
-				clean_tree_block(root, leaf_buf);
-				wret = del_ptr(root, path, 1, slot);
+				clean_tree_block(trans, root, leaf_buf);
+				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
 					ret = wret;
 				btrfs_block_release(root, leaf_buf);
-				wret = btrfs_free_extent(root, blocknr, 1, 1);
+				wret = btrfs_free_extent(trans, root, blocknr,
+							 1, 1);
 				if (wret)
 					ret = wret;
 			} else {
@@ -1401,5 +1411,3 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	}
 	return 0;
 }
-
-
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d15a2ed9507..68f0af39777 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4,6 +4,8 @@
 #include "list.h"
 #include "kerncompat.h"
 
+struct btrfs_trans_handle;
+
 #define BTRFS_MAGIC "_BtRfS_M"
 
 #define BTRFS_ROOT_TREE_OBJECTID 1
@@ -200,6 +202,7 @@ struct btrfs_root {
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	u32 blocksize;
+	struct btrfs_trans_handle *running_transaction;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -656,34 +659,46 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
-struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root);
-int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf);
-int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
-		      int pin);
-int btrfs_search_slot(struct btrfs_root *root, struct btrfs_key *key,
-		struct btrfs_path *p, int ins_len, int cow);
+struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root);
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct btrfs_buffer *buf);
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, u64 blocknr, u64 num_blocks, int pin);
+int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_path *p, int
+		      ins_len, int cow);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
-int btrfs_del_item(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_insert_item(struct btrfs_root *root, struct btrfs_key *key,
-		void *data, u32 data_size);
-int btrfs_insert_empty_item(struct btrfs_root *root, struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 data_size);
+int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path);
+int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *root, struct btrfs_path *path, struct btrfs_key
+			    *cpu_key, u32 data_size);
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap);
-int btrfs_finish_extent_commit(struct btrfs_root *root);
-int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key);
-int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key,
-		      struct btrfs_root_item *item);
-int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
-		      struct btrfs_root_item *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
-			struct btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
-			  u64 dir, u64 objectid, u8 type);
-int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
-			  u64 dir, char *name, int name_len, int mod);
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+			*root, struct btrfs_buffer *snap);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
+			       btrfs_root *root);
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_key *key);
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item);
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item);
+int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
+			 btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, char *name, int name_len, u64 dir, u64
+			  objectid, u8 type);
+int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, u64 dir, char *name,
+			  int name_len, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      char *name, int name_len);
 #endif
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index de45fb4dfdd..91dea7a0a47 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -5,6 +5,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "transaction.h"
 
 int main(int ac, char **av) {
 	struct btrfs_super_block super;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a42a67b9975..949c4e52679 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -5,9 +5,11 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
+#include "transaction.h"
 
-int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
-			  u64 dir, u64 objectid, u8 type)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, char *name, int name_len, u64 dir, u64
+			  objectid, u8 type)
 {
 	int ret = 0;
 	struct btrfs_path path;
@@ -23,7 +25,7 @@ int btrfs_insert_dir_item(struct btrfs_root *root, char *name, int name_len,
 	BUG_ON(ret);
 	btrfs_init_path(&path);
 	data_size = sizeof(*dir_item) + name_len;
-	ret = btrfs_insert_empty_item(root, &path, &key, data_size);
+	ret = btrfs_insert_empty_item(trans, root, &path, &key, data_size);
 	if (ret)
 		goto out;
 
@@ -40,8 +42,9 @@ out:
 	return ret;
 }
 
-int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
-			  u64 dir, char *name, int name_len, int mod)
+int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, u64 dir, char *name,
+			  int name_len, int mod)
 {
 	int ret;
 	struct btrfs_key key;
@@ -53,12 +56,13 @@ int btrfs_lookup_dir_item(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
-	ret = btrfs_search_slot(root, &key, path, ins_len, cow);
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	return ret;
 }
 
-int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
-			      char *name, int name_len)
+int btrfs_match_dir_item_name(struct btrfs_root *root,
+			      struct btrfs_path *path, char
+			      *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
index f73aa762339..e908c0c588c 100644
--- a/fs/btrfs/dir-test.c
+++ b/fs/btrfs/dir-test.c
@@ -8,6 +8,7 @@
 #include "disk-io.h"
 #include "print-tree.h"
 #include "hash.h"
+#include "transaction.h"
 
 int keep_running = 1;
 struct btrfs_super_block super;
@@ -38,7 +39,8 @@ again:
 	return 0;
 }
 
-static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
+static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	int ret;
 	char buf[128];
@@ -48,8 +50,8 @@ static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
 	find_num(radix, &oid, 0);
 	sprintf(buf, "str-%lu", oid);
 
-	ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid,
-				    1);
+	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
+				    file_oid, 1);
 	if (ret)
 		goto error;
 
@@ -68,7 +70,8 @@ error:
 	 * check
 	 */
 	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
+				    strlen(buf), 0);
 	if (ret)
 		goto fatal_release;
 	if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) {
@@ -96,7 +99,8 @@ fatal:
 	return -1;
 }
 
-static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
+static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct radix_tree_root *radix)
 {
 	int ret;
 	char buf[128];
@@ -107,8 +111,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
 		return 0;
 	sprintf(buf, "str-%lu", oid);
 
-	ret = btrfs_insert_dir_item(root, buf, strlen(buf), dir_oid, file_oid,
-				    1);
+	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
+				    file_oid, 1);
 	if (ret != -EEXIST) {
 		printf("insert on %s gave us %d\n", buf, ret);
 		return 1;
@@ -116,7 +120,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
 	return 0;
 }
 
-static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
+static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	int ret;
 	char buf[128];
@@ -129,10 +134,11 @@ static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
 		return 0;
 	sprintf(buf, "str-%lu", oid);
 	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), -1);
+	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
+				    strlen(buf), -1);
 	if (ret)
 		goto out_release;
-	ret = btrfs_del_item(root, &path);
+	ret = btrfs_del_item(trans, root, &path);
 	if (ret)
 		goto out_release;
 	btrfs_release_path(root, &path);
@@ -149,7 +155,8 @@ out:
 	return -1;
 }
 
-static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
+static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	char buf[128];
@@ -161,7 +168,8 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
 		return 0;
 	sprintf(buf, "str-%lu", oid);
 	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
+				    strlen(buf), 0);
 	btrfs_release_path(root, &path);
 	if (ret) {
 		printf("unable to find key %lu\n", oid);
@@ -170,7 +178,8 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
 	return 0;
 }
 
-static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
+static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root
+			 *root, struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	char buf[128];
@@ -182,7 +191,8 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
 		return 0;
 	sprintf(buf, "str-%lu", oid);
 	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(root, &path, dir_oid, buf, strlen(buf), 0);
+	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
+				    strlen(buf), 0);
 	btrfs_release_path(root, &path);
 	if (!ret) {
 		printf("able to find key that should not exist %lu\n", oid);
@@ -191,8 +201,8 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
 	return 0;
 }
 
-static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
-		      int nr)
+static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct radix_tree_root *radix, int nr)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -211,7 +221,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 	key.objectid = dir_oid;
 	while(nr-- >= 0) {
 		btrfs_init_path(&path);
-		ret = btrfs_search_slot(root, &key, &path, -1, 1);
+		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
 		if (ret < 0) {
 			btrfs_release_path(root, &path);
 			return ret;
@@ -231,7 +241,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 		BUG_ON(found_len > 128);
 		buf[found_len] = '\0';
 		found = atoi(buf + 4);
-		ret = btrfs_del_item(root, &path);
+		ret = btrfs_del_item(trans, root, &path);
 		count++;
 		if (ret) {
 			fprintf(stderr,
@@ -252,19 +262,19 @@ error:
 	return -1;
 }
 
-static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix,
-		     int count)
+static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct radix_tree_root *radix, int count)
 {
 	int i;
 	int ret = 0;
 	for (i = 0; i < count; i++) {
-		ret = ins_one(root, radix);
+		ret = ins_one(trans, root, radix);
 		if (ret) {
 			fprintf(stderr, "fill failed\n");
 			goto out;
 		}
 		if (i % 1000 == 0) {
-			ret = btrfs_commit_transaction(root, &super);
+			ret = btrfs_commit_transaction(trans, root, &super);
 			if (ret) {
 				fprintf(stderr, "fill commit failed\n");
 				return ret;
@@ -280,7 +290,8 @@ out:
 	return ret;
 }
 
-static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
+static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	int ret;
 	int nr = rand() % 5000;
@@ -289,17 +300,18 @@ static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
 	/* do the bulk op much less frequently */
 	if (run_nr++ % 100)
 		return 0;
-	ret = empty_tree(root, radix, nr);
+	ret = empty_tree(trans, root, radix, nr);
 	if (ret)
 		return ret;
-	ret = fill_tree(root, radix, nr);
+	ret = fill_tree(trans, root, radix, nr);
 	if (ret)
 		return ret;
 	return 0;
 }
 
 
-int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) =
+int (*ops[])(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct
+	     radix_tree_root *radix) =
 	{ ins_one, insert_dup, del_one, lookup_item,
 	  lookup_enoent, bulk_op };
 
@@ -330,11 +342,13 @@ int main(int ac, char **av)
 	int init_fill_count = 800000;
 	int err = 0;
 	int initial_only = 0;
+	struct btrfs_trans_handle *trans;
 	radix_tree_init();
 
 	printf("removing old tree\n");
 	unlink("dbfile");
 	root = open_ctree("dbfile", &super);
+	trans = btrfs_start_transaction(root, 1);
 
 	signal(SIGTERM, sigstopper);
 	signal(SIGINT, sigstopper);
@@ -353,7 +367,7 @@ int main(int ac, char **av)
 		}
 	}
 	printf("initial fill\n");
-	ret = fill_tree(root, &radix, init_fill_count);
+	ret = fill_tree(trans, root, &radix, init_fill_count);
 	printf("starting run\n");
 	if (ret) {
 		err = ret;
@@ -377,7 +391,7 @@ int main(int ac, char **av)
 			root = open_ctree("dbfile", &super);
 		}
 		while(count--) {
-			ret = ops[op](root, &radix);
+			ret = ops[op](trans, root, &radix);
 			if (ret) {
 				fprintf(stderr, "op %d failed %d:%d\n",
 					op, i, iterations);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d9457b5aef..25ce07908ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -9,6 +9,7 @@
 #include "radix-tree.h"
 #include "ctree.h"
 #include "disk-io.h"
+#include "transaction.h"
 
 static int allocated_blocks = 0;
 int cache_max = 10000;
@@ -107,7 +108,8 @@ struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 	return buf;
 }
 
-int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
+int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_buffer *buf)
 {
 	if (!list_empty(&buf->dirty))
 		return 0;
@@ -116,7 +118,8 @@ int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 	return 0;
 }
 
-int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
+int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_buffer *buf)
 {
 	if (!list_empty(&buf->dirty)) {
 		list_del_init(&buf->dirty);
@@ -125,7 +128,8 @@ int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 	return 0;
 }
 
-int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
+int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_buffer *buf)
 {
 	u64 blocknr = buf->blocknr;
 	loff_t offset = blocknr * root->blocksize;
@@ -139,7 +143,8 @@ int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
 	return 0;
 }
 
-static int __commit_transaction(struct btrfs_root *root)
+static int __commit_transaction(struct btrfs_trans_handle *trans, struct
+				btrfs_root *root)
 {
 	struct btrfs_buffer *b;
 	int ret = 0;
@@ -147,7 +152,7 @@ static int __commit_transaction(struct btrfs_root *root)
 	while(!list_empty(&root->trans)) {
 		b = list_entry(root->trans.next, struct btrfs_buffer, dirty);
 		list_del_init(&b->dirty);
-		wret = write_tree_block(root, b);
+		wret = write_tree_block(trans, root, b);
 		if (wret)
 			ret = wret;
 		btrfs_block_release(root, b);
@@ -155,8 +160,9 @@ static int __commit_transaction(struct btrfs_root *root)
 	return ret;
 }
 
-static int commit_extent_and_tree_roots(struct btrfs_root *tree_root,
-					struct btrfs_root *extent_root)
+static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans,
+					struct btrfs_root *tree_root, struct
+					btrfs_root *extent_root)
 {
 	int ret;
 	u64 old_extent_block;
@@ -167,24 +173,24 @@ static int commit_extent_and_tree_roots(struct btrfs_root *tree_root,
 			break;
 		btrfs_set_root_blocknr(&extent_root->root_item,
 				       extent_root->node->blocknr);
-		ret = btrfs_update_root(tree_root,
+		ret = btrfs_update_root(trans, tree_root,
 					&extent_root->root_key,
 					&extent_root->root_item);
 		BUG_ON(ret);
 	}
-	__commit_transaction(extent_root);
-	__commit_transaction(tree_root);
+	__commit_transaction(trans, extent_root);
+	__commit_transaction(trans, tree_root);
 	return 0;
 }
 
-int btrfs_commit_transaction(struct btrfs_root *root,
-			     struct btrfs_super_block *s)
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
+			     btrfs_root *root, struct btrfs_super_block *s)
 {
 	int ret = 0;
 	struct btrfs_buffer *snap = root->commit_root;
 	struct btrfs_key snap_key;
 
-	ret = __commit_transaction(root);
+	ret = __commit_transaction(trans, root);
 	BUG_ON(ret);
 
 	if (root->commit_root == root->node)
@@ -194,23 +200,24 @@ int btrfs_commit_transaction(struct btrfs_root *root,
 	root->root_key.offset++;
 
 	btrfs_set_root_blocknr(&root->root_item, root->node->blocknr);
-	ret = btrfs_insert_root(root->tree_root, &root->root_key,
+	ret = btrfs_insert_root(trans, root->tree_root, &root->root_key,
 				&root->root_item);
 	BUG_ON(ret);
 
-	ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root);
+	ret = commit_extent_and_tree_roots(trans, root->tree_root,
+					   root->extent_root);
 	BUG_ON(ret);
 
-        write_ctree_super(root, s);
-	btrfs_finish_extent_commit(root->extent_root);
-	btrfs_finish_extent_commit(root->tree_root);
+	write_ctree_super(trans, root, s);
+	btrfs_finish_extent_commit(trans, root->extent_root);
+	btrfs_finish_extent_commit(trans, root->tree_root);
 
 	root->commit_root = root->node;
 	root->node->count++;
-	ret = btrfs_drop_snapshot(root, snap);
+	ret = btrfs_drop_snapshot(trans, root, snap);
 	BUG_ON(ret);
 
-	ret = btrfs_del_root(root->tree_root, &snap_key);
+	ret = btrfs_del_root(trans, root->tree_root, &snap_key);
 	BUG_ON(ret);
 
 	return ret;
@@ -312,7 +319,8 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	return root;
 }
 
-int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s)
+int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_super_block *s)
 {
 	int ret;
 	btrfs_set_super_root(s, root->tree_root->node->blocknr);
@@ -338,10 +346,14 @@ static int drop_cache(struct btrfs_root *root)
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
 {
 	int ret;
-	btrfs_commit_transaction(root, s);
-	ret = commit_extent_and_tree_roots(root->tree_root, root->extent_root);
+	struct btrfs_trans_handle *trans;
+
+	trans = root->running_transaction;
+	btrfs_commit_transaction(trans, root, s);
+	ret = commit_extent_and_tree_roots(trans, root->tree_root,
+					   root->extent_root);
 	BUG_ON(ret);
-	write_ctree_super(root, s);
+	write_ctree_super(trans, root, s);
 	drop_cache(root->extent_root);
 	drop_cache(root->tree_root);
 	drop_cache(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5771bb90acb..24a9e77c831 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -15,15 +15,19 @@ struct btrfs_buffer {
 
 struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
 struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr);
-int write_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
-int dirty_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
-int clean_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf);
-int btrfs_commit_transaction(struct btrfs_root *root,
-			     struct btrfs_super_block *s);
+int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_buffer *buf);
+int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_buffer *buf);
+int clean_tree_block(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct btrfs_buffer *buf);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root
+			     *root, struct btrfs_super_block *s);
 struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
 void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
-int write_ctree_super(struct btrfs_root *root, struct btrfs_super_block *s);
+int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      struct btrfs_super_block *s);
 int mkfs(int fd, u64 num_blocks, u32 blocksize);
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4a40282b45f..c29b92d440e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5,12 +5,15 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "transaction.h"
 
-static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks,
-			    u64 search_start, u64 search_end,
-			    struct btrfs_key *ins);
-static int finish_current_insert(struct btrfs_root *extent_root);
-static int run_pending(struct btrfs_root *extent_root);
+static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *orig_root, u64 num_blocks, u64 search_start, u64
+			    search_end, struct btrfs_key *ins);
+static int finish_current_insert(struct btrfs_trans_handle *trans, struct
+				 btrfs_root *extent_root);
+static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *extent_root);
 
 /*
  * pending extents are blocks that we're trying to allocate in the extent
@@ -21,7 +24,8 @@ static int run_pending(struct btrfs_root *extent_root);
  */
 #define CTREE_EXTENT_PENDING_DEL 0
 
-static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
+static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
+			 *root, u64 blocknr)
 {
 	struct btrfs_path path;
 	int ret;
@@ -31,13 +35,13 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
 	struct btrfs_key ins;
 	u32 refs;
 
-	find_free_extent(root->extent_root, 0, 0, (u64)-1, &ins);
+	find_free_extent(trans, root->extent_root, 0, 0, (u64)-1, &ins);
 	btrfs_init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = 1;
-	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 1);
+	ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 1);
 	if (ret != 0)
 		BUG();
 	BUG_ON(ret != 0);
@@ -48,12 +52,13 @@ static int inc_block_ref(struct btrfs_root *root, u64 blocknr)
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
 	btrfs_release_path(root->extent_root, &path);
-	finish_current_insert(root->extent_root);
-	run_pending(root->extent_root);
+	finish_current_insert(trans, root->extent_root);
+	run_pending(trans, root->extent_root);
 	return 0;
 }
 
-static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
+static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *root, u64 blocknr, u32 *refs)
 {
 	struct btrfs_path path;
 	int ret;
@@ -65,7 +70,7 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
 	key.offset = 1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(root->extent_root, &key, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 0);
 	if (ret != 0)
 		BUG();
 	l = &path.nodes[0]->leaf;
@@ -75,7 +80,8 @@ static int lookup_block_ref(struct btrfs_root *root, u64 blocknr, u32 *refs)
 	return 0;
 }
 
-int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct btrfs_buffer *buf)
 {
 	u64 blocknr;
 	int i;
@@ -87,12 +93,13 @@ int btrfs_inc_ref(struct btrfs_root *root, struct btrfs_buffer *buf)
 
 	for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) {
 		blocknr = btrfs_node_blockptr(&buf->node, i);
-		inc_block_ref(root, blocknr);
+		inc_block_ref(trans, root, blocknr);
 	}
 	return 0;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_root *root)
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
+			       btrfs_root *root)
 {
 	unsigned long gang[8];
 	u64 first = 0;
@@ -116,7 +123,8 @@ int btrfs_finish_extent_commit(struct btrfs_root *root)
 	return 0;
 }
 
-static int finish_current_insert(struct btrfs_root *extent_root)
+static int finish_current_insert(struct btrfs_trans_handle *trans, struct
+				 btrfs_root *extent_root)
 {
 	struct btrfs_key ins;
 	struct btrfs_extent_item extent_item;
@@ -132,8 +140,8 @@ static int finish_current_insert(struct btrfs_root *extent_root)
 
 	for (i = 0; i < extent_root->current_insert.flags; i++) {
 		ins.objectid = extent_root->current_insert.objectid + i;
-		ret = btrfs_insert_item(extent_root, &ins, &extent_item,
-				  sizeof(extent_item));
+		ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item,
+					sizeof(extent_item));
 		BUG_ON(ret);
 	}
 	extent_root->current_insert.offset = 0;
@@ -143,8 +151,8 @@ static int finish_current_insert(struct btrfs_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
-			 int pin)
+static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+			 *root, u64 blocknr, u64 num_blocks, int pin)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -160,9 +168,9 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
-	find_free_extent(root, 0, 0, (u64)-1, &ins);
+	find_free_extent(trans, root, 0, 0, (u64)-1, &ins);
 	btrfs_init_path(&path);
-	ret = btrfs_search_slot(extent_root, &key, &path, -1, 1);
+	ret = btrfs_search_slot(trans, extent_root, &key, &path, -1, 1);
 	if (ret) {
 		printf("failed to find %Lu\n", key.objectid);
 		btrfs_print_tree(extent_root, extent_root->node);
@@ -183,14 +191,14 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
 			BUG_ON(err);
 			radix_tree_preload_end();
 		}
-		ret = btrfs_del_item(extent_root, &path);
+		ret = btrfs_del_item(trans, extent_root, &path);
 		if (!pin && extent_root->last_insert.objectid > blocknr)
 			extent_root->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
 	btrfs_release_path(extent_root, &path);
-	finish_current_insert(extent_root);
+	finish_current_insert(trans, extent_root);
 	return ret;
 }
 
@@ -198,7 +206,8 @@ static int __free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
  */
-static int del_pending_extents(struct btrfs_root *extent_root)
+static int del_pending_extents(struct btrfs_trans_handle *trans, struct
+			       btrfs_root *extent_root)
 {
 	int ret;
 	struct btrfs_buffer *gang[4];
@@ -212,7 +221,7 @@ static int del_pending_extents(struct btrfs_root *extent_root)
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			ret = __free_extent(extent_root,
+			ret = __free_extent(trans, extent_root,
 					    gang[i]->blocknr, 1, 1);
 			radix_tree_tag_clear(&extent_root->cache_radix,
 						gang[i]->blocknr,
@@ -223,11 +232,12 @@ static int del_pending_extents(struct btrfs_root *extent_root)
 	return 0;
 }
 
-static int run_pending(struct btrfs_root *extent_root)
+static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *extent_root)
 {
 	while(radix_tree_tagged(&extent_root->cache_radix,
 			        CTREE_EXTENT_PENDING_DEL))
-		del_pending_extents(extent_root);
+		del_pending_extents(trans, extent_root);
 	return 0;
 }
 
@@ -235,8 +245,8 @@ static int run_pending(struct btrfs_root *extent_root)
 /*
  * remove an extent from the root, returns 0 on success
  */
-int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
-		      int pin)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, u64 blocknr, u64 num_blocks, int pin)
 {
 	struct btrfs_root *extent_root = root->extent_root;
 	struct btrfs_buffer *t;
@@ -249,8 +259,8 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
 				   CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
-	ret = __free_extent(root, blocknr, num_blocks, pin);
-	pending_ret = run_pending(root->extent_root);
+	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
+	pending_ret = run_pending(trans, root->extent_root);
 	return ret ? ret : pending_ret;
 }
 
@@ -262,9 +272,9 @@ int btrfs_free_extent(struct btrfs_root *root, u64 blocknr, u64 num_blocks,
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int find_free_extent(struct btrfs_root *orig_root, u64 num_blocks,
-			    u64 search_start, u64 search_end,
-			    struct btrfs_key *ins)
+static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *orig_root, u64 num_blocks, u64 search_start, u64
+			    search_end, struct btrfs_key *ins)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -290,7 +300,7 @@ check_failed:
 	ins->objectid = search_start;
 	ins->offset = 0;
 	start_found = 0;
-	ret = btrfs_search_slot(root, ins, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root, ins, &path, 0, 0);
 	if (ret < 0)
 		goto error;
 
@@ -367,9 +377,9 @@ error:
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-static int alloc_extent(struct btrfs_root *root, u64 num_blocks,
-			u64 search_start, u64 search_end, u64 owner,
-			struct btrfs_key *ins)
+static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+			*root, u64 num_blocks, u64 search_start, u64
+			search_end, u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -389,16 +399,16 @@ static int alloc_extent(struct btrfs_root *root, u64 num_blocks,
 				extent_root->current_insert.flags++;
 		return 0;
 	}
-	ret = find_free_extent(root, num_blocks, search_start,
+	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, ins);
 	if (ret)
 		return ret;
 
-	ret = btrfs_insert_item(extent_root, ins, &extent_item,
-			  sizeof(extent_item));
+	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
+				sizeof(extent_item));
 
-	finish_current_insert(extent_root);
-	pending_ret = run_pending(extent_root);
+	finish_current_insert(trans, extent_root);
+	pending_ret = run_pending(trans, extent_root);
 	if (ret)
 		return ret;
 	if (pending_ret)
@@ -410,13 +420,14 @@ static int alloc_extent(struct btrfs_root *root, u64 num_blocks,
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
  */
-struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root)
+struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct btrfs_buffer *buf;
 
-	ret = alloc_extent(root, 1, 0, (unsigned long)-1,
+	ret = alloc_extent(trans, root, 1, 0, (unsigned long)-1,
 			   btrfs_header_parentid(&root->node->node.header),
 			   &ins);
 	if (ret) {
@@ -424,7 +435,7 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root)
 		return NULL;
 	}
 	buf = find_tree_block(root, ins.objectid);
-	dirty_tree_block(root, buf);
+	dirty_tree_block(trans, root, buf);
 	return buf;
 }
 
@@ -432,8 +443,8 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_root *root)
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
-static int walk_down_tree(struct btrfs_root *root,
-			  struct btrfs_path *path, int *level)
+static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
+			  *root, struct btrfs_path *path, int *level)
 {
 	struct btrfs_buffer *next;
 	struct btrfs_buffer *cur;
@@ -441,7 +452,8 @@ static int walk_down_tree(struct btrfs_root *root,
 	int ret;
 	u32 refs;
 
-	ret = lookup_block_ref(root, path->nodes[*level]->blocknr, &refs);
+	ret = lookup_block_ref(trans, root, path->nodes[*level]->blocknr,
+			       &refs);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
@@ -454,10 +466,10 @@ static int walk_down_tree(struct btrfs_root *root,
 		    btrfs_header_nritems(&cur->node.header))
 			break;
 		blocknr = btrfs_node_blockptr(&cur->node, path->slots[*level]);
-		ret = lookup_block_ref(root, blocknr, &refs);
+		ret = lookup_block_ref(trans, root, blocknr, &refs);
 		if (refs != 1 || *level == 1) {
 			path->slots[*level]++;
-			ret = btrfs_free_extent(root, blocknr, 1, 1);
+			ret = btrfs_free_extent(trans, root, blocknr, 1, 1);
 			BUG_ON(ret);
 			continue;
 		}
@@ -470,7 +482,8 @@ static int walk_down_tree(struct btrfs_root *root,
 		path->slots[*level] = 0;
 	}
 out:
-	ret = btrfs_free_extent(root, path->nodes[*level]->blocknr, 1, 1);
+	ret = btrfs_free_extent(trans, root, path->nodes[*level]->blocknr, 1,
+				1);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -483,8 +496,8 @@ out:
  * to find the first node higher up where we haven't yet gone through
  * all the slots
  */
-static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
-			int *level)
+static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
+			*root, struct btrfs_path *path, int *level)
 {
 	int i;
 	int slot;
@@ -497,8 +510,9 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
 			*level = i;
 			return 0;
 		} else {
-			ret = btrfs_free_extent(root,
-					  path->nodes[*level]->blocknr, 1, 1);
+			ret = btrfs_free_extent(trans, root,
+						path->nodes[*level]->blocknr,
+						1, 1);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
@@ -513,7 +527,8 @@ static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path,
  * the tree freeing any blocks that have a ref count of zero after being
  * decremented.
  */
-int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
+int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
+			*root, struct btrfs_buffer *snap)
 {
 	int ret = 0;
 	int wret;
@@ -529,13 +544,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_buffer *snap)
 	path.nodes[level] = snap;
 	path.slots[level] = 0;
 	while(1) {
-		wret = walk_down_tree(root, &path, &level);
+		wret = walk_down_tree(trans, root, &path, &level);
 		if (wret > 0)
 			break;
 		if (wret < 0)
 			ret = wret;
 
-		wret = walk_up_tree(root, &path, &level);
+		wret = walk_up_tree(trans, root, &path, &level);
 		if (wret > 0)
 			break;
 		if (wret < 0)
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 4908f9afdf8..7caeb11e875 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -4,9 +4,11 @@
 #include "radix-tree.h"
 #include "ctree.h"
 #include "disk-io.h"
+#include "transaction.h"
 
-int btrfs_insert_inode(struct btrfs_root *root, u64 objectid,
-		       struct btrfs_inode_item *inode_item)
+int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, u64 objectid, struct btrfs_inode_item
+		       *inode_item)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -17,13 +19,14 @@ int btrfs_insert_inode(struct btrfs_root *root, u64 objectid,
 	key.offset = 0;
 
 	btrfs_init_path(&path);
-	ret = btrfs_insert_item(root, &key, inode_item, sizeof(*inode_item));
+	ret = btrfs_insert_item(trans, root, &key, inode_item,
+				sizeof(*inode_item));
 	btrfs_release_path(root, &path);
 	return ret;
 }
 
-int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path,
-			u64 objectid, int mod)
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct btrfs_path *path, u64 objectid, int mod)
 {
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
@@ -33,5 +36,5 @@ int btrfs_lookup_inode(struct btrfs_root *root, struct btrfs_path *path,
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
-	return btrfs_search_slot(root, &key, path, ins_len, cow);
+	return btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 }
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 9236abda2a7..0882ca904ec 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,3 +1,5 @@
-
+#ifndef __PRINT_TREE_
+#define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
 void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t);
+#endif
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index 3a257161712..d676577185d 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -5,6 +5,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "transaction.h"
 
 /* for testing only */
 int next_key(int i, int max_key) {
@@ -25,10 +26,12 @@ int main(int ac, char **av) {
 	struct btrfs_path path;
 	struct btrfs_super_block super;
 	struct btrfs_root *root;
+	struct btrfs_trans_handle *trans;
 
 	radix_tree_init();
 
 	root = open_ctree("dbfile", &super);
+	trans = btrfs_start_transaction(root, 1);
 	srand(55);
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY);
@@ -41,12 +44,12 @@ int main(int ac, char **av) {
 			fprintf(stderr, "insert %d:%d\n", num, i);
 		ins.objectid = num;
 		ins.offset = 0;
-		ret = btrfs_insert_item(root, &ins, buf, strlen(buf));
+		ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 		free(buf);
 		if (i == run_size - 5) {
-			btrfs_commit_transaction(root, &super);
+			btrfs_commit_transaction(trans, root, &super);
 		}
 
 	}
@@ -61,7 +64,7 @@ int main(int ac, char **av) {
 		btrfs_init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = btrfs_search_slot(root, &ins, &path, 0, 0);
+		ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0);
 		if (ret) {
 			btrfs_print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -83,11 +86,11 @@ int main(int ac, char **av) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		btrfs_init_path(&path);
-		ret = btrfs_search_slot(root, &ins, &path, -1, 1);
+		ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1);
 		if (!ret) {
 			if (i % 10000 == 0)
 				fprintf(stderr, "del %d:%d\n", num, i);
-			ret = btrfs_del_item(root, &path);
+			ret = btrfs_del_item(trans, root, &path);
 			if (ret != 0)
 				BUG();
 			tree_size--;
@@ -104,7 +107,7 @@ int main(int ac, char **av) {
 		ins.objectid = num;
 		if (i % 10000 == 0)
 			fprintf(stderr, "insert %d:%d\n", num, i);
-		ret = btrfs_insert_item(root, &ins, buf, strlen(buf));
+		ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 		free(buf);
@@ -119,7 +122,7 @@ int main(int ac, char **av) {
 		btrfs_init_path(&path);
 		if (i % 10000 == 0)
 			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = btrfs_search_slot(root, &ins, &path, 0, 0);
+		ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0);
 		if (ret) {
 			btrfs_print_tree(root, root->node);
 			printf("unable to find %d\n", num);
@@ -134,7 +137,7 @@ int main(int ac, char **av) {
 		int slot;
 		ins.objectid = (u64)-1;
 		btrfs_init_path(&path);
-		ret = btrfs_search_slot(root, &ins, &path, -1, 1);
+		ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1);
 		if (ret == 0)
 			BUG();
 
@@ -150,7 +153,7 @@ int main(int ac, char **av) {
 			btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key);
 			if (tree_size % 10000 == 0)
 				printf("big del %d:%d\n", tree_size, i);
-			ret = btrfs_del_item(root, &path);
+			ret = btrfs_del_item(trans, root, &path);
 			if (ret != 0) {
 				printf("del_item returned %d\n", ret);
 				BUG();
@@ -165,7 +168,7 @@ int main(int ac, char **av) {
 	printf("map before commit\n");
 	btrfs_print_tree(root->extent_root, root->extent_root->node);
 	*/
-	btrfs_commit_transaction(root, &super);
+	btrfs_commit_transaction(trans, root, &super);
 	printf("tree size is now %d\n", tree_size);
 	printf("root %p commit root %p\n", root->node, root->commit_root);
 	printf("map tree\n");
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
index f05135f13f9..3a38ae7a886 100644
--- a/fs/btrfs/random-test.c
+++ b/fs/btrfs/random-test.c
@@ -6,6 +6,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
+#include "transaction.h"
 
 int keep_running = 1;
 struct btrfs_super_block super;
@@ -37,7 +38,8 @@ again:
 	return 0;
 }
 
-static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
+static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -47,7 +49,7 @@ static int ins_one(struct btrfs_root *root, struct radix_tree_root *radix)
 	btrfs_init_path(&path);
 	ret = setup_key(radix, &key, 0);
 	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = btrfs_insert_item(root, &key, buf, strlen(buf));
+	ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf));
 	if (ret)
 		goto error;
 	oid = (unsigned long)key.objectid;
@@ -62,7 +64,8 @@ error:
 	return -1;
 }
 
-static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
+static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -73,7 +76,7 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
 	if (ret < 0)
 		return 0;
 	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = btrfs_insert_item(root, &key, buf, strlen(buf));
+	ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf));
 	if (ret != -EEXIST) {
 		printf("insert on %Lu gave us %d\n", key.objectid, ret);
 		return 1;
@@ -81,7 +84,8 @@ static int insert_dup(struct btrfs_root *root, struct radix_tree_root *radix)
 	return 0;
 }
 
-static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
+static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -91,10 +95,10 @@ static int del_one(struct btrfs_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = btrfs_search_slot(root, &key, &path, -1, 1);
+	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
 	if (ret)
 		goto error;
-	ret = btrfs_del_item(root, &path);
+	ret = btrfs_del_item(trans, root, &path);
 	btrfs_release_path(root, &path);
 	if (ret != 0)
 		goto error;
@@ -107,7 +111,8 @@ error:
 	return -1;
 }
 
-static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
+static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -116,7 +121,7 @@ static int lookup_item(struct btrfs_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 1);
 	if (ret < 0)
 		return 0;
-	ret = btrfs_search_slot(root, &key, &path, 0, 1);
+	ret = btrfs_search_slot(trans, root, &key, &path, 0, 1);
 	btrfs_release_path(root, &path);
 	if (ret)
 		goto error;
@@ -126,7 +131,8 @@ error:
 	return -1;
 }
 
-static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
+static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root
+			 *root, struct radix_tree_root *radix)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -135,7 +141,7 @@ static int lookup_enoent(struct btrfs_root *root, struct radix_tree_root *radix)
 	ret = setup_key(radix, &key, 0);
 	if (ret < 0)
 		return ret;
-	ret = btrfs_search_slot(root, &key, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root, &key, &path, 0, 0);
 	btrfs_release_path(root, &path);
 	if (ret <= 0)
 		goto error;
@@ -145,8 +151,8 @@ error:
 	return -1;
 }
 
-static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
-		      int nr)
+static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct radix_tree_root *radix, int nr)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -162,7 +168,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 	key.objectid = (unsigned long)-1;
 	while(nr-- >= 0) {
 		btrfs_init_path(&path);
-		ret = btrfs_search_slot(root, &key, &path, -1, 1);
+		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
 		if (ret < 0) {
 			btrfs_release_path(root, &path);
 			return ret;
@@ -177,7 +183,7 @@ static int empty_tree(struct btrfs_root *root, struct radix_tree_root *radix,
 		slot = path.slots[0];
 		found = btrfs_disk_key_objectid(
 					&path.nodes[0]->leaf.items[slot].key);
-		ret = btrfs_del_item(root, &path);
+		ret = btrfs_del_item(trans, root, &path);
 		count++;
 		if (ret) {
 			fprintf(stderr,
@@ -198,19 +204,19 @@ error:
 	return -1;
 }
 
-static int fill_tree(struct btrfs_root *root, struct radix_tree_root *radix,
-		     int count)
+static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct radix_tree_root *radix, int count)
 {
 	int i;
 	int ret = 0;
 	for (i = 0; i < count; i++) {
-		ret = ins_one(root, radix);
+		ret = ins_one(trans, root, radix);
 		if (ret) {
 			fprintf(stderr, "fill failed\n");
 			goto out;
 		}
 		if (i % 1000 == 0) {
-			ret = btrfs_commit_transaction(root, &super);
+			ret = btrfs_commit_transaction(trans, root, &super);
 			if (ret) {
 				fprintf(stderr, "fill commit failed\n");
 				return ret;
@@ -226,7 +232,8 @@ out:
 	return ret;
 }
 
-static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
+static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct radix_tree_root *radix)
 {
 	int ret;
 	int nr = rand() % 5000;
@@ -235,17 +242,18 @@ static int bulk_op(struct btrfs_root *root, struct radix_tree_root *radix)
 	/* do the bulk op much less frequently */
 	if (run_nr++ % 100)
 		return 0;
-	ret = empty_tree(root, radix, nr);
+	ret = empty_tree(trans, root, radix, nr);
 	if (ret)
 		return ret;
-	ret = fill_tree(root, radix, nr);
+	ret = fill_tree(trans, root, radix, nr);
 	if (ret)
 		return ret;
 	return 0;
 }
 
 
-int (*ops[])(struct btrfs_root *root, struct radix_tree_root *radix) =
+int (*ops[])(struct btrfs_trans_handle *,
+	     struct btrfs_root *root, struct radix_tree_root *radix) =
 	{ ins_one, insert_dup, del_one, lookup_item,
 	  lookup_enoent, bulk_op };
 
@@ -264,7 +272,7 @@ static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix)
 	key.objectid = (unsigned long)-1;
 	while(1) {
 		btrfs_init_path(&path);
-		ret = btrfs_search_slot(root, &key, &path, 0, 0);
+		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
 		if (ret < 0) {
 			btrfs_release_path(root, &path);
 			return ret;
@@ -325,6 +333,7 @@ int main(int ac, char **av)
 	int init_fill_count = 800000;
 	int err = 0;
 	int initial_only = 0;
+	struct btrfs_trans_handle *trans;
 	radix_tree_init();
 	root = open_ctree("dbfile", &super);
 	fill_radix(root, &radix);
@@ -346,7 +355,8 @@ int main(int ac, char **av)
 		}
 	}
 	printf("initial fill\n");
-	ret = fill_tree(root, &radix, init_fill_count);
+	trans = btrfs_start_transaction(root, 1);
+	ret = fill_tree(trans, root, &radix, init_fill_count);
 	printf("starting run\n");
 	if (ret) {
 		err = ret;
@@ -370,7 +380,7 @@ int main(int ac, char **av)
 			root = open_ctree("dbfile", &super);
 		}
 		while(count--) {
-			ret = ops[op](root, &radix);
+			ret = ops[op](trans, root, &radix);
 			if (ret) {
 				fprintf(stderr, "op %d failed %d:%d\n",
 					op, i, iterations);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a24b4727403..9cccecc0f43 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -20,7 +20,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	search_key.offset = (u32)-1;
 
 	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, &search_key, &path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &search_key, &path, 0, 0);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
@@ -40,8 +40,9 @@ out:
 	return ret;
 }
 
-int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
-		      struct btrfs_root_item *item)
+int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item)
 {
 	struct btrfs_path path;
 	struct btrfs_leaf *l;
@@ -49,7 +50,7 @@ int btrfs_update_root(struct btrfs_root *root, struct btrfs_key *key,
 	int slot;
 
 	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, key, &path, 0, 1);
+	ret = btrfs_search_slot(trans, root, key, &path, 0, 1);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
@@ -62,26 +63,28 @@ out:
 	return ret;
 }
 
-int btrfs_insert_root(struct btrfs_root *root, struct btrfs_key *key,
-		      struct btrfs_root_item *item)
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_key *key, struct btrfs_root_item
+		      *item)
 {
 	int ret;
-	ret = btrfs_insert_item(root, key, item, sizeof(*item));
+	ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
 	BUG_ON(ret);
 	return ret;
 }
 
-int btrfs_del_root(struct btrfs_root *root, struct btrfs_key *key)
+int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_key *key)
 {
 	struct btrfs_path path;
 	int ret;
 
 	btrfs_init_path(&path);
-	ret = btrfs_search_slot(root, key, &path, -1, 1);
+	ret = btrfs_search_slot(trans, root, key, &path, -1, 1);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	ret = btrfs_del_item(root, &path);
+	ret = btrfs_del_item(trans, root, &path);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
new file mode 100644
index 00000000000..3adb6e69fe4
--- /dev/null
+++ b/fs/btrfs/transaction.h
@@ -0,0 +1,27 @@
+#ifndef __TRANSACTION__
+#define __TRANSACTION__
+
+struct btrfs_trans_handle {
+	u64 transid;
+	unsigned long blocks_reserved;
+	unsigned long blocks_used;
+};
+
+static inline struct btrfs_trans_handle *
+btrfs_start_transaction(struct btrfs_root *root, int num_blocks)
+{
+	struct btrfs_trans_handle *h = malloc(sizeof(*h));
+	h->transid = root->root_key.offset;
+	h->blocks_reserved = num_blocks;
+	h->blocks_used = 0;
+	return h;
+}
+
+static inline void btrfs_free_transaction(struct btrfs_root *root,
+					  struct btrfs_trans_handle *handle)
+{
+	memset(handle, 0, sizeof(*handle));
+	free(handle);
+}
+
+#endif
-- 
cgit v1.2.3


From 4068947886f2d2a4c192941397bacd014a04b804 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 17 Mar 2007 14:29:23 -0400
Subject: Btrfs: minor comments

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 17a3ff2f182..32922643b5b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -294,6 +294,10 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		parent_buf = path->nodes[level + 1];
 	pslot = path->slots[level + 1];
 
+	/*
+	 * deal with the case where there is only one pointer in the root
+	 * by promoting the node below to a root
+	 */
 	if (!parent_buf) {
 		struct btrfs_buffer *child;
 		u64 blocknr = mid_buf->blocknr;
@@ -1043,6 +1047,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	int wret;
 
+	/* first try to make some room by pushing left and right */
 	wret = push_leaf_left(trans, root, path, data_size);
 	if (wret < 0)
 		return wret;
-- 
cgit v1.2.3


From 631d7d950a2f2706f5c86858402c1ec50b9801f1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Mar 2007 09:13:49 -0400
Subject: Btrfs: add transaction.h to the Makefile

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 76bcd87d857..7953b41848c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,6 +1,7 @@
 CC=gcc
 CFLAGS = -g -Wall -Werror
-headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h
+headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \
+	  transaction.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
 	  root-tree.o dir-item.o hash.o file-item.o inode-item.o
 
-- 
cgit v1.2.3


From 9f5fae2fe6dc35b46bf56183f11398451851cb3f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Mar 2007 14:38:32 -0400
Subject: Btrfs: Add inode map, and the start of file extent items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   3 +-
 fs/btrfs/ctree.h       | 129 ++++++++++++++++++++++++++++++++----
 fs/btrfs/debug-tree.c  |   9 ++-
 fs/btrfs/dir-test.c    |  91 ++++++++++++++++++++-----
 fs/btrfs/disk-io.c     | 176 +++++++++++++++++++++++++++----------------------
 fs/btrfs/extent-tree.c | 105 +++++++++++++++--------------
 fs/btrfs/file-item.c   |   6 ++
 fs/btrfs/inode-map.c   | 136 ++++++++++++++++++++++++++++++++++++++
 fs/btrfs/mkfs.c        |  44 +++++++++++--
 fs/btrfs/print-tree.c  |  13 ++--
 fs/btrfs/quick-test.c  |   3 +-
 11 files changed, 542 insertions(+), 173 deletions(-)
 create mode 100644 fs/btrfs/inode-map.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7953b41848c..636c63efde6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,8 @@ CFLAGS = -g -Wall -Werror
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \
 	  transaction.h
 objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
-	  root-tree.o dir-item.o hash.o file-item.o inode-item.o
+	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
+	  inode-map.o \
 
 # if you don't have sparse installed, use ls instead
 CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68f0af39777..68e3da274f1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -10,7 +10,8 @@ struct btrfs_trans_handle;
 
 #define BTRFS_ROOT_TREE_OBJECTID 1
 #define BTRFS_EXTENT_TREE_OBJECTID 2
-#define BTRFS_FS_TREE_OBJECTID 3
+#define BTRFS_INODE_MAP_OBJECTID 3
+#define BTRFS_FS_TREE_OBJECTID 4
 
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
@@ -178,31 +179,65 @@ struct btrfs_root_item {
 	__le64 block_limit;
 	__le64 blocks_used;
 	__le32 refs;
-};
+} __attribute__ ((__packed__));
 
-/*
- * in ram representation of the tree.  extent_root is used for all allocations
- * and for the extent tree extent_root root.  current_insert is used
- * only for the extent tree.
- */
-struct btrfs_root {
-	struct btrfs_buffer *node;
-	struct btrfs_buffer *commit_root;
+struct btrfs_file_extent_item {
+	/*
+	 * disk space consumed by the extent, checksum blocks are included
+	 * in these numbers
+	 */
+	__le64 disk_blocknr;
+	__le64 disk_num_blocks;
+	/*
+	 * the logical offset in file bytes (no csums)
+	 * this extent record is for.  This allows a file extent to point
+	 * into the middle of an existing extent on disk, sharing it
+	 * between two snapshots (useful if some bytes in the middle of the
+	 * extent have changed
+	 */
+	__le64 offset;
+	/*
+	 * the logical number of file blocks (no csums included)
+	 */
+	__le64 num_blocks;
+} __attribute__ ((__packed__));
+
+struct btrfs_inode_map_item {
+	struct btrfs_disk_key key;
+} __attribute__ ((__packed__));
+
+struct btrfs_fs_info {
+	struct btrfs_root *fs_root;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
+	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
-	int fp;
 	struct radix_tree_root cache_radix;
 	struct radix_tree_root pinned_radix;
 	struct list_head trans;
 	struct list_head cache;
+	u64 last_inode_alloc;
+	u64 last_inode_alloc_dirid;
 	int cache_size;
-	int ref_cows;
+	int fp;
+	struct btrfs_trans_handle *running_transaction;
+};
+
+/*
+ * in ram representation of the tree.  extent_root is used for all allocations
+ * and for the extent tree extent_root root.  current_insert is used
+ * only for the extent tree.
+ */
+struct btrfs_root {
+	struct btrfs_buffer *node;
+	struct btrfs_buffer *commit_root;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
+	struct btrfs_fs_info *fs_info;
 	u32 blocksize;
-	struct btrfs_trans_handle *running_transaction;
+	int ref_cows;
+	u32 type;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -240,11 +275,17 @@ struct btrfs_root {
  * are used, and how many references there are to each block
  */
 #define BTRFS_EXTENT_ITEM_KEY	6
+
+/*
+ * the inode map records which inode numbers are in use and where
+ * they actually live on disk
+ */
+#define BTRFS_INODE_MAP_ITEM_KEY 7
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	7
+#define BTRFS_STRING_ITEM_KEY	8
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -654,6 +695,57 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
 	return (u8 *)l->items;
 }
+
+static inline u64 btrfs_file_extent_disk_blocknr(struct btrfs_file_extent_item
+						 *e)
+{
+	return le64_to_cpu(e->disk_blocknr);
+}
+
+static inline void btrfs_set_file_extent_disk_blocknr(struct
+						      btrfs_file_extent_item
+						      *e, u64 val)
+{
+	e->disk_blocknr = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_file_extent_disk_num_blocks(struct
+						    btrfs_file_extent_item *e)
+{
+	return le64_to_cpu(e->disk_num_blocks);
+}
+
+static inline void btrfs_set_file_extent_disk_num_blocks(struct
+							 btrfs_file_extent_item
+							 *e, u64 val)
+{
+	e->disk_num_blocks = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_file_extent_offset(struct btrfs_file_extent_item *e)
+{
+	return le64_to_cpu(e->offset);
+}
+
+static inline void btrfs_set_file_extent_offset(struct btrfs_file_extent_item
+						*e, u64 val)
+{
+	e->offset = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_file_extent_num_blocks(struct btrfs_file_extent_item
+					       *e)
+{
+	return le64_to_cpu(e->num_blocks);
+}
+
+static inline void btrfs_set_file_extent_num_blocks(struct
+						    btrfs_file_extent_item *e,
+						    u64 val)
+{
+	e->num_blocks = cpu_to_le64(val);
+}
+
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
@@ -701,4 +793,13 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  int name_len, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      char *name, int name_len);
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *fs_root,
+			     u64 dirid, u64 *objectid);
+int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   u64 objectid, struct btrfs_key *location);
+int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct btrfs_path *path,
+			   u64 objectid, int mod);
 #endif
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index 91dea7a0a47..d9f36efc3dd 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -15,8 +15,13 @@ int main(int ac, char **av) {
 	printf("fs tree\n");
 	btrfs_print_tree(root, root->node);
 	printf("map tree\n");
-	btrfs_print_tree(root->extent_root, root->extent_root->node);
+	btrfs_print_tree(root->fs_info->extent_root,
+			 root->fs_info->extent_root->node);
+	printf("inode tree\n");
+	btrfs_print_tree(root->fs_info->inode_root,
+			 root->fs_info->inode_root->node);
 	printf("root tree\n");
-	btrfs_print_tree(root->tree_root, root->tree_root->node);
+	btrfs_print_tree(root->fs_info->tree_root,
+			 root->fs_info->tree_root->node);
 	return 0;
 }
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
index e908c0c588c..56f06c3ba57 100644
--- a/fs/btrfs/dir-test.c
+++ b/fs/btrfs/dir-test.c
@@ -45,13 +45,26 @@ static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret;
 	char buf[128];
 	unsigned long oid;
+	u64 objectid;
 	struct btrfs_path path;
+	struct btrfs_key inode_map;
 
 	find_num(radix, &oid, 0);
 	sprintf(buf, "str-%lu", oid);
 
+	ret = btrfs_find_free_objectid(trans, root, dir_oid + 1, &objectid);
+	if (ret)
+		goto error;
+
+	inode_map.objectid = objectid;
+	inode_map.flags = 0;
+	inode_map.offset = 0;
+
+	ret = btrfs_insert_inode_map(trans, root, objectid, &inode_map);
+	if (ret)
+		goto error;
 	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
-				    file_oid, 1);
+				    objectid, 1);
 	if (ret)
 		goto error;
 
@@ -120,6 +133,53 @@ static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
+static int del_dir_item(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct radix_tree_root *radix,
+			unsigned long radix_index,
+			struct btrfs_path *path)
+{
+	int ret;
+	unsigned long *ptr;
+	u64 file_objectid;
+	struct btrfs_dir_item *di;
+	struct btrfs_path map_path;
+
+	/* find the inode number of the file */
+	di = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
+			    struct btrfs_dir_item);
+	file_objectid = btrfs_dir_objectid(di);
+
+	/* delete the directory item */
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/* delete the inode mapping */
+	btrfs_init_path(&map_path);
+	ret = btrfs_lookup_inode_map(trans, root, &map_path, file_objectid, -1);
+	if (ret)
+		goto out_release;
+	ret = btrfs_del_item(trans, root->fs_info->inode_root, &map_path);
+	if (ret)
+		goto out_release;
+
+	if (root->fs_info->last_inode_alloc > file_objectid)
+		root->fs_info->last_inode_alloc = file_objectid;
+	btrfs_release_path(root, &map_path);
+	ptr = radix_tree_delete(radix, radix_index);
+	if (!ptr) {
+		ret = -5555;
+		goto out;
+	}
+	return 0;
+out_release:
+	btrfs_release_path(root, &map_path);
+out:
+	printf("failed to delete %lu %d\n", radix_index, ret);
+	return -1;
+}
+
 static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct radix_tree_root *radix)
 {
@@ -127,7 +187,6 @@ static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	char buf[128];
 	unsigned long oid;
 	struct btrfs_path path;
-	unsigned long *ptr;
 
 	ret = find_num(radix, &oid, 1);
 	if (ret < 0)
@@ -138,19 +197,14 @@ static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				    strlen(buf), -1);
 	if (ret)
 		goto out_release;
-	ret = btrfs_del_item(trans, root, &path);
+
+	ret = del_dir_item(trans, root, radix, oid, &path);
 	if (ret)
 		goto out_release;
 	btrfs_release_path(root, &path);
-	ptr = radix_tree_delete(radix, oid);
-	if (!ptr) {
-		ret = -5555;
-		goto out;
-	}
-	return 0;
+	return ret;
 out_release:
 	btrfs_release_path(root, &path);
-out:
 	printf("failed to delete %lu %d\n", oid, ret);
 	return -1;
 }
@@ -162,6 +216,8 @@ static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	char buf[128];
 	int ret;
 	unsigned long oid;
+	u64 objectid;
+	struct btrfs_dir_item *di;
 
 	ret = find_num(radix, &oid, 1);
 	if (ret < 0)
@@ -170,6 +226,14 @@ static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_init_path(&path);
 	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
 				    strlen(buf), 0);
+	if (!ret) {
+		di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+				    struct btrfs_dir_item);
+		objectid = btrfs_dir_objectid(di);
+		btrfs_release_path(root, &path);
+		btrfs_init_path(&path);
+		ret = btrfs_lookup_inode_map(trans, root, &path, objectid, 0);
+	}
 	btrfs_release_path(root, &path);
 	if (ret) {
 		printf("unable to find key %lu\n", oid);
@@ -210,7 +274,6 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 found_len;
 	int ret;
 	int slot;
-	int *ptr;
 	int count = 0;
 	char buf[128];
 	struct btrfs_dir_item *di;
@@ -241,7 +304,7 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		BUG_ON(found_len > 128);
 		buf[found_len] = '\0';
 		found = atoi(buf + 4);
-		ret = btrfs_del_item(trans, root, &path);
+		ret = del_dir_item(trans, root, radix, found, &path);
 		count++;
 		if (ret) {
 			fprintf(stderr,
@@ -250,14 +313,10 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			return -1;
 		}
 		btrfs_release_path(root, &path);
-		ptr = radix_tree_delete(radix, found);
-		if (!ptr)
-			goto error;
 		if (!keep_running)
 			break;
 	}
 	return 0;
-error:
 	fprintf(stderr, "failed to delete from the radix %lu\n", found);
 	return -1;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 25ce07908ee..1849a99690c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,15 +28,15 @@ static int free_some_buffers(struct btrfs_root *root)
 {
 	struct list_head *node, *next;
 	struct btrfs_buffer *b;
-	if (root->cache_size < cache_max)
+	if (root->fs_info->cache_size < cache_max)
 		return 0;
-	list_for_each_safe(node, next, &root->cache) {
+	list_for_each_safe(node, next, &root->fs_info->cache) {
 		b = list_entry(node, struct btrfs_buffer, cache);
 		if (b->count == 1) {
 			BUG_ON(!list_empty(&b->dirty));
 			list_del_init(&b->cache);
 			btrfs_block_release(root, b);
-			if (root->cache_size < cache_max)
+			if (root->fs_info->cache_size < cache_max)
 				break;
 		}
 	}
@@ -57,10 +57,10 @@ struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
 	INIT_LIST_HEAD(&buf->dirty);
 	free_some_buffers(root);
 	radix_tree_preload(GFP_KERNEL);
-	ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
+	ret = radix_tree_insert(&root->fs_info->cache_radix, blocknr, buf);
 	radix_tree_preload_end();
-	list_add_tail(&buf->cache, &root->cache);
-	root->cache_size++;
+	list_add_tail(&buf->cache, &root->fs_info->cache);
+	root->fs_info->cache_size++;
 	if (ret) {
 		free(buf);
 		return NULL;
@@ -71,7 +71,7 @@ struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
 struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr)
 {
 	struct btrfs_buffer *buf;
-	buf = radix_tree_lookup(&root->cache_radix, blocknr);
+	buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
 	} else {
@@ -90,14 +90,15 @@ struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 	struct btrfs_buffer *buf;
 	int ret;
 
-	buf = radix_tree_lookup(&root->cache_radix, blocknr);
+	buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr);
 	if (buf) {
 		buf->count++;
 	} else {
 		buf = alloc_tree_block(root, blocknr);
 		if (!buf)
 			return NULL;
-		ret = pread(root->fp, &buf->node, root->blocksize, offset);
+		ret = pread(root->fs_info->fp, &buf->node, root->blocksize,
+			    offset);
 		if (ret != root->blocksize) {
 			free(buf);
 			return NULL;
@@ -113,7 +114,7 @@ int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	if (!list_empty(&buf->dirty))
 		return 0;
-	list_add_tail(&buf->dirty, &root->trans);
+	list_add_tail(&buf->dirty, &root->fs_info->trans);
 	buf->count++;
 	return 0;
 }
@@ -137,7 +138,7 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
 		BUG();
-	ret = pwrite(root->fp, &buf->node, root->blocksize, offset);
+	ret = pwrite(root->fs_info->fp, &buf->node, root->blocksize, offset);
 	if (ret != root->blocksize)
 		return ret;
 	return 0;
@@ -149,8 +150,9 @@ static int __commit_transaction(struct btrfs_trans_handle *trans, struct
 	struct btrfs_buffer *b;
 	int ret = 0;
 	int wret;
-	while(!list_empty(&root->trans)) {
-		b = list_entry(root->trans.next, struct btrfs_buffer, dirty);
+	while(!list_empty(&root->fs_info->trans)) {
+		b = list_entry(root->fs_info->trans.next, struct btrfs_buffer,
+			       dirty);
 		list_del_init(&b->dirty);
 		wret = write_tree_block(trans, root, b);
 		if (wret)
@@ -160,13 +162,21 @@ static int __commit_transaction(struct btrfs_trans_handle *trans, struct
 	return ret;
 }
 
-static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans,
-					struct btrfs_root *tree_root, struct
-					btrfs_root *extent_root)
+static int commit_tree_roots(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
 {
 	int ret;
 	u64 old_extent_block;
-
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *inode_root = fs_info->inode_root;
+
+	btrfs_set_root_blocknr(&inode_root->root_item,
+			       inode_root->node->blocknr);
+	ret = btrfs_update_root(trans, tree_root,
+				&inode_root->root_key,
+				&inode_root->root_item);
+	BUG_ON(ret);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == extent_root->node->blocknr)
@@ -178,8 +188,6 @@ static int commit_extent_and_tree_roots(struct btrfs_trans_handle *trans,
 					&extent_root->root_item);
 		BUG_ON(ret);
 	}
-	__commit_transaction(trans, extent_root);
-	__commit_transaction(trans, tree_root);
 	return 0;
 }
 
@@ -190,9 +198,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 	struct btrfs_buffer *snap = root->commit_root;
 	struct btrfs_key snap_key;
 
-	ret = __commit_transaction(trans, root);
-	BUG_ON(ret);
-
 	if (root->commit_root == root->node)
 		return 0;
 
@@ -200,54 +205,55 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 	root->root_key.offset++;
 
 	btrfs_set_root_blocknr(&root->root_item, root->node->blocknr);
-	ret = btrfs_insert_root(trans, root->tree_root, &root->root_key,
-				&root->root_item);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+	BUG_ON(ret);
+
+	ret = commit_tree_roots(trans, root->fs_info);
 	BUG_ON(ret);
 
-	ret = commit_extent_and_tree_roots(trans, root->tree_root,
-					   root->extent_root);
+	ret = __commit_transaction(trans, root);
 	BUG_ON(ret);
 
 	write_ctree_super(trans, root, s);
-	btrfs_finish_extent_commit(trans, root->extent_root);
-	btrfs_finish_extent_commit(trans, root->tree_root);
+	btrfs_finish_extent_commit(trans, root->fs_info->extent_root);
+	btrfs_finish_extent_commit(trans, root->fs_info->tree_root);
 
 	root->commit_root = root->node;
 	root->node->count++;
 	ret = btrfs_drop_snapshot(trans, root, snap);
 	BUG_ON(ret);
 
-	ret = btrfs_del_root(trans, root->tree_root, &snap_key);
+	ret = btrfs_del_root(trans, root->fs_info->tree_root, &snap_key);
 	BUG_ON(ret);
 
 	return ret;
 }
 
 static int __setup_root(struct btrfs_super_block *super,
-			struct btrfs_root *root, u64 objectid, int fp)
+			struct btrfs_root *root,
+			struct btrfs_fs_info *fs_info,
+			u64 objectid, int fp)
 {
-	INIT_LIST_HEAD(&root->trans);
-	INIT_LIST_HEAD(&root->cache);
-	root->cache_size = 0;
-	root->fp = fp;
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->blocksize = btrfs_super_blocksize(super);
 	root->ref_cows = 0;
-	memset(&root->current_insert, 0, sizeof(root->current_insert));
-	memset(&root->last_insert, 0, sizeof(root->last_insert));
+	root->fs_info = fs_info;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	return 0;
 }
 
 static int find_and_setup_root(struct btrfs_super_block *super,
-			       struct btrfs_root *tree_root, u64 objectid,
+			       struct btrfs_root *tree_root,
+			       struct btrfs_fs_info *fs_info,
+			       u64 objectid,
 			       struct btrfs_root *root, int fp)
 {
 	int ret;
 
-	__setup_root(super, root, objectid, fp);
+	__setup_root(super, root, fs_info, objectid, fp);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
@@ -263,29 +269,31 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	struct btrfs_root *root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root));
+	struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root));
+	struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info));
 	int fp;
 	int ret;
 
-	root->extent_root = extent_root;
-	root->tree_root = tree_root;
-
-	extent_root->extent_root = extent_root;
-	extent_root->tree_root = tree_root;
-
-	tree_root->extent_root = extent_root;
-	tree_root->tree_root = tree_root;
-
 	fp = open(filename, O_CREAT | O_RDWR, 0600);
 	if (fp < 0) {
 		free(root);
 		return NULL;
 	}
-	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&root->pinned_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&extent_root->pinned_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&extent_root->cache_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&tree_root->pinned_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&tree_root->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL);
+	INIT_LIST_HEAD(&fs_info->trans);
+	INIT_LIST_HEAD(&fs_info->cache);
+	fs_info->cache_size = 0;
+	fs_info->fp = fp;
+	fs_info->running_transaction = NULL;
+	fs_info->fs_root = root;
+	fs_info->tree_root = tree_root;
+	fs_info->extent_root = extent_root;
+	fs_info->inode_root = inode_root;
+	fs_info->last_inode_alloc = 0;
+	fs_info->last_inode_alloc_dirid = 0;
+	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
+	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
 	ret = pread(fp, super, sizeof(struct btrfs_super_block),
 		     BTRFS_SUPER_INFO_OFFSET);
@@ -301,16 +309,20 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	}
 	BUG_ON(ret < 0);
 
-	__setup_root(super, tree_root, BTRFS_ROOT_TREE_OBJECTID, fp);
+	__setup_root(super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID, fp);
 	tree_root->node = read_tree_block(tree_root, btrfs_super_root(super));
 	BUG_ON(!tree_root->node);
 
-	ret = find_and_setup_root(super, tree_root, BTRFS_EXTENT_TREE_OBJECTID,
-				  extent_root, fp);
+	ret = find_and_setup_root(super, tree_root, fs_info,
+				  BTRFS_EXTENT_TREE_OBJECTID, extent_root, fp);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(super, tree_root, BTRFS_FS_TREE_OBJECTID,
-				  root, fp);
+	ret = find_and_setup_root(super, tree_root, fs_info,
+				  BTRFS_INODE_MAP_OBJECTID, inode_root, fp);
+	BUG_ON(ret);
+
+	ret = find_and_setup_root(super, tree_root, fs_info,
+				  BTRFS_FS_TREE_OBJECTID, root, fp);
 	BUG_ON(ret);
 
 	root->commit_root = root->node;
@@ -323,8 +335,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_super_block *s)
 {
 	int ret;
-	btrfs_set_super_root(s, root->tree_root->node->blocknr);
-	ret = pwrite(root->fp, s, sizeof(*s),
+	btrfs_set_super_root(s, root->fs_info->tree_root->node->blocknr);
+	ret = pwrite(root->fs_info->fp, s, sizeof(*s),
 		     BTRFS_SUPER_INFO_OFFSET);
 	if (ret != sizeof(*s)) {
 		fprintf(stderr, "failed to write new super block err %d\n", ret);
@@ -335,9 +347,10 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 
 static int drop_cache(struct btrfs_root *root)
 {
-	while(!list_empty(&root->cache)) {
-		struct btrfs_buffer *b = list_entry(root->cache.next,
-						   struct btrfs_buffer, cache);
+	while(!list_empty(&root->fs_info->cache)) {
+		struct btrfs_buffer *b = list_entry(root->fs_info->cache.next,
+						    struct btrfs_buffer,
+						    cache);
 		list_del_init(&b->cache);
 		btrfs_block_release(root, b);
 	}
@@ -348,26 +361,28 @@ int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
 	int ret;
 	struct btrfs_trans_handle *trans;
 
-	trans = root->running_transaction;
+	trans = root->fs_info->running_transaction;
 	btrfs_commit_transaction(trans, root, s);
-	ret = commit_extent_and_tree_roots(trans, root->tree_root,
-					   root->extent_root);
+	ret = commit_tree_roots(trans, root->fs_info);
+	BUG_ON(ret);
+	ret = __commit_transaction(trans, root);
 	BUG_ON(ret);
 	write_ctree_super(trans, root, s);
-	drop_cache(root->extent_root);
-	drop_cache(root->tree_root);
 	drop_cache(root);
-	BUG_ON(!list_empty(&root->trans));
-	BUG_ON(!list_empty(&root->extent_root->trans));
-	BUG_ON(!list_empty(&root->tree_root->trans));
+	BUG_ON(!list_empty(&root->fs_info->trans));
 
-	close(root->fp);
+	close(root->fs_info->fp);
 	if (root->node)
 		btrfs_block_release(root, root->node);
-	if (root->extent_root->node)
-		btrfs_block_release(root->extent_root, root->extent_root->node);
-	if (root->tree_root->node)
-		btrfs_block_release(root->tree_root, root->tree_root->node);
+	if (root->fs_info->extent_root->node)
+		btrfs_block_release(root->fs_info->extent_root,
+				    root->fs_info->extent_root->node);
+	if (root->fs_info->inode_root->node)
+		btrfs_block_release(root->fs_info->inode_root,
+				    root->fs_info->inode_root->node);
+	if (root->fs_info->tree_root->node)
+		btrfs_block_release(root->fs_info->tree_root,
+				    root->fs_info->tree_root->node);
 	btrfs_block_release(root, root->commit_root);
 	free(root);
 	printf("on close %d blocks are allocated\n", allocated_blocks);
@@ -382,15 +397,16 @@ void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf)
 	if (buf->count == 0) {
 		BUG_ON(!list_empty(&buf->cache));
 		BUG_ON(!list_empty(&buf->dirty));
-		if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
+		if (!radix_tree_lookup(&root->fs_info->cache_radix,
+				       buf->blocknr))
 			BUG();
-		radix_tree_delete(&root->cache_radix, buf->blocknr);
+		radix_tree_delete(&root->fs_info->cache_radix, buf->blocknr);
 		memset(buf, 0, sizeof(*buf));
 		free(buf);
 		BUG_ON(allocated_blocks == 0);
 		allocated_blocks--;
-		BUG_ON(root->cache_size == 0);
-		root->cache_size--;
+		BUG_ON(root->fs_info->cache_size == 0);
+		root->fs_info->cache_size--;
 	}
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c29b92d440e..09eeeb4d9d2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,13 +35,15 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key ins;
 	u32 refs;
 
-	find_free_extent(trans, root->extent_root, 0, 0, (u64)-1, &ins);
+	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1,
+			 &ins);
 	btrfs_init_path(&path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = 1;
-	ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 1);
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
+				0, 1);
 	if (ret != 0)
 		BUG();
 	BUG_ON(ret != 0);
@@ -51,9 +53,9 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_extent_refs(item, refs + 1);
 
 	BUG_ON(list_empty(&path.nodes[0]->dirty));
-	btrfs_release_path(root->extent_root, &path);
-	finish_current_insert(trans, root->extent_root);
-	run_pending(trans, root->extent_root);
+	btrfs_release_path(root->fs_info->extent_root, &path);
+	finish_current_insert(trans, root->fs_info->extent_root);
+	run_pending(trans, root->fs_info->extent_root);
 	return 0;
 }
 
@@ -70,13 +72,14 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.offset = 1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root->extent_root, &key, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
+				0, 0);
 	if (ret != 0)
 		BUG();
 	l = &path.nodes[0]->leaf;
 	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	*refs = btrfs_extent_refs(item);
-	btrfs_release_path(root->extent_root, &path);
+	btrfs_release_path(root->fs_info->extent_root, &path);
 	return 0;
 }
 
@@ -107,19 +110,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup(&root->pinned_radix,
-						 (void **)gang, 0,
-						 ARRAY_SIZE(gang));
+		ret = radix_tree_gang_lookup(&root->fs_info->pinned_radix,
+					     (void **)gang, 0,
+					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
 		if (!first)
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
-			radix_tree_delete(&root->pinned_radix, gang[i]);
+			radix_tree_delete(&root->fs_info->pinned_radix,
+					  gang[i]);
 		}
 	}
-	root->last_insert.objectid = first;
-	root->last_insert.offset = 0;
+	root->fs_info->last_insert.objectid = first;
+	root->fs_info->last_insert.offset = 0;
 	return 0;
 }
 
@@ -138,13 +142,14 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 
-	for (i = 0; i < extent_root->current_insert.flags; i++) {
-		ins.objectid = extent_root->current_insert.objectid + i;
+	for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) {
+		ins.objectid = extent_root->fs_info->current_insert.objectid +
+				i;
 		ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item,
 					sizeof(extent_item));
 		BUG_ON(ret);
 	}
-	extent_root->current_insert.offset = 0;
+	extent_root->fs_info->current_insert.offset = 0;
 	return 0;
 }
 
@@ -156,7 +161,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
-	struct btrfs_root *extent_root = root->extent_root;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int ret;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key ins;
@@ -186,14 +191,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (pin) {
 			int err;
 			radix_tree_preload(GFP_KERNEL);
-			err = radix_tree_insert(&extent_root->pinned_radix,
-					  blocknr, (void *)blocknr);
+			err = radix_tree_insert(
+					&extent_root->fs_info->pinned_radix,
+					blocknr, (void *)blocknr);
 			BUG_ON(err);
 			radix_tree_preload_end();
 		}
 		ret = btrfs_del_item(trans, extent_root, &path);
-		if (!pin && extent_root->last_insert.objectid > blocknr)
-			extent_root->last_insert.objectid = blocknr;
+		if (!pin && extent_root->fs_info->last_insert.objectid >
+		    blocknr)
+			extent_root->fs_info->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
@@ -214,18 +221,19 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup_tag(&extent_root->cache_radix,
-						 (void **)gang, 0,
-						 ARRAY_SIZE(gang),
-						 CTREE_EXTENT_PENDING_DEL);
+		ret = radix_tree_gang_lookup_tag(
+					&extent_root->fs_info->cache_radix,
+					(void **)gang, 0,
+					ARRAY_SIZE(gang),
+					CTREE_EXTENT_PENDING_DEL);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
 			ret = __free_extent(trans, extent_root,
 					    gang[i]->blocknr, 1, 1);
-			radix_tree_tag_clear(&extent_root->cache_radix,
-						gang[i]->blocknr,
-						CTREE_EXTENT_PENDING_DEL);
+			radix_tree_tag_clear(&extent_root->fs_info->cache_radix,
+					     gang[i]->blocknr,
+					     CTREE_EXTENT_PENDING_DEL);
 			btrfs_block_release(extent_root, gang[i]);
 		}
 	}
@@ -235,8 +243,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *extent_root)
 {
-	while(radix_tree_tagged(&extent_root->cache_radix,
-			        CTREE_EXTENT_PENDING_DEL))
+	while(radix_tree_tagged(&extent_root->fs_info->cache_radix,
+				CTREE_EXTENT_PENDING_DEL))
 		del_pending_extents(trans, extent_root);
 	return 0;
 }
@@ -248,19 +256,19 @@ static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin)
 {
-	struct btrfs_root *extent_root = root->extent_root;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_buffer *t;
 	int pending_ret;
 	int ret;
 
 	if (root == extent_root) {
 		t = find_tree_block(root, blocknr);
-		radix_tree_tag_set(&root->cache_radix, blocknr,
+		radix_tree_tag_set(&root->fs_info->cache_radix, blocknr,
 				   CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
 	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
-	pending_ret = run_pending(trans, root->extent_root);
+	pending_ret = run_pending(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
 
@@ -285,12 +293,12 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	u64 test_block;
 	int start_found;
 	struct btrfs_leaf *l;
-	struct btrfs_root * root = orig_root->extent_root;
+	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	int total_needed = num_blocks;
 
 	total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3;
-	if (root->last_insert.objectid > search_start)
-		search_start = root->last_insert.objectid;
+	if (root->fs_info->last_insert.objectid > search_start)
+		search_start = root->fs_info->last_insert.objectid;
 
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -353,16 +361,17 @@ check_pending:
 	BUG_ON(ins->objectid < search_start);
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + total_needed; test_block++) {
-		if (radix_tree_lookup(&root->pinned_radix, test_block)) {
+		if (radix_tree_lookup(&root->fs_info->pinned_radix,
+				      test_block)) {
 			search_start = test_block + 1;
 			goto check_failed;
 		}
 	}
-	BUG_ON(root->current_insert.offset);
-	root->current_insert.offset = total_needed - num_blocks;
-	root->current_insert.objectid = ins->objectid + num_blocks;
-	root->current_insert.flags = 0;
-	root->last_insert.objectid = ins->objectid;
+	BUG_ON(root->fs_info->current_insert.offset);
+	root->fs_info->current_insert.offset = total_needed - num_blocks;
+	root->fs_info->current_insert.objectid = ins->objectid + num_blocks;
+	root->fs_info->current_insert.flags = 0;
+	root->fs_info->last_insert.objectid = ins->objectid;
 	ins->offset = num_blocks;
 	return 0;
 error:
@@ -383,20 +392,20 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	int ret;
 	int pending_ret;
-	struct btrfs_root *extent_root = root->extent_root;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_extent_item extent_item;
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, owner);
 
 	if (root == extent_root) {
-		BUG_ON(extent_root->current_insert.offset == 0);
+		BUG_ON(extent_root->fs_info->current_insert.offset == 0);
 		BUG_ON(num_blocks != 1);
-		BUG_ON(extent_root->current_insert.flags ==
-		       extent_root->current_insert.offset);
+		BUG_ON(extent_root->fs_info->current_insert.flags ==
+		       extent_root->fs_info->current_insert.offset);
 		ins->offset = 1;
-		ins->objectid = extent_root->current_insert.objectid +
-				extent_root->current_insert.flags++;
+		ins->objectid = extent_root->fs_info->current_insert.objectid +
+				extent_root->fs_info->current_insert.flags++;
 		return 0;
 	}
 	ret = find_free_extent(trans, root, num_blocks, search_start,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f5b36c732c5..24cfd6d8524 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -4,4 +4,10 @@
 #include "radix-tree.h"
 #include "ctree.h"
 #include "disk-io.h"
+#include "transaction.h"
 
+int btrfs_create_file(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, u64 dirid, u64 *objectid)
+{
+	return 0;	/* stub: nothing is created, *objectid is not written */
+}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
new file mode 100644
index 00000000000..f412b339213
--- /dev/null
+++ b/fs/btrfs/inode-map.c
@@ -0,0 +1,162 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+/*
+ * Walk the inode map btree and find a hole: an objectid that has no inode
+ * map item.
+ *
+ * The search starts at dirid, or at fs_info->last_inode_alloc when the
+ * previous allocation was made for the same dirid.  On success 0 is returned,
+ * the free objectid is stored in *objectid and it is remembered in fs_info as
+ * the starting point of the next search.  A negative return value from
+ * btrfs_search_slot() or btrfs_next_leaf() is passed back unchanged.
+ */
+int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *fs_root,
+			     u64 dirid, u64 *objectid)
+{
+	struct btrfs_path path;
+	struct btrfs_key key;
+	int ret;
+	u64 hole_size = 0;
+	int slot = 0;
+	/* only read after start_found is set; zero it anyway */
+	u64 last_ino = 0;
+	int start_found;
+	struct btrfs_leaf *l;
+	struct btrfs_root *root = fs_root->fs_info->inode_root;
+	struct btrfs_key search_key;
+	u64 search_start = dirid;
+
+	if (fs_root->fs_info->last_inode_alloc_dirid == dirid)
+		search_start = fs_root->fs_info->last_inode_alloc;
+
+	search_key.objectid = search_start;
+	search_key.flags = 0;
+	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
+	search_key.offset = 0;
+
+	btrfs_init_path(&path);
+	start_found = 0;
+	ret = btrfs_search_slot(trans, root, &search_key, &path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	/* begin the scan one item before the slot the search returned */
+	if (path.slots[0] > 0)
+		path.slots[0]--;
+
+	while (1) {
+		l = &path.nodes[0]->leaf;
+		slot = path.slots[0];
+		if (slot >= btrfs_header_nritems(&l->header)) {
+			ret = btrfs_next_leaf(root, &path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			/*
+			 * ret > 0 (presumably: no further leaf).  Use
+			 * search_start, or the id after the last item seen
+			 * when that is larger.
+			 */
+			if (!start_found) {
+				*objectid = search_start;
+				goto found;
+			}
+			*objectid = last_ino > search_start ?
+				last_ino : search_start;
+			goto found;
+		}
+		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		if (key.objectid >= search_start) {
+			if (start_found) {
+				if (last_ino < search_start)
+					last_ino = search_start;
+				hole_size = key.objectid - last_ino;
+				if (hole_size > 0) {
+					*objectid = last_ino;
+					goto found;
+				}
+			}
+		}
+		start_found = 1;
+		last_ino = key.objectid + 1;
+		path.slots[0]++;
+	}
+	/* FIXME -ENOSPC */
+found:
+	root->fs_info->last_inode_alloc = *objectid;
+	root->fs_info->last_inode_alloc_dirid = dirid;
+	btrfs_release_path(root, &path);
+	BUG_ON(*objectid < search_start);
+	return 0;
+error:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+/*
+ * Insert an inode map item keyed by objectid into the inode map tree and
+ * store *location in it, converted to the on-disk key format.
+ *
+ * Returns 0 on success or the non-zero result of btrfs_insert_empty_item().
+ */
+int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *fs_root,
+			   u64 objectid, struct btrfs_key *location)
+{
+	int ret = 0;
+	struct btrfs_path path;
+	struct btrfs_inode_map_item *inode_item;
+	struct btrfs_key key;
+	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
+
+	key.objectid = objectid;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
+	key.offset = 0;
+	btrfs_init_path(&path);
+	ret = btrfs_insert_empty_item(trans, inode_root, &path, &key,
+				      sizeof(struct btrfs_inode_map_item));
+	if (ret)
+		goto out;
+
+	inode_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+				  struct btrfs_inode_map_item);
+	btrfs_cpu_key_to_disk(&inode_item->key, location);
+out:
+	btrfs_release_path(inode_root, &path);
+	return ret;
+}
+
+/*
+ * Search the inode map tree for the item keyed by objectid.  The path is not
+ * released here; that is left to the caller.
+ *
+ * mod < 0 passes an ins_len of -1 to btrfs_search_slot(), any non-zero mod
+ * sets its cow argument, and mod == 0 passes 0 for both.  The return value of
+ * btrfs_search_slot() is returned unchanged.
+ */
+int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *fs_root, struct btrfs_path *path,
+			   u64 objectid, int mod)
+{
+	int ret;
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
+
+	key.objectid = objectid;
+	key.flags = 0;
+	key.offset = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
+	ret = btrfs_search_slot(trans, inode_root, &key, path, ins_len, cow);
+	return ret;
+}
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 9aa900811c3..1cac5ab114d 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -42,7 +42,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	btrfs_set_header_parentid(&empty_leaf->header,
 				  BTRFS_ROOT_TREE_OBJECTID);
 	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 1);
-	btrfs_set_header_nritems(&empty_leaf->header, 2);
+	btrfs_set_header_nritems(&empty_leaf->header, 3);
 
 	/* create the items for the root tree */
 	btrfs_set_root_blocknr(&root_item, start_block + 2);
@@ -61,8 +61,16 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	btrfs_set_root_blocknr(&root_item, start_block + 3);
 	itemoff = itemoff - sizeof(root_item);
 	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_disk_key_objectid(&item.key, BTRFS_INODE_MAP_OBJECTID);
 	memcpy(empty_leaf->items + 1, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
+		&root_item, sizeof(root_item));
+
+	btrfs_set_root_blocknr(&root_item, start_block + 4);
+	itemoff = itemoff - sizeof(root_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
+	memcpy(empty_leaf->items + 2, &item, sizeof(item));
 	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
 		&root_item, sizeof(root_item));
 	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 1) * blocksize);
@@ -71,7 +79,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	btrfs_set_header_parentid(&empty_leaf->header,
 				  BTRFS_EXTENT_TREE_OBJECTID);
 	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 2);
-	btrfs_set_header_nritems(&empty_leaf->header, 4);
+	btrfs_set_header_nritems(&empty_leaf->header, 5);
 
 	/* item1, reserve blocks 0-16 */
 	btrfs_set_disk_key_objectid(&item.key, 0);
@@ -108,12 +116,12 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
 		&extent_item, btrfs_item_size(&item));
 
-	/* item4, give block 19 to the FS root */
+	/* item4, give block 19 to the inode map */
 	btrfs_set_disk_key_objectid(&item.key, start_block + 3);
 	btrfs_set_disk_key_offset(&item.key, 1);
 	itemoff = itemoff - sizeof(struct btrfs_extent_item);
 	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_extent_owner(&extent_item, BTRFS_INODE_MAP_OBJECTID);
 	memcpy(empty_leaf->items + 3, &item, sizeof(item));
 	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
 		&extent_item, btrfs_item_size(&item));
@@ -121,11 +129,33 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	if (ret != blocksize)
 		return -1;
 
-	/* finally create the FS root */
-	btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID);
+	/* item5, give block 20 to the FS root */
+	btrfs_set_disk_key_objectid(&item.key, start_block + 4);
+	btrfs_set_disk_key_offset(&item.key, 1);
+	itemoff = itemoff - sizeof(struct btrfs_extent_item);
+	btrfs_set_item_offset(&item, itemoff);
+	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
+	memcpy(empty_leaf->items + 4, &item, sizeof(item));
+	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
+		&extent_item, btrfs_item_size(&item));
+	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize);
+	if (ret != blocksize)
+		return -1;
+
+	/* create the inode map */
+	btrfs_set_header_parentid(&empty_leaf->header,
+				  BTRFS_INODE_MAP_OBJECTID);
 	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 3);
 	btrfs_set_header_nritems(&empty_leaf->header, 0);
 	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 3) * blocksize);
+	if (ret != blocksize)
+		return -1;
+
+	/* finally create the FS root */
+	btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID);
+	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 4);
+	btrfs_set_header_nritems(&empty_leaf->header, 0);
+	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 4) * blocksize);
 	if (ret != blocksize)
 		return -1;
 	return 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0bb5c38427c..f250e5fad77 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -13,8 +13,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
+	struct btrfs_inode_map_item *mi;
 	u32 type;
-	u32 namelen;
 
 	printf("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr,
@@ -34,15 +34,13 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		case BTRFS_INODE_ITEM_KEY:
 			break;
 		case BTRFS_DIR_ITEM_KEY:
-			namelen = btrfs_item_size(l->items + i) - sizeof(*di);
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			printf("\t\tdir oid %Lu flags %u type %u\n",
 				btrfs_dir_objectid(di),
 				btrfs_dir_flags(di),
 				btrfs_dir_type(di));
 			printf("\t\tname %.*s\n",
-				namelen, (char *)(di + 1));
-
+			       btrfs_dir_name_len(di),(char *)(di + 1));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@@ -54,6 +52,13 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			printf("\t\textent data refs %u owner %Lu\n",
 				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
 			break;
+		case BTRFS_INODE_MAP_ITEM_KEY:
+			mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item);
+			printf("\t\tinode map key %Lu %u %Lu\n",
+			       btrfs_disk_key_objectid(&mi->key),
+			       btrfs_disk_key_flags(&mi->key),
+			       btrfs_disk_key_offset(&mi->key));
+			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printf("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
index d676577185d..ef12cf88878 100644
--- a/fs/btrfs/quick-test.c
+++ b/fs/btrfs/quick-test.c
@@ -172,7 +172,8 @@ int main(int ac, char **av) {
 	printf("tree size is now %d\n", tree_size);
 	printf("root %p commit root %p\n", root->node, root->commit_root);
 	printf("map tree\n");
-	btrfs_print_tree(root->extent_root, root->extent_root->node);
+	btrfs_print_tree(root->fs_info->extent_root,
+			 root->fs_info->extent_root->node);
 	close_ctree(root, &super);
 	return 0;
 }
-- 
cgit v1.2.3


From 313a013bb927cfba849b1bb746e9112ba856e43a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Mar 2007 14:51:12 -0400
Subject: Btrfs: Update TODO

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index e9894999322..c5d67bbd4d9 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -4,7 +4,6 @@
 * make a real mkfs and superblock
 * Do checksumming
 * Define FS objects in terms of different item types
-* add inode tree
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
-- 
cgit v1.2.3


From 293ffd5fd340428276fbbd24ce7b98bf6728466b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Mar 2007 15:57:25 -0400
Subject: Btrfs: change dir-test to insert inode_items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |  6 ++++++
 fs/btrfs/dir-test.c   | 39 ++++++++++++++++++++++++++++++---------
 fs/btrfs/disk-io.c    |  2 ++
 fs/btrfs/print-tree.c |  5 +++++
 4 files changed, 43 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68e3da274f1..b30b2ce7245 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -219,6 +219,7 @@ struct btrfs_fs_info {
 	struct list_head cache;
 	u64 last_inode_alloc;
 	u64 last_inode_alloc_dirid;
+	u64 generation;
 	int cache_size;
 	int fp;
 	struct btrfs_trans_handle *running_transaction;
@@ -802,4 +803,9 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_path *path,
 			   u64 objectid, int mod);
+int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, u64 objectid, struct btrfs_inode_item
+		       *inode_item);
+int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
+		       *root, struct btrfs_path *path, u64 objectid, int mod);
 #endif
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
index 56f06c3ba57..8fc77c83a35 100644
--- a/fs/btrfs/dir-test.c
+++ b/fs/btrfs/dir-test.c
@@ -39,6 +39,13 @@ again:
 	return 0;
 }
 
+static void initial_inode_init(struct btrfs_root *root,
+			       struct btrfs_inode_item *inode_item)
+{
+	memset(inode_item, 0, sizeof(*inode_item));
+	btrfs_set_inode_generation(inode_item, root->fs_info->generation);
+}
+
 static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct radix_tree_root *radix)
 {
@@ -48,6 +55,7 @@ static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	u64 objectid;
 	struct btrfs_path path;
 	struct btrfs_key inode_map;
+	struct btrfs_inode_item inode_item;
 
 	find_num(radix, &oid, 0);
 	sprintf(buf, "str-%lu", oid);
@@ -61,6 +69,11 @@ static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	inode_map.offset = 0;
 
 	ret = btrfs_insert_inode_map(trans, root, objectid, &inode_map);
+	if (ret)
+		goto error;
+
+	initial_inode_init(root, &inode_item);
+	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
 	if (ret)
 		goto error;
 	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
@@ -143,7 +156,6 @@ static int del_dir_item(struct btrfs_trans_handle *trans,
 	unsigned long *ptr;
 	u64 file_objectid;
 	struct btrfs_dir_item *di;
-	struct btrfs_path map_path;
 
 	/* find the inode number of the file */
 	di = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
@@ -153,20 +165,31 @@ static int del_dir_item(struct btrfs_trans_handle *trans,
 	/* delete the directory item */
 	ret = btrfs_del_item(trans, root, path);
 	if (ret)
-		goto out;
+		goto out_release;
+	btrfs_release_path(root, path);
+
+	/* delete the inode */
+	btrfs_init_path(path);
+	ret = btrfs_lookup_inode(trans, root, path, file_objectid, -1);
+	if (ret)
+		goto out_release;
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out_release;
+	btrfs_release_path(root, path);
 
 	/* delete the inode mapping */
-	btrfs_init_path(&map_path);
-	ret = btrfs_lookup_inode_map(trans, root, &map_path, file_objectid, -1);
+	btrfs_init_path(path);
+	ret = btrfs_lookup_inode_map(trans, root, path, file_objectid, -1);
 	if (ret)
 		goto out_release;
-	ret = btrfs_del_item(trans, root->fs_info->inode_root, &map_path);
+	ret = btrfs_del_item(trans, root->fs_info->inode_root, path);
 	if (ret)
 		goto out_release;
 
 	if (root->fs_info->last_inode_alloc > file_objectid)
 		root->fs_info->last_inode_alloc = file_objectid;
-	btrfs_release_path(root, &map_path);
+	btrfs_release_path(root, path);
 	ptr = radix_tree_delete(radix, radix_index);
 	if (!ptr) {
 		ret = -5555;
@@ -174,7 +197,7 @@ static int del_dir_item(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 out_release:
-	btrfs_release_path(root, &map_path);
+	btrfs_release_path(root, path);
 out:
 	printf("failed to delete %lu %d\n", radix_index, ret);
 	return -1;
@@ -201,7 +224,6 @@ static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	ret = del_dir_item(trans, root, radix, oid, &path);
 	if (ret)
 		goto out_release;
-	btrfs_release_path(root, &path);
 	return ret;
 out_release:
 	btrfs_release_path(root, &path);
@@ -312,7 +334,6 @@ static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 				found);
 			return -1;
 		}
-		btrfs_release_path(root, &path);
 		if (!keep_running)
 			break;
 	}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1849a99690c..bacaa38ea82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -226,6 +226,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 
 	ret = btrfs_del_root(trans, root->fs_info->tree_root, &snap_key);
 	BUG_ON(ret);
+	root->fs_info->generation = root->root_key.offset + 1;
 
 	return ret;
 }
@@ -328,6 +329,7 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	root->commit_root = root->node;
 	root->node->count++;
 	root->ref_cows = 1;
+	root->fs_info->generation = root->root_key.offset + 1;
 	return root;
 }
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f250e5fad77..f53b99da12f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -14,6 +14,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_map_item *mi;
+	struct btrfs_inode_item *ii;
 	u32 type;
 
 	printf("leaf %Lu total ptrs %d free space %d\n",
@@ -32,6 +33,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			btrfs_item_size(item));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
+			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
+			printf("\t\tinode generation %Lu size %Lu\n",
+			       btrfs_inode_generation(ii),
+			       btrfs_inode_size(ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
-- 
cgit v1.2.3


From 1261ec42b3d3a3ad878bd172144940e3ac710749 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Mar 2007 20:35:03 -0400
Subject: Btrfs: Better block record keeping, real mkfs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |  7 ++--
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/debug-tree.c  | 13 ++++++-
 fs/btrfs/dir-test.c    |  2 --
 fs/btrfs/disk-io.c     | 11 ++----
 fs/btrfs/extent-tree.c | 24 ++++++++++---
 fs/btrfs/mkfs.c        | 95 +++++++++++++++++++++++++++++++++++++++++++++++++-
 7 files changed, 134 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 636c63efde6..0720169b6d6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -2,7 +2,7 @@ CC=gcc
 CFLAGS = -g -Wall -Werror
 headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \
 	  transaction.h
-objects = ctree.o disk-io.o radix-tree.o mkfs.o extent-tree.o print-tree.o \
+objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
 	  inode-map.o \
 
@@ -16,7 +16,10 @@ check=sparse $(CHECKFLAGS)
 	$(check) $<
 	$(CC) $(CFLAGS) -c $<
 
-all: tester debug-tree quick-test dir-test tags
+all: tester debug-tree quick-test dir-test tags mkfs.btrfs
+
+mkfs.btrfs: $(objects) mkfs.o
+	gcc $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o
 
 debug-tree: $(objects) debug-tree.o
 	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b30b2ce7245..1a4d1d6fa40 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -223,6 +223,7 @@ struct btrfs_fs_info {
 	int cache_size;
 	int fp;
 	struct btrfs_trans_handle *running_transaction;
+	struct btrfs_super_block *disk_super;
 };
 
 /*
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
index d9f36efc3dd..fd07969600c 100644
--- a/fs/btrfs/debug-tree.c
+++ b/fs/btrfs/debug-tree.c
@@ -10,8 +10,17 @@
 int main(int ac, char **av) {
 	struct btrfs_super_block super;
 	struct btrfs_root *root;
+
+	if (ac != 2) {
+		fprintf(stderr, "usage: %s device\n", av[0]);
+		exit(1);
+	}
 	radix_tree_init();
-	root = open_ctree("dbfile", &super);
+	root = open_ctree(av[1], &super);
+	if (!root) {
+		fprintf(stderr, "unable to open %s\n", av[1]);
+		exit(1);
+	}
 	printf("fs tree\n");
 	btrfs_print_tree(root, root->node);
 	printf("map tree\n");
@@ -23,5 +32,7 @@ int main(int ac, char **av) {
 	printf("root tree\n");
 	btrfs_print_tree(root->fs_info->tree_root,
 			 root->fs_info->tree_root->node);
+	printf("total blocks %Lu\n", btrfs_super_total_blocks(&super));
+	printf("blocks used %Lu\n", btrfs_super_blocks_used(&super));
 	return 0;
 }
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
index 8fc77c83a35..b673982a1f3 100644
--- a/fs/btrfs/dir-test.c
+++ b/fs/btrfs/dir-test.c
@@ -425,8 +425,6 @@ int main(int ac, char **av)
 	struct btrfs_trans_handle *trans;
 	radix_tree_init();
 
-	printf("removing old tree\n");
-	unlink("dbfile");
 	root = open_ctree("dbfile", &super);
 	trans = btrfs_start_transaction(root, 1);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bacaa38ea82..0322c55162c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -293,20 +293,15 @@ struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
 	fs_info->inode_root = inode_root;
 	fs_info->last_inode_alloc = 0;
 	fs_info->last_inode_alloc_dirid = 0;
+	fs_info->disk_super = super;
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
 	ret = pread(fp, super, sizeof(struct btrfs_super_block),
 		     BTRFS_SUPER_INFO_OFFSET);
 	if (ret == 0 || btrfs_super_root(super) == 0) {
-		printf("making new FS!\n");
-		ret = mkfs(fp, 0, 1024);
-		if (ret)
-			return NULL;
-		ret = pread(fp, super, sizeof(struct btrfs_super_block),
-			     BTRFS_SUPER_INFO_OFFSET);
-		if (ret != sizeof(struct btrfs_super_block))
-			return NULL;
+		BUG();
+		return NULL;
 	}
 	BUG_ON(ret < 0);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 09eeeb4d9d2..9bc4ad38876 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -134,6 +134,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	struct btrfs_extent_item extent_item;
 	int i;
 	int ret;
+	u64 super_blocks_used;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item,
@@ -145,6 +147,9 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) {
 		ins.objectid = extent_root->fs_info->current_insert.objectid +
 				i;
+		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
+		btrfs_set_super_blocks_used(info->disk_super,
+					    super_blocks_used + 1);
 		ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item,
 					sizeof(extent_item));
 		BUG_ON(ret);
@@ -161,7 +166,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
 	int ret;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key ins;
@@ -188,15 +194,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
 	if (refs == 0) {
+		u64 super_blocks_used;
 		if (pin) {
 			int err;
 			radix_tree_preload(GFP_KERNEL);
-			err = radix_tree_insert(
-					&extent_root->fs_info->pinned_radix,
-					blocknr, (void *)blocknr);
+			err = radix_tree_insert(&info->pinned_radix,
+						blocknr, (void *)blocknr);
 			BUG_ON(err);
 			radix_tree_preload_end();
 		}
+		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
+		btrfs_set_super_blocks_used(info->disk_super,
+					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, &path);
 		if (!pin && extent_root->fs_info->last_insert.objectid >
 		    blocknr)
@@ -392,7 +401,9 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	int ret;
 	int pending_ret;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	u64 super_blocks_used;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
 
 	btrfs_set_extent_refs(&extent_item, 1);
@@ -413,6 +424,9 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret)
 		return ret;
 
+	super_blocks_used = btrfs_super_blocks_used(info->disk_super);
+	btrfs_set_super_blocks_used(info->disk_super, super_blocks_used +
+				    num_blocks);
 	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
 				sizeof(extent_item));
 
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
index 1cac5ab114d..f7efc8a5fb1 100644
--- a/fs/btrfs/mkfs.c
+++ b/fs/btrfs/mkfs.c
@@ -1,4 +1,8 @@
 #define _XOPEN_SOURCE 500
+#ifndef __CHECKER__
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
@@ -10,6 +14,17 @@
 #include "ctree.h"
 #include "disk-io.h"
 
+#ifdef __CHECKER__
+#define BLKGETSIZE64 0
+static inline int ioctl(int fd, int define, u64 *size) { return 0; }
+#endif
+
+#if 0
+#if defined(__linux__) && defined(_IOR) && !defined(BLKGETSIZE64)
+#   define BLKGETSIZE64 _IOR(0x12, 114, __u64)
+#endif
+#endif
+
 int mkfs(int fd, u64 num_blocks, u32 blocksize)
 {
 	struct btrfs_super_block super;
@@ -27,7 +42,7 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 	strcpy((char *)(&super.magic), BTRFS_MAGIC);
 	btrfs_set_super_blocksize(&super, blocksize);
 	btrfs_set_super_total_blocks(&super, num_blocks);
-	btrfs_set_super_blocks_used(&super, 0);
+	btrfs_set_super_blocks_used(&super, start_block + 5);
 
 	block = malloc(blocksize);
 	memset(block, 0, blocksize);
@@ -160,3 +175,81 @@ int mkfs(int fd, u64 num_blocks, u32 blocksize)
 		return -1;
 	return 0;
 }
+
+u64 device_size(int fd, struct stat *st)
+{
+	u64 size;
+	if (S_ISREG(st->st_mode)) {
+		return st->st_size;
+	}
+	if (!S_ISBLK(st->st_mode)) {
+		return 0;
+	}
+	if (ioctl(fd, BLKGETSIZE64, &size) >= 0) {
+		return size;
+	}
+	return 0;
+}
+
+int main(int ac, char **av)
+{
+	char *file;
+	u64 block_count = 0;
+	int fd;
+	struct stat st;
+	int ret;
+	int i;
+	char *buf = malloc(4096);
+	if (ac >= 2) {
+		file = av[1];
+		if (ac == 3) {
+			block_count = atoi(av[2]);
+			if (!block_count) {
+				fprintf(stderr, "error finding block count\n");
+				exit(1);
+			}
+		}
+	} else {
+		fprintf(stderr, "usage: mkfs.btrfs file [block count]\n");
+		exit(1);
+	}
+	fd = open(file, O_RDWR);
+	if (fd < 0) {
+		fprintf(stderr, "unable to open %s\n", file);
+		exit(1);
+	}
+	ret = fstat(fd, &st);
+	if (ret < 0) {
+		fprintf(stderr, "unable to stat %s\n", file);
+		exit(1);
+	}
+	if (block_count == 0) {
+		block_count = device_size(fd, &st);
+		if (block_count == 0) {
+			fprintf(stderr, "unable to find %s size\n", file);
+			exit(1);
+		}
+	}
+	block_count /= 4096;
+	if (block_count < 256) {
+		fprintf(stderr, "device %s is too small\n", file);
+		exit(1);
+	}
+	memset(buf, 0, 4096);
+	for(i = 0; i < 6; i++) {
+		ret = write(fd, buf, 4096);
+		if (ret != 4096) {
+			fprintf(stderr, "unable to zero fill device\n");
+			exit(1);
+		}
+	}
+	ret = mkfs(fd, block_count, 4096);
+	if (ret) {
+		fprintf(stderr, "error during mkfs %d\n", ret);
+		exit(1);
+	}
+	printf("fs created on %s blocksize %d blocks %Lu\n",
+	       file, 4096, block_count);
+	return 0;
+}
+
-- 
cgit v1.2.3


From 2e635a278354a1a7951e16cfea4c247d6d0e7c99 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 21 Mar 2007 11:12:56 -0400
Subject: Btrfs: initial move to kernel module land

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |  50 +--
 fs/btrfs/ctree.c       |   6 +-
 fs/btrfs/ctree.h       |  15 +-
 fs/btrfs/debug-tree.c  |  38 ---
 fs/btrfs/dir-item.c    |  12 +-
 fs/btrfs/dir-test.c    | 494 -----------------------------
 fs/btrfs/disk-io.c     |  17 +-
 fs/btrfs/disk-io.h     |   1 +
 fs/btrfs/extent-tree.c |  10 +-
 fs/btrfs/file-item.c   |   6 +-
 fs/btrfs/hash.c        |   1 -
 fs/btrfs/hasher.c      |  23 --
 fs/btrfs/inode-item.c  |   5 +-
 fs/btrfs/inode-map.c   |   5 +-
 fs/btrfs/kerncompat.h  |  96 ------
 fs/btrfs/list.h        | 418 -------------------------
 fs/btrfs/mkfs.c        | 255 ---------------
 fs/btrfs/print-tree.c  |  30 +-
 fs/btrfs/quick-test.c  | 179 -----------
 fs/btrfs/radix-tree.c  | 836 -------------------------------------------------
 fs/btrfs/radix-tree.h  |  73 -----
 fs/btrfs/random-test.c | 405 ------------------------
 fs/btrfs/root-tree.c   |   5 +-
 fs/btrfs/super.c       | 205 ++++++++++++
 24 files changed, 274 insertions(+), 2911 deletions(-)
 delete mode 100644 fs/btrfs/debug-tree.c
 delete mode 100644 fs/btrfs/dir-test.c
 delete mode 100644 fs/btrfs/hasher.c
 delete mode 100644 fs/btrfs/kerncompat.h
 delete mode 100644 fs/btrfs/list.h
 delete mode 100644 fs/btrfs/mkfs.c
 delete mode 100644 fs/btrfs/quick-test.c
 delete mode 100644 fs/btrfs/radix-tree.c
 delete mode 100644 fs/btrfs/radix-tree.h
 delete mode 100644 fs/btrfs/random-test.c
 create mode 100644 fs/btrfs/super.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0720169b6d6..99e45a54ebd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,40 +1,20 @@
-CC=gcc
-CFLAGS = -g -Wall -Werror
-headers = radix-tree.h ctree.h disk-io.h kerncompat.h print-tree.h list.h \
-	  transaction.h
-objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
-	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
-	  inode-map.o \
+ifneq ($(KERNELRELEASE),)
+# kbuild part of makefile
 
-# if you don't have sparse installed, use ls instead
-CHECKFLAGS=-D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ -Wbitwise \
-		-Wcontext -Wcast-truncate -Wuninitialized -Wshadow -Wundef
-check=sparse $(CHECKFLAGS)
-#check=ls
+obj-m  := btrfs.o
+btrfs-y := super.o
 
-.c.o:
-	$(check) $<
-	$(CC) $(CFLAGS) -c $<
+#btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
+#	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
+#	  inode-map.o \
 
-all: tester debug-tree quick-test dir-test tags mkfs.btrfs
-
-mkfs.btrfs: $(objects) mkfs.o
-	gcc $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o
-
-debug-tree: $(objects) debug-tree.o
-	gcc $(CFLAGS) -o debug-tree $(objects) debug-tree.o
-
-tester: $(objects) random-test.o
-	gcc $(CFLAGS) -o tester $(objects) random-test.o
-
-dir-test: $(objects) dir-test.o
-	gcc $(CFLAGS) -o dir-test $(objects) dir-test.o
-quick-test: $(objects) quick-test.o
-	gcc $(CFLAGS) -o quick-test $(objects) quick-test.o
-
-$(objects): $(headers)
-
-clean :
-	rm debug-tree tester *.o
+else
 
+# Normal Makefile
 
+KERNELDIR := /lib/modules/`uname -r`/build
+all::
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+clean::
+	rm *.o btrfs.ko
+endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 32922643b5b..9fbd07c37fd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,10 +1,6 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
-#include "print-tree.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a4d1d6fa40..ae8518cb94b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,9 +1,6 @@
 #ifndef __BTRFS__
 #define __BTRFS__
 
-#include "list.h"
-#include "kerncompat.h"
-
 struct btrfs_trans_handle;
 
 #define BTRFS_MAGIC "_BtRfS_M"
@@ -75,6 +72,7 @@ struct btrfs_super_block {
 	__le64 root;
 	__le64 total_blocks;
 	__le64 blocks_used;
+	__le64 root_dir_objectid;
 } __attribute__ ((__packed__));
 
 /*
@@ -693,6 +691,17 @@ static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s,
 	s->blocksize = cpu_to_le32(val);
 }
 
+static inline u64 btrfs_super_root_dir(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->root_dir_objectid);
+}
+
+static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64
+					    val)
+{
+	s->root_dir_objectid = cpu_to_le64(val);
+}
+
 static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
 	return (u8 *)l->items;
diff --git a/fs/btrfs/debug-tree.c b/fs/btrfs/debug-tree.c
deleted file mode 100644
index fd07969600c..00000000000
--- a/fs/btrfs/debug-tree.c
+++ /dev/null
@@ -1,38 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-
-int main(int ac, char **av) {
-	struct btrfs_super_block super;
-	struct btrfs_root *root;
-
-	if (ac != 2) {
-		fprintf(stderr, "usage: %s device\n", av[0]);
-		exit(1);
-	}
-	radix_tree_init();
-	root = open_ctree(av[1], &super);
-	if (!root) {
-		fprintf(stderr, "unable to open %s\n", av[1]);
-		exit(1);
-	}
-	printf("fs tree\n");
-	btrfs_print_tree(root, root->node);
-	printf("map tree\n");
-	btrfs_print_tree(root->fs_info->extent_root,
-			 root->fs_info->extent_root->node);
-	printf("inode tree\n");
-	btrfs_print_tree(root->fs_info->inode_root,
-			 root->fs_info->inode_root->node);
-	printf("root tree\n");
-	btrfs_print_tree(root->fs_info->tree_root,
-			 root->fs_info->tree_root->node);
-	printf("total blocks %Lu\n", btrfs_super_total_blocks(&super));
-	printf("blocks used %Lu\n", btrfs_super_blocks_used(&super));
-	return 0;
-}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 949c4e52679..4d8083d92fa 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -1,7 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -21,7 +18,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	ret = btrfs_name_hash(name, name_len, &key.offset);
+	if (name_len == 1 && *name == '.')
+		key.offset = 1;
+	else if (name_len == 2 && name[0] == '.' && name[1] == '.')
+		key.offset = 2;
+	else
+		ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	btrfs_init_path(&path);
 	data_size = sizeof(*dir_item) + name_len;
diff --git a/fs/btrfs/dir-test.c b/fs/btrfs/dir-test.c
deleted file mode 100644
index b673982a1f3..00000000000
--- a/fs/btrfs/dir-test.c
+++ /dev/null
@@ -1,494 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <unistd.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "hash.h"
-#include "transaction.h"
-
-int keep_running = 1;
-struct btrfs_super_block super;
-static u64 dir_oid = 44556;
-static u64 file_oid = 33778;
-
-static int find_num(struct radix_tree_root *root, unsigned long *num_ret,
-		     int exists)
-{
-	unsigned long num = rand();
-	unsigned long res[2];
-	int ret;
-
-again:
-	ret = radix_tree_gang_lookup(root, (void **)res, num, 2);
-	if (exists) {
-		if (ret == 0)
-			return -1;
-		num = res[0];
-	} else if (ret != 0 && num == res[0]) {
-		num++;
-		if (ret > 1 && num == res[1]) {
-			num++;
-			goto again;
-		}
-	}
-	*num_ret = num;
-	return 0;
-}
-
-static void initial_inode_init(struct btrfs_root *root,
-			       struct btrfs_inode_item *inode_item)
-{
-	memset(inode_item, 0, sizeof(*inode_item));
-	btrfs_set_inode_generation(inode_item, root->fs_info->generation);
-}
-
-static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	int ret;
-	char buf[128];
-	unsigned long oid;
-	u64 objectid;
-	struct btrfs_path path;
-	struct btrfs_key inode_map;
-	struct btrfs_inode_item inode_item;
-
-	find_num(radix, &oid, 0);
-	sprintf(buf, "str-%lu", oid);
-
-	ret = btrfs_find_free_objectid(trans, root, dir_oid + 1, &objectid);
-	if (ret)
-		goto error;
-
-	inode_map.objectid = objectid;
-	inode_map.flags = 0;
-	inode_map.offset = 0;
-
-	ret = btrfs_insert_inode_map(trans, root, objectid, &inode_map);
-	if (ret)
-		goto error;
-
-	initial_inode_init(root, &inode_item);
-	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
-	if (ret)
-		goto error;
-	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
-				    objectid, 1);
-	if (ret)
-		goto error;
-
-	radix_tree_preload(GFP_KERNEL);
-	ret = radix_tree_insert(radix, oid, (void *)oid);
-	radix_tree_preload_end();
-	if (ret)
-		goto error;
-	return ret;
-error:
-	if (ret != -EEXIST)
-		goto fatal;
-
-	/*
-	 * if we got an EEXIST, it may be due to hash collision, double
-	 * check
-	 */
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
-				    strlen(buf), 0);
-	if (ret)
-		goto fatal_release;
-	if (!btrfs_match_dir_item_name(root, &path, buf, strlen(buf))) {
-		struct btrfs_dir_item *di;
-		char *found;
-		u32 found_len;
-		u64 myhash;
-		u64 foundhash;
-
-		di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
-				    struct btrfs_dir_item);
-		found = (char *)(di + 1);
-		found_len = btrfs_dir_name_len(di);
-		btrfs_name_hash(buf, strlen(buf), &myhash);
-		btrfs_name_hash(found, found_len, &foundhash);
-		if (myhash != foundhash)
-			goto fatal_release;
-		btrfs_release_path(root, &path);
-		return 0;
-	}
-fatal_release:
-	btrfs_release_path(root, &path);
-fatal:
-	printf("failed to insert %lu ret %d\n", oid, ret);
-	return -1;
-}
-
-static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct radix_tree_root *radix)
-{
-	int ret;
-	char buf[128];
-	unsigned long oid;
-
-	ret = find_num(radix, &oid, 1);
-	if (ret < 0)
-		return 0;
-	sprintf(buf, "str-%lu", oid);
-
-	ret = btrfs_insert_dir_item(trans, root, buf, strlen(buf), dir_oid,
-				    file_oid, 1);
-	if (ret != -EEXIST) {
-		printf("insert on %s gave us %d\n", buf, ret);
-		return 1;
-	}
-	return 0;
-}
-
-static int del_dir_item(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct radix_tree_root *radix,
-			unsigned long radix_index,
-			struct btrfs_path *path)
-{
-	int ret;
-	unsigned long *ptr;
-	u64 file_objectid;
-	struct btrfs_dir_item *di;
-
-	/* find the inode number of the file */
-	di = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
-			    struct btrfs_dir_item);
-	file_objectid = btrfs_dir_objectid(di);
-
-	/* delete the directory item */
-	ret = btrfs_del_item(trans, root, path);
-	if (ret)
-		goto out_release;
-	btrfs_release_path(root, path);
-
-	/* delete the inode */
-	btrfs_init_path(path);
-	ret = btrfs_lookup_inode(trans, root, path, file_objectid, -1);
-	if (ret)
-		goto out_release;
-	ret = btrfs_del_item(trans, root, path);
-	if (ret)
-		goto out_release;
-	btrfs_release_path(root, path);
-
-	/* delete the inode mapping */
-	btrfs_init_path(path);
-	ret = btrfs_lookup_inode_map(trans, root, path, file_objectid, -1);
-	if (ret)
-		goto out_release;
-	ret = btrfs_del_item(trans, root->fs_info->inode_root, path);
-	if (ret)
-		goto out_release;
-
-	if (root->fs_info->last_inode_alloc > file_objectid)
-		root->fs_info->last_inode_alloc = file_objectid;
-	btrfs_release_path(root, path);
-	ptr = radix_tree_delete(radix, radix_index);
-	if (!ptr) {
-		ret = -5555;
-		goto out;
-	}
-	return 0;
-out_release:
-	btrfs_release_path(root, path);
-out:
-	printf("failed to delete %lu %d\n", radix_index, ret);
-	return -1;
-}
-
-static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	int ret;
-	char buf[128];
-	unsigned long oid;
-	struct btrfs_path path;
-
-	ret = find_num(radix, &oid, 1);
-	if (ret < 0)
-		return 0;
-	sprintf(buf, "str-%lu", oid);
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
-				    strlen(buf), -1);
-	if (ret)
-		goto out_release;
-
-	ret = del_dir_item(trans, root, radix, oid, &path);
-	if (ret)
-		goto out_release;
-	return ret;
-out_release:
-	btrfs_release_path(root, &path);
-	printf("failed to delete %lu %d\n", oid, ret);
-	return -1;
-}
-
-static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	char buf[128];
-	int ret;
-	unsigned long oid;
-	u64 objectid;
-	struct btrfs_dir_item *di;
-
-	ret = find_num(radix, &oid, 1);
-	if (ret < 0)
-		return 0;
-	sprintf(buf, "str-%lu", oid);
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
-				    strlen(buf), 0);
-	if (!ret) {
-		di = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
-				    struct btrfs_dir_item);
-		objectid = btrfs_dir_objectid(di);
-		btrfs_release_path(root, &path);
-		btrfs_init_path(&path);
-		ret = btrfs_lookup_inode_map(trans, root, &path, objectid, 0);
-	}
-	btrfs_release_path(root, &path);
-	if (ret) {
-		printf("unable to find key %lu\n", oid);
-		return -1;
-	}
-	return 0;
-}
-
-static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	char buf[128];
-	int ret;
-	unsigned long oid;
-
-	ret = find_num(radix, &oid, 0);
-	if (ret < 0)
-		return 0;
-	sprintf(buf, "str-%lu", oid);
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(trans, root, &path, dir_oid, buf,
-				    strlen(buf), 0);
-	btrfs_release_path(root, &path);
-	if (!ret) {
-		printf("able to find key that should not exist %lu\n", oid);
-		return -1;
-	}
-	return 0;
-}
-
-static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct radix_tree_root *radix, int nr)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	unsigned long found = 0;
-	u32 found_len;
-	int ret;
-	int slot;
-	int count = 0;
-	char buf[128];
-	struct btrfs_dir_item *di;
-
-	key.offset = (u64)-1;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	key.objectid = dir_oid;
-	while(nr-- >= 0) {
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
-		if (ret < 0) {
-			btrfs_release_path(root, &path);
-			return ret;
-		}
-		if (ret != 0) {
-			if (path.slots[0] == 0) {
-				btrfs_release_path(root, &path);
-				break;
-			}
-			path.slots[0] -= 1;
-		}
-		slot = path.slots[0];
-		di = btrfs_item_ptr(&path.nodes[0]->leaf, slot,
-				    struct btrfs_dir_item);
-		found_len = btrfs_dir_name_len(di);
-		memcpy(buf, (char *)(di + 1), found_len);
-		BUG_ON(found_len > 128);
-		buf[found_len] = '\0';
-		found = atoi(buf + 4);
-		ret = del_dir_item(trans, root, radix, found, &path);
-		count++;
-		if (ret) {
-			fprintf(stderr,
-				"failed to remove %lu from tree\n",
-				found);
-			return -1;
-		}
-		if (!keep_running)
-			break;
-	}
-	return 0;
-	fprintf(stderr, "failed to delete from the radix %lu\n", found);
-	return -1;
-}
-
-static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct radix_tree_root *radix, int count)
-{
-	int i;
-	int ret = 0;
-	for (i = 0; i < count; i++) {
-		ret = ins_one(trans, root, radix);
-		if (ret) {
-			fprintf(stderr, "fill failed\n");
-			goto out;
-		}
-		if (i % 1000 == 0) {
-			ret = btrfs_commit_transaction(trans, root, &super);
-			if (ret) {
-				fprintf(stderr, "fill commit failed\n");
-				return ret;
-			}
-		}
-		if (i && i % 10000 == 0) {
-			printf("bigfill %d\n", i);
-		}
-		if (!keep_running)
-			break;
-	}
-out:
-	return ret;
-}
-
-static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	int ret;
-	int nr = rand() % 5000;
-	static int run_nr = 0;
-
-	/* do the bulk op much less frequently */
-	if (run_nr++ % 100)
-		return 0;
-	ret = empty_tree(trans, root, radix, nr);
-	if (ret)
-		return ret;
-	ret = fill_tree(trans, root, radix, nr);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-
-int (*ops[])(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct
-	     radix_tree_root *radix) =
-	{ ins_one, insert_dup, del_one, lookup_item,
-	  lookup_enoent, bulk_op };
-
-void sigstopper(int ignored)
-{
-	keep_running = 0;
-	fprintf(stderr, "caught exit signal, stopping\n");
-}
-
-int print_usage(void)
-{
-	printf("usage: tester [-ih] [-c count] [-f count]\n");
-	printf("\t -c count -- iteration count after filling\n");
-	printf("\t -f count -- run this many random inserts before starting\n");
-	printf("\t -i       -- only do initial fill\n");
-	printf("\t -h       -- this help text\n");
-	exit(1);
-}
-int main(int ac, char **av)
-{
-	RADIX_TREE(radix, GFP_KERNEL);
-	struct btrfs_root *root;
-	int i;
-	int ret;
-	int count;
-	int op;
-	int iterations = 20000;
-	int init_fill_count = 800000;
-	int err = 0;
-	int initial_only = 0;
-	struct btrfs_trans_handle *trans;
-	radix_tree_init();
-
-	root = open_ctree("dbfile", &super);
-	trans = btrfs_start_transaction(root, 1);
-
-	signal(SIGTERM, sigstopper);
-	signal(SIGINT, sigstopper);
-
-	for (i = 1 ; i < ac ; i++) {
-		if (strcmp(av[i], "-i") == 0) {
-			initial_only = 1;
-		} else if (strcmp(av[i], "-c") == 0) {
-			iterations = atoi(av[i+1]);
-			i++;
-		} else if (strcmp(av[i], "-f") == 0) {
-			init_fill_count = atoi(av[i+1]);
-			i++;
-		} else {
-			print_usage();
-		}
-	}
-	printf("initial fill\n");
-	ret = fill_tree(trans, root, &radix, init_fill_count);
-	printf("starting run\n");
-	if (ret) {
-		err = ret;
-		goto out;
-	}
-	if (initial_only == 1) {
-		goto out;
-	}
-	for (i = 0; i < iterations; i++) {
-		op = rand() % ARRAY_SIZE(ops);
-		count = rand() % 128;
-		if (i % 2000 == 0) {
-			printf("%d\n", i);
-			fflush(stdout);
-		}
-		if (i && i % 5000 == 0) {
-			printf("open & close, root level %d nritems %d\n",
-				btrfs_header_level(&root->node->node.header),
-				btrfs_header_nritems(&root->node->node.header));
-			close_ctree(root, &super);
-			root = open_ctree("dbfile", &super);
-		}
-		while(count--) {
-			ret = ops[op](trans, root, &radix);
-			if (ret) {
-				fprintf(stderr, "op %d failed %d:%d\n",
-					op, i, iterations);
-				btrfs_print_tree(root, root->node);
-				fprintf(stderr, "op %d failed %d:%d\n",
-					op, i, iterations);
-				err = ret;
-				goto out;
-			}
-			if (ops[op] == bulk_op)
-				break;
-			if (keep_running == 0) {
-				err = 0;
-				goto out;
-			}
-		}
-	}
-out:
-	close_ctree(root, &super);
-	return err;
-}
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0322c55162c..05637f9fd7c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -266,20 +266,25 @@ static int find_and_setup_root(struct btrfs_super_block *super,
 }
 
 struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
+{
+	int fp;
+
+	fp = open(filename, O_CREAT | O_RDWR, 0600);
+	if (fp < 0) {
+		return NULL;
+	}
+	return open_ctree_fd(fp, super);
+}
+
+struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super)
 {
 	struct btrfs_root *root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root));
 	struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info));
-	int fp;
 	int ret;
 
-	fp = open(filename, O_CREAT | O_RDWR, 0600);
-	if (fp < 0) {
-		free(root);
-		return NULL;
-	}
 	INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 24a9e77c831..d888cf5c350 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -24,6 +24,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root
 			     *root, struct btrfs_super_block *s);
 struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
+struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super);
 int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
 void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9bc4ad38876..53a7550b5c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,7 +1,5 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
+#include <linux/radix-tree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -183,9 +181,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_init_path(&path);
 	ret = btrfs_search_slot(trans, extent_root, &key, &path, -1, 1);
 	if (ret) {
-		printf("failed to find %Lu\n", key.objectid);
+		printk("failed to find %Lu\n", key.objectid);
 		btrfs_print_tree(extent_root, extent_root->node);
-		printf("failed to find %Lu\n", key.objectid);
+		printk("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
 	ei = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 24cfd6d8524..8e1e5b4ccfa 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1,9 +1,5 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
-#include "disk-io.h"
 #include "transaction.h"
 
 int btrfs_create_file(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 92c37eece1f..6c2a71a46c7 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -10,7 +10,6 @@
  * License.
  */
 
-#include "kerncompat.h"
 #define DELTA 0x9E3779B9
 
 static void TEA_transform(__u32 buf[2], __u32 const in[])
diff --git a/fs/btrfs/hasher.c b/fs/btrfs/hasher.c
deleted file mode 100644
index 96702da4329..00000000000
--- a/fs/btrfs/hasher.c
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "kerncompat.h"
-#include "hash.h"
-
-int main() {
-	u64 result;
-	int ret;
-	char line[255];
-	char *p;
-	while(1) {
-		p = fgets(line, 255, stdin);
-		if (!p)
-			break;
-		if (strlen(line) == 0)
-			continue;
-		ret = btrfs_name_hash(line, strlen(line), &result);
-		BUG_ON(ret);
-		printf("hash returns %Lu\n", result);
-	}
-	return 0;
-}
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 7caeb11e875..8d8c26a6c1a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -1,7 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f412b339213..c7fda3bf7b2 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -1,7 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
deleted file mode 100644
index 105d3f58408..00000000000
--- a/fs/btrfs/kerncompat.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef __KERNCOMPAT
-#define __KERNCOMPAT
-#define gfp_t int
-#define get_cpu_var(p) (p)
-#define __get_cpu_var(p) (p)
-#define BITS_PER_LONG 64
-#define __GFP_BITS_SHIFT 20
-#define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
-#define GFP_KERNEL 0
-#define __read_mostly
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#define PAGE_SHIFT 12
-#define ULONG_MAX       (~0UL)
-#define BUG() abort()
-#ifdef __CHECKER__
-#define __force    __attribute__((force))
-#define __bitwise__ __attribute__((bitwise))
-#else
-#define __force
-#define __bitwise__
-#endif
-
-typedef unsigned int u32;
-typedef u32 __u32;
-typedef unsigned long long u64;
-typedef unsigned char u8;
-typedef unsigned short u16;
-
-typedef unsigned long pgoff_t;
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-struct vma_shared { int prio_tree_node; };
-struct vm_area_struct {
-	unsigned long vm_pgoff;
-	unsigned long vm_start;
-	unsigned long vm_end;
-	struct vma_shared shared;
-};
-
-struct page {
-	unsigned long index;
-};
-
-static inline void preempt_enable(void) { do {; } while(0);}
-static inline void preempt_disable(void) { do {; } while(0);}
-
-static inline void __set_bit(int bit, unsigned long *map) {
-	unsigned long *p = map + bit / BITS_PER_LONG;
-	bit = bit & (BITS_PER_LONG -1);
-	*p |= 1UL << bit;
-}
-
-static inline int test_bit(int bit, unsigned long *map) {
-	unsigned long *p = map + bit / BITS_PER_LONG;
-	bit = bit & (BITS_PER_LONG -1);
-	return *p & (1UL << bit) ? 1 : 0;
-}
-
-static inline void __clear_bit(int bit, unsigned long *map) {
-	unsigned long *p = map + bit / BITS_PER_LONG;
-	bit = bit & (BITS_PER_LONG -1);
-	*p &= ~(1UL << bit);
-}
-#define BUG_ON(c) do { if (c) abort(); } while (0)
-
-#define container_of(ptr, type, member) ({                      \
-        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
-	        (type *)( (char *)__mptr - __builtin_offsetof(type,member) );})
-
-#define ENOMEM 5
-#define EEXIST 6
-
-#define __CHECK_ENDIAN__
-#ifdef __CHECK_ENDIAN__
-#define __bitwise __bitwise__
-#else
-#define __bitwise
-#endif
-
-typedef u16 __bitwise __le16;
-typedef u16 __bitwise __be16;
-typedef u32 __bitwise __le32;
-typedef u32 __bitwise __be32;
-typedef u64 __bitwise __le64;
-typedef u64 __bitwise __be64;
-
-#define cpu_to_le64(x) ((__force __le64)(u64)(x))
-#define le64_to_cpu(x) ((__force u64)(__le64)(x))
-#define cpu_to_le32(x) ((__force __le32)(u32)(x))
-#define le32_to_cpu(x) ((__force u32)(__le32)(x))
-#define cpu_to_le16(x) ((__force __le16)(u16)(x))
-#define le16_to_cpu(x) ((__force u16)(__le16)(x))
-#endif
diff --git a/fs/btrfs/list.h b/fs/btrfs/list.h
deleted file mode 100644
index 1aafafb1337..00000000000
--- a/fs/btrfs/list.h
+++ /dev/null
@@ -1,418 +0,0 @@
-#ifndef _LINUX_LIST_H
-#define _LINUX_LIST_H
-
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
-/*
- * Simple doubly linked list implementation.
- *
- * Some of the internal functions ("__xxx") are useful when
- * manipulating whole lists rather than single entries, as
- * sometimes we already know the next/prev entries and we can
- * generate better code by using them directly rather than
- * using the generic single-entry routines.
- */
-
-struct list_head {
-	struct list_head *next, *prev;
-};
-
-#define LIST_HEAD_INIT(name) { &(name), &(name) }
-
-#define LIST_HEAD(name) \
-	struct list_head name = LIST_HEAD_INIT(name)
-
-static inline void INIT_LIST_HEAD(struct list_head *list)
-{
-	list->next = list;
-	list->prev = list;
-}
-
-/*
- * Insert a new entry between two known consecutive entries.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
- */
-#ifndef CONFIG_DEBUG_LIST
-static inline void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next)
-{
-	next->prev = new;
-	new->next = next;
-	new->prev = prev;
-	prev->next = new;
-}
-#else
-extern void __list_add(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next);
-#endif
-
-/**
- * list_add - add a new entry
- * @new: new entry to be added
- * @head: list head to add it after
- *
- * Insert a new entry after the specified head.
- * This is good for implementing stacks.
- */
-#ifndef CONFIG_DEBUG_LIST
-static inline void list_add(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head, head->next);
-}
-#else
-extern void list_add(struct list_head *new, struct list_head *head);
-#endif
-
-
-/**
- * list_add_tail - add a new entry
- * @new: new entry to be added
- * @head: list head to add it before
- *
- * Insert a new entry before the specified head.
- * This is useful for implementing queues.
- */
-static inline void list_add_tail(struct list_head *new, struct list_head *head)
-{
-	__list_add(new, head->prev, head);
-}
-
-/*
- * Delete a list entry by making the prev/next entries
- * point to each other.
- *
- * This is only for internal list manipulation where we know
- * the prev/next entries already!
- */
-static inline void __list_del(struct list_head * prev, struct list_head * next)
-{
-	next->prev = prev;
-	prev->next = next;
-}
-
-/**
- * list_del - deletes entry from list.
- * @entry: the element to delete from the list.
- * Note: list_empty on entry does not return true after this, the entry is
- * in an undefined state.
- */
-#ifndef CONFIG_DEBUG_LIST
-static inline void list_del(struct list_head *entry)
-{
-	__list_del(entry->prev, entry->next);
-	entry->next = LIST_POISON1;
-	entry->prev = LIST_POISON2;
-}
-#else
-extern void list_del(struct list_head *entry);
-#endif
-
-/**
- * list_replace - replace old entry by new one
- * @old : the element to be replaced
- * @new : the new element to insert
- * Note: if 'old' was empty, it will be overwritten.
- */
-static inline void list_replace(struct list_head *old,
-				struct list_head *new)
-{
-	new->next = old->next;
-	new->next->prev = new;
-	new->prev = old->prev;
-	new->prev->next = new;
-}
-
-static inline void list_replace_init(struct list_head *old,
-					struct list_head *new)
-{
-	list_replace(old, new);
-	INIT_LIST_HEAD(old);
-}
-/**
- * list_del_init - deletes entry from list and reinitialize it.
- * @entry: the element to delete from the list.
- */
-static inline void list_del_init(struct list_head *entry)
-{
-	__list_del(entry->prev, entry->next);
-	INIT_LIST_HEAD(entry);
-}
-
-/**
- * list_move - delete from one list and add as another's head
- * @list: the entry to move
- * @head: the head that will precede our entry
- */
-static inline void list_move(struct list_head *list, struct list_head *head)
-{
-        __list_del(list->prev, list->next);
-        list_add(list, head);
-}
-
-/**
- * list_move_tail - delete from one list and add as another's tail
- * @list: the entry to move
- * @head: the head that will follow our entry
- */
-static inline void list_move_tail(struct list_head *list,
-				  struct list_head *head)
-{
-        __list_del(list->prev, list->next);
-        list_add_tail(list, head);
-}
-
-/**
- * list_is_last - tests whether @list is the last entry in list @head
- * @list: the entry to test
- * @head: the head of the list
- */
-static inline int list_is_last(const struct list_head *list,
-				const struct list_head *head)
-{
-	return list->next == head;
-}
-
-/**
- * list_empty - tests whether a list is empty
- * @head: the list to test.
- */
-static inline int list_empty(const struct list_head *head)
-{
-	return head->next == head;
-}
-
-/**
- * list_empty_careful - tests whether a list is empty and not being modified
- * @head: the list to test
- *
- * Description:
- * tests whether a list is empty _and_ checks that no other CPU might be
- * in the process of modifying either member (next or prev)
- *
- * NOTE: using list_empty_careful() without synchronization
- * can only be safe if the only activity that can happen
- * to the list entry is list_del_init(). Eg. it cannot be used
- * if another CPU could re-list_add() it.
- */
-static inline int list_empty_careful(const struct list_head *head)
-{
-	struct list_head *next = head->next;
-	return (next == head) && (next == head->prev);
-}
-
-static inline void __list_splice(struct list_head *list,
-				 struct list_head *head)
-{
-	struct list_head *first = list->next;
-	struct list_head *last = list->prev;
-	struct list_head *at = head->next;
-
-	first->prev = head;
-	head->next = first;
-
-	last->next = at;
-	at->prev = last;
-}
-
-/**
- * list_splice - join two lists
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- */
-static inline void list_splice(struct list_head *list, struct list_head *head)
-{
-	if (!list_empty(list))
-		__list_splice(list, head);
-}
-
-/**
- * list_splice_init - join two lists and reinitialise the emptied list.
- * @list: the new list to add.
- * @head: the place to add it in the first list.
- *
- * The list at @list is reinitialised
- */
-static inline void list_splice_init(struct list_head *list,
-				    struct list_head *head)
-{
-	if (!list_empty(list)) {
-		__list_splice(list, head);
-		INIT_LIST_HEAD(list);
-	}
-}
-
-/**
- * list_entry - get the struct for this entry
- * @ptr:	the &struct list_head pointer.
- * @type:	the type of the struct this is embedded in.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_entry(ptr, type, member) \
-	container_of(ptr, type, member)
-
-/**
- * list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- */
-#define list_for_each(pos, head) \
-	for (pos = (head)->next; prefetch(pos->next), pos != (head); \
-        	pos = pos->next)
-
-/**
- * __list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- *
- * This variant differs from list_for_each() in that it's the
- * simplest possible list iteration code, no prefetching is done.
- * Use this for code that knows the list to be very short (empty
- * or 1 entry) most of the time.
- */
-#define __list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); pos = pos->next)
-
-/**
- * list_for_each_prev	-	iterate over a list backwards
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- */
-#define list_for_each_prev(pos, head) \
-	for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
-        	pos = pos->prev)
-
-/**
- * list_for_each_safe - iterate over a list safe against removal of list entry
- * @pos:	the &struct list_head to use as a loop cursor.
- * @n:		another &struct list_head to use as temporary storage
- * @head:	the head for your list.
- */
-#define list_for_each_safe(pos, n, head) \
-	for (pos = (head)->next, n = pos->next; pos != (head); \
-		pos = n, n = pos->next)
-
-/**
- * list_for_each_entry	-	iterate over list of given type
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry(pos, head, member)				\
-	for (pos = list_entry((head)->next, typeof(*pos), member);	\
-	     prefetch(pos->member.next), &pos->member != (head); 	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_reverse - iterate backwards over list of given type.
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry_reverse(pos, head, member)			\
-	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
-	     prefetch(pos->member.prev), &pos->member != (head); 	\
-	     pos = list_entry(pos->member.prev, typeof(*pos), member))
-
-/**
- * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
- * @pos:	the type * to use as a start point
- * @head:	the head of the list
- * @member:	the name of the list_struct within the struct.
- *
- * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
- */
-#define list_prepare_entry(pos, head, member) \
-	((pos) ? : list_entry(head, typeof(*pos), member))
-
-/**
- * list_for_each_entry_continue - continue iteration over list of given type
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Continue to iterate over list of given type, continuing after
- * the current position.
- */
-#define list_for_each_entry_continue(pos, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
-	     prefetch(pos->member.next), &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_from - iterate over list of given type from the current point
- * @pos:	the type * to use as a loop cursor.
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type, continuing from current position.
- */
-#define list_for_each_entry_from(pos, head, member) 			\
-	for (; prefetch(pos->member.next), &pos->member != (head);	\
-	     pos = list_entry(pos->member.next, typeof(*pos), member))
-
-/**
- * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- */
-#define list_for_each_entry_safe(pos, n, head, member)			\
-	for (pos = list_entry((head)->next, typeof(*pos), member),	\
-		n = list_entry(pos->member.next, typeof(*pos), member);	\
-	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_continue
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type, continuing after current point,
- * safe against removal of list entry.
- */
-#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
-	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
-		n = list_entry(pos->member.next, typeof(*pos), member);		\
-	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_from
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate over list of given type from current point, safe against
- * removal of list entry.
- */
-#define list_for_each_entry_safe_from(pos, n, head, member) 			\
-	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
-	     &pos->member != (head);						\
-	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-/**
- * list_for_each_entry_safe_reverse
- * @pos:	the type * to use as a loop cursor.
- * @n:		another type * to use as temporary storage
- * @head:	the head for your list.
- * @member:	the name of the list_struct within the struct.
- *
- * Iterate backwards over list of given type, safe against removal
- * of list entry.
- */
-#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
-	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
-		n = list_entry(pos->member.prev, typeof(*pos), member);	\
-	     &pos->member != (head); 					\
-	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
-
-#endif
diff --git a/fs/btrfs/mkfs.c b/fs/btrfs/mkfs.c
deleted file mode 100644
index f7efc8a5fb1..00000000000
--- a/fs/btrfs/mkfs.c
+++ /dev/null
@@ -1,255 +0,0 @@
-#define _XOPEN_SOURCE 500
-#ifndef __CHECKER__
-#include <sys/ioctl.h>
-#include <sys/mount.h>
-#endif
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
-#include "ctree.h"
-#include "disk-io.h"
-
-#ifdef __CHECKER__
-#define BLKGETSIZE64 0
-static inline int ioctl(int fd, int define, u64 *size) { return 0; }
-#endif
-
-#if 0
-#if defined(__linux__) && defined(_IOR) && !defined(BLKGETSIZE64)
-#   define BLKGETSIZE64 _IOR(0x12, 114, __u64)
-#endif
-#endif
-
-int mkfs(int fd, u64 num_blocks, u32 blocksize)
-{
-	struct btrfs_super_block super;
-	struct btrfs_leaf *empty_leaf;
-	struct btrfs_root_item root_item;
-	struct btrfs_item item;
-	struct btrfs_extent_item extent_item;
-	char *block;
-	int ret;
-	u32 itemoff;
-	u32 start_block = BTRFS_SUPER_INFO_OFFSET / blocksize;
-
-	btrfs_set_super_blocknr(&super, start_block);
-	btrfs_set_super_root(&super, start_block + 1);
-	strcpy((char *)(&super.magic), BTRFS_MAGIC);
-	btrfs_set_super_blocksize(&super, blocksize);
-	btrfs_set_super_total_blocks(&super, num_blocks);
-	btrfs_set_super_blocks_used(&super, start_block + 5);
-
-	block = malloc(blocksize);
-	memset(block, 0, blocksize);
-	BUG_ON(sizeof(super) > blocksize);
-	memcpy(block, &super, sizeof(super));
-	ret = pwrite(fd, block, blocksize, BTRFS_SUPER_INFO_OFFSET);
-	BUG_ON(ret != blocksize);
-
-	/* create the tree of root objects */
-	empty_leaf = malloc(blocksize);
-	memset(empty_leaf, 0, blocksize);
-	btrfs_set_header_parentid(&empty_leaf->header,
-				  BTRFS_ROOT_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 1);
-	btrfs_set_header_nritems(&empty_leaf->header, 3);
-
-	/* create the items for the root tree */
-	btrfs_set_root_blocknr(&root_item, start_block + 2);
-	btrfs_set_root_refs(&root_item, 1);
-	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) - sizeof(root_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_item_size(&item, sizeof(root_item));
-	btrfs_set_disk_key_objectid(&item.key, BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_disk_key_offset(&item.key, 0);
-	btrfs_set_disk_key_flags(&item.key, 0);
-	btrfs_set_disk_key_type(&item.key, BTRFS_ROOT_ITEM_KEY);
-	memcpy(empty_leaf->items, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
-		&root_item, sizeof(root_item));
-
-	btrfs_set_root_blocknr(&root_item, start_block + 3);
-	itemoff = itemoff - sizeof(root_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_disk_key_objectid(&item.key, BTRFS_INODE_MAP_OBJECTID);
-	memcpy(empty_leaf->items + 1, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
-		&root_item, sizeof(root_item));
-
-	btrfs_set_root_blocknr(&root_item, start_block + 4);
-	itemoff = itemoff - sizeof(root_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_disk_key_objectid(&item.key, BTRFS_FS_TREE_OBJECTID);
-	memcpy(empty_leaf->items + 2, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + itemoff,
-		&root_item, sizeof(root_item));
-	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 1) * blocksize);
-
-	/* create the items for the extent tree */
-	btrfs_set_header_parentid(&empty_leaf->header,
-				  BTRFS_EXTENT_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 2);
-	btrfs_set_header_nritems(&empty_leaf->header, 5);
-
-	/* item1, reserve blocks 0-16 */
-	btrfs_set_disk_key_objectid(&item.key, 0);
-	btrfs_set_disk_key_offset(&item.key, start_block + 1);
-	btrfs_set_disk_key_flags(&item.key, 0);
-	btrfs_set_disk_key_type(&item.key, BTRFS_EXTENT_ITEM_KEY);
-	itemoff = __BTRFS_LEAF_DATA_SIZE(blocksize) -
-			sizeof(struct btrfs_extent_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_item_size(&item, sizeof(struct btrfs_extent_item));
-	btrfs_set_extent_refs(&extent_item, 1);
-	btrfs_set_extent_owner(&extent_item, 0);
-	memcpy(empty_leaf->items, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
-		&extent_item, btrfs_item_size(&item));
-
-	/* item2, give block 17 to the root */
-	btrfs_set_disk_key_objectid(&item.key, start_block + 1);
-	btrfs_set_disk_key_offset(&item.key, 1);
-	itemoff = itemoff - sizeof(struct btrfs_extent_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_extent_owner(&extent_item, BTRFS_ROOT_TREE_OBJECTID);
-	memcpy(empty_leaf->items + 1, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
-		&extent_item, btrfs_item_size(&item));
-
-	/* item3, give block 18 to the extent root */
-	btrfs_set_disk_key_objectid(&item.key, start_block + 2);
-	btrfs_set_disk_key_offset(&item.key, 1);
-	itemoff = itemoff - sizeof(struct btrfs_extent_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_extent_owner(&extent_item, BTRFS_EXTENT_TREE_OBJECTID);
-	memcpy(empty_leaf->items + 2, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
-		&extent_item, btrfs_item_size(&item));
-
-	/* item4, give block 19 to the inode map */
-	btrfs_set_disk_key_objectid(&item.key, start_block + 3);
-	btrfs_set_disk_key_offset(&item.key, 1);
-	itemoff = itemoff - sizeof(struct btrfs_extent_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_extent_owner(&extent_item, BTRFS_INODE_MAP_OBJECTID);
-	memcpy(empty_leaf->items + 3, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
-		&extent_item, btrfs_item_size(&item));
-	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize);
-	if (ret != blocksize)
-		return -1;
-
-	/* item5, give block 20 to the FS root */
-	btrfs_set_disk_key_objectid(&item.key, start_block + 4);
-	btrfs_set_disk_key_offset(&item.key, 1);
-	itemoff = itemoff - sizeof(struct btrfs_extent_item);
-	btrfs_set_item_offset(&item, itemoff);
-	btrfs_set_extent_owner(&extent_item, BTRFS_FS_TREE_OBJECTID);
-	memcpy(empty_leaf->items + 4, &item, sizeof(item));
-	memcpy(btrfs_leaf_data(empty_leaf) + btrfs_item_offset(&item),
-		&extent_item, btrfs_item_size(&item));
-	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 2) * blocksize);
-	if (ret != blocksize)
-		return -1;
-
-	/* create the inode map */
-	btrfs_set_header_parentid(&empty_leaf->header,
-				  BTRFS_INODE_MAP_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 3);
-	btrfs_set_header_nritems(&empty_leaf->header, 0);
-	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 3) * blocksize);
-	if (ret != blocksize)
-		return -1;
-
-	/* finally create the FS root */
-	btrfs_set_header_parentid(&empty_leaf->header, BTRFS_FS_TREE_OBJECTID);
-	btrfs_set_header_blocknr(&empty_leaf->header, start_block + 4);
-	btrfs_set_header_nritems(&empty_leaf->header, 0);
-	ret = pwrite(fd, empty_leaf, blocksize, (start_block + 4) * blocksize);
-	if (ret != blocksize)
-		return -1;
-	return 0;
-}
-
-u64 device_size(int fd, struct stat *st)
-{
-	u64 size;
-	if (S_ISREG(st->st_mode)) {
-		return st->st_size;
-	}
-	if (!S_ISBLK(st->st_mode)) {
-		return 0;
-	}
-	if (ioctl(fd, BLKGETSIZE64, &size) >= 0) {
-		return size;
-	}
-	return 0;
-}
-
-int main(int ac, char **av)
-{
-	char *file;
-	u64 block_count = 0;
-	int fd;
-	struct stat st;
-	int ret;
-	int i;
-	char *buf = malloc(4096);
-	if (ac >= 2) {
-		file = av[1];
-		if (ac == 3) {
-			block_count = atoi(av[2]);
-			if (!block_count) {
-				fprintf(stderr, "error finding block count\n");
-				exit(1);
-			}
-		}
-	} else {
-		fprintf(stderr, "usage: mkfs.btrfs file [block count]\n");
-		exit(1);
-	}
-	fd = open(file, O_RDWR);
-	if (fd < 0) {
-		fprintf(stderr, "unable to open %s\n", file);
-		exit(1);
-	}
-	ret = fstat(fd, &st);
-	if (ret < 0) {
-		fprintf(stderr, "unable to stat %s\n", file);
-		exit(1);
-	}
-	if (block_count == 0) {
-		block_count = device_size(fd, &st);
-		if (block_count == 0) {
-			fprintf(stderr, "unable to find %s size\n", file);
-			exit(1);
-		}
-	}
-	block_count /= 4096;
-	if (block_count < 256) {
-		fprintf(stderr, "device %s is too small\n", file);
-		exit(1);
-	}
-	memset(buf, 0, 4096);
-	for(i = 0; i < 6; i++) {
-		ret = write(fd, buf, 4096);
-		if (ret != 4096) {
-			fprintf(stderr, "unable to zero fill device\n");
-			exit(1);
-		}
-	}
-	ret = mkfs(fd, block_count, 4096);
-	if (ret) {
-		fprintf(stderr, "error during mkfs %d\n", ret);
-		exit(1);
-	}
-	printf("fs created on %s blocksize %d blocks %Lu\n",
-	       file, 4096, block_count);
-	return 0;
-}
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f53b99da12f..aa2d3fac880 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -1,7 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 
@@ -17,14 +14,14 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_inode_item *ii;
 	u32 type;
 
-	printf("leaf %Lu total ptrs %d free space %d\n",
+	printk("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr,
 		btrfs_leaf_free_space(root, l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
 		type = btrfs_disk_key_type(&item->key);
-		printf("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
+		printk("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
 			i,
 			btrfs_disk_key_objectid(&item->key),
 			btrfs_disk_key_flags(&item->key),
@@ -34,38 +31,39 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-			printf("\t\tinode generation %Lu size %Lu\n",
+			printk("\t\tinode generation %Lu size %Lu mode %o\n",
 			       btrfs_inode_generation(ii),
-			       btrfs_inode_size(ii));
+			       btrfs_inode_size(ii),
+			       btrfs_inode_mode(ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
-			printf("\t\tdir oid %Lu flags %u type %u\n",
+			printk("\t\tdir oid %Lu flags %u type %u\n",
 				btrfs_dir_objectid(di),
 				btrfs_dir_flags(di),
 				btrfs_dir_type(di));
-			printf("\t\tname %.*s\n",
+			printk("\t\tname %.*s\n",
 			       btrfs_dir_name_len(di),(char *)(di + 1));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printf("\t\troot data blocknr %Lu refs %u\n",
+			printk("\t\troot data blocknr %Lu refs %u\n",
 				btrfs_root_blocknr(ri), btrfs_root_refs(ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printf("\t\textent data refs %u owner %Lu\n",
+			printk("\t\textent data refs %u owner %Lu\n",
 				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
 			break;
 		case BTRFS_INODE_MAP_ITEM_KEY:
 			mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item);
-			printf("\t\tinode map key %Lu %u %Lu\n",
+			printk("\t\tinode map key %Lu %u %Lu\n",
 			       btrfs_disk_key_objectid(&mi->key),
 			       btrfs_disk_key_flags(&mi->key),
 			       btrfs_disk_key_offset(&mi->key));
 			break;
 		case BTRFS_STRING_ITEM_KEY:
-			printf("\t\titem data %.*s\n", btrfs_item_size(item),
+			printk("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
 			break;
 		};
@@ -86,12 +84,12 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 		btrfs_print_leaf(root, (struct btrfs_leaf *)c);
 		return;
 	}
-	printf("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
+	printk("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
 	        btrfs_header_level(&c->header), nr,
 		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%Lu %u %Lu) block %Lu\n",
+		printk("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
 		       c->ptrs[i].key.objectid,
 		       c->ptrs[i].key.flags,
diff --git a/fs/btrfs/quick-test.c b/fs/btrfs/quick-test.c
deleted file mode 100644
index ef12cf88878..00000000000
--- a/fs/btrfs/quick-test.c
+++ /dev/null
@@ -1,179 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-
-/* for testing only */
-int next_key(int i, int max_key) {
-	return rand() % max_key;
-	// return i;
-}
-
-int main(int ac, char **av) {
-	struct btrfs_key ins;
-	struct btrfs_key last = { (u64)-1, 0, 0};
-	char *buf;
-	int i;
-	int num;
-	int ret;
-	int run_size = 100000;
-	int max_key =  100000000;
-	int tree_size = 0;
-	struct btrfs_path path;
-	struct btrfs_super_block super;
-	struct btrfs_root *root;
-	struct btrfs_trans_handle *trans;
-
-	radix_tree_init();
-
-	root = open_ctree("dbfile", &super);
-	trans = btrfs_start_transaction(root, 1);
-	srand(55);
-	ins.flags = 0;
-	btrfs_set_key_type(&ins, BTRFS_STRING_ITEM_KEY);
-	for (i = 0; i < run_size; i++) {
-		buf = malloc(64);
-		num = next_key(i, max_key);
-		// num = i;
-		sprintf(buf, "string-%d", num);
-		if (i % 10000 == 0)
-			fprintf(stderr, "insert %d:%d\n", num, i);
-		ins.objectid = num;
-		ins.offset = 0;
-		ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf));
-		if (!ret)
-			tree_size++;
-		free(buf);
-		if (i == run_size - 5) {
-			btrfs_commit_transaction(trans, root, &super);
-		}
-
-	}
-	close_ctree(root, &super);
-
-	root = open_ctree("dbfile", &super);
-	printf("starting search\n");
-	srand(55);
-	for (i = 0; i < run_size; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		btrfs_init_path(&path);
-		if (i % 10000 == 0)
-			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0);
-		if (ret) {
-			btrfs_print_tree(root, root->node);
-			printf("unable to find %d\n", num);
-			exit(1);
-		}
-		btrfs_release_path(root, &path);
-	}
-	close_ctree(root, &super);
-	root = open_ctree("dbfile", &super);
-	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
-	        btrfs_header_level(&root->node->node.header),
-		btrfs_header_nritems(&root->node->node.header),
-		BTRFS_NODEPTRS_PER_BLOCK(root) -
-		btrfs_header_nritems(&root->node->node.header));
-	printf("all searches good, deleting some items\n");
-	i = 0;
-	srand(55);
-	for (i = 0 ; i < run_size/4; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1);
-		if (!ret) {
-			if (i % 10000 == 0)
-				fprintf(stderr, "del %d:%d\n", num, i);
-			ret = btrfs_del_item(trans, root, &path);
-			if (ret != 0)
-				BUG();
-			tree_size--;
-		}
-		btrfs_release_path(root, &path);
-	}
-	close_ctree(root, &super);
-	root = open_ctree("dbfile", &super);
-	srand(128);
-	for (i = 0; i < run_size; i++) {
-		buf = malloc(64);
-		num = next_key(i, max_key);
-		sprintf(buf, "string-%d", num);
-		ins.objectid = num;
-		if (i % 10000 == 0)
-			fprintf(stderr, "insert %d:%d\n", num, i);
-		ret = btrfs_insert_item(trans, root, &ins, buf, strlen(buf));
-		if (!ret)
-			tree_size++;
-		free(buf);
-	}
-	close_ctree(root, &super);
-	root = open_ctree("dbfile", &super);
-	srand(128);
-	printf("starting search2\n");
-	for (i = 0; i < run_size; i++) {
-		num = next_key(i, max_key);
-		ins.objectid = num;
-		btrfs_init_path(&path);
-		if (i % 10000 == 0)
-			fprintf(stderr, "search %d:%d\n", num, i);
-		ret = btrfs_search_slot(trans, root, &ins, &path, 0, 0);
-		if (ret) {
-			btrfs_print_tree(root, root->node);
-			printf("unable to find %d\n", num);
-			exit(1);
-		}
-		btrfs_release_path(root, &path);
-	}
-	printf("starting big long delete run\n");
-	while(root->node &&
-	      btrfs_header_nritems(&root->node->node.header) > 0) {
-		struct btrfs_leaf *leaf;
-		int slot;
-		ins.objectid = (u64)-1;
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(trans, root, &ins, &path, -1, 1);
-		if (ret == 0)
-			BUG();
-
-		leaf = &path.nodes[0]->leaf;
-		slot = path.slots[0];
-		if (slot != btrfs_header_nritems(&leaf->header))
-			BUG();
-		while(path.slots[0] > 0) {
-			path.slots[0] -= 1;
-			slot = path.slots[0];
-			leaf = &path.nodes[0]->leaf;
-
-			btrfs_disk_key_to_cpu(&last, &leaf->items[slot].key);
-			if (tree_size % 10000 == 0)
-				printf("big del %d:%d\n", tree_size, i);
-			ret = btrfs_del_item(trans, root, &path);
-			if (ret != 0) {
-				printf("del_item returned %d\n", ret);
-				BUG();
-			}
-			tree_size--;
-		}
-		btrfs_release_path(root, &path);
-	}
-	/*
-	printf("previous tree:\n");
-	btrfs_print_tree(root, root->commit_root);
-	printf("map before commit\n");
-	btrfs_print_tree(root->extent_root, root->extent_root->node);
-	*/
-	btrfs_commit_transaction(trans, root, &super);
-	printf("tree size is now %d\n", tree_size);
-	printf("root %p commit root %p\n", root->node, root->commit_root);
-	printf("map tree\n");
-	btrfs_print_tree(root->fs_info->extent_root,
-			 root->fs_info->extent_root->node);
-	close_ctree(root, &super);
-	return 0;
-}
diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c
deleted file mode 100644
index baa25ca1c2a..00000000000
--- a/fs/btrfs/radix-tree.c
+++ /dev/null
@@ -1,836 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "kerncompat.h"
-#include "radix-tree.h"
-#ifdef __KERNEL__
-#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
-#else
-#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
-#endif
-
-#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
-
-#define RADIX_TREE_TAG_LONGS	\
-	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
-
-struct radix_tree_node {
-	unsigned int	count;
-	void		*slots[RADIX_TREE_MAP_SIZE];
-	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
-};
-
-struct radix_tree_path {
-	struct radix_tree_node *node;
-	int offset;
-};
-
-#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
-#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
-
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
-
-/*
- * Per-cpu pool of preloaded nodes
- */
-struct radix_tree_preload {
-	int nr;
-	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
-};
-struct radix_tree_preload radix_tree_preloads = { 0, };
-
-static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
-{
-	return root->gfp_mask & __GFP_BITS_MASK;
-}
-
-static int internal_nodes = 0;
-/*
- * This assumes that the caller has performed appropriate preallocation, and
- * that the caller has pinned this thread of control to the current CPU.
- */
-static struct radix_tree_node *
-radix_tree_node_alloc(struct radix_tree_root *root)
-{
-	struct radix_tree_node *ret;
-	ret = malloc(sizeof(struct radix_tree_node));
-	if (ret) {
-		memset(ret, 0, sizeof(struct radix_tree_node));
-		internal_nodes++;
-	}
-	return ret;
-}
-
-static inline void
-radix_tree_node_free(struct radix_tree_node *node)
-{
-	internal_nodes--;
-	free(node);
-}
-
-/*
- * Load up this CPU's radix_tree_node buffer with sufficient objects to
- * ensure that the addition of a single element in the tree cannot fail.  On
- * success, return zero, with preemption disabled.  On error, return -ENOMEM
- * with preemption not disabled.
- */
-int radix_tree_preload(gfp_t gfp_mask)
-{
-	struct radix_tree_preload *rtp;
-	struct radix_tree_node *node;
-	int ret = -ENOMEM;
-
-	preempt_disable();
-	rtp = &__get_cpu_var(radix_tree_preloads);
-	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
-		preempt_enable();
-		node = radix_tree_node_alloc(NULL);
-		if (node == NULL)
-			goto out;
-		preempt_disable();
-		rtp = &__get_cpu_var(radix_tree_preloads);
-		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
-			rtp->nodes[rtp->nr++] = node;
-		else
-			radix_tree_node_free(node);
-	}
-	ret = 0;
-out:
-	return ret;
-}
-
-static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__set_bit(offset, node->tags[tag]);
-}
-
-static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	__clear_bit(offset, node->tags[tag]);
-}
-
-static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
-		int offset)
-{
-	return test_bit(offset, node->tags[tag]);
-}
-
-static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
-{
-	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
-}
-
-
-static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
-{
-	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
-}
-
-static inline void root_tag_clear_all(struct radix_tree_root *root)
-{
-	root->gfp_mask &= __GFP_BITS_MASK;
-}
-
-static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
-{
-	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
-}
-
-/*
- * Returns 1 if any slot in the node has this tag set.
- * Otherwise returns 0.
- */
-static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
-{
-	int idx;
-	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
-		if (node->tags[tag][idx])
-			return 1;
-	}
-	return 0;
-}
-
-/*
- *	Return the maximum key which can be store into a
- *	radix tree with height HEIGHT.
- */
-static inline unsigned long radix_tree_maxindex(unsigned int height)
-{
-	return height_to_maxindex[height];
-}
-
-/*
- *	Extend a radix tree so it can store key @index.
- */
-static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_node *node;
-	unsigned int height;
-	int tag;
-
-	/* Figure out what the height should be.  */
-	height = root->height + 1;
-	while (index > radix_tree_maxindex(height))
-		height++;
-
-	if (root->rnode == NULL) {
-		root->height = height;
-		goto out;
-	}
-
-	do {
-		if (!(node = radix_tree_node_alloc(root)))
-			return -ENOMEM;
-
-		/* Increase the height.  */
-		node->slots[0] = root->rnode;
-
-		/* Propagate the aggregated tag info into the new root */
-		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-			if (root_tag_get(root, tag))
-				tag_set(node, tag, 0);
-		}
-
-		node->count = 1;
-		root->rnode = node;
-		root->height++;
-	} while (height > root->height);
-out:
-	return 0;
-}
-
-/**
- *	radix_tree_insert    -    insert into a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *	@item:		item to insert
- *
- *	Insert an item into the radix tree at position @index.
- */
-int radix_tree_insert(struct radix_tree_root *root,
-			unsigned long index, void *item)
-{
-	struct radix_tree_node *node = NULL, *slot;
-	unsigned int height, shift;
-	int offset;
-	int error;
-
-	/* Make sure the tree is high enough.  */
-	if (index > radix_tree_maxindex(root->height)) {
-		error = radix_tree_extend(root, index);
-		if (error)
-			return error;
-	}
-
-	slot = root->rnode;
-	height = root->height;
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-
-	offset = 0;			/* uninitialised var warning */
-	while (height > 0) {
-		if (slot == NULL) {
-			/* Have to add a child node.  */
-			if (!(slot = radix_tree_node_alloc(root)))
-				return -ENOMEM;
-			if (node) {
-				node->slots[offset] = slot;
-				node->count++;
-			} else
-				root->rnode = slot;
-		}
-
-		/* Go a level down */
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		node = slot;
-		slot = node->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	if (slot != NULL)
-		return -EEXIST;
-
-	if (node) {
-		node->count++;
-		node->slots[offset] = item;
-		BUG_ON(tag_get(node, 0, offset));
-		BUG_ON(tag_get(node, 1, offset));
-	} else {
-		root->rnode = item;
-		BUG_ON(root_tag_get(root, 0));
-		BUG_ON(root_tag_get(root, 1));
-	}
-
-	return 0;
-}
-
-static inline void **__lookup_slot(struct radix_tree_root *root,
-				   unsigned long index)
-{
-	unsigned int height, shift;
-	struct radix_tree_node **slot;
-
-	height = root->height;
-
-	if (index > radix_tree_maxindex(height))
-		return NULL;
-
-	if (height == 0 && root->rnode)
-		return (void **)&root->rnode;
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = &root->rnode;
-
-	while (height > 0) {
-		if (*slot == NULL)
-			return NULL;
-
-		slot = (struct radix_tree_node **)
-			((*slot)->slots +
-				((index >> shift) & RADIX_TREE_MAP_MASK));
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	return (void **)slot;
-}
-
-/**
- *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the slot corresponding to the position @index in the radix tree
- *	@root. This is useful for update-if-exists operations.
- */
-void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
-{
-	return __lookup_slot(root, index);
-}
-
-/**
- *	radix_tree_lookup    -    perform lookup operation on a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Lookup the item at the position @index in the radix tree @root.
- */
-void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
-{
-	void **slot;
-
-	slot = __lookup_slot(root, index);
-	return slot != NULL ? *slot : NULL;
-}
-
-/**
- *	radix_tree_tag_set - set a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  From
- *	the root all the way down to the leaf node.
- *
- *	Returns the address of the tagged item.   Setting a tag on a not-present
- *	item is a bug.
- */
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-
-	height = root->height;
-	BUG_ON(index > radix_tree_maxindex(height));
-
-	slot = root->rnode;
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-
-	while (height > 0) {
-		int offset;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
-			tag_set(slot, tag, offset);
-		slot = slot->slots[offset];
-		BUG_ON(slot == NULL);
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	/* set the root's tag bit */
-	if (slot && !root_tag_get(root, tag))
-		root_tag_set(root, tag);
-
-	return slot;
-}
-
-/**
- *	radix_tree_tag_clear - clear a tag on a radix tree node
- *	@root:		radix tree root
- *	@index:		index key
- *	@tag: 		tag index
- *
- *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
- *	corresponding to @index in the radix tree.  If
- *	this causes the leaf node to have no tags set then clear the tag in the
- *	next-to-leaf node, etc.
- *
- *	Returns the address of the tagged item on success, else NULL.  ie:
- *	has the same return value and semantics as radix_tree_lookup().
- */
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot = NULL;
-	unsigned int height, shift;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-	slot = root->rnode;
-
-	while (height > 0) {
-		int offset;
-
-		if (slot == NULL)
-			goto out;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
-		slot = slot->slots[offset];
-		pathp++;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-
-	if (slot == NULL)
-		goto out;
-
-	while (pathp->node) {
-		if (!tag_get(pathp->node, tag, pathp->offset))
-			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
-			goto out;
-		pathp--;
-	}
-
-	/* clear the root's tag bit */
-	if (root_tag_get(root, tag))
-		root_tag_clear(root, tag);
-
-out:
-	return slot;
-}
-
-#ifndef __KERNEL__	/* Only the test harness uses this at present */
-/**
- * radix_tree_tag_get - get a tag on a radix tree node
- * @root:		radix tree root
- * @index:		index key
- * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
- *
- * Return values:
- *
- *  0: tag not present or not set
- *  1: tag set
- */
-int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag)
-{
-	unsigned int height, shift;
-	struct radix_tree_node *slot;
-	int saw_unset_tag = 0;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		return 0;
-
-	/* check the root's tag bit */
-	if (!root_tag_get(root, tag))
-		return 0;
-
-	if (height == 0)
-		return 1;
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for ( ; ; ) {
-		int offset;
-
-		if (slot == NULL)
-			return 0;
-
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-
-		/*
-		 * This is just a debug check.  Later, we can bale as soon as
-		 * we see an unset tag.
-		 */
-		if (!tag_get(slot, tag, offset))
-			saw_unset_tag = 1;
-		if (height == 1) {
-			int ret = tag_get(slot, tag, offset);
-
-			BUG_ON(ret && saw_unset_tag);
-			return !!ret;
-		}
-		slot = slot->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	}
-}
-#endif
-
-static unsigned int
-__lookup(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift, height;
-	struct radix_tree_node *slot;
-	unsigned long i;
-
-	height = root->height;
-	if (height == 0) {
-		if (root->rnode && index == 0)
-			results[nr_found++] = root->rnode;
-		goto out;
-	}
-
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	for ( ; height > 1; height--) {
-
-		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
-				i < RADIX_TREE_MAP_SIZE; i++) {
-			if (slot->slots[i] != NULL)
-				break;
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	}
-
-	/* Bottom level: grab some items */
-	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
-		index++;
-		if (slot->slots[i]) {
-			results[nr_found++] = slot->slots[i];
-			if (nr_found == max_items)
-				goto out;
-		}
-	}
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *
- *	Performs an index-ascending scan of the tree for present items.  Places
- *	them at *@results and returns the number of items which were placed at
- *	*@results.
- *
- *	The implementation is naive.
- */
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup(root, results + ret, cur_index,
-					max_items - ret, &next_index);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-/*
- * FIXME: the two tag_get()s here should use find_next_bit() instead of
- * open-coding the search.
- */
-static unsigned int
-__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
-	unsigned int max_items, unsigned long *next_index, unsigned int tag)
-{
-	unsigned int nr_found = 0;
-	unsigned int shift;
-	unsigned int height = root->height;
-	struct radix_tree_node *slot;
-
-	if (height == 0) {
-		if (root->rnode && index == 0)
-			results[nr_found++] = root->rnode;
-		goto out;
-	}
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	slot = root->rnode;
-
-	do {
-		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
-
-		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
-			if (tag_get(slot, tag, i)) {
-				BUG_ON(slot->slots[i] == NULL);
-				break;
-			}
-			index &= ~((1UL << shift) - 1);
-			index += 1UL << shift;
-			if (index == 0)
-				goto out;	/* 32-bit wraparound */
-		}
-		if (i == RADIX_TREE_MAP_SIZE)
-			goto out;
-		height--;
-		if (height == 0) {	/* Bottom level: grab some items */
-			unsigned long j = index & RADIX_TREE_MAP_MASK;
-
-			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
-				index++;
-				if (tag_get(slot, tag, j)) {
-					BUG_ON(slot->slots[j] == NULL);
-					results[nr_found++] = slot->slots[j];
-					if (nr_found == max_items)
-						goto out;
-				}
-			}
-		}
-		shift -= RADIX_TREE_MAP_SHIFT;
-		slot = slot->slots[i];
-	} while (height > 0);
-out:
-	*next_index = index;
-	return nr_found;
-}
-
-/**
- *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
- *	                             based on a tag
- *	@root:		radix tree root
- *	@results:	where the results of the lookup are placed
- *	@first_index:	start the lookup from this key
- *	@max_items:	place up to this many items at *results
- *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
- *
- *	Performs an index-ascending scan of the tree for present items which
- *	have the tag indexed by @tag set.  Places the items at *@results and
- *	returns the number of items which were placed at *@results.
- */
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag)
-{
-	const unsigned long max_index = radix_tree_maxindex(root->height);
-	unsigned long cur_index = first_index;
-	unsigned int ret = 0;
-
-	/* check the root's tag bit */
-	if (!root_tag_get(root, tag))
-		return 0;
-
-	while (ret < max_items) {
-		unsigned int nr_found;
-		unsigned long next_index;	/* Index of next search */
-
-		if (cur_index > max_index)
-			break;
-		nr_found = __lookup_tag(root, results + ret, cur_index,
-					max_items - ret, &next_index, tag);
-		ret += nr_found;
-		if (next_index == 0)
-			break;
-		cur_index = next_index;
-	}
-	return ret;
-}
-
-/**
- *	radix_tree_shrink    -    shrink height of a radix tree to minimal
- *	@root		radix tree root
- */
-static inline void radix_tree_shrink(struct radix_tree_root *root)
-{
-	/* try to shrink tree height */
-	while (root->height > 0 &&
-			root->rnode->count == 1 &&
-			root->rnode->slots[0]) {
-		struct radix_tree_node *to_free = root->rnode;
-
-		root->rnode = to_free->slots[0];
-		root->height--;
-		/* must only free zeroed nodes into the slab */
-		tag_clear(to_free, 0, 0);
-		tag_clear(to_free, 1, 0);
-		to_free->slots[0] = NULL;
-		to_free->count = 0;
-		radix_tree_node_free(to_free);
-	}
-}
-
-/**
- *	radix_tree_delete    -    delete an item from a radix tree
- *	@root:		radix tree root
- *	@index:		index key
- *
- *	Remove the item at @index from the radix tree rooted at @root.
- *
- *	Returns the address of the deleted item, or NULL if it was not present.
- */
-void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
-{
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
-	struct radix_tree_node *slot = NULL;
-	unsigned int height, shift;
-	int tag;
-	int offset;
-
-	height = root->height;
-	if (index > radix_tree_maxindex(height))
-		goto out;
-
-	slot = root->rnode;
-	if (height == 0 && root->rnode) {
-		root_tag_clear_all(root);
-		root->rnode = NULL;
-		goto out;
-	}
-
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-
-	do {
-		if (slot == NULL)
-			goto out;
-
-		pathp++;
-		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->node = slot;
-		slot = slot->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
-
-	if (slot == NULL)
-		goto out;
-
-	/*
-	 * Clear all tags associated with the just-deleted item
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(pathp->node, tag, pathp->offset))
-			radix_tree_tag_clear(root, index, tag);
-	}
-
-	/* Now free the nodes we do not need anymore */
-	while (pathp->node) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
-
-		if (pathp->node->count) {
-			if (pathp->node == root->rnode)
-				radix_tree_shrink(root);
-			goto out;
-		}
-
-		/* Node with zero slots in use so free it */
-		radix_tree_node_free(pathp->node);
-
-		pathp--;
-	}
-	root_tag_clear_all(root);
-	root->height = 0;
-	root->rnode = NULL;
-
-out:
-	return slot;
-}
-
-/**
- *	radix_tree_tagged - test whether any items in the tree are tagged
- *	@root:		radix tree root
- *	@tag:		tag to test
- */
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
-{
-	return root_tag_get(root, tag);
-}
-
-static unsigned long __maxindex(unsigned int height)
-{
-	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
-	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
-
-	if (tmp >= RADIX_TREE_INDEX_BITS)
-		index = ~0UL;
-	return index;
-}
-
-static void radix_tree_init_maxindex(void)
-{
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
-		height_to_maxindex[i] = __maxindex(i);
-}
-
-void radix_tree_init(void)
-{
-	radix_tree_init_maxindex();
-}
diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h
deleted file mode 100644
index c3ce88137f7..00000000000
--- a/fs/btrfs/radix-tree.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (C) 2001 Momchil Velikov
- * Portions Copyright (C) 2001 Christoph Hellwig
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2, or (at
- * your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- * 
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _LINUX_RADIX_TREE_H
-#define _LINUX_RADIX_TREE_H
-
-#define RADIX_TREE_MAX_TAGS 2
-
-/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
-struct radix_tree_root {
-	unsigned int		height;
-	gfp_t			gfp_mask;
-	struct radix_tree_node	*rnode;
-};
-
-#define RADIX_TREE_INIT(mask)	{					\
-	.height = 0,							\
-	.gfp_mask = (mask),						\
-	.rnode = NULL,							\
-}
-
-#define RADIX_TREE(name, mask) \
-	struct radix_tree_root name = RADIX_TREE_INIT(mask)
-
-#define INIT_RADIX_TREE(root, mask)					\
-do {									\
-	(root)->height = 0;						\
-	(root)->gfp_mask = (mask);					\
-	(root)->rnode = NULL;						\
-} while (0)
-
-int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
-void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
-void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items);
-int radix_tree_preload(gfp_t gfp_mask);
-void radix_tree_init(void);
-void *radix_tree_tag_set(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-void *radix_tree_tag_clear(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-int radix_tree_tag_get(struct radix_tree_root *root,
-			unsigned long index, unsigned int tag);
-unsigned int
-radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
-		unsigned long first_index, unsigned int max_items,
-		unsigned int tag);
-int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
-
-static inline void radix_tree_preload_end(void)
-{
-	preempt_enable();
-}
-
-#endif /* _LINUX_RADIX_TREE_H */
diff --git a/fs/btrfs/random-test.c b/fs/btrfs/random-test.c
deleted file mode 100644
index 3a38ae7a886..00000000000
--- a/fs/btrfs/random-test.c
+++ /dev/null
@@ -1,405 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
-#include "ctree.h"
-#include "disk-io.h"
-#include "print-tree.h"
-#include "transaction.h"
-
-int keep_running = 1;
-struct btrfs_super_block super;
-
-static int setup_key(struct radix_tree_root *root, struct btrfs_key *key,
-		     int exists)
-{
-	int num = rand();
-	unsigned long res[2];
-	int ret;
-
-	key->flags = 0;
-	btrfs_set_key_type(key, BTRFS_STRING_ITEM_KEY);
-	key->offset = 0;
-again:
-	ret = radix_tree_gang_lookup(root, (void **)res, num, 2);
-	if (exists) {
-		if (ret == 0)
-			return -1;
-		num = res[0];
-	} else if (ret != 0 && num == res[0]) {
-		num++;
-		if (ret > 1 && num == res[1]) {
-			num++;
-			goto again;
-		}
-	}
-	key->objectid = num;
-	return 0;
-}
-
-static int ins_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	int ret;
-	char buf[128];
-	unsigned long oid;
-	btrfs_init_path(&path);
-	ret = setup_key(radix, &key, 0);
-	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf));
-	if (ret)
-		goto error;
-	oid = (unsigned long)key.objectid;
-	radix_tree_preload(GFP_KERNEL);
-	ret = radix_tree_insert(radix, oid, (void *)oid);
-	radix_tree_preload_end();
-	if (ret)
-		goto error;
-	return ret;
-error:
-	printf("failed to insert %Lu\n", key.objectid);
-	return -1;
-}
-
-static int insert_dup(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	int ret;
-	char buf[128];
-	btrfs_init_path(&path);
-	ret = setup_key(radix, &key, 1);
-	if (ret < 0)
-		return 0;
-	sprintf(buf, "str-%Lu\n", key.objectid);
-	ret = btrfs_insert_item(trans, root, &key, buf, strlen(buf));
-	if (ret != -EEXIST) {
-		printf("insert on %Lu gave us %d\n", key.objectid, ret);
-		return 1;
-	}
-	return 0;
-}
-
-static int del_one(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	int ret;
-	unsigned long *ptr;
-	btrfs_init_path(&path);
-	ret = setup_key(radix, &key, 1);
-	if (ret < 0)
-		return 0;
-	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
-	if (ret)
-		goto error;
-	ret = btrfs_del_item(trans, root, &path);
-	btrfs_release_path(root, &path);
-	if (ret != 0)
-		goto error;
-	ptr = radix_tree_delete(radix, key.objectid);
-	if (!ptr)
-		goto error;
-	return 0;
-error:
-	printf("failed to delete %Lu\n", key.objectid);
-	return -1;
-}
-
-static int lookup_item(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	int ret;
-	btrfs_init_path(&path);
-	ret = setup_key(radix, &key, 1);
-	if (ret < 0)
-		return 0;
-	ret = btrfs_search_slot(trans, root, &key, &path, 0, 1);
-	btrfs_release_path(root, &path);
-	if (ret)
-		goto error;
-	return 0;
-error:
-	printf("unable to find key %Lu\n", key.objectid);
-	return -1;
-}
-
-static int lookup_enoent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	int ret;
-	btrfs_init_path(&path);
-	ret = setup_key(radix, &key, 0);
-	if (ret < 0)
-		return ret;
-	ret = btrfs_search_slot(trans, root, &key, &path, 0, 0);
-	btrfs_release_path(root, &path);
-	if (ret <= 0)
-		goto error;
-	return 0;
-error:
-	printf("able to find key that should not exist %Lu\n", key.objectid);
-	return -1;
-}
-
-static int empty_tree(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct radix_tree_root *radix, int nr)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	unsigned long found = 0;
-	int ret;
-	int slot;
-	int *ptr;
-	int count = 0;
-
-	key.offset = 0;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY);
-	key.objectid = (unsigned long)-1;
-	while(nr-- >= 0) {
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
-		if (ret < 0) {
-			btrfs_release_path(root, &path);
-			return ret;
-		}
-		if (ret != 0) {
-			if (path.slots[0] == 0) {
-				btrfs_release_path(root, &path);
-				break;
-			}
-			path.slots[0] -= 1;
-		}
-		slot = path.slots[0];
-		found = btrfs_disk_key_objectid(
-					&path.nodes[0]->leaf.items[slot].key);
-		ret = btrfs_del_item(trans, root, &path);
-		count++;
-		if (ret) {
-			fprintf(stderr,
-				"failed to remove %lu from tree\n",
-				found);
-			return -1;
-		}
-		btrfs_release_path(root, &path);
-		ptr = radix_tree_delete(radix, found);
-		if (!ptr)
-			goto error;
-		if (!keep_running)
-			break;
-	}
-	return 0;
-error:
-	fprintf(stderr, "failed to delete from the radix %lu\n", found);
-	return -1;
-}
-
-static int fill_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct radix_tree_root *radix, int count)
-{
-	int i;
-	int ret = 0;
-	for (i = 0; i < count; i++) {
-		ret = ins_one(trans, root, radix);
-		if (ret) {
-			fprintf(stderr, "fill failed\n");
-			goto out;
-		}
-		if (i % 1000 == 0) {
-			ret = btrfs_commit_transaction(trans, root, &super);
-			if (ret) {
-				fprintf(stderr, "fill commit failed\n");
-				return ret;
-			}
-		}
-		if (i && i % 10000 == 0) {
-			printf("bigfill %d\n", i);
-		}
-		if (!keep_running)
-			break;
-	}
-out:
-	return ret;
-}
-
-static int bulk_op(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct radix_tree_root *radix)
-{
-	int ret;
-	int nr = rand() % 5000;
-	static int run_nr = 0;
-
-	/* do the bulk op much less frequently */
-	if (run_nr++ % 100)
-		return 0;
-	ret = empty_tree(trans, root, radix, nr);
-	if (ret)
-		return ret;
-	ret = fill_tree(trans, root, radix, nr);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-
-int (*ops[])(struct btrfs_trans_handle *,
-	     struct btrfs_root *root, struct radix_tree_root *radix) =
-	{ ins_one, insert_dup, del_one, lookup_item,
-	  lookup_enoent, bulk_op };
-
-static int fill_radix(struct btrfs_root *root, struct radix_tree_root *radix)
-{
-	struct btrfs_path path;
-	struct btrfs_key key;
-	unsigned long found;
-	int ret;
-	int slot;
-	int i;
-
-	key.offset = 0;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_STRING_ITEM_KEY);
-	key.objectid = (unsigned long)-1;
-	while(1) {
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
-		if (ret < 0) {
-			btrfs_release_path(root, &path);
-			return ret;
-		}
-		slot = path.slots[0];
-		if (ret != 0) {
-			if (slot == 0) {
-				btrfs_release_path(root, &path);
-				break;
-			}
-			slot -= 1;
-		}
-		for (i = slot; i >= 0; i--) {
-			found = btrfs_disk_key_objectid(&path.nodes[0]->
-							leaf.items[i].key);
-			radix_tree_preload(GFP_KERNEL);
-			ret = radix_tree_insert(radix, found, (void *)found);
-			if (ret) {
-				fprintf(stderr,
-					"failed to insert %lu into radix\n",
-					found);
-				exit(1);
-			}
-
-			radix_tree_preload_end();
-		}
-		btrfs_release_path(root, &path);
-		key.objectid = found - 1;
-		if (key.objectid > found)
-			break;
-	}
-	return 0;
-}
-void sigstopper(int ignored)
-{
-	keep_running = 0;
-	fprintf(stderr, "caught exit signal, stopping\n");
-}
-
-int print_usage(void)
-{
-	printf("usage: tester [-ih] [-c count] [-f count]\n");
-	printf("\t -c count -- iteration count after filling\n");
-	printf("\t -f count -- run this many random inserts before starting\n");
-	printf("\t -i       -- only do initial fill\n");
-	printf("\t -h       -- this help text\n");
-	exit(1);
-}
-int main(int ac, char **av)
-{
-	RADIX_TREE(radix, GFP_KERNEL);
-	struct btrfs_root *root;
-	int i;
-	int ret;
-	int count;
-	int op;
-	int iterations = 20000;
-	int init_fill_count = 800000;
-	int err = 0;
-	int initial_only = 0;
-	struct btrfs_trans_handle *trans;
-	radix_tree_init();
-	root = open_ctree("dbfile", &super);
-	fill_radix(root, &radix);
-
-	signal(SIGTERM, sigstopper);
-	signal(SIGINT, sigstopper);
-
-	for (i = 1 ; i < ac ; i++) {
-		if (strcmp(av[i], "-i") == 0) {
-			initial_only = 1;
-		} else if (strcmp(av[i], "-c") == 0) {
-			iterations = atoi(av[i+1]);
-			i++;
-		} else if (strcmp(av[i], "-f") == 0) {
-			init_fill_count = atoi(av[i+1]);
-			i++;
-		} else {
-			print_usage();
-		}
-	}
-	printf("initial fill\n");
-	trans = btrfs_start_transaction(root, 1);
-	ret = fill_tree(trans, root, &radix, init_fill_count);
-	printf("starting run\n");
-	if (ret) {
-		err = ret;
-		goto out;
-	}
-	if (initial_only == 1) {
-		goto out;
-	}
-	for (i = 0; i < iterations; i++) {
-		op = rand() % ARRAY_SIZE(ops);
-		count = rand() % 128;
-		if (i % 2000 == 0) {
-			printf("%d\n", i);
-			fflush(stdout);
-		}
-		if (i && i % 5000 == 0) {
-			printf("open & close, root level %d nritems %d\n",
-				btrfs_header_level(&root->node->node.header),
-				btrfs_header_nritems(&root->node->node.header));
-			close_ctree(root, &super);
-			root = open_ctree("dbfile", &super);
-		}
-		while(count--) {
-			ret = ops[op](trans, root, &radix);
-			if (ret) {
-				fprintf(stderr, "op %d failed %d:%d\n",
-					op, i, iterations);
-				btrfs_print_tree(root, root->node);
-				fprintf(stderr, "op %d failed %d:%d\n",
-					op, i, iterations);
-				err = ret;
-				goto out;
-			}
-			if (ops[op] == bulk_op)
-				break;
-			if (keep_running == 0) {
-				err = 0;
-				goto out;
-			}
-		}
-	}
-out:
-	close_ctree(root, &super);
-	return err;
-}
-
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9cccecc0f43..52c83be4b30 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -1,7 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
new file mode 100644
index 00000000000..4ae76044aea
--- /dev/null
+++ b/fs/btrfs/super.c
@@ -0,0 +1,205 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include "ctree.h"
+
+#define BTRFS_SUPER_MAGIC 0x9123682E
+#if 0
+/* some random number */
+
+static struct super_operations ramfs_ops;
+static struct inode_operations ramfs_dir_inode_operations;
+
+static struct backing_dev_info ramfs_backing_dev_info = {
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK |
+			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
+			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
+};
+
+struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+	struct inode * inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blocks = 0;
+		inode->i_mapping->a_ops = &ramfs_aops;
+		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		switch (mode & S_IFMT) {
+		default:
+			init_special_inode(inode, mode, dev);
+			break;
+		case S_IFREG:
+			inode->i_op = &ramfs_file_inode_operations;
+			inode->i_fop = &ramfs_file_operations;
+			break;
+		case S_IFDIR:
+			inode->i_op = &ramfs_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+
+			/* directory inodes start off with i_nlink == 2 (for "." entry) */
+			inc_nlink(inode);
+			break;
+		case S_IFLNK:
+			inode->i_op = &page_symlink_inode_operations;
+			break;
+		}
+	}
+	return inode;
+}
+
+/*
+ * File creation. Allocate an inode, and we're done..
+ */
+/* SMP-safe */
+static int
+ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
+{
+	struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev);
+	int error = -ENOSPC;
+
+	if (inode) {
+		if (dir->i_mode & S_ISGID) {
+			inode->i_gid = dir->i_gid;
+			if (S_ISDIR(mode))
+				inode->i_mode |= S_ISGID;
+		}
+		d_instantiate(dentry, inode);
+		dget(dentry);	/* Extra count - pin the dentry in core */
+		error = 0;
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	}
+	return error;
+}
+
+static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+{
+	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+	if (!retval)
+		inc_nlink(dir);
+	return retval;
+}
+
+static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
+{
+	return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
+}
+
+static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
+	if (inode) {
+		int l = strlen(symname)+1;
+		error = page_symlink(inode, symname, l);
+		if (!error) {
+			if (dir->i_mode & S_ISGID)
+				inode->i_gid = dir->i_gid;
+			d_instantiate(dentry, inode);
+			dget(dentry);
+			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+		} else
+			iput(inode);
+	}
+	return error;
+}
+
+static struct inode_operations ramfs_dir_inode_operations = {
+	.create		= ramfs_create,
+	.lookup		= simple_lookup,
+	.link		= simple_link,
+	.unlink		= simple_unlink,
+	.symlink	= ramfs_symlink,
+	.mkdir		= ramfs_mkdir,
+	.rmdir		= simple_rmdir,
+	.mknod		= ramfs_mknod,
+	.rename		= simple_rename,
+};
+#endif
+
+struct inode *btrfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+{
+	struct inode * inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blocks = 0;
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	}
+	return inode;
+}
+
+static struct super_operations btrfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+	struct inode * inode;
+	struct dentry * root;
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = BTRFS_SUPER_MAGIC;
+	sb->s_op = &btrfs_ops;
+	sb->s_time_gran = 1;
+	inode = btrfs_get_inode(sb, S_IFDIR | 0755, 0);
+	if (!inode)
+		return -ENOMEM;
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+	sb->s_root = root;
+	return 0;
+}
+
+static int btrfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data,
+			   btrfs_fill_super, mnt);
+}
+
+static struct file_system_type btrfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "btrfs",
+	.get_sb		= btrfs_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+static int __init init_btrfs_fs(void)
+{
+	printk("btrfs loaded!\n");
+	return register_filesystem(&btrfs_fs_type);
+}
+
+static void __exit exit_btrfs_fs(void)
+{
+	unregister_filesystem(&btrfs_fs_type);
+	printk("btrfs unloaded\n");
+}
+
+module_init(init_btrfs_fs)
+module_exit(exit_btrfs_fs)
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From e20d96d64f9cf9288ffecc9ad4714e91c3b97ca8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Mar 2007 12:13:20 -0400
Subject: Mountable btrfs, with readdir

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   5 +-
 fs/btrfs/ctree.c       | 273 ++++++++++++++++++++++++-----------------------
 fs/btrfs/ctree.h       |  62 ++++++++---
 fs/btrfs/dir-item.c    |  17 ++-
 fs/btrfs/disk-io.c     | 281 +++++++++++++++----------------------------------
 fs/btrfs/disk-io.h     |  51 +++++----
 fs/btrfs/extent-tree.c | 144 +++++++++++++------------
 fs/btrfs/hash.c        |   9 ++
 fs/btrfs/inode-map.c   |   8 +-
 fs/btrfs/print-tree.c  |  20 ++--
 fs/btrfs/print-tree.h  |   2 +-
 fs/btrfs/root-tree.c   |   4 +-
 fs/btrfs/super.c       | 261 +++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/transaction.h |   4 +-
 14 files changed, 651 insertions(+), 490 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 99e45a54ebd..1a0fb742239 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -2,7 +2,8 @@ ifneq ($(KERNELRELEASE),)
 # kbuild part of makefile
 
 obj-m  := btrfs.o
-btrfs-y := super.o
+btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
+	   hash.o file-item.o inode-item.o inode-map.o disk-io.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
@@ -14,7 +15,7 @@ else
 
 KERNELDIR := /lib/modules/`uname -r`/build
 all::
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+	$(MAKE) C=1 -C $(KERNELDIR) M=`pwd` modules
 clean::
 	rm *.o btrfs.ko
 endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9fbd07c37fd..e690e2bb47d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -7,11 +7,11 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int data_size);
 static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_buffer *dst, struct btrfs_buffer
+			  *root, struct buffer_head *dst, struct buffer_head
 			  *src);
 static int balance_node_right(struct btrfs_trans_handle *trans, struct
-			      btrfs_root *root, struct btrfs_buffer *dst_buf,
-			      struct btrfs_buffer *src_buf);
+			      btrfs_root *root, struct buffer_head *dst_buf,
+			      struct buffer_head *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
@@ -32,32 +32,34 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 }
 
 static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_buffer *buf, struct btrfs_buffer
-			   *parent, int parent_slot, struct btrfs_buffer
+			   *root, struct buffer_head *buf, struct buffer_head
+			   *parent, int parent_slot, struct buffer_head
 			   **cow_ret)
 {
-	struct btrfs_buffer *cow;
+	struct buffer_head *cow;
+	struct btrfs_node *cow_node;
 
-	if (!list_empty(&buf->dirty)) {
+	if (!buffer_dirty(buf)) {
 		*cow_ret = buf;
 		return 0;
 	}
 	cow = btrfs_alloc_free_block(trans, root);
-	memcpy(&cow->node, &buf->node, root->blocksize);
-	btrfs_set_header_blocknr(&cow->node.header, cow->blocknr);
+	cow_node = btrfs_buffer_node(cow);
+	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
+	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
 	*cow_ret = cow;
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
-		cow->count++;
+		get_bh(cow);
 		if (buf != root->commit_root)
-			btrfs_free_extent(trans, root, buf->blocknr, 1, 1);
+			btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 		btrfs_block_release(root, buf);
 	} else {
-		btrfs_set_node_blockptr(&parent->node, parent_slot,
-					cow->blocknr);
-		BUG_ON(list_empty(&parent->dirty));
-		btrfs_free_extent(trans, root, buf->blocknr, 1, 1);
+		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
+					cow->b_blocknr);
+		BUG_ON(!buffer_dirty(parent));
+		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
 	return 0;
@@ -119,12 +121,12 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 {
 	int i;
 	struct btrfs_node *parent = NULL;
-	struct btrfs_node *node = &path->nodes[level]->node;
+	struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]);
 	int parent_slot;
 	u32 nritems = btrfs_header_nritems(&node->header);
 
 	if (path->nodes[level + 1])
-		parent = &path->nodes[level + 1]->node;
+		parent = btrfs_buffer_node(path->nodes[level + 1]);
 	parent_slot = path->slots[level + 1];
 	BUG_ON(nritems == 0);
 	if (parent) {
@@ -148,13 +150,13 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
 	int i;
-	struct btrfs_leaf *leaf = &path->nodes[level]->leaf;
+	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[level]);
 	struct btrfs_node *parent = NULL;
 	int parent_slot;
 	u32 nritems = btrfs_header_nritems(&leaf->header);
 
 	if (path->nodes[level + 1])
-		parent = &path->nodes[level + 1]->node;
+		parent = btrfs_buffer_node(path->nodes[level + 1]);
 	parent_slot = path->slots[level + 1];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
 
@@ -250,11 +252,11 @@ static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
 	return -1;
 }
 
-static struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
-				   struct btrfs_buffer *parent_buf,
+static struct buffer_head *read_node_slot(struct btrfs_root *root,
+				   struct buffer_head *parent_buf,
 				   int slot)
 {
-	struct btrfs_node *node = &parent_buf->node;
+	struct btrfs_node *node = btrfs_buffer_node(parent_buf);
 	if (slot < 0)
 		return NULL;
 	if (slot >= btrfs_header_nritems(&node->header))
@@ -265,10 +267,10 @@ static struct btrfs_buffer *read_node_slot(struct btrfs_root *root,
 static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, struct btrfs_path *path, int level)
 {
-	struct btrfs_buffer *right_buf;
-	struct btrfs_buffer *mid_buf;
-	struct btrfs_buffer *left_buf;
-	struct btrfs_buffer *parent_buf = NULL;
+	struct buffer_head *right_buf;
+	struct buffer_head *mid_buf;
+	struct buffer_head *left_buf;
+	struct buffer_head *parent_buf = NULL;
 	struct btrfs_node *right = NULL;
 	struct btrfs_node *mid;
 	struct btrfs_node *left = NULL;
@@ -283,7 +285,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 0;
 
 	mid_buf = path->nodes[level];
-	mid = &mid_buf->node;
+	mid = btrfs_buffer_node(mid_buf);
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
@@ -295,8 +297,8 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	 * by promoting the node below to a root
 	 */
 	if (!parent_buf) {
-		struct btrfs_buffer *child;
-		u64 blocknr = mid_buf->blocknr;
+		struct buffer_head *child;
+		u64 blocknr = mid_buf->b_blocknr;
 
 		if (btrfs_header_nritems(&mid->header) != 1)
 			return 0;
@@ -313,7 +315,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		clean_tree_block(trans, root, mid_buf);
 		return btrfs_free_extent(trans, root, blocknr, 1, 1);
 	}
-	parent = &parent_buf->node;
+	parent = btrfs_buffer_node(parent_buf);
 
 	if (btrfs_header_nritems(&mid->header) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -326,7 +328,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (left_buf) {
 		btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1,
 				&left_buf);
-		left = &left_buf->node;
+		left = btrfs_buffer_node(left_buf);
 		orig_slot += btrfs_header_nritems(&left->header);
 		wret = push_node_left(trans, root, left_buf, mid_buf);
 		if (wret < 0)
@@ -339,12 +341,12 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_buf) {
 		btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1,
 				&right_buf);
-		right = &right_buf->node;
+		right = btrfs_buffer_node(right_buf);
 		wret = push_node_left(trans, root, mid_buf, right_buf);
 		if (wret < 0)
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
-			u64 blocknr = right_buf->blocknr;
+			u64 blocknr = right_buf->b_blocknr;
 			btrfs_block_release(root, right_buf);
 			clean_tree_block(trans, root, right_buf);
 			right_buf = NULL;
@@ -360,7 +362,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			memcpy(&parent->ptrs[pslot + 1].key,
 				&right->ptrs[0].key,
 				sizeof(struct btrfs_disk_key));
-			BUG_ON(list_empty(&parent_buf->dirty));
+			BUG_ON(!buffer_dirty(parent_buf));
 		}
 	}
 	if (btrfs_header_nritems(&mid->header) == 1) {
@@ -381,7 +383,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	if (btrfs_header_nritems(&mid->header) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 blocknr = mid_buf->blocknr;
+		u64 blocknr = mid_buf->b_blocknr;
 		btrfs_block_release(root, mid_buf);
 		clean_tree_block(trans, root, mid_buf);
 		mid_buf = NULL;
@@ -396,13 +398,13 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		/* update the parent key to reflect our changes */
 		memcpy(&parent->ptrs[pslot].key, &mid->ptrs[0].key,
 		       sizeof(struct btrfs_disk_key));
-		BUG_ON(list_empty(&parent_buf->dirty));
+		BUG_ON(!buffer_dirty(parent_buf));
 	}
 
 	/* update the path */
 	if (left_buf) {
 		if (btrfs_header_nritems(&left->header) > orig_slot) {
-			left_buf->count++; // released below
+			get_bh(left_buf);
 			path->nodes[level] = left_buf;
 			path->slots[level + 1] -= 1;
 			path->slots[level] = orig_slot;
@@ -415,8 +417,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	/* double check we haven't messed things up */
 	check_block(root, path, level);
-	if (orig_ptr != btrfs_node_blockptr(&path->nodes[level]->node,
-					    path->slots[level]))
+	if (orig_ptr !=
+	    btrfs_node_blockptr(btrfs_buffer_node(path->nodes[level]),
+				path->slots[level]))
 		BUG();
 
 	if (right_buf)
@@ -443,8 +446,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow)
 {
-	struct btrfs_buffer *b;
-	struct btrfs_buffer *cow_buf;
+	struct buffer_head *b;
+	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
 	int slot;
 	int ret;
@@ -452,18 +455,20 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 
 again:
 	b = root->node;
-	b->count++;
+	get_bh(b);
 	while (b) {
-		level = btrfs_header_level(&b->node.header);
+		c = btrfs_buffer_node(b);
+		level = btrfs_header_level(&c->header);
 		if (cow) {
 			int wret;
-			wret = btrfs_cow_block(trans, root, b, p->nodes[level +
-					       1], p->slots[level + 1],
+			wret = btrfs_cow_block(trans, root, b,
+					       p->nodes[level + 1],
+					       p->slots[level + 1],
 					       &cow_buf);
 			b = cow_buf;
 		}
 		BUG_ON(!cow && ins_len);
-		c = &b->node;
+		c = btrfs_buffer_node(b);
 		p->nodes[level] = b;
 		ret = check_block(root, p, level);
 		if (ret)
@@ -480,7 +485,7 @@ again:
 				if (sret)
 					return sret;
 				b = p->nodes[level];
-				c = &b->node;
+				c = btrfs_buffer_node(b);
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
 				int sret = balance_level(trans, root, p,
@@ -490,7 +495,7 @@ again:
 				b = p->nodes[level];
 				if (!b)
 					goto again;
-				c = &b->node;
+				c = btrfs_buffer_node(b);
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(&c->header) == 1);
 			}
@@ -505,11 +510,9 @@ again:
 				if (sret)
 					return sret;
 			}
-			BUG_ON(root->node->count == 1);
 			return ret;
 		}
 	}
-	BUG_ON(root->node->count == 1);
 	return 1;
 }
 
@@ -534,9 +537,9 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
 		int tslot = path->slots[i];
 		if (!path->nodes[i])
 			break;
-		t = &path->nodes[i]->node;
+		t = btrfs_buffer_node(path->nodes[i]);
 		memcpy(&t->ptrs[tslot].key, key, sizeof(*key));
-		BUG_ON(list_empty(&path->nodes[i]->dirty));
+		BUG_ON(!buffer_dirty(path->nodes[i]));
 		if (tslot != 0)
 			break;
 	}
@@ -551,11 +554,11 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
  * error, and > 0 if there was no room in the left hand block.
  */
 static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_buffer *dst_buf, struct
-			  btrfs_buffer *src_buf)
+			  *root, struct buffer_head *dst_buf, struct
+			  buffer_head *src_buf)
 {
-	struct btrfs_node *src = &src_buf->node;
-	struct btrfs_node *dst = &dst_buf->node;
+	struct btrfs_node *src = btrfs_buffer_node(src_buf);
+	struct btrfs_node *dst = btrfs_buffer_node(dst_buf);
 	int push_items = 0;
 	int src_nritems;
 	int dst_nritems;
@@ -580,8 +583,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
-	BUG_ON(list_empty(&src_buf->dirty));
-	BUG_ON(list_empty(&dst_buf->dirty));
+	BUG_ON(!buffer_dirty(src_buf));
+	BUG_ON(!buffer_dirty(dst_buf));
 	return ret;
 }
 
@@ -595,11 +598,11 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
  * this will  only push up to 1/2 the contents of the left node over
  */
 static int balance_node_right(struct btrfs_trans_handle *trans, struct
-			      btrfs_root *root, struct btrfs_buffer *dst_buf,
-			      struct btrfs_buffer *src_buf)
+			      btrfs_root *root, struct buffer_head *dst_buf,
+			      struct buffer_head *src_buf)
 {
-	struct btrfs_node *src = &src_buf->node;
-	struct btrfs_node *dst = &dst_buf->node;
+	struct btrfs_node *src = btrfs_buffer_node(src_buf);
+	struct btrfs_node *dst = btrfs_buffer_node(dst_buf);
 	int push_items = 0;
 	int max_push;
 	int src_nritems;
@@ -628,8 +631,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
 
-	BUG_ON(list_empty(&src_buf->dirty));
-	BUG_ON(list_empty(&dst_buf->dirty));
+	BUG_ON(!buffer_dirty(src_buf));
+	BUG_ON(!buffer_dirty(dst_buf));
 	return ret;
 }
 
@@ -643,7 +646,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 			   *root, struct btrfs_path *path, int level)
 {
-	struct btrfs_buffer *t;
+	struct buffer_head *t;
 	struct btrfs_node *lower;
 	struct btrfs_node *c;
 	struct btrfs_disk_key *lower_key;
@@ -652,24 +655,24 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(path->nodes[level-1] != root->node);
 
 	t = btrfs_alloc_free_block(trans, root);
-	c = &t->node;
+	c = btrfs_buffer_node(t);
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
 	btrfs_set_header_level(&c->header, level);
-	btrfs_set_header_blocknr(&c->header, t->blocknr);
+	btrfs_set_header_blocknr(&c->header, t->b_blocknr);
 	btrfs_set_header_parentid(&c->header,
-	                       btrfs_header_parentid(&root->node->node.header));
-	lower = &path->nodes[level-1]->node;
+	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	lower = btrfs_buffer_node(path->nodes[level-1]);
 	if (btrfs_is_leaf(lower))
 		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
 	else
 		lower_key = &lower->ptrs[0].key;
 	memcpy(&c->ptrs[0].key, lower_key, sizeof(struct btrfs_disk_key));
-	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->blocknr);
+	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->b_blocknr);
 	/* the super has an extra ref to root->node */
 	btrfs_block_release(root, root->node);
 	root->node = t;
-	t->count++;
+	get_bh(t);
 	path->nodes[level] = t;
 	path->slots[level] = 0;
 	return 0;
@@ -692,7 +695,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
-	lower = &path->nodes[level]->node;
+	lower = btrfs_buffer_node(path->nodes[level]);
 	nritems = btrfs_header_nritems(&lower->header);
 	if (slot > nritems)
 		BUG();
@@ -705,7 +708,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	memcpy(&lower->ptrs[slot].key, key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
-	BUG_ON(list_empty(&path->nodes[level]->dirty));
+	BUG_ON(!buffer_dirty(path->nodes[level]));
 	return 0;
 }
 
@@ -721,9 +724,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level)
 {
-	struct btrfs_buffer *t;
+	struct buffer_head *t;
 	struct btrfs_node *c;
-	struct btrfs_buffer *split_buffer;
+	struct buffer_head *split_buffer;
 	struct btrfs_node *split;
 	int mid;
 	int ret;
@@ -731,7 +734,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 c_nritems;
 
 	t = path->nodes[level];
-	c = &t->node;
+	c = btrfs_buffer_node(t);
 	if (t == root->node) {
 		/* trying to split the root, lets make a new one */
 		ret = insert_new_root(trans, root, path, level + 1);
@@ -740,11 +743,11 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	c_nritems = btrfs_header_nritems(&c->header);
 	split_buffer = btrfs_alloc_free_block(trans, root);
-	split = &split_buffer->node;
+	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
-	btrfs_set_header_blocknr(&split->header, split_buffer->blocknr);
+	btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr);
 	btrfs_set_header_parentid(&split->header,
-	                       btrfs_header_parentid(&root->node->node.header));
+	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	mid = (c_nritems + 1) / 2;
 	memcpy(split->ptrs, c->ptrs + mid,
 		(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
@@ -752,9 +755,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&c->header, mid);
 	ret = 0;
 
-	BUG_ON(list_empty(&t->dirty));
+	BUG_ON(!buffer_dirty(t));
 	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
-			  split_buffer->blocknr, path->slots[level + 1] + 1,
+			  split_buffer->b_blocknr, path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
 		ret = wret;
@@ -798,11 +801,12 @@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 			   *root, struct btrfs_path *path, int data_size)
 {
-	struct btrfs_buffer *left_buf = path->nodes[0];
-	struct btrfs_leaf *left = &left_buf->leaf;
+	struct buffer_head *left_buf = path->nodes[0];
+	struct btrfs_leaf *left = btrfs_buffer_leaf(left_buf);
 	struct btrfs_leaf *right;
-	struct btrfs_buffer *right_buf;
-	struct btrfs_buffer *upper;
+	struct buffer_head *right_buf;
+	struct buffer_head *upper;
+	struct btrfs_node *upper_node;
 	int slot;
 	int i;
 	int free_space;
@@ -817,12 +821,13 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 	upper = path->nodes[1];
-	if (slot >= btrfs_header_nritems(&upper->node.header) - 1) {
+	upper_node = btrfs_buffer_node(upper);
+	if (slot >= btrfs_header_nritems(&upper_node->header) - 1) {
 		return 1;
 	}
-	right_buf = read_tree_block(root, btrfs_node_blockptr(&upper->node,
-							      slot + 1));
-	right = &right_buf->leaf;
+	right_buf = read_tree_block(root,
+		    btrfs_node_blockptr(btrfs_buffer_node(upper), slot + 1));
+	right = btrfs_buffer_leaf(right_buf);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, right_buf);
@@ -830,7 +835,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	/* cow and double check */
 	btrfs_cow_block(trans, root, right_buf, upper, slot + 1, &right_buf);
-	right = &right_buf->leaf;
+	right = btrfs_buffer_leaf(right_buf);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, right_buf);
@@ -881,11 +886,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(&left->header, left_nritems);
 
-	BUG_ON(list_empty(&left_buf->dirty));
-	BUG_ON(list_empty(&right_buf->dirty));
-	memcpy(&upper->node.ptrs[slot + 1].key,
+	BUG_ON(!buffer_dirty(left_buf));
+	BUG_ON(!buffer_dirty(right_buf));
+	memcpy(&upper_node->ptrs[slot + 1].key,
 		&right->items[0].key, sizeof(struct btrfs_disk_key));
-	BUG_ON(list_empty(&upper->dirty));
+	BUG_ON(!buffer_dirty(upper));
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
@@ -905,9 +910,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, int data_size)
 {
-	struct btrfs_buffer *right_buf = path->nodes[0];
-	struct btrfs_leaf *right = &right_buf->leaf;
-	struct btrfs_buffer *t;
+	struct buffer_head *right_buf = path->nodes[0];
+	struct btrfs_leaf *right = btrfs_buffer_leaf(right_buf);
+	struct buffer_head *t;
 	struct btrfs_leaf *left;
 	int slot;
 	int i;
@@ -926,9 +931,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path->nodes[1]) {
 		return 1;
 	}
-	t = read_tree_block(root, btrfs_node_blockptr(&path->nodes[1]->node,
-						      slot - 1));
-	left = &t->leaf;
+	t = read_tree_block(root,
+	    btrfs_node_blockptr(btrfs_buffer_node(path->nodes[1]), slot - 1));
+	left = btrfs_buffer_leaf(t);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, t);
@@ -937,7 +942,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* cow and double check */
 	btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
-	left = &t->leaf;
+	left = btrfs_buffer_leaf(t);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		btrfs_block_release(root, t);
@@ -999,8 +1004,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		push_space = btrfs_item_offset(right->items + i);
 	}
 
-	BUG_ON(list_empty(&t->dirty));
-	BUG_ON(list_empty(&right_buf->dirty));
+	BUG_ON(!buffer_dirty(t));
+	BUG_ON(!buffer_dirty(right_buf));
 
 	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
@@ -1029,13 +1034,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int data_size)
 {
-	struct btrfs_buffer *l_buf;
+	struct buffer_head *l_buf;
 	struct btrfs_leaf *l;
 	u32 nritems;
 	int mid;
 	int slot;
 	struct btrfs_leaf *right;
-	struct btrfs_buffer *right_buffer;
+	struct buffer_head *right_buffer;
 	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
@@ -1053,7 +1058,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			return wret;
 	}
 	l_buf = path->nodes[0];
-	l = &l_buf->leaf;
+	l = btrfs_buffer_leaf(l_buf);
 
 	/* did the pushes work? */
 	if (btrfs_leaf_free_space(root, l) >=
@@ -1071,7 +1076,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	right_buffer = btrfs_alloc_free_block(trans, root);
 	BUG_ON(!right_buffer);
 	BUG_ON(mid == nritems);
-	right = &right_buffer->leaf;
+	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
 	if (mid <= slot) {
 		/* FIXME, just alloc a new leaf here */
@@ -1085,10 +1090,10 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			BUG();
 	}
 	btrfs_set_header_nritems(&right->header, nritems - mid);
-	btrfs_set_header_blocknr(&right->header, right_buffer->blocknr);
+	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
-	                       btrfs_header_parentid(&root->node->node.header));
+	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	data_copy_size = btrfs_item_end(l->items + mid) -
 			 leaf_data_end(root, l);
 	memcpy(right->items, l->items + mid,
@@ -1107,11 +1112,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&l->header, mid);
 	ret = 0;
 	wret = insert_ptr(trans, root, path, &right->items[0].key,
-			  right_buffer->blocknr, path->slots[1] + 1, 1);
+			  right_buffer->b_blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	BUG_ON(list_empty(&right_buffer->dirty));
-	BUG_ON(list_empty(&l_buf->dirty));
+	BUG_ON(!buffer_dirty(right_buffer));
+	BUG_ON(!buffer_dirty(l_buf));
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
 		btrfs_block_release(root, path->nodes[0]);
@@ -1136,7 +1141,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	int slot;
 	int slot_orig;
 	struct btrfs_leaf *leaf;
-	struct btrfs_buffer *leaf_buf;
+	struct buffer_head *leaf_buf;
 	u32 nritems;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
@@ -1156,7 +1161,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	slot_orig = path->slots[0];
 	leaf_buf = path->nodes[0];
-	leaf = &leaf_buf->leaf;
+	leaf = btrfs_buffer_leaf(leaf_buf);
 
 	nritems = btrfs_header_nritems(&leaf->header);
 	data_end = leaf_data_end(root, leaf);
@@ -1202,7 +1207,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot == 0)
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 
-	BUG_ON(list_empty(&leaf_buf->dirty));
+	BUG_ON(!buffer_dirty(leaf_buf));
 	if (btrfs_leaf_free_space(root, leaf) < 0)
 		BUG();
 	check_leaf(root, path, 0);
@@ -1225,7 +1230,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_init_path(&path);
 	ret = btrfs_insert_empty_item(trans, root, &path, cpu_key, data_size);
 	if (!ret) {
-		ptr = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0], u8);
+		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				     path.slots[0], u8);
 		memcpy(ptr, data, data_size);
 	}
 	btrfs_release_path(root, &path);
@@ -1243,12 +1249,12 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot)
 {
 	struct btrfs_node *node;
-	struct btrfs_buffer *parent = path->nodes[level];
+	struct buffer_head *parent = path->nodes[level];
 	u32 nritems;
 	int ret = 0;
 	int wret;
 
-	node = &parent->node;
+	node = btrfs_buffer_node(parent);
 	nritems = btrfs_header_nritems(&node->header);
 	if (slot != nritems -1) {
 		memmove(node->ptrs + slot, node->ptrs + slot + 1,
@@ -1257,16 +1263,17 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems--;
 	btrfs_set_header_nritems(&node->header, nritems);
 	if (nritems == 0 && parent == root->node) {
-		BUG_ON(btrfs_header_level(&root->node->node.header) != 1);
+		struct btrfs_header *header = btrfs_buffer_header(root->node);
+		BUG_ON(btrfs_header_level(header) != 1);
 		/* just turn the root into a leaf and break */
-		btrfs_set_header_level(&root->node->node.header, 0);
+		btrfs_set_header_level(header, 0);
 	} else if (slot == 0) {
 		wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key,
 				      level + 1);
 		if (wret)
 			ret = wret;
 	}
-	BUG_ON(list_empty(&parent->dirty));
+	BUG_ON(!buffer_dirty(parent));
 	return ret;
 }
 
@@ -1279,7 +1286,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	int slot;
 	struct btrfs_leaf *leaf;
-	struct btrfs_buffer *leaf_buf;
+	struct buffer_head *leaf_buf;
 	int doff;
 	int dsize;
 	int ret = 0;
@@ -1287,7 +1294,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	u32 nritems;
 
 	leaf_buf = path->nodes[0];
-	leaf = &leaf_buf->leaf;
+	leaf = btrfs_buffer_leaf(leaf_buf);
 	slot = path->slots[0];
 	doff = btrfs_item_offset(leaf->items + slot);
 	dsize = btrfs_item_size(leaf->items + slot);
@@ -1313,14 +1320,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (nritems == 0) {
 		if (leaf_buf == root->node) {
 			btrfs_set_header_level(&leaf->header, 0);
-			BUG_ON(list_empty(&leaf_buf->dirty));
 		} else {
 			clean_tree_block(trans, root, leaf_buf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
-						 leaf_buf->blocknr, 1, 1);
+						 leaf_buf->b_blocknr, 1, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -1332,7 +1338,6 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (wret)
 				ret = wret;
 		}
-		BUG_ON(list_empty(&leaf_buf->dirty));
 
 		/* delete the leaf if it is mostly empty */
 		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
@@ -1341,7 +1346,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			 * for possible call to del_ptr below
 			 */
 			slot = path->slots[1];
-			leaf_buf->count++;
+			get_bh(leaf_buf);
 			wret = push_leaf_left(trans, root, path, 1);
 			if (wret < 0)
 				ret = wret;
@@ -1352,7 +1357,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					ret = wret;
 			}
 			if (btrfs_header_nritems(&leaf->header) == 0) {
-				u64 blocknr = leaf_buf->blocknr;
+				u64 blocknr = leaf_buf->b_blocknr;
 				clean_tree_block(trans, root, leaf_buf);
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
@@ -1380,19 +1385,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int slot;
 	int level = 1;
 	u64 blocknr;
-	struct btrfs_buffer *c;
-	struct btrfs_buffer *next = NULL;
+	struct buffer_head *c;
+	struct btrfs_node *c_node;
+	struct buffer_head *next = NULL;
 
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
-		if (slot >= btrfs_header_nritems(&c->node.header)) {
+		c_node = btrfs_buffer_node(c);
+		if (slot >= btrfs_header_nritems(&c_node->header)) {
 			level++;
 			continue;
 		}
-		blocknr = btrfs_node_blockptr(&c->node, slot);
+		blocknr = btrfs_node_blockptr(c_node, slot);
 		if (next)
 			btrfs_block_release(root, next);
 		next = read_tree_block(root, blocknr);
@@ -1408,7 +1415,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!level)
 			break;
 		next = read_tree_block(root,
-				       btrfs_node_blockptr(&next->node, 0));
+		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
 	}
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ae8518cb94b..7748eecd930 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,6 +1,9 @@
 #ifndef __BTRFS__
 #define __BTRFS__
 
+#include <linux/radix-tree.h>
+#include <linux/fs.h>
+
 struct btrfs_trans_handle;
 
 #define BTRFS_MAGIC "_BtRfS_M"
@@ -10,6 +13,12 @@ struct btrfs_trans_handle;
 #define BTRFS_INODE_MAP_OBJECTID 3
 #define BTRFS_FS_TREE_OBJECTID 4
 
+/*
+ * we can actually store much bigger names, but lets not confuse the rest
+ * of linux
+ */
+#define BTRFS_NAME_LEN 255
+
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
  * block layout.  objectid corresonds to the inode number.  The flags
@@ -57,7 +66,7 @@ struct btrfs_header {
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize))
 
-struct btrfs_buffer;
+struct buffer_head;
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -120,7 +129,7 @@ struct btrfs_node {
  * used while walking the tree.
  */
 struct btrfs_path {
-	struct btrfs_buffer *nodes[BTRFS_MAX_LEVEL];
+	struct buffer_head *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
 };
 
@@ -211,17 +220,14 @@ struct btrfs_fs_info {
 	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
-	struct radix_tree_root cache_radix;
 	struct radix_tree_root pinned_radix;
-	struct list_head trans;
-	struct list_head cache;
 	u64 last_inode_alloc;
 	u64 last_inode_alloc_dirid;
 	u64 generation;
-	int cache_size;
-	int fp;
 	struct btrfs_trans_handle *running_transaction;
 	struct btrfs_super_block *disk_super;
+	struct buffer_head *sb_buffer;
+	struct super_block *sb;
 };
 
 /*
@@ -230,8 +236,8 @@ struct btrfs_fs_info {
  * only for the extent tree.
  */
 struct btrfs_root {
-	struct btrfs_buffer *node;
-	struct btrfs_buffer *commit_root;
+	struct buffer_head *node;
+	struct buffer_head *commit_root;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -389,6 +395,29 @@ static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i,
 	i->compat_flags = cpu_to_le16(val);
 }
 
+static inline u32 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
+{
+	return le32_to_cpu(ts->sec);
+}
+
+static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts,
+					  u32 val)
+{
+	ts->sec = cpu_to_le32(val);
+}
+
+static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts)
+{
+	return le32_to_cpu(ts->nsec);
+}
+
+static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts,
+					  u32 val)
+{
+	ts->nsec = cpu_to_le32(val);
+}
+
+
 
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
@@ -757,15 +786,20 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
+static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
-struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct btrfs_buffer *buf);
+		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -783,7 +817,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct btrfs_buffer *snap);
+			*root, struct buffer_head *snap);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root);
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -800,8 +834,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, char *name, int name_len, u64 dir, u64
 			  objectid, u8 type);
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, u64 dir, char *name,
-			  int name_len, int mod);
+			  *root, struct btrfs_path *path, u64 dir,
+			  const char *name, int name_len, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      char *name, int name_len);
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 4d8083d92fa..75d6e373e98 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -18,12 +18,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	if (name_len == 1 && *name == '.')
-		key.offset = 1;
-	else if (name_len == 2 && name[0] == '.' && name[1] == '.')
-		key.offset = 2;
-	else
-		ret = btrfs_name_hash(name, name_len, &key.offset);
+	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	btrfs_init_path(&path);
 	data_size = sizeof(*dir_item) + name_len;
@@ -31,7 +26,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret)
 		goto out;
 
-	dir_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				  path.slots[0],
 				  struct btrfs_dir_item);
 	btrfs_set_dir_objectid(dir_item, objectid);
 	btrfs_set_dir_type(dir_item, type);
@@ -45,8 +41,8 @@ out:
 }
 
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, u64 dir, char *name,
-			  int name_len, int mod)
+			  *root, struct btrfs_path *path, u64 dir,
+			  const char *name, int name_len, int mod)
 {
 	int ret;
 	struct btrfs_key key;
@@ -69,7 +65,8 @@ int btrfs_match_dir_item_name(struct btrfs_root *root,
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
 
-	dir_item = btrfs_item_ptr(&path->nodes[0]->leaf, path->slots[0],
+	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
 				  struct btrfs_dir_item);
 	if (btrfs_dir_name_len(dir_item) != name_len)
 		return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 05637f9fd7c..df2061a735c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,165 +1,67 @@
-#define _XOPEN_SOURCE 500
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "kerncompat.h"
-#include "radix-tree.h"
+#include <linux/module.h>
+#include <linux/fs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 
-static int allocated_blocks = 0;
-int cache_max = 10000;
-
-static int check_tree_block(struct btrfs_root *root, struct btrfs_buffer *buf)
+static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
-	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
+	struct btrfs_node *node = btrfs_buffer_node(buf);
+	if (buf->b_blocknr != btrfs_header_blocknr(&node->header))
 		BUG();
-	if (root->node && btrfs_header_parentid(&buf->node.header) !=
-	    btrfs_header_parentid(&root->node->node.header))
+	if (root->node && btrfs_header_parentid(&node->header) !=
+	    btrfs_header_parentid(btrfs_buffer_header(root->node)))
 		BUG();
 	return 0;
 }
 
-static int free_some_buffers(struct btrfs_root *root)
+struct buffer_head *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct list_head *node, *next;
-	struct btrfs_buffer *b;
-	if (root->fs_info->cache_size < cache_max)
-		return 0;
-	list_for_each_safe(node, next, &root->fs_info->cache) {
-		b = list_entry(node, struct btrfs_buffer, cache);
-		if (b->count == 1) {
-			BUG_ON(!list_empty(&b->dirty));
-			list_del_init(&b->cache);
-			btrfs_block_release(root, b);
-			if (root->fs_info->cache_size < cache_max)
-				break;
-		}
-	}
-	return 0;
+	return sb_getblk(root->fs_info->sb, blocknr);
 }
 
-struct btrfs_buffer *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
+struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct btrfs_buffer *buf;
-	int ret;
-
-	buf = malloc(sizeof(struct btrfs_buffer) + root->blocksize);
-	if (!buf)
-		return buf;
-	allocated_blocks++;
-	buf->blocknr = blocknr;
-	buf->count = 2;
-	INIT_LIST_HEAD(&buf->dirty);
-	free_some_buffers(root);
-	radix_tree_preload(GFP_KERNEL);
-	ret = radix_tree_insert(&root->fs_info->cache_radix, blocknr, buf);
-	radix_tree_preload_end();
-	list_add_tail(&buf->cache, &root->fs_info->cache);
-	root->fs_info->cache_size++;
-	if (ret) {
-		free(buf);
-		return NULL;
-	}
-	return buf;
+	return sb_getblk(root->fs_info->sb, blocknr);
 }
 
-struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr)
+struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct btrfs_buffer *buf;
-	buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr);
-	if (buf) {
-		buf->count++;
-	} else {
-		buf = alloc_tree_block(root, blocknr);
-		if (!buf) {
-			BUG();
-			return NULL;
-		}
-	}
-	return buf;
-}
-
-struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
-{
-	loff_t offset = blocknr * root->blocksize;
-	struct btrfs_buffer *buf;
-	int ret;
+	struct buffer_head *buf = sb_bread(root->fs_info->sb, blocknr);
 
-	buf = radix_tree_lookup(&root->fs_info->cache_radix, blocknr);
-	if (buf) {
-		buf->count++;
-	} else {
-		buf = alloc_tree_block(root, blocknr);
-		if (!buf)
-			return NULL;
-		ret = pread(root->fs_info->fp, &buf->node, root->blocksize,
-			    offset);
-		if (ret != root->blocksize) {
-			free(buf);
-			return NULL;
-		}
-	}
+	if (!buf)
+		return buf;
 	if (check_tree_block(root, buf))
 		BUG();
 	return buf;
 }
 
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct btrfs_buffer *buf)
+		     struct buffer_head *buf)
 {
-	if (!list_empty(&buf->dirty))
-		return 0;
-	list_add_tail(&buf->dirty, &root->fs_info->trans);
-	buf->count++;
+	mark_buffer_dirty(buf);
 	return 0;
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct btrfs_buffer *buf)
+		     struct buffer_head *buf)
 {
-	if (!list_empty(&buf->dirty)) {
-		list_del_init(&buf->dirty);
-		btrfs_block_release(root, buf);
-	}
+	clear_buffer_dirty(buf);
 	return 0;
 }
 
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct btrfs_buffer *buf)
+		     struct buffer_head *buf)
 {
-	u64 blocknr = buf->blocknr;
-	loff_t offset = blocknr * root->blocksize;
-	int ret;
-
-	if (buf->blocknr != btrfs_header_blocknr(&buf->node.header))
-		BUG();
-	ret = pwrite(root->fs_info->fp, &buf->node, root->blocksize, offset);
-	if (ret != root->blocksize)
-		return ret;
+	mark_buffer_dirty(buf);
 	return 0;
 }
 
 static int __commit_transaction(struct btrfs_trans_handle *trans, struct
 				btrfs_root *root)
 {
-	struct btrfs_buffer *b;
-	int ret = 0;
-	int wret;
-	while(!list_empty(&root->fs_info->trans)) {
-		b = list_entry(root->fs_info->trans.next, struct btrfs_buffer,
-			       dirty);
-		list_del_init(&b->dirty);
-		wret = write_tree_block(trans, root, b);
-		if (wret)
-			ret = wret;
-		btrfs_block_release(root, b);
-	}
-	return ret;
+	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+	return 0;
 }
 
 static int commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -172,17 +74,17 @@ static int commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_root *inode_root = fs_info->inode_root;
 
 	btrfs_set_root_blocknr(&inode_root->root_item,
-			       inode_root->node->blocknr);
+			       inode_root->node->b_blocknr);
 	ret = btrfs_update_root(trans, tree_root,
 				&inode_root->root_key,
 				&inode_root->root_item);
 	BUG_ON(ret);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-		if (old_extent_block == extent_root->node->blocknr)
+		if (old_extent_block == extent_root->node->b_blocknr)
 			break;
 		btrfs_set_root_blocknr(&extent_root->root_item,
-				       extent_root->node->blocknr);
+				       extent_root->node->b_blocknr);
 		ret = btrfs_update_root(trans, tree_root,
 					&extent_root->root_key,
 					&extent_root->root_item);
@@ -195,7 +97,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 			     btrfs_root *root, struct btrfs_super_block *s)
 {
 	int ret = 0;
-	struct btrfs_buffer *snap = root->commit_root;
+	struct buffer_head *snap = root->commit_root;
 	struct btrfs_key snap_key;
 
 	if (root->commit_root == root->node)
@@ -204,7 +106,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 	memcpy(&snap_key, &root->root_key, sizeof(snap_key));
 	root->root_key.offset++;
 
-	btrfs_set_root_blocknr(&root->root_item, root->node->blocknr);
+	btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
 	BUG_ON(ret);
@@ -220,7 +122,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 	btrfs_finish_extent_commit(trans, root->fs_info->tree_root);
 
 	root->commit_root = root->node;
-	root->node->count++;
+	get_bh(root->node);
 	ret = btrfs_drop_snapshot(trans, root, snap);
 	BUG_ON(ret);
 
@@ -234,7 +136,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
 static int __setup_root(struct btrfs_super_block *super,
 			struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
-			u64 objectid, int fp)
+			u64 objectid)
 {
 	root->node = NULL;
 	root->commit_root = NULL;
@@ -250,11 +152,11 @@ static int find_and_setup_root(struct btrfs_super_block *super,
 			       struct btrfs_root *tree_root,
 			       struct btrfs_fs_info *fs_info,
 			       u64 objectid,
-			       struct btrfs_root *root, int fp)
+			       struct btrfs_root *root)
 {
 	int ret;
 
-	__setup_root(super, root, fs_info, objectid, fp);
+	__setup_root(super, root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
@@ -265,32 +167,26 @@ static int find_and_setup_root(struct btrfs_super_block *super,
 	return 0;
 }
 
-struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *super)
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct buffer_head *sb_buffer,
+			      struct btrfs_super_block *disk_super)
 {
-	int fp;
-
-	fp = open(filename, O_CREAT | O_RDWR, 0600);
-	if (fp < 0) {
-		return NULL;
-	}
-	return open_ctree_fd(fp, super);
-}
-
-struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super)
-{
-	struct btrfs_root *root = malloc(sizeof(struct btrfs_root));
-	struct btrfs_root *extent_root = malloc(sizeof(struct btrfs_root));
-	struct btrfs_root *tree_root = malloc(sizeof(struct btrfs_root));
-	struct btrfs_root *inode_root = malloc(sizeof(struct btrfs_root));
-	struct btrfs_fs_info *fs_info = malloc(sizeof(*fs_info));
+	struct btrfs_root *root = kmalloc(sizeof(struct btrfs_root),
+					  GFP_NOFS);
+	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
+						 GFP_NOFS);
+	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
+					       GFP_NOFS);
+	struct btrfs_root *inode_root = kmalloc(sizeof(struct btrfs_root),
+						GFP_NOFS);
+	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
+						GFP_NOFS);
 	int ret;
 
-	INIT_RADIX_TREE(&fs_info->cache_radix, GFP_KERNEL);
+	/* FIXME: don't be stupid */
+	if (!btrfs_super_root(disk_super))
+		return NULL;
 	INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL);
-	INIT_LIST_HEAD(&fs_info->trans);
-	INIT_LIST_HEAD(&fs_info->cache);
-	fs_info->cache_size = 0;
-	fs_info->fp = fp;
 	fs_info->running_transaction = NULL;
 	fs_info->fs_root = root;
 	fs_info->tree_root = tree_root;
@@ -298,36 +194,31 @@ struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super)
 	fs_info->inode_root = inode_root;
 	fs_info->last_inode_alloc = 0;
 	fs_info->last_inode_alloc_dirid = 0;
-	fs_info->disk_super = super;
+	fs_info->disk_super = disk_super;
+	fs_info->sb_buffer = sb_buffer;
+	fs_info->sb = sb;
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
-	ret = pread(fp, super, sizeof(struct btrfs_super_block),
-		     BTRFS_SUPER_INFO_OFFSET);
-	if (ret == 0 || btrfs_super_root(super) == 0) {
-		BUG();
-		return NULL;
-	}
-	BUG_ON(ret < 0);
-
-	__setup_root(super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID, fp);
-	tree_root->node = read_tree_block(tree_root, btrfs_super_root(super));
+	__setup_root(disk_super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+	tree_root->node = read_tree_block(tree_root,
+					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
 
-	ret = find_and_setup_root(super, tree_root, fs_info,
-				  BTRFS_EXTENT_TREE_OBJECTID, extent_root, fp);
+	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(super, tree_root, fs_info,
-				  BTRFS_INODE_MAP_OBJECTID, inode_root, fp);
+	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+				  BTRFS_INODE_MAP_OBJECTID, inode_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(super, tree_root, fs_info,
-				  BTRFS_FS_TREE_OBJECTID, root, fp);
+	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+				  BTRFS_FS_TREE_OBJECTID, root);
 	BUG_ON(ret);
 
 	root->commit_root = root->node;
-	root->node->count++;
+	get_bh(root->node);
 	root->ref_cows = 1;
 	root->fs_info->generation = root->root_key.offset + 1;
 	return root;
@@ -336,8 +227,11 @@ struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super)
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_super_block *s)
 {
+	return 0;
+#if 0
 	int ret;
-	btrfs_set_super_root(s, root->fs_info->tree_root->node->blocknr);
+	btrfs_set_super_root(s, root->fs_info->tree_root->node->b_blocknr);
+
 	ret = pwrite(root->fs_info->fp, s, sizeof(*s),
 		     BTRFS_SUPER_INFO_OFFSET);
 	if (ret != sizeof(*s)) {
@@ -345,35 +239,38 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		return ret;
 	}
 	return 0;
+#endif
 }
 
 static int drop_cache(struct btrfs_root *root)
 {
+	return 0;
+#if 0
 	while(!list_empty(&root->fs_info->cache)) {
-		struct btrfs_buffer *b = list_entry(root->fs_info->cache.next,
-						    struct btrfs_buffer,
+		struct buffer_head *b = list_entry(root->fs_info->cache.next,
+						    struct buffer_head,
 						    cache);
 		list_del_init(&b->cache);
 		btrfs_block_release(root, b);
 	}
 	return 0;
+#endif
 }
-int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
+
+int close_ctree(struct btrfs_root *root)
 {
 	int ret;
 	struct btrfs_trans_handle *trans;
 
 	trans = root->fs_info->running_transaction;
-	btrfs_commit_transaction(trans, root, s);
+	btrfs_commit_transaction(trans, root, root->fs_info->disk_super);
 	ret = commit_tree_roots(trans, root->fs_info);
 	BUG_ON(ret);
 	ret = __commit_transaction(trans, root);
 	BUG_ON(ret);
-	write_ctree_super(trans, root, s);
+	write_ctree_super(trans, root, root->fs_info->disk_super);
 	drop_cache(root);
-	BUG_ON(!list_empty(&root->fs_info->trans));
 
-	close(root->fs_info->fp);
 	if (root->node)
 		btrfs_block_release(root, root->node);
 	if (root->fs_info->extent_root->node)
@@ -386,29 +283,17 @@ int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s)
 		btrfs_block_release(root->fs_info->tree_root,
 				    root->fs_info->tree_root->node);
 	btrfs_block_release(root, root->commit_root);
-	free(root);
-	printf("on close %d blocks are allocated\n", allocated_blocks);
+	btrfs_block_release(root, root->fs_info->sb_buffer);
+	kfree(root->fs_info->extent_root);
+	kfree(root->fs_info->inode_root);
+	kfree(root->fs_info->tree_root);
+	kfree(root->fs_info);
+	kfree(root);
 	return 0;
 }
 
-void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf)
+void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
-	buf->count--;
-	if (buf->count < 0)
-		BUG();
-	if (buf->count == 0) {
-		BUG_ON(!list_empty(&buf->cache));
-		BUG_ON(!list_empty(&buf->dirty));
-		if (!radix_tree_lookup(&root->fs_info->cache_radix,
-				       buf->blocknr))
-			BUG();
-		radix_tree_delete(&root->fs_info->cache_radix, buf->blocknr);
-		memset(buf, 0, sizeof(*buf));
-		free(buf);
-		BUG_ON(allocated_blocks == 0);
-		allocated_blocks--;
-		BUG_ON(root->fs_info->cache_size == 0);
-		root->fs_info->cache_size--;
-	}
+	brelse(buf);
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index d888cf5c350..7f4bb729b73 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -1,36 +1,41 @@
 #ifndef __DISKIO__
 #define __DISKIO__
-#include "list.h"
 
-struct btrfs_buffer {
-	u64 blocknr;
-	int count;
-	struct list_head dirty;
-	struct list_head cache;
-	union {
-		struct btrfs_node node;
-		struct btrfs_leaf leaf;
-	};
-};
+#include <linux/buffer_head.h>
 
-struct btrfs_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
-struct btrfs_buffer *find_tree_block(struct btrfs_root *root, u64 blocknr);
+#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+
+static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh)
+{
+	return (struct btrfs_node *)bh->b_data;
+}
+
+static inline struct btrfs_leaf *btrfs_buffer_leaf(struct buffer_head *bh)
+{
+	return (struct btrfs_leaf *)bh->b_data;
+}
+
+static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh)
+{
+	return &((struct btrfs_node *)bh->b_data)->header;
+}
+
+struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr);
+struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr);
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct btrfs_buffer *buf);
+		     struct buffer_head *buf);
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct btrfs_buffer *buf);
+		     struct buffer_head *buf);
 int clean_tree_block(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct btrfs_buffer *buf);
+		     struct btrfs_root *root, struct buffer_head *buf);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root
 			     *root, struct btrfs_super_block *s);
-struct btrfs_root *open_ctree(char *filename, struct btrfs_super_block *s);
-struct btrfs_root *open_ctree_fd(int fp, struct btrfs_super_block *super);
-int close_ctree(struct btrfs_root *root, struct btrfs_super_block *s);
-void btrfs_block_release(struct btrfs_root *root, struct btrfs_buffer *buf);
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct buffer_head *sb_buffer,
+			      struct btrfs_super_block *disk_super);
+int close_ctree(struct btrfs_root *root);
+void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		      struct btrfs_super_block *s);
 int mkfs(int fd, u64 num_blocks, u32 blocksize);
-
-#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
-
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53a7550b5c1..e3af2c03568 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -10,9 +10,8 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    search_end, struct btrfs_key *ins);
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
-static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *extent_root);
-
+static int del_pending_extents(struct btrfs_trans_handle *trans, struct
+			       btrfs_root *extent_root);
 /*
  * pending extents are blocks that we're trying to allocate in the extent
  * map while trying to grow the map because of other allocations.  To avoid
@@ -21,6 +20,7 @@ static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
  * manner for deletes.
  */
 #define CTREE_EXTENT_PENDING_DEL 0
+#define CTREE_EXTENT_PINNED 1
 
 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, u64 blocknr)
@@ -45,15 +45,14 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret != 0)
 		BUG();
 	BUG_ON(ret != 0);
-	l = &path.nodes[0]->leaf;
+	l = btrfs_buffer_leaf(path.nodes[0]);
 	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
 
-	BUG_ON(list_empty(&path.nodes[0]->dirty));
 	btrfs_release_path(root->fs_info->extent_root, &path);
 	finish_current_insert(trans, root->fs_info->extent_root);
-	run_pending(trans, root->fs_info->extent_root);
+	del_pending_extents(trans, root->fs_info->extent_root);
 	return 0;
 }
 
@@ -74,7 +73,7 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 				0, 0);
 	if (ret != 0)
 		BUG();
-	l = &path.nodes[0]->leaf;
+	l = btrfs_buffer_leaf(path.nodes[0]);
 	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	*refs = btrfs_extent_refs(item);
 	btrfs_release_path(root->fs_info->extent_root, &path);
@@ -82,18 +81,20 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct btrfs_buffer *buf)
+		  struct buffer_head *buf)
 {
 	u64 blocknr;
+	struct btrfs_node *buf_node;
 	int i;
 
 	if (!root->ref_cows)
 		return 0;
-	if (btrfs_is_leaf(&buf->node))
+	buf_node = btrfs_buffer_node(buf);
+	if (btrfs_is_leaf(buf_node))
 		return 0;
 
-	for (i = 0; i < btrfs_header_nritems(&buf->node.header); i++) {
-		blocknr = btrfs_node_blockptr(&buf->node, i);
+	for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) {
+		blocknr = btrfs_node_blockptr(buf_node, i);
 		inc_block_ref(trans, root, blocknr);
 	}
 	return 0;
@@ -108,9 +109,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup(&root->fs_info->pinned_radix,
+		ret = radix_tree_gang_lookup_tag(&root->fs_info->pinned_radix,
 					     (void **)gang, 0,
-					     ARRAY_SIZE(gang));
+					     ARRAY_SIZE(gang),
+					     CTREE_EXTENT_PINNED);
 		if (!ret)
 			break;
 		if (!first)
@@ -137,7 +139,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item,
-		btrfs_header_parentid(&extent_root->node->node.header));
+		btrfs_header_parentid(btrfs_buffer_header(extent_root->node)));
 	ins.offset = 1;
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
@@ -156,11 +158,24 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	return 0;
 }
 
+static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
+{
+	int err;
+	err = radix_tree_insert(&root->fs_info->pinned_radix,
+				blocknr, (void *)blocknr);
+	BUG_ON(err);
+	if (err)
+		return err;
+	radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr,
+			   tag);
+	return 0;
+}
+
 /*
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr, u64 num_blocks, int pin)
+			 *root, u64 blocknr, u64 num_blocks)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -171,7 +186,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key ins;
 	u32 refs;
 
-	BUG_ON(pin && num_blocks != 1);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -186,26 +200,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		printk("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
-	ei = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
+	ei = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
 			    struct btrfs_extent_item);
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
 	if (refs == 0) {
 		u64 super_blocks_used;
-		if (pin) {
-			int err;
-			radix_tree_preload(GFP_KERNEL);
-			err = radix_tree_insert(&info->pinned_radix,
-						blocknr, (void *)blocknr);
-			BUG_ON(err);
-			radix_tree_preload_end();
-		}
 		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, &path);
-		if (!pin && extent_root->fs_info->last_insert.objectid >
+		if (extent_root->fs_info->last_insert.objectid >
 		    blocknr)
 			extent_root->fs_info->last_insert.objectid = blocknr;
 		if (ret)
@@ -224,39 +230,32 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root)
 {
 	int ret;
-	struct btrfs_buffer *gang[4];
+	int wret;
+	int err = 0;
+	unsigned long gang[4];
 	int i;
+	struct radix_tree_root *radix = &extent_root->fs_info->pinned_radix;
 
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(
-					&extent_root->fs_info->cache_radix,
+					&extent_root->fs_info->pinned_radix,
 					(void **)gang, 0,
 					ARRAY_SIZE(gang),
 					CTREE_EXTENT_PENDING_DEL);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			ret = __free_extent(trans, extent_root,
-					    gang[i]->blocknr, 1, 1);
-			radix_tree_tag_clear(&extent_root->fs_info->cache_radix,
-					     gang[i]->blocknr,
+			radix_tree_tag_set(radix, gang[i], CTREE_EXTENT_PINNED);
+			radix_tree_tag_clear(radix, gang[i],
 					     CTREE_EXTENT_PENDING_DEL);
-			btrfs_block_release(extent_root, gang[i]);
+			wret = __free_extent(trans, extent_root, gang[i], 1);
+			if (wret)
+				err = wret;
 		}
 	}
-	return 0;
+	return err;
 }
 
-static int run_pending(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *extent_root)
-{
-	while(radix_tree_tagged(&extent_root->fs_info->cache_radix,
-				CTREE_EXTENT_PENDING_DEL))
-		del_pending_extents(trans, extent_root);
-	return 0;
-}
-
-
 /*
  * remove an extent from the root, returns 0 on success
  */
@@ -264,18 +263,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_buffer *t;
+	struct buffer_head *t;
 	int pending_ret;
 	int ret;
 
 	if (root == extent_root) {
 		t = find_tree_block(root, blocknr);
-		radix_tree_tag_set(&root->fs_info->cache_radix, blocknr,
-				   CTREE_EXTENT_PENDING_DEL);
+		pin_down_block(root, blocknr, CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
-	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
-	pending_ret = run_pending(trans, root->fs_info->extent_root);
+	if (pin) {
+		ret = pin_down_block(root, blocknr, CTREE_EXTENT_PINNED);
+		BUG_ON(ret);
+	}
+	ret = __free_extent(trans, root, blocknr, num_blocks);
+	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
 
@@ -296,14 +298,16 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	u64 hole_size = 0;
 	int slot = 0;
-	u64 last_block;
+	u64 last_block = 0;
 	u64 test_block;
 	int start_found;
 	struct btrfs_leaf *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	int total_needed = num_blocks;
+	int level;
 
-	total_needed += (btrfs_header_level(&root->node->node.header) + 1) * 3;
+	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	total_needed += (level + 1) * 3;
 	if (root->fs_info->last_insert.objectid > search_start)
 		search_start = root->fs_info->last_insert.objectid;
 
@@ -323,7 +327,7 @@ check_failed:
 		path.slots[0]--;
 
 	while (1) {
-		l = &path.nodes[0]->leaf;
+		l = btrfs_buffer_leaf(path.nodes[0]);
 		slot = path.slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
 			ret = btrfs_next_leaf(root, &path);
@@ -429,7 +433,7 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 				sizeof(extent_item));
 
 	finish_current_insert(trans, extent_root);
-	pending_ret = run_pending(trans, extent_root);
+	pending_ret = del_pending_extents(trans, extent_root);
 	if (ret)
 		return ret;
 	if (pending_ret)
@@ -441,16 +445,15 @@ static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
  */
-struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root)
 {
 	struct btrfs_key ins;
 	int ret;
-	struct btrfs_buffer *buf;
+	struct buffer_head *buf;
 
 	ret = alloc_extent(trans, root, 1, 0, (unsigned long)-1,
-			   btrfs_header_parentid(&root->node->node.header),
-			   &ins);
+		btrfs_header_parentid(btrfs_buffer_header(root->node)), &ins);
 	if (ret) {
 		BUG();
 		return NULL;
@@ -467,13 +470,13 @@ struct btrfs_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, int *level)
 {
-	struct btrfs_buffer *next;
-	struct btrfs_buffer *cur;
+	struct buffer_head *next;
+	struct buffer_head *cur;
 	u64 blocknr;
 	int ret;
 	u32 refs;
 
-	ret = lookup_block_ref(trans, root, path->nodes[*level]->blocknr,
+	ret = lookup_block_ref(trans, root, path->nodes[*level]->b_blocknr,
 			       &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -484,9 +487,10 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	while(*level > 0) {
 		cur = path->nodes[*level];
 		if (path->slots[*level] >=
-		    btrfs_header_nritems(&cur->node.header))
+		    btrfs_header_nritems(btrfs_buffer_header(cur)))
 			break;
-		blocknr = btrfs_node_blockptr(&cur->node, path->slots[*level]);
+		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+					      path->slots[*level]);
 		ret = lookup_block_ref(trans, root, blocknr, &refs);
 		if (refs != 1 || *level == 1) {
 			path->slots[*level]++;
@@ -499,12 +503,12 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (path->nodes[*level-1])
 			btrfs_block_release(root, path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
-		*level = btrfs_header_level(&next->node.header);
+		*level = btrfs_header_level(btrfs_buffer_header(next));
 		path->slots[*level] = 0;
 	}
 out:
-	ret = btrfs_free_extent(trans, root, path->nodes[*level]->blocknr, 1,
-				1);
+	ret = btrfs_free_extent(trans, root, path->nodes[*level]->b_blocknr,
+				1, 1);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -525,14 +529,14 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot <
-		    btrfs_header_nritems(&path->nodes[i]->node.header)- 1) {
+		if (slot < btrfs_header_nritems(
+		    btrfs_buffer_header(path->nodes[i])) - 1) {
 			path->slots[i]++;
 			*level = i;
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
-						path->nodes[*level]->blocknr,
+						path->nodes[*level]->b_blocknr,
 						1, 1);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -549,7 +553,7 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
  * decremented.
  */
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct btrfs_buffer *snap)
+			*root, struct buffer_head *snap)
 {
 	int ret = 0;
 	int wret;
@@ -560,7 +564,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_init_path(&path);
 
-	level = btrfs_header_level(&snap->node.header);
+	level = btrfs_header_level(btrfs_buffer_header(snap));
 	orig_level = level;
 	path.nodes[level] = snap;
 	path.slots[level] = 0;
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 6c2a71a46c7..22519b8e0cf 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -10,6 +10,7 @@
  * License.
  */
 
+#include <linux/types.h>
 #define DELTA 0x9E3779B9
 
 static void TEA_transform(__u32 buf[2], __u32 const in[])
@@ -63,6 +64,14 @@ int btrfs_name_hash(const char *name, int len, u64 *hash_result)
 	const char	*p;
 	__u32		in[8], buf[2];
 
+	if (len == 1 && *name == '.') {
+		*hash_result = 1;
+		return 0;
+	} else if (len == 2 && name[0] == '.' && name[1] == '.') {
+		*hash_result = 2;
+		return 0;
+	}
+
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
 	buf[1] = 0xefcdab89;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c7fda3bf7b2..c45aec258bd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -15,7 +15,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 hole_size = 0;
 	int slot = 0;
-	u64 last_ino;
+	u64 last_ino = 0;
 	int start_found;
 	struct btrfs_leaf *l;
 	struct btrfs_root *root = fs_root->fs_info->inode_root;
@@ -40,7 +40,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		path.slots[0]--;
 
 	while (1) {
-		l = &path.nodes[0]->leaf;
+		l = btrfs_buffer_leaf(path.nodes[0]);
 		slot = path.slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
 			ret = btrfs_next_leaf(root, &path);
@@ -105,8 +105,8 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto out;
 
-	inode_item = btrfs_item_ptr(&path.nodes[0]->leaf, path.slots[0],
-				  struct btrfs_inode_map_item);
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				    path.slots[0], struct btrfs_inode_map_item);
 	btrfs_cpu_key_to_disk(&inode_item->key, location);
 out:
 	btrfs_release_path(inode_root, &path);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index aa2d3fac880..c8ee938c125 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -17,7 +17,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	printk("leaf %Lu total ptrs %d free space %d\n",
 		btrfs_header_blocknr(&l->header), nr,
 		btrfs_leaf_free_space(root, l));
-	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
 		type = btrfs_disk_key_type(&item->key);
@@ -67,10 +66,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
 			break;
 		};
-		fflush(stdout);
 	}
 }
-void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
+
+void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t)
 {
 	int i;
 	u32 nr;
@@ -78,16 +77,16 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 
 	if (!t)
 		return;
-	c = &t->node;
+	c = btrfs_buffer_node(t);
 	nr = btrfs_header_nritems(&c->header);
 	if (btrfs_is_leaf(c)) {
 		btrfs_print_leaf(root, (struct btrfs_leaf *)c);
 		return;
 	}
-	printk("node %Lu level %d total ptrs %d free spc %u\n", t->blocknr,
-	        btrfs_header_level(&c->header), nr,
-		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
-	fflush(stdout);
+	printk("node %Lu level %d total ptrs %d free spc %u\n",
+	       btrfs_header_blocknr(&c->header),
+	       btrfs_header_level(&c->header), nr,
+	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		printk("\tkey %d (%Lu %u %Lu) block %Lu\n",
 		       i,
@@ -95,12 +94,11 @@ void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t)
 		       c->ptrs[i].key.flags,
 		       c->ptrs[i].key.offset,
 		       btrfs_node_blockptr(c, i));
-		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
-		struct btrfs_buffer *next_buf = read_tree_block(root,
+		struct buffer_head *next_buf = read_tree_block(root,
 						btrfs_node_blockptr(c, i));
-		struct btrfs_node *next = &next_buf->node;
+		struct btrfs_node *next = btrfs_buffer_node(next_buf);
 		if (btrfs_is_leaf(next) &&
 		    btrfs_header_level(&c->header) != 1)
 			BUG();
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 0882ca904ec..396041a05cf 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,5 +1,5 @@
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
-void btrfs_print_tree(struct btrfs_root *root, struct btrfs_buffer *t);
+void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t);
 #endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 52c83be4b30..a4554c007ef 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,7 +21,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
-	l = &path.nodes[0]->leaf;
+	l = btrfs_buffer_leaf(path.nodes[0]);
 	BUG_ON(path.slots[0] == 0);
 	slot = path.slots[0] - 1;
 	if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) {
@@ -51,7 +51,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	l = &path.nodes[0]->leaf;
+	l = btrfs_buffer_leaf(path.nodes[0]);
 	slot = path.slots[0];
 	memcpy(btrfs_item_ptr(l, slot, struct btrfs_root_item), item,
 		sizeof(*item));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4ae76044aea..ccc056aad69 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1,4 +1,5 @@
 #include <linux/module.h>
+#include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -8,13 +9,18 @@
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include "ctree.h"
+#include "disk-io.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123682E
+
+static struct inode_operations btrfs_dir_inode_operations;
+static struct super_operations btrfs_super_ops;
+static struct file_operations btrfs_dir_file_operations;
+
 #if 0
 /* some random number */
 
 static struct super_operations ramfs_ops;
-static struct inode_operations ramfs_dir_inode_operations;
 
 static struct backing_dev_info ramfs_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
@@ -129,46 +135,243 @@ static struct inode_operations ramfs_dir_inode_operations = {
 };
 #endif
 
-struct inode *btrfs_get_inode(struct super_block *sb, int mode, dev_t dev)
+static void btrfs_read_locked_inode(struct inode *inode)
 {
-	struct inode * inode = new_inode(sb);
+	struct btrfs_path path;
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	int ret;
+printk("read locked inode %lu\n", inode->i_ino);
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
+	if (ret) {
+		make_bad_inode(inode);
+		return;
+	}
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				  path.slots[0],
+				  struct btrfs_inode_item);
 
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current->fsuid;
-		inode->i_gid = current->fsgid;
-		inode->i_blocks = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+printk("found locked inode %lu\n", inode->i_ino);
+	inode->i_mode = btrfs_inode_mode(inode_item);
+	inode->i_nlink = btrfs_inode_nlink(inode_item);
+	inode->i_uid = btrfs_inode_uid(inode_item);
+	inode->i_gid = btrfs_inode_gid(inode_item);
+	inode->i_size = btrfs_inode_size(inode_item);
+	inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
+	inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
+	inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
+	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
+	inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
+	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
+	inode->i_blocks = btrfs_inode_nblocks(inode_item);
+	inode->i_generation = btrfs_inode_generation(inode_item);
+printk("about to release\n");
+	btrfs_release_path(root, &path);
+	switch (inode->i_mode & S_IFMT) {
+#if 0
+	default:
+		init_special_inode(inode, inode->i_mode,
+				   btrfs_inode_rdev(inode_item));
+		break;
+#endif
+	case S_IFREG:
+printk("inode %lu now a file\n", inode->i_ino);
+		break;
+	case S_IFDIR:
+printk("inode %lu now a directory\n", inode->i_ino);
+		inode->i_op = &btrfs_dir_inode_operations;
+		inode->i_fop = &btrfs_dir_file_operations;
+		break;
+	case S_IFLNK:
+printk("inode %lu now a link\n", inode->i_ino);
+		// inode->i_op = &page_symlink_inode_operations;
+		break;
 	}
-	return inode;
+printk("returning!\n");
+	return;
 }
 
-static struct super_operations btrfs_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
-};
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+			      ino_t *ino)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct btrfs_dir_item *di;
+	struct btrfs_path path;
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	int ret;
+
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_dir_item(NULL, root, &path, dir->i_ino, name,
+				    namelen, 0);
+	if (ret) {
+		*ino = 0;
+		goto out;
+	}
+	di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			    struct btrfs_dir_item);
+	*ino = btrfs_dir_objectid(di);
+out:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode * inode;
+	ino_t ino;
+	int ret;
+
+	if (dentry->d_name.len > BTRFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ret = btrfs_inode_by_name(dir, dentry, &ino);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	inode = NULL;
+	if (ino) {
+printk("lookup on %.*s returns %lu\n", dentry->d_name.len, dentry->d_name.name, ino);
+		inode = iget(dir->i_sb, ino);
+		if (!inode)
+			return ERR_PTR(-EACCES);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_item *item;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	struct btrfs_path path;
+	int ret;
+	u32 nritems;
+	struct btrfs_leaf *leaf;
+	int slot;
+	int advance;
+	unsigned char d_type = DT_UNKNOWN;
+	int over;
+
+	key.objectid = inode->i_ino;
+printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos);
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	key.offset = filp->f_pos;
+	btrfs_init_path(&path);
+	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	if (ret < 0) {
+		goto err;
+	}
+printk("first ret %d\n", ret);
+	advance = filp->f_pos > 0 && ret != 0;
+	while(1) {
+		leaf = btrfs_buffer_leaf(path.nodes[0]);
+		nritems = btrfs_header_nritems(&leaf->header);
+		slot = path.slots[0];
+printk("leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot);
+		if (advance) {
+printk("advancing!\n");
+			if (slot == nritems -1) {
+				ret = btrfs_next_leaf(root, &path);
+				if (ret)
+					break;
+				leaf = btrfs_buffer_leaf(path.nodes[0]);
+				nritems = btrfs_header_nritems(&leaf->header);
+				slot = path.slots[0];
+printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot);
+			} else {
+				slot++;
+				path.slots[0]++;
+			}
+		}
+		advance = 1;
+		item = leaf->items + slot;
+printk("item key %Lu %u %Lu\n", btrfs_disk_key_objectid(&item->key),
+       btrfs_disk_key_flags(&item->key), btrfs_disk_key_offset(&item->key));
+		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
+			break;
+		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY)
+			continue;
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di),
+       (const char *)(di + 1), btrfs_dir_objectid(di));
+		over = filldir(dirent, (const char *)(di + 1),
+			       btrfs_dir_name_len(di),
+			       btrfs_disk_key_offset(&item->key),
+			       btrfs_dir_objectid(di), d_type);
+		if (over)
+			break;
+		filp->f_pos = btrfs_disk_key_offset(&item->key) + 1;
+	}
+printk("filldir all done\n");
+	ret = 0;
+err:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+static void btrfs_put_super (struct super_block * sb)
+{
+	struct btrfs_root *root = btrfs_sb(sb);
+	int ret;
+
+	ret = close_ctree(root);
+	if (ret) {
+		printk("close ctree returns %d\n", ret);
+	}
+	sb->s_fs_info = NULL;
+}
 
 static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 {
 	struct inode * inode;
-	struct dentry * root;
+	struct dentry * root_dentry;
+	struct btrfs_super_block *disk_super;
+	struct buffer_head *bh;
+	struct btrfs_root *root;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
-	sb->s_op = &btrfs_ops;
+	sb->s_op = &btrfs_super_ops;
 	sb->s_time_gran = 1;
-	inode = btrfs_get_inode(sb, S_IFDIR | 0755, 0);
+
+	bh = sb_bread(sb, BTRFS_SUPER_INFO_OFFSET / sb->s_blocksize);
+	if (!bh) {
+		printk("btrfs: unable to read on disk super\n");
+		return -EIO;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	root = open_ctree(sb, bh, disk_super);
+	sb->s_fs_info = root;
+	if (!root) {
+		printk("btrfs: open_ctree failed\n");
+		return -EIO;
+	}
+	printk("read in super total blocks %Lu root %Lu\n",
+	       btrfs_super_total_blocks(disk_super),
+	       btrfs_super_root_dir(disk_super));
+
+	inode = iget_locked(sb, btrfs_super_root_dir(disk_super));
 	if (!inode)
 		return -ENOMEM;
+	if (inode->i_state & I_NEW) {
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+	}
 
-	root = d_alloc_root(inode);
-	if (!root) {
+	root_dentry = d_alloc_root(inode);
+	if (!root_dentry) {
 		iput(inode);
 		return -ENOMEM;
 	}
-	sb->s_root = root;
+	sb->s_root = root_dentry;
+
 	return 0;
 }
 
@@ -187,6 +390,24 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+static struct super_operations btrfs_super_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.put_super	= btrfs_put_super, /* closes the ctree */
+	.read_inode	= btrfs_read_locked_inode, /* fills an inode from its item */
+};
+
+static struct inode_operations btrfs_dir_inode_operations = {
+	.lookup		= btrfs_lookup, /* installed for S_IFDIR inodes only */
+};
+
+static struct file_operations btrfs_dir_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= btrfs_readdir, /* walks the BTRFS_DIR_ITEM_KEY items */
+};
+
+
 static int __init init_btrfs_fs(void)
 {
 	printk("btrfs loaded!\n");
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 3adb6e69fe4..9ab27b7491c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -10,7 +10,7 @@ struct btrfs_trans_handle {
 static inline struct btrfs_trans_handle *
 btrfs_start_transaction(struct btrfs_root *root, int num_blocks)
 {
-	struct btrfs_trans_handle *h = malloc(sizeof(*h));
+	struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS);
 	h->transid = root->root_key.offset;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
@@ -21,7 +21,7 @@ static inline void btrfs_free_transaction(struct btrfs_root *root,
 					  struct btrfs_trans_handle *handle)
 {
 	memset(handle, 0, sizeof(*handle));
-	free(handle);
+	kfree(handle);
 }
 
 #endif
-- 
cgit v1.2.3


From 79154b1b5bcf87903db7ff16a30b360b78d6fe4f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 22 Mar 2007 15:59:16 -0400
Subject: Btrfs: transaction rework

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.h       |   4 +-
 fs/btrfs/disk-io.c     | 115 +++-------------------------
 fs/btrfs/disk-io.h     |   8 +-
 fs/btrfs/super.c       |  18 -----
 fs/btrfs/transaction.c | 199 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h |  35 +++++----
 7 files changed, 235 insertions(+), 146 deletions(-)
 create mode 100644 fs/btrfs/transaction.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 1a0fb742239..be7d74cdca0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,7 @@ ifneq ($(KERNELRELEASE),)
 
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
-	   hash.o file-item.o inode-item.o inode-map.o disk-io.o
+	   hash.o file-item.o inode-item.o inode-map.o disk-io.o transaction.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7748eecd930..4c19a3f12af 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -5,6 +5,7 @@
 #include <linux/fs.h>
 
 struct btrfs_trans_handle;
+struct btrfs_transaction;
 
 #define BTRFS_MAGIC "_BtRfS_M"
 
@@ -224,10 +225,11 @@ struct btrfs_fs_info {
 	u64 last_inode_alloc;
 	u64 last_inode_alloc_dirid;
 	u64 generation;
-	struct btrfs_trans_handle *running_transaction;
+	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct buffer_head *sb_buffer;
 	struct super_block *sb;
+	struct mutex trans_mutex;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index df2061a735c..9cacca0c525 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,89 +50,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 }
 
-int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf)
-{
-	mark_buffer_dirty(buf);
-	return 0;
-}
-
-static int __commit_transaction(struct btrfs_trans_handle *trans, struct
-				btrfs_root *root)
-{
-	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
-	return 0;
-}
-
-static int commit_tree_roots(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	int ret;
-	u64 old_extent_block;
-	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_root *inode_root = fs_info->inode_root;
-
-	btrfs_set_root_blocknr(&inode_root->root_item,
-			       inode_root->node->b_blocknr);
-	ret = btrfs_update_root(trans, tree_root,
-				&inode_root->root_key,
-				&inode_root->root_item);
-	BUG_ON(ret);
-	while(1) {
-		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-		if (old_extent_block == extent_root->node->b_blocknr)
-			break;
-		btrfs_set_root_blocknr(&extent_root->root_item,
-				       extent_root->node->b_blocknr);
-		ret = btrfs_update_root(trans, tree_root,
-					&extent_root->root_key,
-					&extent_root->root_item);
-		BUG_ON(ret);
-	}
-	return 0;
-}
-
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct
-			     btrfs_root *root, struct btrfs_super_block *s)
-{
-	int ret = 0;
-	struct buffer_head *snap = root->commit_root;
-	struct btrfs_key snap_key;
-
-	if (root->commit_root == root->node)
-		return 0;
-
-	memcpy(&snap_key, &root->root_key, sizeof(snap_key));
-	root->root_key.offset++;
-
-	btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
-				&root->root_key, &root->root_item);
-	BUG_ON(ret);
-
-	ret = commit_tree_roots(trans, root->fs_info);
-	BUG_ON(ret);
-
-	ret = __commit_transaction(trans, root);
-	BUG_ON(ret);
-
-	write_ctree_super(trans, root, s);
-	btrfs_finish_extent_commit(trans, root->fs_info->extent_root);
-	btrfs_finish_extent_commit(trans, root->fs_info->tree_root);
-
-	root->commit_root = root->node;
-	get_bh(root->node);
-	ret = btrfs_drop_snapshot(trans, root, snap);
-	BUG_ON(ret);
-
-	ret = btrfs_del_root(trans, root->fs_info->tree_root, &snap_key);
-	BUG_ON(ret);
-	root->fs_info->generation = root->root_key.offset + 1;
-
-	return ret;
-}
-
 static int __setup_root(struct btrfs_super_block *super,
 			struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
@@ -197,6 +114,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->disk_super = disk_super;
 	fs_info->sb_buffer = sb_buffer;
 	fs_info->sb = sb;
+	mutex_init(&fs_info->trans_mutex);
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
@@ -225,7 +143,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 }
 
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_super_block *s)
+		      *root)
 {
 	return 0;
 #if 0
@@ -242,34 +160,19 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 }
 
-static int drop_cache(struct btrfs_root *root)
-{
-	return 0;
-#if 0
-	while(!list_empty(&root->fs_info->cache)) {
-		struct buffer_head *b = list_entry(root->fs_info->cache.next,
-						    struct buffer_head,
-						    cache);
-		list_del_init(&b->cache);
-		btrfs_block_release(root, b);
-	}
-	return 0;
-#endif
-}
-
 int close_ctree(struct btrfs_root *root)
 {
 	int ret;
 	struct btrfs_trans_handle *trans;
 
-	trans = root->fs_info->running_transaction;
-	btrfs_commit_transaction(trans, root, root->fs_info->disk_super);
-	ret = commit_tree_roots(trans, root->fs_info);
-	BUG_ON(ret);
-	ret = __commit_transaction(trans, root);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_commit_transaction(trans, root);
+	/* run commit again to  drop the original snapshot */
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_commit_transaction(trans, root);
+	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
-	write_ctree_super(trans, root, root->fs_info->disk_super);
-	drop_cache(root);
+	write_ctree_super(NULL, root);
 
 	if (root->node)
 		btrfs_block_release(root, root->node);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 7f4bb729b73..099f7eea0ec 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -28,14 +28,14 @@ int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf);
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct buffer_head *buf);
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root
-			     *root, struct btrfs_super_block *s);
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root);
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct buffer_head *sb_buffer,
 			      struct btrfs_super_block *disk_super);
 int close_ctree(struct btrfs_root *root);
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
-int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		      struct btrfs_super_block *s);
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root);
 int mkfs(int fd, u64 num_blocks, u32 blocksize);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ccc056aad69..62043082584 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -141,7 +141,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	int ret;
-printk("read locked inode %lu\n", inode->i_ino);
 	btrfs_init_path(&path);
 	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
 	if (ret) {
@@ -152,7 +151,6 @@ printk("read locked inode %lu\n", inode->i_ino);
 				  path.slots[0],
 				  struct btrfs_inode_item);
 
-printk("found locked inode %lu\n", inode->i_ino);
 	inode->i_mode = btrfs_inode_mode(inode_item);
 	inode->i_nlink = btrfs_inode_nlink(inode_item);
 	inode->i_uid = btrfs_inode_uid(inode_item);
@@ -166,7 +164,6 @@ printk("found locked inode %lu\n", inode->i_ino);
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
-printk("about to release\n");
 	btrfs_release_path(root, &path);
 	switch (inode->i_mode & S_IFMT) {
 #if 0
@@ -176,19 +173,15 @@ printk("about to release\n");
 		break;
 #endif
 	case S_IFREG:
-printk("inode %lu now a file\n", inode->i_ino);
 		break;
 	case S_IFDIR:
-printk("inode %lu now a directory\n", inode->i_ino);
 		inode->i_op = &btrfs_dir_inode_operations;
 		inode->i_fop = &btrfs_dir_file_operations;
 		break;
 	case S_IFLNK:
-printk("inode %lu now a link\n", inode->i_ino);
 		// inode->i_op = &page_symlink_inode_operations;
 		break;
 	}
-printk("returning!\n");
 	return;
 }
 
@@ -232,7 +225,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(ret);
 	inode = NULL;
 	if (ino) {
-printk("lookup on %.*s returns %lu\n", dentry->d_name.len, dentry->d_name.name, ino);
 		inode = iget(dir->i_sb, ino);
 		if (!inode)
 			return ERR_PTR(-EACCES);
@@ -257,7 +249,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int over;
 
 	key.objectid = inode->i_ino;
-printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos);
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	key.offset = filp->f_pos;
@@ -266,15 +257,12 @@ printk("readdir on dir %Lu pos %Lu\n", key.objectid, filp->f_pos);
 	if (ret < 0) {
 		goto err;
 	}
-printk("first ret %d\n", ret);
 	advance = filp->f_pos > 0 && ret != 0;
 	while(1) {
 		leaf = btrfs_buffer_leaf(path.nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
 		slot = path.slots[0];
-printk("leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot);
 		if (advance) {
-printk("advancing!\n");
 			if (slot == nritems -1) {
 				ret = btrfs_next_leaf(root, &path);
 				if (ret)
@@ -282,7 +270,6 @@ printk("advancing!\n");
 				leaf = btrfs_buffer_leaf(path.nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path.slots[0];
-printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slot);
 			} else {
 				slot++;
 				path.slots[0]++;
@@ -290,15 +277,11 @@ printk("2leaf %Lu nritems %lu slot %d\n", path.nodes[0]->b_blocknr, nritems, slo
 		}
 		advance = 1;
 		item = leaf->items + slot;
-printk("item key %Lu %u %Lu\n", btrfs_disk_key_objectid(&item->key),
-       btrfs_disk_key_flags(&item->key), btrfs_disk_key_offset(&item->key));
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
 		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY)
 			continue;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di),
-       (const char *)(di + 1), btrfs_dir_objectid(di));
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
 			       btrfs_disk_key_offset(&item->key),
@@ -307,7 +290,6 @@ printk("filldir name %.*s, objectid %Lu\n", btrfs_dir_name_len(di),
 			break;
 		filp->f_pos = btrfs_disk_key_offset(&item->key) + 1;
 	}
-printk("filldir all done\n");
 	ret = 0;
 err:
 	btrfs_release_path(root, &path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
new file mode 100644
index 00000000000..8dc1c170f10
--- /dev/null
+++ b/fs/btrfs/transaction.c
@@ -0,0 +1,199 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+
+static void put_transaction(struct btrfs_transaction *transaction)
+{
+	/* drop one reference; whoever drops the last one frees the struct */
+	if (--transaction->use_count == 0)
+		kfree(transaction);
+}
+
+static int join_transaction(struct btrfs_root *root)
+{
+	struct btrfs_transaction *t = root->fs_info->running_transaction;
+	if (t == NULL) {
+		/* no transaction is running: set up a fresh one and publish it */
+		t = kmalloc(sizeof(*t), GFP_NOFS);
+		BUG_ON(t == NULL);
+		t->transid = root->root_key.offset + 1;
+		t->num_writers = 0;
+		t->use_count = 0;
+		t->in_commit = 0;
+		t->commit_done = 0;
+		init_waitqueue_head(&t->writer_wait);
+		init_waitqueue_head(&t->commit_wait);
+		root->fs_info->running_transaction = t;
+	}
+	t->num_writers++;
+	return 0;
+}
+
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+						   int num_blocks)
+{
+	struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS);
+	int ret;
+	BUG_ON(!h); /* no caller checks for NULL; same policy as join_transaction */
+	mutex_lock(&root->fs_info->trans_mutex);
+	ret = join_transaction(root); /* creates the transaction if none runs */
+	BUG_ON(ret);
+	h->transid = root->fs_info->running_transaction->transid;
+	h->transaction = root->fs_info->running_transaction;
+	h->blocks_reserved = num_blocks;
+	h->blocks_used = 0;
+	root->fs_info->running_transaction->use_count++; /* ref owned by h */
+	mutex_unlock(&root->fs_info->trans_mutex);
+	return h; /* released by btrfs_end_transaction() or by a commit */
+}
+
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans;
+	mutex_lock(&root->fs_info->trans_mutex);
+	cur_trans = root->fs_info->running_transaction;
+	WARN_ON(cur_trans->num_writers < 1); /* the caller itself is a writer */
+	if (waitqueue_active(&cur_trans->writer_wait))
+		wake_up(&cur_trans->writer_wait); /* committer rechecks locked */
+	cur_trans->num_writers--;
+	put_transaction(cur_trans); /* drops the ref taken at start */
+	mutex_unlock(&root->fs_info->trans_mutex);
+	kfree(trans); /* the handle must not be used after this call */
+	return 0;
+}
+
+
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root)
+{
+	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+	return 0; /* filemap_write_and_wait() result is dropped; trans is unused */
+}
+
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	int ret;
+	u64 old_extent_block; /* blocknr currently stored in the extent root item */
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *inode_root = fs_info->inode_root;
+
+	btrfs_set_root_blocknr(&inode_root->root_item,
+			       inode_root->node->b_blocknr); /* current inode root */
+	ret = btrfs_update_root(trans, tree_root,
+				&inode_root->root_key,
+				&inode_root->root_item);
+	BUG_ON(ret);
+	while(1) { /* repeat until the stored extent root blocknr is current */
+		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
+		if (old_extent_block == extent_root->node->b_blocknr)
+			break; /* the item already points at the live root */
+		btrfs_set_root_blocknr(&extent_root->root_item,
+				       extent_root->node->b_blocknr);
+		ret = btrfs_update_root(trans, tree_root,
+					&extent_root->root_key,
+					&extent_root->root_item);
+		BUG_ON(ret); /* presumably the update can move the root - confirm */
+	}
+	return 0;
+}
+
+static int wait_for_commit(struct btrfs_root *root,
+			   struct btrfs_transaction *commit)
+{
+	DEFINE_WAIT(wait); /* entered and left with trans_mutex held */
+	commit->use_count++; /* NOTE(review): no matching put in this function */
+	while(!commit->commit_done) { /* NOTE(review): only ever set to 0 here */
+		prepare_to_wait(&commit->commit_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (commit->commit_done) /* recheck after queueing ourselves */
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex); /* sleep unlocked */
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	finish_wait(&commit->commit_wait, &wait);
+	return 0;
+}
+
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root)
+{
+	int ret = 0;
+	struct buffer_head *snap = root->commit_root; /* root of the old commit */
+	struct btrfs_key snap_key; /* key of the root item being replaced */
+	struct btrfs_transaction *cur_trans;
+	DEFINE_WAIT(wait);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (trans->transaction->in_commit) { /* NOTE(review): never set to 1 */
+		cur_trans = trans->transaction;
+		trans->transaction->use_count++; /* keep it alive while waiting */
+		btrfs_end_transaction(trans, root); /* NOTE(review): relocks mutex */
+		ret = wait_for_commit(root, cur_trans);
+		BUG_ON(ret);
+		put_transaction(cur_trans);
+		mutex_unlock(&root->fs_info->trans_mutex);
+		return 0;
+	}
+	while (trans->transaction->num_writers > 1) { /* wait to be the last */
+		prepare_to_wait(&trans->transaction->writer_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (trans->transaction->num_writers <= 1)
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex); /* sleep unlocked */
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	finish_wait(&trans->transaction->writer_wait, &wait);
+
+	cur_trans = root->fs_info->running_transaction;
+	root->fs_info->running_transaction = NULL; /* next start makes a new one */
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	memcpy(&snap_key, &root->root_key, sizeof(snap_key));
+	root->root_key.offset++; /* the new root item gets the next offset */
+
+	if (btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) {
+		btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
+		ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+					&root->root_key, &root->root_item);
+		BUG_ON(ret);
+	}
+
+	ret = btrfs_commit_tree_roots(trans, root);
+	BUG_ON(ret);
+
+	ret = btrfs_write_and_wait_transaction(trans, root);
+	BUG_ON(ret);
+
+	write_ctree_super(trans, root); /* return value is not checked */
+	btrfs_finish_extent_commit(trans, root->fs_info->extent_root);
+	btrfs_finish_extent_commit(trans, root->fs_info->tree_root);
+	put_transaction(cur_trans); /* drops the ref taken when trans started */
+	kfree(trans); /* the handle is consumed by the commit */
+
+	if (root->node != root->commit_root) {
+		trans = btrfs_start_transaction(root, 1); /* handle for cleanup */
+		root->commit_root = root->node;
+		get_bh(root->node); /* commit_root holds its own buffer ref */
+		ret = btrfs_drop_snapshot(trans, root, snap);
+		BUG_ON(ret);
+
+		ret = btrfs_del_root(trans, root->fs_info->tree_root,
+				     &snap_key); /* remove the superseded root item */
+		BUG_ON(ret);
+		root->fs_info->generation = root->root_key.offset + 1;
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+	}
+
+	return ret;
+}
+
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9ab27b7491c..7e6c08a0acc 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,27 +1,30 @@
 #ifndef __TRANSACTION__
 #define __TRANSACTION__
 
+struct btrfs_transaction {
+	u64 transid; /* root_key.offset + 1 when the transaction is created */
+	unsigned long num_writers; /* ++ per start, -- per btrfs_end_transaction */
+	int in_commit; /* tested at the top of btrfs_commit_transaction() */
+	int use_count; /* references; the struct is freed when this hits 0 */
+	int commit_done; /* polled by wait_for_commit() */
+	wait_queue_head_t writer_wait; /* committer sleeps here for writers */
+	wait_queue_head_t commit_wait; /* wait_for_commit() sleeps here */
+};
+
 struct btrfs_trans_handle {
 	u64 transid;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
+	struct btrfs_transaction *transaction; /* transaction this handle joined */
 };
 
-static inline struct btrfs_trans_handle *
-btrfs_start_transaction(struct btrfs_root *root, int num_blocks)
-{
-	struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS);
-	h->transid = root->root_key.offset;
-	h->blocks_reserved = num_blocks;
-	h->blocks_used = 0;
-	return h;
-}
-
-static inline void btrfs_free_transaction(struct btrfs_root *root,
-					  struct btrfs_trans_handle *handle)
-{
-	memset(handle, 0, sizeof(*handle));
-	kfree(handle);
-}
 
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+						   int num_blocks);
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root);
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From d571976292839cec05a2820b08f7629b145ed157 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 23 Mar 2007 10:01:08 -0400
Subject: btrfs_create, btrfs_write_super, btrfs_sync_fs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   1 +
 fs/btrfs/ctree.c       |  49 +++++++++++--------
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/dir-item.c    |   3 +-
 fs/btrfs/disk-io.c     |  23 ++++-----
 fs/btrfs/extent-tree.c |  26 ++++++----
 fs/btrfs/inode-map.c   |   1 +
 fs/btrfs/root-tree.c   |   1 +
 fs/btrfs/super.c       | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.c |  13 +++--
 10 files changed, 201 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index c5d67bbd4d9..4b5bd05de94 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -3,6 +3,7 @@
 * Fix ENOSPC handling
 * make a real mkfs and superblock
 * Do checksumming
+* Use a real check instead of mark_buffer_dirty
 * Define FS objects in terms of different item types
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e690e2bb47d..052434a328e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -39,7 +39,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *cow;
 	struct btrfs_node *cow_node;
 
-	if (!buffer_dirty(buf)) {
+	if (buffer_dirty(buf)) {
 		*cow_ret = buf;
 		return 0;
 	}
@@ -48,6 +48,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
 	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
 	*cow_ret = cow;
+	mark_buffer_dirty(cow);
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
@@ -58,7 +59,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else {
 		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
 					cow->b_blocknr);
-		BUG_ON(!buffer_dirty(parent));
+		mark_buffer_dirty(parent);
 		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
@@ -362,7 +363,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			memcpy(&parent->ptrs[pslot + 1].key,
 				&right->ptrs[0].key,
 				sizeof(struct btrfs_disk_key));
-			BUG_ON(!buffer_dirty(parent_buf));
+			mark_buffer_dirty(parent_buf);
 		}
 	}
 	if (btrfs_header_nritems(&mid->header) == 1) {
@@ -398,7 +399,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		/* update the parent key to reflect our changes */
 		memcpy(&parent->ptrs[pslot].key, &mid->ptrs[0].key,
 		       sizeof(struct btrfs_disk_key));
-		BUG_ON(!buffer_dirty(parent_buf));
+		mark_buffer_dirty(parent_buf);
 	}
 
 	/* update the path */
@@ -539,7 +540,7 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		t = btrfs_buffer_node(path->nodes[i]);
 		memcpy(&t->ptrs[tslot].key, key, sizeof(*key));
-		BUG_ON(!buffer_dirty(path->nodes[i]));
+		mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
@@ -583,8 +584,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
-	BUG_ON(!buffer_dirty(src_buf));
-	BUG_ON(!buffer_dirty(dst_buf));
+	mark_buffer_dirty(src_buf);
+	mark_buffer_dirty(dst_buf);
 	return ret;
 }
 
@@ -631,8 +632,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
 
-	BUG_ON(!buffer_dirty(src_buf));
-	BUG_ON(!buffer_dirty(dst_buf));
+	mark_buffer_dirty(src_buf);
+	mark_buffer_dirty(dst_buf);
 	return ret;
 }
 
@@ -669,6 +670,9 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		lower_key = &lower->ptrs[0].key;
 	memcpy(&c->ptrs[0].key, lower_key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->b_blocknr);
+
+	mark_buffer_dirty(t);
+
 	/* the super has an extra ref to root->node */
 	btrfs_block_release(root, root->node);
 	root->node = t;
@@ -708,7 +712,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	memcpy(&lower->ptrs[slot].key, key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
-	BUG_ON(!buffer_dirty(path->nodes[level]));
+	mark_buffer_dirty(path->nodes[level]);
 	return 0;
 }
 
@@ -755,7 +759,8 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&c->header, mid);
 	ret = 0;
 
-	BUG_ON(!buffer_dirty(t));
+	mark_buffer_dirty(t);
+	mark_buffer_dirty(split_buffer);
 	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
 			  split_buffer->b_blocknr, path->slots[level + 1] + 1,
 			  level + 1);
@@ -886,11 +891,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(&left->header, left_nritems);
 
-	BUG_ON(!buffer_dirty(left_buf));
-	BUG_ON(!buffer_dirty(right_buf));
+	mark_buffer_dirty(left_buf);
+	mark_buffer_dirty(right_buf);
 	memcpy(&upper_node->ptrs[slot + 1].key,
 		&right->items[0].key, sizeof(struct btrfs_disk_key));
-	BUG_ON(!buffer_dirty(upper));
+	mark_buffer_dirty(upper);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
@@ -1004,8 +1009,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		push_space = btrfs_item_offset(right->items + i);
 	}
 
-	BUG_ON(!buffer_dirty(t));
-	BUG_ON(!buffer_dirty(right_buf));
+	mark_buffer_dirty(t);
+	mark_buffer_dirty(right_buf);
 
 	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
@@ -1115,8 +1120,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			  right_buffer->b_blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	BUG_ON(!buffer_dirty(right_buffer));
-	BUG_ON(!buffer_dirty(l_buf));
+	mark_buffer_dirty(right_buffer);
+	mark_buffer_dirty(l_buf);
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
 		btrfs_block_release(root, path->nodes[0]);
@@ -1202,12 +1207,12 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
 	btrfs_set_item_size(leaf->items + slot, data_size);
 	btrfs_set_header_nritems(&leaf->header, nritems + 1);
+	mark_buffer_dirty(leaf_buf);
 
 	ret = 0;
 	if (slot == 0)
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 
-	BUG_ON(!buffer_dirty(leaf_buf));
 	if (btrfs_leaf_free_space(root, leaf) < 0)
 		BUG();
 	check_leaf(root, path, 0);
@@ -1233,6 +1238,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
 				     path.slots[0], u8);
 		memcpy(ptr, data, data_size);
+		mark_buffer_dirty(path.nodes[0]);
 	}
 	btrfs_release_path(root, &path);
 	return ret;
@@ -1273,7 +1279,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (wret)
 			ret = wret;
 	}
-	BUG_ON(!buffer_dirty(parent));
+	mark_buffer_dirty(parent);
 	return ret;
 }
 
@@ -1368,8 +1374,11 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				if (wret)
 					ret = wret;
 			} else {
+				mark_buffer_dirty(leaf_buf);
 				btrfs_block_release(root, leaf_buf);
 			}
+		} else {
+			mark_buffer_dirty(leaf_buf);
 		}
 	}
 	return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4c19a3f12af..983e3cc9ae9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -833,7 +833,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, char *name, int name_len, u64 dir, u64
+			  *root, const char *name, int name_len, u64 dir, u64
 			  objectid, u8 type);
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 75d6e373e98..f81cbcc83b6 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -5,7 +5,7 @@
 #include "transaction.h"
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, char *name, int name_len, u64 dir, u64
+			  *root, const char *name, int name_len, u64 dir, u64
 			  objectid, u8 type)
 {
 	int ret = 0;
@@ -35,6 +35,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
 	memcpy(name_ptr, name, name_len);
+	mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9cacca0c525..8e1dcda0839 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -145,19 +145,20 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
-	return 0;
-#if 0
-	int ret;
-	btrfs_set_super_root(s, root->fs_info->tree_root->node->b_blocknr);
-
-	ret = pwrite(root->fs_info->fp, s, sizeof(*s),
-		     BTRFS_SUPER_INFO_OFFSET);
-	if (ret != sizeof(*s)) {
-		fprintf(stderr, "failed to write new super block err %d\n", ret);
-		return ret;
+	struct buffer_head *bh = root->fs_info->sb_buffer;
+	btrfs_set_super_root(root->fs_info->disk_super,
+			     root->fs_info->tree_root->node->b_blocknr);
+	lock_buffer(bh);
+	clear_buffer_dirty(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(WRITE, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		WARN_ON(1);
+		return -EIO;
 	}
 	return 0;
-#endif
 }
 
 int close_ctree(struct btrfs_root *root)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3af2c03568..2818f1c5717 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,6 +49,7 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
+	mark_buffer_dirty(path.nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, &path);
 	finish_current_insert(trans, root->fs_info->extent_root);
@@ -103,7 +104,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
-	unsigned long gang[8];
+	struct buffer_head *gang[8];
 	u64 first = 0;
 	int ret;
 	int i;
@@ -116,13 +117,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 		if (!ret)
 			break;
 		if (!first)
-			first = gang[0];
+			first = gang[0]->b_blocknr;
 		for (i = 0; i < ret; i++) {
 			radix_tree_delete(&root->fs_info->pinned_radix,
-					  gang[i]);
+					  gang[i]->b_blocknr);
+			brelse(gang[i]);
 		}
 	}
-	root->fs_info->last_insert.objectid = first;
+	if (root->fs_info->last_insert.objectid > first)
+		root->fs_info->last_insert.objectid = first;
 	root->fs_info->last_insert.offset = 0;
 	return 0;
 }
@@ -161,8 +164,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
 {
 	int err;
+	struct buffer_head *bh = sb_getblk(root->fs_info->sb, blocknr);
+	BUG_ON(!bh);
 	err = radix_tree_insert(&root->fs_info->pinned_radix,
-				blocknr, (void *)blocknr);
+				blocknr, bh);
 	BUG_ON(err);
 	if (err)
 		return err;
@@ -217,6 +222,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (ret)
 			BUG();
 	}
+	mark_buffer_dirty(path.nodes[0]);
 	btrfs_release_path(extent_root, &path);
 	finish_current_insert(trans, extent_root);
 	return ret;
@@ -232,7 +238,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int ret;
 	int wret;
 	int err = 0;
-	unsigned long gang[4];
+	struct buffer_head *gang[4];
 	int i;
 	struct radix_tree_root *radix = &extent_root->fs_info->pinned_radix;
 
@@ -245,10 +251,12 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			radix_tree_tag_set(radix, gang[i], CTREE_EXTENT_PINNED);
-			radix_tree_tag_clear(radix, gang[i],
+			radix_tree_tag_set(radix, gang[i]->b_blocknr,
+					   CTREE_EXTENT_PINNED);
+			radix_tree_tag_clear(radix, gang[i]->b_blocknr,
 					     CTREE_EXTENT_PENDING_DEL);
-			wret = __free_extent(trans, extent_root, gang[i], 1);
+			wret = __free_extent(trans, extent_root,
+					     gang[i]->b_blocknr, 1);
 			if (wret)
 				err = wret;
 		}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c45aec258bd..ad2d375b830 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -108,6 +108,7 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
 				    path.slots[0], struct btrfs_inode_map_item);
 	btrfs_cpu_key_to_disk(&inode_item->key, location);
+	mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(inode_root, &path);
 	return ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a4554c007ef..a821b5d1e23 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -55,6 +55,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	slot = path.slots[0];
 	memcpy(btrfs_item_ptr(l, slot, struct btrfs_root_item), item,
 		sizeof(*item));
+	mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 62043082584..6080a8133d7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -10,6 +10,7 @@
 #include <linux/backing-dev.h>
 #include "ctree.h"
 #include "disk-io.h"
+#include "transaction.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
@@ -357,6 +358,131 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
+static void fill_inode_item(struct btrfs_inode_item *item,
+			    struct inode *inode)
+{
+	btrfs_set_inode_uid(item, inode->i_uid);
+	btrfs_set_inode_gid(item, inode->i_gid);
+	btrfs_set_inode_size(item, inode->i_size);
+	btrfs_set_inode_mode(item, inode->i_mode);
+	btrfs_set_inode_nlink(item, inode->i_nlink);
+	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
+	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
+	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
+	btrfs_set_inode_nblocks(item, inode->i_blocks);
+	btrfs_set_inode_generation(item, inode->i_generation);
+}
+
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+				     struct inode *dir, int mode)
+{
+	struct inode *inode;
+	struct btrfs_inode_item inode_item;
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_key key;
+	int ret;
+	u64 objectid;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	BUG_ON(ret);
+
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+	inode->i_ino = objectid;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	fill_inode_item(&inode_item, inode);
+
+
+	key.objectid = objectid;
+	key.flags = 0;
+	key.offset = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
+	BUG_ON(ret);
+
+	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
+	BUG_ON(ret);
+
+	insert_inode_hash(inode);
+	// FIXME mark_inode_dirty(inode)
+	return inode;
+}
+
+static int btrfs_add_link(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	int ret;
+	ret = btrfs_insert_dir_item(trans, btrfs_sb(inode->i_sb),
+				    dentry->d_name.name, dentry->d_name.len,
+				    dentry->d_parent->d_inode->i_ino,
+				    inode->i_ino, 0);
+	BUG_ON(ret);
+	return ret;
+}
+
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	int err = btrfs_add_link(trans, dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+			int mode, struct nameidata *nd)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct inode *inode;
+	int err;
+
+	trans = btrfs_start_transaction(root, 1);
+	inode = btrfs_new_inode(trans, dir, mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		return err;
+	// FIXME mark the inode dirty
+	err = btrfs_add_nondir(trans, dentry, inode);
+	dir->i_sb->s_dirt = 1;
+	btrfs_end_transaction(trans, root);
+	return err;
+}
+
+static void btrfs_write_super(struct super_block *sb)
+{
+	sb->s_dirt = 0;
+printk("btrfs write_super!\n");
+}
+
+static int btrfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	int ret;
+	sb->s_dirt = 0;
+	root = btrfs_sb(sb);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_commit_transaction(trans, root);
+	sb->s_dirt = 0;
+	BUG_ON(ret);
+printk("btrfs sync_fs\n");
+	return 0;
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -377,10 +503,13 @@ static struct super_operations btrfs_super_ops = {
 	.drop_inode	= generic_delete_inode,
 	.put_super	= btrfs_put_super,
 	.read_inode	= btrfs_read_locked_inode,
+	.write_super	= btrfs_write_super,
+	.sync_fs	= btrfs_sync_fs,
 };
 
 static struct inode_operations btrfs_dir_inode_operations = {
 	.lookup		= btrfs_lookup,
+	.create		= btrfs_create,
 };
 
 static struct file_operations btrfs_dir_file_operations = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8dc1c170f10..4903b47c978 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,7 @@ static int join_transaction(struct btrfs_root *root)
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
 		cur_trans->in_commit = 0;
-		cur_trans->use_count = 0;
+		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 	}
 	cur_trans->num_writers++;
@@ -56,7 +56,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
-	WARN_ON(cur_trans->num_writers <= 1);
+	WARN_ON(cur_trans->num_writers < 1);
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 	cur_trans->num_writers--;
@@ -155,10 +155,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
-	mutex_unlock(&root->fs_info->trans_mutex);
 
-	memcpy(&snap_key, &root->root_key, sizeof(snap_key));
-	root->root_key.offset++;
+	if (root->node != root->commit_root) {
+		memcpy(&snap_key, &root->root_key, sizeof(snap_key));
+		root->root_key.offset++;
+	}
+
+	mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) {
 		btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
-- 
cgit v1.2.3


From df2ce34c8801af8e00c7f82435fcb1e4b45a2759 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 23 Mar 2007 11:00:45 -0400
Subject: Btrfs: properly set new buffers for new blocks up to date

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 9 +++++++--
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/super.c       | 3 +++
 3 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e1dcda0839..2f1eb4ae4e1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -10,8 +10,13 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 	if (buf->b_blocknr != btrfs_header_blocknr(&node->header))
 		BUG();
 	if (root->node && btrfs_header_parentid(&node->header) !=
-	    btrfs_header_parentid(btrfs_buffer_header(root->node)))
-		BUG();
+	    btrfs_header_parentid(btrfs_buffer_header(root->node))) {
+		printk("block %Lu parentids don't match buf %Lu, root %Lu\n",
+		       buf->b_blocknr,
+		       btrfs_header_parentid(&node->header),
+		       btrfs_header_parentid(btrfs_buffer_header(root->node)));
+		WARN_ON(1);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2818f1c5717..c86f0e6152f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -467,7 +467,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return NULL;
 	}
 	buf = find_tree_block(root, ins.objectid);
-	dirty_tree_block(trans, root, buf);
+	set_buffer_uptodate(buf);
 	return buf;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6080a8133d7..8970e9a8215 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -473,7 +473,10 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	int ret;
+
 	sb->s_dirt = 0;
+	return 0;
+
 	root = btrfs_sb(sb);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
-- 
cgit v1.2.3


From 7f5c15160e1436a53d01f9190db11c2a3a4d788a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 23 Mar 2007 15:56:19 -0400
Subject: Add generation number to btrfs_header, readdir fixes, hash collision
 fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c    |  8 +++++++-
 fs/btrfs/ctree.h    | 14 +++++++++++++-
 fs/btrfs/dir-item.c |  4 ++--
 fs/btrfs/disk-io.c  |  6 +-----
 fs/btrfs/super.c    | 14 +++++++++-----
 5 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 052434a328e..b33a6bfaf32 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,6 +1,7 @@
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
+#include "transaction.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
@@ -39,7 +40,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *cow;
 	struct btrfs_node *cow_node;
 
-	if (buffer_dirty(buf)) {
+	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
+				    trans->transid) {
 		*cow_ret = buf;
 		return 0;
 	}
@@ -47,6 +49,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	cow_node = btrfs_buffer_node(cow);
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
 	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
+	btrfs_set_header_generation(&cow_node->header, trans->transid);
 	*cow_ret = cow;
 	mark_buffer_dirty(cow);
 	btrfs_inc_ref(trans, root, buf);
@@ -661,6 +664,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&c->header, 1);
 	btrfs_set_header_level(&c->header, level);
 	btrfs_set_header_blocknr(&c->header, t->b_blocknr);
+	btrfs_set_header_generation(&c->header, trans->transid);
 	btrfs_set_header_parentid(&c->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	lower = btrfs_buffer_node(path->nodes[level-1]);
@@ -750,6 +754,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr);
+	btrfs_set_header_generation(&split->header, trans->transid);
 	btrfs_set_header_parentid(&split->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	mid = (c_nritems + 1) / 2;
@@ -1096,6 +1101,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	btrfs_set_header_nritems(&right->header, nritems - mid);
 	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
+	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 983e3cc9ae9..96cec6352f1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -52,6 +52,7 @@ struct btrfs_key {
 struct btrfs_header {
 	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
+	__le64 generation;
 	__le64 parentid; /* objectid of the tree root */
 	__le32 csum;
 	__le32 ham;
@@ -600,6 +601,17 @@ static inline void btrfs_set_header_blocknr(struct btrfs_header *h, u64 blocknr)
 	h->blocknr = cpu_to_le64(blocknr);
 }
 
+static inline u64 btrfs_header_generation(struct btrfs_header *h)
+{
+	return le64_to_cpu(h->generation);
+}
+
+static inline void btrfs_set_header_generation(struct btrfs_header *h,
+					       u64 val)
+{
+	h->generation = cpu_to_le64(val);
+}
+
 static inline u64 btrfs_header_parentid(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->parentid);
@@ -839,7 +851,7 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, u64 dir,
 			  const char *name, int name_len, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
-			      char *name, int name_len);
+			      const char *name, int name_len);
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f81cbcc83b6..c18f81797a8 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -60,8 +60,8 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path, char
-			      *name, int name_len)
+			      struct btrfs_path *path,
+			      const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2f1eb4ae4e1..3dea757d4cb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -11,11 +11,7 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 		BUG();
 	if (root->node && btrfs_header_parentid(&node->header) !=
 	    btrfs_header_parentid(btrfs_buffer_header(root->node))) {
-		printk("block %Lu parentids don't match buf %Lu, root %Lu\n",
-		       buf->b_blocknr,
-		       btrfs_header_parentid(&node->header),
-		       btrfs_header_parentid(btrfs_buffer_header(root->node)));
-		WARN_ON(1);
+		BUG();
 	}
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8970e9a8215..a2db0559469 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -199,7 +199,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	btrfs_init_path(&path);
 	ret = btrfs_lookup_dir_item(NULL, root, &path, dir->i_ino, name,
 				    namelen, 0);
-	if (ret) {
+	if (ret || !btrfs_match_dir_item_name(root, &path, name, namelen)) {
 		*ino = 0;
 		goto out;
 	}
@@ -247,7 +247,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int slot;
 	int advance;
 	unsigned char d_type = DT_UNKNOWN;
-	int over;
+	int over = 0;
 
 	key.objectid = inode->i_ino;
 	key.flags = 0;
@@ -258,7 +258,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (ret < 0) {
 		goto err;
 	}
-	advance = filp->f_pos > 0 && ret != 0;
+	advance = 0;
 	while(1) {
 		leaf = btrfs_buffer_leaf(path.nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
@@ -282,13 +282,17 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			break;
 		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY)
 			continue;
+		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
+			continue;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
 			       btrfs_disk_key_offset(&item->key),
 			       btrfs_dir_objectid(di), d_type);
-		if (over)
+		if (over) {
+			filp->f_pos = btrfs_disk_key_offset(&item->key);
 			break;
+		}
 		filp->f_pos = btrfs_disk_key_offset(&item->key) + 1;
 	}
 	ret = 0;
@@ -425,7 +429,6 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
 				    inode->i_ino, 0);
-	BUG_ON(ret);
 	return ret;
 }
 
@@ -466,6 +469,7 @@ static void btrfs_write_super(struct super_block *sb)
 {
 	sb->s_dirt = 0;
 printk("btrfs write_super!\n");
+	filemap_flush(sb->s_bdev->bd_inode->i_mapping);
 }
 
 static int btrfs_sync_fs(struct super_block *sb, int wait)
-- 
cgit v1.2.3


From d561c025ee3a0aee53a225bedce0ee35fe12f335 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 23 Mar 2007 19:47:49 -0400
Subject: Btrfs: very minimal locking

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/super.c       | 26 +++++++++++++++++---------
 4 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96cec6352f1..68cafae6a85 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -231,6 +231,7 @@ struct btrfs_fs_info {
 	struct buffer_head *sb_buffer;
 	struct super_block *sb;
 	struct mutex trans_mutex;
+	struct mutex fs_mutex;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3dea757d4cb..e32ddff55b0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -116,6 +116,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb_buffer = sb_buffer;
 	fs_info->sb = sb;
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->fs_mutex);
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c86f0e6152f..9583a9ae8b7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -168,9 +168,10 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
 	BUG_ON(!bh);
 	err = radix_tree_insert(&root->fs_info->pinned_radix,
 				blocknr, bh);
-	BUG_ON(err);
-	if (err)
+	if (err && err != -EEXIST) {
+		BUG();
 		return err;
+	}
 	radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr,
 			   tag);
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a2db0559469..0ca1080e019 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -453,25 +453,21 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err;
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	inode = btrfs_new_inode(trans, dir, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		return err;
+		goto out_unlock;
 	// FIXME mark the inode dirty
 	err = btrfs_add_nondir(trans, dentry, inode);
 	dir->i_sb->s_dirt = 1;
 	btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
 
-static void btrfs_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-printk("btrfs write_super!\n");
-	filemap_flush(sb->s_bdev->bd_inode->i_mapping);
-}
-
 static int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
@@ -479,17 +475,29 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 	int ret;
 
 	sb->s_dirt = 0;
-	return 0;
+	if (!wait) {
+		filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+		return 0;
+	}
+	filemap_write_and_wait(sb->s_bdev->bd_inode->i_mapping);
 
 	root = btrfs_sb(sb);
+	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
 	BUG_ON(ret);
 printk("btrfs sync_fs\n");
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return 0;
 }
 
+static void btrfs_write_super(struct super_block *sb)
+{
+	btrfs_sync_fs(sb, 1);
+}
+
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-- 
cgit v1.2.3


From 78fae27ebf5bd35fb9b2e4213e486635eacfc0ad Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 25 Mar 2007 11:35:08 -0400
Subject: Btrfs: leak fixes, pinning fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 30 +++++++++++++++++++++---------
 fs/btrfs/super.c       |  1 +
 fs/btrfs/transaction.c | 32 ++++++++++++++++++--------------
 3 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9583a9ae8b7..369b960fce4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -165,13 +165,23 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
 {
 	int err;
 	struct buffer_head *bh = sb_getblk(root->fs_info->sb, blocknr);
+	struct btrfs_header *header;
 	BUG_ON(!bh);
+
+	header = btrfs_buffer_header(bh);
+	if (btrfs_header_generation(header) ==
+	    root->fs_info->running_transaction->transid) {
+		return 0;
+	}
+
 	err = radix_tree_insert(&root->fs_info->pinned_radix,
 				blocknr, bh);
 	if (err && err != -EEXIST) {
 		BUG();
 		return err;
 	}
+	if (err == -EEXIST)
+		brelse(bh);
 	radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr,
 			   tag);
 	return 0;
@@ -181,7 +191,7 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr, u64 num_blocks)
+			 *root, u64 blocknr, u64 num_blocks, int pin)
 {
 	struct btrfs_path path;
 	struct btrfs_key key;
@@ -213,12 +223,18 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_extent_refs(ei, refs);
 	if (refs == 0) {
 		u64 super_blocks_used;
+
+		if (pin) {
+			ret = pin_down_block(root, blocknr,
+					     CTREE_EXTENT_PINNED);
+			BUG_ON(ret);
+		}
+
 		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, &path);
-		if (extent_root->fs_info->last_insert.objectid >
-		    blocknr)
+		if (extent_root->fs_info->last_insert.objectid > blocknr)
 			extent_root->fs_info->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
@@ -257,7 +273,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			radix_tree_tag_clear(radix, gang[i]->b_blocknr,
 					     CTREE_EXTENT_PENDING_DEL);
 			wret = __free_extent(trans, extent_root,
-					     gang[i]->b_blocknr, 1);
+					     gang[i]->b_blocknr, 1, 0);
 			if (wret)
 				err = wret;
 		}
@@ -281,11 +297,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		pin_down_block(root, blocknr, CTREE_EXTENT_PENDING_DEL);
 		return 0;
 	}
-	if (pin) {
-		ret = pin_down_block(root, blocknr, CTREE_EXTENT_PINNED);
-		BUG_ON(ret);
-	}
-	ret = __free_extent(trans, root, blocknr, num_blocks);
+	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ca1080e019..094a66c267b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -146,6 +146,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
 	if (ret) {
 		make_bad_inode(inode);
+		btrfs_release_path(root, &path);
 		return;
 	}
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4903b47c978..46a596e345f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -4,12 +4,15 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-
+static int total_trans = 0;
 static void put_transaction(struct btrfs_transaction *transaction)
 {
 	transaction->use_count--;
-	if (transaction->use_count == 0)
+	if (transaction->use_count == 0) {
+		WARN_ON(total_trans == 0);
+		total_trans--;
 		kfree(transaction);
+	}
 }
 
 static int join_transaction(struct btrfs_root *root)
@@ -18,6 +21,7 @@ static int join_transaction(struct btrfs_root *root)
 	cur_trans = root->fs_info->running_transaction;
 	if (!cur_trans) {
 		cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
+		total_trans++;
 		BUG_ON(!cur_trans);
 		root->fs_info->running_transaction = cur_trans;
 		cur_trans->num_writers = 0;
@@ -108,7 +112,6 @@ static int wait_for_commit(struct btrfs_root *root,
 			   struct btrfs_transaction *commit)
 {
 	DEFINE_WAIT(wait);
-	commit->use_count++;
 	while(!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
@@ -126,7 +129,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
 	int ret = 0;
-	struct buffer_head *snap = root->commit_root;
+	struct buffer_head *snap;
 	struct btrfs_key snap_key;
 	struct btrfs_transaction *cur_trans;
 	DEFINE_WAIT(wait);
@@ -153,15 +156,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 	finish_wait(&trans->transaction->writer_wait, &wait);
 
-	cur_trans = root->fs_info->running_transaction;
-	root->fs_info->running_transaction = NULL;
-
 	if (root->node != root->commit_root) {
 		memcpy(&snap_key, &root->root_key, sizeof(snap_key));
 		root->root_key.offset++;
 	}
 
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) {
 		btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
@@ -173,17 +172,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
+	cur_trans = root->fs_info->running_transaction;
+	root->fs_info->running_transaction = NULL;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 
 	write_ctree_super(trans, root);
-	btrfs_finish_extent_commit(trans, root->fs_info->extent_root);
-	btrfs_finish_extent_commit(trans, root->fs_info->tree_root);
+	btrfs_finish_extent_commit(trans, root);
+	mutex_lock(&root->fs_info->trans_mutex);
+	put_transaction(cur_trans);
 	put_transaction(cur_trans);
+	mutex_unlock(&root->fs_info->trans_mutex);
 	kfree(trans);
 
 	if (root->node != root->commit_root) {
 		trans = btrfs_start_transaction(root, 1);
+		snap = root->commit_root;
 		root->commit_root = root->node;
 		get_bh(root->node);
 		ret = btrfs_drop_snapshot(trans, root, snap);
@@ -191,10 +197,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
 				     &snap_key);
-		BUG_ON(ret);
-		root->fs_info->generation = root->root_key.offset + 1;
-		ret = btrfs_end_transaction(trans, root);
-		BUG_ON(ret);
+		BUG_ON(ret); root->fs_info->generation = root->root_key.offset + 1; ret = btrfs_end_transaction(trans, root); BUG_ON(ret);
+		printk("at free, total trans %d\n", total_trans);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 134e97313491c3a3d6bc3eca3b7c9c64408cbd08 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 25 Mar 2007 13:44:56 -0400
Subject: Btrfs: unlink and delete_inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 094a66c267b..f96bd92155c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -187,6 +187,104 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	return;
 }
 
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_path path;
+	struct btrfs_root *root;
+	struct btrfs_trans_handle *trans;
+	const char *name = dentry->d_name.name;
+	int name_len = dentry->d_name.len;
+	int ret;
+	u64 objectid;
+	struct btrfs_dir_item *di;
+
+	btrfs_init_path(&path);
+	root = btrfs_sb(dir->i_sb);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+
+	ret = btrfs_lookup_dir_item(trans, root, &path, dir->i_ino,
+				    name, name_len, -1);
+	if (ret < 0)
+		goto err;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto err;
+	}
+	di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			    struct btrfs_dir_item);
+	objectid = btrfs_dir_objectid(di);
+
+	ret = btrfs_del_item(trans, root, &path);
+	BUG_ON(ret);
+	dentry->d_inode->i_ctime = dir->i_ctime;
+err:
+	btrfs_release_path(root, &path);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (ret == 0)
+		inode_dec_link_count(dentry->d_inode);
+	return ret;
+}
+
+static int btrfs_free_inode(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct inode *inode)
+{
+	u64 objectid = inode->i_ino;
+	struct btrfs_path path;
+	struct btrfs_inode_map_item *map;
+	struct btrfs_key stat_data_key;
+	int ret;
+	clear_inode(inode);
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_inode_map(trans, root, &path, objectid, -1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		btrfs_release_path(root, &path);
+		goto error;
+	}
+	map = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			    struct btrfs_inode_map_item);
+	btrfs_disk_key_to_cpu(&stat_data_key, &map->key);
+	ret = btrfs_del_item(trans, root->fs_info->inode_root, &path);
+	BUG_ON(ret);
+	btrfs_release_path(root, &path);
+	btrfs_init_path(&path);
+
+	ret = btrfs_lookup_inode(trans, root, &path, objectid, -1);
+	BUG_ON(ret);
+	ret = btrfs_del_item(trans, root, &path);
+	BUG_ON(ret);
+	btrfs_release_path(root, &path);
+error:
+	return ret;
+}
+
+static void btrfs_delete_inode(struct inode *inode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	truncate_inode_pages(&inode->i_data, 0);
+	if (is_bad_inode(inode)) {
+		goto no_delete;
+	}
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		WARN_ON(1);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_free_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return;
+no_delete:
+	clear_inode(inode);
+}
+
+
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 			      ino_t *ino)
 {
@@ -272,6 +370,13 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				leaf = btrfs_buffer_leaf(path.nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path.slots[0];
+#if 0
+				page_cache_readahead(
+				     inode->i_sb->s_bdev->bd_inode->i_mapping,
+				     &filp->f_ra, filp,
+				     path.nodes[0]->b_blocknr >>
+				     (PAGE_CACHE_SHIFT - inode->i_blkbits), 1);
+#endif
 			} else {
 				slot++;
 				path.slots[0]++;
@@ -441,8 +546,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 		d_instantiate(dentry, inode);
 		return 0;
 	}
-	inode_dec_link_count(inode);
-	iput(inode);
 	return err;
 }
 
@@ -453,6 +556,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = btrfs_sb(dir->i_sb);
 	struct inode *inode;
 	int err;
+	int drop_inode = 0;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -462,10 +566,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	// FIXME mark the inode dirty
 	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
 	dir->i_sb->s_dirt = 1;
 	btrfs_end_transaction(trans, root);
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
 	return err;
 }
 
@@ -516,7 +626,7 @@ static struct file_system_type btrfs_fs_type = {
 
 static struct super_operations btrfs_super_ops = {
 	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
+	.delete_inode	= btrfs_delete_inode,
 	.put_super	= btrfs_put_super,
 	.read_inode	= btrfs_read_locked_inode,
 	.write_super	= btrfs_write_super,
@@ -526,6 +636,7 @@ static struct super_operations btrfs_super_ops = {
 static struct inode_operations btrfs_dir_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.create		= btrfs_create,
+	.unlink		= btrfs_unlink,
 };
 
 static struct file_operations btrfs_dir_file_operations = {
-- 
cgit v1.2.3


From dcea79152cfb7cf244bdd9c23f4291831ffd6adf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 25 Mar 2007 14:03:08 -0400
Subject: Btrfs: very simple readdir readahead

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f96bd92155c..e76fcd80db6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -332,6 +332,34 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	return d_splice_alias(inode, dentry);
 }
 
+/* Submit reads for the blocks path->nodes[1] points to, from the current
+ * slot up to the first key with a different objectid.  Nothing waits here. */
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
+{
+	struct buffer_head *bh;
+	struct btrfs_node *node;
+	u64 objectid;
+	int nritems;
+	int i;
+
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	i = path->slots[1];
+	objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+	nritems = btrfs_header_nritems(&node->header);
+	for (; i < nritems; i++) {
+		if (btrfs_disk_key_objectid(&node->ptrs[i].key) != objectid)
+			break;
+		bh = sb_getblk(root->fs_info->sb,
+			       btrfs_node_blockptr(node, i));
+		if (!bh)	/* readahead is best effort: skip this block */
+			continue;
+		ll_rw_block(READ, 1, &bh);
+		brelse(bh);
+	}
+
+}
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -358,6 +386,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err;
 	}
 	advance = 0;
+	reada_leaves(root, &path);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path.nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
@@ -370,13 +399,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				leaf = btrfs_buffer_leaf(path.nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path.slots[0];
-#if 0
-				page_cache_readahead(
-				     inode->i_sb->s_bdev->bd_inode->i_mapping,
-				     &filp->f_ra, filp,
-				     path.nodes[0]->b_blocknr >>
-				     (PAGE_CACHE_SHIFT - inode->i_blkbits), 1);
-#endif
+				if (path.nodes[1] && path.slots[1] == 0)
+					reada_leaves(root, &path);
 			} else {
 				slot++;
 				path.slots[0]++;
-- 
cgit v1.2.3


From f7922033efe957f79ae57f6026e93c8148e7f7ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 25 Mar 2007 20:17:36 -0400
Subject: btrfs_mkdir

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 184 ++++++++++++++++++++-----------------------------------
 1 file changed, 65 insertions(+), 119 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e76fcd80db6..820d21ee4af 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -18,124 +18,6 @@ static struct inode_operations btrfs_dir_inode_operations;
 static struct super_operations btrfs_super_ops;
 static struct file_operations btrfs_dir_file_operations;
 
-#if 0
-/* some random number */
-
-static struct super_operations ramfs_ops;
-
-static struct backing_dev_info ramfs_backing_dev_info = {
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK |
-			  BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
-			  BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
-struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
-{
-	struct inode * inode = new_inode(sb);
-
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current->fsuid;
-		inode->i_gid = current->fsgid;
-		inode->i_blocks = 0;
-		inode->i_mapping->a_ops = &ramfs_aops;
-		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		switch (mode & S_IFMT) {
-		default:
-			init_special_inode(inode, mode, dev);
-			break;
-		case S_IFREG:
-			inode->i_op = &ramfs_file_inode_operations;
-			inode->i_fop = &ramfs_file_operations;
-			break;
-		case S_IFDIR:
-			inode->i_op = &ramfs_dir_inode_operations;
-			inode->i_fop = &simple_dir_operations;
-
-			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inc_nlink(inode);
-			break;
-		case S_IFLNK:
-			inode->i_op = &page_symlink_inode_operations;
-			break;
-		}
-	}
-	return inode;
-}
-
-/*
- * File creation. Allocate an inode, and we're done..
- */
-/* SMP-safe */
-static int
-ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
-{
-	struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev);
-	int error = -ENOSPC;
-
-	if (inode) {
-		if (dir->i_mode & S_ISGID) {
-			inode->i_gid = dir->i_gid;
-			if (S_ISDIR(mode))
-				inode->i_mode |= S_ISGID;
-		}
-		d_instantiate(dentry, inode);
-		dget(dentry);	/* Extra count - pin the dentry in core */
-		error = 0;
-		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-	}
-	return error;
-}
-
-static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
-{
-	int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0);
-	if (!retval)
-		inc_nlink(dir);
-	return retval;
-}
-
-static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd)
-{
-	return ramfs_mknod(dir, dentry, mode | S_IFREG, 0);
-}
-
-static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname)
-{
-	struct inode *inode;
-	int error = -ENOSPC;
-
-	inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
-	if (inode) {
-		int l = strlen(symname)+1;
-		error = page_symlink(inode, symname, l);
-		if (!error) {
-			if (dir->i_mode & S_ISGID)
-				inode->i_gid = dir->i_gid;
-			d_instantiate(dentry, inode);
-			dget(dentry);
-			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-		} else
-			iput(inode);
-	}
-	return error;
-}
-
-static struct inode_operations ramfs_dir_inode_operations = {
-	.create		= ramfs_create,
-	.lookup		= simple_lookup,
-	.link		= simple_link,
-	.unlink		= simple_unlink,
-	.symlink	= ramfs_symlink,
-	.mkdir		= ramfs_mkdir,
-	.rmdir		= simple_rmdir,
-	.mknod		= ramfs_mknod,
-	.rename		= simple_rename,
-};
-#endif
-
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path path;
@@ -533,9 +415,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_ino = objectid;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	/* FIXME do this on link */
+	if (mode & S_IFDIR)
+		inode->i_size = 3;
 	fill_inode_item(&inode_item, inode);
 
-
 	key.objectid = objectid;
 	key.flags = 0;
 	key.offset = 0;
@@ -603,6 +487,67 @@ out_unlock:
 	return err;
 }
 
+/* Insert "." (-> the new inode itself) and ".." (-> @dir) for a new dir. */
+static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
+				struct inode *inode, struct inode *dir)
+{
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	char buf[2] = { '.', '.' };	/* no NUL: name lengths are explicit */
+	int ret;
+
+	/* "." is the first byte of buf, ".." is both bytes */
+	ret = btrfs_insert_dir_item(trans, root, buf, 1, inode->i_ino,
+				    inode->i_ino, 1);
+	if (ret)
+		goto error;
+	ret = btrfs_insert_dir_item(trans, root, buf, 2, inode->i_ino,
+				    dir->i_ino, 1);
+error:
+	return ret;
+}
+
+/* ->mkdir: create a directory inode, give it "." and "..", then link it into
+ * @dir, all in one transaction under fs_mutex.  Returns 0 or the first error. */
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_trans_handle *trans;
+	struct inode *inode;
+	int drop_on_err = 0;
+	int err = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_unlock;
+	}
+	inode = btrfs_new_inode(trans, dir, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+	drop_on_err = 1;	/* from here on a failure must iput() the inode */
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+	err = btrfs_make_empty_dir(trans, inode, dir);
+	if (!err)
+		err = btrfs_add_link(trans, dentry, inode);
+	if (err)
+		goto out_fail;
+	d_instantiate(dentry, inode);
+	mark_inode_dirty(inode);
+	drop_on_err = 0;
+
+out_fail:
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (drop_on_err)
+		iput(inode);
+	return err;
+}
+
 static int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
@@ -661,6 +606,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.create		= btrfs_create,
 	.unlink		= btrfs_unlink,
+	.mkdir		= btrfs_mkdir,
 };
 
 static struct file_operations btrfs_dir_file_operations = {
-- 
cgit v1.2.3


From 8ef97622caa2d5f78d1dc58ab918e2fbfa9b357a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Mar 2007 10:15:30 -0400
Subject: Btrfs: add a radix back bit tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   3 +-
 fs/btrfs/bit-radix.c   | 107 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/bit-radix.h   |  15 +++++++
 fs/btrfs/ctree.h       |   3 +-
 fs/btrfs/disk-io.c     |   3 +-
 fs/btrfs/extent-tree.c |  93 ++++++++++++++++++------------------------
 6 files changed, 167 insertions(+), 57 deletions(-)
 create mode 100644 fs/btrfs/bit-radix.c
 create mode 100644 fs/btrfs/bit-radix.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index be7d74cdca0..5346f706b2c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,8 @@ ifneq ($(KERNELRELEASE),)
 
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
-	   hash.o file-item.o inode-item.o inode-map.o disk-io.o transaction.o
+	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
+	   transaction.o bit-radix.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
new file mode 100644
index 00000000000..43807cc3cce
--- /dev/null
+++ b/fs/btrfs/bit-radix.c
@@ -0,0 +1,107 @@
+#include <linux/module.h>
+#include "bit-radix.h"
+
+#define BIT_ARRAY_BYTES 256
+#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
+
+/* Set @bit, allocating its chunk on demand; 0, -ENOMEM or the insert error. */
+int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long slot = bit / BIT_RADIX_BITS_PER_ARRAY;
+	int bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
+	unsigned long *bits;
+	int ret;
+
+	bits = radix_tree_lookup(radix, slot);
+	if (!bits) {
+		bits = kmalloc(BIT_ARRAY_BYTES, GFP_NOIO);
+		if (!bits)
+			return -ENOMEM;
+		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
+		bits[0] = slot;	/* word 0 records which chunk this is */
+		ret = radix_tree_insert(radix, slot, bits);
+		if (ret) {
+			kfree(bits);	/* never made it into the tree */
+			return ret;
+		}
+	}
+	set_bit(bit_slot, bits + 1);
+	return 0;
+}
+
+/*
+ * Return non-zero when @bit is set.  A bit whose chunk is not in the
+ * tree reads as 0.
+ */
+int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long chunk = bit / BIT_RADIX_BITS_PER_ARRAY;
+	int offset = bit % BIT_RADIX_BITS_PER_ARRAY;
+	unsigned long *words = radix_tree_lookup(radix, chunk);
+
+	if (!words)
+		return 0;
+	return test_bit(offset, words + 1);
+}
+
+/*
+ * Clear @bit.  When that leaves the whole chunk empty, the chunk is removed
+ * from the tree and freed (it came from kmalloc() in set_radix_bit()).
+ * Clearing a bit whose chunk does not exist is a no-op.  Always returns 0.
+ */
+int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
+{
+	unsigned long slot = bit / BIT_RADIX_BITS_PER_ARRAY;
+	int bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
+	unsigned long *bits;
+	int i;
+
+	bits = radix_tree_lookup(radix, slot);
+	if (!bits)
+		return 0;
+	clear_bit(bit_slot, bits + 1);
+
+	/* bits[0] is the chunk index, so only words 1.. are bitmap */
+	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
+		if (bits[i])
+			return 0;
+	}
+
+	/* nothing left set: unhook the chunk and give the memory back */
+	bits = radix_tree_delete(radix, slot);
+	BUG_ON(!bits);
+	kfree(bits);
+	return 0;
+}
+
+/*
+ * Store up to @nr set bit numbers into @retbits and return how many were
+ * stored.  A single gang lookup from index 0 is done, so at most
+ * ARRAY_SIZE(gang) chunks are examined per call.
+ */
+int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
+			 int nr)
+{
+	unsigned long *gang[4];
+	unsigned long *bits;
+	int total_found = 0;
+	int found;
+	int ret;
+	int i;
+
+	ret = radix_tree_gang_lookup(radix, (void *)&gang, 0, ARRAY_SIZE(gang));
+	for (i = 0; i < ret && nr > 0; i++) {
+		bits = gang[i];
+		for (found = 0; nr > 0; found++) {
+			found = find_next_bit(bits + 1,
+					      BIT_RADIX_BITS_PER_ARRAY, found);
+			if (found >= BIT_RADIX_BITS_PER_ARRAY)
+				break;
+			/* bits[0] is the chunk index: rebuild the bit number */
+			*retbits++ = bits[0] * BIT_RADIX_BITS_PER_ARRAY + found;
+			total_found++;
+			nr--;
+		}
+	}
+	return total_found;
+}
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
new file mode 100644
index 00000000000..56aad4c7d7f
--- /dev/null
+++ b/fs/btrfs/bit-radix.h
@@ -0,0 +1,15 @@
+#ifndef __BIT_RADIX__
+#define __BIT_RADIX__
+#include <linux/radix-tree.h>
+
+int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
+int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
+			 int nr);
+/* Set up an empty bit radix; the tree is initialised with GFP_NOFS. */
+static inline void init_bit_radix(struct radix_tree_root *radix)
+{
+	INIT_RADIX_TREE(radix, GFP_NOFS);
+}
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 68cafae6a85..0aa1052d9f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,8 +1,8 @@
 #ifndef __BTRFS__
 #define __BTRFS__
 
-#include <linux/radix-tree.h>
 #include <linux/fs.h>
+#include "bit-radix.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -222,6 +222,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
+	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	u64 last_inode_alloc;
 	u64 last_inode_alloc_dirid;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e32ddff55b0..758a62aba06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -104,7 +104,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	/* FIXME: don't be stupid */
 	if (!btrfs_super_root(disk_super))
 		return NULL;
-	INIT_RADIX_TREE(&fs_info->pinned_radix, GFP_KERNEL);
+	init_bit_radix(&fs_info->pinned_radix);
+	init_bit_radix(&fs_info->pending_del_radix);
 	fs_info->running_transaction = NULL;
 	fs_info->fs_root = root;
 	fs_info->tree_root = tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 369b960fce4..b14104276ee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,5 +1,4 @@
 #include <linux/module.h>
-#include <linux/radix-tree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -12,15 +11,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
-/*
- * pending extents are blocks that we're trying to allocate in the extent
- * map while trying to grow the map because of other allocations.  To avoid
- * recursing, they are tagged in the radix tree and cleaned up after
- * other allocations are done.  The pending tag is also used in the same
- * manner for deletes.
- */
-#define CTREE_EXTENT_PENDING_DEL 0
-#define CTREE_EXTENT_PINNED 1
 
 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, u64 blocknr)
@@ -104,24 +94,21 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
-	struct buffer_head *gang[8];
+	unsigned long gang[8];
 	u64 first = 0;
 	int ret;
 	int i;
+	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
 
 	while(1) {
-		ret = radix_tree_gang_lookup_tag(&root->fs_info->pinned_radix,
-					     (void **)gang, 0,
-					     ARRAY_SIZE(gang),
-					     CTREE_EXTENT_PINNED);
+		ret = find_first_radix_bit(pinned_radix, gang,
+					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
 		if (!first)
-			first = gang[0]->b_blocknr;
+			first = gang[0];
 		for (i = 0; i < ret; i++) {
-			radix_tree_delete(&root->fs_info->pinned_radix,
-					  gang[i]->b_blocknr);
-			brelse(gang[i]);
+			clear_radix_bit(pinned_radix, gang[i]);
 		}
 	}
 	if (root->fs_info->last_insert.objectid > first)
@@ -161,29 +148,27 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	return 0;
 }
 
-static int pin_down_block(struct btrfs_root *root, u64 blocknr, int tag)
+static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 {
 	int err;
-	struct buffer_head *bh = sb_getblk(root->fs_info->sb, blocknr);
 	struct btrfs_header *header;
-	BUG_ON(!bh);
-
-	header = btrfs_buffer_header(bh);
-	if (btrfs_header_generation(header) ==
-	    root->fs_info->running_transaction->transid) {
-		return 0;
-	}
-
-	err = radix_tree_insert(&root->fs_info->pinned_radix,
-				blocknr, bh);
-	if (err && err != -EEXIST) {
-		BUG();
-		return err;
-	}
-	if (err == -EEXIST)
+	struct buffer_head *bh;
+
+	bh = sb_find_get_block(root->fs_info->sb, blocknr);
+	if (bh) {
+		header = btrfs_buffer_header(bh);
+		if (btrfs_header_generation(header) ==
+		    root->fs_info->running_transaction->transid) {
+			brelse(bh);
+			return 0;
+		}
 		brelse(bh);
-	radix_tree_tag_set(&root->fs_info->pinned_radix, blocknr,
-			   tag);
+	}
+	if (pending)
+		err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr);
+	else
+		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
+	BUG_ON(err);
 	return 0;
 }
 
@@ -225,8 +210,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		u64 super_blocks_used;
 
 		if (pin) {
-			ret = pin_down_block(root, blocknr,
-					     CTREE_EXTENT_PINNED);
+			ret = pin_down_block(root, blocknr, 0);
 			BUG_ON(ret);
 		}
 
@@ -255,25 +239,26 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int ret;
 	int wret;
 	int err = 0;
-	struct buffer_head *gang[4];
+	unsigned long gang[4];
 	int i;
-	struct radix_tree_root *radix = &extent_root->fs_info->pinned_radix;
+	struct radix_tree_root *pending_radix;
+	struct radix_tree_root *pinned_radix;
+
+	pending_radix = &extent_root->fs_info->pending_del_radix;
+	pinned_radix = &extent_root->fs_info->pinned_radix;
 
 	while(1) {
-		ret = radix_tree_gang_lookup_tag(
-					&extent_root->fs_info->pinned_radix,
-					(void **)gang, 0,
-					ARRAY_SIZE(gang),
-					CTREE_EXTENT_PENDING_DEL);
+		ret = find_first_radix_bit(pending_radix, gang,
+					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			radix_tree_tag_set(radix, gang[i]->b_blocknr,
-					   CTREE_EXTENT_PINNED);
-			radix_tree_tag_clear(radix, gang[i]->b_blocknr,
-					     CTREE_EXTENT_PENDING_DEL);
+			wret = set_radix_bit(pinned_radix, gang[i]);
+			BUG_ON(wret);
+			wret = clear_radix_bit(pending_radix, gang[i]);
+			BUG_ON(wret);
 			wret = __free_extent(trans, extent_root,
-					     gang[i]->b_blocknr, 1, 0);
+					     gang[i], 1, 0);
 			if (wret)
 				err = wret;
 		}
@@ -294,7 +279,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (root == extent_root) {
 		t = find_tree_block(root, blocknr);
-		pin_down_block(root, blocknr, CTREE_EXTENT_PENDING_DEL);
+		pin_down_block(root, blocknr, 1);
 		return 0;
 	}
 	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
@@ -393,7 +378,7 @@ check_pending:
 	BUG_ON(ins->objectid < search_start);
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + total_needed; test_block++) {
-		if (radix_tree_lookup(&root->fs_info->pinned_radix,
+		if (test_radix_bit(&root->fs_info->pinned_radix,
 				      test_block)) {
 			search_start = test_block + 1;
 			goto check_failed;
-- 
cgit v1.2.3


From b52d63fecef89191ed52ab223e9b3b9bd7d615a6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Mar 2007 11:01:09 -0400
Subject: Btrfs: TODO update

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 4b5bd05de94..454028b6bd4 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,11 +8,8 @@
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
-* Only pin blocks allocated in this transaction
-* Add transaction handles
 * Do actual block accounting
 * Check compat and incompat flags on the inode
-* Port into the kernel
 * Add virtual filesystems, mountable snapshots
 * Get rid of struct ctree_path, limiting tree levels held at one time
 * Release
-- 
cgit v1.2.3


From 4730a4bc5bf3861d2cb40999374b3ac0ee2a1e0d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Mar 2007 12:00:39 -0400
Subject: btrfs_dirty_inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 820d21ee4af..905b093a85f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -242,6 +242,7 @@ static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
 	}
 
 }
+
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -392,6 +393,51 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_inode_generation(item, inode->i_generation);
 }
 
+/* Copy the in-memory inode into its inode item and dirty that leaf.
+ * Returns 0 on success; a failed lookup is passed back to the caller,
+ * with a positive lookup result turned into -ENOENT. */
+static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_path path;
+	int ret;
+
+	btrfs_init_path(&path);
+	ret = btrfs_lookup_inode(trans, root, &path, inode->i_ino, 1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto failed;
+	}
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				  path.slots[0],
+				  struct btrfs_inode_item);
+	fill_inode_item(inode_item, inode);
+	mark_buffer_dirty(path.nodes[0]);
+failed:
+	btrfs_release_path(root, &path);
+	return ret;	/* report lookup failures instead of masking them */
+}
+/* ->write_inode: update the inode item; commit the transaction when @wait. */
+static int btrfs_write_inode(struct inode *inode, int wait)
+{
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_update_inode(trans, root, inode);
+	if (wait)
+		btrfs_commit_transaction(trans, root);
+	else
+		btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct inode *dir, int mode)
 {
@@ -415,9 +461,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_ino = objectid;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
-	/* FIXME do this on link */
-	if (mode & S_IFDIR)
-		inode->i_size = 3;
 	fill_inode_item(&inode_item, inode);
 
 	key.objectid = objectid;
@@ -431,7 +474,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	insert_inode_hash(inode);
-	// FIXME mark_inode_dirty(inode)
 	return inode;
 }
 
@@ -443,6 +485,12 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
 				    inode->i_ino, 0);
+	if (ret == 0) {
+		dentry->d_parent->d_inode->i_size += dentry->d_name.len;
+		ret = btrfs_update_inode(trans, btrfs_sb(inode->i_sb),
+					 dentry->d_parent->d_inode);
+	}
+
 	return ret;
 }
 
@@ -502,6 +550,10 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 		goto error;
 	ret = btrfs_insert_dir_item(trans, root, buf, 2, inode->i_ino,
 				    dir->i_ino, 1);
+	if (ret)
+		goto error;
+	inode->i_size = 3;
+	ret = btrfs_update_inode(trans, root, inode);
 error:
 	return ret;
 }
@@ -536,7 +588,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 	d_instantiate(dentry, inode);
-	mark_inode_dirty(inode);
 	drop_on_err = 0;
 
 out_fail:
@@ -600,6 +651,7 @@ static struct super_operations btrfs_super_ops = {
 	.read_inode	= btrfs_read_locked_inode,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
+	.write_inode	= btrfs_write_inode,
 };
 
 static struct inode_operations btrfs_dir_inode_operations = {
-- 
cgit v1.2.3


From dee26a9f7aab7ffe1193cd1415b23a69426acc9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Mar 2007 16:00:06 -0400
Subject: btrfs_get_block, file read/write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   1 +
 fs/btrfs/ctree.h       |  26 ++++++-
 fs/btrfs/extent-tree.c |   4 +-
 fs/btrfs/file-item.c   |  52 +++++++++++++-
 fs/btrfs/super.c       | 188 ++++++++++++++++++++++++++++++++++++++++---------
 5 files changed, 230 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 454028b6bd4..2ca301b289a 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,6 +8,7 @@
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
+* Make directory hashing work on 32 bit
 * Do actual block accounting
 * Check compat and incompat flags on the inode
 * Add virtual filesystems, mountable snapshots
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0aa1052d9f6..7b7120d3ab4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,7 +198,7 @@ struct btrfs_file_extent_item {
 	__le64 disk_blocknr;
 	__le64 disk_num_blocks;
 	/*
-	 * the logical offset in file bytes (no csums)
+	 * the logical offset in file blocks (no csums)
 	 * this extent record is for.  This allows a file extent to point
 	 * into the middle of an existing extent on disk, sharing it
 	 * between two snapshots (useful if some bytes in the middle of the
@@ -812,12 +812,19 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
+/* extent-item.c */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+			*root, u64 num_blocks, u64 search_start, u64
+			search_end, u64 owner, struct btrfs_key *ins);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
+			       btrfs_root *root);
+/* ctree.c */
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
@@ -834,8 +841,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root, struct buffer_head *snap);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *root);
+/* root-item.c */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -846,6 +852,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
+/* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir, u64
 			  objectid, u8 type);
@@ -854,6 +861,7 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  const char *name, int name_len, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      const char *name, int name_len);
+/* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid);
@@ -863,9 +871,21 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_path *path,
 			   u64 objectid, int mod);
+/* inode-item.c */
 int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, u64 objectid, struct btrfs_inode_item
 		       *inode_item);
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, struct btrfs_path *path, u64 objectid, int mod);
+
+/* file-item.c */
+int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       u64 objectid, u64 offset,
+			       u64 num_blocks, u64 hint_block,
+			       u64 *result);
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid,
+			     u64 blocknr, u64 num_blocks, int mod);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b14104276ee..82f6e9eed1d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -403,7 +403,7 @@ error:
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-static int alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root, u64 num_blocks, u64 search_start, u64
 			search_end, u64 owner, struct btrfs_key *ins)
 {
@@ -458,7 +458,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	int ret;
 	struct buffer_head *buf;
 
-	ret = alloc_extent(trans, root, 1, 0, (unsigned long)-1,
+	ret = btrfs_alloc_extent(trans, root, 1, 0, (unsigned long)-1,
 		btrfs_header_parentid(btrfs_buffer_header(root->node)), &ins);
 	if (ret) {
 		BUG();
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8e1e5b4ccfa..4a0367d702b 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1,9 +1,57 @@
 #include <linux/module.h>
 #include "ctree.h"
+#include "disk-io.h"
 #include "transaction.h"
 
-int btrfs_create_file(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, u64 dirid, u64 *objectid)
+int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       u64 objectid, u64 offset,
+			       u64 num_blocks, u64 hint_block,
+			       u64 *result)
 {
+	struct btrfs_key ins;
+	int ret = 0;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_key file_key;
+	struct btrfs_path path;
+
+	btrfs_init_path(&path);
+	ret = btrfs_alloc_extent(trans, root, num_blocks, hint_block,
+				 (u64)-1, objectid, &ins);
+	BUG_ON(ret);
+	file_key.objectid = objectid;
+	file_key.offset = offset;
+	file_key.flags = 0;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+
+	ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
+				      sizeof(*item));
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			      struct btrfs_file_extent_item);
+	btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
+	btrfs_set_file_extent_disk_num_blocks(item, ins.offset);
+	btrfs_set_file_extent_offset(item, 0);
+	btrfs_set_file_extent_num_blocks(item, ins.offset);
+	mark_buffer_dirty(path.nodes[0]);
+	*result = ins.objectid;
+	btrfs_release_path(root, &path);
 	return 0;
 }
+
+int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid,
+			     u64 blocknr, u64 num_blocks, int mod)
+{
+	int ret;
+	struct btrfs_key file_key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	file_key.objectid = objectid;
+	file_key.offset = blocknr;
+	file_key.flags = 0;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+	return ret;
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 905b093a85f..2c2883f2856 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -8,6 +8,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
+#include <linux/mpage.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -17,6 +18,9 @@
 static struct inode_operations btrfs_dir_inode_operations;
 static struct super_operations btrfs_super_ops;
 static struct file_operations btrfs_dir_file_operations;
+static struct inode_operations btrfs_file_inode_operations;
+static struct address_space_operations btrfs_aops;
+static struct file_operations btrfs_file_operations;
 
 static void btrfs_read_locked_inode(struct inode *inode)
 {
@@ -57,6 +61,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		break;
 #endif
 	case S_IFREG:
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
 		break;
 	case S_IFDIR:
 		inode->i_op = &btrfs_dir_inode_operations;
@@ -214,35 +221,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	return d_splice_alias(inode, dentry);
 }
 
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
-{
-	struct buffer_head *bh;
-	struct btrfs_node *node;
-	int i;
-	int nritems;
-	u64 objectid;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1];
-	objectid = btrfs_disk_key_objectid(&node->ptrs[slot].key);
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot; i < nritems; i++) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid != objectid)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		bh = sb_getblk(root->fs_info->sb, blocknr);
-		ll_rw_block(READ, 1, &bh);
-		brelse(bh);
-	}
-
-}
-
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -269,21 +247,18 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err;
 	}
 	advance = 0;
-	reada_leaves(root, &path);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path.nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
 		slot = path.slots[0];
-		if (advance) {
-			if (slot == nritems -1) {
+		if (advance || slot >= nritems) {
+			if (slot >= nritems -1) {
 				ret = btrfs_next_leaf(root, &path);
 				if (ret)
 					break;
 				leaf = btrfs_buffer_leaf(path.nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path.slots[0];
-				if (path.nodes[1] && path.slots[1] == 0)
-					reada_leaves(root, &path);
 			} else {
 				slot++;
 				path.slots[0]++;
@@ -297,6 +272,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
+
+		advance = 1;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
@@ -524,6 +501,11 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	err = btrfs_add_nondir(trans, dentry, inode);
 	if (err)
 		drop_inode = 1;
+	else {
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_end_transaction(trans, root);
 out_unlock:
@@ -623,11 +605,124 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
+static int btrfs_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	struct btrfs_path path;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+
+	btrfs_init_path(&path);
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (create)
+		trans = btrfs_start_transaction(root, 1);
+
+
+	ret = btrfs_lookup_file_extent(trans, root, &path,
+				       inode->i_ino, iblock, 1, 0);
+	if (ret < 0) {
+		btrfs_release_path(root, &path);
+		err = ret;
+		goto out;
+	}
+
+	if (ret != 0) {
+		if (path.slots[0] == 0) {
+			btrfs_release_path(root, &path);
+			goto allocate;
+		}
+		path.slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path.nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* exact match found, use it */
+	if (ret == 0) {
+		err = 0;
+		map_bh(result, inode->i_sb, blocknr);
+		btrfs_release_path(root, &path);
+		goto out;
+	}
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path.slots[0]].key;
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY) {
+		extent_end = 0;
+		extent_start = 0;
+		btrfs_release_path(root, &path);
+		goto allocate;
+	}
+
+	extent_start = btrfs_disk_key_offset(&leaf->items[path.slots[0]].key);
+	extent_start += btrfs_file_extent_offset(item);
+	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
+	btrfs_release_path(root, &path);
+	if (iblock >= extent_start && iblock < extent_end) {
+		err = 0;
+		map_bh(result, inode->i_sb, blocknr + iblock - extent_start);
+		goto out;
+	}
+allocate:
+	/* ok, create a new extent */
+	if (!create) {
+		err = 0;
+		goto out;
+	}
+	ret = btrfs_alloc_file_extent(trans, root, objectid, iblock,
+				      1, extent_end, &blocknr);
+	if (ret) {
+		err = ret;
+		goto out;
+	}
+	map_bh(result, inode->i_sb, blocknr);
+
+out:
+	if (trans)
+		btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return err;
+}
+
+static int btrfs_prepare_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	return block_prepare_write(page, from, to, btrfs_get_block);
+}
+
 static void btrfs_write_super(struct super_block *sb)
 {
 	btrfs_sync_fs(sb, 1);
 }
 
+static int btrfs_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, btrfs_get_block);
+}
+
+static int btrfs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, btrfs_get_block);
+}
+
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, btrfs_get_block, wbc);
+}
 
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
@@ -667,6 +762,31 @@ static struct file_operations btrfs_dir_file_operations = {
 	.readdir	= btrfs_readdir,
 };
 
+static struct address_space_operations btrfs_aops = {
+	.readpage	= btrfs_readpage,
+	.readpages	= btrfs_readpages,
+	.writepage	= btrfs_writepage,
+	.sync_page	= block_sync_page,
+	.prepare_write	= btrfs_prepare_write,
+	.commit_write	= generic_commit_write,
+};
+
+static struct inode_operations btrfs_file_inode_operations = {
+	.truncate	= NULL,
+};
+
+static struct file_operations btrfs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.sendfile	= generic_file_sendfile,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
 
 static int __init init_btrfs_fs(void)
 {
-- 
cgit v1.2.3


From 6407bf6d7c449cbfb0a39d985194e265eda3baf4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 06:33:00 -0400
Subject: Btrfs: reference counts on data extents

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  3 ++
 fs/btrfs/ctree.h       |  9 +++---
 fs/btrfs/extent-tree.c | 87 ++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/inode-map.c   |  1 +
 fs/btrfs/super.c       |  6 ++--
 5 files changed, 82 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 2ca301b289a..7aec75e9a37 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,7 +8,10 @@
 * Add block mapping tree (simple dm layer)
 * Add simple tree locking (semaphore per tree)
 * Make allocator smarter
+* make level a field in header
+* add a block group to struct inode
 * Make directory hashing work on 32 bit
+* Make sure nobh stuff is working properly for cows
 * Do actual block accounting
 * Check compat and incompat flags on the inode
 * Add virtual filesystems, mountable snapshots
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7b7120d3ab4..1a98952e0fa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -9,10 +9,11 @@ struct btrfs_transaction;
 
 #define BTRFS_MAGIC "_BtRfS_M"
 
-#define BTRFS_ROOT_TREE_OBJECTID 1
-#define BTRFS_EXTENT_TREE_OBJECTID 2
-#define BTRFS_INODE_MAP_OBJECTID 3
-#define BTRFS_FS_TREE_OBJECTID 4
+#define BTRFS_ROOT_TREE_OBJECTID 1ULL
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+#define BTRFS_INODE_MAP_OBJECTID 3ULL
+#define BTRFS_FS_TREE_OBJECTID 4ULL
+#define BTRFS_FIRST_FREE_OBJECTID 5ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 82f6e9eed1d..4d4fc48c0a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -13,7 +13,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr)
+			 *root, u64 blocknr, u64 num_blocks)
 {
 	struct btrfs_path path;
 	int ret;
@@ -29,7 +29,7 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = 1;
+	key.offset = num_blocks;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
 				0, 1);
 	if (ret != 0)
@@ -48,7 +48,7 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *root, u64 blocknr, u32 *refs)
+			    *root, u64 blocknr, u64 num_blocks, u32 *refs)
 {
 	struct btrfs_path path;
 	int ret;
@@ -57,7 +57,7 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item *item;
 	btrfs_init_path(&path);
 	key.objectid = blocknr;
-	key.offset = 1;
+	key.offset = num_blocks;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
@@ -76,17 +76,34 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	u64 blocknr;
 	struct btrfs_node *buf_node;
+	struct btrfs_leaf *buf_leaf;
+	struct btrfs_disk_key *key;
+	struct btrfs_file_extent_item *fi;
 	int i;
+	int leaf;
+	int ret;
 
 	if (!root->ref_cows)
 		return 0;
 	buf_node = btrfs_buffer_node(buf);
-	if (btrfs_is_leaf(buf_node))
-		return 0;
-
+	leaf = btrfs_is_leaf(buf_node);
+	buf_leaf = btrfs_buffer_leaf(buf);
 	for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) {
-		blocknr = btrfs_node_blockptr(buf_node, i);
-		inc_block_ref(trans, root, blocknr);
+		if (leaf) {
+			key = &buf_leaf->items[i].key;
+			if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf_leaf, i,
+					    struct btrfs_file_extent_item);
+			ret = inc_block_ref(trans, root,
+				    btrfs_file_extent_disk_blocknr(fi),
+				    btrfs_file_extent_disk_num_blocks(fi));
+			BUG_ON(ret);
+		} else {
+			blocknr = btrfs_node_blockptr(buf_node, i);
+			ret = inc_block_ref(trans, root, blocknr, 1);
+			BUG_ON(ret);
+		}
 	}
 	return 0;
 }
@@ -469,6 +486,37 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
+static int drop_leaf_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct buffer_head *cur)
+{
+	struct btrfs_disk_key *key;
+	struct btrfs_leaf *leaf;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int nritems;
+	int ret;
+
+	BUG_ON(!btrfs_is_leaf(btrfs_buffer_node(cur)));
+	leaf = btrfs_buffer_leaf(cur);
+	nritems = btrfs_header_nritems(&leaf->header);
+	for (i = 0; i < nritems; i++) {
+		key = &leaf->items[i].key;
+		if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		/*
+		 * FIXME make sure to insert a trans record that
+		 * repeats the snapshot del on crash
+		 */
+		ret = btrfs_free_extent(trans, root,
+					btrfs_file_extent_disk_blocknr(fi),
+					btrfs_file_extent_disk_num_blocks(fi),
+					0);
+		BUG_ON(ret);
+	}
+	return 0;
+}
+
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -483,28 +531,33 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 refs;
 
 	ret = lookup_block_ref(trans, root, path->nodes[*level]->b_blocknr,
-			       &refs);
+			       1, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
 	/*
 	 * walk down to the last node level and free all the leaves
 	 */
-	while(*level > 0) {
+	while(*level >= 0) {
 		cur = path->nodes[*level];
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(btrfs_buffer_header(cur)))
 			break;
+		if (*level == 0) {
+			ret = drop_leaf_ref(trans, root, cur);
+			BUG_ON(ret);
+			break;
+		}
 		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
 					      path->slots[*level]);
-		ret = lookup_block_ref(trans, root, blocknr, &refs);
-		if (refs != 1 || *level == 1) {
+		ret = lookup_block_ref(trans, root, blocknr, 1, &refs);
+		BUG_ON(ret);
+		if (refs != 1) {
 			path->slots[*level]++;
 			ret = btrfs_free_extent(trans, root, blocknr, 1, 1);
 			BUG_ON(ret);
 			continue;
 		}
-		BUG_ON(ret);
 		next = read_tree_block(root, blocknr);
 		if (path->nodes[*level-1])
 			btrfs_block_release(root, path->nodes[*level-1]);
@@ -513,8 +566,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		path->slots[*level] = 0;
 	}
 out:
-	ret = btrfs_free_extent(trans, root, path->nodes[*level]->b_blocknr,
-				1, 1);
+	ret = btrfs_free_extent(trans, root,
+				path->nodes[*level]->b_blocknr, 1, 1);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -544,10 +597,10 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->b_blocknr,
 						1, 1);
+			BUG_ON(ret);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-			BUG_ON(ret);
 		}
 	}
 	return 1;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ad2d375b830..1b2c5e04350 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -25,6 +25,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	if (fs_root->fs_info->last_inode_alloc_dirid == dirid)
 		search_start = fs_root->fs_info->last_inode_alloc;
 
+	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.flags = 0;
 	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2c2883f2856..487da5a213c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -700,7 +700,7 @@ out:
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	return block_prepare_write(page, from, to, btrfs_get_block);
+	return nobh_prepare_write(page, from, to, btrfs_get_block);
 }
 
 static void btrfs_write_super(struct super_block *sb)
@@ -721,7 +721,7 @@ static int btrfs_readpages(struct file *file, struct address_space *mapping,
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return block_write_full_page(page, btrfs_get_block, wbc);
+	return nobh_writepage(page, btrfs_get_block, wbc);
 }
 
 static int btrfs_get_sb(struct file_system_type *fs_type,
@@ -768,7 +768,7 @@ static struct address_space_operations btrfs_aops = {
 	.writepage	= btrfs_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
-	.commit_write	= generic_commit_write,
+	.commit_write	= nobh_commit_write,
 };
 
 static struct inode_operations btrfs_file_inode_operations = {
-- 
cgit v1.2.3


From 9a6f11ed8f421fb1cc7b37390e32316ff4701f5d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 09:06:38 -0400
Subject: Btrfs: split out level field in struct header

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO    | 5 -----
 fs/btrfs/ctree.c | 1 +
 fs/btrfs/ctree.h | 8 +++-----
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 7aec75e9a37..e9371385ccc 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,12 +1,7 @@
 * cleanup, add more error checking, get rid of BUG_ONs
-* Make IO functions look more like the page cache
 * Fix ENOSPC handling
-* make a real mkfs and superblock
 * Do checksumming
-* Use a real check instead of mark_buffer_dirty
-* Define FS objects in terms of different item types
 * Add block mapping tree (simple dm layer)
-* Add simple tree locking (semaphore per tree)
 * Make allocator smarter
 * make level a field in header
 * add a block group to struct inode
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b33a6bfaf32..48c611948d1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -753,6 +753,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	split_buffer = btrfs_alloc_free_block(trans, root);
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
+	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
 	btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr);
 	btrfs_set_header_generation(&split->header, trans->transid);
 	btrfs_set_header_parentid(&split->header,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1a98952e0fa..a4ad39b58a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -59,7 +59,7 @@ struct btrfs_header {
 	__le32 ham;
 	__le16 nritems;
 	__le16 flags;
-	/* generation flags to be added */
+	u8 level;
 } __attribute__ ((__packed__));
 
 #define BTRFS_MAX_LEVEL 8
@@ -648,15 +648,13 @@ static inline void btrfs_set_header_flags(struct btrfs_header *h, u16 val)
 
 static inline int btrfs_header_level(struct btrfs_header *h)
 {
-	return btrfs_header_flags(h) & (BTRFS_MAX_LEVEL - 1);
+	return h->level;
 }
 
 static inline void btrfs_set_header_level(struct btrfs_header *h, int level)
 {
-	u16 flags;
 	BUG_ON(level > BTRFS_MAX_LEVEL);
-	flags = btrfs_header_flags(h) & ~(BTRFS_MAX_LEVEL - 1);
-	btrfs_set_header_flags(h, flags | level);
+	h->level = level;
 }
 
 static inline int btrfs_is_leaf(struct btrfs_node *n)
-- 
cgit v1.2.3


From 71951f35a6e413f2bfbd41829af8cf10a890aeb6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 09:16:29 -0400
Subject: Btrfs: add generation field to file extent

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     | 13 +++++++++++++
 fs/btrfs/file-item.c |  1 +
 2 files changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a4ad39b58a4..c3fa12a6b59 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -192,6 +192,7 @@ struct btrfs_root_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_file_extent_item {
+	__le64 generation;
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -764,6 +765,18 @@ static inline void btrfs_set_file_extent_disk_blocknr(struct
 	e->disk_blocknr = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_file_extent_generation(struct btrfs_file_extent_item *e)
+{
+	return le64_to_cpu(e->generation);
+}
+
+static inline void btrfs_set_file_extent_generation(struct
+						    btrfs_file_extent_item *e,
+						    u64 val)
+{
+	e->generation = cpu_to_le64(val);
+}
+
 static inline u64 btrfs_file_extent_disk_num_blocks(struct
 						    btrfs_file_extent_item *e)
 {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4a0367d702b..09de270c5a5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -32,6 +32,7 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_disk_num_blocks(item, ins.offset);
 	btrfs_set_file_extent_offset(item, 0);
 	btrfs_set_file_extent_num_blocks(item, ins.offset);
+	btrfs_set_file_extent_generation(item, trans->transid);
 	mark_buffer_dirty(path.nodes[0]);
 	*result = ins.objectid;
 	btrfs_release_path(root, &path);
-- 
cgit v1.2.3


From f4b9aa8d3b877d0a6044a6d6d9a44b29cab9e265 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 11:05:53 -0400
Subject: btrfs_truncate

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  25 ++++++------
 fs/btrfs/super.c       | 107 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 114 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4d4fc48c0a3..176c6dc534b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -171,20 +171,21 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 	struct btrfs_header *header;
 	struct buffer_head *bh;
 
-	bh = sb_find_get_block(root->fs_info->sb, blocknr);
-	if (bh) {
-		header = btrfs_buffer_header(bh);
-		if (btrfs_header_generation(header) ==
-		    root->fs_info->running_transaction->transid) {
+	if (!pending) {
+		bh = sb_find_get_block(root->fs_info->sb, blocknr);
+		if (bh) {
+			header = btrfs_buffer_header(bh);
+			if (btrfs_header_generation(header) ==
+			    root->fs_info->running_transaction->transid) {
+				brelse(bh);
+				return 0;
+			}
 			brelse(bh);
-			return 0;
 		}
-		brelse(bh);
-	}
-	if (pending)
-		err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr);
-	else
 		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
+	} else {
+		err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr);
+	}
 	BUG_ON(err);
 	return 0;
 }
@@ -223,6 +224,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
+	mark_buffer_dirty(path.nodes[0]);
 	if (refs == 0) {
 		u64 super_blocks_used;
 
@@ -240,7 +242,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (ret)
 			BUG();
 	}
-	mark_buffer_dirty(path.nodes[0]);
 	btrfs_release_path(extent_root, &path);
 	finish_current_insert(trans, extent_root);
 	return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 487da5a213c..fd3d9d616ff 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,11 +28,15 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	int ret;
+
 	btrfs_init_path(&path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
 	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
 	if (ret) {
-		make_bad_inode(inode);
 		btrfs_release_path(root, &path);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		make_bad_inode(inode);
 		return;
 	}
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
@@ -53,6 +57,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
 	btrfs_release_path(root, &path);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	switch (inode->i_mode & S_IFMT) {
 #if 0
 	default:
@@ -151,20 +156,85 @@ error:
 	return ret;
 }
 
+static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct inode *inode)
+{
+	int ret;
+	struct btrfs_path path;
+	struct btrfs_key key;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_leaf *leaf;
+	struct btrfs_file_extent_item *fi;
+	u64 extent_start;
+	u64 extent_num_blocks;
+
+	/* FIXME, add redo link to tree so we don't leak on crash */
+	key.objectid = inode->i_ino;
+	key.offset = (u64)-1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	while(1) {
+		btrfs_init_path(&path);
+		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+		if (ret < 0) {
+			btrfs_release_path(root, &path);
+			goto error;
+		}
+		if (ret > 0) {
+			BUG_ON(path.slots[0] == 0);
+			path.slots[0]--;
+		}
+		leaf = btrfs_buffer_leaf(path.nodes[0]);
+		found_key = &leaf->items[path.slots[0]].key;
+		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
+			break;
+		if (btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
+			break;
+		if (btrfs_disk_key_offset(found_key) < inode->i_size)
+			break;
+		/* FIXME: add extent truncation */
+		if (btrfs_disk_key_offset(found_key) < inode->i_size)
+			break;
+		fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+				    path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_start = btrfs_file_extent_disk_blocknr(fi);
+		extent_num_blocks = btrfs_file_extent_disk_num_blocks(fi);
+		key.offset = btrfs_disk_key_offset(found_key) - 1;
+		ret = btrfs_del_item(trans, root, &path);
+		BUG_ON(ret);
+		inode->i_blocks -= btrfs_file_extent_num_blocks(fi) >> 9;
+		btrfs_release_path(root, &path);
+		ret = btrfs_free_extent(trans, root, extent_start,
+					extent_num_blocks, 0);
+		BUG_ON(ret);
+		if (btrfs_disk_key_offset(found_key) == 0)
+			break;
+	}
+	btrfs_release_path(root, &path);
+	ret = 0;
+error:
+	return ret;
+}
+
 static void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	int ret;
+
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
 	}
 	inode->i_size = 0;
-	if (inode->i_blocks)
-		WARN_ON(1);
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	if (S_ISREG(inode->i_mode)) {
+		ret = btrfs_truncate_in_trans(trans, root, inode);
+		BUG_ON(ret);
+	}
 	btrfs_free_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -173,7 +243,6 @@ no_delete:
 	clear_inode(inode);
 }
 
-
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 			      ino_t *ino)
 {
@@ -688,6 +757,8 @@ allocate:
 		err = ret;
 		goto out;
 	}
+	inode->i_blocks += inode->i_sb->s_blocksize >> 9;
+	set_buffer_new(result);
 	map_bh(result, inode->i_sb, blocknr);
 
 out:
@@ -724,6 +795,30 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return nobh_writepage(page, btrfs_get_block, wbc);
 }
 
+static void btrfs_truncate(struct inode *inode)
+{
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	nobh_truncate_page(inode->i_mapping, inode->i_size);
+
+	/* FIXME, add redo link to tree so we don't leak on crash */
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_truncate_in_trans(trans, root, inode);
+	BUG_ON(ret);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	mark_inode_dirty(inode);
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -772,7 +867,7 @@ static struct address_space_operations btrfs_aops = {
 };
 
 static struct inode_operations btrfs_file_inode_operations = {
-	.truncate	= NULL,
+	.truncate	= btrfs_truncate,
 };
 
 static struct file_operations btrfs_file_operations = {
-- 
cgit v1.2.3


From 9773a788681db1f5c2701b7433737fdca61a14ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 11:26:26 -0400
Subject: Btrfs: byte offsets for file keys

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     | 2 +-
 fs/btrfs/file-item.c | 5 +++--
 fs/btrfs/super.c     | 7 +++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c3fa12a6b59..1897f3a65b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -899,5 +899,5 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
-			     u64 blocknr, u64 num_blocks, int mod);
+			     u64 blocknr, int mod);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 09de270c5a5..5230a44cb19 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -26,6 +26,7 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
 				      sizeof(*item));
+	BUG_ON(ret);
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
 			      struct btrfs_file_extent_item);
 	btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
@@ -42,7 +43,7 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
-			     u64 blocknr, u64 num_blocks, int mod)
+			     u64 offset, int mod)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -50,7 +51,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	int cow = mod != 0;
 
 	file_key.objectid = objectid;
-	file_key.offset = blocknr;
+	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fd3d9d616ff..f2f08189903 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -697,7 +697,8 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 
 
 	ret = btrfs_lookup_file_extent(trans, root, &path,
-				       inode->i_ino, iblock, 1, 0);
+				       inode->i_ino,
+				       iblock << inode->i_blkbits, 0);
 	if (ret < 0) {
 		btrfs_release_path(root, &path);
 		err = ret;
@@ -737,6 +738,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	extent_start = btrfs_disk_key_offset(&leaf->items[path.slots[0]].key);
+	extent_start = extent_start >> inode->i_blkbits;
 	extent_start += btrfs_file_extent_offset(item);
 	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
 	btrfs_release_path(root, &path);
@@ -751,7 +753,8 @@ allocate:
 		err = 0;
 		goto out;
 	}
-	ret = btrfs_alloc_file_extent(trans, root, objectid, iblock,
+	ret = btrfs_alloc_file_extent(trans, root, objectid,
+				      iblock << inode->i_blkbits,
 				      1, extent_end, &blocknr);
 	if (ret) {
 		err = ret;
-- 
cgit v1.2.3


From 5f443fd280c76a2170259cebaa4108c9daad6e1b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Mar 2007 13:42:32 -0400
Subject: btrfs_rmdir

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 93 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2f08189903..67659b6ce96 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -81,11 +81,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	return;
 }
 
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *dir,
+			      struct dentry *dentry)
 {
 	struct btrfs_path path;
-	struct btrfs_root *root;
-	struct btrfs_trans_handle *trans;
 	const char *name = dentry->d_name.name;
 	int name_len = dentry->d_name.len;
 	int ret;
@@ -93,10 +94,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_dir_item *di;
 
 	btrfs_init_path(&path);
-	root = btrfs_sb(dir->i_sb);
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-
 	ret = btrfs_lookup_dir_item(trans, root, &path, dir->i_ino,
 				    name, name_len, -1);
 	if (ret < 0)
@@ -114,13 +111,98 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
 	btrfs_release_path(root, &path);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret == 0)
 		inode_dec_link_count(dentry->d_inode);
 	return ret;
 }
 
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_root *root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	root = btrfs_sb(dir->i_sb);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_path path;
+	struct btrfs_key key;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_leaf *leaf;
+
+	btrfs_init_path(&path);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = inode->i_ino;
+	key.offset = (u64)-1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	BUG_ON(ret == 0);
+	BUG_ON(path.slots[0] == 0);
+	path.slots[0]--;
+	leaf = btrfs_buffer_leaf(path.nodes[0]);
+	found_key = &leaf->items[path.slots[0]].key;
+	if (btrfs_disk_key_objectid(found_key) != inode->i_ino) {
+		err = -ENOENT;
+		goto out;
+	}
+	if (btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
+	    btrfs_disk_key_offset(found_key) != 2) {
+		err = -ENOTEMPTY;
+		goto out;
+	}
+	ret = btrfs_del_item(trans, root, &path);
+	BUG_ON(ret);
+	btrfs_release_path(root, &path);
+	key.offset = 1;
+	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret > 0) {
+		err = -ENOTEMPTY;
+		goto out;
+	}
+	ret = btrfs_del_item(trans, root, &path);
+	if (ret) {
+		err = ret;
+		goto out;
+	}
+	btrfs_release_path(root, &path);
+
+	/* now the directory is empty */
+	err = btrfs_unlink_trans(trans, root, dir, dentry);
+	if (!err) {
+		inode->i_size = 0;
+	}
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	ret = btrfs_end_transaction(trans, root);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
 static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct inode *inode)
@@ -191,9 +273,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			break;
 		if (btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 			break;
-		if (btrfs_disk_key_offset(found_key) < inode->i_size)
-			break;
-		/* FIXME: add extent truncation */
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
 			break;
 		fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
@@ -209,7 +288,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		ret = btrfs_free_extent(trans, root, extent_start,
 					extent_num_blocks, 0);
 		BUG_ON(ret);
-		if (btrfs_disk_key_offset(found_key) == 0)
+		if (key.offset + 1 == 0)
 			break;
 	}
 	btrfs_release_path(root, &path);
@@ -852,6 +931,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.create		= btrfs_create,
 	.unlink		= btrfs_unlink,
 	.mkdir		= btrfs_mkdir,
+	.rmdir		= btrfs_rmdir,
 };
 
 static struct file_operations btrfs_dir_file_operations = {
-- 
cgit v1.2.3


From d98237b3ede7ab98892f7fa62201a13694c526e2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Mar 2007 13:57:48 -0400
Subject: Btrfs: use a btree inode instead of sb_getblk

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/disk-io.c     | 174 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/disk-io.h     |   5 +-
 fs/btrfs/extent-tree.c |   8 +--
 fs/btrfs/super.c       |   8 ++-
 fs/btrfs/transaction.c |   2 +-
 6 files changed, 174 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1897f3a65b4..b2ebed756c1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -233,6 +233,7 @@ struct btrfs_fs_info {
 	struct btrfs_super_block *disk_super;
 	struct buffer_head *sb_buffer;
 	struct super_block *sb;
+	struct inode *btree_inode;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
 };
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 758a62aba06..970103f2cac 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,14 +1,17 @@
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/blkdev.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 
+
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
 	struct btrfs_node *node = btrfs_buffer_node(buf);
-	if (buf->b_blocknr != btrfs_header_blocknr(&node->header))
+	if (buf->b_blocknr != btrfs_header_blocknr(&node->header)) {
 		BUG();
+	}
 	if (root->node && btrfs_header_parentid(&node->header) !=
 	    btrfs_header_parentid(btrfs_buffer_header(root->node))) {
 		BUG();
@@ -16,25 +19,154 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 	return 0;
 }
 
-struct buffer_head *alloc_tree_block(struct btrfs_root *root, u64 blocknr)
+struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
+{
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	int blockbits = root->fs_info->sb->s_blocksize_bits;
+	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
+	struct page *page;
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct buffer_head *ret = NULL;
+
+	page = find_lock_page(mapping, index);
+	if (!page)
+		return NULL;
+
+	if (!page_has_buffers(page))
+		goto out_unlock;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_mapped(bh) && bh->b_blocknr == blocknr) {
+			ret = bh;
+			get_bh(bh);
+			goto out_unlock;
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
+						 u64 blocknr)
+{
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	int blockbits = root->fs_info->sb->s_blocksize_bits;
+	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
+	struct page *page;
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct buffer_head *ret = NULL;
+	u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits);
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		return NULL;
+
+	wait_on_page_writeback(page);
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, root->fs_info->sb->s_blocksize, 0);
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (!buffer_mapped(bh)) {
+			bh->b_bdev = root->fs_info->sb->s_bdev;
+			bh->b_blocknr = first_block;
+			set_buffer_mapped(bh);
+		}
+		if (bh->b_blocknr == blocknr) {
+			ret = bh;
+			get_bh(bh);
+			goto out_unlock;
+		}
+		bh = bh->b_this_page;
+		first_block++;
+	} while (bh != head);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return ret;
+}
+
+static sector_t max_block(struct block_device *bdev)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int size = block_size(bdev);
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
+static int btree_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int create)
+{
+	if (iblock >= max_block(inode->i_sb->s_bdev)) {
+		if (create)
+			return -EIO;
+
+		/*
+		 * for reads, we're just trying to fill a partial page.
+		 * return a hole, they will have to call get_block again
+		 * before they can fill it, and they will get -EIO at that
+		 * time
+		 */
+		return 0;
+	}
+	bh->b_bdev = inode->i_sb->s_bdev;
+	bh->b_blocknr = iblock;
+	set_buffer_mapped(bh);
+	return 0;
+}
+
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return sb_getblk(root->fs_info->sb, blocknr);
+	return block_write_full_page(page, btree_get_block, wbc);
 }
 
-struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr)
+static int btree_readpage(struct file * file, struct page * page)
 {
-	return sb_getblk(root->fs_info->sb, blocknr);
+	return block_read_full_page(page, btree_get_block);
 }
 
+static struct address_space_operations btree_aops = {
+	.readpage	= btree_readpage,
+	.writepage	= btree_writepage,
+	.sync_page	= block_sync_page,
+};
+
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct buffer_head *buf = sb_bread(root->fs_info->sb, blocknr);
+	struct buffer_head *bh = NULL;
 
-	if (!buf)
-		return buf;
-	if (check_tree_block(root, buf))
+	bh = btrfs_find_create_tree_block(root, blocknr);
+	if (!bh)
+		return bh;
+	lock_buffer(bh);
+	if (!buffer_uptodate(bh)) {
+		get_bh(bh);
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			goto fail;
+	} else {
+		unlock_buffer(bh);
+	}
+	if (check_tree_block(root, bh))
 		BUG();
-	return buf;
+	return bh;
+fail:
+	brelse(bh);
+	return NULL;
+
 }
 
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -101,11 +233,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						GFP_NOFS);
 	int ret;
 
-	/* FIXME: don't be stupid */
 	if (!btrfs_super_root(disk_super))
 		return NULL;
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
+	sb_set_blocksize(sb, sb_buffer->b_size);
 	fs_info->running_transaction = NULL;
 	fs_info->fs_root = root;
 	fs_info->tree_root = tree_root;
@@ -114,14 +246,30 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->last_inode_alloc = 0;
 	fs_info->last_inode_alloc_dirid = 0;
 	fs_info->disk_super = disk_super;
-	fs_info->sb_buffer = sb_buffer;
 	fs_info->sb = sb;
+	fs_info->btree_inode = new_inode(sb);
+	fs_info->btree_inode->i_ino = 1;
+	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
+	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
 	__setup_root(disk_super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+	fs_info->sb_buffer = read_tree_block(tree_root, sb_buffer->b_blocknr);
+
+	if (!fs_info->sb_buffer)
+		return NULL;
+
+	brelse(sb_buffer);
+	sb_buffer = NULL;
+	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
+	fs_info->disk_super = disk_super;
+
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
@@ -137,7 +285,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = find_and_setup_root(disk_super, tree_root, fs_info,
 				  BTRFS_FS_TREE_OBJECTID, root);
 	BUG_ON(ret);
-
 	root->commit_root = root->node;
 	get_bh(root->node);
 	root->ref_cows = 1;
@@ -191,6 +338,7 @@ int close_ctree(struct btrfs_root *root)
 				    root->fs_info->tree_root->node);
 	btrfs_block_release(root, root->commit_root);
 	btrfs_block_release(root, root->fs_info->sb_buffer);
+	iput(root->fs_info->btree_inode);
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
 	kfree(root->fs_info->tree_root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 099f7eea0ec..c2c38bda704 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,7 +21,8 @@ static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh)
 }
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr);
-struct buffer_head *find_tree_block(struct btrfs_root *root, u64 blocknr);
+struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
+						 u64 blocknr);
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf);
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -37,5 +38,5 @@ int close_ctree(struct btrfs_root *root);
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
-int mkfs(int fd, u64 num_blocks, u32 blocksize);
+struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 176c6dc534b..b2ae8e768b6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -172,8 +172,8 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 	struct buffer_head *bh;
 
 	if (!pending) {
-		bh = sb_find_get_block(root->fs_info->sb, blocknr);
-		if (bh) {
+		bh = btrfs_find_tree_block(root, blocknr);
+		if (bh && buffer_uptodate(bh)) {
 			header = btrfs_buffer_header(bh);
 			if (btrfs_header_generation(header) ==
 			    root->fs_info->running_transaction->transid) {
@@ -291,12 +291,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct buffer_head *t;
 	int pending_ret;
 	int ret;
 
 	if (root == extent_root) {
-		t = find_tree_block(root, blocknr);
 		pin_down_block(root, blocknr, 1);
 		return 0;
 	}
@@ -482,7 +480,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		BUG();
 		return NULL;
 	}
-	buf = find_tree_block(root, ins.objectid);
+	buf = btrfs_find_create_tree_block(root, ins.objectid);
 	set_buffer_uptodate(buf);
 	return buf;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67659b6ce96..8f07f462236 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -474,6 +474,8 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	root = open_ctree(sb, bh, disk_super);
 	sb->s_fs_info = root;
+	disk_super = root->fs_info->disk_super;
+
 	if (!root) {
 		printk("btrfs: open_ctree failed\n");
 		return -EIO;
@@ -734,15 +736,15 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	int ret;
+	root = btrfs_sb(sb);
 
 	sb->s_dirt = 0;
 	if (!wait) {
-		filemap_flush(sb->s_bdev->bd_inode->i_mapping);
+		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
-	filemap_write_and_wait(sb->s_bdev->bd_inode->i_mapping);
+	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 
-	root = btrfs_sb(sb);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 46a596e345f..b20fb53a0d2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -74,7 +74,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 87cbda5c1f54aba67b1e318a4a4824c9e9f16324 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 28 Mar 2007 19:44:27 -0400
Subject: Btrfs: sha256 csums on metadata

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  8 +++++--
 fs/btrfs/disk-io.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/super.c   |  4 ++--
 3 files changed, 67 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b2ebed756c1..6ff87f44c5d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -51,11 +51,11 @@ struct btrfs_key {
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
+	__le32 csum[8];
 	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
 	__le64 parentid; /* objectid of the tree root */
-	__le32 csum;
 	__le32 ham;
 	__le16 nritems;
 	__le16 flags;
@@ -75,9 +75,10 @@ struct buffer_head;
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
+	__le32 csum[8];
+	/* the first 3 fields must match struct btrfs_header */
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 blocknr; /* this block number */
-	__le32 csum;
 	__le64 magic;
 	__le32 blocksize;
 	__le64 generation;
@@ -217,6 +218,7 @@ struct btrfs_inode_map_item {
 	struct btrfs_disk_key key;
 } __attribute__ ((__packed__));
 
+struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *fs_root;
 	struct btrfs_root *extent_root;
@@ -236,6 +238,8 @@ struct btrfs_fs_info {
 	struct inode *btree_inode;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
+	struct crypto_hash *hash_tfm;
+	spinlock_t hash_lock;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 970103f2cac..2afb7922b06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,6 +1,8 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -126,8 +128,51 @@ static int btree_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
+static int csum_tree_block(struct btrfs_root * root, struct buffer_head *bh,
+			    int verify)
+{
+	struct btrfs_node *node = btrfs_buffer_node(bh);
+	struct scatterlist sg;
+	struct crypto_hash *tfm = root->fs_info->hash_tfm;
+	struct hash_desc desc;
+	int ret;
+	char result[32];
+
+	desc.tfm = tfm;
+	desc.flags = 0;
+	sg_init_one(&sg, bh->b_data + 32, bh->b_size - 32);
+	spin_lock(&root->fs_info->hash_lock);
+	ret = crypto_hash_digest(&desc, &sg, bh->b_size - 32, result);
+	spin_unlock(&root->fs_info->hash_lock);
+	if (ret) {
+		printk("sha256 digest failed\n");
+	}
+	if (verify) {
+		if (memcmp(node->header.csum, result, sizeof(result)))
+			printk("csum verify failed on %Lu\n", bh->b_blocknr);
+		return -EINVAL;
+	} else
+		memcpy(node->header.csum, result, sizeof(node->header.csum));
+	return 0;
+}
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
+	struct buffer_head *bh;
+	struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+	struct buffer_head *head;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_dirty(bh))
+			csum_tree_block(root, bh, 0);
+		bh = bh->b_this_page;
+	} while (bh != head);
 	return block_write_full_page(page, btree_get_block, wbc);
 }
 
@@ -157,6 +202,7 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
 			goto fail;
+		csum_tree_block(root, bh, 1);
 	} else {
 		unlock_buffer(bh);
 	}
@@ -233,8 +279,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						GFP_NOFS);
 	int ret;
 
-	if (!btrfs_super_root(disk_super))
+	if (!btrfs_super_root(disk_super)) {
 		return NULL;
+	}
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
 	sb_set_blocksize(sb, sb_buffer->b_size);
@@ -252,6 +299,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
+	if (!fs_info->hash_tfm) {
+		printk("failed to allocate sha256 hash\n");
+		return NULL;
+	}
+	spin_lock_init(&fs_info->hash_lock);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
@@ -262,9 +315,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->sb_buffer = read_tree_block(tree_root, sb_buffer->b_blocknr);
 
-	if (!fs_info->sb_buffer)
+	if (!fs_info->sb_buffer) {
+printk("failed2\n");
 		return NULL;
-
+	}
 	brelse(sb_buffer);
 	sb_buffer = NULL;
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
@@ -300,6 +354,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 			     root->fs_info->tree_root->node->b_blocknr);
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
+	csum_tree_block(root, bh, 0);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
 	submit_bh(WRITE, bh);
@@ -338,6 +393,7 @@ int close_ctree(struct btrfs_root *root)
 				    root->fs_info->tree_root->node);
 	btrfs_block_release(root, root->commit_root);
 	btrfs_block_release(root, root->fs_info->sb_buffer);
+	crypto_free_hash(root->fs_info->hash_tfm);
 	iput(root->fs_info->btree_inode);
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8f07f462236..151c1002d74 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -473,13 +473,13 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	root = open_ctree(sb, bh, disk_super);
-	sb->s_fs_info = root;
-	disk_super = root->fs_info->disk_super;
 
 	if (!root) {
 		printk("btrfs: open_ctree failed\n");
 		return -EIO;
 	}
+	sb->s_fs_info = root;
+	disk_super = root->fs_info->disk_super;
 	printk("read in super total blocks %Lu root %Lu\n",
 	       btrfs_super_total_blocks(disk_super),
 	       btrfs_super_root_dir(disk_super));
-- 
cgit v1.2.3


From 30ae8467483d7ab023b5e728bf7d74a575c78023 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 29 Mar 2007 09:59:15 -0400
Subject: Btrfs: btree address space fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2afb7922b06..f2c1dd40c08 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -69,7 +69,6 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	if (!page)
 		return NULL;
 
-	wait_on_page_writeback(page);
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, root->fs_info->sb->s_blocksize, 0);
 	head = page_buffers(page);
@@ -300,11 +299,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
-	if (!fs_info->hash_tfm) {
+	spin_lock_init(&fs_info->hash_lock);
+
+	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
 		printk("failed to allocate sha256 hash\n");
 		return NULL;
 	}
-	spin_lock_init(&fs_info->hash_lock);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
@@ -394,6 +394,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_block_release(root, root->commit_root);
 	btrfs_block_release(root, root->fs_info->sb_buffer);
 	crypto_free_hash(root->fs_info->hash_tfm);
+	truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
 	iput(root->fs_info->btree_inode);
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
-- 
cgit v1.2.3


From 75dfe3960e602e63ea42ac7a2a0520832b189ffa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 29 Mar 2007 11:56:46 -0400
Subject: btrfs_file_write -- first pass

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 213 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 151c1002d74..7914b31f5bc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -9,6 +9,8 @@
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -755,7 +757,7 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
+static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
 	int ret;
@@ -772,7 +774,6 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	struct btrfs_disk_key *found_key;
 
 	btrfs_init_path(&path);
-	mutex_lock(&root->fs_info->fs_mutex);
 	if (create)
 		trans = btrfs_start_transaction(root, 1);
 
@@ -848,6 +849,16 @@ allocate:
 out:
 	if (trans)
 		btrfs_end_transaction(trans, root);
+	return err;
+}
+
+static int btrfs_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	int err;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_get_block_lock(inode, iblock, result, create);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
@@ -855,8 +866,15 @@ out:
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
+	WARN_ON(1);
 	return nobh_prepare_write(page, from, to, btrfs_get_block);
 }
+static int btrfs_commit_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	WARN_ON(1);
+	return nobh_commit_write(file, page, from, to);
+}
 
 static void btrfs_write_super(struct super_block *sb)
 {
@@ -903,6 +921,196 @@ static void btrfs_truncate(struct inode *inode)
 	mark_inode_dirty(inode);
 }
 
+static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
+				struct page **prepared_pages,
+				const char __user * buf)
+{
+	long page_fault = 0;
+	int i;
+	int offset = pos & (PAGE_CACHE_SIZE - 1);
+
+	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+		size_t count = min_t(size_t,
+				     PAGE_CACHE_SIZE - offset, write_bytes);
+		struct page *page = prepared_pages[i];
+		fault_in_pages_readable(buf, count);
+
+		/* Copy data from userspace to the current page */
+		kmap(page);
+		page_fault = __copy_from_user(page_address(page) + offset,
+					      buf, count);
+		/* Flush processor's dcache for this page */
+		flush_dcache_page(page);
+		kunmap(page);
+		buf += count;
+		write_bytes -= count;
+
+		if (page_fault)
+			break;
+	}
+	return page_fault ? -EFAULT : 0;
+}
+
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+	size_t i;
+	for (i = 0; i < num_pages; i++) {
+		if (!pages[i])
+			break;
+		unlock_page(pages[i]);
+		mark_page_accessed(pages[i]);
+		page_cache_release(pages[i]);
+	}
+}
+static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct file *file,
+				   struct page **pages,
+				   size_t num_pages,
+				   loff_t pos,
+				   size_t write_bytes)
+{
+	int i;
+	int offset;
+	int err = 0;
+	int ret;
+	int this_write;
+
+	for (i = 0; i < num_pages; i++) {
+		offset = pos & (PAGE_CACHE_SIZE -1);
+		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		ret = nobh_commit_write(file, pages[i], offset,
+					 offset + this_write);
+		pos += this_write;
+		if (ret) {
+			err = ret;
+			goto failed;
+		}
+		WARN_ON(this_write > write_bytes);
+		write_bytes -= this_write;
+	}
+failed:
+	return err;
+}
+
+static int prepare_pages(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct file *file,
+			 struct page **pages,
+			 size_t num_pages,
+			 loff_t pos,
+			 size_t write_bytes)
+{
+	int i;
+	unsigned long index = pos >> PAGE_CACHE_SHIFT;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int offset;
+	int err = 0;
+	int ret;
+	int this_write;
+	loff_t isize = i_size_read(inode);
+
+	memset(pages, 0, num_pages * sizeof(struct page *));
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = grab_cache_page(inode->i_mapping, index + i);
+		if (!pages[i]) {
+			err = -ENOMEM;
+			goto failed_release;
+		}
+		offset = pos & (PAGE_CACHE_SIZE -1);
+		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		ret = nobh_prepare_write(pages[i], offset,
+					 offset + this_write,
+					 btrfs_get_block_lock);
+		pos += this_write;
+		if (ret) {
+			err = ret;
+			goto failed_truncate;
+		}
+		WARN_ON(this_write > write_bytes);
+		write_bytes -= this_write;
+	}
+	return 0;
+
+failed_release:
+	btrfs_drop_pages(pages, num_pages);
+	return err;
+
+failed_truncate:
+	btrfs_drop_pages(pages, num_pages);
+	if (pos > isize)
+		vmtruncate(inode, isize);
+	return err;
+}
+
+static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	loff_t pos;
+	size_t num_written = 0;
+	int err = 0;
+	int ret = 0;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct page *pages[1];
+
+	if (file->f_flags & O_DIRECT)
+		return -EINVAL;
+	pos = *ppos;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	current->backing_dev_info = inode->i_mapping->backing_dev_info;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+	if (count == 0)
+		goto out;
+	err = remove_suid(file->f_path.dentry);
+	if (err)
+		goto out;
+	file_update_time(file);
+	mutex_lock(&inode->i_mutex);
+	while(count > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
+		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+
+		ret = prepare_pages(trans, root, file, pages, num_pages,
+				    pos, write_bytes);
+		BUG_ON(ret);
+		ret = btrfs_copy_from_user(pos, num_pages,
+					   write_bytes, pages, buf);
+		BUG_ON(ret);
+
+		mutex_unlock(&root->fs_info->fs_mutex);
+
+		ret = dirty_and_release_pages(trans, root, file, pages,
+					      num_pages, pos, write_bytes);
+		BUG_ON(ret);
+		btrfs_drop_pages(pages, num_pages);
+
+		ret = btrfs_end_transaction(trans, root);
+
+		buf += write_bytes;
+		count -= write_bytes;
+		pos += write_bytes;
+		num_written += write_bytes;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		cond_resched();
+	}
+	mutex_unlock(&inode->i_mutex);
+out:
+	*ppos = pos;
+	current->backing_dev_info = NULL;
+	return num_written ? num_written : err;
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -948,7 +1156,7 @@ static struct address_space_operations btrfs_aops = {
 	.writepage	= btrfs_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
-	.commit_write	= nobh_commit_write,
+	.commit_write	= btrfs_commit_write,
 };
 
 static struct inode_operations btrfs_file_inode_operations = {
@@ -958,14 +1166,10 @@ static struct inode_operations btrfs_file_inode_operations = {
 static struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.aio_read	= generic_file_aio_read,
-	.aio_write	= generic_file_aio_write,
+	.aio_read       = generic_file_aio_read,
+	.write		= btrfs_file_write,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
-	.sendfile	= generic_file_sendfile,
-	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
 };
 
 static int __init init_btrfs_fs(void)
-- 
cgit v1.2.3


From f254e52c1ce550fdaa0d31f5e068f0d67c2485d4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 29 Mar 2007 15:15:27 -0400
Subject: Btrfs: verify csums on read

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     |   9 ++-
 fs/btrfs/ctree.h     |  47 ++++++++-----
 fs/btrfs/disk-io.c   |  37 ++++++++---
 fs/btrfs/disk-io.h   |   2 +
 fs/btrfs/file-item.c |  60 +++++++++++++++++
 fs/btrfs/super.c     | 184 +++++++++++++++++++++++++++++++++++++++++++--------
 6 files changed, 279 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 48c611948d1..1eb333e2e18 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -109,14 +109,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 		return 1;
 	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1.flags > k2->flags)
-		return 1;
-	if (k1.flags < k2->flags)
-		return -1;
 	if (k1.offset > k2->offset)
 		return 1;
 	if (k1.offset < k2->offset)
 		return -1;
+	if (k1.flags > k2->flags)
+		return 1;
+	if (k1.flags < k2->flags)
+		return -1;
 	return 0;
 }
 
@@ -1165,7 +1165,6 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		BUG();
 	ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1);
 	if (ret == 0) {
-		btrfs_release_path(root, path);
 		return -EEXIST;
 	}
 	if (ret < 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6ff87f44c5d..df1a025a771 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -21,6 +21,9 @@ struct btrfs_transaction;
  */
 #define BTRFS_NAME_LEN 255
 
+/* 32 bytes in various csum fields */
+#define BTRFS_CSUM_SIZE 32
+
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
  * block layout.  objectid corresonds to the inode number.  The flags
@@ -37,21 +40,21 @@ struct btrfs_transaction;
  */
 struct btrfs_disk_key {
 	__le64 objectid;
-	__le32 flags;
 	__le64 offset;
+	__le32 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
 	u64 objectid;
-	u32 flags;
 	u64 offset;
+	u32 flags;
 } __attribute__ ((__packed__));
 
 /*
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
-	__le32 csum[8];
+	u8 csum[BTRFS_CSUM_SIZE];
 	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
@@ -75,7 +78,7 @@ struct buffer_head;
  * it currently lacks any block count etc etc
  */
 struct btrfs_super_block {
-	__le32 csum[8];
+	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 3 fields must match struct btrfs_header */
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 blocknr; /* this block number */
@@ -147,7 +150,7 @@ struct btrfs_extent_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
-	__le32 sec;
+	__le64 sec;
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
@@ -214,6 +217,10 @@ struct btrfs_file_extent_item {
 	__le64 num_blocks;
 } __attribute__ ((__packed__));
 
+struct btrfs_csum_item {
+	u8 csum[BTRFS_CSUM_SIZE];
+} __attribute__ ((__packed__));
+
 struct btrfs_inode_map_item {
 	struct btrfs_disk_key key;
 } __attribute__ ((__packed__));
@@ -283,27 +290,32 @@ struct btrfs_root {
  * a (hopefully) huge chunk of disk
  */
 #define BTRFS_EXTENT_DATA_KEY	4
+/*
+ * csum items have the checksums for data in the extents
+ */
+#define BTRFS_CSUM_ITEM_KEY	5
+
 /*
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY	5
+#define BTRFS_ROOT_ITEM_KEY	6
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY	6
+#define BTRFS_EXTENT_ITEM_KEY	7
 
 /*
  * the inode map records which inode numbers are in use and where
  * they actually live on disk
  */
-#define BTRFS_INODE_MAP_ITEM_KEY 7
+#define BTRFS_INODE_MAP_ITEM_KEY 8
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	8
+#define BTRFS_STRING_ITEM_KEY	9
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -407,15 +419,15 @@ static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i,
 	i->compat_flags = cpu_to_le16(val);
 }
 
-static inline u32 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
+static inline u64 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
 {
-	return le32_to_cpu(ts->sec);
+	return le64_to_cpu(ts->sec);
 }
 
 static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts,
-					  u32 val)
+					  u64 val)
 {
-	ts->sec = cpu_to_le32(val);
+	ts->sec = cpu_to_le64(val);
 }
 
 static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts)
@@ -429,8 +441,6 @@ static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts,
 	ts->nsec = cpu_to_le32(val);
 }
 
-
-
 static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
 {
 	return le64_to_cpu(ei->owner);
@@ -905,4 +915,11 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 blocknr, int mod);
+int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset,
+			  char *data, size_t len);
+int btrfs_csum_verify_file_block(struct btrfs_root *root,
+				 u64 objectid, u64 offset,
+				 char *data, size_t len);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f2c1dd40c08..f5db2b93650 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -127,31 +127,46 @@ static int btree_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
-static int csum_tree_block(struct btrfs_root * root, struct buffer_head *bh,
-			    int verify)
+int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
+		    char *result)
 {
-	struct btrfs_node *node = btrfs_buffer_node(bh);
 	struct scatterlist sg;
 	struct crypto_hash *tfm = root->fs_info->hash_tfm;
 	struct hash_desc desc;
 	int ret;
-	char result[32];
 
 	desc.tfm = tfm;
 	desc.flags = 0;
-	sg_init_one(&sg, bh->b_data + 32, bh->b_size - 32);
+	sg_init_one(&sg, data, len);
 	spin_lock(&root->fs_info->hash_lock);
-	ret = crypto_hash_digest(&desc, &sg, bh->b_size - 32, result);
+	ret = crypto_hash_digest(&desc, &sg, len, result);
 	spin_unlock(&root->fs_info->hash_lock);
 	if (ret) {
 		printk("sha256 digest failed\n");
 	}
+	return ret;
+}
+static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
+			   int verify)
+{
+	char result[BTRFS_CSUM_SIZE];
+	int ret;
+	struct btrfs_node *node;
+
+	ret = btrfs_csum_data(root, bh->b_data + BTRFS_CSUM_SIZE,
+			      bh->b_size - BTRFS_CSUM_SIZE, result);
+	if (ret)
+		return ret;
 	if (verify) {
-		if (memcmp(node->header.csum, result, sizeof(result)))
-			printk("csum verify failed on %Lu\n", bh->b_blocknr);
-		return -EINVAL;
-	} else
-		memcpy(node->header.csum, result, sizeof(node->header.csum));
+		if (memcmp(bh->b_data, result, BTRFS_CSUM_SIZE)) {
+			printk("checksum verify failed on %lu\n",
+			       bh->b_blocknr);
+			return 1;
+		}
+	} else {
+		node = btrfs_buffer_node(bh);
+		memcpy(&node->header.csum, result, BTRFS_CSUM_SIZE);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c2c38bda704..f6998e2192c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,4 +39,6 @@ void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
 struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
+int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
+		    char *result);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 5230a44cb19..2d2c23ca7cb 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -57,3 +57,63 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
 	return ret;
 }
+
+int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset,
+			  char *data, size_t len)
+{
+	int ret;
+	struct btrfs_key file_key;
+	struct btrfs_path path;
+	struct btrfs_csum_item *item;
+
+	btrfs_init_path(&path);
+	file_key.objectid = objectid;
+	file_key.offset = offset;
+	file_key.flags = 0;
+	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
+				      BTRFS_CSUM_SIZE);
+	if (ret != 0 && ret != -EEXIST)
+		goto fail;
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			      struct btrfs_csum_item);
+	ret = 0;
+	ret = btrfs_csum_data(root, data, len, item->csum);
+	mark_buffer_dirty(path.nodes[0]);
+fail:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
+int btrfs_csum_verify_file_block(struct btrfs_root *root,
+				 u64 objectid, u64 offset,
+				 char *data, size_t len)
+{
+	int ret;
+	struct btrfs_key file_key;
+	struct btrfs_path path;
+	struct btrfs_csum_item *item;
+	char result[BTRFS_CSUM_SIZE];
+
+	btrfs_init_path(&path);
+	file_key.objectid = objectid;
+	file_key.offset = offset;
+	file_key.flags = 0;
+	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root, &file_key, &path, 0, 0);
+	if (ret)
+		goto fail;
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+			      struct btrfs_csum_item);
+	ret = 0;
+	ret = btrfs_csum_data(root, data, len, result);
+	WARN_ON(ret);
+	if (memcmp(result, item->csum, BTRFS_CSUM_SIZE))
+		ret = 1;
+fail:
+	btrfs_release_path(root, &path);
+	return ret;
+}
+
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7914b31f5bc..04428137d75 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -249,15 +249,16 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
-	struct btrfs_file_extent_item *fi;
-	u64 extent_start;
-	u64 extent_num_blocks;
+	struct btrfs_file_extent_item *fi = NULL;
+	u64 extent_start = 0;
+	u64 extent_num_blocks = 0;
+	int found_extent;
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
 	while(1) {
 		btrfs_init_path(&path);
 		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
@@ -273,25 +274,32 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		found_key = &leaf->items[path.slots[0]].key;
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
-		if (btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
+		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
+		    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
 			break;
-		fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				    path.slots[0],
-				    struct btrfs_file_extent_item);
-		extent_start = btrfs_file_extent_disk_blocknr(fi);
-		extent_num_blocks = btrfs_file_extent_disk_num_blocks(fi);
-		key.offset = btrfs_disk_key_offset(found_key) - 1;
+		if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
+			fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
+					    path.slots[0],
+					    struct btrfs_file_extent_item);
+			extent_start = btrfs_file_extent_disk_blocknr(fi);
+			extent_num_blocks =
+				btrfs_file_extent_disk_num_blocks(fi);
+			inode->i_blocks -=
+				btrfs_file_extent_num_blocks(fi) >> 9;
+			found_extent = 1;
+		} else {
+			found_extent = 0;
+		}
 		ret = btrfs_del_item(trans, root, &path);
 		BUG_ON(ret);
-		inode->i_blocks -= btrfs_file_extent_num_blocks(fi) >> 9;
 		btrfs_release_path(root, &path);
-		ret = btrfs_free_extent(trans, root, extent_start,
-					extent_num_blocks, 0);
-		BUG_ON(ret);
-		if (key.offset + 1 == 0)
-			break;
+		if (found_extent) {
+			ret = btrfs_free_extent(trans, root, extent_start,
+						extent_num_blocks, 0);
+			BUG_ON(ret);
+		}
 	}
 	btrfs_release_path(root, &path);
 	ret = 0;
@@ -975,10 +983,24 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int ret;
 	int this_write;
+	struct inode *inode = file->f_path.dentry->d_inode;
 
 	for (i = 0; i < num_pages; i++) {
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		/* FIXME, one block at a time */
+
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_csum_file_block(trans, root, inode->i_ino,
+				      pages[i]->index << PAGE_CACHE_SHIFT,
+				      kmap(pages[i]), PAGE_CACHE_SIZE);
+		kunmap(pages[i]);
+		SetPageChecked(pages[i]);
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->fs_mutex);
+
 		ret = nobh_commit_write(file, pages[i], offset,
 					 offset + this_write);
 		pos += this_write;
@@ -1022,7 +1044,7 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
 		ret = nobh_prepare_write(pages[i], offset,
 					 offset + this_write,
-					 btrfs_get_block_lock);
+					 btrfs_get_block);
 		pos += this_write;
 		if (ret) {
 			err = ret;
@@ -1051,7 +1073,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	size_t num_written = 0;
 	int err = 0;
 	int ret = 0;
-	struct btrfs_trans_handle *trans;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	struct page *pages[1];
@@ -1077,25 +1098,18 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
-		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-
-		ret = prepare_pages(trans, root, file, pages, num_pages,
+		ret = prepare_pages(NULL, root, file, pages, num_pages,
 				    pos, write_bytes);
 		BUG_ON(ret);
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
 
-		mutex_unlock(&root->fs_info->fs_mutex);
-
-		ret = dirty_and_release_pages(trans, root, file, pages,
+		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
 		BUG_ON(ret);
 		btrfs_drop_pages(pages, num_pages);
 
-		ret = btrfs_end_transaction(trans, root);
-
 		buf += write_bytes;
 		count -= write_bytes;
 		pos += write_bytes;
@@ -1111,6 +1125,118 @@ out:
 	return num_written ? num_written : err;
 }
 
+static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
+			unsigned long offset, unsigned long size)
+{
+	char *kaddr;
+	unsigned long left, count = desc->count;
+
+	if (size > count)
+		size = count;
+
+	if (!PageChecked(page)) {
+		/* FIXME, verify per block rather than per whole page */
+		struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+		int ret = btrfs_csum_verify_file_block(root,
+					  page->mapping->host->i_ino,
+					  page->index << PAGE_CACHE_SHIFT,
+					  kmap(page), PAGE_CACHE_SIZE);
+		if (ret) {
+			printk("failed to verify ino %lu page %lu\n",
+			       page->mapping->host->i_ino,
+			       page->index);
+			memset(page_address(page), 0, PAGE_CACHE_SIZE);
+		}
+		SetPageChecked(page);
+		kunmap(page);
+	}
+	/*
+	 * Faults on the destination of a read are common, so fault the user
+	 * buffer in before taking the atomic kmap.
+	 */
+	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		left = __copy_to_user_inatomic(desc->arg.buf,
+						kaddr + offset, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (left == 0)
+			goto success;
+	}
+
+	/* Do it the slow way */
+	kaddr = kmap(page);
+	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+	kunmap(page);
+
+	if (left) {
+		size -= left;
+		desc->error = -EFAULT;
+	}
+success:
+	desc->count = count - size;
+	desc->written += size;
+	desc->arg.buf += size;
+	return size;
+}
+
+/**
+ * btrfs_file_aio_read - filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @pos:	current file position
+ */
+static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				   unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	ssize_t retval;
+	unsigned long seg;
+	size_t count;
+	loff_t *ppos = &iocb->ki_pos;
+
+	count = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		count += iv->iov_len;
+		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		nr_segs = seg;
+		count -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+	retval = 0;
+	if (count) {
+		for (seg = 0; seg < nr_segs; seg++) {
+			read_descriptor_t desc;
+
+			desc.written = 0;
+			desc.arg.buf = iov[seg].iov_base;
+			desc.count = iov[seg].iov_len;
+			if (desc.count == 0)
+				continue;
+			desc.error = 0;
+			do_generic_file_read(filp, ppos, &desc,
+					     btrfs_read_actor);
+			retval += desc.written;
+			if (desc.error) {
+				retval = retval ?: desc.error;
+				break;
+			}
+		}
+	}
+	return retval;
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -1166,7 +1292,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 static struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.aio_read       = generic_file_aio_read,
+	.aio_read       = btrfs_file_aio_read,
 	.write		= btrfs_file_write,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
-- 
cgit v1.2.3


From 22b0ebda6c63a1ad66b6a9e806bd226a4a03a049 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 30 Mar 2007 08:47:31 -0400
Subject: Btrfs: hunting slab corruption

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  2 ++
 fs/btrfs/dir-item.c    |  2 ++
 fs/btrfs/disk-io.c     | 16 +++++++++++++---
 fs/btrfs/super.c       |  7 ++++++-
 fs/btrfs/transaction.c |  6 ++++--
 5 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1eb333e2e18..b7f3fcb72c6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -457,6 +457,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	int level;
 
+	WARN_ON(p->nodes[0] != NULL);
+	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
 	b = root->node;
 	get_bh(b);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c18f81797a8..18d3e168ad0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -35,6 +35,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
 	memcpy(name_ptr, name, name_len);
+	if (name_ptr + name_len > path.nodes[0]->b_data + 4096)
+		WARN_ON(1);
 	mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(root, &path);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f5db2b93650..6c010463b9d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3,6 +3,7 @@
 #include <linux/blkdev.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/swap.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -50,6 +51,8 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 	} while (bh != head);
 out_unlock:
 	unlock_page(page);
+	if (ret)
+		touch_buffer(ret);
 	page_cache_release(page);
 	return ret;
 }
@@ -65,6 +68,7 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	struct buffer_head *head;
 	struct buffer_head *ret = NULL;
 	u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits);
+
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return NULL;
@@ -89,6 +93,8 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	} while (bh != head);
 out_unlock:
 	unlock_page(page);
+	if (ret)
+		touch_buffer(ret);
 	page_cache_release(page);
 	return ret;
 }
@@ -139,7 +145,7 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 	desc.flags = 0;
 	sg_init_one(&sg, data, len);
 	spin_lock(&root->fs_info->hash_lock);
-	ret = crypto_hash_digest(&desc, &sg, len, result);
+	ret = crypto_hash_digest(&desc, &sg, 1, result);
 	spin_unlock(&root->fs_info->hash_lock);
 	if (ret) {
 		printk("sha256 digest failed\n");
@@ -153,6 +159,7 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 	int ret;
 	struct btrfs_node *node;
 
+	return 0;
 	ret = btrfs_csum_data(root, bh->b_data + BTRFS_CSUM_SIZE,
 			      bh->b_size - BTRFS_CSUM_SIZE, result);
 	if (ret)
@@ -165,17 +172,17 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 		}
 	} else {
 		node = btrfs_buffer_node(bh);
-		memcpy(&node->header.csum, result, BTRFS_CSUM_SIZE);
+		memcpy(node->header.csum, result, BTRFS_CSUM_SIZE);
 	}
 	return 0;
 }
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
+#if 0
 	struct buffer_head *bh;
 	struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
 	struct buffer_head *head;
-
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -187,6 +194,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 			csum_tree_block(root, bh, 0);
 		bh = bh->b_this_page;
 	} while (bh != head);
+#endif
 	return block_write_full_page(page, btree_get_block, wbc);
 }
 
@@ -312,6 +320,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	insert_inode_hash(fs_info->btree_inode);
+
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
 	spin_lock_init(&fs_info->hash_lock);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 04428137d75..ab5419eea93 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -361,13 +361,16 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
 	struct inode * inode;
+	struct btrfs_root *root = btrfs_sb(dir->i_sb);
 	ino_t ino;
 	int ret;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_inode_by_name(dir, dentry, &ino);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret < 0)
 		return ERR_PTR(ret);
 	inode = NULL;
@@ -395,6 +398,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	unsigned char d_type = DT_UNKNOWN;
 	int over = 0;
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -446,6 +450,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	ret = 0;
 err:
 	btrfs_release_path(root, &path);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -667,8 +672,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 	}
 	dir->i_sb->s_dirt = 1;
-	btrfs_end_transaction(trans, root);
 out_unlock:
+	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b20fb53a0d2..a146463c504 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -197,8 +197,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
 				     &snap_key);
-		BUG_ON(ret); root->fs_info->generation = root->root_key.offset + 1; ret = btrfs_end_transaction(trans, root); BUG_ON(ret);
-		printk("at free, total trans %d\n", total_trans);
+		BUG_ON(ret);
+		root->fs_info->generation = root->root_key.offset + 1;
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
 	}
 
 	return ret;
-- 
cgit v1.2.3


From d6025579531b7ea170ba283b171ff7a6bf7d0e12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 30 Mar 2007 14:27:56 -0400
Subject: Btrfs: corruption hunt continues

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |   9 ++-
 fs/btrfs/ctree.c       | 201 ++++++++++++++++++++++++++++---------------------
 fs/btrfs/ctree.h       |  32 ++++++++
 fs/btrfs/dir-item.c    |   6 +-
 fs/btrfs/disk-io.c     |  20 ++++-
 fs/btrfs/extent-tree.c |   8 +-
 fs/btrfs/file-item.c   |   4 +-
 fs/btrfs/inode-map.c   |   2 +-
 fs/btrfs/root-tree.c   |   7 +-
 fs/btrfs/super.c       |   2 +-
 fs/btrfs/transaction.c |   1 +
 11 files changed, 187 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 43807cc3cce..845422368f4 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -16,12 +16,14 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 
 	bits = radix_tree_lookup(radix, slot);
 	if (!bits) {
-		bits = kmalloc(BIT_ARRAY_BYTES, GFP_NOIO);
+		bits = kmalloc(BIT_ARRAY_BYTES, GFP_NOFS);
 		if (!bits)
 			return -ENOMEM;
 		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
 		bits[0] = slot;
+		radix_tree_preload(GFP_NOFS);
 		ret = radix_tree_insert(radix, slot, bits);
+		radix_tree_preload_end();
 		if (ret)
 			return ret;
 	}
@@ -59,7 +61,7 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 	if (!bits)
 		return 0;
 	clear_bit(bit_slot, bits + 1);
-
+#if 0
 	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
 		if (bits[i]) {
 			empty = 0;
@@ -69,8 +71,11 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 
 	if (empty) {
 		bits = radix_tree_delete(radix, slot);
+		synchronize_rcu();
 		BUG_ON(!bits);
+		kfree(bits);
 	}
+#endif
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b7f3fcb72c6..a0dfa2d6cb9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -51,7 +51,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
 	*cow_ret = cow;
-	mark_buffer_dirty(cow);
+	btrfs_mark_buffer_dirty(cow);
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
@@ -62,7 +62,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else {
 		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
 					cow->b_blocknr);
-		mark_buffer_dirty(parent);
+		btrfs_mark_buffer_dirty(parent);
 		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
@@ -312,11 +312,12 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		BUG_ON(!child);
 		root->node = child;
 		path->nodes[level] = NULL;
+		clean_tree_block(trans, root, mid_buf);
+		wait_on_buffer(mid_buf);
 		/* once for the path */
 		btrfs_block_release(root, mid_buf);
 		/* once for the root ptr */
 		btrfs_block_release(root, mid_buf);
-		clean_tree_block(trans, root, mid_buf);
 		return btrfs_free_extent(trans, root, blocknr, 1, 1);
 	}
 	parent = btrfs_buffer_node(parent_buf);
@@ -351,8 +352,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
 			u64 blocknr = right_buf->b_blocknr;
-			btrfs_block_release(root, right_buf);
 			clean_tree_block(trans, root, right_buf);
+			wait_on_buffer(right_buf);
+			btrfs_block_release(root, right_buf);
 			right_buf = NULL;
 			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
@@ -363,10 +365,11 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			if (wret)
 				ret = wret;
 		} else {
-			memcpy(&parent->ptrs[pslot + 1].key,
-				&right->ptrs[0].key,
-				sizeof(struct btrfs_disk_key));
-			mark_buffer_dirty(parent_buf);
+			btrfs_memcpy(root, parent,
+				     &parent->ptrs[pslot + 1].key,
+				     &right->ptrs[0].key,
+				     sizeof(struct btrfs_disk_key));
+			btrfs_mark_buffer_dirty(parent_buf);
 		}
 	}
 	if (btrfs_header_nritems(&mid->header) == 1) {
@@ -388,8 +391,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (btrfs_header_nritems(&mid->header) == 0) {
 		/* we've managed to empty the middle node, drop it */
 		u64 blocknr = mid_buf->b_blocknr;
-		btrfs_block_release(root, mid_buf);
 		clean_tree_block(trans, root, mid_buf);
+		wait_on_buffer(mid_buf);
+		btrfs_block_release(root, mid_buf);
 		mid_buf = NULL;
 		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
@@ -400,9 +404,10 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 	} else {
 		/* update the parent key to reflect our changes */
-		memcpy(&parent->ptrs[pslot].key, &mid->ptrs[0].key,
-		       sizeof(struct btrfs_disk_key));
-		mark_buffer_dirty(parent_buf);
+		btrfs_memcpy(root, parent,
+			     &parent->ptrs[pslot].key, &mid->ptrs[0].key,
+			     sizeof(struct btrfs_disk_key));
+		btrfs_mark_buffer_dirty(parent_buf);
 	}
 
 	/* update the path */
@@ -544,8 +549,8 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (!path->nodes[i])
 			break;
 		t = btrfs_buffer_node(path->nodes[i]);
-		memcpy(&t->ptrs[tslot].key, key, sizeof(*key));
-		mark_buffer_dirty(path->nodes[i]);
+		btrfs_memcpy(root, t, &t->ptrs[tslot].key, key, sizeof(*key));
+		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
@@ -580,17 +585,17 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (src_nritems < push_items)
 		push_items = src_nritems;
 
-	memcpy(dst->ptrs + dst_nritems, src->ptrs,
-		push_items * sizeof(struct btrfs_key_ptr));
+	btrfs_memcpy(root, dst, dst->ptrs + dst_nritems, src->ptrs,
+		     push_items * sizeof(struct btrfs_key_ptr));
 	if (push_items < src_nritems) {
-		memmove(src->ptrs, src->ptrs + push_items,
+		btrfs_memmove(root, src, src->ptrs, src->ptrs + push_items,
 			(src_nritems - push_items) *
 			sizeof(struct btrfs_key_ptr));
 	}
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
-	mark_buffer_dirty(src_buf);
-	mark_buffer_dirty(dst_buf);
+	btrfs_mark_buffer_dirty(src_buf);
+	btrfs_mark_buffer_dirty(dst_buf);
 	return ret;
 }
 
@@ -629,16 +634,18 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 	if (max_push < push_items)
 		push_items = max_push;
 
-	memmove(dst->ptrs + push_items, dst->ptrs,
-		dst_nritems * sizeof(struct btrfs_key_ptr));
-	memcpy(dst->ptrs, src->ptrs + src_nritems - push_items,
-		push_items * sizeof(struct btrfs_key_ptr));
+	btrfs_memmove(root, dst, dst->ptrs + push_items, dst->ptrs,
+		      dst_nritems * sizeof(struct btrfs_key_ptr));
+
+	btrfs_memcpy(root, dst, dst->ptrs,
+		     src->ptrs + src_nritems - push_items,
+		     push_items * sizeof(struct btrfs_key_ptr));
 
 	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
 	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
 
-	mark_buffer_dirty(src_buf);
-	mark_buffer_dirty(dst_buf);
+	btrfs_mark_buffer_dirty(src_buf);
+	btrfs_mark_buffer_dirty(dst_buf);
 	return ret;
 }
 
@@ -674,10 +681,11 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
 	else
 		lower_key = &lower->ptrs[0].key;
-	memcpy(&c->ptrs[0].key, lower_key, sizeof(struct btrfs_disk_key));
+	btrfs_memcpy(root, c, &c->ptrs[0].key, lower_key,
+		     sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->b_blocknr);
 
-	mark_buffer_dirty(t);
+	btrfs_mark_buffer_dirty(t);
 
 	/* the super has an extra ref to root->node */
 	btrfs_block_release(root, root->node);
@@ -712,13 +720,15 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
-		memmove(lower->ptrs + slot + 1, lower->ptrs + slot,
-			(nritems - slot) * sizeof(struct btrfs_key_ptr));
+		btrfs_memmove(root, lower, lower->ptrs + slot + 1,
+			      lower->ptrs + slot,
+			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
 	}
-	memcpy(&lower->ptrs[slot].key, key, sizeof(struct btrfs_disk_key));
+	btrfs_memcpy(root, lower, &lower->ptrs[slot].key,
+		     key, sizeof(struct btrfs_disk_key));
 	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
-	mark_buffer_dirty(path->nodes[level]);
+	btrfs_mark_buffer_dirty(path->nodes[level]);
 	return 0;
 }
 
@@ -761,14 +771,14 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_parentid(&split->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	mid = (c_nritems + 1) / 2;
-	memcpy(split->ptrs, c->ptrs + mid,
-		(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+	btrfs_memcpy(root, split, split->ptrs, c->ptrs + mid,
+		     (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
 	btrfs_set_header_nritems(&split->header, c_nritems - mid);
 	btrfs_set_header_nritems(&c->header, mid);
 	ret = 0;
 
-	mark_buffer_dirty(t);
-	mark_buffer_dirty(split_buffer);
+	btrfs_mark_buffer_dirty(t);
+	btrfs_mark_buffer_dirty(split_buffer);
 	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
 			  split_buffer->b_blocknr, path->slots[level + 1] + 1,
 			  level + 1);
@@ -875,17 +885,22 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	push_space = btrfs_item_end(left->items + left_nritems - push_items);
 	push_space -= leaf_data_end(root, left);
 	/* make room in the right data area */
-	memmove(btrfs_leaf_data(right) + leaf_data_end(root, right) -
-		push_space, btrfs_leaf_data(right) + leaf_data_end(root, right),
-		BTRFS_LEAF_DATA_SIZE(root) - leaf_data_end(root, right));
+	btrfs_memmove(root, right, btrfs_leaf_data(right) +
+		      leaf_data_end(root, right) - push_space,
+		      btrfs_leaf_data(right) +
+		      leaf_data_end(root, right), BTRFS_LEAF_DATA_SIZE(root) -
+		      leaf_data_end(root, right));
 	/* copy from the left data area */
-	memcpy(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - push_space,
-		btrfs_leaf_data(left) + leaf_data_end(root, left), push_space);
-	memmove(right->items + push_items, right->items,
+	btrfs_memcpy(root, right, btrfs_leaf_data(right) +
+		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
+		     btrfs_leaf_data(left) + leaf_data_end(root, left),
+		     push_space);
+	btrfs_memmove(root, right, right->items + push_items, right->items,
 		right_nritems * sizeof(struct btrfs_item));
 	/* copy the items from left to right */
-	memcpy(right->items, left->items + left_nritems - push_items,
-		push_items * sizeof(struct btrfs_item));
+	btrfs_memcpy(root, right, right->items, left->items +
+		     left_nritems - push_items,
+		     push_items * sizeof(struct btrfs_item));
 
 	/* update the item pointers */
 	right_nritems += push_items;
@@ -899,11 +914,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(&left->header, left_nritems);
 
-	mark_buffer_dirty(left_buf);
-	mark_buffer_dirty(right_buf);
-	memcpy(&upper_node->ptrs[slot + 1].key,
+	btrfs_mark_buffer_dirty(left_buf);
+	btrfs_mark_buffer_dirty(right_buf);
+	btrfs_memcpy(root, upper_node, &upper_node->ptrs[slot + 1].key,
 		&right->items[0].key, sizeof(struct btrfs_disk_key));
-	mark_buffer_dirty(upper);
+	btrfs_mark_buffer_dirty(upper);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
@@ -977,14 +992,16 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 	/* push data from right to left */
-	memcpy(left->items + btrfs_header_nritems(&left->header),
-		right->items, push_items * sizeof(struct btrfs_item));
+	btrfs_memcpy(root, left, left->items +
+		     btrfs_header_nritems(&left->header),
+		     right->items, push_items * sizeof(struct btrfs_item));
 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
 		     btrfs_item_offset(right->items + push_items -1);
-	memcpy(btrfs_leaf_data(left) + leaf_data_end(root, left) - push_space,
-		btrfs_leaf_data(right) +
-		btrfs_item_offset(right->items + push_items - 1),
-		push_space);
+	btrfs_memcpy(root, left, btrfs_leaf_data(left) +
+		     leaf_data_end(root, left) - push_space,
+		     btrfs_leaf_data(right) +
+		     btrfs_item_offset(right->items + push_items - 1),
+		     push_space);
 	old_left_nritems = btrfs_header_nritems(&left->header);
 	BUG_ON(old_left_nritems < 0);
 
@@ -1000,10 +1017,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	/* fixup right node */
 	push_space = btrfs_item_offset(right->items + push_items - 1) -
 		     leaf_data_end(root, right);
-	memmove(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-		push_space, btrfs_leaf_data(right) +
-		leaf_data_end(root, right), push_space);
-	memmove(right->items, right->items + push_items,
+	btrfs_memmove(root, right, btrfs_leaf_data(right) +
+		      BTRFS_LEAF_DATA_SIZE(root) - push_space,
+		      btrfs_leaf_data(right) +
+		      leaf_data_end(root, right), push_space);
+	btrfs_memmove(root, right, right->items, right->items + push_items,
 		(btrfs_header_nritems(&right->header) - push_items) *
 		sizeof(struct btrfs_item));
 	btrfs_set_header_nritems(&right->header,
@@ -1017,8 +1035,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		push_space = btrfs_item_offset(right->items + i);
 	}
 
-	mark_buffer_dirty(t);
-	mark_buffer_dirty(right_buf);
+	btrfs_mark_buffer_dirty(t);
+	btrfs_mark_buffer_dirty(right_buf);
 
 	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
@@ -1110,11 +1128,12 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	data_copy_size = btrfs_item_end(l->items + mid) -
 			 leaf_data_end(root, l);
-	memcpy(right->items, l->items + mid,
-	       (nritems - mid) * sizeof(struct btrfs_item));
-	memcpy(btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-		data_copy_size, btrfs_leaf_data(l) +
-		leaf_data_end(root, l), data_copy_size);
+	btrfs_memcpy(root, right, right->items, l->items + mid,
+		     (nritems - mid) * sizeof(struct btrfs_item));
+	btrfs_memcpy(root, right,
+		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+		     data_copy_size, btrfs_leaf_data(l) +
+		     leaf_data_end(root, l), data_copy_size);
 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
 		      btrfs_item_end(l->items + mid);
 
@@ -1129,8 +1148,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			  right_buffer->b_blocknr, path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	mark_buffer_dirty(right_buffer);
-	mark_buffer_dirty(l_buf);
+	btrfs_mark_buffer_dirty(right_buffer);
+	btrfs_mark_buffer_dirty(l_buf);
 	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
 		btrfs_block_release(root, path->nodes[0]);
@@ -1200,22 +1219,23 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		/* shift the items */
-		memmove(leaf->items + slot + 1, leaf->items + slot,
-		        (nritems - slot) * sizeof(struct btrfs_item));
+		btrfs_memmove(root, leaf, leaf->items + slot + 1,
+			      leaf->items + slot,
+			      (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
-		memmove(btrfs_leaf_data(leaf) + data_end - data_size,
-			btrfs_leaf_data(leaf) +
-		        data_end, old_data - data_end);
+		btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+			      data_end - data_size, btrfs_leaf_data(leaf) +
+			      data_end, old_data - data_end);
 		data_end = old_data;
 	}
 	/* setup the item for the new data */
-	memcpy(&leaf->items[slot].key, &disk_key,
-		sizeof(struct btrfs_disk_key));
+	btrfs_memcpy(root, leaf, &leaf->items[slot].key, &disk_key,
+		     sizeof(struct btrfs_disk_key));
 	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
 	btrfs_set_item_size(leaf->items + slot, data_size);
 	btrfs_set_header_nritems(&leaf->header, nritems + 1);
-	mark_buffer_dirty(leaf_buf);
+	btrfs_mark_buffer_dirty(leaf_buf);
 
 	ret = 0;
 	if (slot == 0)
@@ -1245,8 +1265,9 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!ret) {
 		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
 				     path.slots[0], u8);
-		memcpy(ptr, data, data_size);
-		mark_buffer_dirty(path.nodes[0]);
+		btrfs_memcpy(root, path.nodes[0]->b_data,
+			     ptr, data, data_size);
+		btrfs_mark_buffer_dirty(path.nodes[0]);
 	}
 	btrfs_release_path(root, &path);
 	return ret;
@@ -1271,8 +1292,10 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	node = btrfs_buffer_node(parent);
 	nritems = btrfs_header_nritems(&node->header);
 	if (slot != nritems -1) {
-		memmove(node->ptrs + slot, node->ptrs + slot + 1,
-			sizeof(struct btrfs_key_ptr) * (nritems - slot - 1));
+		btrfs_memmove(root, node, node->ptrs + slot,
+			      node->ptrs + slot + 1,
+			      sizeof(struct btrfs_key_ptr) *
+			      (nritems - slot - 1));
 	}
 	nritems--;
 	btrfs_set_header_nritems(&node->header, nritems);
@@ -1287,7 +1310,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (wret)
 			ret = wret;
 	}
-	mark_buffer_dirty(parent);
+	btrfs_mark_buffer_dirty(parent);
 	return ret;
 }
 
@@ -1317,16 +1340,18 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (slot != nritems - 1) {
 		int i;
 		int data_end = leaf_data_end(root, leaf);
-		memmove(btrfs_leaf_data(leaf) + data_end + dsize,
-			btrfs_leaf_data(leaf) + data_end,
-			doff - data_end);
+		btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+			      data_end + dsize,
+			      btrfs_leaf_data(leaf) + data_end,
+			      doff - data_end);
 		for (i = slot + 1; i < nritems; i++) {
 			u32 ioff = btrfs_item_offset(leaf->items + i);
 			btrfs_set_item_offset(leaf->items + i, ioff + dsize);
 		}
-		memmove(leaf->items + slot, leaf->items + slot + 1,
-			sizeof(struct btrfs_item) *
-			(nritems - slot - 1));
+		btrfs_memmove(root, leaf, leaf->items + slot,
+			      leaf->items + slot + 1,
+			      sizeof(struct btrfs_item) *
+			      (nritems - slot - 1));
 	}
 	btrfs_set_header_nritems(&leaf->header, nritems - 1);
 	nritems--;
@@ -1336,6 +1361,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			btrfs_set_header_level(&leaf->header, 0);
 		} else {
 			clean_tree_block(trans, root, leaf_buf);
+			wait_on_buffer(leaf_buf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
@@ -1373,6 +1399,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (btrfs_header_nritems(&leaf->header) == 0) {
 				u64 blocknr = leaf_buf->b_blocknr;
 				clean_tree_block(trans, root, leaf_buf);
+				wait_on_buffer(leaf_buf);
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
 					ret = wret;
@@ -1382,11 +1409,11 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				if (wret)
 					ret = wret;
 			} else {
-				mark_buffer_dirty(leaf_buf);
+				btrfs_mark_buffer_dirty(leaf_buf);
 				btrfs_block_release(root, leaf_buf);
 			}
 		} else {
-			mark_buffer_dirty(leaf_buf);
+			btrfs_mark_buffer_dirty(leaf_buf);
 		}
 	}
 	return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index df1a025a771..9ec0d65ebe9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2,6 +2,7 @@
 #define __BTRFS__
 
 #include <linux/fs.h>
+#include <linux/buffer_head.h>
 #include "bit-radix.h"
 
 struct btrfs_trans_handle;
@@ -834,6 +835,57 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+/*
+ * Debugging aid: warn when the byte range [vptr, vptr + len) is not fully
+ * inside [vcontainer, vcontainer + container_len).
+ *
+ * A violation only triggers WARN_ON; nothing is returned and the caller's
+ * memory access is not prevented.
+ */
+static inline void btrfs_check_bounds(void *vptr, size_t len,
+				     void *vcontainer, size_t container_len)
+{
+	char *ptr = vptr;
+	char *container = vcontainer;
+	WARN_ON(ptr < container);
+	WARN_ON(ptr + len > container + container_len);
+}
+
+/*
+ * memcpy() that first checks the destination range against the block that
+ * starts at dst_block; the block length is the superblock's s_blocksize.
+ * The source range is not checked.
+ */
+static inline void btrfs_memcpy(struct btrfs_root *root,
+				void *dst_block,
+				void *dst, const void *src, size_t nr)
+{
+	btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize);
+	memcpy(dst, src, nr);
+}
+
+/*
+ * memmove() counterpart of btrfs_memcpy() for ranges that may overlap.
+ * src is const-qualified, matching btrfs_memcpy() and memmove() itself.
+ */
+static inline void btrfs_memmove(struct btrfs_root *root,
+				void *dst_block,
+				void *dst, const void *src, size_t nr)
+{
+	btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize);
+	memmove(dst, src, nr);
+}
+
+/*
+ * mark_buffer_dirty() wrapper that warns if the buffer head's b_count is
+ * zero when it is dirtied.
+ */
+static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
+{
+	WARN_ON(!atomic_read(&bh->b_count));
+	mark_buffer_dirty(bh);
+}
+
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 18d3e168ad0..f55c89472ac 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -34,10 +34,8 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
-	memcpy(name_ptr, name, name_len);
-	if (name_ptr + name_len > path.nodes[0]->b_data + 4096)
-		WARN_ON(1);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_memcpy(root, path.nodes[0]->b_data, name_ptr, name, name_len);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c010463b9d..bb133104e2e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -8,6 +8,17 @@
 #include "disk-io.h"
 #include "transaction.h"
 
+#define PATTERN 0xDEADBEEFUL
+static inline void check_pattern(struct buffer_head *buf)
+{
+	if (buf->b_private != (void *)PATTERN)
+		WARN_ON(1);
+}
+
+static inline void set_pattern(struct buffer_head *buf)
+{
+	buf->b_private = (void *)PATTERN;
+}
 
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
@@ -51,8 +62,10 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 	} while (bh != head);
 out_unlock:
 	unlock_page(page);
-	if (ret)
+	if (ret) {
 		touch_buffer(ret);
+		check_pattern(ret);
+	}
 	page_cache_release(page);
 	return ret;
 }
@@ -82,6 +95,7 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 			bh->b_bdev = root->fs_info->sb->s_bdev;
 			bh->b_blocknr = first_block;
 			set_buffer_mapped(bh);
+			set_pattern(bh);
 		}
 		if (bh->b_blocknr == blocknr) {
 			ret = bh;
@@ -225,6 +239,7 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		if (!buffer_uptodate(bh))
 			goto fail;
 		csum_tree_block(root, bh, 1);
+		set_pattern(bh);
 	} else {
 		unlock_buffer(bh);
 	}
@@ -240,6 +255,7 @@ fail:
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf)
 {
+	WARN_ON(atomic_read(&buf->b_count) == 0);
 	mark_buffer_dirty(buf);
 	return 0;
 }
@@ -247,6 +263,7 @@ int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf)
 {
+	WARN_ON(atomic_read(&buf->b_count) == 0);
 	clear_buffer_dirty(buf);
 	return 0;
 }
@@ -431,6 +448,7 @@ int close_ctree(struct btrfs_root *root)
 
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
+	check_pattern(buf);
 	brelse(buf);
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b2ae8e768b6..c4194dab7a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -39,7 +39,7 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, &path);
 	finish_current_insert(trans, root->fs_info->extent_root);
@@ -177,10 +177,10 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 			header = btrfs_buffer_header(bh);
 			if (btrfs_header_generation(header) ==
 			    root->fs_info->running_transaction->transid) {
-				brelse(bh);
+				btrfs_block_release(root, bh);
 				return 0;
 			}
-			brelse(bh);
+			btrfs_block_release(root, bh);
 		}
 		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
 	} else {
@@ -224,7 +224,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 	if (refs == 0) {
 		u64 super_blocks_used;
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2d2c23ca7cb..c3992b7b0c6 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,7 +34,7 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_offset(item, 0);
 	btrfs_set_file_extent_num_blocks(item, ins.offset);
 	btrfs_set_file_extent_generation(item, trans->transid);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 	*result = ins.objectid;
 	btrfs_release_path(root, &path);
 	return 0;
@@ -81,7 +81,7 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			      struct btrfs_csum_item);
 	ret = 0;
 	ret = btrfs_csum_data(root, data, len, item->csum);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 fail:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 1b2c5e04350..fa4adb81b14 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -109,7 +109,7 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
 				    path.slots[0], struct btrfs_inode_map_item);
 	btrfs_cpu_key_to_disk(&inode_item->key, location);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(inode_root, &path);
 	return ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a821b5d1e23..f37dab87de1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,6 +45,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_leaf *l;
 	int ret;
 	int slot;
+	struct btrfs_root_item *update_item;
 
 	btrfs_init_path(&path);
 	ret = btrfs_search_slot(trans, root, key, &path, 0, 1);
@@ -53,9 +54,9 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(ret != 0);
 	l = btrfs_buffer_leaf(path.nodes[0]);
 	slot = path.slots[0];
-	memcpy(btrfs_item_ptr(l, slot, struct btrfs_root_item), item,
-		sizeof(*item));
-	mark_buffer_dirty(path.nodes[0]);
+	update_item = btrfs_item_ptr(l, slot, struct btrfs_root_item);
+	btrfs_memcpy(root, l, update_item, item, sizeof(*item));
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 out:
 	btrfs_release_path(root, &path);
 	return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ab5419eea93..894a70bf26d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -557,7 +557,7 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode_item);
 
 	fill_inode_item(inode_item, inode);
-	mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path.nodes[0]);
 failed:
 	btrfs_release_path(root, &path);
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a146463c504..84c4e278ce2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -66,6 +66,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	cur_trans->num_writers--;
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
+	memset(trans, 0, sizeof(*trans));
 	kfree(trans);
 	return 0;
 }
-- 
cgit v1.2.3


From 2c90e5d658424bc71b111eb5a972240d5d06fe86 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 2 Apr 2007 10:50:19 -0400
Subject: Btrfs: still corruption hunting

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |   7 +--
 fs/btrfs/btrfs_inode.h |  14 +++++
 fs/btrfs/ctree.c       |  42 ++++++++++----
 fs/btrfs/ctree.h       |   3 +
 fs/btrfs/disk-io.c     |  84 ++++++++++++++--------------
 fs/btrfs/disk-io.h     |   4 +-
 fs/btrfs/extent-tree.c |  18 ++++--
 fs/btrfs/super.c       | 146 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/transaction.c |  39 +++++++++----
 fs/btrfs/transaction.h |   3 +
 10 files changed, 265 insertions(+), 95 deletions(-)
 create mode 100644 fs/btrfs/btrfs_inode.h

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 845422368f4..a50a1c44eab 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -4,6 +4,7 @@
 #define BIT_ARRAY_BYTES 256
 #define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
 
+extern struct kmem_cache *btrfs_bit_radix_cachep;
 int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 {
 	unsigned long *bits;
@@ -16,7 +17,7 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 
 	bits = radix_tree_lookup(radix, slot);
 	if (!bits) {
-		bits = kmalloc(BIT_ARRAY_BYTES, GFP_NOFS);
+		bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
 		if (!bits)
 			return -ENOMEM;
 		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
@@ -68,12 +69,10 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 			break;
 		}
 	}
-
 	if (empty) {
 		bits = radix_tree_delete(radix, slot);
-		synchronize_rcu();
 		BUG_ON(!bits);
-		kfree(bits);
+		kmem_cache_free(btrfs_bit_radix_cachep, bits);
 	}
 #endif
 	return 0;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
new file mode 100644
index 00000000000..e159841650a
--- /dev/null
+++ b/fs/btrfs/btrfs_inode.h
@@ -0,0 +1,14 @@
+#ifndef __BTRFS_I__
+#define __BTRFS_I__
+
+struct btrfs_inode {		/* wraps an embedded VFS inode */
+	u32 magic;		/* NOTE(review): guard word? confirm */
+	struct inode vfs_inode;	/* BTRFS_I() maps a pointer to this back */
+	u32 magic2;		/* NOTE(review): guard word? confirm */
+};
+static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
+{	/* inode must be the vfs_inode member of a struct btrfs_inode */
+	return container_of(inode, struct btrfs_inode, vfs_inode);
+}
+
+#endif /* __BTRFS_I__ */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a0dfa2d6cb9..453ce835209 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
+struct btrfs_path *btrfs_alloc_path(void)
+{	/* may return NULL; btrfs_init_path() is left to the caller */
+	return kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+}
+
+void btrfs_free_path(struct btrfs_path *p)
+{	/* only returns p to btrfs_path_cachep; p's buffers are not released */
+	kmem_cache_free(btrfs_path_cachep, p);
+}
+
 inline void btrfs_init_path(struct btrfs_path *p)
 {
 	memset(p, 0, sizeof(*p));
@@ -47,17 +57,18 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	cow = btrfs_alloc_free_block(trans, root);
 	cow_node = btrfs_buffer_node(cow);
+	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
+		WARN_ON(1);
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
 	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
-	*cow_ret = cow;
-	btrfs_mark_buffer_dirty(cow);
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
 		get_bh(cow);
-		if (buf != root->commit_root)
+		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
+		}
 		btrfs_block_release(root, buf);
 	} else {
 		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
@@ -66,6 +77,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
+	*cow_ret = cow;
 	return 0;
 }
 
@@ -477,9 +489,12 @@ again:
 					       p->slots[level + 1],
 					       &cow_buf);
 			b = cow_buf;
+			c = btrfs_buffer_node(b);
 		}
 		BUG_ON(!cow && ins_len);
-		c = btrfs_buffer_node(b);
+		if (level != btrfs_header_level(&c->header))
+			WARN_ON(1);
+		level = btrfs_header_level(&c->header);
 		p->nodes[level] = b;
 		ret = check_block(root, p, level);
 		if (ret)
@@ -1257,19 +1272,22 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      data_size)
 {
 	int ret = 0;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	u8 *ptr;
 
-	btrfs_init_path(&path);
-	ret = btrfs_insert_empty_item(trans, root, &path, cpu_key, data_size);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (!ret) {
-		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				     path.slots[0], u8);
-		btrfs_memcpy(root, path.nodes[0]->b_data,
+		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				     path->slots[0], u8);
+		btrfs_memcpy(root, path->nodes[0]->b_data,
 			     ptr, data, data_size);
-		btrfs_mark_buffer_dirty(path.nodes[0]);
+		btrfs_mark_buffer_dirty(path->nodes[0]);
 	}
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ec0d65ebe9..d8e03bd797f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -7,6 +7,7 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+extern struct kmem_cache *btrfs_path_cachep;
 
 #define BTRFS_MAGIC "_BtRfS_M"
 
@@ -888,6 +889,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+struct btrfs_path *btrfs_alloc_path(void);
+void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
 int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bb133104e2e..2dbd55084a4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -8,18 +8,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-#define PATTERN 0xDEADBEEFUL
-static inline void check_pattern(struct buffer_head *buf)
-{
-	if (buf->b_private != (void *)PATTERN)
-		WARN_ON(1);
-}
-
-static inline void set_pattern(struct buffer_head *buf)
-{
-	buf->b_private = (void *)PATTERN;
-}
-
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
 	struct btrfs_node *node = btrfs_buffer_node(buf);
@@ -35,6 +23,8 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 
 struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 {
+	return sb_find_get_block(root->fs_info->sb, blocknr);
+#if 0
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	int blockbits = root->fs_info->sb->s_blocksize_bits;
 	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
@@ -43,6 +33,7 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 	struct buffer_head *head;
 	struct buffer_head *ret = NULL;
 
+
 	page = find_lock_page(mapping, index);
 	if (!page)
 		return NULL;
@@ -64,15 +55,17 @@ out_unlock:
 	unlock_page(page);
 	if (ret) {
 		touch_buffer(ret);
-		check_pattern(ret);
 	}
 	page_cache_release(page);
 	return ret;
+#endif
 }
 
 struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr)
 {
+	return sb_getblk(root->fs_info->sb, blocknr);
+#if 0
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	int blockbits = root->fs_info->sb->s_blocksize_bits;
 	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
@@ -95,7 +88,6 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 			bh->b_bdev = root->fs_info->sb->s_bdev;
 			bh->b_blocknr = first_block;
 			set_buffer_mapped(bh);
-			set_pattern(bh);
 		}
 		if (bh->b_blocknr == blocknr) {
 			ret = bh;
@@ -111,6 +103,7 @@ out_unlock:
 		touch_buffer(ret);
 	page_cache_release(page);
 	return ret;
+#endif
 }
 
 static sector_t max_block(struct block_device *bdev)
@@ -225,6 +218,8 @@ static struct address_space_operations btree_aops = {
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
+	return sb_bread(root->fs_info->sb, blocknr);
+#if 0
 	struct buffer_head *bh = NULL;
 
 	bh = btrfs_find_create_tree_block(root, blocknr);
@@ -239,7 +234,6 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		if (!buffer_uptodate(bh))
 			goto fail;
 		csum_tree_block(root, bh, 1);
-		set_pattern(bh);
 	} else {
 		unlock_buffer(bh);
 	}
@@ -250,6 +244,7 @@ fail:
 	brelse(bh);
 	return NULL;
 
+#endif
 }
 
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -268,14 +263,14 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 }
 
-static int __setup_root(struct btrfs_super_block *super,
+static int __setup_root(int blocksize,
 			struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
 			u64 objectid)
 {
 	root->node = NULL;
 	root->commit_root = NULL;
-	root->blocksize = btrfs_super_blocksize(super);
+	root->blocksize = blocksize;
 	root->ref_cows = 0;
 	root->fs_info = fs_info;
 	memset(&root->root_key, 0, sizeof(root->root_key));
@@ -283,7 +278,7 @@ static int __setup_root(struct btrfs_super_block *super,
 	return 0;
 }
 
-static int find_and_setup_root(struct btrfs_super_block *super,
+static int find_and_setup_root(int blocksize,
 			       struct btrfs_root *tree_root,
 			       struct btrfs_fs_info *fs_info,
 			       u64 objectid,
@@ -291,7 +286,7 @@ static int find_and_setup_root(struct btrfs_super_block *super,
 {
 	int ret;
 
-	__setup_root(super, root, fs_info, objectid);
+	__setup_root(blocksize, root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
@@ -302,9 +297,7 @@ static int find_and_setup_root(struct btrfs_super_block *super,
 	return 0;
 }
 
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct buffer_head *sb_buffer,
-			      struct btrfs_super_block *disk_super)
+struct btrfs_root *open_ctree(struct super_block *sb)
 {
 	struct btrfs_root *root = kmalloc(sizeof(struct btrfs_root),
 					  GFP_NOFS);
@@ -317,13 +310,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	int ret;
+	struct btrfs_super_block *disk_super;
 
-	if (!btrfs_super_root(disk_super)) {
-		return NULL;
-	}
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
-	sb_set_blocksize(sb, sb_buffer->b_size);
+	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
 	fs_info->fs_root = root;
 	fs_info->tree_root = tree_root;
@@ -331,55 +322,59 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->inode_root = inode_root;
 	fs_info->last_inode_alloc = 0;
 	fs_info->last_inode_alloc_dirid = 0;
-	fs_info->disk_super = disk_super;
 	fs_info->sb = sb;
+	fs_info->btree_inode = NULL;
+#if 0
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
+	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	insert_inode_hash(fs_info->btree_inode);
-
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
+#endif
 	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
 	spin_lock_init(&fs_info->hash_lock);
-
 	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
 		printk("failed to allocate sha256 hash\n");
 		return NULL;
 	}
-
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
-	__setup_root(disk_super, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
-
-	fs_info->sb_buffer = read_tree_block(tree_root, sb_buffer->b_blocknr);
+	__setup_root(sb->s_blocksize, tree_root,
+		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
+	fs_info->sb_buffer = read_tree_block(tree_root,
+					     BTRFS_SUPER_INFO_OFFSET /
+					     sb->s_blocksize);
 
 	if (!fs_info->sb_buffer) {
 printk("failed2\n");
 		return NULL;
 	}
-	brelse(sb_buffer);
-	sb_buffer = NULL;
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
+	if (!btrfs_super_root(disk_super)) {
+		return NULL;
+	}
 	fs_info->disk_super = disk_super;
-
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
 
-	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+	mutex_lock(&fs_info->fs_mutex);
+	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
 				  BTRFS_INODE_MAP_OBJECTID, inode_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(disk_super, tree_root, fs_info,
+	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
 				  BTRFS_FS_TREE_OBJECTID, root);
+	mutex_unlock(&fs_info->fs_mutex);
 	BUG_ON(ret);
 	root->commit_root = root->node;
 	get_bh(root->node);
@@ -392,9 +387,11 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
 	struct buffer_head *bh = root->fs_info->sb_buffer;
+
 	btrfs_set_super_root(root->fs_info->disk_super,
 			     root->fs_info->tree_root->node->b_blocknr);
 	lock_buffer(bh);
+	WARN_ON(atomic_read(&bh->b_count) < 1);
 	clear_buffer_dirty(bh);
 	csum_tree_block(root, bh, 0);
 	bh->b_end_io = end_buffer_write_sync;
@@ -413,6 +410,7 @@ int close_ctree(struct btrfs_root *root)
 	int ret;
 	struct btrfs_trans_handle *trans;
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -421,6 +419,7 @@ int close_ctree(struct btrfs_root *root)
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 	write_ctree_super(NULL, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (root->node)
 		btrfs_block_release(root, root->node);
@@ -436,8 +435,8 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_block_release(root, root->commit_root);
 	btrfs_block_release(root, root->fs_info->sb_buffer);
 	crypto_free_hash(root->fs_info->hash_tfm);
-	truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
-	iput(root->fs_info->btree_inode);
+	// truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
+	// iput(root->fs_info->btree_inode);
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
 	kfree(root->fs_info->tree_root);
@@ -448,7 +447,6 @@ int close_ctree(struct btrfs_root *root)
 
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
-	check_pattern(buf);
-	brelse(buf);
+	// brelse(buf);
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f6998e2192c..ac6764ba8aa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -31,9 +31,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct buffer_head *buf);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct buffer_head *sb_buffer,
-			      struct btrfs_super_block *disk_super);
+struct btrfs_root *open_ctree(struct super_block *sb);
 int close_ctree(struct btrfs_root *root);
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4194dab7a3..37b87e28a2f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -173,12 +173,16 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 
 	if (!pending) {
 		bh = btrfs_find_tree_block(root, blocknr);
-		if (bh && buffer_uptodate(bh)) {
-			header = btrfs_buffer_header(bh);
-			if (btrfs_header_generation(header) ==
-			    root->fs_info->running_transaction->transid) {
-				btrfs_block_release(root, bh);
-				return 0;
+		if (bh) {
+			if (buffer_uptodate(bh)) {
+				u64 transid =
+				    root->fs_info->running_transaction->transid;
+				header = btrfs_buffer_header(bh);
+				if (btrfs_header_generation(header) ==
+				    transid) {
+					btrfs_block_release(root, bh);
+					return 0;
+				}
 			}
 			btrfs_block_release(root, bh);
 		}
@@ -539,6 +543,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	 */
 	while(*level >= 0) {
 		cur = path->nodes[*level];
+		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+			WARN_ON(1);
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(btrfs_buffer_header(cur)))
 			break;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 894a70bf26d..6969b672b57 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -14,6 +14,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "btrfs_inode.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
@@ -24,6 +25,14 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct file_operations btrfs_file_operations;
 
+static int check_inode(struct inode *inode)
+{
+	struct btrfs_inode *ei = BTRFS_I(inode);
+	WARN_ON(ei->magic != 0xDEADBEEF);
+	WARN_ON(ei->magic2 != 0xDEADBEAF);
+	return 0;
+}
+
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path path;
@@ -34,6 +43,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	btrfs_init_path(&path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
+	check_inode(inode);
 	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
 	if (ret) {
 		btrfs_release_path(root, &path);
@@ -41,6 +51,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		make_bad_inode(inode);
 		return;
 	}
+	check_inode(inode);
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
 				  path.slots[0],
 				  struct btrfs_inode_item);
@@ -60,6 +71,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_generation = btrfs_inode_generation(inode_item);
 	btrfs_release_path(root, &path);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	check_inode(inode);
 	switch (inode->i_mode & S_IFMT) {
 #if 0
 	default:
@@ -80,6 +92,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		// inode->i_op = &page_symlink_inode_operations;
 		break;
 	}
+	check_inode(inode);
 	return;
 }
 
@@ -347,6 +360,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 				    namelen, 0);
 	if (ret || !btrfs_match_dir_item_name(root, &path, name, namelen)) {
 		*ino = 0;
+		ret = 0;
 		goto out;
 	}
 	di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
@@ -354,6 +368,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	*ino = btrfs_dir_objectid(di);
 out:
 	btrfs_release_path(root, &path);
+	check_inode(dir);
 	return ret;
 }
 
@@ -367,7 +382,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_inode_by_name(dir, dentry, &ino);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -378,7 +392,9 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = iget(dir->i_sb, ino);
 		if (!inode)
 			return ERR_PTR(-EACCES);
+		check_inode(inode);
 	}
+	check_inode(dir);
 	return d_splice_alias(inode, dentry);
 }
 
@@ -471,23 +487,14 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	struct inode * inode;
 	struct dentry * root_dentry;
 	struct btrfs_super_block *disk_super;
-	struct buffer_head *bh;
 	struct btrfs_root *root;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
 	sb->s_op = &btrfs_super_ops;
 	sb->s_time_gran = 1;
 
-	bh = sb_bread(sb, BTRFS_SUPER_INFO_OFFSET / sb->s_blocksize);
-	if (!bh) {
-		printk("btrfs: unable to read on disk super\n");
-		return -EIO;
-	}
-	disk_super = (struct btrfs_super_block *)bh->b_data;
-	root = open_ctree(sb, bh, disk_super);
+	root = open_ctree(sb);
 
 	if (!root) {
 		printk("btrfs: open_ctree failed\n");
@@ -533,6 +540,7 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
+	check_inode(inode);
 }
 
 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
@@ -560,6 +568,7 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path.nodes[0]);
 failed:
 	btrfs_release_path(root, &path);
+	check_inode(inode);
 	return 0;
 }
 
@@ -577,6 +586,7 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 	else
 		btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	check_inode(inode);
 	return ret;
 }
 
@@ -594,6 +604,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	check_inode(inode);
 	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	BUG_ON(ret);
 
@@ -616,6 +627,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	insert_inode_hash(inode);
+	check_inode(inode);
+	check_inode(dir);
 	return inode;
 }
 
@@ -632,7 +645,8 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_inode(trans, btrfs_sb(inode->i_sb),
 					 dentry->d_parent->d_inode);
 	}
-
+	check_inode(inode);
+	check_inode(dentry->d_parent->d_inode);
 	return ret;
 }
 
@@ -644,6 +658,9 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 		d_instantiate(dentry, inode);
 		return 0;
 	}
+	if (err > 0)
+		err = -EEXIST;
+	check_inode(inode);
 	return err;
 }
 
@@ -675,6 +692,9 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	check_inode(inode);
+	check_inode(dir);
+
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -755,11 +775,11 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	sb->s_dirt = 0;
 	if (!wait) {
-		filemap_flush(root->fs_info->btree_inode->i_mapping);
+		// filemap_flush(root->fs_info->btree_inode->i_mapping);
+		filemap_flush(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
 		return 0;
 	}
-	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
-
+	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
@@ -1242,6 +1262,95 @@ static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_bit_radix_cachep;
+struct kmem_cache *btrfs_path_cachep;
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+static struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+	struct btrfs_inode *ei;
+
+	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+	ei->magic = 0xDEADBEEF;
+	ei->magic2 = 0xDEADBEAF;
+	return &ei->vfs_inode;
+}
+
+static void btrfs_destroy_inode(struct inode *inode)
+{
+	struct btrfs_inode *ei = BTRFS_I(inode);
+	WARN_ON(ei->magic != 0xDEADBEEF);
+	WARN_ON(ei->magic2 != 0xDEADBEAF);
+	WARN_ON(!list_empty(&inode->i_dentry));
+	WARN_ON(inode->i_ino == 1);
+	WARN_ON(inode->i_data.nrpages);
+
+	ei->magic = 0;
+	ei->magic2 = 0;
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+
+static void init_once(void * foo, struct kmem_cache * cachep,
+		      unsigned long flags)
+{
+	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		inode_init_once(&ei->vfs_inode);
+	}
+}
+
+/*
+ * Create every btrfs slab cache; returns 0, or -ENOMEM if any one failed.
+ */
+static int init_inodecache(void)
+{
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+					     sizeof(struct btrfs_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once, NULL);
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+					     sizeof(struct btrfs_trans_handle),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD), NULL, NULL);
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+					     sizeof(struct btrfs_transaction),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD), NULL, NULL);
+	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+					     sizeof(struct btrfs_path),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD), NULL, NULL);
+	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0,
+					     (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD |
+						SLAB_DESTROY_BY_RCU),
+					     NULL, NULL);
+	if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL ||
+	    btrfs_transaction_cachep == NULL || btrfs_path_cachep == NULL ||
+	    btrfs_bit_radix_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(btrfs_inode_cachep);
+	kmem_cache_destroy(btrfs_trans_handle_cachep);
+	kmem_cache_destroy(btrfs_transaction_cachep);
+	kmem_cache_destroy(btrfs_bit_radix_cachep);
+	kmem_cache_destroy(btrfs_path_cachep);
+}
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
@@ -1265,6 +1374,8 @@ static struct super_operations btrfs_super_ops = {
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
 	.write_inode	= btrfs_write_inode,
+	.alloc_inode	= btrfs_alloc_inode,
+	.destroy_inode	= btrfs_destroy_inode,
 };
 
 static struct inode_operations btrfs_dir_inode_operations = {
@@ -1305,12 +1416,17 @@ static struct file_operations btrfs_file_operations = {
 
 static int __init init_btrfs_fs(void)
 {
+	int err;
 	printk("btrfs loaded!\n");
+	err = init_inodecache();
+	if (err)
+		return err;
 	return register_filesystem(&btrfs_fs_type);
 }
 
 static void __exit exit_btrfs_fs(void)
 {
 	unregister_filesystem(&btrfs_fs_type);
+	destroy_inodecache(); /* reverse of init: caches go after unregister */
 	printk("btrfs unloaded\n");
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 84c4e278ce2..72b52e1e0b1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -5,13 +5,20 @@
 #include "transaction.h"
 
 static int total_trans = 0;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+
+#define TRANS_MAGIC 0xE1E10E
 static void put_transaction(struct btrfs_transaction *transaction)
 {
+	WARN_ON(transaction->use_count == 0);
 	transaction->use_count--;
+	WARN_ON(transaction->magic != TRANS_MAGIC);
 	if (transaction->use_count == 0) {
 		WARN_ON(total_trans == 0);
 		total_trans--;
-		kfree(transaction);
+		memset(transaction, 0, sizeof(*transaction));
+		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
 
@@ -20,7 +27,8 @@ static int join_transaction(struct btrfs_root *root)
 	struct btrfs_transaction *cur_trans;
 	cur_trans = root->fs_info->running_transaction;
 	if (!cur_trans) {
-		cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
+		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
+					     GFP_NOFS);
 		total_trans++;
 		BUG_ON(!cur_trans);
 		root->fs_info->running_transaction = cur_trans;
@@ -28,6 +36,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->transid = root->root_key.offset + 1;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
+		cur_trans->magic = TRANS_MAGIC;
 		cur_trans->in_commit = 0;
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
@@ -39,7 +48,8 @@ static int join_transaction(struct btrfs_root *root)
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
-	struct btrfs_trans_handle *h = kmalloc(sizeof(*h), GFP_NOFS);
+	struct btrfs_trans_handle *h =
+		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
 
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -51,6 +61,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	h->blocks_used = 0;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
+	h->magic = h->magic2 = TRANS_MAGIC;
 	return h;
 }
 
@@ -58,6 +69,8 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
+	WARN_ON(trans->magic != TRANS_MAGIC);
+	WARN_ON(trans->magic2 != TRANS_MAGIC);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
 	WARN_ON(cur_trans->num_writers < 1);
@@ -67,7 +80,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	memset(trans, 0, sizeof(*trans));
-	kfree(trans);
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	return 0;
 }
 
@@ -75,7 +88,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
+	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
 	return 0;
 }
 
@@ -137,6 +150,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
+printk("already in commit!, waiting\n");
 		cur_trans = trans->transaction;
 		trans->transaction->use_count++;
 		btrfs_end_transaction(trans, root);
@@ -146,7 +160,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		return 0;
 	}
+	cur_trans = trans->transaction;
+	trans->transaction->in_commit = 1;
 	while (trans->transaction->num_writers > 1) {
+		WARN_ON(cur_trans != trans->transaction);
 		prepare_to_wait(&trans->transaction->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (trans->transaction->num_writers <= 1)
@@ -154,15 +171,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
 		mutex_lock(&root->fs_info->trans_mutex);
+		finish_wait(&trans->transaction->writer_wait, &wait);
 	}
 	finish_wait(&trans->transaction->writer_wait, &wait);
-
+	WARN_ON(cur_trans != trans->transaction);
 	if (root->node != root->commit_root) {
 		memcpy(&snap_key, &root->root_key, sizeof(snap_key));
 		root->root_key.offset++;
 	}
 
-
 	if (btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) {
 		btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
 		ret = btrfs_insert_root(trans, root->fs_info->tree_root,
@@ -172,22 +189,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
-
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
 	mutex_unlock(&root->fs_info->trans_mutex);
-
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 
 	write_ctree_super(trans, root);
 	btrfs_finish_extent_commit(trans, root);
 	mutex_lock(&root->fs_info->trans_mutex);
+	cur_trans->commit_done = 1;
+	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
-	kfree(trans);
-
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	if (root->node != root->commit_root) {
 		trans = btrfs_start_transaction(root, 1);
 		snap = root->commit_root;
@@ -203,7 +219,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		ret = btrfs_end_transaction(trans, root);
 		BUG_ON(ret);
 	}
-
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7e6c08a0acc..4f1496ae6f2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -7,15 +7,18 @@ struct btrfs_transaction {
 	int in_commit;
 	int use_count;
 	int commit_done;
+	int magic;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
 
 struct btrfs_trans_handle {
+	int magic;
 	u64 transid;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
+	int magic2;
 };
 
 
-- 
cgit v1.2.3


From 5caf2a002901f0fde475371c4bf1c553b51884af Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 2 Apr 2007 11:20:42 -0400
Subject: Btrfs: dynamic allocation of path struct

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/dir-item.c    |  18 +++--
 fs/btrfs/extent-tree.c | 100 +++++++++++++++----------
 fs/btrfs/file-item.c   |  43 ++++++-----
 fs/btrfs/inode-item.c  |   9 ++-
 fs/btrfs/root-tree.c   |  47 +++++++-----
 fs/btrfs/super.c       | 198 ++++++++++++++++++++++++++++---------------------
 6 files changed, 243 insertions(+), 172 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f55c89472ac..baceb1da609 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -9,7 +9,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  objectid, u8 type)
 {
 	int ret = 0;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
 	struct btrfs_key key;
@@ -20,24 +20,26 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
-	ret = btrfs_insert_empty_item(trans, root, &path, &key, data_size);
+	ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
 	if (ret)
 		goto out;
 
-	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				  path.slots[0],
+	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
 				  struct btrfs_dir_item);
 	btrfs_set_dir_objectid(dir_item, objectid);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
-	btrfs_memcpy(root, path.nodes[0]->b_data, name_ptr, name, name_len);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 37b87e28a2f..d785b721b46 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -15,7 +15,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, u64 blocknr, u64 num_blocks)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_leaf *l;
@@ -25,23 +25,26 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1,
 			 &ins);
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
 	if (ret != 0)
 		BUG();
 	BUG_ON(ret != 0);
-	l = btrfs_buffer_leaf(path.nodes[0]);
-	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(item);
 	btrfs_set_extent_refs(item, refs + 1);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 
-	btrfs_release_path(root->fs_info->extent_root, &path);
+	btrfs_release_path(root->fs_info->extent_root, path);
+	btrfs_free_path(path);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
 	return 0;
@@ -50,24 +53,27 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *root, u64 blocknr, u64 num_blocks, u32 *refs)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_leaf *l;
 	struct btrfs_extent_item *item;
-	btrfs_init_path(&path);
+
+	path = btrfs_alloc_path();
+	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.offset = num_blocks;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, &path,
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 0);
 	if (ret != 0)
 		BUG();
-	l = btrfs_buffer_leaf(path.nodes[0]);
-	item = btrfs_item_ptr(l, path.slots[0], struct btrfs_extent_item);
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
 	*refs = btrfs_extent_refs(item);
-	btrfs_release_path(root->fs_info->extent_root, &path);
+	btrfs_release_path(root->fs_info->extent_root, path);
+	btrfs_free_path(path);
 	return 0;
 }
 
@@ -200,7 +206,7 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, u64 blocknr, u64 num_blocks, int pin)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
@@ -215,20 +221,22 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.offset = num_blocks;
 
 	find_free_extent(trans, root, 0, 0, (u64)-1, &ins);
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(trans, extent_root, &key, &path, -1, 1);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret) {
 		printk("failed to find %Lu\n", key.objectid);
 		btrfs_print_tree(extent_root, extent_root->node);
 		printk("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
-	ei = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_extent_item);
 	BUG_ON(ei->refs == 0);
 	refs = btrfs_extent_refs(ei) - 1;
 	btrfs_set_extent_refs(ei, refs);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (refs == 0) {
 		u64 super_blocks_used;
 
@@ -240,13 +248,14 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used - num_blocks);
-		ret = btrfs_del_item(trans, extent_root, &path);
+		ret = btrfs_del_item(trans, extent_root, path);
 		if (extent_root->fs_info->last_insert.objectid > blocknr)
 			extent_root->fs_info->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
-	btrfs_release_path(extent_root, &path);
+	btrfs_release_path(extent_root, path);
+	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root);
 	return ret;
 }
@@ -319,7 +328,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start, u64
 			    search_end, struct btrfs_key *ins)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
 	u64 hole_size = 0;
@@ -339,24 +348,25 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+	path = btrfs_alloc_path();
 
 check_failed:
-	btrfs_init_path(&path);
+	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
 	start_found = 0;
-	ret = btrfs_search_slot(trans, root, ins, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
 	if (ret < 0)
 		goto error;
 
-	if (path.slots[0] > 0)
-		path.slots[0]--;
+	if (path->slots[0] > 0)
+		path->slots[0]--;
 
 	while (1) {
-		l = btrfs_buffer_leaf(path.nodes[0]);
-		slot = path.slots[0];
+		l = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
-			ret = btrfs_next_leaf(root, &path);
+			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
 			if (ret < 0)
@@ -387,14 +397,14 @@ check_failed:
 		}
 		start_found = 1;
 		last_block = key.objectid + key.offset;
-		path.slots[0]++;
+		path->slots[0]++;
 	}
 	// FIXME -ENOSPC
 check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
 	 */
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + total_needed; test_block++) {
@@ -410,9 +420,11 @@ check_pending:
 	root->fs_info->current_insert.flags = 0;
 	root->fs_info->last_insert.objectid = ins->objectid;
 	ins->offset = num_blocks;
+	btrfs_free_path(path);
 	return 0;
 error:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -533,6 +545,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	u32 refs;
 
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = lookup_block_ref(trans, root, path->nodes[*level]->b_blocknr,
 			       1, &refs);
 	BUG_ON(ret);
@@ -542,6 +556,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	 * walk down to the last node level and free all the leaves
 	 */
 	while(*level >= 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
 		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
 			WARN_ON(1);
@@ -564,6 +580,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			continue;
 		}
 		next = read_tree_block(root, blocknr);
+		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			btrfs_block_release(root, path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
@@ -571,6 +588,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		path->slots[*level] = 0;
 	}
 out:
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = btrfs_free_extent(trans, root,
 				path->nodes[*level]->b_blocknr, 1, 1);
 	btrfs_block_release(root, path->nodes[*level]);
@@ -622,33 +641,36 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret = 0;
 	int wret;
 	int level;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int i;
 	int orig_level;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 
 	level = btrfs_header_level(btrfs_buffer_header(snap));
 	orig_level = level;
-	path.nodes[level] = snap;
-	path.slots[level] = 0;
+	path->nodes[level] = snap;
+	path->slots[level] = 0;
 	while(1) {
-		wret = walk_down_tree(trans, root, &path, &level);
+		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
 		if (wret < 0)
 			ret = wret;
 
-		wret = walk_up_tree(trans, root, &path, &level);
+		wret = walk_up_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
 		if (wret < 0)
 			ret = wret;
 	}
 	for (i = 0; i <= orig_level; i++) {
-		if (path.nodes[i]) {
-			btrfs_block_release(root, path.nodes[i]);
+		if (path->nodes[i]) {
+			btrfs_block_release(root, path->nodes[i]);
 		}
 	}
+	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index c3992b7b0c6..e7510ac5559 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -13,9 +13,11 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_key file_key;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	ret = btrfs_alloc_extent(trans, root, num_blocks, hint_block,
 				 (u64)-1, objectid, &ins);
 	BUG_ON(ret);
@@ -24,19 +26,20 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
-	ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
+	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
 	BUG_ON(ret);
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
 	btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
 	btrfs_set_file_extent_disk_num_blocks(item, ins.offset);
 	btrfs_set_file_extent_offset(item, 0);
 	btrfs_set_file_extent_num_blocks(item, ins.offset);
 	btrfs_set_file_extent_generation(item, trans->transid);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	*result = ins.objectid;
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return 0;
 }
 
@@ -65,25 +68,28 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_key file_key;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-	ret = btrfs_insert_empty_item(trans, root, &path, &file_key,
+	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      BTRFS_CSUM_SIZE);
 	if (ret != 0 && ret != -EEXIST)
 		goto fail;
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_csum_item);
 	ret = 0;
 	ret = btrfs_csum_data(root, data, len, item->csum);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -93,19 +99,21 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_key file_key;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	char result[BTRFS_CSUM_SIZE];
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-	ret = btrfs_search_slot(NULL, root, &file_key, &path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &file_key, path, 0, 0);
 	if (ret)
 		goto fail;
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_csum_item);
 	ret = 0;
 	ret = btrfs_csum_data(root, data, len, result);
@@ -113,7 +121,8 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	if (memcmp(result, item->csum, BTRFS_CSUM_SIZE))
 		ret = 1;
 fail:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8d8c26a6c1a..6bfa980790c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -7,7 +7,7 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, u64 objectid, struct btrfs_inode_item
 		       *inode_item)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
 	key.objectid = objectid;
@@ -15,10 +15,13 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	ret = btrfs_insert_item(trans, root, &key, inode_item,
 				sizeof(*inode_item));
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f37dab87de1..ddc1c13a535 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -6,7 +6,7 @@
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 			struct btrfs_root_item *item, struct btrfs_key *key)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key search_key;
 	struct btrfs_leaf *l;
 	int ret;
@@ -16,14 +16,16 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	search_key.flags = (u32)-1;
 	search_key.offset = (u32)-1;
 
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(NULL, root, &search_key, &path, 0, 0);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
-	l = btrfs_buffer_leaf(path.nodes[0]);
-	BUG_ON(path.slots[0] == 0);
-	slot = path.slots[0] - 1;
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	BUG_ON(path->slots[0] == 0);
+	slot = path->slots[0] - 1;
 	if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) {
 		ret = 1;
 		goto out;
@@ -31,9 +33,10 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item),
 		sizeof(*item));
 	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
-	btrfs_release_path(root, &path);
 	ret = 0;
 out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -41,24 +44,27 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_root_item
 		      *item)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_leaf *l;
 	int ret;
 	int slot;
 	struct btrfs_root_item *update_item;
 
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(trans, root, key, &path, 0, 1);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	l = btrfs_buffer_leaf(path.nodes[0]);
-	slot = path.slots[0];
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	slot = path->slots[0];
 	update_item = btrfs_item_ptr(l, slot, struct btrfs_root_item);
 	btrfs_memcpy(root, l, update_item, item, sizeof(*item));
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -75,16 +81,19 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int ret;
 
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(trans, root, key, &path, -1, 1);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	ret = btrfs_del_item(trans, root, &path);
+	ret = btrfs_del_item(trans, root, path);
 out:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6969b672b57..c260fcad17b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -35,25 +35,27 @@ static int check_inode(struct inode *inode)
 
 static void btrfs_read_locked_inode(struct inode *inode)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	int ret;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
 	check_inode(inode);
-	ret = btrfs_lookup_inode(NULL, root, &path, inode->i_ino, 0);
+	ret = btrfs_lookup_inode(NULL, root, path, inode->i_ino, 0);
 	if (ret) {
-		btrfs_release_path(root, &path);
+		btrfs_release_path(root, path);
 		mutex_unlock(&root->fs_info->fs_mutex);
 		make_bad_inode(inode);
 		return;
 	}
 	check_inode(inode);
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				  path.slots[0],
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
 				  struct btrfs_inode_item);
 
 	inode->i_mode = btrfs_inode_mode(inode_item);
@@ -69,7 +71,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
-	btrfs_release_path(root, &path);
+
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	inode_item = NULL;
+
 	mutex_unlock(&root->fs_info->fs_mutex);
 	check_inode(inode);
 	switch (inode->i_mode & S_IFMT) {
@@ -101,15 +107,17 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 			      struct inode *dir,
 			      struct dentry *dentry)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	const char *name = dentry->d_name.name;
 	int name_len = dentry->d_name.len;
 	int ret;
 	u64 objectid;
 	struct btrfs_dir_item *di;
 
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(trans, root, &path, dir->i_ino,
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (ret < 0)
 		goto err;
@@ -117,15 +125,16 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		ret = -ENOENT;
 		goto err;
 	}
-	di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
 	objectid = btrfs_dir_objectid(di);
 
-	ret = btrfs_del_item(trans, root, &path);
+	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	if (ret == 0)
 		inode_dec_link_count(dentry->d_inode);
 	return ret;
@@ -152,30 +161,32 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int ret;
 	struct btrfs_root *root = btrfs_sb(dir->i_sb);
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
 	}
 
 	BUG_ON(ret == 0);
-	BUG_ON(path.slots[0] == 0);
-	path.slots[0]--;
-	leaf = btrfs_buffer_leaf(path.nodes[0]);
-	found_key = &leaf->items[path.slots[0]].key;
+	BUG_ON(path->slots[0] == 0);
+	path->slots[0]--;
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	found_key = &leaf->items[path->slots[0]].key;
 	if (btrfs_disk_key_objectid(found_key) != inode->i_ino) {
 		err = -ENOENT;
 		goto out;
@@ -185,11 +196,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		err = -ENOTEMPTY;
 		goto out;
 	}
-	ret = btrfs_del_item(trans, root, &path);
+	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
 	key.offset = 1;
-	ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -198,12 +209,13 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		err = -ENOTEMPTY;
 		goto out;
 	}
-	ret = btrfs_del_item(trans, root, &path);
+	ret = btrfs_del_item(trans, root, path);
 	if (ret) {
 		err = ret;
 		goto out;
 	}
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
@@ -223,33 +235,36 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 			    struct inode *inode)
 {
 	u64 objectid = inode->i_ino;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_inode_map_item *map;
 	struct btrfs_key stat_data_key;
 	int ret;
+
 	clear_inode(inode);
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_inode_map(trans, root, &path, objectid, -1);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_lookup_inode_map(trans, root, path, objectid, -1);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
-		btrfs_release_path(root, &path);
 		goto error;
 	}
-	map = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	map = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_inode_map_item);
 	btrfs_disk_key_to_cpu(&stat_data_key, &map->key);
-	ret = btrfs_del_item(trans, root->fs_info->inode_root, &path);
+	ret = btrfs_del_item(trans, root->fs_info->inode_root, path);
 	BUG_ON(ret);
-	btrfs_release_path(root, &path);
-	btrfs_init_path(&path);
+	btrfs_release_path(root, path);
 
-	ret = btrfs_lookup_inode(trans, root, &path, objectid, -1);
+	ret = btrfs_lookup_inode(trans, root, path, objectid, -1);
 	BUG_ON(ret);
-	ret = btrfs_del_item(trans, root, &path);
+	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
-	btrfs_release_path(root, &path);
 error:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -258,7 +273,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct inode *inode)
 {
 	int ret;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
@@ -267,24 +282,25 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	u64 extent_num_blocks = 0;
 	int found_extent;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
 	while(1) {
-		btrfs_init_path(&path);
-		ret = btrfs_search_slot(trans, root, &key, &path, -1, 1);
+		btrfs_init_path(path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0) {
-			btrfs_release_path(root, &path);
 			goto error;
 		}
 		if (ret > 0) {
-			BUG_ON(path.slots[0] == 0);
-			path.slots[0]--;
+			BUG_ON(path->slots[0] == 0);
+			path->slots[0]--;
 		}
-		leaf = btrfs_buffer_leaf(path.nodes[0]);
-		found_key = &leaf->items[path.slots[0]].key;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		found_key = &leaf->items[path->slots[0]].key;
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
 		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
@@ -293,8 +309,8 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
 			break;
 		if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
-			fi = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-					    path.slots[0],
+			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+					    path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_start = btrfs_file_extent_disk_blocknr(fi);
 			extent_num_blocks =
@@ -305,18 +321,19 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		} else {
 			found_extent = 0;
 		}
-		ret = btrfs_del_item(trans, root, &path);
+		ret = btrfs_del_item(trans, root, path);
 		BUG_ON(ret);
-		btrfs_release_path(root, &path);
+		btrfs_release_path(root, path);
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_blocks, 0);
 			BUG_ON(ret);
 		}
 	}
-	btrfs_release_path(root, &path);
 	ret = 0;
 error:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -351,23 +368,26 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct btrfs_dir_item *di;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_root *root = btrfs_sb(dir->i_sb);
 	int ret;
 
-	btrfs_init_path(&path);
-	ret = btrfs_lookup_dir_item(NULL, root, &path, dir->i_ino, name,
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
-	if (ret || !btrfs_match_dir_item_name(root, &path, name, namelen)) {
+	if (ret || !btrfs_match_dir_item_name(root, path, name, namelen)) {
 		*ino = 0;
 		ret = 0;
 		goto out;
 	}
-	di = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
 	*ino = btrfs_dir_objectid(di);
 out:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	check_inode(dir);
 	return ret;
 }
@@ -405,7 +425,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int ret;
 	u32 nritems;
 	struct btrfs_leaf *leaf;
@@ -419,27 +439,28 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	key.offset = filp->f_pos;
-	btrfs_init_path(&path);
-	ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+	path = btrfs_alloc_path();
+	btrfs_init_path(path);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0) {
 		goto err;
 	}
 	advance = 0;
 	while(1) {
-		leaf = btrfs_buffer_leaf(path.nodes[0]);
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
-		slot = path.slots[0];
+		slot = path->slots[0];
 		if (advance || slot >= nritems) {
 			if (slot >= nritems -1) {
-				ret = btrfs_next_leaf(root, &path);
+				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
-				leaf = btrfs_buffer_leaf(path.nodes[0]);
+				leaf = btrfs_buffer_leaf(path->nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
-				slot = path.slots[0];
+				slot = path->slots[0];
 			} else {
 				slot++;
-				path.slots[0]++;
+				path->slots[0]++;
 			}
 		}
 		advance = 1;
@@ -465,7 +486,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	}
 	ret = 0;
 err:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
@@ -548,26 +570,29 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	int ret;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 
-	ret = btrfs_lookup_inode(trans, root, &path, inode->i_ino, 1);
+	ret = btrfs_lookup_inode(trans, root, path, inode->i_ino, 1);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
 		goto failed;
 	}
 
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				  path.slots[0],
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
 				  struct btrfs_inode_item);
 
 	fill_inode_item(inode_item, inode);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 failed:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	check_inode(inode);
 	return 0;
 }
@@ -799,38 +824,39 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_leaf *leaf;
 	struct btrfs_disk_key *found_key;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	if (create)
 		trans = btrfs_start_transaction(root, 1);
 
 
-	ret = btrfs_lookup_file_extent(trans, root, &path,
+	ret = btrfs_lookup_file_extent(trans, root, path,
 				       inode->i_ino,
 				       iblock << inode->i_blkbits, 0);
 	if (ret < 0) {
-		btrfs_release_path(root, &path);
 		err = ret;
 		goto out;
 	}
 
 	if (ret != 0) {
-		if (path.slots[0] == 0) {
-			btrfs_release_path(root, &path);
+		if (path->slots[0] == 0) {
+			btrfs_release_path(root, path);
 			goto allocate;
 		}
-		path.slots[0]--;
+		path->slots[0]--;
 	}
 
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]), path.slots[0],
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
-	leaf = btrfs_buffer_leaf(path.nodes[0]);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	blocknr = btrfs_file_extent_disk_blocknr(item);
 	blocknr += btrfs_file_extent_offset(item);
 
@@ -838,25 +864,23 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (ret == 0) {
 		err = 0;
 		map_bh(result, inode->i_sb, blocknr);
-		btrfs_release_path(root, &path);
 		goto out;
 	}
 
 	/* are we inside the extent that was found? */
-	found_key = &leaf->items[path.slots[0]].key;
+	found_key = &leaf->items[path->slots[0]].key;
 	if (btrfs_disk_key_objectid(found_key) != objectid ||
 	    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY) {
 		extent_end = 0;
 		extent_start = 0;
-		btrfs_release_path(root, &path);
+		btrfs_release_path(root, path);
 		goto allocate;
 	}
 
-	extent_start = btrfs_disk_key_offset(&leaf->items[path.slots[0]].key);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
 	extent_start = extent_start >> inode->i_blkbits;
 	extent_start += btrfs_file_extent_offset(item);
 	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
-	btrfs_release_path(root, &path);
 	if (iblock >= extent_start && iblock < extent_end) {
 		err = 0;
 		map_bh(result, inode->i_sb, blocknr + iblock - extent_start);
@@ -880,6 +904,8 @@ allocate:
 	map_bh(result, inode->i_sb, blocknr);
 
 out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	if (trans)
 		btrfs_end_transaction(trans, root);
 	return err;
-- 
cgit v1.2.3


From 7cfcc17e7f2334a0503331264c1340de4e564c85 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 2 Apr 2007 14:53:59 -0400
Subject: Btrfs: corruptions fixed

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |  2 --
 fs/btrfs/disk-io.c     | 21 +++------------------
 fs/btrfs/inode-map.c   | 43 +++++++++++++++++++++++++------------------
 fs/btrfs/super.c       |  9 +++++----
 fs/btrfs/transaction.c |  2 +-
 5 files changed, 34 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index a50a1c44eab..783f54c2a16 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -62,7 +62,6 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 	if (!bits)
 		return 0;
 	clear_bit(bit_slot, bits + 1);
-#if 0
 	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
 		if (bits[i]) {
 			empty = 0;
@@ -74,7 +73,6 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 		BUG_ON(!bits);
 		kmem_cache_free(btrfs_bit_radix_cachep, bits);
 	}
-#endif
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2dbd55084a4..a227e94b9e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,8 +23,6 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 
 struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	return sb_find_get_block(root->fs_info->sb, blocknr);
-#if 0
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	int blockbits = root->fs_info->sb->s_blocksize_bits;
 	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
@@ -58,14 +56,11 @@ out_unlock:
 	}
 	page_cache_release(page);
 	return ret;
-#endif
 }
 
 struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr)
 {
-	return sb_getblk(root->fs_info->sb, blocknr);
-#if 0
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	int blockbits = root->fs_info->sb->s_blocksize_bits;
 	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
@@ -103,7 +98,6 @@ out_unlock:
 		touch_buffer(ret);
 	page_cache_release(page);
 	return ret;
-#endif
 }
 
 static sector_t max_block(struct block_device *bdev)
@@ -186,7 +180,6 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
-#if 0
 	struct buffer_head *bh;
 	struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
 	struct buffer_head *head;
@@ -201,7 +194,6 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 			csum_tree_block(root, bh, 0);
 		bh = bh->b_this_page;
 	} while (bh != head);
-#endif
 	return block_write_full_page(page, btree_get_block, wbc);
 }
 
@@ -218,8 +210,6 @@ static struct address_space_operations btree_aops = {
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	return sb_bread(root->fs_info->sb, blocknr);
-#if 0
 	struct buffer_head *bh = NULL;
 
 	bh = btrfs_find_create_tree_block(root, blocknr);
@@ -243,8 +233,6 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 fail:
 	brelse(bh);
 	return NULL;
-
-#endif
 }
 
 int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -323,8 +311,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->last_inode_alloc = 0;
 	fs_info->last_inode_alloc_dirid = 0;
 	fs_info->sb = sb;
-	fs_info->btree_inode = NULL;
-#if 0
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
@@ -332,7 +318,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	insert_inode_hash(fs_info->btree_inode);
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-#endif
 	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
 	spin_lock_init(&fs_info->hash_lock);
 	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
@@ -435,8 +420,8 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_block_release(root, root->commit_root);
 	btrfs_block_release(root, root->fs_info->sb_buffer);
 	crypto_free_hash(root->fs_info->hash_tfm);
-	// truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
-	// iput(root->fs_info->btree_inode);
+	truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
+	iput(root->fs_info->btree_inode);
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
 	kfree(root->fs_info->tree_root);
@@ -447,6 +432,6 @@ int close_ctree(struct btrfs_root *root)
 
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
-	// brelse(buf);
+	brelse(buf);
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index fa4adb81b14..b3de823eb62 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -10,7 +10,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid)
 {
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
 	u64 hole_size = 0;
@@ -31,20 +31,22 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
 	search_key.offset = 0;
 
-	btrfs_init_path(&path);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
 	start_found = 0;
-	ret = btrfs_search_slot(trans, root, &search_key, &path, 0, 0);
+	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto error;
 
-	if (path.slots[0] > 0)
-		path.slots[0]--;
+	if (path->slots[0] > 0)
+		path->slots[0]--;
 
 	while (1) {
-		l = btrfs_buffer_leaf(path.nodes[0]);
-		slot = path.slots[0];
+		l = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
-			ret = btrfs_next_leaf(root, &path);
+			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
 			if (ret < 0)
@@ -72,17 +74,19 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		}
 		start_found = 1;
 		last_ino = key.objectid + 1;
-		path.slots[0]++;
+		path->slots[0]++;
 	}
 	// FIXME -ENOSPC
 found:
 	root->fs_info->last_inode_alloc = *objectid;
 	root->fs_info->last_inode_alloc_dirid = dirid;
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
 	return 0;
 error:
-	btrfs_release_path(root, &path);
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -91,7 +95,7 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 			   u64 objectid, struct btrfs_key *location)
 {
 	int ret = 0;
-	struct btrfs_path path;
+	struct btrfs_path *path;
 	struct btrfs_inode_map_item *inode_item;
 	struct btrfs_key key;
 	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
@@ -100,18 +104,21 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
 	key.offset = 0;
-	btrfs_init_path(&path);
-	ret = btrfs_insert_empty_item(trans, inode_root, &path, &key,
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_insert_empty_item(trans, inode_root, path, &key,
 				      sizeof(struct btrfs_inode_map_item));
 	if (ret)
 		goto out;
 
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path.nodes[0]),
-				    path.slots[0], struct btrfs_inode_map_item);
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				    path->slots[0], struct btrfs_inode_map_item);
 	btrfs_cpu_key_to_disk(&inode_item->key, location);
-	btrfs_mark_buffer_dirty(path.nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
-	btrfs_release_path(inode_root, &path);
+	btrfs_release_path(inode_root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c260fcad17b..9fd87f5edbc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -49,6 +49,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	ret = btrfs_lookup_inode(NULL, root, path, inode->i_ino, 0);
 	if (ret) {
 		btrfs_release_path(root, path);
+		btrfs_free_path(path);
 		mutex_unlock(&root->fs_info->fs_mutex);
 		make_bad_inode(inode);
 		return;
@@ -215,7 +216,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto out;
 	}
 	btrfs_release_path(root, path);
-	btrfs_free_path(path);
 
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
@@ -223,6 +223,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		inode->i_size = 0;
 	}
 out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
 	if (ret && !err)
@@ -800,11 +802,10 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	sb->s_dirt = 0;
 	if (!wait) {
-		// filemap_flush(root->fs_info->btree_inode->i_mapping);
-		filemap_flush(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
-	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 72b52e1e0b1..2b0f98c9212 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -88,7 +88,7 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	filemap_write_and_wait(root->fs_info->sb->s_bdev->bd_inode->i_mapping);
+	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2da566edd8ce32ae2952b863ee634bcc2e7d63c1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 2 Apr 2007 15:43:21 -0400
Subject: Btrfs: csum_verify_file_block locking fix

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 1 -
 fs/btrfs/extent-tree.c | 2 --
 fs/btrfs/file-item.c   | 2 ++
 fs/btrfs/super.c       | 1 -
 4 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a227e94b9e8..39b117a57a5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -160,7 +160,6 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 	int ret;
 	struct btrfs_node *node;
 
-	return 0;
 	ret = btrfs_csum_data(root, bh->b_data + BTRFS_CSUM_SIZE,
 			      bh->b_size - BTRFS_CSUM_SIZE, result);
 	if (ret)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d785b721b46..cd40a08dd8a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -249,8 +249,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, path);
-		if (extent_root->fs_info->last_insert.objectid > blocknr)
-			extent_root->fs_info->last_insert.objectid = blocknr;
 		if (ret)
 			BUG();
 	}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index e7510ac5559..2c98b620c59 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -110,6 +110,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_search_slot(NULL, root, &file_key, path, 0, 0);
 	if (ret)
 		goto fail;
@@ -123,6 +124,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 fail:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9fd87f5edbc..fbc26eafc58 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1316,7 +1316,6 @@ static void btrfs_destroy_inode(struct inode *inode)
 	WARN_ON(ei->magic != 0xDEADBEEF);
 	WARN_ON(ei->magic2 != 0xDEADBEAF);
 	WARN_ON(!list_empty(&inode->i_dentry));
-	WARN_ON(inode->i_ino == 1);
 	WARN_ON(inode->i_data.nrpages);
 
 	ei->magic = 0;
-- 
cgit v1.2.3


From 9d64272cd3be2aae71f89f7ababbee90a7587b33 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 3 Apr 2007 11:43:19 -0400
Subject: Btrfs: don't wait on uptodate buffers on read

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 39b117a57a5..055cd45e802 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -214,6 +214,8 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 	bh = btrfs_find_create_tree_block(root, blocknr);
 	if (!bh)
 		return bh;
+	if (buffer_uptodate(bh))
+		goto uptodate;
 	lock_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		get_bh(bh);
@@ -226,6 +228,7 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 	} else {
 		unlock_buffer(bh);
 	}
+uptodate:
 	if (check_tree_block(root, bh))
 		BUG();
 	return bh;
-- 
cgit v1.2.3


From df24a2b9c9bcef3348e4b1a8f206cd484a248d36 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Apr 2007 09:36:31 -0400
Subject: Btrfs: early inline file data code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c |  18 +++--
 fs/btrfs/super.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 248 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 453ce835209..9ef65e2ecf5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,19 +16,24 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
-struct btrfs_path *btrfs_alloc_path(void)
+inline void btrfs_init_path(struct btrfs_path *p)
 {
-	return kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+	memset(p, 0, sizeof(*p));
 }
 
-void btrfs_free_path(struct btrfs_path *p)
+struct btrfs_path *btrfs_alloc_path(void)
 {
-	kmem_cache_free(btrfs_path_cachep, p);
+	struct btrfs_path *path;
+	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
+	if (path)
+		btrfs_init_path(path);
+	return path;
 }
 
-inline void btrfs_init_path(struct btrfs_path *p)
+void btrfs_free_path(struct btrfs_path *p)
 {
-	memset(p, 0, sizeof(*p));
+	btrfs_release_path(NULL, p);
+	kmem_cache_free(btrfs_path_cachep, p);
 }
 
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
@@ -77,6 +82,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
 	}
 	btrfs_block_release(root, buf);
+	mark_buffer_dirty(cow);
 	*cow_ret = cow;
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fbc26eafc58..68127b8462a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -816,6 +816,73 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
+static int btrfs_get_block_inline(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	int num_bytes = result->b_size;
+	int item_size;
+	int ret;
+	u64 pos;
+	char *ptr;
+	int copy_size;
+	int err = 0;
+	char *safe_ptr;
+	char *data_ptr;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	WARN_ON(create);
+	if (create) {
+		return 0;
+	}
+	pos = iblock << inode->i_blkbits;
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INLINE_DATA_KEY);
+	ptr = kmap(result->b_page);
+	safe_ptr = ptr;
+	ptr += (pos & (PAGE_CACHE_SIZE -1));
+again:
+	key.offset = pos;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret) {
+		if (ret < 0)
+			err = ret;
+		else
+			err = 0;
+		goto out;
+	}
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	item_size = btrfs_item_size(leaf->items + path->slots[0]);
+	copy_size = min(num_bytes, item_size);
+	data_ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+	WARN_ON(safe_ptr + PAGE_CACHE_SIZE < ptr + copy_size);
+	memcpy(ptr, data_ptr, copy_size);
+	pos += copy_size;
+	num_bytes -= copy_size;
+	WARN_ON(num_bytes < 0);
+	ptr += copy_size;
+	btrfs_release_path(root, path);
+	if (num_bytes != 0) {
+		if (pos >= i_size_read(inode))
+			memset(ptr, 0, num_bytes);
+		else
+			goto again;
+	}
+	set_buffer_uptodate(result);
+	map_bh(result, inode->i_sb, 0);
+	err = 0;
+out:
+	btrfs_free_path(path);
+	kunmap(result->b_page);
+	return err;
+}
+
 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -918,7 +985,8 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	int err;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	mutex_lock(&root->fs_info->fs_mutex);
-	err = btrfs_get_block_lock(inode, iblock, result, create);
+	// err = btrfs_get_block_lock(inode, iblock, result, create);
+	err = btrfs_get_block_inline(inode, iblock, result, create);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
@@ -1177,6 +1245,170 @@ out:
 	return num_written ? num_written : err;
 }
 
+static ssize_t inline_one_page(struct btrfs_root *root, struct inode *inode,
+			   struct page *page, loff_t pos,
+			   size_t offset, size_t write_bytes)
+{
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	struct btrfs_key found_key;
+	int ret;
+	size_t copy_size = 0;
+	char *dst = NULL;
+	int err = 0;
+	size_t num_written = 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INLINE_DATA_KEY);
+
+again:
+	key.offset = pos;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret == 0) {
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		copy_size = btrfs_item_size(leaf->items + path->slots[0]);
+		dst = btrfs_item_ptr(leaf, path->slots[0], char);
+		copy_size = min(write_bytes, copy_size);
+		goto copyit;
+	} else {
+		int slot = path->slots[0];
+		if (slot > 0) {
+			slot--;
+		}
+		// FIXME find max key
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[slot].key);
+		if (found_key.objectid != inode->i_ino)
+			goto insert;
+		if (btrfs_key_type(&found_key) != BTRFS_INLINE_DATA_KEY)
+			goto insert;
+		copy_size = btrfs_item_size(leaf->items + slot);
+		if (found_key.offset + copy_size <= pos)
+			goto insert;
+		dst = btrfs_item_ptr(leaf, path->slots[0], char);
+		dst += pos - found_key.offset;
+		copy_size = copy_size - (pos - found_key.offset);
+		BUG_ON(copy_size < 0);
+		copy_size = min(write_bytes, copy_size);
+		WARN_ON(copy_size == 0);
+		goto copyit;
+	}
+insert:
+	btrfs_release_path(root, path);
+	copy_size = min(write_bytes, (size_t)512);
+	ret = btrfs_insert_empty_item(trans, root, path, &key, copy_size);
+	BUG_ON(ret);
+	dst = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			     path->slots[0], char);
+copyit:
+	WARN_ON(copy_size == 0);
+	WARN_ON(dst + copy_size >
+		btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+						 path->slots[0], char) +
+		btrfs_item_size(btrfs_buffer_leaf(path->nodes[0])->items +
+						  path->slots[0]));
+	btrfs_memcpy(root, path->nodes[0]->b_data, dst,
+		     page_address(page) + offset, copy_size);
+	mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(root, path);
+	pos += copy_size;
+	offset += copy_size;
+	num_written += copy_size;
+	write_bytes -= copy_size;
+	if (write_bytes)
+		goto again;
+out:
+	btrfs_free_path(path);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return num_written ? num_written : err;
+}
+
+static ssize_t btrfs_file_inline_write(struct file *file,
+				       const char __user *buf,
+				       size_t count, loff_t *ppos)
+{
+	loff_t pos;
+	size_t num_written = 0;
+	int err = 0;
+	int ret = 0;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	unsigned long page_index;
+
+	if (file->f_flags & O_DIRECT)
+		return -EINVAL;
+	pos = *ppos;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	current->backing_dev_info = inode->i_mapping->backing_dev_info;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+	if (count == 0)
+		goto out;
+	err = remove_suid(file->f_path.dentry);
+	if (err)
+		goto out;
+	file_update_time(file);
+	mutex_lock(&inode->i_mutex);
+	while(count > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
+		struct page *page;
+
+		page_index = pos >> PAGE_CACHE_SHIFT;
+		page = grab_cache_page(inode->i_mapping, page_index);
+		if (!PageUptodate(page)) {
+			ret = mpage_readpage(page, btrfs_get_block);
+			BUG_ON(ret);
+			lock_page(page);
+		}
+		ret = btrfs_copy_from_user(pos, 1,
+					   write_bytes, &page, buf);
+		BUG_ON(ret);
+		write_bytes = inline_one_page(root, inode, page, pos,
+				      offset, write_bytes);
+		SetPageUptodate(page);
+		if (write_bytes > 0 && pos + write_bytes > inode->i_size) {
+			i_size_write(inode, pos + write_bytes);
+			mark_inode_dirty(inode);
+		}
+		page_cache_release(page);
+		unlock_page(page);
+		if (write_bytes < 0)
+			goto out_unlock;
+		buf += write_bytes;
+		count -= write_bytes;
+		pos += write_bytes;
+		num_written += write_bytes;
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		cond_resched();
+	}
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+out:
+	*ppos = pos;
+	current->backing_dev_info = NULL;
+	return num_written ? num_written : err;
+}
+
 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 			unsigned long offset, unsigned long size)
 {
@@ -1420,7 +1652,7 @@ static struct file_operations btrfs_dir_file_operations = {
 
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
-	.readpages	= btrfs_readpages,
+	// .readpages	= btrfs_readpages,
 	.writepage	= btrfs_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
@@ -1434,8 +1666,8 @@ static struct inode_operations btrfs_file_inode_operations = {
 static struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.aio_read       = btrfs_file_aio_read,
-	.write		= btrfs_file_write,
+	.aio_read       = generic_file_aio_read,
+	.write		= btrfs_file_inline_write,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 };
-- 
cgit v1.2.3


From d4dbff953e1f6f4079126c0404cc24f2ef14e925 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Apr 2007 14:08:15 -0400
Subject: Btrfs: support for items bigger than 1/2 the blocksize

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 132 ++++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/super.c |  16 +++++--
 2 files changed, 110 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9ef65e2ecf5..864ee423b30 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -6,7 +6,8 @@
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, int data_size);
+		      *root, struct btrfs_key *ins_key,
+		      struct btrfs_path *path, int data_size);
 static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct buffer_head *dst, struct buffer_head
 			  *src);
@@ -101,19 +102,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset(leaf->items + nr - 1);
 }
 
-/*
- * The space between the end of the leaf items and
- * the start of the leaf data.  IOW, how much room
- * the leaf has left for both items and data
- */
-int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf)
-{
-	int data_end = leaf_data_end(root, leaf);
-	int nritems = btrfs_header_nritems(&leaf->header);
-	char *items_end = (char *)(leaf->items + nritems + 1);
-	return (char *)(btrfs_leaf_data(leaf) + data_end) - (char *)items_end;
-}
-
 /*
  * compare two keys in a memcmp fashion
  */
@@ -510,8 +498,8 @@ again:
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_header_nritems(&c->header) ==
-			    BTRFS_NODEPTRS_PER_BLOCK(root)) {
+			if (ins_len > 0 && btrfs_header_nritems(&c->header) >=
+			    BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
@@ -537,7 +525,8 @@ again:
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, l) <
 			    sizeof(struct btrfs_item) + ins_len) {
-				int sret = split_leaf(trans, root, p, ins_len);
+				int sret = split_leaf(trans, root, key,
+						      p, ins_len);
 				BUG_ON(sret > 0);
 				if (sret)
 					return sret;
@@ -825,16 +814,29 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
 {
 	int data_len;
-	int end = start + nr - 1;
+	int nritems = btrfs_header_nritems(&l->header);
+	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
 	data_len = btrfs_item_end(l->items + start);
 	data_len = data_len - btrfs_item_offset(l->items + end);
 	data_len += sizeof(struct btrfs_item) * nr;
+	WARN_ON(data_len < 0);
 	return data_len;
 }
 
+/*
+ * The space between the end of the leaf items and
+ * the start of the leaf data.  IOW, how much room
+ * the leaf has left for both items and data
+ */
+int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf)
+{
+	int nritems = btrfs_header_nritems(&leaf->header);
+	return BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+}
+
 /*
  * push some data in the path leaf to the right, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
@@ -1084,7 +1086,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
  * returns 0 if all went well and < 0 on failure.
  */
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, int data_size)
+		      *root, struct btrfs_key *ins_key,
+		      struct btrfs_path *path, int data_size)
 {
 	struct buffer_head *l_buf;
 	struct btrfs_leaf *l;
@@ -1097,8 +1100,10 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	int data_copy_size;
 	int rt_data_off;
 	int i;
-	int ret;
+	int ret = 0;
 	int wret;
+	int double_split = 0;
+	struct btrfs_disk_key disk_key;
 
 	/* first try to make some room by pushing left and right */
 	wret = push_leaf_left(trans, root, path, data_size);
@@ -1127,26 +1132,58 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	mid = (nritems + 1)/ 2;
 	right_buffer = btrfs_alloc_free_block(trans, root);
 	BUG_ON(!right_buffer);
-	BUG_ON(mid == nritems);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
-	if (mid <= slot) {
-		/* FIXME, just alloc a new leaf here */
-		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
-			BTRFS_LEAF_DATA_SIZE(root))
-			BUG();
-	} else {
-		/* FIXME, just alloc a new leaf here */
-		if (leaf_space_used(l, 0, mid + 1) + space_needed >
-			BTRFS_LEAF_DATA_SIZE(root))
-			BUG();
-	}
-	btrfs_set_header_nritems(&right->header, nritems - mid);
 	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
 	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	if (mid <= slot) {
+		if (nritems == 1 ||
+		    leaf_space_used(l, mid, nritems - mid) + space_needed >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (slot >= nritems) {
+				btrfs_cpu_key_to_disk(&disk_key, ins_key);
+				btrfs_set_header_nritems(&right->header, 0);
+				wret = insert_ptr(trans, root, path,
+						  &disk_key,
+						  right_buffer->b_blocknr,
+						  path->slots[1] + 1, 1);
+				if (wret)
+					ret = wret;
+				btrfs_block_release(root, path->nodes[0]);
+				path->nodes[0] = right_buffer;
+				path->slots[0] = 0;
+				path->slots[1] += 1;
+				return ret;
+			}
+			mid = slot;
+			double_split = 1;
+		}
+	} else {
+		if (leaf_space_used(l, 0, mid + 1) + space_needed >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (slot == 0) {
+				btrfs_cpu_key_to_disk(&disk_key, ins_key);
+				btrfs_set_header_nritems(&right->header, 0);
+				wret = insert_ptr(trans, root, path,
+						  &disk_key,
+						  right_buffer->b_blocknr,
+						  path->slots[1] - 1, 1);
+				if (wret)
+					ret = wret;
+				btrfs_block_release(root, path->nodes[0]);
+				path->nodes[0] = right_buffer;
+				path->slots[0] = 0;
+				path->slots[1] -= 1;
+				return ret;
+			}
+			mid = slot;
+			double_split = 1;
+		}
+	}
+	btrfs_set_header_nritems(&right->header, nritems - mid);
 	data_copy_size = btrfs_item_end(l->items + mid) -
 			 leaf_data_end(root, l);
 	btrfs_memcpy(root, right, right->items, l->items + mid,
@@ -1180,6 +1217,31 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else
 		btrfs_block_release(root, right_buffer);
 	BUG_ON(path->slots[0] < 0);
+
+	if (!double_split)
+		return ret;
+	right_buffer = btrfs_alloc_free_block(trans, root);
+	BUG_ON(!right_buffer);
+	right = btrfs_buffer_leaf(right_buffer);
+	memset(&right->header, 0, sizeof(right->header));
+	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
+	btrfs_set_header_generation(&right->header, trans->transid);
+	btrfs_set_header_level(&right->header, 0);
+	btrfs_set_header_parentid(&right->header,
+	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	btrfs_cpu_key_to_disk(&disk_key, ins_key);
+	btrfs_set_header_nritems(&right->header, 0);
+	wret = insert_ptr(trans, root, path,
+			  &disk_key,
+			  right_buffer->b_blocknr,
+			  path->slots[1], 1);
+	if (wret)
+		ret = wret;
+	btrfs_block_release(root, path->nodes[0]);
+	path->nodes[0] = right_buffer;
+	path->slots[0] = 0;
+	check_node(root, path, 1);
+	check_leaf(root, path, 0);
 	return ret;
 }
 
@@ -1220,9 +1282,9 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	data_end = leaf_data_end(root, leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) <
-	    sizeof(struct btrfs_item) + data_size)
+	    sizeof(struct btrfs_item) + data_size) {
 		BUG();
-
+	}
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 	if (slot != nritems) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 68127b8462a..0afb6cece82 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -136,8 +136,11 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 err:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	if (ret == 0)
+	if (ret == 0) {
 		inode_dec_link_count(dentry->d_inode);
+		dir->i_size -= name_len;
+		mark_inode_dirty(dir);
+	}
 	return ret;
 }
 
@@ -290,6 +293,10 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = 0;
+	/*
+	 * use BTRFS_CSUM_ITEM_KEY because it is larger than inline keys
+	 * or extent data
+	 */
 	btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
 	while(1) {
 		btrfs_init_path(path);
@@ -306,6 +313,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
 		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
+		    btrfs_disk_key_type(found_key) != BTRFS_INLINE_DATA_KEY &&
 		    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
@@ -1036,7 +1044,7 @@ static void btrfs_truncate(struct inode *inode)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 
-	nobh_truncate_page(inode->i_mapping, inode->i_size);
+	// nobh_truncate_page(inode->i_mapping, inode->i_size);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -1309,7 +1317,9 @@ again:
 	}
 insert:
 	btrfs_release_path(root, path);
-	copy_size = min(write_bytes, (size_t)512);
+	copy_size = min(write_bytes,
+			(size_t)BTRFS_LEAF_DATA_SIZE(root) -
+			sizeof(struct btrfs_item) * 4);
 	ret = btrfs_insert_empty_item(trans, root, path, &key, copy_size);
 	BUG_ON(ret);
 	dst = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-- 
cgit v1.2.3


From e8f05c4508aed906146919f2efa89242a7f90a8b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Apr 2007 14:30:09 -0400
Subject: Btrfs: disable inline data code for now

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0afb6cece82..822243650ae 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -824,6 +824,7 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
+#if 0
 static int btrfs_get_block_inline(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -890,6 +891,7 @@ out:
 	kunmap(result->b_page);
 	return err;
 }
+#endif
 
 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
@@ -993,8 +995,8 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	int err;
 	struct btrfs_root *root = btrfs_sb(inode->i_sb);
 	mutex_lock(&root->fs_info->fs_mutex);
-	// err = btrfs_get_block_lock(inode, iblock, result, create);
-	err = btrfs_get_block_inline(inode, iblock, result, create);
+	err = btrfs_get_block_lock(inode, iblock, result, create);
+	// err = btrfs_get_block_inline(inode, iblock, result, create);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
@@ -1002,13 +1004,11 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	WARN_ON(1);
 	return nobh_prepare_write(page, from, to, btrfs_get_block);
 }
 static int btrfs_commit_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	WARN_ON(1);
 	return nobh_commit_write(file, page, from, to);
 }
 
@@ -1044,7 +1044,7 @@ static void btrfs_truncate(struct inode *inode)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 
-	// nobh_truncate_page(inode->i_mapping, inode->i_size);
+	nobh_truncate_page(inode->i_mapping, inode->i_size);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -1253,6 +1253,7 @@ out:
 	return num_written ? num_written : err;
 }
 
+#if 0
 static ssize_t inline_one_page(struct btrfs_root *root, struct inode *inode,
 			   struct page *page, loff_t pos,
 			   size_t offset, size_t write_bytes)
@@ -1418,6 +1419,7 @@ out:
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
 }
+#endif
 
 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 			unsigned long offset, unsigned long size)
@@ -1662,7 +1664,7 @@ static struct file_operations btrfs_dir_file_operations = {
 
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
-	// .readpages	= btrfs_readpages,
+	.readpages	= btrfs_readpages,
 	.writepage	= btrfs_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
@@ -1676,8 +1678,8 @@ static struct inode_operations btrfs_file_inode_operations = {
 static struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.aio_read       = generic_file_aio_read,
-	.write		= btrfs_file_inline_write,
+	.aio_read       = btrfs_file_aio_read,
+	.write		= btrfs_file_write,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 };
-- 
cgit v1.2.3


From b1a4d96509a78ad234d94e0b914b289c60d2969d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Apr 2007 15:27:52 -0400
Subject: Btrfs: tweak the inode-map and free extent search starts on cold
 mount

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  2 --
 fs/btrfs/ctree.h       |  1 -
 fs/btrfs/disk-io.c     |  1 -
 fs/btrfs/extent-tree.c | 21 +++++++++++++++++++--
 fs/btrfs/inode-map.c   | 27 ++++++++++++++++++++-------
 5 files changed, 39 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index e9371385ccc..db986c46a55 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,9 +1,7 @@
 * cleanup, add more error checking, get rid of BUG_ONs
 * Fix ENOSPC handling
-* Do checksumming
 * Add block mapping tree (simple dm layer)
 * Make allocator smarter
-* make level a field in header
 * add a block group to struct inode
 * Make directory hashing work on 32 bit
 * Make sure nobh stuff is working properly for cows
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d8e03bd797f..41cc013ef08 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -238,7 +238,6 @@ struct btrfs_fs_info {
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	u64 last_inode_alloc;
-	u64 last_inode_alloc_dirid;
 	u64 generation;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 055cd45e802..de9ee3aa0aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -311,7 +311,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->extent_root = extent_root;
 	fs_info->inode_root = inode_root;
 	fs_info->last_inode_alloc = 0;
-	fs_info->last_inode_alloc_dirid = 0;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cd40a08dd8a..688aa861a92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -339,13 +339,30 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int total_needed = num_blocks;
 	int level;
 
+	path = btrfs_alloc_path();
+	ins->flags = 0;
+	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
 	total_needed += (level + 1) * 3;
+	if (root->fs_info->last_insert.objectid == 0 && search_end == (u64)-1) {
+		struct btrfs_disk_key *last_key;
+		btrfs_init_path(path);
+		ins->objectid = (u64)-1;
+		ins->offset = (u64)-1;
+		ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0);
+		if (path->slots[0] > 0)
+			path->slots[0]--;
+		l = btrfs_buffer_leaf(path->nodes[0]);
+		last_key = &l->items[path->slots[0]].key;
+		search_start = btrfs_disk_key_objectid(last_key);
+	}
 	if (root->fs_info->last_insert.objectid > search_start)
 		search_start = root->fs_info->last_insert.objectid;
 
-	ins->flags = 0;
-	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 	path = btrfs_alloc_path();
 
 check_failed:
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b3de823eb62..329edb42897 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -22,17 +22,31 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
-	if (fs_root->fs_info->last_inode_alloc_dirid == dirid)
-		search_start = fs_root->fs_info->last_inode_alloc;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	search_key.flags = 0;
+	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
 
+	search_start = fs_root->fs_info->last_inode_alloc;
+	if (search_start == 0) {
+		struct btrfs_disk_key *last_key;
+		btrfs_init_path(path);
+		search_key.objectid = (u64)-1;
+		search_key.offset = (u64)-1;
+		ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0);
+		if (path->slots[0] > 0)
+			path->slots[0]--;
+		l = btrfs_buffer_leaf(path->nodes[0]);
+		last_key = &l->items[path->slots[0]].key;
+		search_start = btrfs_disk_key_objectid(last_key);
+	}
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
-	search_key.flags = 0;
-	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
 	search_key.offset = 0;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
 	btrfs_init_path(path);
 	start_found = 0;
 	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
@@ -79,7 +93,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	// FIXME -ENOSPC
 found:
 	root->fs_info->last_inode_alloc = *objectid;
-	root->fs_info->last_inode_alloc_dirid = dirid;
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
-- 
cgit v1.2.3


From bae45de03c4d54a9893dedf8a015beb2608b896a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 4 Apr 2007 21:22:22 -0400
Subject: Btrfs: add dir inode index

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h    | 15 ++++++++-------
 fs/btrfs/dir-item.c | 19 +++++++++++++++++++
 fs/btrfs/super.c    |  4 ++--
 fs/btrfs/sysfs.c    |  4 ++++
 4 files changed, 33 insertions(+), 9 deletions(-)
 create mode 100644 fs/btrfs/sysfs.c

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 41cc013ef08..87c56222a62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -282,41 +282,42 @@ struct btrfs_root {
  * for every name in a directory.
  */
 #define BTRFS_DIR_ITEM_KEY	2
+#define BTRFS_DIR_INDEX_KEY	3
 /*
  * inline data is file data that fits in the btree.
  */
-#define BTRFS_INLINE_DATA_KEY	3
+#define BTRFS_INLINE_DATA_KEY	4
 /*
  * extent data is for data that can't fit in the btree.  It points to
  * a (hopefully) huge chunk of disk
  */
-#define BTRFS_EXTENT_DATA_KEY	4
+#define BTRFS_EXTENT_DATA_KEY	5
 /*
  * csum items have the checksums for data in the extents
  */
-#define BTRFS_CSUM_ITEM_KEY	5
+#define BTRFS_CSUM_ITEM_KEY	6
 
 /*
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY	6
+#define BTRFS_ROOT_ITEM_KEY	7
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY	7
+#define BTRFS_EXTENT_ITEM_KEY	8
 
 /*
  * the inode map records which inode numbers are in use and where
  * they actually live on disk
  */
-#define BTRFS_INODE_MAP_ITEM_KEY 8
+#define BTRFS_INODE_MAP_ITEM_KEY 9
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	9
+#define BTRFS_STRING_ITEM_KEY	10
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index baceb1da609..2a87e23ac9c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -37,6 +37,25 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	name_ptr = (char *)(dir_item + 1);
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(root, path);
+
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = objectid;
+	ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
+	// FIXME clear the dirindex bit
+	if (ret)
+		goto out;
+
+	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
+				  struct btrfs_dir_item);
+	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_set_dir_type(dir_item, type);
+	btrfs_set_dir_flags(dir_item, 0);
+	btrfs_set_dir_name_len(dir_item, name_len);
+	name_ptr = (char *)(dir_item + 1);
+	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 822243650ae..4b042460e87 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -447,7 +447,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
@@ -477,7 +477,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		item = leaf->items + slot;
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
-		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_ITEM_KEY)
+		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_INDEX_KEY)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
new file mode 100644
index 00000000000..ed8b7e20cfb
--- /dev/null
+++ b/fs/btrfs/sysfs.c
@@ -0,0 +1,4 @@
+#include <linux/module.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
-- 
cgit v1.2.3


From 5f26f772e5c4e833ffcb0599f54deda466d2a3e5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Apr 2007 10:38:44 -0400
Subject: Btrfs: more inode indexed directory work

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  4 +++
 fs/btrfs/dir-item.c    | 18 ++++++++++
 fs/btrfs/extent-tree.c |  3 +-
 fs/btrfs/super.c       | 90 ++++++++++++++++++++++++++------------------------
 4 files changed, 70 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 87c56222a62..61d7b4738af 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -921,6 +921,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, u64 dir,
 			  const char *name, int name_len, int mod);
+int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path, u64 dir,
+				u64 objectid, int mod);
 int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 			      const char *name, int name_len);
 /* inode-map.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 2a87e23ac9c..62d0c0916a7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -80,6 +80,24 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+/*
+ * look up the BTRFS_DIR_INDEX_KEY item of directory 'dir' whose key offset
+ * is 'objectid'.
+ *
+ * 'mod' picks the search mode handed to btrfs_search_slot(): a negative
+ * value passes ins_len = -1, and any non-zero value asks for cow.
+ *
+ * Returns whatever btrfs_search_slot() returns.  'path' is left exactly as
+ * that call filled it in.
+ */
+int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path, u64 dir,
+				u64 objectid, int mod)
+{
+	int ret;
+	struct btrfs_key key;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	key.objectid = dir;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = objectid;
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	return ret;
+}
+
 int btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 688aa861a92..7c21f63f1b9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -224,6 +224,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
+
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret) {
 		printk("failed to find %Lu\n", key.objectid);
@@ -363,8 +364,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (root->fs_info->last_insert.objectid > search_start)
 		search_start = root->fs_info->last_insert.objectid;
 
-	path = btrfs_alloc_path();
-
 check_failed:
 	btrfs_init_path(path);
 	ins->objectid = search_start;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4b042460e87..d776b29a167 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,6 +130,13 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 			    struct btrfs_dir_item);
 	objectid = btrfs_dir_objectid(di);
 
+	ret = btrfs_del_item(trans, root, path);
+	BUG_ON(ret);
+
+	btrfs_release_path(root, path);
+	ret = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					  objectid, -1);
+	BUG_ON(ret);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 	dentry->d_inode->i_ctime = dir->i_ctime;
@@ -138,7 +145,7 @@ err:
 	btrfs_free_path(path);
 	if (ret == 0) {
 		inode_dec_link_count(dentry->d_inode);
-		dir->i_size -= name_len;
+		dir->i_size -= name_len * 2;
 		mark_inode_dirty(dir);
 	}
 	return ret;
@@ -168,8 +175,10 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_disk_key *found_key;
+	struct btrfs_key found_key;
+	int found_type;
 	struct btrfs_leaf *leaf;
+	char *goodnames = "..";
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -178,46 +187,42 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
+	key.flags = (u32)-1;
+	while(1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		BUG_ON(ret == 0);
+		if (path->slots[0] == 0) {
+			err = -ENOENT;
+			goto out;
+		}
+		path->slots[0]--;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		found_type = btrfs_key_type(&found_key);
+		if (found_key.objectid != inode->i_ino) {
+			err = -ENOENT;
+			goto out;
+		}
+		if ((found_type != BTRFS_DIR_ITEM_KEY &&
+		     found_type != BTRFS_DIR_INDEX_KEY) ||
+	            (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
+	            !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
+			err = -ENOTEMPTY;
+			goto out;
+		}
+		ret = btrfs_del_item(trans, root, path);
+		BUG_ON(ret);
 
-	BUG_ON(ret == 0);
-	BUG_ON(path->slots[0] == 0);
-	path->slots[0]--;
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	found_key = &leaf->items[path->slots[0]].key;
-	if (btrfs_disk_key_objectid(found_key) != inode->i_ino) {
-		err = -ENOENT;
-		goto out;
-	}
-	if (btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
-	    btrfs_disk_key_offset(found_key) != 2) {
-		err = -ENOTEMPTY;
-		goto out;
-	}
-	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-	key.offset = 1;
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret > 0) {
-		err = -ENOTEMPTY;
-		goto out;
-	}
-	ret = btrfs_del_item(trans, root, path);
-	if (ret) {
-		err = ret;
-		goto out;
+		if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
+			break;
+		btrfs_release_path(root, path);
 	}
+	ret = 0;
 	btrfs_release_path(root, path);
 
 	/* now the directory is empty */
@@ -676,7 +681,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_parent->d_inode->i_ino,
 				    inode->i_ino, 0);
 	if (ret == 0) {
-		dentry->d_parent->d_inode->i_size += dentry->d_name.len;
+		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
 		ret = btrfs_update_inode(trans, btrfs_sb(inode->i_sb),
 					 dentry->d_parent->d_inode);
 	}
@@ -754,7 +759,7 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 				    dir->i_ino, 1);
 	if (ret)
 		goto error;
-	inode->i_size = 3;
+	inode->i_size = 6;
 	ret = btrfs_update_inode(trans, root, inode);
 error:
 	return ret;
@@ -915,7 +920,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (create)
 		trans = btrfs_start_transaction(root, 1);
 
-
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       inode->i_ino,
 				       iblock << inode->i_blkbits, 0);
-- 
cgit v1.2.3


From 7fcde0e3298c00ee18e2ae7f01c67f99133ef7be Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Apr 2007 12:13:21 -0400
Subject: Btrfs: finish off inode indexing in dirs, add overflows

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h    | 33 ++++++++++++++++++++++++
 fs/btrfs/dir-item.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/super.c    | 11 ++++----
 3 files changed, 107 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 61d7b4738af..794f7e4f1c0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -270,6 +270,10 @@ struct btrfs_root {
 #define BTRFS_KEY_TYPE_MAX	256
 #define BTRFS_KEY_TYPE_MASK	(BTRFS_KEY_TYPE_MAX - 1)
 
+#define BTRFS_KEY_OVERFLOW_MAX 128
+#define BTRFS_KEY_OVERFLOW_SHIFT 8
+#define BTRFS_KEY_OVERFLOW_MASK (0x7FULL << BTRFS_KEY_OVERFLOW_SHIFT)
+
 /*
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
@@ -588,6 +592,19 @@ static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk,
 	disk->flags = cpu_to_le32(val);
 }
 
+/*
+ * return the overflow counter stored in the flags of a cpu key
+ */
+static inline u32 btrfs_key_overflow(struct btrfs_key *key)
+{
+	u32 masked = key->flags & BTRFS_KEY_OVERFLOW_MASK;
+
+	masked >>= BTRFS_KEY_OVERFLOW_SHIFT;
+	return masked;
+}
+
+/*
+ * store 'over' in the overflow bits of a cpu key's flags, leaving every
+ * other flag bit untouched.
+ *
+ * BTRFS_KEY_OVERFLOW_MASK is only 7 bits wide, so the largest value that
+ * fits is BTRFS_KEY_OVERFLOW_MAX - 1.  Anything bigger would be shifted
+ * past the mask and land in a neighbouring flag bit, so trap it here.
+ */
+static inline void btrfs_set_key_overflow(struct btrfs_key *key, u32 over)
+{
+	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
+	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
+	key->flags = (key->flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
+}
+
 static inline u32 btrfs_key_type(struct btrfs_key *key)
 {
 	return key->flags & BTRFS_KEY_TYPE_MASK;
@@ -612,6 +629,22 @@ static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type)
 	btrfs_set_disk_key_flags(key, flags);
 }
 
+/*
+ * same as btrfs_key_overflow, but reads the little endian flags of an
+ * on-disk key
+ */
+static inline u32 btrfs_disk_key_overflow(struct btrfs_disk_key *key)
+{
+	u32 masked = le32_to_cpu(key->flags) & BTRFS_KEY_OVERFLOW_MASK;
+
+	masked >>= BTRFS_KEY_OVERFLOW_SHIFT;
+	return masked;
+}
+
+/*
+ * store 'over' in the overflow bits of an on-disk key's flags, leaving
+ * every other flag bit untouched.  The flags are read and written back
+ * through the btrfs_disk_key_flags helpers.
+ *
+ * BTRFS_KEY_OVERFLOW_MASK is only 7 bits wide, so the largest value that
+ * fits is BTRFS_KEY_OVERFLOW_MAX - 1.  Anything bigger would be shifted
+ * past the mask and land in a neighbouring flag bit, so trap it here.
+ *
+ * NOTE: the capital 'K' in the name looks like a typo; it is kept so
+ * existing callers keep building.
+ */
+static inline void btrfs_set_disK_key_overflow(struct btrfs_disk_key *key,
+					       u32 over)
+{
+	u32 flags = btrfs_disk_key_flags(key);
+	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
+	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
+	flags = (flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
+	btrfs_set_disk_key_flags(key, flags);
+}
+
 static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->blocknr);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 62d0c0916a7..b1629a5d73c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -4,6 +4,26 @@
 #include "hash.h"
 #include "transaction.h"
 
+/*
+ * insert an empty item of 'data_size' bytes at 'cpu_key'.  When the key is
+ * already taken (-EEXIST), bump the overflow counter in the key's flags and
+ * try again until a free key is found or the counter runs out.
+ *
+ * 'cpu_key' is updated in place, so on success it holds the key that was
+ * really inserted.  Returns the result of the last
+ * btrfs_insert_empty_item() call; that is still -EEXIST when every
+ * overflow value is in use.
+ */
+int insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root
+			    *root, struct btrfs_path *path, struct btrfs_key
+			    *cpu_key, u32 data_size)
+{
+	int overflow;
+	int ret;
+
+	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
+	overflow = btrfs_key_overflow(cpu_key);
+
+	/*
+	 * the overflow field is 7 bits wide and the lookups in this file
+	 * start their search at BTRFS_KEY_OVERFLOW_MAX - 1, so that is the
+	 * last value we may hand out.  Going one further would spill out of
+	 * BTRFS_KEY_OVERFLOW_MASK.
+	 */
+	while(ret == -EEXIST && overflow < BTRFS_KEY_OVERFLOW_MAX - 1) {
+		overflow++;
+		btrfs_set_key_overflow(cpu_key, overflow);
+		/* drop the path of the failed attempt before searching again */
+		btrfs_release_path(root, path);
+		ret = btrfs_insert_empty_item(trans, root, path, cpu_key,
+					      data_size);
+	}
+	return ret;
+}
+
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir, u64
 			  objectid, u8 type)
@@ -23,7 +43,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
-	ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
+	ret = insert_with_overflow(trans, root, path, &key, data_size);
 	if (ret)
 		goto out;
 
@@ -41,7 +61,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
 	key.offset = objectid;
-	ret = btrfs_insert_empty_item(trans, root, path, &key, data_size);
+	ret = insert_with_overflow(trans, root, path, &key, data_size);
 	// FIXME clear the dirindex bit
 	if (ret)
 		goto out;
@@ -70,14 +90,40 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_leaf *leaf;
 
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	return ret;
+	while(1) {
+		ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+		if (ret < 0)
+			return ret;
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				return 1;
+			path->slots[0]--;
+		}
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		found_key = &leaf->items[path->slots[0]].key;
+
+		if (btrfs_disk_key_objectid(found_key) != dir ||
+		    btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
+		    btrfs_disk_key_offset(found_key) != key.offset)
+			return 1;
+
+		if (btrfs_match_dir_item_name(root, path, name, name_len))
+			return 0;
+
+		if (btrfs_disk_key_overflow(found_key) == 0)
+			return 1;
+		btrfs_release_path(root, path);
+	}
+	return 1;
 }
 
 int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
@@ -89,13 +135,31 @@ int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_leaf *leaf;
 
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
 	key.offset = objectid;
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	return ret;
+	if (ret < 0)
+		return ret;
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			return 1;
+		path->slots[0]--;
+	}
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	found_key = &leaf->items[path->slots[0]].key;
+
+	if (btrfs_disk_key_objectid(found_key) != dir ||
+	    btrfs_disk_key_type(found_key) != BTRFS_DIR_INDEX_KEY)
+		return 1;
+	if (btrfs_disk_key_offset(found_key) == objectid)
+		return 0;
+	return 1;
 }
 
 int btrfs_match_dir_item_name(struct btrfs_root *root,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d776b29a167..4fd2b168b2c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -486,19 +486,18 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
-
+		filp->f_pos = btrfs_disk_key_offset(&item->key);
 		advance = 1;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
 			       btrfs_disk_key_offset(&item->key),
 			       btrfs_dir_objectid(di), d_type);
-		if (over) {
-			filp->f_pos = btrfs_disk_key_offset(&item->key);
-			break;
-		}
-		filp->f_pos = btrfs_disk_key_offset(&item->key) + 1;
+		if (over)
+			goto nopos;
 	}
+	filp->f_pos++;
+nopos:
 	ret = 0;
 err:
 	btrfs_release_path(root, path);
-- 
cgit v1.2.3


From 5be6f7f174146d91039a27ebb2f1b4ac599172b3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Apr 2007 13:35:25 -0400
Subject: Btrfs: dirindex optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  3 +++
 fs/btrfs/dir-item.c  |  5 ++++-
 fs/btrfs/disk-io.c   |  6 +++++-
 fs/btrfs/inode-map.c | 48 +++++++++++++++++++++++++++++++++---------------
 fs/btrfs/super.c     |  5 +++++
 5 files changed, 50 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 794f7e4f1c0..6d40758b893 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -239,6 +239,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root pinned_radix;
 	u64 last_inode_alloc;
 	u64 generation;
+	u64 highest_inode;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct buffer_head *sb_buffer;
@@ -970,6 +971,8 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct btrfs_path *path,
 			   u64 objectid, int mod);
+int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
+
 /* inode-item.c */
 int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, u64 objectid, struct btrfs_inode_item
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index b1629a5d73c..0ee9945fb1b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -92,6 +92,7 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	int cow = mod != 0;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
+	u32 overflow;
 
 	key.objectid = dir;
 	key.flags = 0;
@@ -119,8 +120,10 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (btrfs_match_dir_item_name(root, path, name, name_len))
 			return 0;
 
-		if (btrfs_disk_key_overflow(found_key) == 0)
+		overflow = btrfs_disk_key_overflow(found_key);
+		if (overflow == 0)
 			return 1;
+		btrfs_set_key_overflow(&key, overflow - 1);
 		btrfs_release_path(root, path);
 	}
 	return 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index de9ee3aa0aa..5230554380d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -311,6 +311,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->extent_root = extent_root;
 	fs_info->inode_root = inode_root;
 	fs_info->last_inode_alloc = 0;
+	fs_info->highest_inode = 0;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -360,12 +361,15 @@ printk("failed2\n");
 
 	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
 				  BTRFS_FS_TREE_OBJECTID, root);
-	mutex_unlock(&fs_info->fs_mutex);
 	BUG_ON(ret);
 	root->commit_root = root->node;
 	get_bh(root->node);
 	root->ref_cows = 1;
 	root->fs_info->generation = root->root_key.offset + 1;
+	ret = btrfs_find_highest_inode(root, &root->fs_info->last_inode_alloc);
+	if (ret == 0)
+		fs_info->highest_inode = fs_info->last_inode_alloc;
+	mutex_unlock(&fs_info->fs_mutex);
 	return root;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 329edb42897..f665221409a 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -3,6 +3,37 @@
 #include "disk-io.h"
 #include "transaction.h"
 
+/*
+ * find the highest objectid present in fs_info->inode_root.
+ *
+ * Searches for a key with objectid and offset of (u64)-1 and looks at the
+ * item one slot before the position the search returns.  '*objectid' gets
+ * the objectid of that item, or BTRFS_FIRST_FREE_OBJECTID when the search
+ * lands on slot 0.
+ *
+ * Returns 0 on success, or the negative error from btrfs_search_slot(), in
+ * which case '*objectid' is not written.
+ */
+int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_leaf *l;
+	struct btrfs_root *root = fs_root->fs_info->inode_root;
+	struct btrfs_key search_key;
+	int slot;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+
+	search_key.objectid = (u64)-1;
+	/*
+	 * the whole key is handed to btrfs_search_slot, so flags must not be
+	 * left as stack garbage.  Use the same flags as the inode map search
+	 * in btrfs_find_free_objectid.
+	 */
+	search_key.flags = 0;
+	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
+	search_key.offset = (u64)-1;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	BUG_ON(ret == 0);
+	if (path->slots[0] > 0) {
+		slot = path->slots[0] - 1;
+		l = btrfs_buffer_leaf(path->nodes[0]);
+		*objectid = btrfs_disk_key_objectid(&l->items[slot].key);
+	} else {
+		*objectid = BTRFS_FIRST_FREE_OBJECTID;
+	}
+	ret = 0;
+error:
+	/* release before freeing, like the other users of a path in this file */
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * walks the btree of allocated inodes and find a hole.
  */
@@ -28,21 +59,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
 
 	search_start = fs_root->fs_info->last_inode_alloc;
-	if (search_start == 0) {
-		struct btrfs_disk_key *last_key;
-		btrfs_init_path(path);
-		search_key.objectid = (u64)-1;
-		search_key.offset = (u64)-1;
-		ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
-		if (ret < 0)
-			goto error;
-		BUG_ON(ret == 0);
-		if (path->slots[0] > 0)
-			path->slots[0]--;
-		l = btrfs_buffer_leaf(path->nodes[0]);
-		last_key = &l->items[path->slots[0]].key;
-		search_start = btrfs_disk_key_objectid(last_key);
-	}
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.offset = 0;
@@ -129,6 +145,8 @@ int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
 				    path->slots[0], struct btrfs_inode_map_item);
 	btrfs_cpu_key_to_disk(&inode_item->key, location);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	if (objectid > fs_root->fs_info->highest_inode)
+		fs_root->fs_info->highest_inode = objectid;
 out:
 	btrfs_release_path(inode_root, path);
 	btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4fd2b168b2c..d4ee78046b8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -482,6 +482,11 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		item = leaf->items + slot;
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
+		if (btrfs_disk_key_offset(&item->key) >
+		    root->fs_info->highest_inode) {
+printk("stopping at highest inode %Lu\n", root->fs_info->highest_inode);
+			break;
+		}
 		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_INDEX_KEY)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
-- 
cgit v1.2.3


From 3eb0314dc1053b1ae617dcc8d6d93f776c5baa31 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 5 Apr 2007 14:28:50 -0400
Subject: Btrfs: uuids

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO    |  1 -
 fs/btrfs/ctree.c | 12 ++++++++++++
 fs/btrfs/super.c |  4 +---
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index db986c46a55..ea4c38c790b 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -3,7 +3,6 @@
 * Add block mapping tree (simple dm layer)
 * Make allocator smarter
 * add a block group to struct inode
-* Make directory hashing work on 32 bit
 * Make sure nobh stuff is working properly for cows
 * Do actual block accounting
 * Check compat and incompat flags on the inode
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 864ee423b30..39b551564fc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -200,6 +200,10 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 static int check_block(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
+	struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]);
+	if (memcmp(node->header.fsid, root->fs_info->disk_super->fsid,
+		   sizeof(node->header.fsid)))
+		BUG();
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -687,6 +691,8 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_parentid(&c->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	lower = btrfs_buffer_node(path->nodes[level-1]);
+	memcpy(c->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(c->header.fsid));
 	if (btrfs_is_leaf(lower))
 		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
 	else
@@ -780,6 +786,8 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_generation(&split->header, trans->transid);
 	btrfs_set_header_parentid(&split->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	memcpy(split->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(split->header.fsid));
 	mid = (c_nritems + 1) / 2;
 	btrfs_memcpy(root, split, split->ptrs, c->ptrs + mid,
 		     (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
@@ -1139,6 +1147,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(right->header.fsid));
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -1229,6 +1239,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&right->header, 0);
 	btrfs_set_header_parentid(&right->header,
 	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
+	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(right->header.fsid));
 	btrfs_cpu_key_to_disk(&disk_key, ins_key);
 	btrfs_set_header_nritems(&right->header, 0);
 	wret = insert_ptr(trans, root, path,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d4ee78046b8..32224ffdcec 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -483,10 +483,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
 		if (btrfs_disk_key_offset(&item->key) >
-		    root->fs_info->highest_inode) {
-printk("stopping at highest inode %Lu\n", root->fs_info->highest_inode);
+		    root->fs_info->highest_inode)
 			break;
-		}
 		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_INDEX_KEY)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
-- 
cgit v1.2.3


From d6e4a428eb8f92bbb3537ccabadfb1195efb432b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 6 Apr 2007 15:37:36 -0400
Subject: Btrfs: start of support for many FS volumes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   7 ++
 fs/btrfs/btrfs_inode.h |   2 +
 fs/btrfs/ctree.h       |  36 ++++----
 fs/btrfs/dir-item.c    |  14 ++--
 fs/btrfs/disk-io.c     |   7 +-
 fs/btrfs/inode-item.c  |  26 ++++--
 fs/btrfs/print-tree.c  |   2 +-
 fs/btrfs/super.c       | 218 +++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/transaction.c |   9 ++
 9 files changed, 242 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index ea4c38c790b..73d4b842a31 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -19,3 +19,10 @@
 * fsck
 * Scrub & defrag
 
+---
+metadata FS ideas:
+mount metadata FS on /sys/fs/btrfs/dev/
+subvolumes in /sys/fs/btrfs/dev/fs/
+snapshots in /sys/fs/btrfs/dev/fs/snapshots
+mount -o bind
+ioctl to create a snapshot
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e159841650a..3439ffb467b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -3,6 +3,8 @@
 
 struct btrfs_inode {
 	u32 magic;
+	struct btrfs_root *root;
+	struct btrfs_key location;
 	struct inode vfs_inode;
 	u32 magic2;
 };
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6d40758b893..1ff5b99af68 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/kobject.h>
 #include "bit-radix.h"
 
 struct btrfs_trans_handle;
@@ -183,13 +184,15 @@ struct btrfs_inline_data_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_dir_item {
-	__le64 objectid;
+	struct btrfs_disk_key location;
 	__le16 flags;
 	__le16 name_len;
 	u8 type;
 } __attribute__ ((__packed__));
 
 struct btrfs_root_item {
+	struct btrfs_inode_item inode;
+	__le64 root_dirid;
 	__le64 blocknr;
 	__le32 flags;
 	__le64 block_limit;
@@ -249,6 +252,7 @@ struct btrfs_fs_info {
 	struct mutex fs_mutex;
 	struct crypto_hash *hash_tfm;
 	spinlock_t hash_lock;
+	struct kobject kobj;
 };
 
 /*
@@ -504,16 +508,6 @@ static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val)
 	item->size = cpu_to_le16(val);
 }
 
-static inline u64 btrfs_dir_objectid(struct btrfs_dir_item *d)
-{
-	return le64_to_cpu(d->objectid);
-}
-
-static inline void btrfs_set_dir_objectid(struct btrfs_dir_item *d, u64 val)
-{
-	d->objectid = cpu_to_le64(val);
-}
-
 static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
 {
 	return le16_to_cpu(d->flags);
@@ -724,6 +718,16 @@ static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val)
 	item->blocknr = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_root_dirid(struct btrfs_root_item *item)
+{
+	return le64_to_cpu(item->root_dirid);
+}
+
+static inline void btrfs_set_root_dirid(struct btrfs_root_item *item, u64 val)
+{
+	item->root_dirid = cpu_to_le64(val);
+}
+
 static inline u32 btrfs_root_refs(struct btrfs_root_item *item)
 {
 	return le32_to_cpu(item->refs);
@@ -950,8 +954,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir, u64
-			  objectid, u8 type);
+			  *root, const char *name, int name_len, u64 dir,
+			  struct btrfs_key *location, u8 type);
 int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, u64 dir,
 			  const char *name, int name_len, int mod);
@@ -978,7 +982,8 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, u64 objectid, struct btrfs_inode_item
 		       *inode_item);
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct btrfs_path *path, u64 objectid, int mod);
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod);
 
 /* file-item.c */
 int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
@@ -997,4 +1002,7 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
 				 char *data, size_t len);
+/* super.c */
+extern struct subsystem btrfs_subsys;
+
 #endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0ee9945fb1b..7aed9f015b5 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -25,8 +25,8 @@ int insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir, u64
-			  objectid, u8 type)
+			  *root, const char *name, int name_len, u64 dir,
+			  struct btrfs_key *location, u8 type)
 {
 	int ret = 0;
 	struct btrfs_path *path;
@@ -50,17 +50,21 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_dir_item);
-	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_cpu_key_to_disk(&dir_item->location, location);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	/* the name is stored and dirtied first; only the index is skipped */
+	/* FIXME, use some real flag for selecting the extra index */
+	if (root == root->fs_info->tree_root)
+		goto out;
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = objectid;
+	key.offset = location->objectid;
 	ret = insert_with_overflow(trans, root, path, &key, data_size);
 	// FIXME clear the dirindex bit
 	if (ret)
@@ -69,7 +73,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_dir_item);
-	btrfs_set_dir_objectid(dir_item, objectid);
+	btrfs_cpu_key_to_disk(&dir_item->location, location);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5230554380d..b9301a5e460 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -338,7 +338,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 					     sb->s_blocksize);
 
 	if (!fs_info->sb_buffer) {
-printk("failed2\n");
 		return NULL;
 	}
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
@@ -369,6 +368,10 @@ printk("failed2\n");
 	ret = btrfs_find_highest_inode(root, &root->fs_info->last_inode_alloc);
 	if (ret == 0)
 		fs_info->highest_inode = fs_info->last_inode_alloc;
+	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
+	kobj_set_kset_s(fs_info, btrfs_subsys);
+	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
+	kobject_register(&fs_info->kobj);
 	mutex_unlock(&fs_info->fs_mutex);
 	return root;
 }
@@ -430,7 +433,7 @@ int close_ctree(struct btrfs_root *root)
 	kfree(root->fs_info->extent_root);
 	kfree(root->fs_info->inode_root);
 	kfree(root->fs_info->tree_root);
-	kfree(root->fs_info);
+	kobject_unregister(&root->fs_info->kobj);
 	kfree(root);
 	return 0;
 }
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6bfa980790c..b276a3b40a6 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -26,15 +26,27 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 }
 
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, struct btrfs_path *path, u64 objectid, int mod)
+		       *root, struct btrfs_path *path,
+		       struct btrfs_key *location, int mod)
 {
-	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
+	int ret;
+	int slot;
+	struct btrfs_leaf *leaf;
+	struct btrfs_key found_key;
 
-	key.objectid = objectid;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	key.offset = 0;
-	return btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
+	if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+	    location->offset == (u64)-1 && path->slots[0] != 0) {
+		slot = path->slots[0] - 1;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key, &leaf->items[slot].key);
+		if (found_key.objectid == location->objectid &&
+		    btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+			path->slots[0]--;
+			return 0;
+		}
+	}
+	return ret;
 }
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index c8ee938c125..f0da65c4f96 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -38,7 +38,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			printk("\t\tdir oid %Lu flags %u type %u\n",
-				btrfs_dir_objectid(di),
+				btrfs_disk_key_objectid(&di->location),
 				btrfs_dir_flags(di),
 				btrfs_dir_type(di));
 			printk("\t\tname %.*s\n",
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 32224ffdcec..66d9fb2288c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -16,9 +16,23 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 
+void btrfs_fsinfo_release(struct kobject *obj)
+{
+	struct btrfs_fs_info *fsinfo = container_of(obj,
+					    struct btrfs_fs_info, kobj);
+	kfree(fsinfo);
+}
+
+struct kobj_type btrfs_fsinfo_ktype = {
+	.release = btrfs_fsinfo_release,
+};
+
+decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
+
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
 static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_dir_ro_inode_operations;
 static struct super_operations btrfs_super_ops;
 static struct file_operations btrfs_dir_file_operations;
 static struct inode_operations btrfs_file_inode_operations;
@@ -37,7 +51,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key location;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -46,13 +61,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	mutex_lock(&root->fs_info->fs_mutex);
 
 	check_inode(inode);
-	ret = btrfs_lookup_inode(NULL, root, path, inode->i_ino, 0);
+
+	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
-		btrfs_release_path(root, path);
 		btrfs_free_path(path);
-		mutex_unlock(&root->fs_info->fs_mutex);
-		make_bad_inode(inode);
-		return;
+		goto make_bad;
 	}
 	check_inode(inode);
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
@@ -73,7 +87,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
 
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	inode_item = NULL;
 
@@ -92,8 +105,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
 	case S_IFDIR:
-		inode->i_op = &btrfs_dir_inode_operations;
 		inode->i_fop = &btrfs_dir_file_operations;
+		if (root == root->fs_info->tree_root)
+			inode->i_op = &btrfs_dir_ro_inode_operations;
+		else
+			inode->i_op = &btrfs_dir_inode_operations;
 		break;
 	case S_IFLNK:
 		// inode->i_op = &page_symlink_inode_operations;
@@ -101,6 +117,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	}
 	check_inode(inode);
 	return;
+
+make_bad:
+	/* the failed lookup freed the path before jumping here; */
+	/* releasing or freeing it again would be a use-after-free */
+	mutex_unlock(&root->fs_info->fs_mutex);
+	make_bad_inode(inode);
 }
 
 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
@@ -128,7 +150,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	}
 	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
-	objectid = btrfs_dir_objectid(di);
+	objectid = btrfs_disk_key_objectid(&di->location);
 
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
@@ -157,7 +179,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	root = btrfs_sb(dir->i_sb);
+	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
@@ -171,7 +193,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int err;
 	int ret;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_trans_handle *trans;
@@ -268,7 +290,8 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
-	ret = btrfs_lookup_inode(trans, root, path, objectid, -1);
+	ret = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
@@ -355,7 +378,7 @@ error:
 static void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -378,13 +401,13 @@ no_delete:
 }
 
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
-			      ino_t *ino)
+			       struct btrfs_key *location)
 {
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -393,13 +416,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	ret = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (ret || !btrfs_match_dir_item_name(root, path, name, namelen)) {
-		*ino = 0;
+		location->objectid = 0;
 		ret = 0;
 		goto out;
 	}
 	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_dir_item);
-	*ino = btrfs_dir_objectid(di);
+	btrfs_disk_key_to_cpu(location, &di->location);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -407,26 +430,76 @@ out:
 	return ret;
 }
 
+int fixup_tree_root_location(struct btrfs_root *root,
+			     struct btrfs_key *location,
+			     struct btrfs_root **sub_root)
+{
+	struct btrfs_path *path;
+	struct btrfs_root_item *ri;
+	int ret;
+
+	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+		return 0;
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	ret = btrfs_lookup_inode(NULL, root, path, location, 0);
+	if (ret)
+		goto out;
+	ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			  path->slots[0],
+			  struct btrfs_root_item);
+	location->objectid = btrfs_root_dirid(ri);
+	location->flags = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+	location->offset = 0;
+	/* FIXME properly select the root */
+	*sub_root = root->fs_info->fs_root;
+out:
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
 	struct inode * inode;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
-	ino_t ino;
+	struct btrfs_inode *bi = BTRFS_I(dir);
+	struct btrfs_root *root = bi->root;
+	struct btrfs_root *sub_root = root;
+	struct btrfs_key location;
 	int ret;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_inode_by_name(dir, dentry, &ino);
+	ret = btrfs_inode_by_name(dir, dentry, &location);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret < 0)
 		return ERR_PTR(ret);
 	inode = NULL;
-	if (ino) {
-		inode = iget(dir->i_sb, ino);
+	if (location.objectid) {
+		ret = fixup_tree_root_location(root, &location, &sub_root);
+		if (ret < 0)
+			return ERR_PTR(ret);
+		if (ret > 0)
+			return ERR_PTR(-ENOENT);
+		inode = iget_locked(dir->i_sb, location.objectid);
 		if (!inode)
 			return ERR_PTR(-EACCES);
+		if (inode->i_state & I_NEW) {
+			BTRFS_I(inode)->root = sub_root;
+			memcpy(&BTRFS_I(inode)->location, &location,
+			       sizeof(location));
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+		}
 		check_inode(inode);
 	}
 	check_inode(dir);
@@ -436,7 +509,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
@@ -448,11 +521,16 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int advance;
 	unsigned char d_type = DT_UNKNOWN;
 	int over = 0;
+	int key_type = BTRFS_DIR_INDEX_KEY;
+
+	/* FIXME, use a real flag for deciding about the key type */
+	if (root->fs_info->tree_root == root)
+		key_type = BTRFS_DIR_ITEM_KEY;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
@@ -482,10 +560,11 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		item = leaf->items + slot;
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
-		if (btrfs_disk_key_offset(&item->key) >
+		if (key_type == BTRFS_DIR_INDEX_KEY &&
+		    btrfs_disk_key_offset(&item->key) >
 		    root->fs_info->highest_inode)
 			break;
-		if (btrfs_disk_key_type(&item->key) != BTRFS_DIR_INDEX_KEY)
+		if (btrfs_disk_key_type(&item->key) != key_type)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
@@ -495,7 +574,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		over = filldir(dirent, (const char *)(di + 1),
 			       btrfs_dir_name_len(di),
 			       btrfs_disk_key_offset(&item->key),
-			       btrfs_dir_objectid(di), d_type);
+			       btrfs_disk_key_objectid(&di->location), d_type);
 		if (over)
 			goto nopos;
 	}
@@ -527,6 +606,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	struct dentry * root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *root;
+	struct btrfs_inode *bi;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
@@ -546,6 +626,13 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	       btrfs_super_root_dir(disk_super));
 
 	inode = iget_locked(sb, btrfs_super_root_dir(disk_super));
+	if (inode) {	/* NULL is handled by the check just below */
+		bi = BTRFS_I(inode);
+		memset(&bi->location, 0, sizeof(bi->location));
+		bi->location.objectid = inode->i_ino;
+		btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
+		bi->root = root->fs_info->tree_root;
+	}
 	if (!inode)
 		return -ENOMEM;
 	if (inode->i_state & I_NEW) {
@@ -594,7 +681,8 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	btrfs_init_path(path);
 
-	ret = btrfs_lookup_inode(trans, root, path, inode->i_ino, 1);
+	ret = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
 		if (ret > 0)
 			ret = -ENOENT;
@@ -616,7 +704,7 @@ failed:
 
 static int btrfs_write_inode(struct inode *inode, int wait)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
@@ -637,8 +725,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 {
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
-	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_key *key;
 	int ret;
 	u64 objectid;
 
@@ -646,6 +734,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	BTRFS_I(inode)->root = BTRFS_I(dir)->root;
+	key = &BTRFS_I(inode)->location;
 	check_inode(inode);
 	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	BUG_ON(ret);
@@ -658,11 +748,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	fill_inode_item(&inode_item, inode);
 
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
+	key->objectid = objectid;
+	key->flags = 0;
+	key->offset = 0;
+	btrfs_set_key_type(key, BTRFS_INODE_ITEM_KEY);
+	ret = btrfs_insert_inode_map(trans, root, objectid, key);
 	BUG_ON(ret);
 
 	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
@@ -678,13 +768,20 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode)
 {
 	int ret;
-	ret = btrfs_insert_dir_item(trans, btrfs_sb(inode->i_sb),
+	struct btrfs_key key;
+	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	ret = btrfs_insert_dir_item(trans, root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
-				    inode->i_ino, 0);
+				    &key, 0);
 	if (ret == 0) {
 		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
-		ret = btrfs_update_inode(trans, btrfs_sb(inode->i_sb),
+		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
 	}
 	check_inode(inode);
@@ -710,7 +807,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 			int mode, struct nameidata *nd)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode;
 	int err;
 	int drop_inode = 0;
@@ -747,18 +844,26 @@ out_unlock:
 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 				struct inode *inode, struct inode *dir)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 	char buf[2];
+	struct btrfs_key key;
+
 	buf[0] = '.';
 	buf[1] = '.';
 
+	key.objectid = inode->i_ino;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+
 	ret = btrfs_insert_dir_item(trans, root, buf, 1, inode->i_ino,
-				    inode->i_ino, 1);
+				    &key, 1);
 	if (ret)
 		goto error;
+	key.objectid = dir->i_ino;
 	ret = btrfs_insert_dir_item(trans, root, buf, 2, inode->i_ino,
-				    dir->i_ino, 1);
+				    &key, 1);
 	if (ret)
 		goto error;
 	inode->i_size = 6;
@@ -771,7 +876,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	struct inode *inode;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(dir->i_sb);
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
 	int drop_on_err = 0;
 
@@ -910,7 +1015,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	struct btrfs_path *path;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_leaf *leaf;
@@ -999,7 +1104,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
 	int err;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_get_block_lock(inode, iblock, result, create);
 	// err = btrfs_get_block_inline(inode, iblock, result, create);
@@ -1041,7 +1146,7 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static void btrfs_truncate(struct inode *inode)
 {
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	struct btrfs_trans_handle *trans;
 
@@ -1208,7 +1313,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	int err = 0;
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *pages[1];
 
 	if (file->f_flags & O_DIRECT)
@@ -1365,7 +1470,7 @@ static ssize_t btrfs_file_inline_write(struct file *file,
 	int err = 0;
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	unsigned long page_index;
 
 	if (file->f_flags & O_DIRECT)
@@ -1432,13 +1537,14 @@ static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 {
 	char *kaddr;
 	unsigned long left, count = desc->count;
+	struct inode *inode = page->mapping->host;
 
 	if (size > count)
 		size = count;
 
 	if (!PageChecked(page)) {
 		/* FIXME, do it per block */
-		struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+		struct btrfs_root *root = BTRFS_I(inode)->root;
 		int ret = btrfs_csum_verify_file_block(root,
 					  page->mapping->host->i_ino,
 					  page->index << PAGE_CACHE_SHIFT,
@@ -1662,6 +1768,10 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.rmdir		= btrfs_rmdir,
 };
 
+static struct inode_operations btrfs_dir_ro_inode_operations = {
+	.lookup		= btrfs_lookup,
+};
+
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -1697,13 +1807,21 @@ static int __init init_btrfs_fs(void)
 	err = init_inodecache();
 	if (err)
 		return err;
+	kset_set_kset_s(&btrfs_subsys, fs_subsys);
+	err = subsystem_register(&btrfs_subsys);
+	if (err)
+		goto out;
 	return register_filesystem(&btrfs_fs_type);
+out:
+	destroy_inodecache();
+	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
 {
 	destroy_inodecache();
 	unregister_filesystem(&btrfs_fs_type);
+	subsystem_unregister(&btrfs_subsys);
 	printk("btrfs unloaded\n");
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b0f98c9212..20d84bd0322 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -52,6 +52,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
 
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
 	mutex_lock(&root->fs_info->trans_mutex);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -69,6 +71,10 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
+
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
+
 	WARN_ON(trans->magic != TRANS_MAGIC);
 	WARN_ON(trans->magic2 != TRANS_MAGIC);
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -148,6 +154,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	DEFINE_WAIT(wait);
 
+	/* FIXME, use the right root */
+	root = root->fs_info->fs_root;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 printk("already in commit!, waiting\n");
-- 
cgit v1.2.3


From 0f7d52f4431c530b4f39c524448c688bb7754de5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Apr 2007 10:42:37 -0400
Subject: Btrfs: groundwork for subvolume and snapshot roots

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |   2 +-
 fs/btrfs/ctree.h       |  20 +++++-
 fs/btrfs/disk-io.c     | 178 +++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/disk-io.h     |   2 +
 fs/btrfs/super.c       |  39 ++++++-----
 fs/btrfs/transaction.c | 140 ++++++++++++++++++++++++++++----------
 6 files changed, 282 insertions(+), 99 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 783f54c2a16..9fc42e99c7d 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -86,7 +86,7 @@ int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
 	int i;
 	int total_found = 0;
 
-	ret = radix_tree_gang_lookup(radix, (void *)&gang, 0, ARRAY_SIZE(gang));
+	ret = radix_tree_gang_lookup(radix, (void **)gang, 0, ARRAY_SIZE(gang));
 	for (i = 0; i < ret && nr > 0; i++) {
 		found = 0;
 		bits = gang[i];
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1ff5b99af68..5460030c9e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -232,12 +232,12 @@ struct btrfs_inode_map_item {
 
 struct crypto_hash;
 struct btrfs_fs_info {
-	struct btrfs_root *fs_root;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
+	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	u64 last_inode_alloc;
@@ -266,6 +266,9 @@ struct btrfs_root {
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
+	struct inode *inode;
+	u64 objectid;
+	u64 last_trans;
 	u32 blocksize;
 	int ref_cows;
 	u32 type;
@@ -595,7 +598,7 @@ static inline u32 btrfs_key_overflow(struct btrfs_key *key)
 
 static inline void btrfs_set_key_overflow(struct btrfs_key *key, u32 over)
 {
-	BUG_ON(over > BTRFS_KEY_OVERFLOW_MAX);
+	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
 	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
 	key->flags = (key->flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
 }
@@ -634,7 +637,7 @@ static inline void btrfs_set_disK_key_overflow(struct btrfs_disk_key *key,
 					       u32 over)
 {
 	u32 flags = btrfs_disk_key_flags(key);
-	BUG_ON(over > BTRFS_KEY_OVERFLOW_MAX);
+	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
 	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
 	flags = (flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
 	btrfs_set_disk_key_flags(key, flags);
@@ -748,6 +751,17 @@ static inline void btrfs_set_super_blocknr(struct btrfs_super_block *s, u64 val)
 	s->blocknr = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_super_generation(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->generation);
+}
+
+static inline void btrfs_set_super_generation(struct btrfs_super_block *s,
+					      u64 val)
+{
+	s->generation = cpu_to_le64(val);
+}
+
 static inline u64 btrfs_super_root(struct btrfs_super_block *s)
 {
 	return le64_to_cpu(s->root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9301a5e460..b557bdd1e26 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4,9 +4,11 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 #include <linux/swap.h>
+#include <linux/radix-tree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "btrfs_inode.h"
 
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
@@ -180,7 +182,7 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct buffer_head *bh;
-	struct btrfs_root *root = btrfs_sb(page->mapping->host->i_sb);
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct buffer_head *head;
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, root->fs_info->sb->s_blocksize,
@@ -259,10 +261,13 @@ static int __setup_root(int blocksize,
 			u64 objectid)
 {
 	root->node = NULL;
+	root->inode = NULL;
 	root->commit_root = NULL;
 	root->blocksize = blocksize;
 	root->ref_cows = 0;
 	root->fs_info = fs_info;
+	root->objectid = objectid;
+	root->last_trans = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	return 0;
@@ -287,10 +292,78 @@ static int find_and_setup_root(int blocksize,
 	return 0;
 }
 
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+				      struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_path *path;
+	struct btrfs_leaf *l;
+	int ret = 0;
+
+printk("read_fs_root looking for %Lu %Lu %u\n", location->objectid, location->offset, location->flags);
+	root = kmalloc(sizeof(*root), GFP_NOFS);
+	if (!root) {
+printk("failed1\n");
+		return ERR_PTR(-ENOMEM);
+	}
+	if (location->offset == (u64)-1) {
+		ret = find_and_setup_root(fs_info->sb->s_blocksize,
+					  fs_info->tree_root, fs_info,
+					  location->objectid, root);
+		if (ret) {
+printk("failed2\n");
+			kfree(root);
+			return ERR_PTR(ret);
+		}
+		goto insert;
+	}
+
+	__setup_root(fs_info->sb->s_blocksize, root, fs_info,
+		     location->objectid);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
+	if (ret != 0) {
+printk("internal search_slot gives us %d\n", ret);
+		if (ret > 0)
+			ret = -ENOENT;
+		goto out;
+	}
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	memcpy(&root->root_item,
+	       btrfs_item_ptr(l, path->slots[0], struct btrfs_root_item),
+	       sizeof(root->root_item));
+	memcpy(&root->root_key, location, sizeof(*location));
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	if (ret) {
+		kfree(root);
+		return ERR_PTR(ret);
+	}
+	root->node = read_tree_block(root,
+				     btrfs_root_blocknr(&root->root_item));
+	BUG_ON(!root->node);
+insert:
+printk("inserting %p\n", root);
+	root->ref_cows = 1;
+	ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root,
+				root);
+	if (ret) {
+printk("radix_tree_insert gives us %d\n", ret);
+		brelse(root->node);
+		kfree(root);
+		return ERR_PTR(ret);
+	}
+printk("all worked\n");
+	return root;
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb)
 {
-	struct btrfs_root *root = kmalloc(sizeof(struct btrfs_root),
-					  GFP_NOFS);
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -304,9 +377,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
+	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
-	fs_info->fs_root = root;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->inode_root = inode_root;
@@ -318,6 +391,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	BTRFS_I(fs_info->btree_inode)->root = tree_root;
+	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+	       sizeof(struct btrfs_key));
 	insert_inode_hash(fs_info->btree_inode);
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
@@ -337,13 +413,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 					     BTRFS_SUPER_INFO_OFFSET /
 					     sb->s_blocksize);
 
-	if (!fs_info->sb_buffer) {
+	if (!fs_info->sb_buffer)
 		return NULL;
-	}
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
-	if (!btrfs_super_root(disk_super)) {
+	if (!btrfs_super_root(disk_super))
 		return NULL;
-	}
+
 	fs_info->disk_super = disk_super;
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
@@ -358,14 +433,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 				  BTRFS_INODE_MAP_OBJECTID, inode_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
-				  BTRFS_FS_TREE_OBJECTID, root);
-	BUG_ON(ret);
-	root->commit_root = root->node;
-	get_bh(root->node);
-	root->ref_cows = 1;
-	root->fs_info->generation = root->root_key.offset + 1;
-	ret = btrfs_find_highest_inode(root, &root->fs_info->last_inode_alloc);
+	fs_info->generation = btrfs_super_generation(disk_super) + 1;
+	ret = btrfs_find_highest_inode(tree_root, &fs_info->last_inode_alloc);
 	if (ret == 0)
 		fs_info->highest_inode = fs_info->last_inode_alloc;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
@@ -373,7 +442,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
 	kobject_register(&fs_info->kobj);
 	mutex_unlock(&fs_info->fs_mutex);
-	return root;
+	return tree_root;
 }
 
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -398,12 +467,42 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
+int del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *gang[8];
+	int i;
+
+	while(1) {
+		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, 0,
+					     ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			radix_tree_delete(&fs_info->fs_roots_radix,
+					  (unsigned long)gang[i]);
+			if (gang[i]->inode)
+				iput(gang[i]->inode);
+			else
+				printk("no inode for root %p\n", gang[i]);
+			if (gang[i]->node)
+				brelse(gang[i]->node);
+			if (gang[i]->commit_root)
+				brelse(gang[i]->commit_root);
+			kfree(gang[i]);
+		}
+	}
+	return 0;
+}
+
 int close_ctree(struct btrfs_root *root)
 {
 	int ret;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -412,29 +511,26 @@ int close_ctree(struct btrfs_root *root)
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 	write_ctree_super(NULL, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	if (root->node)
-		btrfs_block_release(root, root->node);
-	if (root->fs_info->extent_root->node)
-		btrfs_block_release(root->fs_info->extent_root,
-				    root->fs_info->extent_root->node);
-	if (root->fs_info->inode_root->node)
-		btrfs_block_release(root->fs_info->inode_root,
-				    root->fs_info->inode_root->node);
-	if (root->fs_info->tree_root->node)
-		btrfs_block_release(root->fs_info->tree_root,
-				    root->fs_info->tree_root->node);
-	btrfs_block_release(root, root->commit_root);
-	btrfs_block_release(root, root->fs_info->sb_buffer);
-	crypto_free_hash(root->fs_info->hash_tfm);
-	truncate_inode_pages(root->fs_info->btree_inode->i_mapping, 0);
-	iput(root->fs_info->btree_inode);
-	kfree(root->fs_info->extent_root);
-	kfree(root->fs_info->inode_root);
-	kfree(root->fs_info->tree_root);
-	kobject_unregister(&root->fs_info->kobj);
-	kfree(root);
+	mutex_unlock(&fs_info->fs_mutex);
+
+	if (fs_info->extent_root->node)
+		btrfs_block_release(fs_info->extent_root,
+				    fs_info->extent_root->node);
+	if (fs_info->inode_root->node)
+		btrfs_block_release(fs_info->inode_root,
+				    fs_info->inode_root->node);
+	if (fs_info->tree_root->node)
+		btrfs_block_release(fs_info->tree_root,
+				    fs_info->tree_root->node);
+	btrfs_block_release(root, fs_info->sb_buffer);
+	crypto_free_hash(fs_info->hash_tfm);
+	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+	iput(fs_info->btree_inode);
+	del_fs_roots(fs_info);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->inode_root);
+	kfree(fs_info->tree_root);
+	kobject_unregister(&fs_info->kobj);
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index ac6764ba8aa..0ef6e6f714a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,4 +39,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
 int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+				      struct btrfs_key *location);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 66d9fb2288c..3c9236ca889 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -436,7 +436,6 @@ int fixup_tree_root_location(struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	struct btrfs_root_item *ri;
-	int ret;
 
 	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
 		return 0;
@@ -447,22 +446,19 @@ int fixup_tree_root_location(struct btrfs_root *root,
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
-	ret = btrfs_lookup_inode(NULL, root, path, location, 0);
-	if (ret)
-		goto out;
-	ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-			  path->slots[0],
-			  struct btrfs_root_item);
+	*sub_root = btrfs_read_fs_root(root->fs_info, location);
+	if (IS_ERR(*sub_root))
+		return PTR_ERR(*sub_root);
+
+	ri = &(*sub_root)->root_item;
 	location->objectid = btrfs_root_dirid(ri);
 	location->flags = 0;
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 	location->offset = 0;
-	/* FIXME properly select the root */
-	*sub_root = root->fs_info->fs_root;
-out:
+
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
+	return 0;
 }
 
 
@@ -494,6 +490,15 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		if (!inode)
 			return ERR_PTR(-EACCES);
 		if (inode->i_state & I_NEW) {
+			if (sub_root != root) {
+				ret = radix_tree_insert(
+						&root->fs_info->fs_roots_radix,
+						(unsigned long)sub_root,
+						sub_root);
+printk("adding new root for inode %lu\n", inode->i_ino);
+				igrab(inode);
+				sub_root->inode = inode;
+			}
 			BTRFS_I(inode)->root = sub_root;
 			memcpy(&BTRFS_I(inode)->location, &location,
 			       sizeof(location));
@@ -605,7 +610,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	struct inode * inode;
 	struct dentry * root_dentry;
 	struct btrfs_super_block *disk_super;
-	struct btrfs_root *root;
+	struct btrfs_root *tree_root;
 	struct btrfs_inode *bi;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -613,14 +618,14 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_op = &btrfs_super_ops;
 	sb->s_time_gran = 1;
 
-	root = open_ctree(sb);
+	tree_root = open_ctree(sb);
 
-	if (!root) {
+	if (!tree_root) {
 		printk("btrfs: open_ctree failed\n");
 		return -EIO;
 	}
-	sb->s_fs_info = root;
-	disk_super = root->fs_info->disk_super;
+	sb->s_fs_info = tree_root;
+	disk_super = tree_root->fs_info->disk_super;
 	printk("read in super total blocks %Lu root %Lu\n",
 	       btrfs_super_total_blocks(disk_super),
 	       btrfs_super_root_dir(disk_super));
@@ -630,7 +635,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	bi->location.objectid = inode->i_ino;
 	bi->location.offset = 0;
 	bi->location.flags = 0;
-	bi->root = root->fs_info->tree_root;
+	bi->root = tree_root;
 	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
 	if (!inode)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20d84bd0322..83a0194ab16 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -8,6 +8,8 @@ static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
+#define BTRFS_ROOT_TRANS_TAG 0
+
 #define TRANS_MAGIC 0xE1E10E
 static void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -31,9 +33,10 @@ static int join_transaction(struct btrfs_root *root)
 					     GFP_NOFS);
 		total_trans++;
 		BUG_ON(!cur_trans);
+		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
 		cur_trans->num_writers = 0;
-		cur_trans->transid = root->root_key.offset + 1;
+		cur_trans->transid = root->fs_info->generation;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
 		cur_trans->magic = TRANS_MAGIC;
@@ -51,13 +54,22 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
+	u64 running_trans_id;
 
-	/* FIXME, use the right root */
-	root = root->fs_info->fs_root;
 	mutex_lock(&root->fs_info->trans_mutex);
 	ret = join_transaction(root);
 	BUG_ON(ret);
-	h->transid = root->fs_info->running_transaction->transid;
+	running_trans_id = root->fs_info->running_transaction->transid;
+
+	if (root != root->fs_info->tree_root && root->last_trans <
+	    running_trans_id) {
+		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+				   (unsigned long)root, BTRFS_ROOT_TRANS_TAG);
+		root->commit_root = root->node;
+		get_bh(root->node);
+	}
+	root->last_trans = running_trans_id;
+	h->transid = running_trans_id;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
@@ -72,9 +84,6 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans;
 
-	/* FIXME, use the right root */
-	root = root->fs_info->fs_root;
-
 	WARN_ON(trans->magic != TRANS_MAGIC);
 	WARN_ON(trans->magic2 != TRANS_MAGIC);
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -145,17 +154,96 @@ static int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+struct dirty_root {
+	struct list_head list;
+	struct btrfs_key snap_key;
+	struct buffer_head *commit_root;
+	struct btrfs_root *root;
+};
+
+int add_dirty_roots(struct btrfs_trans_handle *trans,
+		    struct radix_tree_root *radix, struct list_head *list)
+{
+	struct dirty_root *dirty;
+	struct btrfs_root *gang[8];
+	struct btrfs_root *root;
+	int i;
+	int ret;
+	int err;
+printk("add dirty\n");
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
+						 ARRAY_SIZE(gang),
+						 BTRFS_ROOT_TRANS_TAG);
+		if (ret == 0)
+			break;
+		for (i = 0; i < ret; i++) {
+			root = gang[i];
+			radix_tree_tag_clear(radix, (unsigned long)root,
+					     BTRFS_ROOT_TRANS_TAG);
+			if (root->commit_root == root->node) {
+				WARN_ON(root->node->b_blocknr !=
+					btrfs_root_blocknr(&root->root_item));
+				brelse(root->commit_root);
+				root->commit_root = NULL;
+				continue;
+			}
+			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+			BUG_ON(!dirty);
+			memcpy(&dirty->snap_key, &root->root_key,
+			       sizeof(root->root_key));
+			dirty->commit_root = root->commit_root;
+			root->commit_root = NULL;
+			dirty->root = root;
+printk("adding dirty root %Lu gen %Lu blocknr %Lu\n", root->root_key.objectid, root->root_key.offset, dirty->commit_root->b_blocknr);
+			root->root_key.offset = root->fs_info->generation;
+			btrfs_set_root_blocknr(&root->root_item,
+					       root->node->b_blocknr);
+			err = btrfs_insert_root(trans, root->fs_info->tree_root,
+						&root->root_key,
+						&root->root_item);
+			BUG_ON(err);
+			list_add(&dirty->list, list);
+		}
+	}
+printk("add dirty done\n");
+	return 0;
+}
+
+int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list)
+{
+	struct dirty_root *dirty;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	while(!list_empty(list)) {
+		dirty = list_entry(list->next, struct dirty_root, list);
+		list_del_init(&dirty->list);
+		trans = btrfs_start_transaction(tree_root, 1);
+printk("drop snapshot root %p, commit_root blocknr %Lu generation %Lu\n", dirty->root, dirty->commit_root->b_blocknr, dirty->snap_key.offset);
+		ret = btrfs_drop_snapshot(trans, dirty->root,
+					  dirty->commit_root);
+		BUG_ON(ret);
+
+printk("del root objectid %Lu, offset %Lu\n", dirty->snap_key.objectid, dirty->snap_key.offset);
+		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
+		BUG_ON(ret);
+		ret = btrfs_end_transaction(trans, tree_root);
+		BUG_ON(ret);
+		kfree(dirty);
+	}
+	return 0;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
 	int ret = 0;
-	struct buffer_head *snap;
-	struct btrfs_key snap_key;
 	struct btrfs_transaction *cur_trans;
+	struct list_head dirty_fs_roots;
 	DEFINE_WAIT(wait);
 
-	/* FIXME, use the right root */
-	root = root->fs_info->fs_root;
+	INIT_LIST_HEAD(&dirty_fs_roots);
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
@@ -184,22 +272,13 @@ printk("already in commit!, waiting\n");
 	}
 	finish_wait(&trans->transaction->writer_wait, &wait);
 	WARN_ON(cur_trans != trans->transaction);
-	if (root->node != root->commit_root) {
-		memcpy(&snap_key, &root->root_key, sizeof(snap_key));
-		root->root_key.offset++;
-	}
-
-	if (btrfs_root_blocknr(&root->root_item) != root->node->b_blocknr) {
-		btrfs_set_root_blocknr(&root->root_item, root->node->b_blocknr);
-		ret = btrfs_insert_root(trans, root->fs_info->tree_root,
-					&root->root_key, &root->root_item);
-		BUG_ON(ret);
-	}
-
+	add_dirty_roots(trans, &root->fs_info->fs_roots_radix, &dirty_fs_roots);
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
+	btrfs_set_super_generation(root->fs_info->disk_super,
+				   root->fs_info->generation + 1);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
@@ -213,21 +292,8 @@ printk("already in commit!, waiting\n");
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-	if (root->node != root->commit_root) {
-		trans = btrfs_start_transaction(root, 1);
-		snap = root->commit_root;
-		root->commit_root = root->node;
-		get_bh(root->node);
-		ret = btrfs_drop_snapshot(trans, root, snap);
-		BUG_ON(ret);
 
-		ret = btrfs_del_root(trans, root->fs_info->tree_root,
-				     &snap_key);
-		BUG_ON(ret);
-		root->fs_info->generation = root->root_key.offset + 1;
-		ret = btrfs_end_transaction(trans, root);
-		BUG_ON(ret);
-	}
+	drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
 	return ret;
 }
 
-- 
cgit v1.2.3


From c5739bba5260a59cebd20a51a55080592c8d3b07 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 09:27:04 -0400
Subject: Btrfs: snapshot progress

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  15 +++++-
 fs/btrfs/dir-item.c    |   6 ++-
 fs/btrfs/disk-io.c     |   4 --
 fs/btrfs/extent-tree.c |   6 +++
 fs/btrfs/ioctl.h       |  13 +++++
 fs/btrfs/root-tree.c   |  16 +++++-
 fs/btrfs/super.c       | 135 +++++++++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 183 insertions(+), 12 deletions(-)
 create mode 100644 fs/btrfs/ioctl.h

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5460030c9e6..2cbcaaeac9e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -227,7 +227,7 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_map_item {
-	struct btrfs_disk_key key;
+	u32 refs;
 } __attribute__ ((__packed__));
 
 struct crypto_hash;
@@ -883,6 +883,17 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
+static inline u32 btrfs_inode_map_refs(struct btrfs_inode_map_item *m)
+{
+	return le32_to_cpu(m->refs);
+}
+
+static inline void btrfs_set_inode_map_refs(struct btrfs_inode_map_item *m,
+					    u32 val)
+{
+	m->refs = cpu_to_le32(val);
+}
+
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -925,6 +936,8 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-item.c */
+int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7aed9f015b5..0ba46bc0da9 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -55,12 +55,14 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_flags(dir_item, 0);
 	btrfs_set_dir_name_len(dir_item, name_len);
 	name_ptr = (char *)(dir_item + 1);
+
+	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
 	/* FIXME, use some real flag for selecting the extra index */
 	if (root == root->fs_info->tree_root)
 		goto out;
 
-	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b557bdd1e26..6b097ede80b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,10 +16,6 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 	if (buf->b_blocknr != btrfs_header_blocknr(&node->header)) {
 		BUG();
 	}
-	if (root->node && btrfs_header_parentid(&node->header) !=
-	    btrfs_header_parentid(btrfs_buffer_header(root->node))) {
-		BUG();
-	}
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7c21f63f1b9..efc604eea0b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -77,6 +77,12 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
+int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root)
+{
+	return inc_block_ref(trans, root, root->node->b_blocknr, 1);
+}
+
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf)
 {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
new file mode 100644
index 00000000000..201fb327072
--- /dev/null
+++ b/fs/btrfs/ioctl.h
@@ -0,0 +1,13 @@
+#ifndef __IOCTL_
+#define __IOCTL_
+#include <linux/ioctl.h>
+
+#define BTRFS_IOCTL_MAGIC 0x94
+#define BTRFS_VOL_NAME_MAX 255
+struct btrfs_ioctl_vol_args {
+	char name[BTRFS_VOL_NAME_MAX + 1];
+};
+
+#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
+				   struct btrfs_ioctl_vol_args)
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ddc1c13a535..72be9836932 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -83,6 +83,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct btrfs_path *path;
 	int ret;
+	u32 refs;
+	struct btrfs_root_item *ri;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -91,7 +93,19 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	ret = btrfs_del_item(trans, root, path);
+	ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			    path->slots[0], struct btrfs_root_item);
+
+	refs = btrfs_root_refs(ri);
+	BUG_ON(refs == 0);
+	if (refs == 1) {
+		ret = btrfs_del_item(trans, root, path);
+printk("deleting root %Lu %Lu %u\n", key->objectid, key->offset, key->flags);
+	} else {
+		btrfs_set_root_refs(ri, refs - 1);
+printk("ref now %u root %Lu %Lu %u\n", refs -1, key->objectid, key->offset, key->flags);
+		mark_buffer_dirty(path->nodes[0]);
+	}
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3c9236ca889..bbe5cabcb42 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -15,6 +15,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "ioctl.h"
 
 void btrfs_fsinfo_release(struct kobject *obj)
 {
@@ -27,6 +28,11 @@ struct kobj_type btrfs_fsinfo_ktype = {
 	.release = btrfs_fsinfo_release,
 };
 
+struct btrfs_iget_args {
+	u64 ino;
+	struct btrfs_root *root;
+};
+
 decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
 
 #define BTRFS_SUPER_MAGIC 0x9123682E
@@ -461,6 +467,34 @@ int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
+int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct btrfs_iget_args *args = p;
+	inode->i_ino = args->ino;
+	BTRFS_I(inode)->root = args->root;
+	return 0;
+}
+
+int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct btrfs_iget_args *args = opaque;
+	return (args->ino == inode->i_ino &&
+		args->root == BTRFS_I(inode)->root);
+}
+
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root)
+{
+	struct inode *inode;
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = root;
+
+	inode = iget5_locked(s, objectid, btrfs_find_actor,
+			     btrfs_init_locked_inode,
+			     (void *)&args);
+	return inode;
+}
 
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
@@ -486,7 +520,8 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
-		inode = iget_locked(dir->i_sb, location.objectid);
+		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
+					  sub_root);
 		if (!inode)
 			return ERR_PTR(-EACCES);
 		if (inode->i_state & I_NEW) {
@@ -495,7 +530,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 						&root->fs_info->fs_roots_radix,
 						(unsigned long)sub_root,
 						sub_root);
-printk("adding new root for inode %lu\n", inode->i_ino);
+printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
 				igrab(inode);
 				sub_root->inode = inode;
 			}
@@ -630,7 +665,8 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	       btrfs_super_total_blocks(disk_super),
 	       btrfs_super_root_dir(disk_super));
 
-	inode = iget_locked(sb, btrfs_super_root_dir(disk_super));
+	inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
+				  tree_root);
 	bi = BTRFS_I(inode);
 	bi->location.objectid = inode->i_ino;
 	bi->location.offset = 0;
@@ -750,7 +786,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	fill_inode_item(&inode_item, inode);
 
 	key->objectid = objectid;
@@ -1650,6 +1686,95 @@ static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
+static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item new_root_item;
+	int ret;
+	u64 objectid;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_update_inode(trans, root, root->inode);
+	BUG_ON(ret);
+
+	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	BUG_ON(ret);
+
+	memset(&new_root_item, 0, sizeof(new_root_item));
+	memcpy(&new_root_item, &root->root_item,
+	       sizeof(new_root_item));
+
+	key.objectid = objectid;
+	key.flags = 0;
+	key.offset = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
+	BUG_ON(ret);
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	btrfs_set_root_blocknr(&new_root_item, root->node->b_blocknr);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&new_root_item);
+	BUG_ON(ret);
+
+printk("adding snapshot name %.*s root %Lu %Lu %u\n", namelen, name, key.objectid, key.offset, key.flags);
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen,
+				    root->fs_info->sb->s_root->d_inode->i_ino,
+				    &key, 0);
+
+	BUG_ON(ret);
+
+	ret = btrfs_inc_root_ref(trans, root);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return 0;
+}
+
+static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
+		       cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_vol_args vol_args;
+	int ret;
+	int namelen;
+
+	if (!root->ref_cows)
+		return -EINVAL;
+	switch (cmd) {
+	case BTRFS_IOC_SNAP_CREATE:
+		if (copy_from_user(&vol_args,
+				   (struct btrfs_ioctl_vol_args __user *)arg,
+				   sizeof(vol_args)))
+			return -EFAULT;
+		namelen = strlen(vol_args.name);
+		if (namelen > BTRFS_VOL_NAME_MAX)
+			return -EINVAL;
+		ret = create_snapshot(root, vol_args.name, namelen);
+		WARN_ON(ret);
+		break;
+	default:
+		return -ENOTTY;
+	}
+	return 0;
+}
+
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
@@ -1781,6 +1906,7 @@ static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= btrfs_readdir,
+	.ioctl		= btrfs_ioctl,
 };
 
 static struct address_space_operations btrfs_aops = {
@@ -1803,6 +1929,7 @@ static struct file_operations btrfs_file_operations = {
 	.write		= btrfs_file_write,
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
+	.ioctl		= btrfs_ioctl,
 };
 
 static int __init init_btrfs_fs(void)
-- 
cgit v1.2.3


From 1b05da2ee6217e7d55460d04335813fec25be4ca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 12:13:09 -0400
Subject: Btrfs: drop the inode map tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  2 --
 fs/btrfs/ctree.h       | 39 +++-------------------
 fs/btrfs/disk-io.c     | 25 +++++---------
 fs/btrfs/inode-item.c  |  2 ++
 fs/btrfs/inode-map.c   | 65 +++---------------------------------
 fs/btrfs/print-tree.c  |  8 -----
 fs/btrfs/super.c       | 89 ++++++++------------------------------------------
 fs/btrfs/transaction.c |  7 ----
 8 files changed, 33 insertions(+), 204 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3439ffb467b..43a4f1ddb37 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -2,11 +2,9 @@
 #define __BTRFS_I__
 
 struct btrfs_inode {
-	u32 magic;
 	struct btrfs_root *root;
 	struct btrfs_key location;
 	struct inode vfs_inode;
-	u32 magic2;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cbcaaeac9e..5103709bb2b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -14,9 +14,8 @@ extern struct kmem_cache *btrfs_path_cachep;
 
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-#define BTRFS_INODE_MAP_OBJECTID 3ULL
-#define BTRFS_FS_TREE_OBJECTID 4ULL
-#define BTRFS_FIRST_FREE_OBJECTID 5ULL
+#define BTRFS_FS_TREE_OBJECTID 3ULL
+#define BTRFS_FIRST_FREE_OBJECTID 4ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
@@ -62,7 +61,6 @@ struct btrfs_header {
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
 	__le64 parentid; /* objectid of the tree root */
-	__le32 ham;
 	__le16 nritems;
 	__le16 flags;
 	u8 level;
@@ -226,23 +224,16 @@ struct btrfs_csum_item {
 	u8 csum[BTRFS_CSUM_SIZE];
 } __attribute__ ((__packed__));
 
-struct btrfs_inode_map_item {
-	u32 refs;
-} __attribute__ ((__packed__));
-
 struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
-	struct btrfs_root *inode_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
-	u64 last_inode_alloc;
 	u64 generation;
-	u64 highest_inode;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct buffer_head *sb_buffer;
@@ -272,6 +263,8 @@ struct btrfs_root {
 	u32 blocksize;
 	int ref_cows;
 	u32 type;
+	u64 highest_inode;
+	u64 last_inode_alloc;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -320,16 +313,11 @@ struct btrfs_root {
  */
 #define BTRFS_EXTENT_ITEM_KEY	8
 
-/*
- * the inode map records which inode numbers are in use and where
- * they actually live on disk
- */
-#define BTRFS_INODE_MAP_ITEM_KEY 9
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	10
+#define BTRFS_STRING_ITEM_KEY	9
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -883,17 +871,6 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
-static inline u32 btrfs_inode_map_refs(struct btrfs_inode_map_item *m)
-{
-	return le32_to_cpu(m->refs);
-}
-
-static inline void btrfs_set_inode_map_refs(struct btrfs_inode_map_item *m,
-					    u32 val)
-{
-	m->refs = cpu_to_le32(val);
-}
-
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -996,12 +973,6 @@ int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
 			     u64 dirid, u64 *objectid);
-int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   u64 objectid, struct btrfs_key *location);
-int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct btrfs_path *path,
-			   u64 objectid, int mod);
 int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 
 /* inode-item.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b097ede80b..760fdc9a766 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -264,6 +264,8 @@ static int __setup_root(int blocksize,
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
+	root->highest_inode = 0;
+	root->last_inode_alloc = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	return 0;
@@ -295,6 +297,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_path *path;
 	struct btrfs_leaf *l;
+	u64 highest_inode;
 	int ret = 0;
 
 printk("read_fs_root looking for %Lu %Lu %u\n", location->objectid, location->offset, location->flags);
@@ -354,6 +357,12 @@ printk("radix_tree_insert gives us %d\n", ret);
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	ret = btrfs_find_highest_inode(root, &highest_inode);
+	if (ret == 0) {
+		root->highest_inode = highest_inode;
+		root->last_inode_alloc = highest_inode;
+printk("highest inode is %Lu\n", highest_inode);
+	}
 printk("all worked\n");
 	return root;
 }
@@ -364,8 +373,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
-	struct btrfs_root *inode_root = kmalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	int ret;
@@ -378,9 +385,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
-	fs_info->inode_root = inode_root;
-	fs_info->last_inode_alloc = 0;
-	fs_info->highest_inode = 0;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -425,14 +429,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
-	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
-				  BTRFS_INODE_MAP_OBJECTID, inode_root);
-	BUG_ON(ret);
-
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
-	ret = btrfs_find_highest_inode(tree_root, &fs_info->last_inode_alloc);
-	if (ret == 0)
-		fs_info->highest_inode = fs_info->last_inode_alloc;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
 	kobj_set_kset_s(fs_info, btrfs_subsys);
 	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
@@ -512,9 +509,6 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->extent_root->node)
 		btrfs_block_release(fs_info->extent_root,
 				    fs_info->extent_root->node);
-	if (fs_info->inode_root->node)
-		btrfs_block_release(fs_info->inode_root,
-				    fs_info->inode_root->node);
 	if (fs_info->tree_root->node)
 		btrfs_block_release(fs_info->tree_root,
 				    fs_info->tree_root->node);
@@ -524,7 +518,6 @@ int close_ctree(struct btrfs_root *root)
 	iput(fs_info->btree_inode);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
-	kfree(fs_info->inode_root);
 	kfree(fs_info->tree_root);
 	kobject_unregister(&fs_info->kobj);
 	return 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b276a3b40a6..c3b990b661c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -22,6 +22,8 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 				sizeof(*inode_item));
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	if (ret == 0 && objectid > root->highest_inode)
+		root->highest_inode = objectid;
 	return ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f665221409a..318e27a6378 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -3,12 +3,11 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid)
+int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_leaf *l;
-	struct btrfs_root *root = fs_root->fs_info->inode_root;
 	struct btrfs_key search_key;
 	int slot;
 
@@ -38,7 +37,7 @@ error:
  * walks the btree of allocated inodes and find a hole.
  */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *fs_root,
+			     struct btrfs_root *root,
 			     u64 dirid, u64 *objectid)
 {
 	struct btrfs_path *path;
@@ -49,16 +48,13 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	u64 last_ino = 0;
 	int start_found;
 	struct btrfs_leaf *l;
-	struct btrfs_root *root = fs_root->fs_info->inode_root;
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	search_key.flags = 0;
-	btrfs_set_key_type(&search_key, BTRFS_INODE_MAP_ITEM_KEY);
-
-	search_start = fs_root->fs_info->last_inode_alloc;
+	search_start = root->last_inode_alloc;
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
 	search_key.offset = 0;
@@ -108,7 +104,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	}
 	// FIXME -ENOSPC
 found:
-	root->fs_info->last_inode_alloc = *objectid;
+	root->last_inode_alloc = *objectid;
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
@@ -118,56 +114,3 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
-
-int btrfs_insert_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *fs_root,
-			   u64 objectid, struct btrfs_key *location)
-{
-	int ret = 0;
-	struct btrfs_path *path;
-	struct btrfs_inode_map_item *inode_item;
-	struct btrfs_key key;
-	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
-
-	key.objectid = objectid;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
-	key.offset = 0;
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	ret = btrfs_insert_empty_item(trans, inode_root, path, &key,
-				      sizeof(struct btrfs_inode_map_item));
-	if (ret)
-		goto out;
-
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				    path->slots[0], struct btrfs_inode_map_item);
-	btrfs_cpu_key_to_disk(&inode_item->key, location);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	if (objectid > fs_root->fs_info->highest_inode)
-		fs_root->fs_info->highest_inode = objectid;
-out:
-	btrfs_release_path(inode_root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_lookup_inode_map(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *fs_root, struct btrfs_path *path,
-			   u64 objectid, int mod)
-{
-	int ret;
-	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	struct btrfs_root *inode_root = fs_root->fs_info->inode_root;
-
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_MAP_ITEM_KEY);
-	ret = btrfs_search_slot(trans, inode_root, &key, path, ins_len, cow);
-	return ret;
-}
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f0da65c4f96..854d47d9bdc 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -10,7 +10,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
-	struct btrfs_inode_map_item *mi;
 	struct btrfs_inode_item *ii;
 	u32 type;
 
@@ -54,13 +53,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			printk("\t\textent data refs %u owner %Lu\n",
 				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
 			break;
-		case BTRFS_INODE_MAP_ITEM_KEY:
-			mi = btrfs_item_ptr(l, i, struct btrfs_inode_map_item);
-			printk("\t\tinode map key %Lu %u %Lu\n",
-			       btrfs_disk_key_objectid(&mi->key),
-			       btrfs_disk_key_flags(&mi->key),
-			       btrfs_disk_key_offset(&mi->key));
-			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index bbe5cabcb42..3e8bfb0e5d7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -45,14 +45,6 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct file_operations btrfs_file_operations;
 
-static int check_inode(struct inode *inode)
-{
-	struct btrfs_inode *ei = BTRFS_I(inode);
-	WARN_ON(ei->magic != 0xDEADBEEF);
-	WARN_ON(ei->magic2 != 0xDEADBEAF);
-	return 0;
-}
-
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -66,15 +58,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
-	check_inode(inode);
-
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		goto make_bad;
 	}
-	check_inode(inode);
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 				  path->slots[0],
 				  struct btrfs_inode_item);
@@ -97,7 +86,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode_item = NULL;
 
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
+
 	switch (inode->i_mode & S_IFMT) {
 #if 0
 	default:
@@ -121,7 +110,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		// inode->i_op = &page_symlink_inode_operations;
 		break;
 	}
-	check_inode(inode);
 	return;
 
 make_bad:
@@ -272,10 +260,7 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct inode *inode)
 {
-	u64 objectid = inode->i_ino;
 	struct btrfs_path *path;
-	struct btrfs_inode_map_item *map;
-	struct btrfs_key stat_data_key;
 	int ret;
 
 	clear_inode(inode);
@@ -283,26 +268,11 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	ret = btrfs_lookup_inode_map(trans, root, path, objectid, -1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto error;
-	}
-	map = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			    struct btrfs_inode_map_item);
-	btrfs_disk_key_to_cpu(&stat_data_key, &map->key);
-	ret = btrfs_del_item(trans, root->fs_info->inode_root, path);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
-error:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -432,7 +402,6 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	check_inode(dir);
 	return ret;
 }
 
@@ -540,9 +509,7 @@ printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_r
 			btrfs_read_locked_inode(inode);
 			unlock_new_inode(inode);
 		}
-		check_inode(inode);
 	}
-	check_inode(dir);
 	return d_splice_alias(inode, dentry);
 }
 
@@ -566,7 +533,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	key.flags = 0;
@@ -575,9 +541,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0) {
+	if (ret < 0)
 		goto err;
-	}
 	advance = 0;
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
@@ -601,8 +566,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
 		if (key_type == BTRFS_DIR_INDEX_KEY &&
-		    btrfs_disk_key_offset(&item->key) >
-		    root->fs_info->highest_inode)
+		    btrfs_disk_key_offset(&item->key) > root->highest_inode)
 			break;
 		if (btrfs_disk_key_type(&item->key) != key_type)
 			continue;
@@ -707,7 +671,6 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
-	check_inode(inode);
 }
 
 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
@@ -721,7 +684,6 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -736,11 +698,11 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	fill_inode_item(inode_item, inode);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	ret = 0;
 failed:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	check_inode(inode);
-	return 0;
+	return ret;
 }
 
 static int btrfs_write_inode(struct inode *inode, int wait)
@@ -757,7 +719,6 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 	else
 		btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
 	return ret;
 }
 
@@ -767,7 +728,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_key *key;
+	struct btrfs_key *location;
 	int ret;
 	u64 objectid;
 
@@ -776,8 +737,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = BTRFS_I(dir)->root;
-	key = &BTRFS_I(inode)->location;
-	check_inode(inode);
 	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	BUG_ON(ret);
 
@@ -788,20 +747,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	fill_inode_item(&inode_item, inode);
-
-	key->objectid = objectid;
-	key->flags = 0;
-	key->offset = 0;
-	btrfs_set_key_type(key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, key);
-	BUG_ON(ret);
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->flags = 0;
+	location->offset = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
 	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
 	BUG_ON(ret);
 
 	insert_inode_hash(inode);
-	check_inode(inode);
-	check_inode(dir);
 	return inode;
 }
 
@@ -825,8 +780,6 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
 	}
-	check_inode(inode);
-	check_inode(dentry->d_parent->d_inode);
 	return ret;
 }
 
@@ -840,7 +793,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 	}
 	if (err > 0)
 		err = -EEXIST;
-	check_inode(inode);
 	return err;
 }
 
@@ -872,8 +824,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	check_inode(inode);
-	check_inode(dir);
 
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -1701,20 +1651,14 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_update_inode(trans, root, root->inode);
 	BUG_ON(ret);
 
-	ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
 	BUG_ON(ret);
 
 	memset(&new_root_item, 0, sizeof(new_root_item));
 	memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
-	key.objectid = objectid;
-	key.flags = 0;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	ret = btrfs_insert_inode_map(trans, root, objectid, &key);
-	BUG_ON(ret);
-
 	key.objectid = objectid;
 	key.offset = 1;
 	key.flags = 0;
@@ -1791,21 +1735,14 @@ static struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
-	ei->magic = 0xDEADBEEF;
-	ei->magic2 = 0xDEADBEAF;
 	return &ei->vfs_inode;
 }
 
 static void btrfs_destroy_inode(struct inode *inode)
 {
-	struct btrfs_inode *ei = BTRFS_I(inode);
-	WARN_ON(ei->magic != 0xDEADBEEF);
-	WARN_ON(ei->magic2 != 0xDEADBEAF);
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
-	ei->magic = 0;
-	ei->magic2 = 0;
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 83a0194ab16..54978d1bd95 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -115,14 +115,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_root *inode_root = fs_info->inode_root;
 
-	btrfs_set_root_blocknr(&inode_root->root_item,
-			       inode_root->node->b_blocknr);
-	ret = btrfs_update_root(trans, tree_root,
-				&inode_root->root_key,
-				&inode_root->root_item);
-	BUG_ON(ret);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == extent_root->node->b_blocknr)
-- 
cgit v1.2.3


From d0dbc6245cefa36e19dff49c557ccf05e3063e9c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 12:36:36 -0400
Subject: Btrfs: drop owner and parentid

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  8 --------
 fs/btrfs/ctree.h       | 25 +------------------------
 fs/btrfs/extent-tree.c |  8 ++------
 fs/btrfs/file-item.c   |  2 +-
 fs/btrfs/print-tree.c  |  4 ++--
 fs/btrfs/transaction.c |  6 ------
 6 files changed, 6 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 39b551564fc..f31ac843bf2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -688,8 +688,6 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&c->header, level);
 	btrfs_set_header_blocknr(&c->header, t->b_blocknr);
 	btrfs_set_header_generation(&c->header, trans->transid);
-	btrfs_set_header_parentid(&c->header,
-	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	lower = btrfs_buffer_node(path->nodes[level-1]);
 	memcpy(c->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(c->header.fsid));
@@ -784,8 +782,6 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
 	btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr);
 	btrfs_set_header_generation(&split->header, trans->transid);
-	btrfs_set_header_parentid(&split->header,
-	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	memcpy(split->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(split->header.fsid));
 	mid = (c_nritems + 1) / 2;
@@ -1145,8 +1141,6 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
 	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
-	btrfs_set_header_parentid(&right->header,
-	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(right->header.fsid));
 	if (mid <= slot) {
@@ -1237,8 +1231,6 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
 	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
-	btrfs_set_header_parentid(&right->header,
-	      btrfs_header_parentid(btrfs_buffer_header(root->node)));
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(right->header.fsid));
 	btrfs_cpu_key_to_disk(&disk_key, ins_key);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5103709bb2b..0ba560f0d2a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -60,7 +60,6 @@ struct btrfs_header {
 	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
-	__le64 parentid; /* objectid of the tree root */
 	__le16 nritems;
 	__le16 flags;
 	u8 level;
@@ -147,7 +146,6 @@ struct btrfs_path {
  */
 struct btrfs_extent_item {
 	__le32 refs;
-	__le64 owner;
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
@@ -443,16 +441,6 @@ static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts,
 	ts->nsec = cpu_to_le32(val);
 }
 
-static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
-{
-	return le64_to_cpu(ei->owner);
-}
-
-static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val)
-{
-	ei->owner = cpu_to_le64(val);
-}
-
 static inline u32 btrfs_extent_refs(struct btrfs_extent_item *ei)
 {
 	return le32_to_cpu(ei->refs);
@@ -652,17 +640,6 @@ static inline void btrfs_set_header_generation(struct btrfs_header *h,
 	h->generation = cpu_to_le64(val);
 }
 
-static inline u64 btrfs_header_parentid(struct btrfs_header *h)
-{
-	return le64_to_cpu(h->parentid);
-}
-
-static inline void btrfs_set_header_parentid(struct btrfs_header *h,
-					     u64 parentid)
-{
-	h->parentid = cpu_to_le64(parentid);
-}
-
 static inline u16 btrfs_header_nritems(struct btrfs_header *h)
 {
 	return le16_to_cpu(h->nritems);
@@ -919,7 +896,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root, u64 num_blocks, u64 search_start, u64
-			search_end, u64 owner, struct btrfs_key *ins);
+			search_end, struct btrfs_key *ins);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index efc604eea0b..be9630df506 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -157,8 +157,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	struct btrfs_fs_info *info = extent_root->fs_info;
 
 	btrfs_set_extent_refs(&extent_item, 1);
-	btrfs_set_extent_owner(&extent_item,
-		btrfs_header_parentid(btrfs_buffer_header(extent_root->node)));
 	ins.offset = 1;
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
@@ -457,7 +455,7 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root, u64 num_blocks, u64 search_start, u64
-			search_end, u64 owner, struct btrfs_key *ins)
+			search_end, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -467,7 +465,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item extent_item;
 
 	btrfs_set_extent_refs(&extent_item, 1);
-	btrfs_set_extent_owner(&extent_item, owner);
 
 	if (root == extent_root) {
 		BUG_ON(extent_root->fs_info->current_insert.offset == 0);
@@ -510,8 +507,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	int ret;
 	struct buffer_head *buf;
 
-	ret = btrfs_alloc_extent(trans, root, 1, 0, (unsigned long)-1,
-		btrfs_header_parentid(btrfs_buffer_header(root->node)), &ins);
+	ret = btrfs_alloc_extent(trans, root, 1, 0, (unsigned long)-1, &ins);
 	if (ret) {
 		BUG();
 		return NULL;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2c98b620c59..d9fd7f4d379 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -19,7 +19,7 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	btrfs_init_path(path);
 	ret = btrfs_alloc_extent(trans, root, num_blocks, hint_block,
-				 (u64)-1, objectid, &ins);
+				 (u64)-1, &ins);
 	BUG_ON(ret);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 854d47d9bdc..0732a2fbb23 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -50,8 +50,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk("\t\textent data refs %u owner %Lu\n",
-				btrfs_extent_refs(ei), btrfs_extent_owner(ei));
+			printk("\t\textent data refs %u\n",
+				btrfs_extent_refs(ei));
 			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 54978d1bd95..f64c1729b0e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -163,7 +163,6 @@ int add_dirty_roots(struct btrfs_trans_handle *trans,
 	int i;
 	int ret;
 	int err;
-printk("add dirty\n");
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 						 ARRAY_SIZE(gang),
@@ -188,7 +187,6 @@ printk("add dirty\n");
 			dirty->commit_root = root->commit_root;
 			root->commit_root = NULL;
 			dirty->root = root;
-printk("adding dirty root %Lu gen %Lu blocknr %Lu\n", root->root_key.objectid, root->root_key.offset, dirty->commit_root->b_blocknr);
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
 					       root->node->b_blocknr);
@@ -199,7 +197,6 @@ printk("adding dirty root %Lu gen %Lu blocknr %Lu\n", root->root_key.objectid, r
 			list_add(&dirty->list, list);
 		}
 	}
-printk("add dirty done\n");
 	return 0;
 }
 
@@ -213,12 +210,10 @@ int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list)
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 		trans = btrfs_start_transaction(tree_root, 1);
-printk("drop snapshot root %p, commit_root blocknr %Lu generation %Lu\n", dirty->root, dirty->commit_root->b_blocknr, dirty->snap_key.offset);
 		ret = btrfs_drop_snapshot(trans, dirty->root,
 					  dirty->commit_root);
 		BUG_ON(ret);
 
-printk("del root objectid %Lu, offset %Lu\n", dirty->snap_key.objectid, dirty->snap_key.offset);
 		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
 		BUG_ON(ret);
 		ret = btrfs_end_transaction(trans, tree_root);
@@ -240,7 +235,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
-printk("already in commit!, waiting\n");
 		cur_trans = trans->transaction;
 		trans->transaction->use_count++;
 		btrfs_end_transaction(trans, root);
-- 
cgit v1.2.3


From 48ddc6f44b5a167c00573b6012727d674e9c667a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 13:20:21 -0400
Subject: Btrfs: cow file extents before writing

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3e8bfb0e5d7..91aa1b4c50a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1020,7 +1020,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       inode->i_ino,
-				       iblock << inode->i_blkbits, 0);
+				       iblock << inode->i_blkbits, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -1040,9 +1040,25 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	blocknr = btrfs_file_extent_disk_blocknr(item);
 	blocknr += btrfs_file_extent_offset(item);
 
-	/* exact match found, use it */
+	/* exact match found, use it, FIXME, deal with extents
+	 * other than the page size
+	 */
 	if (ret == 0) {
 		err = 0;
+		BUG_ON(btrfs_file_extent_disk_num_blocks(item) != 1);
+		if (btrfs_file_extent_generation(item) != trans->transid) {
+			struct btrfs_key ins;
+			ret = btrfs_alloc_extent(trans, root, 1,
+						 blocknr, (u64)-1, &ins);
+			BUG_ON(ret);
+			btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
+			mark_buffer_dirty(path->nodes[0]);
+			ret = btrfs_free_extent(trans, root,
+						blocknr, 1, 0);
+			BUG_ON(ret);
+			blocknr = ins.objectid;
+
+		}
 		map_bh(result, inode->i_sb, blocknr);
 		goto out;
 	}
-- 
cgit v1.2.3


From 79b2cb1fa6430501907cb1384abbddf074833e45 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 13:51:41 -0400
Subject: Btrfs: only cow in get_block when create==1

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 91aa1b4c50a..a3183972f31 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1020,7 +1020,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       inode->i_ino,
-				       iblock << inode->i_blkbits, 1);
+				       iblock << inode->i_blkbits, create);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -1046,7 +1046,8 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (ret == 0) {
 		err = 0;
 		BUG_ON(btrfs_file_extent_disk_num_blocks(item) != 1);
-		if (btrfs_file_extent_generation(item) != trans->transid) {
+		if (create &&
+		    btrfs_file_extent_generation(item) != trans->transid) {
 			struct btrfs_key ins;
 			ret = btrfs_alloc_extent(trans, root, 1,
 						 blocknr, (u64)-1, &ins);
-- 
cgit v1.2.3


From 2932f3ec94ff0212313b8187064f93bcb76ce411 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 14:22:02 -0400
Subject: Btrfs: when forced to cow for file_write, get the page uptodate first

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a3183972f31..b93d790e748 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1268,6 +1268,8 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 			 struct page **pages,
 			 size_t num_pages,
 			 loff_t pos,
+			 unsigned long first_index,
+			 unsigned long last_index,
 			 size_t write_bytes)
 {
 	int i;
@@ -1289,6 +1291,13 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 		}
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		if (!PageUptodate(pages[i]) &&
+		   (pages[i]->index == first_index ||
+		    pages[i]->index == last_index) && pos < isize) {
+			ret = mpage_readpage(pages[i], btrfs_get_block);
+			BUG_ON(ret);
+			lock_page(pages[i]);
+		}
 		ret = nobh_prepare_write(pages[i], offset,
 					 offset + this_write,
 					 btrfs_get_block);
@@ -1323,6 +1332,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *pages[1];
+	unsigned long first_index;
+	unsigned long last_index;
 
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
@@ -1340,13 +1351,15 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;
 	file_update_time(file);
 	mutex_lock(&inode->i_mutex);
+	first_index = pos >> PAGE_CACHE_SHIFT;
+	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 	while(count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
 		ret = prepare_pages(NULL, root, file, pages, num_pages,
-				    pos, write_bytes);
+				    pos, first_index, last_index, write_bytes);
 		BUG_ON(ret);
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
-- 
cgit v1.2.3


From 2619ba1f0ff9540a9d84683310a1e350b5efde3d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 16:58:11 -0400
Subject: Btrfs: subvolumes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   1 +
 fs/btrfs/disk-io.c     |  38 +++++++----
 fs/btrfs/super.c       | 178 +++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/transaction.c |   8 ++-
 4 files changed, 182 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 73d4b842a31..4cd11a1b4b3 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,6 +8,7 @@
 * Check compat and incompat flags on the inode
 * Add virtual filesystems, mountable snapshots
 * Get rid of struct ctree_path, limiting tree levels held at one time
+* EEXIST for dirs instead of hash overflow
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 760fdc9a766..ff69162829b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -301,6 +301,12 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	int ret = 0;
 
 printk("read_fs_root looking for %Lu %Lu %u\n", location->objectid, location->offset, location->flags);
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)location->objectid);
+	if (root) {
+printk("found %p in cache\n", root);
+		return root;
+	}
 	root = kmalloc(sizeof(*root), GFP_NOFS);
 	if (!root) {
 printk("failed1\n");
@@ -349,7 +355,8 @@ out:
 insert:
 printk("inserting %p\n", root);
 	root->ref_cows = 1;
-	ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root,
+	ret = radix_tree_insert(&fs_info->fs_roots_radix,
+				(unsigned long)root->root_key.objectid,
 				root);
 	if (ret) {
 printk("radix_tree_insert gives us %d\n", ret);
@@ -460,6 +467,20 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
+static int free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+	radix_tree_delete(&fs_info->fs_roots_radix,
+			  (unsigned long)root->root_key.objectid);
+	if (root->inode)
+		iput(root->inode);
+	if (root->node)
+		brelse(root->node);
+	if (root->commit_root)
+		brelse(root->commit_root);
+	kfree(root);
+	return 0;
+}
+
 int del_fs_roots(struct btrfs_fs_info *fs_info)
 {
 	int ret;
@@ -472,19 +493,8 @@ int del_fs_roots(struct btrfs_fs_info *fs_info)
 					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			radix_tree_delete(&fs_info->fs_roots_radix,
-					  (unsigned long)gang[i]);
-			if (gang[i]->inode)
-				iput(gang[i]->inode);
-			else
-				printk("no inode for root %p\n", gang[i]);
-			if (gang[i]->node)
-				brelse(gang[i]->node);
-			if (gang[i]->commit_root)
-				brelse(gang[i]->commit_root);
-			kfree(gang[i]);
-		}
+		for (i = 0; i < ret; i++)
+			free_fs_root(fs_info, gang[i]);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b93d790e748..84abdde4e30 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -495,10 +495,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(-EACCES);
 		if (inode->i_state & I_NEW) {
 			if (sub_root != root) {
-				ret = radix_tree_insert(
-						&root->fs_info->fs_roots_radix,
-						(unsigned long)sub_root,
-						sub_root);
 printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
 				igrab(inode);
 				sub_root->inode = inode;
@@ -723,22 +719,19 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
-				     struct inode *dir, int mode)
+				     struct btrfs_root *root,
+				     u64 objectid, int mode)
 {
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_key *location;
 	int ret;
-	u64 objectid;
 
-	inode = new_inode(dir->i_sb);
+	inode = new_inode(root->fs_info->sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	BTRFS_I(inode)->root = BTRFS_I(dir)->root;
-	ret = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	BUG_ON(ret);
+	BTRFS_I(inode)->root = root;
 
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
@@ -804,10 +797,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err;
 	int drop_inode = 0;
+	u64 objectid;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	inode = btrfs_new_inode(trans, dir, mode);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -833,9 +834,9 @@ out_unlock:
 }
 
 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
-				struct inode *inode, struct inode *dir)
+				struct btrfs_root *root,
+				u64 objectid, u64 dirid)
 {
-	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret;
 	char buf[2];
 	struct btrfs_key key;
@@ -843,22 +844,20 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 	buf[0] = '.';
 	buf[1] = '.';
 
-	key.objectid = inode->i_ino;
+	key.objectid = objectid;
 	key.offset = 0;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 
-	ret = btrfs_insert_dir_item(trans, root, buf, 1, inode->i_ino,
+	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
 				    &key, 1);
 	if (ret)
 		goto error;
-	key.objectid = dir->i_ino;
-	ret = btrfs_insert_dir_item(trans, root, buf, 2, inode->i_ino,
+	key.objectid = dirid;
+	ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
 				    &key, 1);
 	if (ret)
 		goto error;
-	inode->i_size = 6;
-	ret = btrfs_update_inode(trans, root, inode);
 error:
 	return ret;
 }
@@ -870,6 +869,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
 	int drop_on_err = 0;
+	u64 objectid;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -877,7 +877,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = PTR_ERR(trans);
 		goto out_unlock;
 	}
-	inode = btrfs_new_inode(trans, dir, S_IFDIR | mode);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -886,7 +893,12 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 
-	err = btrfs_make_empty_dir(trans, inode, dir);
+	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
+	if (err)
+		goto out_fail;
+
+	inode->i_size = 6;
+	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		goto out_fail;
 	err = btrfs_add_link(trans, dentry, inode);
@@ -1666,6 +1678,102 @@ static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	return retval;
 }
 
+static int create_subvol(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct buffer_head *subvol;
+	struct btrfs_leaf *leaf;
+	struct btrfs_root *new_root;
+	struct inode *inode;
+	int ret;
+	u64 objectid;
+	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	subvol = btrfs_alloc_free_block(trans, root);
+	leaf = btrfs_buffer_leaf(subvol);
+	btrfs_set_header_nritems(&leaf->header, 0);
+	btrfs_set_header_level(&leaf->header, 0);
+	btrfs_set_header_blocknr(&leaf->header, subvol->b_blocknr);
+	btrfs_set_header_generation(&leaf->header, trans->transid);
+	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(leaf->header.fsid));
+
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	btrfs_set_inode_generation(inode_item, 1);
+	btrfs_set_inode_size(inode_item, 3);
+	btrfs_set_inode_nlink(inode_item, 1);
+	btrfs_set_inode_nblocks(inode_item, 1);
+	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_blocknr(&root_item, subvol->b_blocknr);
+	btrfs_set_root_refs(&root_item, 1);
+
+	mark_buffer_dirty(subvol);
+	brelse(subvol);
+	subvol = NULL;
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	BUG_ON(ret);
+
+	btrfs_set_root_dirid(&root_item, new_dirid);
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&root_item);
+	BUG_ON(ret);
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen,
+				    root->fs_info->sb->s_root->d_inode->i_ino,
+				    &key, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+
+	new_root = btrfs_read_fs_root(root->fs_info, &key);
+	BUG_ON(!new_root);
+
+	trans = btrfs_start_transaction(new_root, 1);
+	BUG_ON(!trans);
+
+	inode = btrfs_new_inode(trans, new_root, new_dirid, S_IFDIR | 0700);
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+
+	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
+	BUG_ON(ret);
+
+	inode->i_nlink = 1;
+	inode->i_size = 6;
+	ret = btrfs_update_inode(trans, new_root, inode);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, new_root);
+	BUG_ON(ret);
+
+	iput(inode);
+
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return 0;
+}
+
 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 {
 	struct btrfs_trans_handle *trans;
@@ -1674,6 +1782,9 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	int ret;
 	u64 objectid;
 
+	if (!root->ref_cows)
+		return -EINVAL;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -1685,7 +1796,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 				       0, &objectid);
 	BUG_ON(ret);
 
-	memset(&new_root_item, 0, sizeof(new_root_item));
 	memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
@@ -1728,9 +1838,9 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 	struct btrfs_ioctl_vol_args vol_args;
 	int ret;
 	int namelen;
+	struct btrfs_path *path;
+	u64 root_dirid;
 
-	if (!root->ref_cows)
-		return -EINVAL;
 	switch (cmd) {
 	case BTRFS_IOC_SNAP_CREATE:
 		if (copy_from_user(&vol_args,
@@ -1740,7 +1850,23 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		namelen = strlen(vol_args.name);
 		if (namelen > BTRFS_VOL_NAME_MAX)
 			return -EINVAL;
-		ret = create_snapshot(root, vol_args.name, namelen);
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+		root_dirid = btrfs_root_dirid(
+				      &root->fs_info->tree_root->root_item);
+		mutex_lock(&root->fs_info->fs_mutex);
+		ret = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+				    path, root_dirid,
+				    vol_args.name, namelen, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		if (ret == 0)
+			return -EEXIST;
+
+		if (root == root->fs_info->tree_root)
+			ret = create_subvol(root, vol_args.name, namelen);
+		else
+			ret = create_snapshot(root, vol_args.name, namelen);
 		WARN_ON(ret);
 		break;
 	default:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f64c1729b0e..e15a072407b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -64,7 +64,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	if (root != root->fs_info->tree_root && root->last_trans <
 	    running_trans_id) {
 		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-				   (unsigned long)root, BTRFS_ROOT_TRANS_TAG);
+				   (unsigned long)root->root_key.objectid,
+				   BTRFS_ROOT_TRANS_TAG);
 		root->commit_root = root->node;
 		get_bh(root->node);
 	}
@@ -171,8 +172,9 @@ int add_dirty_roots(struct btrfs_trans_handle *trans,
 			break;
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
-			radix_tree_tag_clear(radix, (unsigned long)root,
-					     BTRFS_ROOT_TRANS_TAG);
+			radix_tree_tag_clear(radix,
+				     (unsigned long)root->root_key.objectid,
+				     BTRFS_ROOT_TRANS_TAG);
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->b_blocknr !=
 					btrfs_root_blocknr(&root->root_item));
-- 
cgit v1.2.3


From 2d13d8d065cc7bbee6b1ad6ee3fe965d66badfe6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 10 Apr 2007 20:07:20 -0400
Subject: Btrfs: detect duplicate subvol names

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 84abdde4e30..2972aaa9eb7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1853,13 +1853,13 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		path = btrfs_alloc_path();
 		if (!path)
 			return -ENOMEM;
-		root_dirid = btrfs_root_dirid(
-				      &root->fs_info->tree_root->root_item);
+		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
 		mutex_lock(&root->fs_info->fs_mutex);
 		ret = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
 				    path, root_dirid,
 				    vol_args.name, namelen, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
+		btrfs_free_path(path);
 		if (ret == 0)
 			return -EEXIST;
 
-- 
cgit v1.2.3


From cac87faa09f56776602d4b6f15c1bd44f6da300e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Apr 2007 08:59:20 -0400
Subject: Btrfs: use a dedicated inode num for root root dir

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO    | 9 ---------
 fs/btrfs/ctree.h | 3 ++-
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 4cd11a1b4b3..1c3acfcf146 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -6,9 +6,7 @@
 * Make sure nobh stuff is working properly for cows
 * Do actual block accounting
 * Check compat and incompat flags on the inode
-* Add virtual filesystems, mountable snapshots
 * Get rid of struct ctree_path, limiting tree levels held at one time
-* EEXIST for dirs instead of hash overflow
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
@@ -20,10 +18,3 @@
 * fsck
 * Scrub & defrag
 
----
-metata FS ideas:
-mount metadata FS on /sys/fs/btrfs/dev/
-subvolumes in /sys/fs/btrfs/dev/fs/
-snapshots in /sys/fs/btrfs/dev/fs/snapshots
-mount -o bind
-ioctl to create a snapshot
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ba560f0d2a..3f3a23f52a3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -15,7 +15,8 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
 #define BTRFS_FS_TREE_OBJECTID 3ULL
-#define BTRFS_FIRST_FREE_OBJECTID 4ULL
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
+#define BTRFS_FIRST_FREE_OBJECTID 5ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
-- 
cgit v1.2.3


From 0bd93ba022f5c1ddb4e5b722b02f3c4a4a8865ad Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Apr 2007 13:57:44 -0400
Subject: Btrfs: early support for multiple devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/disk-io.c | 11 +++++++++
 2 files changed, 76 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3f3a23f52a3..01310de2bf4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -13,10 +13,11 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_MAGIC "_BtRfS_M"
 
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
-#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
-#define BTRFS_FS_TREE_OBJECTID 3ULL
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
-#define BTRFS_FIRST_FREE_OBJECTID 5ULL
+#define BTRFS_DEV_TREE_OBJECTID 2ULL
+#define BTRFS_EXTENT_TREE_OBJECTID 3ULL
+#define BTRFS_FS_TREE_OBJECTID 4ULL
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 5ULL
+#define BTRFS_FIRST_FREE_OBJECTID 6ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
@@ -90,6 +91,10 @@ struct btrfs_super_block {
 	__le64 total_blocks;
 	__le64 blocks_used;
 	__le64 root_dir_objectid;
+	/* fields below here vary with the underlying disk */
+	__le64 device_block_start;
+	__le64 device_num_blocks;
+	__le64 device_root;
 } __attribute__ ((__packed__));
 
 /*
@@ -223,10 +228,15 @@ struct btrfs_csum_item {
 	u8 csum[BTRFS_CSUM_SIZE];
 } __attribute__ ((__packed__));
 
+struct btrfs_device_item {
+	__le16 pathlen;
+} __attribute__ ((__packed__));
+
 struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	struct radix_tree_root fs_roots_radix;
@@ -312,11 +322,16 @@ struct btrfs_root {
  */
 #define BTRFS_EXTENT_ITEM_KEY	8
 
+/*
+ * dev items list the devices that make up the FS
+ */
+#define BTRFS_DEV_ITEM_KEY	9
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	9
+#define BTRFS_STRING_ITEM_KEY	10
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -782,6 +797,40 @@ static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64
 	s->root_dir_objectid = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_super_device_block_start(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->device_block_start);
+}
+
+static inline void btrfs_set_super_device_block_start(struct btrfs_super_block
+						      *s, u64 val)
+{
+	s->device_block_start = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_device_num_blocks(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->device_num_blocks);
+}
+
+static inline void btrfs_set_super_device_num_blocks(struct btrfs_super_block
+						     *s, u64 val)
+{
+	s->device_num_blocks = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_device_root(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->device_root);
+}
+
+static inline void btrfs_set_super_device_root(struct btrfs_super_block
+						      *s, u64 val)
+{
+	s->device_root = cpu_to_le64(val);
+}
+
+
 static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
 	return (u8 *)l->items;
@@ -849,6 +898,17 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
+static inline u16 btrfs_device_pathlen(struct btrfs_device_item *d)
+{
+	return le16_to_cpu(d->pathlen);
+}
+
+static inline void btrfs_set_device_pathlen(struct btrfs_device_item *d,
+						u16 val)
+{
+	d->pathlen = cpu_to_le16(val);
+}
+
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ff69162829b..e09233262af 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -378,6 +378,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 {
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
+	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
+						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
@@ -392,6 +394,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->dev_root = dev_root;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -414,6 +417,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
+	__setup_root(sb->s_blocksize, dev_root,
+		     fs_info, BTRFS_DEV_TREE_OBJECTID);
+
 	__setup_root(sb->s_blocksize, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 	fs_info->sb_buffer = read_tree_block(tree_root,
@@ -427,6 +433,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		return NULL;
 
 	fs_info->disk_super = disk_super;
+	dev_root->node = read_tree_block(tree_root,
+					  btrfs_super_device_root(disk_super));
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
@@ -519,6 +527,9 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->extent_root->node)
 		btrfs_block_release(fs_info->extent_root,
 				    fs_info->extent_root->node);
+	if (fs_info->dev_root->node)
+		btrfs_block_release(fs_info->dev_root,
+				    fs_info->dev_root->node);
 	if (fs_info->tree_root->node)
 		btrfs_block_release(fs_info->tree_root,
 				    fs_info->tree_root->node);
-- 
cgit v1.2.3


From 7eccb903a817e890c947ba4bc90c6a9af9b4219a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Apr 2007 15:53:25 -0400
Subject: Btrfs: create a logical->physical block number mapping scheme

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  38 +++++++-------
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/disk-io.c     | 140 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/disk-io.h     |   1 +
 fs/btrfs/extent-tree.c |   8 +--
 fs/btrfs/super.c       |   6 +--
 fs/btrfs/transaction.c |   8 +--
 7 files changed, 134 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f31ac843bf2..e2fc6f0d024 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -66,21 +66,21 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
 		WARN_ON(1);
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
-	btrfs_set_header_blocknr(&cow_node->header, cow->b_blocknr);
+	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
 		get_bh(cow);
 		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
+			btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
 		}
 		btrfs_block_release(root, buf);
 	} else {
 		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
-					cow->b_blocknr);
+					bh_blocknr(cow));
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_extent(trans, root, buf->b_blocknr, 1, 1);
+		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
 	}
 	btrfs_block_release(root, buf);
 	mark_buffer_dirty(cow);
@@ -312,7 +312,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	 */
 	if (!parent_buf) {
 		struct buffer_head *child;
-		u64 blocknr = mid_buf->b_blocknr;
+		u64 blocknr = bh_blocknr(mid_buf);
 
 		if (btrfs_header_nritems(&mid->header) != 1)
 			return 0;
@@ -361,7 +361,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
-			u64 blocknr = right_buf->b_blocknr;
+			u64 blocknr = bh_blocknr(right_buf);
 			clean_tree_block(trans, root, right_buf);
 			wait_on_buffer(right_buf);
 			btrfs_block_release(root, right_buf);
@@ -400,7 +400,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	if (btrfs_header_nritems(&mid->header) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 blocknr = mid_buf->b_blocknr;
+		u64 blocknr = bh_blocknr(mid_buf);
 		clean_tree_block(trans, root, mid_buf);
 		wait_on_buffer(mid_buf);
 		btrfs_block_release(root, mid_buf);
@@ -686,7 +686,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
 	btrfs_set_header_level(&c->header, level);
-	btrfs_set_header_blocknr(&c->header, t->b_blocknr);
+	btrfs_set_header_blocknr(&c->header, bh_blocknr(t));
 	btrfs_set_header_generation(&c->header, trans->transid);
 	lower = btrfs_buffer_node(path->nodes[level-1]);
 	memcpy(c->header.fsid, root->fs_info->disk_super->fsid,
@@ -697,7 +697,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		lower_key = &lower->ptrs[0].key;
 	btrfs_memcpy(root, c, &c->ptrs[0].key, lower_key,
 		     sizeof(struct btrfs_disk_key));
-	btrfs_set_node_blockptr(c, 0, path->nodes[level - 1]->b_blocknr);
+	btrfs_set_node_blockptr(c, 0, bh_blocknr(path->nodes[level - 1]));
 
 	btrfs_mark_buffer_dirty(t);
 
@@ -780,7 +780,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
-	btrfs_set_header_blocknr(&split->header, split_buffer->b_blocknr);
+	btrfs_set_header_blocknr(&split->header, bh_blocknr(split_buffer));
 	btrfs_set_header_generation(&split->header, trans->transid);
 	memcpy(split->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(split->header.fsid));
@@ -794,7 +794,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(t);
 	btrfs_mark_buffer_dirty(split_buffer);
 	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
-			  split_buffer->b_blocknr, path->slots[level + 1] + 1,
+			  bh_blocknr(split_buffer), path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
 		ret = wret;
@@ -1138,7 +1138,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
-	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
+	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
 	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
@@ -1152,7 +1152,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				btrfs_set_header_nritems(&right->header, 0);
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
-						  right_buffer->b_blocknr,
+						  bh_blocknr(right_buffer),
 						  path->slots[1] + 1, 1);
 				if (wret)
 					ret = wret;
@@ -1173,7 +1173,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				btrfs_set_header_nritems(&right->header, 0);
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
-						  right_buffer->b_blocknr,
+						  bh_blocknr(right_buffer),
 						  path->slots[1] - 1, 1);
 				if (wret)
 					ret = wret;
@@ -1207,7 +1207,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&l->header, mid);
 	ret = 0;
 	wret = insert_ptr(trans, root, path, &right->items[0].key,
-			  right_buffer->b_blocknr, path->slots[1] + 1, 1);
+			  bh_blocknr(right_buffer), path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
 	btrfs_mark_buffer_dirty(right_buffer);
@@ -1228,7 +1228,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
-	btrfs_set_header_blocknr(&right->header, right_buffer->b_blocknr);
+	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
 	btrfs_set_header_generation(&right->header, trans->transid);
 	btrfs_set_header_level(&right->header, 0);
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
@@ -1237,7 +1237,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_nritems(&right->header, 0);
 	wret = insert_ptr(trans, root, path,
 			  &disk_key,
-			  right_buffer->b_blocknr,
+			  bh_blocknr(right_buffer),
 			  path->slots[1], 1);
 	if (wret)
 		ret = wret;
@@ -1456,7 +1456,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
-						 leaf_buf->b_blocknr, 1, 1);
+						 bh_blocknr(leaf_buf), 1, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -1487,7 +1487,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					ret = wret;
 			}
 			if (btrfs_header_nritems(&leaf->header) == 0) {
-				u64 blocknr = leaf_buf->b_blocknr;
+				u64 blocknr = bh_blocknr(leaf_buf);
 				clean_tree_block(trans, root, leaf_buf);
 				wait_on_buffer(leaf_buf);
 				wret = del_ptr(trans, root, path, 1, slot);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 01310de2bf4..454eb88611b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -242,6 +242,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
+	struct radix_tree_root dev_radix;
 	u64 generation;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e09233262af..c872a7e67ab 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -10,10 +10,30 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 
+struct dev_lookup {
+	u64 block_start;
+	u64 num_blocks;
+	struct block_device *bdev;
+};
+
+u64 bh_blocknr(struct buffer_head *bh)
+{
+	int blkbits = bh->b_page->mapping->host->i_blkbits;
+	u64 blocknr = bh->b_page->index << (PAGE_CACHE_SHIFT - blkbits);
+	unsigned long offset;
+
+	if (PageHighMem(bh->b_page))
+		offset = (unsigned long)bh->b_data;
+	else
+		offset = bh->b_data - (char *)page_address(bh->b_page);
+	blocknr += offset >> (PAGE_CACHE_SHIFT - blkbits);
+	return blocknr;
+}
+
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
 	struct btrfs_node *node = btrfs_buffer_node(buf);
-	if (buf->b_blocknr != btrfs_header_blocknr(&node->header)) {
+	if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) {
 		BUG();
 	}
 	return 0;
@@ -40,7 +60,7 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 	head = page_buffers(page);
 	bh = head;
 	do {
-		if (buffer_mapped(bh) && bh->b_blocknr == blocknr) {
+		if (buffer_mapped(bh) && bh_blocknr(bh) == blocknr) {
 			ret = bh;
 			get_bh(bh);
 			goto out_unlock;
@@ -56,6 +76,33 @@ out_unlock:
 	return ret;
 }
 
+static int map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
+			     u64 logical)
+{
+	struct dev_lookup *lookup[2];
+	char b[BDEVNAME_SIZE];
+
+	int ret;
+
+	root = root->fs_info->dev_root;
+	ret = radix_tree_gang_lookup(&root->fs_info->dev_radix,
+				     (void **)lookup,
+				     (unsigned long)logical,
+				     ARRAY_SIZE(lookup));
+	if (ret == 0 || lookup[0]->block_start > logical ||
+	    lookup[0]->block_start + lookup[0]->num_blocks <= logical) {
+		ret = -ENOENT;
+		goto out;
+	}
+	bh->b_bdev = lookup[0]->bdev;
+	bh->b_blocknr = logical - lookup[0]->block_start;
+printk("logical mapping %Lu to %lu bdev  %s\n", logical, bh->b_blocknr, bdevname(bh->b_bdev, b));
+	set_buffer_mapped(bh);
+	ret = 0;
+out:
+	return ret;
+}
+
 struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr)
 {
@@ -66,6 +113,7 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	struct buffer_head *bh;
 	struct buffer_head *head;
 	struct buffer_head *ret = NULL;
+	int err;
 	u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits);
 
 	page = grab_cache_page(mapping, index);
@@ -78,11 +126,10 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	bh = head;
 	do {
 		if (!buffer_mapped(bh)) {
-			bh->b_bdev = root->fs_info->sb->s_bdev;
-			bh->b_blocknr = first_block;
-			set_buffer_mapped(bh);
+			err = map_bh_to_logical(root, bh, first_block);
+			BUG_ON(err);
 		}
-		if (bh->b_blocknr == blocknr) {
+		if (bh_blocknr(bh) == blocknr) {
 			ret = bh;
 			get_bh(bh);
 			goto out_unlock;
@@ -98,38 +145,13 @@ out_unlock:
 	return ret;
 }
 
-static sector_t max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 static int btree_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh, int create)
 {
-	if (iblock >= max_block(inode->i_sb->s_bdev)) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
-	bh->b_bdev = inode->i_sb->s_bdev;
-	bh->b_blocknr = iblock;
-	set_buffer_mapped(bh);
-	return 0;
+	int err;
+	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
+	err = map_bh_to_logical(root, bh, iblock);
+	return err;
 }
 
 int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
@@ -164,8 +186,8 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 		return ret;
 	if (verify) {
 		if (memcmp(bh->b_data, result, BTRFS_CSUM_SIZE)) {
-			printk("checksum verify failed on %lu\n",
-			       bh->b_blocknr);
+			printk("checksum verify failed on %Lu\n",
+			       bh_blocknr(bh));
 			return 1;
 		}
 	} else {
@@ -386,10 +408,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 						GFP_NOFS);
 	int ret;
 	struct btrfs_super_block *disk_super;
+	struct dev_lookup *dev_lookup;
 
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
@@ -422,6 +446,13 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	__setup_root(sb->s_blocksize, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
+
+	dev_lookup = kmalloc(sizeof(*dev_lookup), GFP_NOFS);
+	dev_lookup->block_start = 0;
+	dev_lookup->num_blocks = (u32)-2;
+	dev_lookup->bdev = sb->s_bdev;
+	ret = radix_tree_insert(&fs_info->dev_radix, (u32)-2, dev_lookup);
+	BUG_ON(ret);
 	fs_info->sb_buffer = read_tree_block(tree_root,
 					     BTRFS_SUPER_INFO_OFFSET /
 					     sb->s_blocksize);
@@ -432,6 +463,14 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!btrfs_super_root(disk_super))
 		return NULL;
 
+	radix_tree_delete(&fs_info->dev_radix, (u32)-2);
+	dev_lookup->block_start = btrfs_super_device_block_start(disk_super);
+	dev_lookup->num_blocks = btrfs_super_device_num_blocks(disk_super);
+	ret = radix_tree_insert(&fs_info->dev_radix,
+				dev_lookup->block_start +
+				dev_lookup->num_blocks, dev_lookup);
+	BUG_ON(ret);
+
 	fs_info->disk_super = disk_super;
 	dev_root->node = read_tree_block(tree_root,
 					  btrfs_super_device_root(disk_super));
@@ -459,7 +498,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *bh = root->fs_info->sb_buffer;
 
 	btrfs_set_super_root(root->fs_info->disk_super,
-			     root->fs_info->tree_root->node->b_blocknr);
+			     bh_blocknr(root->fs_info->tree_root->node));
 	lock_buffer(bh);
 	WARN_ON(atomic_read(&bh->b_count) < 1);
 	clear_buffer_dirty(bh);
@@ -506,6 +545,29 @@ int del_fs_roots(struct btrfs_fs_info *fs_info)
 	}
 	return 0;
 }
+static int free_dev_radix(struct btrfs_fs_info *fs_info)
+{
+	struct dev_lookup *lookup[8];
+	struct block_device *super_bdev = fs_info->sb->s_bdev;
+	int ret;
+	int i;
+	while(1) {
+		ret = radix_tree_gang_lookup(&fs_info->dev_radix,
+					     (void **)lookup, 0,
+					     ARRAY_SIZE(lookup));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			if (lookup[i]->bdev != super_bdev)
+				close_bdev_excl(lookup[i]->bdev);
+			radix_tree_delete(&fs_info->dev_radix,
+					  lookup[i]->block_start +
+					  lookup[i]->num_blocks);
+			kfree(lookup[i]);
+		}
+	}
+	return 0;
+}
 
 int close_ctree(struct btrfs_root *root)
 {
@@ -537,6 +599,8 @@ int close_ctree(struct btrfs_root *root)
 	crypto_free_hash(fs_info->hash_tfm);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
+
+	free_dev_radix(fs_info);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 0ef6e6f714a..46cc5dc9d57 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -41,4 +41,5 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location);
+u64 bh_blocknr(struct buffer_head *bh);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be9630df506..d560831c10a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -80,7 +80,7 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root)
 {
-	return inc_block_ref(trans, root, root->node->b_blocknr, 1);
+	return inc_block_ref(trans, root, bh_blocknr(root->node), 1);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -563,7 +563,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = lookup_block_ref(trans, root, path->nodes[*level]->b_blocknr,
+	ret = lookup_block_ref(trans, root, bh_blocknr(path->nodes[*level]),
 			       1, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -607,7 +607,7 @@ out:
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = btrfs_free_extent(trans, root,
-				path->nodes[*level]->b_blocknr, 1, 1);
+				bh_blocknr(path->nodes[*level]), 1, 1);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -635,7 +635,7 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
-						path->nodes[*level]->b_blocknr,
+						bh_blocknr(path->nodes[*level]),
 						1, 1);
 			BUG_ON(ret);
 			btrfs_block_release(root, path->nodes[*level]);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2972aaa9eb7..4ff0cc1efb1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1700,7 +1700,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	leaf = btrfs_buffer_leaf(subvol);
 	btrfs_set_header_nritems(&leaf->header, 0);
 	btrfs_set_header_level(&leaf->header, 0);
-	btrfs_set_header_blocknr(&leaf->header, subvol->b_blocknr);
+	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
 	btrfs_set_header_generation(&leaf->header, trans->transid);
 	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(leaf->header.fsid));
@@ -1713,7 +1713,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_inode_nblocks(inode_item, 1);
 	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
 
-	btrfs_set_root_blocknr(&root_item, subvol->b_blocknr);
+	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
 	btrfs_set_root_refs(&root_item, 1);
 
 	mark_buffer_dirty(subvol);
@@ -1803,7 +1803,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	key.offset = 1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	btrfs_set_root_blocknr(&new_root_item, root->node->b_blocknr);
+	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
 
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e15a072407b..3a15943ea8e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -119,10 +119,10 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-		if (old_extent_block == extent_root->node->b_blocknr)
+		if (old_extent_block == bh_blocknr(extent_root->node))
 			break;
 		btrfs_set_root_blocknr(&extent_root->root_item,
-				       extent_root->node->b_blocknr);
+				       bh_blocknr(extent_root->node));
 		ret = btrfs_update_root(trans, tree_root,
 					&extent_root->root_key,
 					&extent_root->root_item);
@@ -176,7 +176,7 @@ int add_dirty_roots(struct btrfs_trans_handle *trans,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
 			if (root->commit_root == root->node) {
-				WARN_ON(root->node->b_blocknr !=
+				WARN_ON(bh_blocknr(root->node) !=
 					btrfs_root_blocknr(&root->root_item));
 				brelse(root->commit_root);
 				root->commit_root = NULL;
@@ -191,7 +191,7 @@ int add_dirty_roots(struct btrfs_trans_handle *trans,
 			dirty->root = root;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
-					       root->node->b_blocknr);
+					       bh_blocknr(root->node));
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
-- 
cgit v1.2.3


From 8352d8a473ac84bf7a1c69690b626946d744ca58 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Apr 2007 10:43:05 -0400
Subject: Btrfs: add disk ioctl, mostly working

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 128 ++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.h     |   6 +++
 fs/btrfs/ioctl.h       |   2 +
 fs/btrfs/super.c       |  96 +++++++++++++++++++++++++++++++++----
 fs/btrfs/transaction.c |   6 +++
 5 files changed, 221 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c872a7e67ab..3ba4df20c91 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,6 +16,28 @@ struct dev_lookup {
 	struct block_device *bdev;
 };
 
+int btrfs_insert_dev_radix(struct btrfs_root *root,
+			   struct block_device *bdev,
+			   u64 block_start,
+			   u64 num_blocks)
+{
+	struct dev_lookup *lookup;
+	char b[BDEVNAME_SIZE];
+	int ret;
+
+	lookup = kmalloc(sizeof(*lookup), GFP_NOFS);
+	if (!lookup)
+		return -ENOMEM;
+	lookup->block_start = block_start;
+	lookup->num_blocks = num_blocks;
+	lookup->bdev = bdev;
+printk("inserting %s into dev radix %Lu %Lu\n", bdevname(bdev, b), block_start, num_blocks);
+
+	ret = radix_tree_insert(&root->fs_info->dev_radix, block_start +
+				num_blocks - 1, lookup);
+	return ret;
+}
+
 u64 bh_blocknr(struct buffer_head *bh)
 {
 	int blkbits = bh->b_page->mapping->host->i_blkbits;
@@ -34,6 +56,8 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
 	struct btrfs_node *node = btrfs_buffer_node(buf);
 	if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) {
+		printk(KERN_CRIT "bh_blocknr(buf) is %Lu, header is %Lu\n",
+		       bh_blocknr(buf), btrfs_header_blocknr(&node->header));
 		BUG();
 	}
 	return 0;
@@ -76,11 +100,10 @@ out_unlock:
 	return ret;
 }
 
-static int map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
+int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical)
 {
 	struct dev_lookup *lookup[2];
-	char b[BDEVNAME_SIZE];
 
 	int ret;
 
@@ -96,7 +119,6 @@ static int map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 	}
 	bh->b_bdev = lookup[0]->bdev;
 	bh->b_blocknr = logical - lookup[0]->block_start;
-printk("logical mapping %Lu to %lu bdev  %s\n", logical, bh->b_blocknr, bdevname(bh->b_bdev, b));
 	set_buffer_mapped(bh);
 	ret = 0;
 out:
@@ -126,7 +148,7 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	bh = head;
 	do {
 		if (!buffer_mapped(bh)) {
-			err = map_bh_to_logical(root, bh, first_block);
+			err = btrfs_map_bh_to_logical(root, bh, first_block);
 			BUG_ON(err);
 		}
 		if (bh_blocknr(bh) == blocknr) {
@@ -150,7 +172,7 @@ static int btree_get_block(struct inode *inode, sector_t iblock,
 {
 	int err;
 	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
-	err = map_bh_to_logical(root, bh, iblock);
+	err = btrfs_map_bh_to_logical(root, bh, iblock);
 	return err;
 }
 
@@ -396,6 +418,89 @@ printk("all worked\n");
 	return root;
 }
 
+int btrfs_open_disk(struct btrfs_root *root, u64 block_start, u64 num_blocks,
+		    char *filename, int name_len)
+{
+	char *null_filename;
+	struct block_device *bdev;
+	int ret;
+
+	if (block_start == 0) {
+printk("skipping disk with block_start == 0\n");
+return 0;
+	}
+	null_filename = kmalloc(name_len + 1, GFP_NOFS);
+	if (!null_filename)
+		return -ENOMEM;
+	memcpy(null_filename, filename, name_len);
+	null_filename[name_len] = '\0';
+
+	bdev = open_bdev_excl(null_filename, O_RDWR, root->fs_info->sb);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+	set_blocksize(bdev, root->fs_info->sb->s_blocksize);
+	ret = btrfs_insert_dev_radix(root, bdev, block_start, num_blocks);
+	BUG_ON(ret);
+	ret = 0;
+out:
+	kfree(null_filename);
+	return ret;
+}
+
+static int read_device_info(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	struct btrfs_device_item *dev_item;
+	int nritems;
+	int slot;
+
+	root = root->fs_info->dev_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	key.objectid = 0;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	nritems = btrfs_header_nritems(&leaf->header);
+	while(1) {
+		slot = path->slots[0];
+		if (slot >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				break;
+			leaf = btrfs_buffer_leaf(path->nodes[0]);
+			nritems = btrfs_header_nritems(&leaf->header);
+			slot = path->slots[0];
+		}
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		if (btrfs_key_type(&key) != BTRFS_DEV_ITEM_KEY) {
+			path->slots[0]++;
+			continue;
+		}
+		dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_device_item);
+printk("found key %Lu %Lu\n", key.objectid, key.offset);
+		ret = btrfs_open_disk(root, key.objectid, key.offset,
+				      (char *)(dev_item + 1),
+				      btrfs_device_pathlen(dev_item));
+		BUG_ON(ret);
+		path->slots[0]++;
+	}
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return 0;
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb)
 {
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
@@ -463,17 +568,26 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!btrfs_super_root(disk_super))
 		return NULL;
 
+	i_size_write(fs_info->btree_inode,
+		     btrfs_super_total_blocks(disk_super) <<
+		     fs_info->btree_inode->i_blkbits);
+
 	radix_tree_delete(&fs_info->dev_radix, (u32)-2);
 	dev_lookup->block_start = btrfs_super_device_block_start(disk_super);
 	dev_lookup->num_blocks = btrfs_super_device_num_blocks(disk_super);
 	ret = radix_tree_insert(&fs_info->dev_radix,
 				dev_lookup->block_start +
-				dev_lookup->num_blocks, dev_lookup);
+				dev_lookup->num_blocks - 1, dev_lookup);
 	BUG_ON(ret);
 
 	fs_info->disk_super = disk_super;
+
 	dev_root->node = read_tree_block(tree_root,
 					  btrfs_super_device_root(disk_super));
+
+	ret = read_device_info(dev_root);
+	BUG_ON(ret);
+
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
@@ -562,7 +676,7 @@ static int free_dev_radix(struct btrfs_fs_info *fs_info)
 				close_bdev_excl(lookup[i]->bdev);
 			radix_tree_delete(&fs_info->dev_radix,
 					  lookup[i]->block_start +
-					  lookup[i]->num_blocks);
+					  lookup[i]->num_blocks - 1);
 			kfree(lookup[i]);
 		}
 	}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 46cc5dc9d57..aa9f56c3841 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -42,4 +42,10 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location);
 u64 bh_blocknr(struct buffer_head *bh);
+int btrfs_insert_dev_radix(struct btrfs_root *root,
+			   struct block_device *bdev,
+			   u64 block_start,
+			   u64 num_blocks);
+int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
+			     u64 logical);
 #endif
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 201fb327072..497fadd98c3 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -10,4 +10,6 @@ struct btrfs_ioctl_vol_args {
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4ff0cc1efb1..c46d7eafcf6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1073,6 +1073,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 		}
 		map_bh(result, inode->i_sb, blocknr);
+		btrfs_map_bh_to_logical(root, result, blocknr);
 		goto out;
 	}
 
@@ -1092,7 +1093,8 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
 	if (iblock >= extent_start && iblock < extent_end) {
 		err = 0;
-		map_bh(result, inode->i_sb, blocknr + iblock - extent_start);
+		btrfs_map_bh_to_logical(root, result, blocknr + iblock -
+					extent_start);
 		goto out;
 	}
 allocate:
@@ -1112,6 +1114,7 @@ allocate:
 	set_buffer_new(result);
 	map_bh(result, inode->i_sb, blocknr);
 
+	btrfs_map_bh_to_logical(root, result, blocknr);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -1153,12 +1156,6 @@ static int btrfs_readpage(struct file *file, struct page *page)
 	return mpage_readpage(page, btrfs_get_block);
 }
 
-static int btrfs_readpages(struct file *file, struct address_space *mapping,
-			   struct list_head *pages, unsigned nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, btrfs_get_block);
-}
-
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return nobh_writepage(page, btrfs_get_block, wbc);
@@ -1831,12 +1828,81 @@ printk("adding snapshot name %.*s root %Lu %Lu %u\n", namelen, name, key.objecti
 	return 0;
 }
 
+static int add_disk(struct btrfs_root *root, char *name, int namelen)
+{
+	struct block_device *bdev;
+	struct btrfs_path *path;
+	struct super_block *sb = root->fs_info->sb;
+	struct btrfs_root *dev_root = root->fs_info->dev_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device_item *dev_item;
+	struct btrfs_key key;
+	u16 item_size;
+	u64 num_blocks;
+	u64 new_blocks;
+	int ret;
+printk("adding disk %s\n", name);
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	num_blocks = btrfs_super_total_blocks(root->fs_info->disk_super);
+	bdev = open_bdev_excl(name, O_RDWR, sb);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+printk("open bdev excl failed ret %d\n", ret);
+		goto out_nolock;
+	}
+	set_blocksize(bdev, sb->s_blocksize);
+	new_blocks = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	key.objectid = num_blocks;
+	key.offset = new_blocks;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
+
+	mutex_lock(&dev_root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(dev_root, 1);
+	item_size = sizeof(*dev_item) + namelen;
+printk("insert empty on %Lu %Lu %u size %d\n", num_blocks, new_blocks, key.flags, item_size);
+	ret = btrfs_insert_empty_item(trans, dev_root, path, &key, item_size);
+	if (ret) {
+printk("insert failed %d\n", ret);
+		close_bdev_excl(bdev);
+		if (ret > 0)
+			ret = -EEXIST;
+		goto out;
+	}
+	dev_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0], struct btrfs_device_item);
+	btrfs_set_device_pathlen(dev_item, namelen);
+	memcpy(dev_item + 1, name, namelen);
+	mark_buffer_dirty(path->nodes[0]);
+
+	ret = btrfs_insert_dev_radix(root, bdev, num_blocks, new_blocks);
+
+	if (!ret) {
+		btrfs_set_super_total_blocks(root->fs_info->disk_super,
+					     num_blocks + new_blocks);
+		i_size_write(root->fs_info->btree_inode,
+			     (num_blocks + new_blocks) <<
+			     root->fs_info->btree_inode->i_blkbits);
+	}
+
+out:
+	ret = btrfs_commit_transaction(trans, dev_root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+out_nolock:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		       cmd, unsigned long arg)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
-	int ret;
+	int ret = 0;
 	int namelen;
 	struct btrfs_path *path;
 	u64 root_dirid;
@@ -1869,10 +1935,21 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 			ret = create_snapshot(root, vol_args.name, namelen);
 		WARN_ON(ret);
 		break;
+	case BTRFS_IOC_ADD_DISK:
+		if (copy_from_user(&vol_args,
+				   (struct btrfs_ioctl_vol_args __user *)arg,
+				   sizeof(vol_args)))
+			return -EFAULT;
+		namelen = strlen(vol_args.name);
+		if (namelen > BTRFS_VOL_NAME_MAX)
+			return -EINVAL;
+		vol_args.name[namelen] = '\0';
+		ret = add_disk(root, vol_args.name, namelen);
+		break;
 	default:
 		return -ENOTTY;
 	}
-	return 0;
+	return ret;
 }
 
 static struct kmem_cache *btrfs_inode_cachep;
@@ -2004,7 +2081,6 @@ static struct file_operations btrfs_dir_file_operations = {
 
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
-	.readpages	= btrfs_readpages,
 	.writepage	= btrfs_writepage,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3a15943ea8e..8740752f384 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -116,7 +116,13 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *dev_root = fs_info->dev_root;
 
+	if (btrfs_super_device_root(fs_info->disk_super) !=
+	    bh_blocknr(dev_root->node)) {
+		btrfs_set_super_device_root(fs_info->disk_super,
+					    bh_blocknr(dev_root->node));
+	}
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == bh_blocknr(extent_root->node))
-- 
cgit v1.2.3


From b4100d64241fed93a3f821ddf59d11ab4443a3ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Apr 2007 12:14:00 -0400
Subject: Btrfs: add a device id to device items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 36 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/disk-io.c | 31 ++++++++++++++++++++-----------
 fs/btrfs/disk-io.h |  1 +
 fs/btrfs/super.c   |  9 ++++++++-
 4 files changed, 65 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 454eb88611b..ef3583cf74d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -91,10 +91,12 @@ struct btrfs_super_block {
 	__le64 total_blocks;
 	__le64 blocks_used;
 	__le64 root_dir_objectid;
+	__le64 last_device_id;
 	/* fields below here vary with the underlying disk */
 	__le64 device_block_start;
 	__le64 device_num_blocks;
 	__le64 device_root;
+	__le64 device_id;
 } __attribute__ ((__packed__));
 
 /*
@@ -230,6 +232,7 @@ struct btrfs_csum_item {
 
 struct btrfs_device_item {
 	__le16 pathlen;
+	__le64 device_id;
 } __attribute__ ((__packed__));
 
 struct crypto_hash;
@@ -798,6 +801,28 @@ static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64
 	s->root_dir_objectid = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_super_last_device_id(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->last_device_id);
+}
+
+static inline void btrfs_set_super_last_device_id(struct btrfs_super_block *s,
+						  u64 val)
+{
+	s->last_device_id = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_super_device_id(struct btrfs_super_block *s)
+{
+	return le64_to_cpu(s->device_id);
+}
+
+static inline void btrfs_set_super_device_id(struct btrfs_super_block *s,
+						  u64 val)
+{
+	s->device_id = cpu_to_le64(val);
+}
+
 static inline u64 btrfs_super_device_block_start(struct btrfs_super_block *s)
 {
 	return le64_to_cpu(s->device_block_start);
@@ -910,6 +935,17 @@ static inline void btrfs_set_device_pathlen(struct btrfs_device_item *d,
 	d->pathlen = cpu_to_le16(val);
 }
 
+static inline u64 btrfs_device_id(struct btrfs_device_item *d)
+{
+	return le64_to_cpu(d->device_id);
+}
+
+static inline void btrfs_set_device_id(struct btrfs_device_item *d,
+						u64 val)
+{
+	d->device_id = cpu_to_le64(val);
+}
+
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3ba4df20c91..06b969c1462 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -13,11 +13,13 @@
 struct dev_lookup {
 	u64 block_start;
 	u64 num_blocks;
+	u64 device_id;
 	struct block_device *bdev;
 };
 
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
+			   u64 device_id,
 			   u64 block_start,
 			   u64 num_blocks)
 {
@@ -31,6 +33,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 	lookup->block_start = block_start;
 	lookup->num_blocks = num_blocks;
 	lookup->bdev = bdev;
+	lookup->device_id = device_id;
 printk("inserting %s into dev radix %Lu %Lu\n", bdevname(bdev, b), block_start, num_blocks);
 
 	ret = radix_tree_insert(&root->fs_info->dev_radix, block_start +
@@ -418,17 +421,14 @@ printk("all worked\n");
 	return root;
 }
 
-int btrfs_open_disk(struct btrfs_root *root, u64 block_start, u64 num_blocks,
-		    char *filename, int name_len)
+static int btrfs_open_disk(struct btrfs_root *root, u64 device_id,
+			   u64 block_start, u64 num_blocks,
+			   char *filename, int name_len)
 {
 	char *null_filename;
 	struct block_device *bdev;
 	int ret;
 
-	if (block_start == 0) {
-printk("skipping disk with block_start == 0\n");
-return 0;
-	}
 	null_filename = kmalloc(name_len + 1, GFP_NOFS);
 	if (!null_filename)
 		return -ENOMEM;
@@ -441,7 +441,8 @@ return 0;
 		goto out;
 	}
 	set_blocksize(bdev, root->fs_info->sb->s_blocksize);
-	ret = btrfs_insert_dev_radix(root, bdev, block_start, num_blocks);
+	ret = btrfs_insert_dev_radix(root, bdev, device_id,
+				     block_start, num_blocks);
 	BUG_ON(ret);
 	ret = 0;
 out:
@@ -490,10 +491,14 @@ static int read_device_info(struct btrfs_root *root)
 		}
 		dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_device_item);
 printk("found key %Lu %Lu\n", key.objectid, key.offset);
-		ret = btrfs_open_disk(root, key.objectid, key.offset,
-				      (char *)(dev_item + 1),
-				      btrfs_device_pathlen(dev_item));
-		BUG_ON(ret);
+		if (btrfs_device_id(dev_item) !=
+		    btrfs_super_device_id(root->fs_info->disk_super)) {
+			ret = btrfs_open_disk(root, btrfs_device_id(dev_item),
+					      key.objectid, key.offset,
+					      (char *)(dev_item + 1),
+					      btrfs_device_pathlen(dev_item));
+			BUG_ON(ret);
+		}
 		path->slots[0]++;
 	}
 	btrfs_free_path(path);
@@ -556,6 +561,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	dev_lookup->block_start = 0;
 	dev_lookup->num_blocks = (u32)-2;
 	dev_lookup->bdev = sb->s_bdev;
+	dev_lookup->device_id = 0;
 	ret = radix_tree_insert(&fs_info->dev_radix, (u32)-2, dev_lookup);
 	BUG_ON(ret);
 	fs_info->sb_buffer = read_tree_block(tree_root,
@@ -575,6 +581,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	radix_tree_delete(&fs_info->dev_radix, (u32)-2);
 	dev_lookup->block_start = btrfs_super_device_block_start(disk_super);
 	dev_lookup->num_blocks = btrfs_super_device_num_blocks(disk_super);
+	dev_lookup->device_id = btrfs_super_device_id(disk_super);
+
 	ret = radix_tree_insert(&fs_info->dev_radix,
 				dev_lookup->block_start +
 				dev_lookup->num_blocks - 1, dev_lookup);
@@ -659,6 +667,7 @@ int del_fs_roots(struct btrfs_fs_info *fs_info)
 	}
 	return 0;
 }
+
 static int free_dev_radix(struct btrfs_fs_info *fs_info)
 {
 	struct dev_lookup *lookup[8];
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index aa9f56c3841..444ebb0141a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,6 +44,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 u64 bh_blocknr(struct buffer_head *bh);
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
+			   u64 device_id,
 			   u64 block_start,
 			   u64 num_blocks);
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c46d7eafcf6..8dcf600e39e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1840,7 +1840,9 @@ static int add_disk(struct btrfs_root *root, char *name, int namelen)
 	u16 item_size;
 	u64 num_blocks;
 	u64 new_blocks;
+	u64 device_id;
 	int ret;
+
 printk("adding disk %s\n", name);
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1875,9 +1877,14 @@ printk("insert failed %d\n", ret);
 				  path->slots[0], struct btrfs_device_item);
 	btrfs_set_device_pathlen(dev_item, namelen);
 	memcpy(dev_item + 1, name, namelen);
+
+	device_id = btrfs_super_last_device_id(root->fs_info->disk_super) + 1;
+	btrfs_set_super_last_device_id(root->fs_info->disk_super, device_id);
+	btrfs_set_device_id(dev_item, device_id);
 	mark_buffer_dirty(path->nodes[0]);
 
-	ret = btrfs_insert_dev_radix(root, bdev, num_blocks, new_blocks);
+	ret = btrfs_insert_dev_radix(root, bdev, device_id, num_blocks,
+				     new_blocks);
 
 	if (!ret) {
 		btrfs_set_super_total_blocks(root->fs_info->disk_super,
-- 
cgit v1.2.3


From 6567e837df07e43bffc08ac40858af8133a007bf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 16 Apr 2007 09:22:45 -0400
Subject: Btrfs: early work to file_write in big extents

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     |  54 +++++++++++++++++++++++++
 fs/btrfs/ctree.h     |  15 +++++++
 fs/btrfs/file-item.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/super.c     |  75 ++++++++++++++++++++++++++--------
 4 files changed, 232 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e2fc6f0d024..6b76fd96715 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1249,6 +1249,60 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, u32 data_size)
+{	/* grow the item at path->slots[0] by data_size bytes inside its leaf; returns 0 */
+	int ret = 0;
+	int slot;
+	int slot_orig;	/* NOTE(review): assigned below but never read */
+	struct btrfs_leaf *leaf;
+	struct buffer_head *leaf_buf;
+	u32 nritems;
+	unsigned int data_end;
+	unsigned int old_data;
+	unsigned int old_size;
+	int i;
+
+	slot_orig = path->slots[0];
+	leaf_buf = path->nodes[0];
+	leaf = btrfs_buffer_leaf(leaf_buf);
+
+	nritems = btrfs_header_nritems(&leaf->header);
+	data_end = leaf_data_end(root, leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < data_size)	/* NOTE(review): compared against a u32; signedness of the result not visible here - confirm */
+		BUG();	/* no split is attempted here: the leaf must already have room */
+	slot = path->slots[0];
+	old_data = btrfs_item_end(leaf->items + slot);	/* NOTE(review): reads items[slot] before the slot checks below; presumably offset + size - confirm */
+
+	BUG_ON(slot < 0);
+	BUG_ON(slot >= nritems);
+
+	/*
+	 * layout sketch: item0..itemN ... dataN.offset..dataN.size .. data0.size
+	 * items slot..nritems-1 have their data offsets lowered by data_size,
+	 * then the bytes in [data_end, old_data) are moved down to match
+	 */
+	for (i = slot; i < nritems; i++) {
+		u32 ioff = btrfs_item_offset(leaf->items + i);
+		btrfs_set_item_offset(leaf->items + i,
+				      ioff - data_size);
+	}
+	/* dst = data_end - data_size, src = data_end, len = old_data - data_end */
+	btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+		      data_end - data_size, btrfs_leaf_data(leaf) +
+		      data_end, old_data - data_end);
+	data_end = old_data;	/* NOTE(review): data_end is not read again after this store */
+	old_size = btrfs_item_size(leaf->items + slot);
+	btrfs_set_item_size(leaf->items + slot, old_size + data_size);
+	btrfs_mark_buffer_dirty(leaf_buf);
+
+	ret = 0;
+	if (btrfs_leaf_free_space(root, leaf) < 0)	/* sanity check after the resize */
+		BUG();
+	check_leaf(root, path, 0);
+	return ret;
+}
+
 /*
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ef3583cf74d..796f19d03ab 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -227,6 +227,7 @@ struct btrfs_file_extent_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
+	__le64 extent_offset;
 	u8 csum[BTRFS_CSUM_SIZE];
 } __attribute__ ((__packed__));
 
@@ -924,6 +925,17 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_csum_extent_offset(struct btrfs_csum_item *c)
+{
+	return le64_to_cpu(c->extent_offset);
+}
+
+static inline void btrfs_set_csum_extent_offset(struct btrfs_csum_item *c,
+						u64 val)
+{
+	c->extent_offset = cpu_to_le64(val);
+}
+
 static inline u16 btrfs_device_pathlen(struct btrfs_device_item *d)
 {
 	return le16_to_cpu(d->pathlen);
@@ -1002,6 +1014,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root);
 /* ctree.c */
+int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
+		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
@@ -1071,6 +1085,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
+			  u64 extent_offset,
 			  char *data, size_t len);
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d9fd7f4d379..93d42d65082 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -3,6 +3,9 @@
 #include "disk-io.h"
 #include "transaction.h"
 
+#define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+				 sizeof(struct btrfs_item)) / \
+				sizeof(struct btrfs_csum_item)) - 1))
 int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 offset,
@@ -43,6 +46,54 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static struct btrfs_csum_item *__lookup_csum_item(struct btrfs_root *root,
+						  struct btrfs_path *path,
+						  u64 objectid, u64 offset)
+{
+	int ret;
+	struct btrfs_key file_key;
+	struct btrfs_key found_key;
+	struct btrfs_csum_item *item;
+	struct btrfs_leaf *leaf;
+	u64 csum_offset = 0;
+
+	file_key.objectid = objectid;
+	file_key.offset = offset;
+	file_key.flags = 0;
+	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root, &file_key, path, 0, 0);
+	if (ret < 0)
+		goto fail;
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	if (ret > 0) {
+		ret = 1;
+		if (path->slots[0] == 0)
+			goto fail;
+		path->slots[0]--;
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
+		    found_key.objectid != objectid) {
+			goto fail;
+		}
+		csum_offset = (offset - found_key.offset) >>
+				root->fs_info->sb->s_blocksize_bits;
+		if (csum_offset >=
+		    btrfs_item_size(leaf->items + path->slots[0]) /
+		    sizeof(struct btrfs_csum_item)) {
+			goto fail;
+		}
+	}
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+	item += csum_offset;
+	return item;
+fail:
+	if (ret > 0)
+		ret = -EIO;
+	return ERR_PTR(ret);
+}
+
+
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -52,11 +103,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_key file_key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
+	struct btrfs_csum_item *csum_item;
 
+	csum_item = __lookup_csum_item(root, path, objectid, offset);
+	if (IS_ERR(csum_item))
+		return PTR_ERR(csum_item);
 	file_key.objectid = objectid;
-	file_key.offset = offset;
+	file_key.offset = btrfs_csum_extent_offset(csum_item);
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+	btrfs_release_path(root, path);
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
 	return ret;
 }
@@ -64,12 +120,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
+			  u64 extent_offset,
 			  char *data, size_t len)
 {
 	int ret;
 	struct btrfs_key file_key;
+	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
+	struct btrfs_leaf *leaf;
+	u64 csum_offset;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -78,14 +138,50 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	ret = btrfs_search_slot(trans, root, &file_key, path,
+				sizeof(struct btrfs_csum_item), 1);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0) {
+		csum_offset = 0;
+		goto csum;
+	}
+	if (path->slots[0] == 0) {
+		btrfs_release_path(root, path);
+		goto insert;
+	}
+	path->slots[0]--;
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	btrfs_disk_key_to_cpu(&found_key, &leaf->items[path->slots[0]].key);
+	csum_offset = (offset - found_key.offset) >>
+			root->fs_info->sb->s_blocksize_bits;
+	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
+	    found_key.objectid != objectid ||
+	    csum_offset >= MAX_CSUM_ITEMS(root)) {
+		btrfs_release_path(root, path);
+		goto insert;
+	}
+	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
+	    sizeof(struct btrfs_csum_item)) {
+		ret = btrfs_extend_item(trans, root, path,
+					sizeof(struct btrfs_csum_item));
+		BUG_ON(ret);
+		goto csum;
+	}
+
+insert:
+	csum_offset = 0;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
-				      BTRFS_CSUM_SIZE);
+				      sizeof(struct btrfs_csum_item));
 	if (ret != 0 && ret != -EEXIST)
 		goto fail;
+csum:
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_csum_item);
 	ret = 0;
+	item += csum_offset;
 	ret = btrfs_csum_data(root, data, len, item->csum);
+	btrfs_set_csum_extent_offset(item, extent_offset);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
@@ -111,12 +207,13 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
 	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_search_slot(NULL, root, &file_key, path, 0, 0);
-	if (ret)
+
+	item = __lookup_csum_item(root, path, objectid, offset);
+	if (IS_ERR(item)) {
+		ret = PTR_ERR(item);
 		goto fail;
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			      struct btrfs_csum_item);
-	ret = 0;
+	}
+
 	ret = btrfs_csum_data(root, data, len, result);
 	WARN_ON(ret);
 	if (memcmp(result, item->csum, BTRFS_CSUM_SIZE))
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8dcf600e39e..ec689992fdf 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1027,8 +1027,10 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	if (create)
+	if (create) {
 		trans = btrfs_start_transaction(root, 1);
+		WARN_ON(1);
+	}
 
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       inode->i_ino,
@@ -1055,9 +1057,8 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	/* exact match found, use it, FIXME, deal with extents
 	 * other than the page size
 	 */
-	if (ret == 0) {
+	if (0 && ret == 0) {
 		err = 0;
-		BUG_ON(btrfs_file_extent_disk_num_blocks(item) != 1);
 		if (create &&
 		    btrfs_file_extent_generation(item) != trans->transid) {
 			struct btrfs_key ins;
@@ -1072,7 +1073,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			blocknr = ins.objectid;
 
 		}
-		map_bh(result, inode->i_sb, blocknr);
 		btrfs_map_bh_to_logical(root, result, blocknr);
 		goto out;
 	}
@@ -1231,6 +1231,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct file *file,
 				   struct page **pages,
 				   size_t num_pages,
+				   u64 extent_offset,
 				   loff_t pos,
 				   size_t write_bytes)
 {
@@ -1250,6 +1251,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_csum_file_block(trans, root, inode->i_ino,
 				      pages[i]->index << PAGE_CACHE_SHIFT,
+				      extent_offset,
 				      kmap(pages[i]), PAGE_CACHE_SIZE);
 		kunmap(pages[i]);
 		SetPageChecked(pages[i]);
@@ -1279,7 +1281,8 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 			 loff_t pos,
 			 unsigned long first_index,
 			 unsigned long last_index,
-			 size_t write_bytes)
+			 size_t write_bytes,
+			 u64 alloc_extent_start)
 {
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
@@ -1288,6 +1291,8 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int ret;
 	int this_write;
+	struct buffer_head *bh;
+	struct buffer_head *head;
 	loff_t isize = i_size_read(inode);
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
@@ -1307,14 +1312,20 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			lock_page(pages[i]);
 		}
-		ret = nobh_prepare_write(pages[i], offset,
-					 offset + this_write,
-					 btrfs_get_block);
+		create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
+				     (1 << BH_Uptodate));
+		head = page_buffers(pages[i]);
+		bh = head;
+		do {
+			err = btrfs_map_bh_to_logical(root, bh,
+						      alloc_extent_start);
+			BUG_ON(err);
+			if (err)
+				goto failed_truncate;
+			bh = bh->b_this_page;
+			alloc_extent_start++;
+		} while (bh != head);
 		pos += this_write;
-		if (ret) {
-			err = ret;
-			goto failed_truncate;
-		}
 		WARN_ON(this_write > write_bytes);
 		write_bytes -= this_write;
 	}
@@ -1343,11 +1354,23 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct page *pages[1];
 	unsigned long first_index;
 	unsigned long last_index;
+	u64 start_pos;
+	u64 num_blocks;
+	u64 alloc_extent_start;
+	u64 orig_extent_start;
+	struct btrfs_trans_handle *trans;
 
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
 	pos = *ppos;
 
+	start_pos = pos & ~(root->blocksize - 1);
+	/* FIXME */
+	if (start_pos != pos)
+		return -EINVAL;
+	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
+
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
@@ -1362,20 +1385,41 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+	ret = btrfs_alloc_file_extent(trans, root, inode->i_ino,
+				      start_pos, num_blocks, 1,
+				      &alloc_extent_start);
+	BUG_ON(ret);
+
+	orig_extent_start = start_pos;
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
 	while(count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
 		ret = prepare_pages(NULL, root, file, pages, num_pages,
-				    pos, first_index, last_index, write_bytes);
+				    pos, first_index, last_index,
+				    write_bytes, alloc_extent_start);
 		BUG_ON(ret);
+		/* FIXME blocks != pagesize */
+		alloc_extent_start += num_pages;
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
 
 		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, pos, write_bytes);
+					      num_pages, orig_extent_start,
+					      pos, write_bytes);
 		BUG_ON(ret);
 		btrfs_drop_pages(pages, num_pages);
 
@@ -1387,6 +1431,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		balance_dirty_pages_ratelimited(inode->i_mapping);
 		cond_resched();
 	}
+out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
 	*ppos = pos;
@@ -1806,8 +1851,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 				&new_root_item);
 	BUG_ON(ret);
 
-printk("adding snapshot name %.*s root %Lu %Lu %u\n", namelen, name, key.objectid, key.offset, key.flags);
-
 	/*
 	 * insert the directory item
 	 */
-- 
cgit v1.2.3


From b18c6685810af8e6763760711aece31ccc7a8ea8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 17 Apr 2007 13:26:50 -0400
Subject: Btrfs: progress on file_write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  57 ++++++++++
 fs/btrfs/ctree.h       |  21 +++-
 fs/btrfs/extent-tree.c |  20 ++--
 fs/btrfs/file-item.c   |  61 +++++++----
 fs/btrfs/super.c       | 284 ++++++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 394 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6b76fd96715..43d4779abdd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1249,6 +1249,63 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			u32 new_size)
+{	/* shrink the item at path->slots[0] to new_size bytes inside its leaf; returns 0 */
+	int ret = 0;
+	int slot;
+	int slot_orig;	/* NOTE(review): assigned below but never read */
+	struct btrfs_leaf *leaf;
+	struct buffer_head *leaf_buf;
+	u32 nritems;
+	unsigned int data_end;
+	unsigned int old_data_start;
+	unsigned int old_size;
+	unsigned int size_diff;
+	int i;
+
+	slot_orig = path->slots[0];
+	leaf_buf = path->nodes[0];
+	leaf = btrfs_buffer_leaf(leaf_buf);
+
+	nritems = btrfs_header_nritems(&leaf->header);
+	data_end = leaf_data_end(root, leaf);
+
+	slot = path->slots[0];
+	old_data_start = btrfs_item_offset(leaf->items + slot);	/* NOTE(review): reads items[slot] before the slot checks below */
+	old_size = btrfs_item_size(leaf->items + slot);
+	BUG_ON(old_size <= new_size);	/* only a strict shrink is accepted */
+	size_diff = old_size - new_size;
+
+	BUG_ON(slot < 0);
+	BUG_ON(slot >= nritems);
+
+	/*
+	 * layout sketch: item0..itemN ... dataN.offset..dataN.size .. data0.size
+	 * items slot..nritems-1 have their data offsets raised by size_diff,
+	 * then the bytes below the kept part of this item are moved up to match
+	 */
+	for (i = slot; i < nritems; i++) {
+		u32 ioff = btrfs_item_offset(leaf->items + i);
+		btrfs_set_item_offset(leaf->items + i,
+				      ioff + size_diff);
+	}
+	/* move [data_end, old_data_start + new_size) up by size_diff; the item keeps its first new_size bytes */
+printk("truncate item, new_size %u old_size %u, diff %u, bufp %p, dst, %p, num %u, old_data_start %u, data_end %u\n", new_size, old_size, size_diff, leaf, btrfs_leaf_data(leaf) + data_end + size_diff, old_data_start-data_end, old_data_start, data_end);	/* NOTE(review): debug output; the "num" value omits new_size, unlike the memmove length below */
+	btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+		      data_end + size_diff, btrfs_leaf_data(leaf) +
+		      data_end, old_data_start + new_size - data_end);
+	btrfs_set_item_size(leaf->items + slot, new_size);
+	btrfs_mark_buffer_dirty(leaf_buf);
+
+	ret = 0;
+	if (btrfs_leaf_free_space(root, leaf) < 0)	/* sanity check after the resize */
+		BUG();
+	check_leaf(root, path, 0);
+	return ret;
+}
+
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 796f19d03ab..ca3ab160f46 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -999,7 +999,7 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
-/* extent-item.c */
+/* extent-tree.c */
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -1013,9 +1013,16 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root);
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 blocknr, u64 num_blocks);
 /* ctree.c */
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
+int btrfs_truncate_item(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			u32 new_size);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
@@ -1073,11 +1080,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       struct btrfs_key *location, int mod);
 
 /* file-item.c */
-int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       u64 objectid, u64 offset,
-			       u64 num_blocks, u64 hint_block,
-			       u64 *result);
+			       u64 objectid, u64 pos, u64 offset,
+			       u64 num_blocks);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1090,6 +1096,11 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
 				 char *data, size_t len);
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 objectid, u64 offset,
+					  int cow);
 /* super.c */
 extern struct subsystem btrfs_subsys;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d560831c10a..2cee9df001f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,8 +12,9 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
-static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr, u64 num_blocks)
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 blocknr, u64 num_blocks)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -50,8 +51,9 @@ static int inc_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
-static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *root, u64 blocknr, u64 num_blocks, u32 *refs)
+static int lookup_extent_ref(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 blocknr,
+			     u64 num_blocks, u32 *refs)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -80,7 +82,7 @@ static int lookup_block_ref(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root)
 {
-	return inc_block_ref(trans, root, bh_blocknr(root->node), 1);
+	return btrfs_inc_extent_ref(trans, root, bh_blocknr(root->node), 1);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -107,13 +109,13 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 			fi = btrfs_item_ptr(buf_leaf, i,
 					    struct btrfs_file_extent_item);
-			ret = inc_block_ref(trans, root,
+			ret = btrfs_inc_extent_ref(trans, root,
 				    btrfs_file_extent_disk_blocknr(fi),
 				    btrfs_file_extent_disk_num_blocks(fi));
 			BUG_ON(ret);
 		} else {
 			blocknr = btrfs_node_blockptr(buf_node, i);
-			ret = inc_block_ref(trans, root, blocknr, 1);
+			ret = btrfs_inc_extent_ref(trans, root, blocknr, 1);
 			BUG_ON(ret);
 		}
 	}
@@ -563,7 +565,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = lookup_block_ref(trans, root, bh_blocknr(path->nodes[*level]),
+	ret = lookup_extent_ref(trans, root, bh_blocknr(path->nodes[*level]),
 			       1, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -587,7 +589,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
 					      path->slots[*level]);
-		ret = lookup_block_ref(trans, root, blocknr, 1, &refs);
+		ret = lookup_extent_ref(trans, root, blocknr, 1, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
 			path->slots[*level]++;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 93d42d65082..f49968ad0a0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -6,13 +6,11 @@
 #define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				 sizeof(struct btrfs_item)) / \
 				sizeof(struct btrfs_csum_item)) - 1))
-int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
+int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       u64 objectid, u64 offset,
-			       u64 num_blocks, u64 hint_block,
-			       u64 *result)
+			       u64 objectid, u64 pos,
+			       u64 offset, u64 num_blocks)
 {
-	struct btrfs_key ins;
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_key file_key;
@@ -21,11 +19,13 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
+	/*
 	ret = btrfs_alloc_extent(trans, root, num_blocks, hint_block,
 				 (u64)-1, &ins);
+				 */
 	BUG_ON(ret);
 	file_key.objectid = objectid;
-	file_key.offset = offset;
+	file_key.offset = pos;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
@@ -34,21 +34,22 @@ int btrfs_alloc_file_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
-	btrfs_set_file_extent_disk_num_blocks(item, ins.offset);
+	btrfs_set_file_extent_disk_blocknr(item, offset);
+	btrfs_set_file_extent_disk_num_blocks(item, num_blocks);
 	btrfs_set_file_extent_offset(item, 0);
-	btrfs_set_file_extent_num_blocks(item, ins.offset);
+	btrfs_set_file_extent_num_blocks(item, num_blocks);
 	btrfs_set_file_extent_generation(item, trans->transid);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	*result = ins.objectid;
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return 0;
 }
 
-static struct btrfs_csum_item *__lookup_csum_item(struct btrfs_root *root,
-						  struct btrfs_path *path,
-						  u64 objectid, u64 offset)
+struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 objectid, u64 offset,
+					  int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -61,19 +62,23 @@ static struct btrfs_csum_item *__lookup_csum_item(struct btrfs_root *root,
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-	ret = btrfs_search_slot(NULL, root, &file_key, path, 0, 0);
+printk("__lookup for %Lu\n", offset);
+	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	if (ret > 0) {
 		ret = 1;
-		if (path->slots[0] == 0)
+		if (path->slots[0] == 0) {
+printk("fail1\n");
 			goto fail;
+		}
 		path->slots[0]--;
 		btrfs_disk_key_to_cpu(&found_key,
 				      &leaf->items[path->slots[0]].key);
 		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 		    found_key.objectid != objectid) {
+printk("fail2 type %u %Lu %Lu\n", btrfs_key_type(&found_key), found_key.objectid, objectid);
 			goto fail;
 		}
 		csum_offset = (offset - found_key.offset) >>
@@ -81,6 +86,7 @@ static struct btrfs_csum_item *__lookup_csum_item(struct btrfs_root *root,
 		if (csum_offset >=
 		    btrfs_item_size(leaf->items + path->slots[0]) /
 		    sizeof(struct btrfs_csum_item)) {
+printk("fail3, csum offset %lu size %u\n", csum_offset, btrfs_item_size(leaf->items + path->slots[0]) / sizeof(struct btrfs_csum_item));
 			goto fail;
 		}
 	}
@@ -89,7 +95,7 @@ static struct btrfs_csum_item *__lookup_csum_item(struct btrfs_root *root,
 	return item;
 fail:
 	if (ret > 0)
-		ret = -EIO;
+		ret = -ENOENT;
 	return ERR_PTR(ret);
 }
 
@@ -105,7 +111,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	int cow = mod != 0;
 	struct btrfs_csum_item *csum_item;
 
-	csum_item = __lookup_csum_item(root, path, objectid, offset);
+	csum_item = btrfs_lookup_csum(trans, root, path, objectid, offset, 0);
 	if (IS_ERR(csum_item))
 		return PTR_ERR(csum_item);
 	file_key.objectid = objectid;
@@ -113,7 +119,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 	btrfs_release_path(root, path);
+printk("lookup file extent searches for %Lu\n", file_key.offset);
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
+printk("ret is %d\n", ret);
 	return ret;
 }
 
@@ -134,17 +142,23 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
+
+	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 0);
+	if (!IS_ERR(item))
+		goto found;
+	btrfs_release_path(root, path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+printk("searching for csum %Lu %Lu\n", objectid, offset);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
 				sizeof(struct btrfs_csum_item), 1);
+printk("ret %d\n", ret);
 	if (ret < 0)
 		goto fail;
 	if (ret == 0) {
-		csum_offset = 0;
-		goto csum;
+		BUG();
 	}
 	if (path->slots[0] == 0) {
 		btrfs_release_path(root, path);
@@ -153,12 +167,15 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	path->slots[0]--;
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	btrfs_disk_key_to_cpu(&found_key, &leaf->items[path->slots[0]].key);
+printk("found key %Lu %Lu %u\n", found_key.objectid, found_key.offset, found_key.flags);
 	csum_offset = (offset - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
+printk("csum_offset %Lu\n", csum_offset);
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
 	    csum_offset >= MAX_CSUM_ITEMS(root)) {
 		btrfs_release_path(root, path);
+printk("insert1\n");
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
@@ -166,11 +183,13 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		ret = btrfs_extend_item(trans, root, path,
 					sizeof(struct btrfs_csum_item));
 		BUG_ON(ret);
+printk("item extended\n");
 		goto csum;
 	}
 
 insert:
 	csum_offset = 0;
+printk("inserting item %Lu %Lu %u\n", file_key.objectid, file_key.offset, file_key.flags);
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(struct btrfs_csum_item));
 	if (ret != 0 && ret != -EEXIST)
@@ -180,12 +199,14 @@ csum:
 			      struct btrfs_csum_item);
 	ret = 0;
 	item += csum_offset;
+found:
 	ret = btrfs_csum_data(root, data, len, item->csum);
 	btrfs_set_csum_extent_offset(item, extent_offset);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+printk("return ret %d\n", ret);
 	return ret;
 }
 
@@ -208,7 +229,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
 	mutex_lock(&root->fs_info->fs_mutex);
 
-	item = __lookup_csum_item(root, path, objectid, offset);
+	item = btrfs_lookup_csum(NULL, root, path, objectid, offset, 0);
 	if (IS_ERR(item)) {
 		ret = PTR_ERR(item);
 		goto fail;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ec689992fdf..6a56416147e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1089,7 +1089,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
 	extent_start = extent_start >> inode->i_blkbits;
-	extent_start += btrfs_file_extent_offset(item);
 	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
 	if (iblock >= extent_start && iblock < extent_end) {
 		err = 0;
@@ -1103,6 +1102,7 @@ allocate:
 		err = 0;
 		goto out;
 	}
+#if 0
 	ret = btrfs_alloc_file_extent(trans, root, objectid,
 				      iblock << inode->i_blkbits,
 				      1, extent_end, &blocknr);
@@ -1115,9 +1115,11 @@ allocate:
 	map_bh(result, inode->i_sb, blocknr);
 
 	btrfs_map_bh_to_logical(root, result, blocknr);
+#endif
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+printk("mapping iblock %lu to %lu\n", iblock, result->b_blocknr);
 	if (trans)
 		btrfs_end_transaction(trans, root);
 	return err;
@@ -1273,8 +1275,244 @@ failed:
 	return err;
 }
 
-static int prepare_pages(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
+static int drop_csums(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct inode *inode,
+			  u64 start, u64 end)
+{
+	struct btrfs_path *path;
+	struct btrfs_leaf *leaf;
+	struct btrfs_key key;
+	int slot;
+	struct btrfs_csum_item *item;
+	char *old_block = NULL;
+	u64 cur = start;
+	u64 found_end;
+	u64 num_csums;
+	u64 item_size;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	while(cur < end) {
+		item = btrfs_lookup_csum(trans, root, path,
+					 inode->i_ino, cur, 1);
+		if (IS_ERR(item)) {
+			cur += root->blocksize;
+			continue;
+		}
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		item_size = btrfs_item_size(leaf->items + slot);
+		num_csums = item_size / sizeof(struct btrfs_csum_item);
+		found_end = key.offset + (num_csums << inode->i_blkbits);
+		cur = found_end;
+
+		if (found_end > end) {
+			char *src;
+			old_block = kmalloc(root->blocksize, GFP_NOFS);
+			src = btrfs_item_ptr(leaf, slot, char);
+			memcpy(old_block, src, item_size);
+		}
+		if (key.offset < start) {
+			u64 new_size = (start - key.offset) >>
+					inode->i_blkbits;
+			new_size *= sizeof(struct btrfs_csum_item);
+			ret = btrfs_truncate_item(trans, root, path, new_size);
+			BUG_ON(ret);
+		} else {
+			btrfs_del_item(trans, root, path);
+		}
+		btrfs_release_path(root, path);
+		if (found_end > end) {
+			char *dst;
+			int i;
+			int new_size;
+
+			num_csums = (found_end - end) >> inode->i_blkbits;
+			new_size = num_csums * sizeof(struct btrfs_csum_item);
+			key.offset = end;
+			ret = btrfs_insert_empty_item(trans, root, path,
+						      &key, new_size);
+			BUG_ON(ret);
+			dst = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+					     path->slots[0], char);
+			memcpy(dst, old_block + item_size - new_size,
+			       new_size);
+			item = (struct btrfs_csum_item *)dst;
+			for (i = 0; i < num_csums; i++) {
+				btrfs_set_csum_extent_offset(item, end);
+				item++;
+			}
+			mark_buffer_dirty(path->nodes[0]);
+			kfree(old_block);
+			break;
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+static int drop_extents(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct inode *inode,
+			  u64 start, u64 end)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	int slot;
+	struct btrfs_file_extent_item *extent;
+	u64 extent_end;
+	int keep;
+	struct btrfs_file_extent_item old;
+	struct btrfs_path *path;
+	u64 search_start = start;
+	int bookend;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+search_again:
+printk("drop extent inode %lu start %Lu end %Lu\n", inode->i_ino, start, end);
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       search_start, -1);
+	if (ret != 0) {
+printk("lookup failed\n");
+		goto out;
+	}
+	while(1) {
+		keep = 0;
+		bookend = 0;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+
+printk("found key %Lu %Lu %u\n", key.objectid, key.offset, key.flags);
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		extent_end = key.offset +
+			(btrfs_file_extent_num_blocks(extent) <<
+			 inode->i_blkbits);
+printk("extent end is %Lu\n", extent_end);
+		if (key.offset >= end || key.objectid != inode->i_ino) {
+			ret = 0;
+			goto out;
+		}
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			goto next_leaf;
+
+		if (end < extent_end && end >= key.offset) {
+			memcpy(&old, extent, sizeof(old));
+			ret = btrfs_inc_extent_ref(trans, root,
+				   btrfs_file_extent_disk_blocknr(&old),
+				   btrfs_file_extent_disk_num_blocks(&old));
+			BUG_ON(ret);
+			bookend = 1;
+		}
+
+		if (start > key.offset) {
+			u64 new_num;
+			/* truncate existing extent */
+			keep = 1;
+			WARN_ON(start & (root->blocksize - 1));
+			new_num = (start - key.offset) >> inode->i_blkbits;
+printk("truncating existing extent, was %Lu ", btrfs_file_extent_num_blocks(extent));
+			btrfs_set_file_extent_num_blocks(extent, new_num);
+printk("now %Lu\n", btrfs_file_extent_num_blocks(extent));
+
+			mark_buffer_dirty(path->nodes[0]);
+		}
+		if (!keep) {
+			u64 disk_blocknr;
+			u64 disk_num_blocks;
+printk("del old\n");
+			disk_blocknr = btrfs_file_extent_disk_blocknr(extent);
+			disk_num_blocks =
+				btrfs_file_extent_disk_num_blocks(extent);
+			search_start = key.offset +
+				(btrfs_file_extent_num_blocks(extent) <<
+				inode->i_blkbits);
+			ret = btrfs_del_item(trans, root, path);
+			BUG_ON(ret);
+			btrfs_release_path(root, path);
+
+			ret = btrfs_free_extent(trans, root, disk_blocknr,
+						disk_num_blocks, 0);
+
+			BUG_ON(ret);
+			if (!bookend && search_start >= end) {
+				ret = 0;
+				goto out;
+			}
+			if (!bookend)
+				goto search_again;
+		}
+		if (bookend) {
+			/* create bookend */
+			struct btrfs_key ins;
+printk("bookend! extent end %Lu\n", extent_end);
+			ins.objectid = inode->i_ino;
+			ins.offset = end;
+			ins.flags = 0;
+			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+
+			btrfs_release_path(root, path);
+			ret = drop_csums(trans, root, inode, start, end);
+			BUG_ON(ret);
+			ret = btrfs_insert_empty_item(trans, root, path, &ins,
+						      sizeof(*extent));
+			BUG_ON(ret);
+			extent = btrfs_item_ptr(
+				    btrfs_buffer_leaf(path->nodes[0]),
+				    path->slots[0],
+				    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_disk_blocknr(extent,
+				    btrfs_file_extent_disk_blocknr(&old));
+			btrfs_set_file_extent_disk_num_blocks(extent,
+				    btrfs_file_extent_disk_num_blocks(&old));
+
+			btrfs_set_file_extent_offset(extent,
+				    btrfs_file_extent_offset(&old) +
+				    ((end - key.offset) >> inode->i_blkbits));
+			WARN_ON(btrfs_file_extent_num_blocks(&old) <
+				(end - key.offset) >> inode->i_blkbits);
+			btrfs_set_file_extent_num_blocks(extent,
+				    btrfs_file_extent_num_blocks(&old) -
+				    ((end - key.offset) >> inode->i_blkbits));
+
+			btrfs_set_file_extent_generation(extent,
+				    btrfs_file_extent_generation(&old));
+printk("new bookend at offset %Lu, file_extent_offset %Lu, file_extent_num_blocks %Lu\n", end, btrfs_file_extent_offset(extent), btrfs_file_extent_num_blocks(extent));
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+			ret = 0;
+			goto out_nocsum;
+		}
+next_leaf:
+		if (slot >= btrfs_header_nritems(&leaf->header) - 1) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				ret = 0;
+				goto out;
+			}
+		} else {
+			path->slots[0]++;
+		}
+	}
+
+out:
+	ret = drop_csums(trans, root, inode, start, end);
+	BUG_ON(ret);
+
+out_nocsum:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int prepare_pages(struct btrfs_root *root,
 			 struct file *file,
 			 struct page **pages,
 			 size_t num_pages,
@@ -1289,7 +1527,6 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int offset;
 	int err = 0;
-	int ret;
 	int this_write;
 	struct buffer_head *bh;
 	struct buffer_head *head;
@@ -1305,18 +1542,21 @@ static int prepare_pages(struct btrfs_trans_handle *trans,
 		}
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
-		if (!PageUptodate(pages[i]) &&
-		   (pages[i]->index == first_index ||
-		    pages[i]->index == last_index) && pos < isize) {
+#if 0
+		if ((pages[i]->index == first_index ||
+		    pages[i]->index == last_index) && pos < isize &&
+		    !PageUptodate(pages[i])) {
 			ret = mpage_readpage(pages[i], btrfs_get_block);
 			BUG_ON(ret);
 			lock_page(pages[i]);
 		}
+#endif
 		create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
 				     (1 << BH_Uptodate));
 		head = page_buffers(pages[i]);
 		bh = head;
 		do {
+printk("mapping page %lu to block %Lu\n", pages[i]->index, alloc_extent_start);
 			err = btrfs_map_bh_to_logical(root, bh,
 						      alloc_extent_start);
 			BUG_ON(err);
@@ -1351,7 +1591,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pages[1];
+	struct page *pages[8];
 	unsigned long first_index;
 	unsigned long last_index;
 	u64 start_pos;
@@ -1359,6 +1599,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	u64 alloc_extent_start;
 	u64 orig_extent_start;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_key ins;
 
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
@@ -1390,16 +1631,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
+		mutex_unlock(&root->fs_info->fs_mutex);
 		goto out_unlock;
 	}
-	ret = btrfs_alloc_file_extent(trans, root, inode->i_ino,
-				      start_pos, num_blocks, 1,
-				      &alloc_extent_start);
-	BUG_ON(ret);
-
+	if (start_pos < inode->i_size) {
+		ret = drop_extents(trans, root, inode,
+				   start_pos,
+				   (pos + count + root->blocksize -1) &
+				   ~(root->blocksize - 1));
+	}
 	orig_extent_start = start_pos;
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
+				 (u64)-1, &ins);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       start_pos, ins.objectid, ins.offset);
 	BUG_ON(ret);
+	alloc_extent_start = ins.objectid;
+	ret = btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	while(count > 0) {
@@ -1407,16 +1656,21 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
-		ret = prepare_pages(NULL, root, file, pages, num_pages,
+printk("num_pages is %lu\n", num_pages);
+
+		memset(pages, 0, sizeof(pages));
+		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes, alloc_extent_start);
 		BUG_ON(ret);
+
 		/* FIXME blocks != pagesize */
 		alloc_extent_start += num_pages;
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
 
+printk("2num_pages is %lu\n", num_pages);
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, orig_extent_start,
 					      pos, write_bytes);
-- 
cgit v1.2.3


From 70b2befd0c8a4064715d8b340270650cc9d15af8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 17 Apr 2007 15:39:32 -0400
Subject: Btrfs: rework csums and extent item ordering

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     |   9 ++--
 fs/btrfs/ctree.h     |  17 +------
 fs/btrfs/dir-item.c  |  28 ++++++++----
 fs/btrfs/disk-io.c   |   2 -
 fs/btrfs/file-item.c |  26 +----------
 fs/btrfs/super.c     | 123 +++++----------------------------------------------
 6 files changed, 37 insertions(+), 168 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 43d4779abdd..5c160c23677 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -115,14 +115,14 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 		return 1;
 	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
 	if (k1.flags > k2->flags)
 		return 1;
 	if (k1.flags < k2->flags)
 		return -1;
+	if (k1.offset > k2->offset)
+		return 1;
+	if (k1.offset < k2->offset)
+		return -1;
 	return 0;
 }
 
@@ -1292,7 +1292,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 				      ioff + size_diff);
 	}
 	/* shift the data */
-printk("truncate item, new_size %u old_size %u, diff %u, bufp %p, dst, %p, num %u, old_data_start %u, data_end %u\n", new_size, old_size, size_diff, leaf, btrfs_leaf_data(leaf) + data_end + size_diff, old_data_start-data_end, old_data_start, data_end);
 	btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
 		      data_end + size_diff, btrfs_leaf_data(leaf) +
 		      data_end, old_data_start + new_size - data_end);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ca3ab160f46..d75a4d5bc01 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -44,14 +44,14 @@ extern struct kmem_cache *btrfs_path_cachep;
  */
 struct btrfs_disk_key {
 	__le64 objectid;
-	__le64 offset;
 	__le32 flags;
+	__le64 offset;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
 	u64 objectid;
-	u64 offset;
 	u32 flags;
+	u64 offset;
 } __attribute__ ((__packed__));
 
 /*
@@ -227,7 +227,6 @@ struct btrfs_file_extent_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
-	__le64 extent_offset;
 	u8 csum[BTRFS_CSUM_SIZE];
 } __attribute__ ((__packed__));
 
@@ -925,17 +924,6 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
-static inline u64 btrfs_csum_extent_offset(struct btrfs_csum_item *c)
-{
-	return le64_to_cpu(c->extent_offset);
-}
-
-static inline void btrfs_set_csum_extent_offset(struct btrfs_csum_item *c,
-						u64 val)
-{
-	c->extent_offset = cpu_to_le64(val);
-}
-
 static inline u16 btrfs_device_pathlen(struct btrfs_device_item *d)
 {
 	return le16_to_cpu(d->pathlen);
@@ -1091,7 +1079,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
-			  u64 extent_offset,
 			  char *data, size_t len);
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0ba46bc0da9..cd4137a8b87 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -103,7 +103,7 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
+	// btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	while(1) {
@@ -146,19 +146,29 @@ int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	int cow = mod != 0;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
+	int overflow = 0;
 
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
 	key.offset = objectid;
-	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-	if (ret < 0)
-		return ret;
-	if (ret > 0) {
-		if (path->slots[0] == 0)
-			return 1;
-		path->slots[0]--;
+
+	while(1) {
+		btrfs_set_key_overflow(&key, overflow);
+		ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+		if (ret < 0)
+			return ret;
+		if (ret > 0) {
+			if (overflow >= BTRFS_KEY_OVERFLOW_MAX)
+				return 1;
+			overflow++;
+			btrfs_set_key_overflow(&key, overflow);
+			btrfs_release_path(root, path);
+			continue;
+		} else {
+			/* found */
+			break;
+		}
 	}
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	found_key = &leaf->items[path->slots[0]].key;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 06b969c1462..a2a3f529cad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -24,7 +24,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   u64 num_blocks)
 {
 	struct dev_lookup *lookup;
-	char b[BDEVNAME_SIZE];
 	int ret;
 
 	lookup = kmalloc(sizeof(*lookup), GFP_NOFS);
@@ -34,7 +33,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 	lookup->num_blocks = num_blocks;
 	lookup->bdev = bdev;
 	lookup->device_id = device_id;
-printk("inserting %s into dev radix %Lu %Lu\n", bdevname(bdev, b), block_start, num_blocks);
 
 	ret = radix_tree_insert(&root->fs_info->dev_radix, block_start +
 				num_blocks - 1, lookup);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f49968ad0a0..ff8f3339c68 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -62,23 +62,19 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-printk("__lookup for %Lu\n", offset);
 	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	if (ret > 0) {
 		ret = 1;
-		if (path->slots[0] == 0) {
-printk("fail1\n");
+		if (path->slots[0] == 0)
 			goto fail;
-		}
 		path->slots[0]--;
 		btrfs_disk_key_to_cpu(&found_key,
 				      &leaf->items[path->slots[0]].key);
 		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 		    found_key.objectid != objectid) {
-printk("fail2 type %u %Lu %Lu\n", btrfs_key_type(&found_key), found_key.objectid, objectid);
 			goto fail;
 		}
 		csum_offset = (offset - found_key.offset) >>
@@ -86,7 +82,6 @@ printk("fail2 type %u %Lu %Lu\n", btrfs_key_type(&found_key), found_key.objectid
 		if (csum_offset >=
 		    btrfs_item_size(leaf->items + path->slots[0]) /
 		    sizeof(struct btrfs_csum_item)) {
-printk("fail3, csum offset %lu size %u\n", csum_offset, btrfs_item_size(leaf->items + path->slots[0]) / sizeof(struct btrfs_csum_item));
 			goto fail;
 		}
 	}
@@ -109,26 +104,18 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_key file_key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_csum_item *csum_item;
 
-	csum_item = btrfs_lookup_csum(trans, root, path, objectid, offset, 0);
-	if (IS_ERR(csum_item))
-		return PTR_ERR(csum_item);
 	file_key.objectid = objectid;
-	file_key.offset = btrfs_csum_extent_offset(csum_item);
+	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
-	btrfs_release_path(root, path);
-printk("lookup file extent searches for %Lu\n", file_key.offset);
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
-printk("ret is %d\n", ret);
 	return ret;
 }
 
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
-			  u64 extent_offset,
 			  char *data, size_t len)
 {
 	int ret;
@@ -151,10 +138,8 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-printk("searching for csum %Lu %Lu\n", objectid, offset);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
 				sizeof(struct btrfs_csum_item), 1);
-printk("ret %d\n", ret);
 	if (ret < 0)
 		goto fail;
 	if (ret == 0) {
@@ -167,15 +152,12 @@ printk("ret %d\n", ret);
 	path->slots[0]--;
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	btrfs_disk_key_to_cpu(&found_key, &leaf->items[path->slots[0]].key);
-printk("found key %Lu %Lu %u\n", found_key.objectid, found_key.offset, found_key.flags);
 	csum_offset = (offset - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
-printk("csum_offset %Lu\n", csum_offset);
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
 	    csum_offset >= MAX_CSUM_ITEMS(root)) {
 		btrfs_release_path(root, path);
-printk("insert1\n");
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
@@ -183,13 +165,11 @@ printk("insert1\n");
 		ret = btrfs_extend_item(trans, root, path,
 					sizeof(struct btrfs_csum_item));
 		BUG_ON(ret);
-printk("item extended\n");
 		goto csum;
 	}
 
 insert:
 	csum_offset = 0;
-printk("inserting item %Lu %Lu %u\n", file_key.objectid, file_key.offset, file_key.flags);
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(struct btrfs_csum_item));
 	if (ret != 0 && ret != -EEXIST)
@@ -201,12 +181,10 @@ csum:
 	item += csum_offset;
 found:
 	ret = btrfs_csum_data(root, data, len, item->csum);
-	btrfs_set_csum_extent_offset(item, extent_offset);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-printk("return ret %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6a56416147e..b2a2220d135 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -524,7 +524,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int advance;
 	unsigned char d_type = DT_UNKNOWN;
 	int over = 0;
-	int key_type = BTRFS_DIR_INDEX_KEY;
+	int key_type = BTRFS_DIR_ITEM_KEY;
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
@@ -561,9 +561,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		item = leaf->items + slot;
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
-		if (key_type == BTRFS_DIR_INDEX_KEY &&
-		    btrfs_disk_key_offset(&item->key) > root->highest_inode)
-			break;
 		if (btrfs_disk_key_type(&item->key) != key_type)
 			continue;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
@@ -1119,7 +1116,6 @@ allocate:
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-printk("mapping iblock %lu to %lu\n", iblock, result->b_blocknr);
 	if (trans)
 		btrfs_end_transaction(trans, root);
 	return err;
@@ -1233,7 +1229,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct file *file,
 				   struct page **pages,
 				   size_t num_pages,
-				   u64 extent_offset,
 				   loff_t pos,
 				   size_t write_bytes)
 {
@@ -1253,7 +1248,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_csum_file_block(trans, root, inode->i_ino,
 				      pages[i]->index << PAGE_CACHE_SHIFT,
-				      extent_offset,
 				      kmap(pages[i]), PAGE_CACHE_SIZE);
 		kunmap(pages[i]);
 		SetPageChecked(pages[i]);
@@ -1275,86 +1269,6 @@ failed:
 	return err;
 }
 
-static int drop_csums(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct inode *inode,
-			  u64 start, u64 end)
-{
-	struct btrfs_path *path;
-	struct btrfs_leaf *leaf;
-	struct btrfs_key key;
-	int slot;
-	struct btrfs_csum_item *item;
-	char *old_block = NULL;
-	u64 cur = start;
-	u64 found_end;
-	u64 num_csums;
-	u64 item_size;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	while(cur < end) {
-		item = btrfs_lookup_csum(trans, root, path,
-					 inode->i_ino, cur, 1);
-		if (IS_ERR(item)) {
-			cur += root->blocksize;
-			continue;
-		}
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		slot = path->slots[0];
-		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
-		item_size = btrfs_item_size(leaf->items + slot);
-		num_csums = item_size / sizeof(struct btrfs_csum_item);
-		found_end = key.offset + (num_csums << inode->i_blkbits);
-		cur = found_end;
-
-		if (found_end > end) {
-			char *src;
-			old_block = kmalloc(root->blocksize, GFP_NOFS);
-			src = btrfs_item_ptr(leaf, slot, char);
-			memcpy(old_block, src, item_size);
-		}
-		if (key.offset < start) {
-			u64 new_size = (start - key.offset) >>
-					inode->i_blkbits;
-			new_size *= sizeof(struct btrfs_csum_item);
-			ret = btrfs_truncate_item(trans, root, path, new_size);
-			BUG_ON(ret);
-		} else {
-			btrfs_del_item(trans, root, path);
-		}
-		btrfs_release_path(root, path);
-		if (found_end > end) {
-			char *dst;
-			int i;
-			int new_size;
-
-			num_csums = (found_end - end) >> inode->i_blkbits;
-			new_size = num_csums * sizeof(struct btrfs_csum_item);
-			key.offset = end;
-			ret = btrfs_insert_empty_item(trans, root, path,
-						      &key, new_size);
-			BUG_ON(ret);
-			dst = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-					     path->slots[0], char);
-			memcpy(dst, old_block + item_size - new_size,
-			       new_size);
-			item = (struct btrfs_csum_item *)dst;
-			for (i = 0; i < num_csums; i++) {
-				btrfs_set_csum_extent_offset(item, end);
-				item++;
-			}
-			mark_buffer_dirty(path->nodes[0]);
-			kfree(old_block);
-			break;
-		}
-	}
-	btrfs_free_path(path);
-	return 0;
-}
-
 static int drop_extents(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct inode *inode,
@@ -1376,12 +1290,16 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 search_again:
-printk("drop extent inode %lu start %Lu end %Lu\n", inode->i_ino, start, end);
 	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 				       search_start, -1);
-	if (ret != 0) {
-printk("lookup failed\n");
+	if (ret < 0)
 		goto out;
+	if (ret > 0) {
+		if (path->slots[0] == 0) {
+			ret = -ENOENT;
+			goto out;
+		}
+		path->slots[0]--;
 	}
 	while(1) {
 		keep = 0;
@@ -1390,14 +1308,11 @@ printk("lookup failed\n");
 		slot = path->slots[0];
 		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
 
-printk("found key %Lu %Lu %u\n", key.objectid, key.offset, key.flags);
-
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
 		extent_end = key.offset +
 			(btrfs_file_extent_num_blocks(extent) <<
 			 inode->i_blkbits);
-printk("extent end is %Lu\n", extent_end);
 		if (key.offset >= end || key.objectid != inode->i_ino) {
 			ret = 0;
 			goto out;
@@ -1420,16 +1335,12 @@ printk("extent end is %Lu\n", extent_end);
 			keep = 1;
 			WARN_ON(start & (root->blocksize - 1));
 			new_num = (start - key.offset) >> inode->i_blkbits;
-printk("truncating existing extent, was %Lu ", btrfs_file_extent_num_blocks(extent));
 			btrfs_set_file_extent_num_blocks(extent, new_num);
-printk("now %Lu\n", btrfs_file_extent_num_blocks(extent));
-
 			mark_buffer_dirty(path->nodes[0]);
 		}
 		if (!keep) {
 			u64 disk_blocknr;
 			u64 disk_num_blocks;
-printk("del old\n");
 			disk_blocknr = btrfs_file_extent_disk_blocknr(extent);
 			disk_num_blocks =
 				btrfs_file_extent_disk_num_blocks(extent);
@@ -1454,15 +1365,12 @@ printk("del old\n");
 		if (bookend) {
 			/* create bookend */
 			struct btrfs_key ins;
-printk("bookend! extent end %Lu\n", extent_end);
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
 			ins.flags = 0;
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 
 			btrfs_release_path(root, path);
-			ret = drop_csums(trans, root, inode, start, end);
-			BUG_ON(ret);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
 			BUG_ON(ret);
@@ -1486,10 +1394,9 @@ printk("bookend! extent end %Lu\n", extent_end);
 
 			btrfs_set_file_extent_generation(extent,
 				    btrfs_file_extent_generation(&old));
-printk("new bookend at offset %Lu, file_extent_offset %Lu, file_extent_num_blocks %Lu\n", end, btrfs_file_extent_offset(extent), btrfs_file_extent_num_blocks(extent));
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 			ret = 0;
-			goto out_nocsum;
+			goto out;
 		}
 next_leaf:
 		if (slot >= btrfs_header_nritems(&leaf->header) - 1) {
@@ -1504,10 +1411,6 @@ next_leaf:
 	}
 
 out:
-	ret = drop_csums(trans, root, inode, start, end);
-	BUG_ON(ret);
-
-out_nocsum:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1556,7 +1459,6 @@ static int prepare_pages(struct btrfs_root *root,
 		head = page_buffers(pages[i]);
 		bh = head;
 		do {
-printk("mapping page %lu to block %Lu\n", pages[i]->index, alloc_extent_start);
 			err = btrfs_map_bh_to_logical(root, bh,
 						      alloc_extent_start);
 			BUG_ON(err);
@@ -1597,7 +1499,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	u64 start_pos;
 	u64 num_blocks;
 	u64 alloc_extent_start;
-	u64 orig_extent_start;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 
@@ -1640,7 +1541,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 				   (pos + count + root->blocksize -1) &
 				   ~(root->blocksize - 1));
 	}
-	orig_extent_start = start_pos;
 	ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
 				 (u64)-1, &ins);
 	BUG_ON(ret);
@@ -1656,7 +1556,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
-printk("num_pages is %lu\n", num_pages);
 
 		memset(pages, 0, sizeof(pages));
 		ret = prepare_pages(root, file, pages, num_pages,
@@ -1670,10 +1569,8 @@ printk("num_pages is %lu\n", num_pages);
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
 
-printk("2num_pages is %lu\n", num_pages);
 		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, orig_extent_start,
-					      pos, write_bytes);
+					      num_pages, pos, write_bytes);
 		BUG_ON(ret);
 		btrfs_drop_pages(pages, num_pages);
 
-- 
cgit v1.2.3


From a429e51371eee3c989160c003ee40bc3947c6a76 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 18 Apr 2007 16:15:28 -0400
Subject: Btrfs: working file_write, reorganized key flags

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  30 +++++++++++--
 fs/btrfs/ctree.h       |  52 +++++++++++-----------
 fs/btrfs/dir-item.c    |  70 -----------------------------
 fs/btrfs/extent-tree.c |   4 +-
 fs/btrfs/file-item.c   |  66 +++++++++++++++++++--------
 fs/btrfs/print-tree.c  |   2 +-
 fs/btrfs/super.c       | 118 ++++++++++++++++++++++++++-----------------------
 7 files changed, 167 insertions(+), 175 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5c160c23677..4efcd1bd63e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -893,7 +893,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	left_nritems = btrfs_header_nritems(&left->header);
-	for (i = left_nritems - 1; i >= 0; i--) {
+	if (left_nritems == 0) {
+		btrfs_block_release(root, right_buf);
+		return 1;
+	}
+	for (i = left_nritems - 1; i >= 1; i--) {
 		item = left->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
@@ -907,6 +911,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_block_release(root, right_buf);
 		return 1;
 	}
+	if (push_items == left_nritems)
+		WARN_ON(1);
 	right_nritems = btrfs_header_nritems(&right->header);
 	/* push left to right */
 	push_space = btrfs_item_end(left->items + left_nritems - push_items);
@@ -943,6 +949,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_mark_buffer_dirty(left_buf);
 	btrfs_mark_buffer_dirty(right_buf);
+
 	btrfs_memcpy(root, upper_node, &upper_node->ptrs[slot + 1].key,
 		&right->items[0].key, sizeof(struct btrfs_disk_key));
 	btrfs_mark_buffer_dirty(upper);
@@ -1004,7 +1011,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
-	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
+	if (btrfs_header_nritems(&right->header) == 0) {
+		btrfs_block_release(root, t);
+		return 1;
+	}
+
+	for (i = 0; i < btrfs_header_nritems(&right->header) - 1; i++) {
 		item = right->items + i;
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
@@ -1018,6 +1030,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_block_release(root, t);
 		return 1;
 	}
+	if (push_items == btrfs_header_nritems(&right->header))
+		WARN_ON(1);
 	/* push data from right to left */
 	btrfs_memcpy(root, left, left->items +
 		     btrfs_header_nritems(&left->header),
@@ -1064,7 +1078,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_mark_buffer_dirty(t);
 	btrfs_mark_buffer_dirty(right_buf);
-
 	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
 		ret = wret;
@@ -1181,6 +1194,12 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				path->nodes[0] = right_buffer;
 				path->slots[0] = 0;
 				path->slots[1] -= 1;
+				if (path->slots[1] == 0) {
+					wret = fixup_low_keys(trans, root,
+					           path, &disk_key, 1);
+					if (wret)
+						ret = wret;
+				}
 				return ret;
 			}
 			mid = slot;
@@ -1241,6 +1260,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			  path->slots[1], 1);
 	if (wret)
 		ret = wret;
+	if (path->slots[1] == 0) {
+		wret = fixup_low_keys(trans, root, path, &disk_key, 1);
+		if (wret)
+			ret = wret;
+	}
 	btrfs_block_release(root, path->nodes[0]);
 	path->nodes[0] = right_buffer;
 	path->slots[0] = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d75a4d5bc01..8a329d3901a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -282,11 +282,12 @@ struct btrfs_root {
 
 /* the lower bits in the key flags defines the item type */
 #define BTRFS_KEY_TYPE_MAX	256
-#define BTRFS_KEY_TYPE_MASK	(BTRFS_KEY_TYPE_MAX - 1)
+#define BTRFS_KEY_TYPE_SHIFT	24
+#define BTRFS_KEY_TYPE_MASK	(((u32)BTRFS_KEY_TYPE_MAX - 1) << \
+				  BTRFS_KEY_TYPE_SHIFT)
 
 #define BTRFS_KEY_OVERFLOW_MAX 128
-#define BTRFS_KEY_OVERFLOW_SHIFT 8
-#define BTRFS_KEY_OVERFLOW_MASK (0x7FULL << BTRFS_KEY_OVERFLOW_SHIFT)
+#define BTRFS_KEY_OVERFLOW_MASK ((u32)BTRFS_KEY_OVERFLOW_MAX - 1)
 
 /*
  * inode items have the data typically returned from stat and store other
@@ -586,56 +587,55 @@ static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk,
 	disk->flags = cpu_to_le32(val);
 }
 
-static inline u32 btrfs_key_overflow(struct btrfs_key *key)
+static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key)
 {
-	u32 over = key->flags & BTRFS_KEY_OVERFLOW_MASK;
-	return over >> BTRFS_KEY_OVERFLOW_SHIFT;
+	return le32_to_cpu(key->flags) >> BTRFS_KEY_TYPE_SHIFT;
 }
 
-static inline void btrfs_set_key_overflow(struct btrfs_key *key, u32 over)
+static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key,
+					       u32 val)
 {
-	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
-	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
-	key->flags = (key->flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
+	u32 flags = btrfs_disk_key_flags(key);
+	BUG_ON(val >= BTRFS_KEY_TYPE_MAX);
+	val = val << BTRFS_KEY_TYPE_SHIFT;
+	flags = (flags & ~BTRFS_KEY_TYPE_MASK) | val;
+	btrfs_set_disk_key_flags(key, flags);
 }
 
 static inline u32 btrfs_key_type(struct btrfs_key *key)
 {
-	return key->flags & BTRFS_KEY_TYPE_MASK;
+	return key->flags >> BTRFS_KEY_TYPE_SHIFT;
 }
 
-static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key)
+static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val)
 {
-	return le32_to_cpu(key->flags) & BTRFS_KEY_TYPE_MASK;
+	BUG_ON(val >= BTRFS_KEY_TYPE_MAX);
+	val = val << BTRFS_KEY_TYPE_SHIFT;
+	key->flags = (key->flags & ~(BTRFS_KEY_TYPE_MASK)) | val;
 }
 
-static inline void btrfs_set_key_type(struct btrfs_key *key, u32 type)
+static inline u32 btrfs_key_overflow(struct btrfs_key *key)
 {
-	BUG_ON(type >= BTRFS_KEY_TYPE_MAX);
-	key->flags = (key->flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | type;
+	return key->flags & BTRFS_KEY_OVERFLOW_MASK;
 }
 
-static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key, u32 type)
+static inline void btrfs_set_key_overflow(struct btrfs_key *key, u32 over)
 {
-	u32 flags = btrfs_disk_key_flags(key);
-	BUG_ON(type >= BTRFS_KEY_TYPE_MAX);
-	flags = (flags & ~((u64)BTRFS_KEY_TYPE_MASK)) | type;
-	btrfs_set_disk_key_flags(key, flags);
+	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
+	key->flags = (key->flags & ~BTRFS_KEY_OVERFLOW_MASK) | over;
 }
 
 static inline u32 btrfs_disk_key_overflow(struct btrfs_disk_key *key)
 {
-	u32 over = le32_to_cpu(key->flags) & BTRFS_KEY_OVERFLOW_MASK;
-	return over >> BTRFS_KEY_OVERFLOW_SHIFT;
+	return le32_to_cpu(key->flags) & BTRFS_KEY_OVERFLOW_MASK;
 }
 
-static inline void btrfs_set_disK_key_overflow(struct btrfs_disk_key *key,
+static inline void btrfs_set_disk_key_overflow(struct btrfs_disk_key *key,
 					       u32 over)
 {
 	u32 flags = btrfs_disk_key_flags(key);
 	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
-	over = over << BTRFS_KEY_OVERFLOW_SHIFT;
-	flags = (flags & ~((u64)BTRFS_KEY_OVERFLOW_MASK)) | over;
+	flags = (flags & ~BTRFS_KEY_OVERFLOW_MASK) | over;
 	btrfs_set_disk_key_flags(key, flags);
 }
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index cd4137a8b87..a43deb72648 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -58,30 +58,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-
-	/* FIXME, use some real flag for selecting the extra index */
-	if (root == root->fs_info->tree_root)
-		goto out;
-
-	btrfs_release_path(root, path);
-
-	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = location->objectid;
-	ret = insert_with_overflow(trans, root, path, &key, data_size);
-	// FIXME clear the dirindex bit
-	if (ret)
-		goto out;
-
-	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_dir_item);
-	btrfs_cpu_key_to_disk(&dir_item->location, location);
-	btrfs_set_dir_type(dir_item, type);
-	btrfs_set_dir_flags(dir_item, 0);
-	btrfs_set_dir_name_len(dir_item, name_len);
-	name_ptr = (char *)(dir_item + 1);
-	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -135,52 +111,6 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 1;
 }
 
-int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path, u64 dir,
-				u64 objectid, int mod)
-{
-	int ret;
-	struct btrfs_key key;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	struct btrfs_disk_key *found_key;
-	struct btrfs_leaf *leaf;
-	int overflow = 0;
-
-	key.objectid = dir;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = objectid;
-
-	while(1) {
-		btrfs_set_key_overflow(&key, overflow);
-		ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-		if (ret < 0)
-			return ret;
-		if (ret > 0) {
-			if (overflow >= BTRFS_KEY_OVERFLOW_MAX)
-				return 1;
-			overflow++;
-			btrfs_set_key_overflow(&key, overflow);
-			btrfs_release_path(root, path);
-			continue;
-		} else {
-			/* found */
-			break;
-		}
-	}
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	found_key = &leaf->items[path->slots[0]].key;
-
-	if (btrfs_disk_key_objectid(found_key) != dir ||
-	    btrfs_disk_key_type(found_key) != BTRFS_DIR_INDEX_KEY)
-		return 1;
-	if (btrfs_disk_key_offset(found_key) == objectid)
-		return 0;
-	return 1;
-}
-
 int btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cee9df001f..cb04a70eb7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,8 +35,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	key.offset = num_blocks;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
-	if (ret != 0)
+	if (ret != 0) {
+printk("can't find block %Lu %Lu\n", blocknr, num_blocks);
 		BUG();
+	}
 	BUG_ON(ret != 0);
 	l = btrfs_buffer_leaf(path->nodes[0]);
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ff8f3339c68..8cc3c1d1541 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -4,7 +4,7 @@
 #include "transaction.h"
 
 #define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
-				 sizeof(struct btrfs_item)) / \
+				 sizeof(struct btrfs_item) * 2) / \
 				sizeof(struct btrfs_csum_item)) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -19,11 +19,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	/*
-	ret = btrfs_alloc_extent(trans, root, num_blocks, hint_block,
-				 (u64)-1, &ins);
-				 */
-	BUG_ON(ret);
 	file_key.objectid = objectid;
 	file_key.offset = pos;
 	file_key.flags = 0;
@@ -40,6 +35,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_num_blocks(item, num_blocks);
 	btrfs_set_file_extent_generation(item, trans->transid);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return 0;
@@ -57,6 +53,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item;
 	struct btrfs_leaf *leaf;
 	u64 csum_offset = 0;
+	int csums_in_item;
 
 	file_key.objectid = objectid;
 	file_key.offset = offset;
@@ -79,9 +76,11 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		}
 		csum_offset = (offset - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
-		if (csum_offset >=
-		    btrfs_item_size(leaf->items + path->slots[0]) /
-		    sizeof(struct btrfs_csum_item)) {
+		csums_in_item = btrfs_item_size(leaf->items + path->slots[0]);
+		csums_in_item /= sizeof(struct btrfs_csum_item);
+
+		if (csum_offset >= csums_in_item) {
+			ret = -EFBIG;
 			goto fail;
 		}
 	}
@@ -128,16 +127,36 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 
-	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 0);
-	if (!IS_ERR(item))
-		goto found;
-	btrfs_release_path(root, path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+
+	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
+	if (!IS_ERR(item))
+		goto found;
+	ret = PTR_ERR(item);
+	if (ret == -EFBIG) {
+		u32 item_size;
+		/* we found one, but it isn't big enough yet */
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		item_size = btrfs_item_size(leaf->items + path->slots[0]);
+		if ((item_size / sizeof(struct btrfs_csum_item)) >=
+		    MAX_CSUM_ITEMS(root)) {
+			/* already at max size, make a new one */
+			goto insert;
+		}
+	} else {
+		/* we didn't find a csum item, insert one */
+		goto insert;
+	}
+
+	/*
+	 * at this point, we know the tree has an item, but it isn't big
+	 * enough yet to put our csum in.  Grow it
+	 */
+	btrfs_release_path(root, path);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
 				sizeof(struct btrfs_csum_item), 1);
 	if (ret < 0)
@@ -146,7 +165,6 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 	if (path->slots[0] == 0) {
-		btrfs_release_path(root, path);
 		goto insert;
 	}
 	path->slots[0]--;
@@ -157,29 +175,36 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
 	    csum_offset >= MAX_CSUM_ITEMS(root)) {
-		btrfs_release_path(root, path);
+		WARN_ON(1);
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
 	    sizeof(struct btrfs_csum_item)) {
-		ret = btrfs_extend_item(trans, root, path,
-					sizeof(struct btrfs_csum_item));
+		u32 diff = (csum_offset + 1) * sizeof(struct btrfs_csum_item);
+		diff = diff - btrfs_item_size(leaf->items + path->slots[0]);
+		WARN_ON(diff != sizeof(struct btrfs_csum_item));
+		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
 		goto csum;
 	}
 
 insert:
+	btrfs_release_path(root, path);
 	csum_offset = 0;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(struct btrfs_csum_item));
-	if (ret != 0 && ret != -EEXIST)
+	if (ret != 0) {
+		printk("at insert for %Lu %u %Lu ret is %d\n", file_key.objectid, file_key.flags, file_key.offset, ret);
+		WARN_ON(1);
 		goto fail;
+	}
 csum:
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_csum_item);
 	ret = 0;
 	item += csum_offset;
 found:
+	btrfs_check_bounds(item->csum, BTRFS_CSUM_SIZE, path->nodes[0]->b_data, root->fs_info->sb->s_blocksize);
 	ret = btrfs_csum_data(root, data, len, item->csum);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
@@ -210,6 +235,9 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	item = btrfs_lookup_csum(NULL, root, path, objectid, offset, 0);
 	if (IS_ERR(item)) {
 		ret = PTR_ERR(item);
+		/* a csum that isn't present is a preallocated region. */
+		if (ret == -ENOENT || ret == -EFBIG)
+			ret = 1;
 		goto fail;
 	}
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0732a2fbb23..1e7038b070a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -19,7 +19,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
 		type = btrfs_disk_key_type(&item->key);
-		printk("\titem %d key (%Lu %u %Lu) itemoff %d itemsize %d\n",
+		printk("\titem %d key (%Lu %x %Lu) itemoff %d itemsize %d\n",
 			i,
 			btrfs_disk_key_objectid(&item->key),
 			btrfs_disk_key_flags(&item->key),
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b2a2220d135..583cd87a62e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -150,11 +150,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	btrfs_release_path(root, path);
-	ret = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-					  objectid, -1);
-	BUG_ON(ret);
-	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
 	btrfs_release_path(root, path);
@@ -329,8 +324,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			extent_start = btrfs_file_extent_disk_blocknr(fi);
 			extent_num_blocks =
 				btrfs_file_extent_disk_num_blocks(fi);
+			/* FIXME blocksize != 4096 */
 			inode->i_blocks -=
-				btrfs_file_extent_num_blocks(fi) >> 9;
+				btrfs_file_extent_num_blocks(fi) << 3;
 			found_extent = 1;
 		} else {
 			found_extent = 0;
@@ -562,7 +558,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
 			break;
 		if (btrfs_disk_key_type(&item->key) != key_type)
-			continue;
+			break;
 		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
 			continue;
 		filp->f_pos = btrfs_disk_key_offset(&item->key);
@@ -1285,29 +1281,27 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	u64 search_start = start;
 	int bookend;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-search_again:
-	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
-				       search_start, -1);
-	if (ret < 0)
-		goto out;
-	if (ret > 0) {
-		if (path->slots[0] == 0) {
-			ret = -ENOENT;
+	while(1) {
+		btrfs_release_path(root, path);
+		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+					       search_start, -1);
+		if (ret < 0)
 			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = -ENOENT;
+				goto out;
+			}
+			path->slots[0]--;
 		}
-		path->slots[0]--;
-	}
-	while(1) {
 		keep = 0;
 		bookend = 0;
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
-
 		extent = btrfs_item_ptr(leaf, slot,
 					struct btrfs_file_extent_item);
 		extent_end = key.offset +
@@ -1318,7 +1312,10 @@ search_again:
 			goto out;
 		}
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-			goto next_leaf;
+			goto out;
+		if (search_start >= extent_end)
+			goto out;
+		search_start = extent_end;
 
 		if (end < extent_end && end >= key.offset) {
 			memcpy(&old, extent, sizeof(old));
@@ -1331,10 +1328,13 @@ search_again:
 
 		if (start > key.offset) {
 			u64 new_num;
+			u64 old_num;
 			/* truncate existing extent */
 			keep = 1;
 			WARN_ON(start & (root->blocksize - 1));
 			new_num = (start - key.offset) >> inode->i_blkbits;
+			old_num = btrfs_file_extent_num_blocks(extent);
+			inode->i_blocks -= (old_num - new_num) << 3;
 			btrfs_set_file_extent_num_blocks(extent, new_num);
 			mark_buffer_dirty(path->nodes[0]);
 		}
@@ -1344,13 +1344,11 @@ search_again:
 			disk_blocknr = btrfs_file_extent_disk_blocknr(extent);
 			disk_num_blocks =
 				btrfs_file_extent_disk_num_blocks(extent);
-			search_start = key.offset +
-				(btrfs_file_extent_num_blocks(extent) <<
-				inode->i_blkbits);
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
+			inode->i_blocks -=
+				btrfs_file_extent_num_blocks(extent) << 3;
 			btrfs_release_path(root, path);
-
 			ret = btrfs_free_extent(trans, root, disk_blocknr,
 						disk_num_blocks, 0);
 
@@ -1360,7 +1358,7 @@ search_again:
 				goto out;
 			}
 			if (!bookend)
-				goto search_again;
+				continue;
 		}
 		if (bookend) {
 			/* create bookend */
@@ -1395,21 +1393,12 @@ search_again:
 			btrfs_set_file_extent_generation(extent,
 				    btrfs_file_extent_generation(&old));
 			btrfs_mark_buffer_dirty(path->nodes[0]);
+			inode->i_blocks +=
+				btrfs_file_extent_num_blocks(extent) << 3;
 			ret = 0;
 			goto out;
 		}
-next_leaf:
-		if (slot >= btrfs_header_nritems(&leaf->header) - 1) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret) {
-				ret = 0;
-				goto out;
-			}
-		} else {
-			path->slots[0]++;
-		}
 	}
-
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -1445,15 +1434,6 @@ static int prepare_pages(struct btrfs_root *root,
 		}
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
-#if 0
-		if ((pages[i]->index == first_index ||
-		    pages[i]->index == last_index) && pos < isize &&
-		    !PageUptodate(pages[i])) {
-			ret = mpage_readpage(pages[i], btrfs_get_block);
-			BUG_ON(ret);
-			lock_page(pages[i]);
-		}
-#endif
 		create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
 				     (1 << BH_Uptodate));
 		head = page_buffers(pages[i]);
@@ -1494,6 +1474,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *pages[8];
+	struct page *pinned[2] = { NULL, NULL };
 	unsigned long first_index;
 	unsigned long last_index;
 	u64 start_pos;
@@ -1505,14 +1486,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
 	pos = *ppos;
-
-	start_pos = pos & ~(root->blocksize - 1);
-	/* FIXME */
-	if (start_pos != pos)
-		return -EINVAL;
-	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
-			inode->i_blkbits;
-
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
@@ -1524,10 +1497,37 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if (err)
 		goto out;
 	file_update_time(file);
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
+
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
+	if ((first_index << PAGE_CACHE_SHIFT) < inode->i_size &&
+	    (pos & (PAGE_CACHE_SIZE - 1))) {
+		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
+		if (!PageUptodate(pinned[0])) {
+			ret = mpage_readpage(pinned[0], btrfs_get_block);
+			BUG_ON(ret);
+		} else {
+			unlock_page(pinned[0]);
+		}
+	}
+	if (first_index != last_index &&
+	    (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
+	    (count & (PAGE_CACHE_SIZE - 1))) {
+		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
+		if (!PageUptodate(pinned[1])) {
+			ret = mpage_readpage(pinned[1], btrfs_get_block);
+			BUG_ON(ret);
+		} else {
+			unlock_page(pinned[1]);
+		}
+	}
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
@@ -1535,11 +1535,14 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		mutex_unlock(&root->fs_info->fs_mutex);
 		goto out_unlock;
 	}
+	/* FIXME blocksize != 4096 */
+	inode->i_blocks += num_blocks << 3;
 	if (start_pos < inode->i_size) {
+		/* FIXME blocksize != pagesize */
 		ret = drop_extents(trans, root, inode,
 				   start_pos,
 				   (pos + count + root->blocksize -1) &
-				   ~(root->blocksize - 1));
+				   ~((u64)root->blocksize - 1));
 	}
 	ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
 				 (u64)-1, &ins);
@@ -1585,8 +1588,13 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
+	if (pinned[0])
+		page_cache_release(pinned[0]);
+	if (pinned[1])
+		page_cache_release(pinned[1]);
 	*ppos = pos;
 	current->backing_dev_info = NULL;
+	mark_inode_dirty(inode);
 	return num_written ? num_written : err;
 }
 
-- 
cgit v1.2.3


From 236454dfffb64a95ee01c50a215153f5de61c475 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 19 Apr 2007 13:37:44 -0400
Subject: Btrfs: many file_write fixes, inline data

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  35 ++++
 fs/btrfs/disk-io.c     |   6 +
 fs/btrfs/extent-tree.c |   5 +
 fs/btrfs/file-item.c   |   1 +
 fs/btrfs/super.c       | 553 ++++++++++++++++++++++++++++++++++++++-----------
 5 files changed, 484 insertions(+), 116 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8a329d3901a..f1800008be3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -73,6 +73,9 @@ struct btrfs_header {
 			       (sizeof(struct btrfs_disk_key) + sizeof(u64)))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize))
+#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+					sizeof(struct btrfs_item) - \
+					sizeof(struct btrfs_file_extent_item))
 
 struct buffer_head;
 /*
@@ -204,8 +207,12 @@ struct btrfs_root_item {
 	__le32 refs;
 } __attribute__ ((__packed__));
 
+#define BTRFS_FILE_EXTENT_REG 0
+#define BTRFS_FILE_EXTENT_INLINE 1
+
 struct btrfs_file_extent_item {
 	__le64 generation;
+	u8 type;
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -862,6 +869,34 @@ static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 	return (u8 *)l->items;
 }
 
+static inline int btrfs_file_extent_type(struct btrfs_file_extent_item *e)
+{
+	return e->type;
+}
+static inline void btrfs_set_file_extent_type(struct btrfs_file_extent_item *e,
+					      u8 val)
+{
+	e->type = val;
+}
+
+static inline char *btrfs_file_extent_inline_start(struct
+						   btrfs_file_extent_item *e)
+{
+	return (char *)(&e->disk_blocknr);
+}
+
+static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
+{
+	return (unsigned long)(&((struct
+		  btrfs_file_extent_item *)NULL)->disk_blocknr) + datasize;
+}
+
+static inline u32 btrfs_file_extent_inline_len(struct btrfs_item *e)
+{
+	struct btrfs_file_extent_item *fe = NULL;
+	return btrfs_item_size(e) - (unsigned long)(&fe->disk_blocknr);
+}
+
 static inline u64 btrfs_file_extent_disk_blocknr(struct btrfs_file_extent_item
 						 *e)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a2a3f529cad..11e17a2f736 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -108,6 +108,12 @@ int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 
 	int ret;
 
+	if (logical == 0) {
+		bh->b_bdev = NULL;
+		bh->b_blocknr = 0;
+		set_buffer_mapped(bh);
+		return 0;
+	}
 	root = root->fs_info->dev_root;
 	ret = radix_tree_gang_lookup(&root->fs_info->dev_radix,
 				     (void **)lookup,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cb04a70eb7e..b2faad3e879 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 			fi = btrfs_item_ptr(buf_leaf, i,
 					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
 			ret = btrfs_inc_extent_ref(trans, root,
 				    btrfs_file_extent_disk_blocknr(fi),
 				    btrfs_file_extent_disk_num_blocks(fi));
@@ -539,6 +542,8 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(fi) == BTRFS_FILE_EXTENT_INLINE)
+			continue;
 		/*
 		 * FIXME make sure to insert a trans record that
 		 * repeats the snapshot del on crash
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8cc3c1d1541..10e4cf08e9e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,6 +34,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_offset(item, 0);
 	btrfs_set_file_extent_num_blocks(item, num_blocks);
 	btrfs_set_file_extent_generation(item, trans->transid);
+	btrfs_set_file_extent_type(item, BTRFS_FILE_EXTENT_REG);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 583cd87a62e..1b286bb26f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -317,19 +317,22 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
 			break;
+		found_extent = 0;
 		if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
 			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 					    path->slots[0],
 					    struct btrfs_file_extent_item);
-			extent_start = btrfs_file_extent_disk_blocknr(fi);
-			extent_num_blocks =
-				btrfs_file_extent_disk_num_blocks(fi);
-			/* FIXME blocksize != 4096 */
-			inode->i_blocks -=
-				btrfs_file_extent_num_blocks(fi) << 3;
-			found_extent = 1;
-		} else {
-			found_extent = 0;
+			if (btrfs_file_extent_type(fi) !=
+			    BTRFS_FILE_EXTENT_INLINE) {
+				extent_start =
+					btrfs_file_extent_disk_blocknr(fi);
+				extent_num_blocks =
+					btrfs_file_extent_disk_num_blocks(fi);
+				/* FIXME blocksize != 4096 */
+				inode->i_blocks -=
+					btrfs_file_extent_num_blocks(fi) << 3;
+				found_extent = 1;
+			}
 		}
 		ret = btrfs_del_item(trans, root, path);
 		BUG_ON(ret);
@@ -1010,9 +1013,9 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
+	u32 found_type;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_leaf *leaf;
 	struct btrfs_disk_key *found_key;
@@ -1021,13 +1024,12 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	BUG_ON(!path);
 	btrfs_init_path(path);
 	if (create) {
-		trans = btrfs_start_transaction(root, 1);
 		WARN_ON(1);
 	}
 
-	ret = btrfs_lookup_file_extent(trans, root, path,
+	ret = btrfs_lookup_file_extent(NULL, root, path,
 				       inode->i_ino,
-				       iblock << inode->i_blkbits, create);
+				       iblock << inode->i_blkbits, 0);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -1036,7 +1038,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (ret != 0) {
 		if (path->slots[0] == 0) {
 			btrfs_release_path(root, path);
-			goto allocate;
+			goto out;
 		}
 		path->slots[0]--;
 	}
@@ -1047,73 +1049,51 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	blocknr = btrfs_file_extent_disk_blocknr(item);
 	blocknr += btrfs_file_extent_offset(item);
 
-	/* exact match found, use it, FIXME, deal with extents
-	 * other than the page size
-	 */
-	if (0 && ret == 0) {
-		err = 0;
-		if (create &&
-		    btrfs_file_extent_generation(item) != trans->transid) {
-			struct btrfs_key ins;
-			ret = btrfs_alloc_extent(trans, root, 1,
-						 blocknr, (u64)-1, &ins);
-			BUG_ON(ret);
-			btrfs_set_file_extent_disk_blocknr(item, ins.objectid);
-			mark_buffer_dirty(path->nodes[0]);
-			ret = btrfs_free_extent(trans, root,
-						blocknr, 1, 0);
-			BUG_ON(ret);
-			blocknr = ins.objectid;
-
-		}
-		btrfs_map_bh_to_logical(root, result, blocknr);
-		goto out;
-	}
-
 	/* are we inside the extent that was found? */
 	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
 	if (btrfs_disk_key_objectid(found_key) != objectid ||
-	    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY) {
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
 		extent_end = 0;
 		extent_start = 0;
 		btrfs_release_path(root, path);
-		goto allocate;
-	}
-
-	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
-	extent_start = extent_start >> inode->i_blkbits;
-	extent_end = extent_start + btrfs_file_extent_num_blocks(item);
-	if (iblock >= extent_start && iblock < extent_end) {
-		err = 0;
-		btrfs_map_bh_to_logical(root, result, blocknr + iblock -
-					extent_start);
-		goto out;
-	}
-allocate:
-	/* ok, create a new extent */
-	if (!create) {
-		err = 0;
 		goto out;
 	}
-#if 0
-	ret = btrfs_alloc_file_extent(trans, root, objectid,
-				      iblock << inode->i_blkbits,
-				      1, extent_end, &blocknr);
-	if (ret) {
-		err = ret;
-		goto out;
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_start = extent_start >> inode->i_blkbits;
+		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
+		if (iblock >= extent_start && iblock < extent_end) {
+			err = 0;
+			btrfs_map_bh_to_logical(root, result, blocknr +
+						iblock - extent_start);
+			goto out;
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = (extent_start + size) >> inode->i_blkbits;
+		extent_start >>= inode->i_blkbits;
+		if (iblock < extent_start || iblock > extent_end) {
+			goto out;
+		}
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(result->b_page);
+		memcpy(map, ptr, size);
+		memset(map + size, 0, PAGE_CACHE_SIZE - size);
+		flush_dcache_page(result->b_page);
+		kunmap(result->b_page);
+		set_buffer_uptodate(result);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, 0);
 	}
-	inode->i_blocks += inode->i_sb->s_blocksize >> 9;
-	set_buffer_new(result);
-	map_bh(result, inode->i_sb, blocknr);
-
-	btrfs_map_bh_to_logical(root, result, blocknr);
-#endif
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	if (trans)
-		btrfs_end_transaction(trans, root);
 	return err;
 }
 
@@ -1124,7 +1104,6 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_get_block_lock(inode, iblock, result, create);
-	// err = btrfs_get_block_inline(inode, iblock, result, create);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
@@ -1134,11 +1113,6 @@ static int btrfs_prepare_write(struct file *file, struct page *page,
 {
 	return nobh_prepare_write(page, from, to, btrfs_get_block);
 }
-static int btrfs_commit_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	return nobh_commit_write(file, page, from, to);
-}
 
 static void btrfs_write_super(struct super_block *sb)
 {
@@ -1150,9 +1124,227 @@ static int btrfs_readpage(struct file *file, struct page *page)
 	return mpage_readpage(page, btrfs_get_block);
 }
 
+/*
+ * Write out the dirty buffers of a locked page.  Whoever dirtied the
+ * buffers may decide to clean them again at any time while we run, so
+ * the dirty bit is only tested after the buffer has been locked.  On the
+ * normal path a dirty buffer mapped to block 0 is cleaned, not submitted.
+ *
+ * For a non-blocking WB_SYNC_NONE pass (wbc->nonblocking set) a buffer
+ * that is already locked is not waited on; the page is redirtied instead.
+ * A locked buffer can only be seen here if someone has written the buffer
+ * directly, with submit_bh().  At the address_space level PageWriteback
+ * prevents this contention from occurring.
+ */
+static int __btrfs_write_full_page(struct inode *inode, struct page *page,
+				   struct writeback_control *wbc)
+{
+	int err;
+	sector_t block;
+	sector_t last_block;
+	struct buffer_head *bh, *head;
+	const unsigned blocksize = 1 << inode->i_blkbits;
+	int nr_underway = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, blocksize,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+
+	/*
+	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+	 * here, and the (potentially unmapped) buffers may become dirty at
+	 * any time.  If a buffer becomes dirty here after we've inspected it
+	 * then we just miss that fact, and the page stays dirty.
+	 *
+	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+	 * handle that here by just cleaning them.
+	 */
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	head = page_buffers(page);
+	bh = head;
+
+	/*
+	 * Get all the dirty buffers mapped to disk addresses and
+	 * handle any aliases from the underlying blockdev's mapping.
+	 */
+	do {
+		if (block > last_block) {
+			/*
+			 * mapped buffers outside i_size will occur, because
+			 * this page can be outside i_size when there is a
+			 * truncate in progress.
+			 */
+			/*
+			 * The buffer was zeroed by btrfs_writepage()
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = btrfs_get_block(inode, block, bh, 0);
+			if (err)
+				goto recover;
+			if (buffer_new(bh)) {
+				/* blockdev mappings never come here */
+				clear_buffer_new(bh);
+				unmap_underlying_metadata(bh->b_bdev,
+							bh->b_blocknr);
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else if (test_set_buffer_locked(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	err = 0;
+done:
+	if (nr_underway == 0) {
+		/*
+		 * Nothing was submitted: the page was marked dirty, but no
+		 * buffer needed I/O (e.g. someone wrote them back by hand
+		 * with ll_rw_block/submit_bh).  End writeback ourselves.
+		 */
+		int uptodate = 1;
+		do {
+			if (!buffer_uptodate(bh)) {
+				uptodate = 0;
+				break;
+			}
+			bh = bh->b_this_page;
+		} while (bh != head);
+		if (uptodate)
+			SetPageUptodate(page);
+		end_page_writeback(page);
+		/*
+		 * The page and buffer_heads can be released at any time from
+		 * here on.
+		 */
+		wbc->pages_skipped++;	/* We didn't write this page */
+	}
+	return err;
+
+recover:
+	/*
+	 * btrfs_get_block() failed.  Buffers mapped before the failure
+	 * are still written out so that stale data is not exposed;
+	 * err keeps the failure code and is returned via done.
+	 * The page is currently locked and not marked for writeback
+	 */
+	bh = head;
+	/* Recovery: lock and submit the mapped buffers */
+	do {
+		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+			lock_buffer(bh);
+			mark_buffer_async_write(bh);
+		} else {
+			/*
+			 * The buffer may have been set dirty during
+			 * attachment to a dirty page.
+			 */
+			clear_buffer_dirty(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+	goto done;
+}
+
+/*
+ * btrfs ->writepage: pages past i_size are dropped, the rest written out
+ */
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	return nobh_writepage(page, btrfs_get_block, wbc);
+	struct inode * const inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	void *kaddr;
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		return __btrfs_write_full_page(inode, page, wbc);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1);
+	if (page->index >= end_index+1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  Drop them
+		 * with block_invalidatepage() so they become freeable and
+		 * the page does not leak.
+		 */
+		block_invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the  page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	return __btrfs_write_full_page(inode, page, wbc);
 }
 
 static void btrfs_truncate(struct inode *inode)
@@ -1179,6 +1371,29 @@ static void btrfs_truncate(struct inode *inode)
 	mark_inode_dirty(inode);
 }
 
+/*
+ * Mark the page uptodate, dirty it only when its first buffer is mapped to a
+ * non-zero block, and grow i_size if the write ended past the current size.
+ */
+static int btrfs_commit_write(struct file *file, struct page *page,
+			      unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	SetPageUptodate(page);
+	bh = page_buffers(page);
+	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+		set_page_dirty(page);
+	}
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+
 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
 				struct page **prepared_pages,
 				const char __user * buf)
@@ -1234,6 +1449,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int ret;
 	int this_write;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct buffer_head *bh;
+	struct btrfs_file_extent_item *ei;
 
 	for (i = 0; i < num_pages; i++) {
 		offset = pos & (PAGE_CACHE_SIZE -1);
@@ -1242,16 +1459,47 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
-		btrfs_csum_file_block(trans, root, inode->i_ino,
+
+		bh = page_buffers(pages[i]);
+		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
+			struct btrfs_key key;
+			struct btrfs_path *path;
+			char *ptr;
+			u32 datasize;
+
+			path = btrfs_alloc_path();
+			BUG_ON(!path);
+			key.objectid = inode->i_ino;
+			key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
+			key.flags = 0;
+			btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+			BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
+			datasize = offset +
+				btrfs_file_extent_calc_inline_size(write_bytes);
+			ret = btrfs_insert_empty_item(trans, root, path, &key,
+						      datasize);
+			BUG_ON(ret);
+			ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			       path->slots[0], struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(ei, trans->transid);
+			btrfs_set_file_extent_type(ei,
+						   BTRFS_FILE_EXTENT_INLINE);
+			ptr = btrfs_file_extent_inline_start(ei);
+			memcpy(ptr, bh->b_data, offset + write_bytes);
+			mark_buffer_dirty(path->nodes[0]);
+			btrfs_free_path(path);
+		} else {
+			btrfs_csum_file_block(trans, root, inode->i_ino,
 				      pages[i]->index << PAGE_CACHE_SHIFT,
 				      kmap(pages[i]), PAGE_CACHE_SIZE);
-		kunmap(pages[i]);
+			kunmap(pages[i]);
+		}
 		SetPageChecked(pages[i]);
 		ret = btrfs_end_transaction(trans, root);
 		BUG_ON(ret);
 		mutex_unlock(&root->fs_info->fs_mutex);
 
-		ret = nobh_commit_write(file, pages[i], offset,
+		ret = btrfs_commit_write(file, pages[i], offset,
 					 offset + this_write);
 		pos += this_write;
 		if (ret) {
@@ -1275,12 +1523,16 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 	struct btrfs_leaf *leaf;
 	int slot;
 	struct btrfs_file_extent_item *extent;
-	u64 extent_end;
+	u64 extent_end = 0;
 	int keep;
 	struct btrfs_file_extent_item old;
 	struct btrfs_path *path;
 	u64 search_start = start;
 	int bookend;
+	int found_type;
+	int found_extent;
+	int found_inline;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1292,37 +1544,62 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			goto out;
 		if (ret > 0) {
 			if (path->slots[0] == 0) {
-				ret = -ENOENT;
+				ret = 0;
 				goto out;
 			}
 			path->slots[0]--;
 		}
 		keep = 0;
 		bookend = 0;
+		found_extent = 0;
+		found_inline = 0;
+		extent = NULL;
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
-		extent = btrfs_item_ptr(leaf, slot,
-					struct btrfs_file_extent_item);
-		extent_end = key.offset +
-			(btrfs_file_extent_num_blocks(extent) <<
-			 inode->i_blkbits);
 		if (key.offset >= end || key.objectid != inode->i_ino) {
 			ret = 0;
 			goto out;
 		}
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
+			ret = 0;
+			goto out;
+		}
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(extent);
+		if (found_type == BTRFS_FILE_EXTENT_REG) {
+			extent_end = key.offset +
+				(btrfs_file_extent_num_blocks(extent) <<
+				 inode->i_blkbits);
+			found_extent = 1;
+		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+			found_inline = 1;
+			extent_end = key.offset +
+			     btrfs_file_extent_inline_len(leaf->items + slot);
+		}
+
+		if (!found_extent && !found_inline) {
+			ret = 0;
 			goto out;
-		if (search_start >= extent_end)
+		}
+
+		if (search_start >= extent_end) {
+			ret = 0;
 			goto out;
+		}
+
 		search_start = extent_end;
 
 		if (end < extent_end && end >= key.offset) {
-			memcpy(&old, extent, sizeof(old));
-			ret = btrfs_inc_extent_ref(trans, root,
-				   btrfs_file_extent_disk_blocknr(&old),
-				   btrfs_file_extent_disk_num_blocks(&old));
-			BUG_ON(ret);
+			if (found_extent) {
+				memcpy(&old, extent, sizeof(old));
+				ret = btrfs_inc_extent_ref(trans, root,
+				      btrfs_file_extent_disk_blocknr(&old),
+				      btrfs_file_extent_disk_num_blocks(&old));
+				BUG_ON(ret);
+			}
+			WARN_ON(found_inline);
 			bookend = 1;
 		}
 
@@ -1332,25 +1609,45 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			/* truncate existing extent */
 			keep = 1;
 			WARN_ON(start & (root->blocksize - 1));
-			new_num = (start - key.offset) >> inode->i_blkbits;
-			old_num = btrfs_file_extent_num_blocks(extent);
-			inode->i_blocks -= (old_num - new_num) << 3;
-			btrfs_set_file_extent_num_blocks(extent, new_num);
-			mark_buffer_dirty(path->nodes[0]);
+			if (found_extent) {
+				new_num = (start - key.offset) >>
+					inode->i_blkbits;
+				old_num = btrfs_file_extent_num_blocks(extent);
+				inode->i_blocks -= (old_num - new_num) << 3;
+				btrfs_set_file_extent_num_blocks(extent,
+								 new_num);
+				mark_buffer_dirty(path->nodes[0]);
+			} else {
+				WARN_ON(1);
+				/*
+				ret = btrfs_truncate_item(trans, root, path,
+							  start - key.offset);
+				BUG_ON(ret);
+				*/
+			}
 		}
 		if (!keep) {
-			u64 disk_blocknr;
-			u64 disk_num_blocks;
-			disk_blocknr = btrfs_file_extent_disk_blocknr(extent);
-			disk_num_blocks =
-				btrfs_file_extent_disk_num_blocks(extent);
+			u64 disk_blocknr = 0;
+			u64 disk_num_blocks = 0;
+			u64 extent_num_blocks = 0;
+			if (found_extent) {
+				disk_blocknr =
+				      btrfs_file_extent_disk_blocknr(extent);
+				disk_num_blocks =
+				      btrfs_file_extent_disk_num_blocks(extent);
+				extent_num_blocks =
+				      btrfs_file_extent_num_blocks(extent);
+			}
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
-			inode->i_blocks -=
-				btrfs_file_extent_num_blocks(extent) << 3;
 			btrfs_release_path(root, path);
-			ret = btrfs_free_extent(trans, root, disk_blocknr,
-						disk_num_blocks, 0);
+			if (found_extent) {
+				inode->i_blocks -=
+				btrfs_file_extent_num_blocks(extent) << 3;
+				ret = btrfs_free_extent(trans, root,
+							disk_blocknr,
+							disk_num_blocks, 0);
+			}
 
 			BUG_ON(ret);
 			if (!bookend && search_start >= end) {
@@ -1360,7 +1657,7 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			if (!bookend)
 				continue;
 		}
-		if (bookend) {
+		if (bookend && found_extent) {
 			/* create bookend */
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
@@ -1390,6 +1687,8 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 				    btrfs_file_extent_num_blocks(&old) -
 				    ((end - key.offset) >> inode->i_blkbits));
 
+			btrfs_set_file_extent_type(extent,
+						   BTRFS_FILE_EXTENT_REG);
 			btrfs_set_file_extent_generation(extent,
 				    btrfs_file_extent_generation(&old));
 			btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -1445,7 +1744,8 @@ static int prepare_pages(struct btrfs_root *root,
 			if (err)
 				goto failed_truncate;
 			bh = bh->b_this_page;
-			alloc_extent_start++;
+			if (alloc_extent_start)
+				alloc_extent_start++;
 		} while (bh != head);
 		pos += this_write;
 		WARN_ON(this_write > write_bytes);
@@ -1543,12 +1843,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 				   start_pos,
 				   (pos + count + root->blocksize -1) &
 				   ~((u64)root->blocksize - 1));
+		BUG_ON(ret);
 	}
-	ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
+	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
+	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+		ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
 				 (u64)-1, &ins);
-	BUG_ON(ret);
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+		BUG_ON(ret);
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
+		BUG_ON(ret);
+	} else {
+		ins.offset = 0;
+		ins.objectid = 0;
+	}
 	BUG_ON(ret);
 	alloc_extent_start = ins.objectid;
 	ret = btrfs_end_transaction(trans, root);
@@ -1567,7 +1875,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		BUG_ON(ret);
 
 		/* FIXME blocks != pagesize */
-		alloc_extent_start += num_pages;
+		if (alloc_extent_start)
+			alloc_extent_start += num_pages;
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
@@ -1779,10 +2088,11 @@ static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 	if (!PageChecked(page)) {
 		/* FIXME, do it per block */
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+
 		int ret = btrfs_csum_verify_file_block(root,
-					  page->mapping->host->i_ino,
-					  page->index << PAGE_CACHE_SHIFT,
-					  kmap(page), PAGE_CACHE_SIZE);
+				  page->mapping->host->i_ino,
+				  page->index << PAGE_CACHE_SHIFT,
+				  kmap(page), PAGE_CACHE_SIZE);
 		if (ret) {
 			printk("failed to verify ino %lu page %lu\n",
 			       page->mapping->host->i_ino,
@@ -2249,6 +2559,16 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 			   btrfs_fill_super, mnt);
 }
 
+/* Fill *stat from the inode, then override blksize with 256KiB; returns 0. */
+static int btrfs_getattr(struct vfsmount *mnt,
+			 struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	stat->blksize = 256 * 1024;
+	return 0;
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2298,6 +2618,7 @@ static struct address_space_operations btrfs_aops = {
 
 static struct inode_operations btrfs_file_inode_operations = {
 	.truncate	= btrfs_truncate,
+	.getattr	= btrfs_getattr,
 };
 
 static struct file_operations btrfs_file_operations = {
-- 
cgit v1.2.3


From 7e38180e207c8707691c12395e990a5d583c5e71 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 19 Apr 2007 15:36:27 -0400
Subject: Btrfs: directory inode index is back

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO       |   4 +-
 fs/btrfs/ctree.h    |  58 +++-------
 fs/btrfs/dir-item.c | 204 ++++++++++++++++++++++++-----------
 fs/btrfs/super.c    | 302 ++++++++--------------------------------------------
 4 files changed, 206 insertions(+), 362 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 1c3acfcf146..8d7e5a3ed62 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,12 +1,12 @@
 * cleanup, add more error checking, get rid of BUG_ONs
 * Fix ENOSPC handling
-* Add block mapping tree (simple dm layer)
 * Make allocator smarter
 * add a block group to struct inode
-* Make sure nobh stuff is working properly for cows
 * Do actual block accounting
 * Check compat and incompat flags on the inode
 * Get rid of struct ctree_path, limiting tree levels held at one time
+* Add generation number to key pointer in nodes
+* Add parent pointer back to extent map.
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f1800008be3..d628a3b9441 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -185,11 +185,6 @@ struct btrfs_inode_item {
 	struct btrfs_inode_timespec otime;
 } __attribute__ ((__packed__));
 
-/* inline data is just a blob of bytes */
-struct btrfs_inline_data_item {
-	u8 data;
-} __attribute__ ((__packed__));
-
 struct btrfs_dir_item {
 	struct btrfs_disk_key location;
 	__le16 flags;
@@ -293,9 +288,6 @@ struct btrfs_root {
 #define BTRFS_KEY_TYPE_MASK	(((u32)BTRFS_KEY_TYPE_MAX - 1) << \
 				  BTRFS_KEY_TYPE_SHIFT)
 
-#define BTRFS_KEY_OVERFLOW_MAX 128
-#define BTRFS_KEY_OVERFLOW_MASK ((u32)BTRFS_KEY_OVERFLOW_MAX - 1)
-
 /*
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
@@ -621,31 +613,6 @@ static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val)
 	key->flags = (key->flags & ~(BTRFS_KEY_TYPE_MASK)) | val;
 }
 
-static inline u32 btrfs_key_overflow(struct btrfs_key *key)
-{
-	return key->flags & BTRFS_KEY_OVERFLOW_MASK;
-}
-
-static inline void btrfs_set_key_overflow(struct btrfs_key *key, u32 over)
-{
-	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
-	key->flags = (key->flags & ~BTRFS_KEY_OVERFLOW_MASK) | over;
-}
-
-static inline u32 btrfs_disk_key_overflow(struct btrfs_disk_key *key)
-{
-	return le32_to_cpu(key->flags) & BTRFS_KEY_OVERFLOW_MASK;
-}
-
-static inline void btrfs_set_disk_key_overflow(struct btrfs_disk_key *key,
-					       u32 over)
-{
-	u32 flags = btrfs_disk_key_flags(key);
-	BUG_ON(over >= BTRFS_KEY_OVERFLOW_MAX);
-	flags = (flags & ~BTRFS_KEY_OVERFLOW_MASK) | over;
-	btrfs_set_disk_key_flags(key, flags);
-}
-
 static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
 {
 	return le64_to_cpu(h->blocknr);
@@ -1079,15 +1046,24 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type);
-int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, u64 dir,
-			  const char *name, int name_len, int mod);
-int btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path, u64 dir,
-				u64 objectid, int mod);
-int btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path, u64 dir,
+					     const char *name, int name_len,
+					     int mod);
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dir,
+			    u64 objectid, const char *name, int name_len,
+			    int mod);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+			      struct btrfs_path *path,
 			      const char *name, int name_len);
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      struct btrfs_dir_item *di);
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a43deb72648..7a7e9846860 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -4,24 +4,31 @@
 #include "hash.h"
 #include "transaction.h"
 
-int insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *root, struct btrfs_path *path, struct btrfs_key
-			    *cpu_key, u32 data_size)
+/* Reserve data_size bytes under cpu_key; returns them or an ERR_PTR(). */
+struct btrfs_dir_item *
+insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		     struct btrfs_path *path, struct btrfs_key *cpu_key,
+		     u32 data_size)
 {
-	int overflow;
 	int ret;
+	char *ptr;
+	struct btrfs_item *item;
+	struct btrfs_leaf *leaf;
 
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
-	overflow = btrfs_key_overflow(cpu_key);
-
-	while(ret == -EEXIST && overflow < BTRFS_KEY_OVERFLOW_MAX) {
-		overflow++;
-		btrfs_set_key_overflow(cpu_key, overflow);
-		btrfs_release_path(root, path);
-		ret = btrfs_insert_empty_item(trans, root, path, cpu_key,
-					      data_size);
+	if (ret == -EEXIST) {	/* key already present: grow that item */
+		ret = btrfs_extend_item(trans, root, path, data_size);
 	}
-	return ret;
+	WARN_ON(ret > 0);
+	/* bail out on any failure, from the insert as well as the extend */
+	if (ret < 0)
+		return ERR_PTR(ret);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	item = leaf->items + path->slots[0];
+	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+	BUG_ON(data_size > btrfs_item_size(item));
+	ptr += btrfs_item_size(item) - data_size; /* last data_size bytes */
+	return (struct btrfs_dir_item *)ptr;
 }
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -43,13 +50,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
-	ret = insert_with_overflow(trans, root, path, &key, data_size);
-	if (ret)
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size);
+	if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
 		goto out;
+	}
 
-	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_dir_item);
 	btrfs_cpu_key_to_disk(&dir_item->location, location);
 	btrfs_set_dir_type(dir_item, type);
 	btrfs_set_dir_flags(dir_item, 0);
@@ -58,15 +64,39 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-out:
+
+	/* FIXME, use some real flag for selecting the extra index */
+	if (root == root->fs_info->tree_root) {
+		ret = 0;
+		goto out;
+	}
+
 	btrfs_release_path(root, path);
+
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = location->objectid;
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size);
+	if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		goto out;
+	}
+	btrfs_cpu_key_to_disk(&dir_item->location, location);
+	btrfs_set_dir_type(dir_item, type);
+	btrfs_set_dir_flags(dir_item, 0);
+	btrfs_set_dir_name_len(dir_item, name_len);
+	name_ptr = (char *)(dir_item + 1);
+	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
 	btrfs_free_path(path);
 	return ret;
 }
 
-int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, u64 dir,
-			  const char *name, int name_len, int mod)
+struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path, u64 dir,
+					     const char *name, int name_len,
+					     int mod)
 {
 	int ret;
 	struct btrfs_key key;
@@ -74,57 +104,111 @@ int btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	int cow = mod != 0;
 	struct btrfs_disk_key *found_key;
 	struct btrfs_leaf *leaf;
-	u32 overflow;
 
 	key.objectid = dir;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	// btrfs_set_key_overflow(&key, BTRFS_KEY_OVERFLOW_MAX - 1);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
-	while(1) {
-		ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
-		if (ret < 0)
-			return ret;
-		if (ret > 0) {
-			if (path->slots[0] == 0)
-				return 1;
-			path->slots[0]--;
-		}
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		found_key = &leaf->items[path->slots[0]].key;
-
-		if (btrfs_disk_key_objectid(found_key) != dir ||
-		    btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
-		    btrfs_disk_key_offset(found_key) != key.offset)
-			return 1;
-
-		if (btrfs_match_dir_item_name(root, path, name, name_len))
-			return 0;
-
-		overflow = btrfs_disk_key_overflow(found_key);
-		if (overflow == 0)
-			return 1;
-		btrfs_set_key_overflow(&key, overflow - 1);
-		btrfs_release_path(root, path);
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			return NULL;
+		path->slots[0]--;
 	}
-	return 1;
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	found_key = &leaf->items[path->slots[0]].key;
+
+	if (btrfs_disk_key_objectid(found_key) != dir ||
+	    btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
+	    btrfs_disk_key_offset(found_key) != key.offset)
+		return NULL;
+
+	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
-int btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_dir_item *
+btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 dir,
+			    u64 objectid, const char *name, int name_len,
+			    int mod)
+{
+	struct btrfs_key key;
+	int err;
+
+	/* look 'name' up in the item keyed (dir, DIR_INDEX, objectid) */
+	key.objectid = dir;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = objectid;
+
+	err = btrfs_search_slot(trans, root, &key, path,
+				mod < 0 ? -1 : 0, mod != 0);
+	if (err > 0)
+		err = -ENOENT;	/* no item with exactly this key */
+	if (err < 0)
+		return ERR_PTR(err);
+	return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
+	u32 total_len;
+	u32 cur = 0;
+	u32 this_len;
+	struct btrfs_leaf *leaf;
 
-	dir_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_dir_item);
-	if (btrfs_dir_name_len(dir_item) != name_len)
-		return 0;
-	name_ptr = (char *)(dir_item + 1);
-	if (memcmp(name_ptr, name, name_len))
-		return 0;
-	return 1;
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+	total_len = btrfs_item_size(leaf->items + path->slots[0]);
+	while(cur < total_len) {
+		this_len = sizeof(*dir_item) + btrfs_dir_name_len(dir_item);
+		name_ptr = (char *)(dir_item + 1);
+
+		if (btrfs_dir_name_len(dir_item) == name_len &&
+		    memcmp(name_ptr, name, name_len) == 0)
+			return dir_item;
+
+		cur += this_len;
+		dir_item = (struct btrfs_dir_item *)((char *)dir_item +
+						     this_len);
+	}
+	return NULL;
 }
+
+int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      struct btrfs_dir_item *di)
+{
+	/* remove the one name 'di' from the dir item at path->slots[0] */
+	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[0]);
+	u32 name_bytes = sizeof(*di) + btrfs_dir_name_len(di);
+	u32 total = btrfs_item_size(leaf->items + path->slots[0]);
+	int ret;
+
+	if (name_bytes != total) {
+		char *victim = (char *)di;
+		char *tail = victim + name_bytes;
+		char *item_start = btrfs_item_ptr(leaf, path->slots[0], char);
+
+		/* slide the bytes after 'di' down over it, then shrink */
+		btrfs_memmove(root, leaf, victim, tail,
+			      total - (tail - item_start));
+		ret = btrfs_truncate_item(trans, root, path,
+					  total - name_bytes);
+	} else {
+		/* 'di' fills the whole item, so the item itself goes */
+		ret = btrfs_del_item(trans, root, path);
+	}
+	BUG_ON(ret);
+	return 0;
+}
+
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1b286bb26f1..dff2db0d1dc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -127,32 +127,43 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	const char *name = dentry->d_name.name;
 	int name_len = dentry->d_name.len;
-	int ret;
+	int ret = 0;
 	u64 objectid;
 	struct btrfs_dir_item *di;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	ret = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
-	if (ret < 0)
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
 		goto err;
-	if (ret > 0) {
+	}
+	if (!di) {
 		ret = -ENOENT;
 		goto err;
 	}
-	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			    struct btrfs_dir_item);
 	objectid = btrfs_disk_key_objectid(&di->location);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
 
-	ret = btrfs_del_item(trans, root, path);
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 objectid, name, name_len, -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto err;
+	}
+	if (!di) {
+		ret = -ENOENT;
+		goto err;
+	}
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	BUG_ON(ret);
 
-	btrfs_release_path(root, path);
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	if (ret == 0) {
 		inode_dec_link_count(dentry->d_inode);
@@ -388,15 +399,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	ret = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
-	if (ret || !btrfs_match_dir_item_name(root, path, name, namelen)) {
+	if (!di || IS_ERR(di)) {
 		location->objectid = 0;
 		ret = 0;
 		goto out;
 	}
-	di = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			    struct btrfs_dir_item);
 	btrfs_disk_key_to_cpu(location, &di->location);
 out:
 	btrfs_release_path(root, path);
@@ -523,7 +532,10 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int advance;
 	unsigned char d_type = DT_UNKNOWN;
 	int over = 0;
-	int key_type = BTRFS_DIR_ITEM_KEY;
+	u32 di_cur;
+	u32 di_total;
+	u32 di_len;
+	int key_type = BTRFS_DIR_INDEX_KEY;
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
@@ -567,12 +579,20 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		filp->f_pos = btrfs_disk_key_offset(&item->key);
 		advance = 1;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-		over = filldir(dirent, (const char *)(di + 1),
-			       btrfs_dir_name_len(di),
-			       btrfs_disk_key_offset(&item->key),
-			       btrfs_disk_key_objectid(&di->location), d_type);
-		if (over)
-			goto nopos;
+		di_cur = 0;
+		di_total = btrfs_item_size(leaf->items + slot);
+		while(di_cur < di_total) {
+			over = filldir(dirent, (const char *)(di + 1),
+				       btrfs_dir_name_len(di),
+				       btrfs_disk_key_offset(&item->key),
+				       btrfs_disk_key_objectid(&di->location),
+				       d_type);
+			if (over)
+				goto nopos;
+			di_len = btrfs_dir_name_len(di) + sizeof(*di);
+			di_cur += di_len;
+			di = (struct btrfs_dir_item *)((char *)di + di_len);
+		}
 	}
 	filp->f_pos++;
 nopos:
@@ -935,75 +955,6 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
-#if 0
-static int btrfs_get_block_inline(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create)
-{
-	struct btrfs_root *root = btrfs_sb(inode->i_sb);
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
-	int num_bytes = result->b_size;
-	int item_size;
-	int ret;
-	u64 pos;
-	char *ptr;
-	int copy_size;
-	int err = 0;
-	char *safe_ptr;
-	char *data_ptr;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
-	WARN_ON(create);
-	if (create) {
-		return 0;
-	}
-	pos = iblock << inode->i_blkbits;
-	key.objectid = inode->i_ino;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INLINE_DATA_KEY);
-	ptr = kmap(result->b_page);
-	safe_ptr = ptr;
-	ptr += (pos & (PAGE_CACHE_SIZE -1));
-again:
-	key.offset = pos;
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret) {
-		if (ret < 0)
-			err = ret;
-		else
-			err = 0;
-		goto out;
-	}
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	item_size = btrfs_item_size(leaf->items + path->slots[0]);
-	copy_size = min(num_bytes, item_size);
-	data_ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-	WARN_ON(safe_ptr + PAGE_CACHE_SIZE < ptr + copy_size);
-	memcpy(ptr, data_ptr, copy_size);
-	pos += copy_size;
-	num_bytes -= copy_size;
-	WARN_ON(num_bytes < 0);
-	ptr += copy_size;
-	btrfs_release_path(root, path);
-	if (num_bytes != 0) {
-		if (pos >= i_size_read(inode))
-			memset(ptr, 0, num_bytes);
-		else
-			goto again;
-	}
-	set_buffer_uptodate(result);
-	map_bh(result, inode->i_sb, 0);
-	err = 0;
-out:
-	btrfs_free_path(path);
-	kunmap(result->b_page);
-	return err;
-}
-#endif
-
 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -1907,174 +1858,6 @@ out:
 	return num_written ? num_written : err;
 }
 
-#if 0
-static ssize_t inline_one_page(struct btrfs_root *root, struct inode *inode,
-			   struct page *page, loff_t pos,
-			   size_t offset, size_t write_bytes)
-{
-	struct btrfs_path *path;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
-	struct btrfs_key found_key;
-	int ret;
-	size_t copy_size = 0;
-	char *dst = NULL;
-	int err = 0;
-	size_t num_written = 0;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	key.objectid = inode->i_ino;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INLINE_DATA_KEY);
-
-again:
-	key.offset = pos;
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[path->slots[0]].key);
-		copy_size = btrfs_item_size(leaf->items + path->slots[0]);
-		dst = btrfs_item_ptr(leaf, path->slots[0], char);
-		copy_size = min(write_bytes, copy_size);
-		goto copyit;
-	} else {
-		int slot = path->slots[0];
-		if (slot > 0) {
-			slot--;
-		}
-		// FIXME find max key
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[slot].key);
-		if (found_key.objectid != inode->i_ino)
-			goto insert;
-		if (btrfs_key_type(&found_key) != BTRFS_INLINE_DATA_KEY)
-			goto insert;
-		copy_size = btrfs_item_size(leaf->items + slot);
-		if (found_key.offset + copy_size <= pos)
-			goto insert;
-		dst = btrfs_item_ptr(leaf, path->slots[0], char);
-		dst += pos - found_key.offset;
-		copy_size = copy_size - (pos - found_key.offset);
-		BUG_ON(copy_size < 0);
-		copy_size = min(write_bytes, copy_size);
-		WARN_ON(copy_size == 0);
-		goto copyit;
-	}
-insert:
-	btrfs_release_path(root, path);
-	copy_size = min(write_bytes,
-			(size_t)BTRFS_LEAF_DATA_SIZE(root) -
-			sizeof(struct btrfs_item) * 4);
-	ret = btrfs_insert_empty_item(trans, root, path, &key, copy_size);
-	BUG_ON(ret);
-	dst = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-			     path->slots[0], char);
-copyit:
-	WARN_ON(copy_size == 0);
-	WARN_ON(dst + copy_size >
-		btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-						 path->slots[0], char) +
-		btrfs_item_size(btrfs_buffer_leaf(path->nodes[0])->items +
-						  path->slots[0]));
-	btrfs_memcpy(root, path->nodes[0]->b_data, dst,
-		     page_address(page) + offset, copy_size);
-	mark_buffer_dirty(path->nodes[0]);
-	btrfs_release_path(root, path);
-	pos += copy_size;
-	offset += copy_size;
-	num_written += copy_size;
-	write_bytes -= copy_size;
-	if (write_bytes)
-		goto again;
-out:
-	btrfs_free_path(path);
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return num_written ? num_written : err;
-}
-
-static ssize_t btrfs_file_inline_write(struct file *file,
-				       const char __user *buf,
-				       size_t count, loff_t *ppos)
-{
-	loff_t pos;
-	size_t num_written = 0;
-	int err = 0;
-	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long page_index;
-
-	if (file->f_flags & O_DIRECT)
-		return -EINVAL;
-	pos = *ppos;
-
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-	current->backing_dev_info = inode->i_mapping->backing_dev_info;
-	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-	if (err)
-		goto out;
-	if (count == 0)
-		goto out;
-	err = remove_suid(file->f_path.dentry);
-	if (err)
-		goto out;
-	file_update_time(file);
-	mutex_lock(&inode->i_mutex);
-	while(count > 0) {
-		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
-		struct page *page;
-
-		page_index = pos >> PAGE_CACHE_SHIFT;
-		page = grab_cache_page(inode->i_mapping, page_index);
-		if (!PageUptodate(page)) {
-			ret = mpage_readpage(page, btrfs_get_block);
-			BUG_ON(ret);
-			lock_page(page);
-		}
-		ret = btrfs_copy_from_user(pos, 1,
-					   write_bytes, &page, buf);
-		BUG_ON(ret);
-		write_bytes = inline_one_page(root, inode, page, pos,
-				      offset, write_bytes);
-		SetPageUptodate(page);
-		if (write_bytes > 0 && pos + write_bytes > inode->i_size) {
-			i_size_write(inode, pos + write_bytes);
-			mark_inode_dirty(inode);
-		}
-		page_cache_release(page);
-		unlock_page(page);
-		if (write_bytes < 0)
-			goto out_unlock;
-		buf += write_bytes;
-		count -= write_bytes;
-		pos += write_bytes;
-		num_written += write_bytes;
-
-		balance_dirty_pages_ratelimited(inode->i_mapping);
-		cond_resched();
-	}
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
-out:
-	*ppos = pos;
-	current->backing_dev_info = NULL;
-	return num_written ? num_written : err;
-}
-#endif
-
 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 			unsigned long offset, unsigned long size)
 {
@@ -2422,6 +2205,7 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
 	int ret = 0;
+	struct btrfs_dir_item *di;
 	int namelen;
 	struct btrfs_path *path;
 	u64 root_dirid;
@@ -2440,12 +2224,12 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 			return -ENOMEM;
 		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
 		mutex_lock(&root->fs_info->fs_mutex);
-		ret = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+		di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
 				    path, root_dirid,
 				    vol_args.name, namelen, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
 		btrfs_free_path(path);
-		if (ret == 0)
+		if (di && !IS_ERR(di))
 			return -EEXIST;
 
 		if (root == root->fs_info->tree_root)
-- 
cgit v1.2.3


From 8fd17795b2261ecb1bad2a6df09ef14c4957a3fb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 19 Apr 2007 21:01:03 -0400
Subject: Btrfs: early fsync support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  1 +
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/root-tree.c   |  1 -
 fs/btrfs/super.c       | 37 ++++++++++++++++++++++++++++++++++++-
 fs/btrfs/transaction.c | 25 ++++++++++++++++++++++---
 fs/btrfs/transaction.h |  1 +
 7 files changed, 62 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 8d7e5a3ed62..5ff2cef9f02 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -7,6 +7,7 @@
 * Get rid of struct ctree_path, limiting tree levels held at one time
 * Add generation number to key pointer in nodes
 * Add parent pointer back to extent map.
+* Implement real list of pending transactions
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d628a3b9441..047a84762d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -256,6 +256,7 @@ struct btrfs_fs_info {
 	struct inode *btree_inode;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
+	struct list_head trans_list;
 	struct crypto_hash *hash_tfm;
 	spinlock_t hash_lock;
 	struct kobject kobj;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 11e17a2f736..b666c3571ae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -528,6 +528,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	init_bit_radix(&fs_info->pending_del_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
+	INIT_LIST_HEAD(&fs_info->trans_list);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 72be9836932..a42943bd917 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -100,7 +100,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	BUG_ON(refs == 0);
 	if (refs == 1) {
 		ret = btrfs_del_item(trans, root, path);
-printk("deleting root %Lu %Lu %u\n", key->objectid, key->offset, key->flags);
 	} else {
 		btrfs_set_root_refs(ri, refs - 1);
 printk("ref now %u root %Lu %Lu %u\n", refs -1, key->objectid, key->offset, key->flags);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dff2db0d1dc..f99c764a59d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -11,6 +11,7 @@
 #include <linux/mpage.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
+#include <linux/statfs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -932,6 +933,26 @@ out_unlock:
 	return err;
 }
 
+/* fsync: commit the running transaction; returns 0 or a negative errno */
+static int btrfs_sync_file(struct file *file,
+			   struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = -ENOMEM;	/* reported when no transaction handle is returned */
+	struct btrfs_trans_handle *trans;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans)
+		goto out;
+	ret = btrfs_commit_transaction(trans, root);
+out:
+	/* the failure path must drop the fs_mutex taken above as well */
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret > 0 ? -EIO : ret;	/* errno values are returned negative */
+}
+
 static int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
@@ -2353,6 +2374,19 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	return 0;
 }
 
+static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb)->fs_info;
+	struct btrfs_super_block *super = info->disk_super;
+
+	buf->f_type = BTRFS_SUPER_MAGIC;
+	buf->f_bsize = dentry->d_sb->s_blocksize;
+	buf->f_namelen = BTRFS_NAME_LEN;
+	buf->f_blocks = btrfs_super_total_blocks(super);
+	buf->f_bfree = buf->f_blocks - btrfs_super_blocks_used(super);
+	buf->f_bavail = buf->f_bfree;	/* no reserved blocks are subtracted */
+	return 0;
+}
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2362,7 +2396,6 @@ static struct file_system_type btrfs_fs_type = {
 };
 
 static struct super_operations btrfs_super_ops = {
-	.statfs		= simple_statfs,
 	.delete_inode	= btrfs_delete_inode,
 	.put_super	= btrfs_put_super,
 	.read_inode	= btrfs_read_locked_inode,
@@ -2371,6 +2404,7 @@ static struct super_operations btrfs_super_ops = {
 	.write_inode	= btrfs_write_inode,
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
+	.statfs		= btrfs_statfs,
 };
 
 static struct inode_operations btrfs_dir_inode_operations = {
@@ -2413,6 +2447,7 @@ static struct file_operations btrfs_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.ioctl		= btrfs_ioctl,
+	.fsync		= btrfs_sync_file,
 };
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8740752f384..078cb9cbf9d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -19,6 +19,7 @@ static void put_transaction(struct btrfs_transaction *transaction)
 	if (transaction->use_count == 0) {
 		WARN_ON(total_trans == 0);
 		total_trans--;
+		list_del_init(&transaction->list);
 		memset(transaction, 0, sizeof(*transaction));
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
@@ -43,6 +44,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->in_commit = 0;
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
+		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 	}
 	cur_trans->num_writers++;
 	return 0;
@@ -236,6 +238,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 {
 	int ret = 0;
 	struct btrfs_transaction *cur_trans;
+	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
 	DEFINE_WAIT(wait);
 
@@ -272,13 +275,29 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
-	btrfs_set_super_generation(root->fs_info->disk_super,
-				   root->fs_info->generation + 1);
+	if (cur_trans->list.prev != &root->fs_info->trans_list) {
+		prev_trans = list_entry(cur_trans->list.prev,
+					struct btrfs_transaction, list);
+		if (prev_trans->commit_done)
+			prev_trans = NULL;
+		else
+			prev_trans->use_count++;
+	}
 	mutex_unlock(&root->fs_info->trans_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
+	if (prev_trans) {
+		mutex_lock(&root->fs_info->trans_mutex);
+		wait_for_commit(root, prev_trans);
+		put_transaction(prev_trans);
+		mutex_unlock(&root->fs_info->trans_mutex);
+	}
+	btrfs_set_super_generation(root->fs_info->disk_super,
+				   cur_trans->transid);
 	BUG_ON(ret);
-
 	write_ctree_super(trans, root);
+
+	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_finish_extent_commit(trans, root);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4f1496ae6f2..3cc29900a07 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -8,6 +8,7 @@ struct btrfs_transaction {
 	int use_count;
 	int commit_done;
 	int magic;
+	struct list_head list;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
-- 
cgit v1.2.3


From e66f709b157ee8557166c14b67c01bae978ac32e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 20 Apr 2007 13:16:02 -0400
Subject: Btrfs: write barriers on commit, balance level before split

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ctree.h   |   1 +
 fs/btrfs/disk-io.c |  12 +++++-
 3 files changed, 131 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4efcd1bd63e..744fd728e5d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -151,6 +151,11 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
 		struct btrfs_key cpukey;
 		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[i + 1].key);
+if (comp_keys(&node->ptrs[i].key, &cpukey) >= 0) {
+	struct btrfs_key bad;
+	btrfs_disk_key_to_cpu(&bad, &node->ptrs[i].key);
+printk("check_node level %d i is %d bad comp %Lu %u %Lu, %Lu %u %Lu\n",level, i, bad.objectid, bad.flags, bad.offset, cpukey.objectid, cpukey.flags, cpukey.offset);
+}
 		BUG_ON(comp_keys(&node->ptrs[i].key, &cpukey) >= 0);
 	}
 	return 0;
@@ -448,6 +453,111 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+/* returns zero if the push worked, non-zero otherwise */
+static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path, int level)
+{
+	struct buffer_head *right_buf;
+	struct buffer_head *mid_buf;
+	struct buffer_head *left_buf;
+	struct buffer_head *parent_buf = NULL;
+	struct btrfs_node *right = NULL;
+	struct btrfs_node *mid;
+	struct btrfs_node *left = NULL;
+	struct btrfs_node *parent = NULL;
+	int ret = 0;
+	int wret;
+	int pslot;
+	int orig_slot = path->slots[level];
+	u64 orig_ptr;
+
+	if (level == 0)
+		return 1;
+
+	mid_buf = path->nodes[level];
+	mid = btrfs_buffer_node(mid_buf);
+	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
+
+	if (level < BTRFS_MAX_LEVEL - 1)
+		parent_buf = path->nodes[level + 1];
+	pslot = path->slots[level + 1];
+
+	if (!parent_buf)
+		return 1;
+	parent = btrfs_buffer_node(parent_buf);
+
+	left_buf = read_node_slot(root, parent_buf, pslot - 1);
+
+	/* first, try to make some room in the middle buffer */
+	if (left_buf) {
+		u32 left_nr;
+		btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1,
+				&left_buf);
+		left = btrfs_buffer_node(left_buf);
+		left_nr = btrfs_header_nritems(&left->header);
+		wret = push_node_left(trans, root, left_buf, mid_buf);
+		if (wret < 0)
+			ret = wret;
+		if (wret == 0) {
+			orig_slot += left_nr;
+			btrfs_memcpy(root, parent,
+				     &parent->ptrs[pslot].key,
+				     &mid->ptrs[0].key,
+				     sizeof(struct btrfs_disk_key));
+			btrfs_mark_buffer_dirty(parent_buf);
+			if (btrfs_header_nritems(&left->header) > orig_slot) {
+				path->nodes[level] = left_buf;
+				path->slots[level + 1] -= 1;
+				path->slots[level] = orig_slot;
+				btrfs_block_release(root, mid_buf);
+			} else {
+				orig_slot -=
+					btrfs_header_nritems(&left->header);
+				path->slots[level] = orig_slot;
+				btrfs_block_release(root, left_buf);
+			}
+			check_node(root, path, level);
+			return 0;
+		}
+		btrfs_block_release(root, left_buf);
+	}
+	right_buf = read_node_slot(root, parent_buf, pslot + 1);
+
+	/*
+	 * then try to empty the right most buffer into the middle
+	 */
+	if (right_buf) {
+		btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1,
+				&right_buf);
+		right = btrfs_buffer_node(right_buf);
+		wret = balance_node_right(trans, root, right_buf, mid_buf);
+		if (wret < 0)
+			ret = wret;
+		if (wret == 0) {
+			btrfs_memcpy(root, parent,
+				     &parent->ptrs[pslot + 1].key,
+				     &right->ptrs[0].key,
+				     sizeof(struct btrfs_disk_key));
+			btrfs_mark_buffer_dirty(parent_buf);
+			if (btrfs_header_nritems(&mid->header) <= orig_slot) {
+				path->nodes[level] = right_buf;
+				path->slots[level + 1] += 1;
+				path->slots[level] = orig_slot -
+					btrfs_header_nritems(&mid->header);
+				btrfs_block_release(root, mid_buf);
+			} else {
+				btrfs_block_release(root, right_buf);
+			}
+			check_node(root, path, level);
+			return 0;
+		}
+		btrfs_block_release(root, right_buf);
+	}
+	check_node(root, path, level);
+	return 1;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -774,7 +884,16 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
+	} else {
+		ret = push_nodes_for_insert(trans, root, path, level);
+		t = path->nodes[level];
+		c = btrfs_buffer_node(t);
+		if (!ret &&
+		    btrfs_header_nritems(&c->header) <
+		    BTRFS_NODEPTRS_PER_BLOCK(root) - 1)
+			return 0;
 	}
+
 	c_nritems = btrfs_header_nritems(&c->header);
 	split_buffer = btrfs_alloc_free_block(trans, root);
 	split = btrfs_buffer_node(split_buffer);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 047a84762d9..681b23933d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -259,6 +259,7 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct crypto_hash *hash_tfm;
 	spinlock_t hash_lock;
+	int do_barriers;
 	struct kobject kobj;
 };
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b666c3571ae..13046295bf7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -540,6 +540,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	fs_info->do_barriers = 1;
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -622,6 +623,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
+	int ret;
 	struct buffer_head *bh = root->fs_info->sb_buffer;
 
 	btrfs_set_super_root(root->fs_info->disk_super,
@@ -632,7 +634,15 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	csum_tree_block(root, bh, 0);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
-	submit_bh(WRITE, bh);
+	if (root->fs_info->do_barriers)
+		ret = submit_bh(WRITE_BARRIER, bh);
+	else
+		ret = submit_bh(WRITE, bh);
+	if (ret == -EOPNOTSUPP) {
+		set_buffer_uptodate(bh);
+		root->fs_info->do_barriers = 0;
+		ret = submit_bh(WRITE, bh);
+	}
 	wait_on_buffer(bh);
 	if (!buffer_uptodate(bh)) {
 		WARN_ON(1);
-- 
cgit v1.2.3


From 33ade1f826a7c348856a98930814f33ced6d1337 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 20 Apr 2007 13:48:57 -0400
Subject: Btrfs: node balance optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 744fd728e5d..1bae035f517 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -151,11 +151,6 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
 		struct btrfs_key cpukey;
 		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[i + 1].key);
-if (comp_keys(&node->ptrs[i].key, &cpukey) >= 0) {
-	struct btrfs_key bad;
-	btrfs_disk_key_to_cpu(&bad, &node->ptrs[i].key);
-printk("check_node level %d i is %d bad comp %Lu %u %Lu, %Lu %u %Lu\n",level, i, bad.objectid, bad.flags, bad.offset, cpukey.objectid, cpukey.flags, cpukey.offset);
-}
 		BUG_ON(comp_keys(&node->ptrs[i].key, &cpukey) >= 0);
 	}
 	return 0;
@@ -492,11 +487,16 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
 		u32 left_nr;
-		btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1,
-				&left_buf);
 		left = btrfs_buffer_node(left_buf);
 		left_nr = btrfs_header_nritems(&left->header);
-		wret = push_node_left(trans, root, left_buf, mid_buf);
+		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+			wret = 1;
+		} else {
+			btrfs_cow_block(trans, root, left_buf, parent_buf,
+					pslot - 1, &left_buf);
+			left = btrfs_buffer_node(left_buf);
+			wret = push_node_left(trans, root, left_buf, mid_buf);
+		}
 		if (wret < 0)
 			ret = wret;
 		if (wret == 0) {
@@ -528,10 +528,18 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right_buf) {
-		btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1,
-				&right_buf);
+		u32 right_nr;
 		right = btrfs_buffer_node(right_buf);
-		wret = balance_node_right(trans, root, right_buf, mid_buf);
+		right_nr = btrfs_header_nritems(&right->header);
+		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+			wret = 1;
+		} else {
+			btrfs_cow_block(trans, root, right_buf,
+					parent_buf, pslot + 1, &right_buf);
+			right = btrfs_buffer_node(right_buf);
+			wret = balance_node_right(trans, root,
+						  right_buf, mid_buf);
+		}
 		if (wret < 0)
 			ret = wret;
 		if (wret == 0) {
-- 
cgit v1.2.3


From 4d775673091d43b39fa9d086071009f98dec289e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 20 Apr 2007 20:23:12 -0400
Subject: Btrfs: add owner and type fields to the extents and block headers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  5 +++++
 fs/btrfs/ctree.h       | 45 ++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 15 ++++++++++-----
 fs/btrfs/super.c       | 13 +++++++------
 5 files changed, 65 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1bae035f517..b4783bf8bf4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -68,6 +68,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
 	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
+	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
 	btrfs_inc_ref(trans, root, buf);
 	if (buf == root->node) {
 		root->node = cow;
@@ -806,6 +807,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&c->header, level);
 	btrfs_set_header_blocknr(&c->header, bh_blocknr(t));
 	btrfs_set_header_generation(&c->header, trans->transid);
+	btrfs_set_header_owner(&c->header, root->root_key.objectid);
 	lower = btrfs_buffer_node(path->nodes[level-1]);
 	memcpy(c->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(c->header.fsid));
@@ -909,6 +911,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
 	btrfs_set_header_blocknr(&split->header, bh_blocknr(split_buffer));
 	btrfs_set_header_generation(&split->header, trans->transid);
+	btrfs_set_header_owner(&split->header, root->root_key.objectid);
 	memcpy(split->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(split->header.fsid));
 	mid = (c_nritems + 1) / 2;
@@ -1280,6 +1283,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	memset(&right->header, 0, sizeof(right->header));
 	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
 	btrfs_set_header_generation(&right->header, trans->transid);
+	btrfs_set_header_owner(&right->header, root->root_key.objectid);
 	btrfs_set_header_level(&right->header, 0);
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(right->header.fsid));
@@ -1376,6 +1380,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	memset(&right->header, 0, sizeof(right->header));
 	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
 	btrfs_set_header_generation(&right->header, trans->transid);
+	btrfs_set_header_owner(&right->header, root->root_key.objectid);
 	btrfs_set_header_level(&right->header, 0);
 	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(right->header.fsid));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 681b23933d9..78248d57729 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -62,6 +62,7 @@ struct btrfs_header {
 	u8 fsid[16]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
+	__le64 owner;
 	__le16 nritems;
 	__le16 flags;
 	u8 level;
@@ -151,12 +152,17 @@ struct btrfs_path {
 	int slots[BTRFS_MAX_LEVEL];
 };
 
+/* values for the type field in btrfs_extent_item */
+#define BTRFS_EXTENT_TREE 1
+#define BTRFS_EXTENT_FILE 2
 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
 struct btrfs_extent_item {
 	__le32 refs;
+	__le64 owner;
+	u8 type;
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
@@ -473,11 +479,32 @@ static inline void btrfs_set_extent_refs(struct btrfs_extent_item *ei, u32 val)
 	ei->refs = cpu_to_le32(val);
 }
 
+static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
+{
+	return le64_to_cpu(ei->owner);
+}
+
+static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val)
+{
+	ei->owner = cpu_to_le64(val);
+}
+
+static inline u8 btrfs_extent_type(struct btrfs_extent_item *ei)
+{
+	return ei->type;
+}
+
+static inline void btrfs_set_extent_type(struct btrfs_extent_item *ei, u8 val)
+{
+	ei->type = val;
+}
+
 static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr)
 {
 	return le64_to_cpu(n->ptrs[nr].blockptr);
 }
 
+
 static inline void btrfs_set_node_blockptr(struct btrfs_node *n, int nr,
 					   u64 val)
 {
@@ -636,6 +663,17 @@ static inline void btrfs_set_header_generation(struct btrfs_header *h,
 	h->generation = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_header_owner(struct btrfs_header *h)
+{
+	return le64_to_cpu(h->owner);
+}
+
+static inline void btrfs_set_header_owner(struct btrfs_header *h,
+					       u64 val)
+{
+	h->owner = cpu_to_le64(val);
+}
+
 static inline u16 btrfs_header_nritems(struct btrfs_header *h)
 {
 	return le16_to_cpu(h->nritems);
@@ -996,9 +1034,10 @@ int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, u64 num_blocks, u64 search_start, u64
-			search_end, struct btrfs_key *ins);
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, u64 owner,
+		       u8 type, u64 num_blocks, u64 search_start,
+		       u64 search_end, struct btrfs_key *ins);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 13046295bf7..7aff6bb55d9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -319,6 +319,7 @@ static int __setup_root(int blocksize,
 	root->last_inode_alloc = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
+	root->root_key.objectid = objectid;
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b2faad3e879..49f7cd6e067 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -460,9 +460,10 @@ error:
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, u64 num_blocks, u64 search_start, u64
-			search_end, struct btrfs_key *ins)
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, u64 owner,
+		       u8 type, u64 num_blocks, u64 search_start,
+		       u64 search_end, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -472,6 +473,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item extent_item;
 
 	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_extent_owner(&extent_item, owner);
+	btrfs_set_extent_type(&extent_item, type);
 
 	if (root == extent_root) {
 		BUG_ON(extent_root->fs_info->current_insert.offset == 0);
@@ -508,13 +511,15 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root)
+					   struct btrfs_root *root)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct buffer_head *buf;
 
-	ret = btrfs_alloc_extent(trans, root, 1, 0, (unsigned long)-1, &ins);
+	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
+				 BTRFS_EXTENT_TREE,
+				 1, 0, (unsigned long)-1, &ins);
 	if (ret) {
 		BUG();
 		return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f99c764a59d..eba239cce21 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1819,8 +1819,9 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		ret = btrfs_alloc_extent(trans, root, num_blocks, 1,
-				 (u64)-1, &ins);
+		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 BTRFS_EXTENT_FILE, num_blocks, 1,
+					 (u64)-1, &ins);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
@@ -2017,8 +2018,12 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_header_level(&leaf->header, 0);
 	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
 	btrfs_set_header_generation(&leaf->header, trans->transid);
+	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
 	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(leaf->header.fsid));
+	mark_buffer_dirty(subvol);
+	brelse(subvol);
+	subvol = NULL;
 
 	inode_item = &root_item.inode;
 	memset(inode_item, 0, sizeof(*inode_item));
@@ -2031,10 +2036,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
 	btrfs_set_root_refs(&root_item, 1);
 
-	mark_buffer_dirty(subvol);
-	brelse(subvol);
-	subvol = NULL;
-
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
 	BUG_ON(ret);
-- 
cgit v1.2.3


From 5d0c3e60fea7e23fbfbe3ab2886b444b4c702441 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 23 Apr 2007 17:01:05 -0400
Subject: Btrfs: fix extent owner/type setting on extent tree blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49f7cd6e067..aac3eccffb8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -167,6 +167,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	ins.offset = 1;
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
+	btrfs_set_extent_type(&extent_item, BTRFS_EXTENT_TREE);
+	btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid);
 
 	for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) {
 		ins.objectid = extent_root->fs_info->current_insert.objectid +
@@ -356,7 +358,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
-	total_needed += (level + 1) * 3;
+	total_needed += (level + 2) * 3;
 	if (root->fs_info->last_insert.objectid == 0 && search_end == (u64)-1) {
 		struct btrfs_disk_key *last_key;
 		btrfs_init_path(path);
-- 
cgit v1.2.3


From 5e82849e63723afe720d4b325d04a989f6247cce Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 23 Apr 2007 19:10:49 -0400
Subject: Btrfs: new subvolume oops fix

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eba239cce21..6940a907654 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2013,6 +2013,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	BUG_ON(!trans);
 
 	subvol = btrfs_alloc_free_block(trans, root);
+	if (subvol == NULL)
+		return -ENOSPC;
 	leaf = btrfs_buffer_leaf(subvol);
 	btrfs_set_header_nritems(&leaf->header, 0);
 	btrfs_set_header_level(&leaf->header, 0);
@@ -2022,8 +2024,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(leaf->header.fsid));
 	mark_buffer_dirty(subvol);
-	brelse(subvol);
-	subvol = NULL;
 
 	inode_item = &root_item.inode;
 	memset(inode_item, 0, sizeof(*inode_item));
@@ -2035,6 +2035,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
 	btrfs_set_root_refs(&root_item, 1);
+	brelse(subvol);
+	subvol = NULL;
 
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
-- 
cgit v1.2.3


From b51338628f604d9b4ed3c776839a484a30e12fb7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Apr 2007 11:52:22 -0400
Subject: Btrfs: add dirty_inode call

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6940a907654..43f081593c4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -722,17 +722,27 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	int ret;
+	int ret = 0;
+
+	if (wait) {
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_commit_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+	}
+	return ret;
+}
+
+static void btrfs_dirty_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_update_inode(trans, root, inode);
-	if (wait)
-		btrfs_commit_transaction(trans, root);
-	else
-		btrfs_end_transaction(trans, root);
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -2390,6 +2400,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_type = BTRFS_SUPER_MAGIC;
 	return 0;
 }
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2405,6 +2416,7 @@ static struct super_operations btrfs_super_ops = {
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
 	.write_inode	= btrfs_write_inode,
+	.dirty_inode	= btrfs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
-- 
cgit v1.2.3


From c62a1920ced752e86f57ab1d4ad0ec65012bce4d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Apr 2007 12:07:39 -0400
Subject: Btrfs: get rid of the extent_item type field

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 16 +---------------
 fs/btrfs/extent-tree.c |  5 +----
 fs/btrfs/super.c       |  3 +--
 3 files changed, 3 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78248d57729..2d166ca8b10 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -152,9 +152,6 @@ struct btrfs_path {
 	int slots[BTRFS_MAX_LEVEL];
 };
 
-/* values for the type field in btrfs_extent_item */
-#define BTRFS_EXTENT_TREE 1
-#define BTRFS_EXTENT_FILE 2
 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
@@ -162,7 +159,6 @@ struct btrfs_path {
 struct btrfs_extent_item {
 	__le32 refs;
 	__le64 owner;
-	u8 type;
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
@@ -489,16 +485,6 @@ static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val)
 	ei->owner = cpu_to_le64(val);
 }
 
-static inline u8 btrfs_extent_type(struct btrfs_extent_item *ei)
-{
-	return ei->type;
-}
-
-static inline void btrfs_set_extent_type(struct btrfs_extent_item *ei, u8 val)
-{
-	ei->type = val;
-}
-
 static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr)
 {
 	return le64_to_cpu(n->ptrs[nr].blockptr);
@@ -1036,7 +1022,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u8 type, u64 num_blocks, u64 search_start,
+		       u64 num_blocks, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aac3eccffb8..116519503d0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -167,7 +167,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	ins.offset = 1;
 	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
-	btrfs_set_extent_type(&extent_item, BTRFS_EXTENT_TREE);
 	btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid);
 
 	for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) {
@@ -464,7 +463,7 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u8 type, u64 num_blocks, u64 search_start,
+		       u64 num_blocks, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins)
 {
 	int ret;
@@ -476,7 +475,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, owner);
-	btrfs_set_extent_type(&extent_item, type);
 
 	if (root == extent_root) {
 		BUG_ON(extent_root->fs_info->current_insert.offset == 0);
@@ -520,7 +518,6 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 BTRFS_EXTENT_TREE,
 				 1, 0, (unsigned long)-1, &ins);
 	if (ret) {
 		BUG();
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 43f081593c4..c076474c8bf 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1830,8 +1830,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 BTRFS_EXTENT_FILE, num_blocks, 1,
-					 (u64)-1, &ins);
+					 num_blocks, 1, (u64)-1, &ins);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
-- 
cgit v1.2.3


From f68cad0f9eb3c3cc100635bd0ea191ee093cf887 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 24 Apr 2007 12:44:26 -0400
Subject: Btrfs: fixup dirty_inode related deadlocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 109 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 56 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c076474c8bf..5bbccbc7e3c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -120,6 +120,58 @@ make_bad:
 	make_bad_inode(inode);
 }
 
+static void fill_inode_item(struct btrfs_inode_item *item,
+			    struct inode *inode)
+{
+	btrfs_set_inode_uid(item, inode->i_uid);
+	btrfs_set_inode_gid(item, inode->i_gid);
+	btrfs_set_inode_size(item, inode->i_size);
+	btrfs_set_inode_mode(item, inode->i_mode);
+	btrfs_set_inode_nlink(item, inode->i_nlink);
+	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
+	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
+	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
+	btrfs_set_inode_nblocks(item, inode->i_blocks);
+	btrfs_set_inode_generation(item, inode->i_generation);
+}
+
+
+static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	ret = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, 1);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto failed;
+	}
+
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
+				  struct btrfs_inode_item);
+
+	fill_inode_item(inode_item, inode);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	ret = 0;
+failed:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	return ret;
+}
+
+
 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *dir,
@@ -166,10 +218,11 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
 	btrfs_free_path(path);
-	if (ret == 0) {
-		inode_dec_link_count(dentry->d_inode);
+	if (!ret) {
 		dir->i_size -= name_len * 2;
-		mark_inode_dirty(dir);
+		btrfs_update_inode(trans, root, dir);
+		drop_nlink(dentry->d_inode);
+		btrfs_update_inode(trans, root, dentry->d_inode);
 	}
 	return ret;
 }
@@ -668,56 +721,6 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	return 0;
 }
 
-static void fill_inode_item(struct btrfs_inode_item *item,
-			    struct inode *inode)
-{
-	btrfs_set_inode_uid(item, inode->i_uid);
-	btrfs_set_inode_gid(item, inode->i_gid);
-	btrfs_set_inode_size(item, inode->i_size);
-	btrfs_set_inode_mode(item, inode->i_mode);
-	btrfs_set_inode_nlink(item, inode->i_nlink);
-	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
-	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
-	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
-	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
-	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
-	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
-	btrfs_set_inode_nblocks(item, inode->i_blocks);
-	btrfs_set_inode_generation(item, inode->i_generation);
-}
-
-static int btrfs_update_inode(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *inode)
-{
-	struct btrfs_inode_item *inode_item;
-	struct btrfs_path *path;
-	int ret;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, 1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto failed;
-	}
-
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_inode_item);
-
-	fill_inode_item(inode_item, inode);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	ret = 0;
-failed:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
 static int btrfs_write_inode(struct inode *inode, int wait)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-- 
cgit v1.2.3


From f2458e1d8c90958ed3631654cb7fd5ab01478505 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Apr 2007 15:52:25 -0400
Subject: Btrfs: change around extent-tree prealloc

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  10 +++--
 fs/btrfs/disk-io.c     |   3 +-
 fs/btrfs/extent-tree.c | 101 ++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 87 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2d166ca8b10..26d0cdd46f4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -244,12 +244,17 @@ struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *dev_root;
-	struct btrfs_key current_insert;
 	struct btrfs_key last_insert;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root dev_radix;
+
+	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
+	int extent_tree_insert_nr;
+	u64 extent_tree_prealloc[BTRFS_MAX_LEVEL * 3];
+	int extent_tree_prealloc_nr;
+
 	u64 generation;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
@@ -267,8 +272,7 @@ struct btrfs_fs_info {
 
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
- * and for the extent tree extent_root root.  current_insert is used
- * only for the extent tree.
+ * and for the extent tree extent_root root.
  */
 struct btrfs_root {
 	struct buffer_head *node;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7aff6bb55d9..956727f015a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -542,6 +542,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->do_barriers = 1;
+	fs_info->extent_tree_insert_nr = 0;
+	fs_info->extent_tree_prealloc_nr = 0;
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -555,7 +557,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
-	memset(&fs_info->current_insert, 0, sizeof(fs_info->current_insert));
 	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
 
 	__setup_root(sb->s_blocksize, dev_root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 116519503d0..e6fe3fd3881 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -169,9 +169,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid);
 
-	for (i = 0; i < extent_root->fs_info->current_insert.flags; i++) {
-		ins.objectid = extent_root->fs_info->current_insert.objectid +
-				i;
+	for (i = 0; i < extent_root->fs_info->extent_tree_insert_nr; i++) {
+		ins.objectid = extent_root->fs_info->extent_tree_insert[i];
 		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used + 1);
@@ -179,7 +178,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 					sizeof(extent_item));
 		BUG_ON(ret);
 	}
-	extent_root->fs_info->current_insert.offset = 0;
+	extent_root->fs_info->extent_tree_insert_nr = 0;
+	extent_root->fs_info->extent_tree_prealloc_nr = 0;
 	return 0;
 }
 
@@ -349,7 +349,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int start_found;
 	struct btrfs_leaf *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
+	struct btrfs_fs_info *info = root->fs_info;
 	int total_needed = num_blocks;
+	int total_found = 0;
+	int fill_prealloc = 0;
 	int level;
 
 	path = btrfs_alloc_path();
@@ -357,8 +360,12 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
-	total_needed += (level + 2) * 3;
-	if (root->fs_info->last_insert.objectid == 0 && search_end == (u64)-1) {
+	if (num_blocks == 0) {
+		fill_prealloc = 1;
+		num_blocks = 1;
+		total_needed = min(level + 2, BTRFS_MAX_LEVEL) * 3;
+	}
+	if (info->last_insert.objectid == 0 && search_end == (u64)-1) {
 		struct btrfs_disk_key *last_key;
 		btrfs_init_path(path);
 		ins->objectid = (u64)-1;
@@ -373,8 +380,8 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		last_key = &l->items[path->slots[0]].key;
 		search_start = btrfs_disk_key_objectid(last_key);
 	}
-	if (root->fs_info->last_insert.objectid > search_start)
-		search_start = root->fs_info->last_insert.objectid;
+	if (info->last_insert.objectid > search_start)
+		search_start = info->last_insert.objectid;
 
 check_failed:
 	btrfs_init_path(path);
@@ -392,6 +399,10 @@ check_failed:
 		l = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
+			if (fill_prealloc) {
+				info->extent_tree_prealloc_nr = 0;
+				total_found = 0;
+			}
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -399,13 +410,13 @@ check_failed:
 				goto error;
 			if (!start_found) {
 				ins->objectid = search_start;
-				ins->offset = (u64)-1;
+				ins->offset = (u64)-1 - search_start;
 				start_found = 1;
 				goto check_pending;
 			}
 			ins->objectid = last_block > search_start ?
 					last_block : search_start;
-			ins->offset = (u64)-1;
+			ins->offset = (u64)-1 - ins->objectid;
 			goto check_pending;
 		}
 		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
@@ -414,7 +425,7 @@ check_failed:
 				if (last_block < search_start)
 					last_block = search_start;
 				hole_size = key.objectid - last_block;
-				if (hole_size > total_needed) {
+				if (hole_size > num_blocks) {
 					ins->objectid = last_block;
 					ins->offset = hole_size;
 					goto check_pending;
@@ -433,17 +444,51 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
 	for (test_block = ins->objectid;
-	     test_block < ins->objectid + total_needed; test_block++) {
-		if (test_radix_bit(&root->fs_info->pinned_radix,
-				      test_block)) {
+	     test_block < ins->objectid + num_blocks; test_block++) {
+		if (test_radix_bit(&info->pinned_radix, test_block)) {
 			search_start = test_block + 1;
 			goto check_failed;
 		}
 	}
-	BUG_ON(root->fs_info->current_insert.offset);
-	root->fs_info->current_insert.offset = total_needed - num_blocks;
-	root->fs_info->current_insert.objectid = ins->objectid + num_blocks;
-	root->fs_info->current_insert.flags = 0;
+	if (!fill_prealloc && info->extent_tree_insert_nr) {
+		u64 last =
+		  info->extent_tree_insert[info->extent_tree_insert_nr - 1];
+		if (ins->objectid + num_blocks >
+		    info->extent_tree_insert[0] &&
+		    ins->objectid <= last) {
+			search_start = last + 1;
+			WARN_ON(1);
+			goto check_failed;
+		}
+	}
+	if (!fill_prealloc && info->extent_tree_prealloc_nr) {
+		u64 first =
+		  info->extent_tree_prealloc[info->extent_tree_prealloc_nr - 1];
+		if (ins->objectid + num_blocks > first &&
+		    ins->objectid <= info->extent_tree_prealloc[0]) {
+			search_start = info->extent_tree_prealloc[0] + 1;
+			WARN_ON(1);
+			goto check_failed;
+		}
+	}
+	if (fill_prealloc) {
+		int nr;
+		test_block = ins->objectid;
+		while(test_block < ins->objectid + ins->offset &&
+		      total_found < total_needed) {
+			nr = total_needed - total_found - 1;
+			BUG_ON(nr < 0);
+			root->fs_info->extent_tree_prealloc[nr] =
+				test_block;
+			total_found++;
+			test_block++;
+		}
+		if (total_found < total_needed) {
+			search_start = test_block;
+			goto check_failed;
+		}
+		root->fs_info->extent_tree_prealloc_nr = total_found;
+	}
 	root->fs_info->last_insert.objectid = ins->objectid;
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -472,25 +517,35 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
+	struct btrfs_key prealloc_key;
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, owner);
 
 	if (root == extent_root) {
-		BUG_ON(extent_root->fs_info->current_insert.offset == 0);
+		int nr;
+		BUG_ON(info->extent_tree_prealloc_nr == 0);
 		BUG_ON(num_blocks != 1);
-		BUG_ON(extent_root->fs_info->current_insert.flags ==
-		       extent_root->fs_info->current_insert.offset);
 		ins->offset = 1;
-		ins->objectid = extent_root->fs_info->current_insert.objectid +
-				extent_root->fs_info->current_insert.flags++;
+		info->extent_tree_prealloc_nr--;
+		nr = info->extent_tree_prealloc_nr;
+		ins->objectid = info->extent_tree_prealloc[nr];
+		info->extent_tree_insert[info->extent_tree_insert_nr++] =
+			ins->objectid;
 		return 0;
 	}
+	/* do the real allocation */
 	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, ins);
 	if (ret)
 		return ret;
 
+	/* then do prealloc for the extent tree */
+	ret = find_free_extent(trans, root, 0, ins->objectid + ins->offset,
+			       search_end, &prealloc_key);
+	if (ret)
+		return ret;
+
 	super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 	btrfs_set_super_blocks_used(info->disk_super, super_blocks_used +
 				    num_blocks);
-- 
cgit v1.2.3


From 9078a3e1e4e489dddc636c7bb8780349d4298743 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 26 Apr 2007 16:46:15 -0400
Subject: Btrfs: start of block group code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  66 ++++++++++++----
 fs/btrfs/disk-io.c     |   4 +
 fs/btrfs/extent-tree.c | 204 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/print-tree.c  |   7 ++
 fs/btrfs/super.c       |   1 -
 fs/btrfs/transaction.c |   2 +
 6 files changed, 252 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 26d0cdd46f4..419917279e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -239,6 +239,19 @@ struct btrfs_device_item {
 	__le64 device_id;
 } __attribute__ ((__packed__));
 
+/* tag for the radix tree of block groups in ram */
+#define BTRFS_BLOCK_GROUP_DIRTY 0
+#define BTRFS_BLOCK_GROUP_HINTS 8
+#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
+struct btrfs_block_group_item {
+	__le64 used;
+} __attribute__ ((__packed__));
+
+struct btrfs_block_group_cache {
+	struct btrfs_key key;
+	struct btrfs_block_group_item item;
+};
+
 struct crypto_hash;
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
@@ -249,6 +262,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root dev_radix;
+	struct radix_tree_root block_group_radix;
 
 	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
 	int extent_tree_insert_nr;
@@ -301,49 +315,67 @@ struct btrfs_root {
  * info about object characteristics.  There is one for every file and dir in
  * the FS
  */
-#define BTRFS_INODE_ITEM_KEY	1
+#define BTRFS_INODE_ITEM_KEY		1
+
+/* reserve 2-15 close to the inode for later flexibility */
 
 /*
  * dir items are the name -> inode pointers in a directory.  There is one
  * for every name in a directory.
  */
-#define BTRFS_DIR_ITEM_KEY	2
-#define BTRFS_DIR_INDEX_KEY	3
+#define BTRFS_DIR_ITEM_KEY	16
+#define BTRFS_DIR_INDEX_KEY	17
 /*
- * inline data is file data that fits in the btree.
+ * extent data is for file data
  */
-#define BTRFS_INLINE_DATA_KEY	4
-/*
- * extent data is for data that can't fit in the btree.  It points to
- * a (hopefully) huge chunk of disk
- */
-#define BTRFS_EXTENT_DATA_KEY	5
+#define BTRFS_EXTENT_DATA_KEY	18
 /*
  * csum items have the checksums for data in the extents
  */
-#define BTRFS_CSUM_ITEM_KEY	6
+#define BTRFS_CSUM_ITEM_KEY	19
+
+/* reserve 20-31 for other file stuff */
 
 /*
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY	7
+#define BTRFS_ROOT_ITEM_KEY	32
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY	8
+#define BTRFS_EXTENT_ITEM_KEY	33
+
+/*
+ * block groups give us hints into the extent allocation trees.  Which
+ * blocks are free etc etc
+ */
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 34
 
 /*
  * dev items list the devices that make up the FS
  */
-#define BTRFS_DEV_ITEM_KEY	9
+#define BTRFS_DEV_ITEM_KEY	35
 
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
-#define BTRFS_STRING_ITEM_KEY	10
+#define BTRFS_STRING_ITEM_KEY	253
+
+
+static inline u64 btrfs_block_group_used(struct btrfs_block_group_item *bi)
+{
+	return le64_to_cpu(bi->used);
+}
+
+static inline void btrfs_set_block_group_used(struct
+						   btrfs_block_group_item *bi,
+						   u64 val)
+{
+	bi->used = cpu_to_le64(val);
+}
 
 static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
 {
@@ -1037,6 +1069,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks);
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root);
+int btrfs_free_block_groups(struct btrfs_fs_info *info);
+int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 956727f015a..1c27eb64551 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -529,6 +529,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	init_bit_radix(&fs_info->pending_del_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
+	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
@@ -613,6 +614,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	BUG_ON(ret);
 
+	btrfs_read_block_groups(extent_root);
+
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
 	kobj_set_kset_s(fs_info, btrfs_subsys);
@@ -741,6 +744,7 @@ int close_ctree(struct btrfs_root *root)
 	iput(fs_info->btree_inode);
 
 	free_dev_radix(fs_info);
+	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6fe3fd3881..0bb4fc83cfd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -127,6 +127,105 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 }
 
+static int write_one_cache_group(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_block_group_cache *cache)
+{
+	int ret;
+	int pending_ret;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item *bi;
+	struct btrfs_key ins;
+
+	find_free_extent(trans, extent_root, 0, 0, (u64)-1, &ins);
+	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+	BUG_ON(ret);
+	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			    struct btrfs_block_group_item);
+	memcpy(bi, &cache->item, sizeof(*bi));
+	mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(extent_root, path);
+
+	finish_current_insert(trans, extent_root);
+	pending_ret = del_pending_extents(trans, extent_root);
+	if (ret)
+		return ret;
+	if (pending_ret)
+		return pending_ret;
+	return 0;
+
+}
+
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *cache[8];
+	int ret;
+	int err = 0;
+	int werr = 0;
+	struct radix_tree_root *radix = &root->fs_info->block_group_radix;
+	int i;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
+						 0, ARRAY_SIZE(cache),
+						 BTRFS_BLOCK_GROUP_DIRTY);
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			radix_tree_tag_clear(radix, cache[i]->key.objectid +
+					     cache[i]->key.offset - 1,
+					     BTRFS_BLOCK_GROUP_DIRTY);
+			err = write_one_cache_group(trans, root,
+						    path, cache[i]);
+			if (err)
+				werr = err;
+		}
+	}
+	btrfs_free_path(path);
+	return werr;
+}
+
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      u64 blocknr, u64 num, int alloc)
+{
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 total = num;
+	u64 old_val;
+	u64 block_in_group;
+	int ret;
+	while(total) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&cache, blocknr, 1);
+		if (!ret)
+			return -1;
+		block_in_group = blocknr - cache->key.objectid;
+		WARN_ON(block_in_group > cache->key.offset);
+		radix_tree_tag_set(&info->block_group_radix,
+				   cache->key.objectid + cache->key.offset - 1,
+				   BTRFS_BLOCK_GROUP_DIRTY);
+
+		old_val = btrfs_block_group_used(&cache->item);
+		num = min(total, cache->key.offset - block_in_group);
+		total -= num;
+		blocknr += num;
+		if (alloc)
+			old_val += num;
+		else
+			old_val -= num;
+		btrfs_set_block_group_used(&cache->item, old_val);
+	}
+	return 0;
+}
+
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
@@ -264,6 +363,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret)
 			BUG();
+		ret = update_block_group(trans, root, blocknr, num_blocks, 0);
+		BUG_ON(ret);
 	}
 	btrfs_release_path(extent_root, path);
 	btrfs_free_path(path);
@@ -365,21 +466,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_blocks = 1;
 		total_needed = min(level + 2, BTRFS_MAX_LEVEL) * 3;
 	}
-	if (info->last_insert.objectid == 0 && search_end == (u64)-1) {
-		struct btrfs_disk_key *last_key;
-		btrfs_init_path(path);
-		ins->objectid = (u64)-1;
-		ins->offset = (u64)-1;
-		ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
-		if (ret < 0)
-			goto error;
-		BUG_ON(ret == 0);
-		if (path->slots[0] > 0)
-			path->slots[0]--;
-		l = btrfs_buffer_leaf(path->nodes[0]);
-		last_key = &l->items[path->slots[0]].key;
-		search_start = btrfs_disk_key_objectid(last_key);
-	}
 	if (info->last_insert.objectid > search_start)
 		search_start = info->last_insert.objectid;
 
@@ -420,6 +506,8 @@ check_failed:
 			goto check_pending;
 		}
 		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+			goto next;
 		if (key.objectid >= search_start) {
 			if (start_found) {
 				if (last_block < search_start)
@@ -434,6 +522,7 @@ check_failed:
 		}
 		start_found = 1;
 		last_block = key.objectid + key.offset;
+next:
 		path->slots[0]++;
 	}
 	// FIXME -ENOSPC
@@ -498,7 +587,6 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
-
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -532,6 +620,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		ins->objectid = info->extent_tree_prealloc[nr];
 		info->extent_tree_insert[info->extent_tree_insert_nr++] =
 			ins->objectid;
+		ret = update_block_group(trans, root,
+					 ins->objectid, ins->offset, 1);
+		BUG_ON(ret);
 		return 0;
 	}
 	/* do the real allocation */
@@ -558,6 +649,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		return ret;
 	if (pending_ret)
 		return pending_ret;
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	return 0;
 }
 
@@ -578,6 +670,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		BUG();
 		return NULL;
 	}
+	BUG_ON(ret);
 	buf = btrfs_find_create_tree_block(root, ins.objectid);
 	set_buffer_uptodate(buf);
 	return buf;
@@ -758,3 +851,82 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	int ret;
+	struct btrfs_block_group_cache *cache[8];
+	int i;
+
+	while(1) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)cache, 0,
+					     ARRAY_SIZE(cache));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			radix_tree_delete(&info->block_group_radix,
+					  cache[i]->key.objectid +
+					  cache[i]->key.offset - 1);
+			kfree(cache[i]);
+		}
+	}
+	return 0;
+}
+
+int btrfs_read_block_groups(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	int ret;
+	int err = 0;
+	struct btrfs_block_group_item *bi;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_leaf *leaf;
+	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
+
+	root = root->fs_info->extent_root;
+	key.objectid = 0;
+	key.offset = group_size_blocks;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
+					&key, path, 0, 0);
+		if (ret != 0) {
+			err = ret;
+			break;
+		}
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		cache = kmalloc(sizeof(*cache), GFP_NOFS);
+		if (!cache) {
+			err = -1;
+			break;
+		}
+		bi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_block_group_item);
+		memcpy(&cache->item, bi, sizeof(*bi));
+		memcpy(&cache->key, &found_key, sizeof(found_key));
+		key.objectid = found_key.objectid + found_key.offset;
+		btrfs_release_path(root, path);
+		ret = radix_tree_insert(&root->fs_info->block_group_radix,
+					found_key.objectid +
+					found_key.offset - 1,
+					(void *)cache);
+		BUG_ON(ret);
+		if (key.objectid >=
+		    btrfs_super_total_blocks(root->fs_info->disk_super))
+			break;
+	}
+
+	btrfs_free_path(path);
+	return 0;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1e7038b070a..2f95fc67a03 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -11,6 +11,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
+	struct btrfs_block_group_item *bi;
 	u32 type;
 
 	printk("leaf %Lu total ptrs %d free space %d\n",
@@ -53,6 +54,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			printk("\t\textent data refs %u\n",
 				btrfs_extent_refs(ei));
 			break;
+		case BTRFS_BLOCK_GROUP_ITEM_KEY:
+			bi = btrfs_item_ptr(l, i,
+					    struct btrfs_block_group_item);
+			printk("\t\tblock group used %Lu\n",
+			       btrfs_block_group_used(bi));
+			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
 				btrfs_leaf_data(l) + btrfs_item_offset(item));
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5bbccbc7e3c..edcebf79b04 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -377,7 +377,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
 		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
-		    btrfs_disk_key_type(found_key) != BTRFS_INLINE_DATA_KEY &&
 		    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 078cb9cbf9d..8a2545f6a5b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -125,6 +125,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		btrfs_set_super_device_root(fs_info->disk_super,
 					    bh_blocknr(dev_root->node));
 	}
+	btrfs_write_dirty_block_groups(trans, extent_root);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
 		if (old_extent_block == bh_blocknr(extent_root->node))
@@ -135,6 +136,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 					&extent_root->root_key,
 					&extent_root->root_item);
 		BUG_ON(ret);
+		btrfs_write_dirty_block_groups(trans, extent_root);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From cd1bc4653dc37f6390f4d6df4f987044c64f700b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 27 Apr 2007 10:08:34 -0400
Subject: Btrfs: more block allocator work

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  4 ++-
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c | 95 +++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/super.c       |  3 ++
 4 files changed, 90 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 419917279e6..c432222d40e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -250,6 +250,8 @@ struct btrfs_block_group_item {
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	u64 first_free;
+	u64 last_alloc;
 };
 
 struct crypto_hash;
@@ -257,7 +259,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *dev_root;
-	struct btrfs_key last_insert;
+	struct btrfs_block_group_cache *block_group_cache;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1c27eb64551..2489ffa5fb3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -558,7 +558,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
-	memset(&fs_info->last_insert, 0, sizeof(fs_info->last_insert));
+	fs_info->block_group_cache = NULL;
 
 	__setup_root(sb->s_blocksize, dev_root,
 		     fs_info, BTRFS_DEV_TREE_OBJECTID);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0bb4fc83cfd..71e3b311fc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,6 +12,63 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
+static int find_search_start(struct btrfs_root *root, int data)
+{
+	struct btrfs_block_group_cache *cache[8];
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 used;
+	u64 last;
+	int i;
+	int ret;
+
+	cache[0] = info->block_group_cache;
+	if (!cache[0])
+		goto find_new;
+	used = btrfs_block_group_used(&cache[0]->item);
+	if (used < (cache[0]->key.offset * 3 / 2))
+		return 0;
+find_new:
+	last = 0;
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(&info->block_group_radix,
+						 (void **)cache,
+						 last, ARRAY_SIZE(cache),
+						 BTRFS_BLOCK_GROUP_DIRTY);
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			used = btrfs_block_group_used(&cache[i]->item);
+			if (used < (cache[i]->key.offset * 3 / 2)) {
+				info->block_group_cache = cache[i];
+				cache[i]->last_alloc = cache[i]->first_free;
+				return 0;
+			}
+			last = cache[i]->key.objectid +
+				cache[i]->key.offset - 1;
+		}
+	}
+	last = 0;
+	while(1) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+						 (void **)cache,
+						 last, ARRAY_SIZE(cache));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			used = btrfs_block_group_used(&cache[i]->item);
+			if (used < (cache[i]->key.offset * 3 / 2)) {
+				info->block_group_cache = cache[i];
+				cache[i]->last_alloc = cache[i]->first_free;
+				return 0;
+			}
+			last = cache[i]->key.objectid +
+				cache[i]->key.offset - 1;
+		}
+	}
+	info->block_group_cache = NULL;
+	return 0;
+}
+
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks)
@@ -205,8 +262,11 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	while(total) {
 		ret = radix_tree_gang_lookup(&info->block_group_radix,
 					     (void **)&cache, blocknr, 1);
-		if (!ret)
+		if (!ret) {
+			printk(KERN_CRIT "blocknr %Lu lookup failed\n",
+			       blocknr);
 			return -1;
+		}
 		block_in_group = blocknr - cache->key.objectid;
 		WARN_ON(block_in_group > cache->key.offset);
 		radix_tree_tag_set(&info->block_group_radix,
@@ -217,10 +277,15 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		num = min(total, cache->key.offset - block_in_group);
 		total -= num;
 		blocknr += num;
-		if (alloc)
+		if (alloc) {
 			old_val += num;
-		else
+			if (blocknr > cache->last_alloc)
+				cache->last_alloc = blocknr;
+		} else {
 			old_val -= num;
+			if (blocknr < cache->first_free)
+				cache->first_free = blocknr;
+		}
 		btrfs_set_block_group_used(&cache->item, old_val);
 	}
 	return 0;
@@ -246,9 +311,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			clear_radix_bit(pinned_radix, gang[i]);
 		}
 	}
-	if (root->fs_info->last_insert.objectid > first)
-		root->fs_info->last_insert.objectid = first;
-	root->fs_info->last_insert.offset = 0;
+	root->fs_info->block_group_cache = NULL;
 	return 0;
 }
 
@@ -466,8 +529,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_blocks = 1;
 		total_needed = min(level + 2, BTRFS_MAX_LEVEL) * 3;
 	}
-	if (info->last_insert.objectid > search_start)
-		search_start = info->last_insert.objectid;
+	find_search_start(root, 0);
+	if (info->block_group_cache &&
+	    info->block_group_cache->last_alloc > search_start)
+		search_start = info->block_group_cache->last_alloc;
 
 check_failed:
 	btrfs_init_path(path);
@@ -567,8 +632,7 @@ check_pending:
 		      total_found < total_needed) {
 			nr = total_needed - total_found - 1;
 			BUG_ON(nr < 0);
-			root->fs_info->extent_tree_prealloc[nr] =
-				test_block;
+			info->extent_tree_prealloc[nr] = test_block;
 			total_found++;
 			test_block++;
 		}
@@ -576,9 +640,14 @@ check_pending:
 			search_start = test_block;
 			goto check_failed;
 		}
-		root->fs_info->extent_tree_prealloc_nr = total_found;
+		info->extent_tree_prealloc_nr = total_found;
+	}
+	ret = radix_tree_gang_lookup(&info->block_group_radix,
+				     (void **)&info->block_group_cache,
+				     ins->objectid, 1);
+	if (ret) {
+		info->block_group_cache->last_alloc = ins->objectid;
 	}
-	root->fs_info->last_insert.objectid = ins->objectid;
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
 	return 0;
@@ -915,6 +984,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				    struct btrfs_block_group_item);
 		memcpy(&cache->item, bi, sizeof(*bi));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
+		cache->last_alloc = 0;
+		cache->first_free = 0;
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		ret = radix_tree_insert(&root->fs_info->block_group_radix,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index edcebf79b04..a10e902d310 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -223,6 +223,7 @@ err:
 		btrfs_update_inode(trans, root, dir);
 		drop_nlink(dentry->d_inode);
 		btrfs_update_inode(trans, root, dentry->d_inode);
+		dir->i_sb->s_dirt = 1;
 	}
 	return ret;
 }
@@ -411,6 +412,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	inode->i_sb->s_dirt = 1;
 	return ret;
 }
 
@@ -935,6 +937,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_fail;
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
+	dir->i_sb->s_dirt = 1;
 
 out_fail:
 	btrfs_end_transaction(trans, root);
-- 
cgit v1.2.3


From 28b8bb9e001aa1c4abdd10a8e36972658237fe43 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 27 Apr 2007 11:42:05 -0400
Subject: Btrfs: allocator tweaks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 3 ---
 fs/btrfs/extent-tree.c | 7 +++++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2489ffa5fb3..e1b6e13a5ae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -94,9 +94,6 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
 	} while (bh != head);
 out_unlock:
 	unlock_page(page);
-	if (ret) {
-		touch_buffer(ret);
-	}
 	page_cache_release(page);
 	return ret;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71e3b311fc4..aca5802a8a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -311,7 +311,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			clear_radix_bit(pinned_radix, gang[i]);
 		}
 	}
-	root->fs_info->block_group_cache = NULL;
+	if (root->fs_info->block_group_cache) {
+		root->fs_info->block_group_cache->last_alloc =
+			root->fs_info->block_group_cache->first_free;
+	}
 	return 0;
 }
 
@@ -578,7 +581,7 @@ check_failed:
 				if (last_block < search_start)
 					last_block = search_start;
 				hole_size = key.objectid - last_block;
-				if (hole_size > num_blocks) {
+				if (hole_size >= num_blocks) {
 					ins->objectid = last_block;
 					ins->offset = hole_size;
 					goto check_pending;
-- 
cgit v1.2.3


From 06a2f9fa4c12a055cc396936408a78ae0acfb6b4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 28 Apr 2007 08:48:10 -0400
Subject: Btrfs: try to drop dead cow pages from ram

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  3 +--
 fs/btrfs/extent-tree.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 5ff2cef9f02..6a8c8cd03ca 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -6,8 +6,7 @@
 * Check compat and incompat flags on the inode
 * Get rid of struct ctree_path, limiting tree levels held at one time
 * Add generation number to key pointer in nodes
-* Add parent pointer back to extent map.
-* Implement real list of pending transactions
+* Add generation number to inode
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index aca5802a8a1..afc5267515e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -291,10 +291,18 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int try_remove_page(struct address_space *mapping, unsigned long index)
+{
+	int ret;
+	ret = invalidate_mapping_pages(mapping, index, index);
+	return ret;
+}
+
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
 	unsigned long gang[8];
+	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 first = 0;
 	int ret;
 	int i;
@@ -309,6 +317,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
+			try_remove_page(btree_inode->i_mapping,
+					gang[i] << (PAGE_CACHE_SHIFT -
+						    btree_inode->i_blkbits));
 		}
 	}
 	if (root->fs_info->block_group_cache) {
@@ -600,6 +611,12 @@ check_pending:
 	 */
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
+	if (ins->objectid >= btrfs_super_total_blocks(info->disk_super)) {
+		if (search_start == 0)
+			return -ENOSPC;
+		search_start = 0;
+		goto check_failed;
+	}
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + num_blocks; test_block++) {
 		if (test_radix_bit(&info->pinned_radix, test_block)) {
-- 
cgit v1.2.3


From 7c4452b9a6ca7aabe37ea2e43d443110bdc08cd8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 28 Apr 2007 09:29:35 -0400
Subject: Btrfs: smarter transaction writeback

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/super.c       |  1 -
 fs/btrfs/transaction.c | 37 +++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.h |  1 +
 4 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index afc5267515e..652cf305a96 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -762,6 +762,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	buf = btrfs_find_create_tree_block(root, ins.objectid);
 	set_buffer_uptodate(buf);
+	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
 	return buf;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a10e902d310..1890e8648db 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -980,7 +980,6 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
-	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a2545f6a5b..f9b8864dcc4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -45,6 +45,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+		init_bit_radix(&cur_trans->dirty_pages);
 	}
 	cur_trans->num_writers++;
 	return 0;
@@ -106,8 +107,40 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	filemap_write_and_wait(root->fs_info->btree_inode->i_mapping);
-	return 0;
+	unsigned long gang[16];
+	int ret;
+	int i;
+	int err;
+	int werr = 0;
+	struct page *page;
+	struct radix_tree_root *dirty_pages;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+
+	if (!trans || !trans->transaction) {
+		return filemap_write_and_wait(btree_inode->i_mapping);
+	}
+	dirty_pages = &trans->transaction->dirty_pages;
+	while(1) {
+		ret = find_first_radix_bit(dirty_pages, gang, ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			/* FIXME EIO */
+			clear_radix_bit(dirty_pages, gang[i]);
+			page = find_lock_page(btree_inode->i_mapping,
+					      gang[i]);
+			if (!page)
+				continue;
+			err = write_one_page(page, 0);
+			if (err)
+				werr = err;
+			page_cache_release(page);
+		}
+	}
+	err = filemap_fdatawait(btree_inode->i_mapping);
+	if (err)
+		werr = err;
+	return werr;
 }
 
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 3cc29900a07..afe42d167ce 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -9,6 +9,7 @@ struct btrfs_transaction {
 	int commit_done;
 	int magic;
 	struct list_head list;
+	struct radix_tree_root dirty_pages;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
-- 
cgit v1.2.3


From 308535a05e4c39d2be26e0aeee722682deeb6f77 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 28 Apr 2007 15:17:08 -0400
Subject: Btrfs: prealloc more blocks for the extent map

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 652cf305a96..62051a36664 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -541,7 +541,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (num_blocks == 0) {
 		fill_prealloc = 1;
 		num_blocks = 1;
-		total_needed = min(level + 2, BTRFS_MAX_LEVEL) * 3;
+		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
 	find_search_start(root, 0);
 	if (info->block_group_cache &&
-- 
cgit v1.2.3


From 31f3c99b73483f7b738a886c552050cbd6128ff3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 30 Apr 2007 15:25:45 -0400
Subject: Btrfs: allocator improvements, inode block groups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   1 +
 fs/btrfs/ctree.c       |  10 ++--
 fs/btrfs/ctree.h       |  18 ++++++-
 fs/btrfs/extent-tree.c | 138 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/super.c       |  53 +++++++++++++++----
 fs/btrfs/transaction.c |   1 +
 fs/btrfs/transaction.h |  15 ++++++
 7 files changed, 181 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43a4f1ddb37..6b50076b02d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -3,6 +3,7 @@
 
 struct btrfs_inode {
 	struct btrfs_root *root;
+	struct btrfs_block_group_cache *block_group;	/* alloc hint */
 	struct btrfs_key location;
 	struct inode vfs_inode;
 };
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b4783bf8bf4..dbd3f636dd3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -61,7 +61,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		*cow_ret = buf;
 		return 0;
 	}
-	cow = btrfs_alloc_free_block(trans, root);
+	cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
 	cow_node = btrfs_buffer_node(cow);
 	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
 		WARN_ON(1);
@@ -800,7 +800,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(trans, root);
+	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
 	c = btrfs_buffer_node(t);
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
@@ -905,7 +905,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(trans, root);
+	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
@@ -1277,7 +1277,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
-	right_buffer = btrfs_alloc_free_block(trans, root);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
@@ -1374,7 +1374,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (!double_split)
 		return ret;
-	right_buffer = btrfs_alloc_free_block(trans, root);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
 	BUG_ON(!right_buffer);
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c432222d40e..e6bf9919536 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,6 +174,7 @@ struct btrfs_inode_item {
 	__le64 generation;
 	__le64 size;
 	__le64 nblocks;
+	__le64 block_group;
 	__le32 nlink;
 	__le32 uid;
 	__le32 gid;
@@ -241,6 +242,7 @@ struct btrfs_device_item {
 
 /* tag for the radix tree of block groups in ram */
 #define BTRFS_BLOCK_GROUP_DIRTY 0
+#define BTRFS_BLOCK_GROUP_AVAIL 1
 #define BTRFS_BLOCK_GROUP_HINTS 8
 #define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
 struct btrfs_block_group_item {
@@ -410,6 +412,17 @@ static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val)
 	i->nblocks = cpu_to_le64(val);
 }
 
+static inline u64 btrfs_inode_block_group(struct btrfs_inode_item *i)
+{
+	return le64_to_cpu(i->block_group);	/* field is stored as __le64 */
+}
+
+static inline void btrfs_set_inode_block_group(struct btrfs_inode_item *i,
+						u64 val)
+{
+	i->block_group = cpu_to_le64(val);	/* field is stored as __le64 */
+}
+
 static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i)
 {
 	return le32_to_cpu(i->nlink);
@@ -1054,10 +1067,13 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+						 struct btrfs_block_group_cache
+						 *hint, int data);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root);
+					    struct btrfs_root *root, u64 hint);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
 		       u64 num_blocks, u64 search_start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62051a36664..8b8cbe25fff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,42 +12,57 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
-static int find_search_start(struct btrfs_root *root, int data)
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+						 struct btrfs_block_group_cache
+						 *hint, int data)
 {
 	struct btrfs_block_group_cache *cache[8];
+	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
-	u64 last;
+	u64 last = 0;
+	u64 hint_last;
 	int i;
 	int ret;
-
-	cache[0] = info->block_group_cache;
-	if (!cache[0])
-		goto find_new;
-	used = btrfs_block_group_used(&cache[0]->item);
-	if (used < (cache[0]->key.offset * 3 / 2))
-		return 0;
-find_new:
-	last = 0;
+	int full_search = 0;
+	if (hint) {
+		used = btrfs_block_group_used(&hint->item);
+		if (used < (hint->key.offset * 2) / 3) {
+			return hint;
+		}
+		radix_tree_tag_clear(&info->block_group_radix,
+				     hint->key.objectid + hint->key.offset - 1,
+				     BTRFS_BLOCK_GROUP_AVAIL);
+		last = hint->key.objectid + hint->key.offset;
+		hint_last = last;
+	} else {
+		hint_last = 0;
+		last = 0;
+	}
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(&info->block_group_radix,
 						 (void **)cache,
 						 last, ARRAY_SIZE(cache),
-						 BTRFS_BLOCK_GROUP_DIRTY);
+						 BTRFS_BLOCK_GROUP_AVAIL);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < (cache[i]->key.offset * 3 / 2)) {
+			if (used < (cache[i]->key.offset * 2) / 3) {
 				info->block_group_cache = cache[i];
-				cache[i]->last_alloc = cache[i]->first_free;
-				return 0;
+				found_group = cache[i];
+				goto found;
 			}
+			radix_tree_tag_clear(&info->block_group_radix,
+					   cache[i]->key.objectid +
+					   cache[i]->key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
 			last = cache[i]->key.objectid +
-				cache[i]->key.offset - 1;
+				cache[i]->key.offset;
 		}
 	}
-	last = 0;
+	last = hint_last;
+again:
 	while(1) {
 		ret = radix_tree_gang_lookup(&info->block_group_radix,
 						 (void **)cache,
@@ -56,17 +71,32 @@ find_new:
 			break;
 		for (i = 0; i < ret; i++) {
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < (cache[i]->key.offset * 3 / 2)) {
+			if (used < cache[i]->key.offset) {
 				info->block_group_cache = cache[i];
-				cache[i]->last_alloc = cache[i]->first_free;
-				return 0;
+				found_group = cache[i];
+				goto found;
 			}
+			radix_tree_tag_clear(&info->block_group_radix,
+					   cache[i]->key.objectid +
+					   cache[i]->key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
 			last = cache[i]->key.objectid +
-				cache[i]->key.offset - 1;
+				cache[i]->key.offset;
 		}
 	}
 	info->block_group_cache = NULL;
-	return 0;
+	if (!full_search) {
+		last = 0;
+		full_search = 1;
+		goto again;
+	}
+found:
+	if (!found_group) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&found_group, 0, 1);
+		BUG_ON(ret != 1);
+	}
+	return found_group;
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -243,6 +273,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 						    path, cache[i]);
 			if (err)
 				werr = err;
+			cache[i]->last_alloc = cache[i]->first_free;
 		}
 	}
 	btrfs_free_path(path);
@@ -322,10 +353,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 						    btree_inode->i_blkbits));
 		}
 	}
-	if (root->fs_info->block_group_cache) {
-		root->fs_info->block_group_cache->last_alloc =
-			root->fs_info->block_group_cache->first_free;
-	}
 	return 0;
 }
 
@@ -532,22 +559,43 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int total_found = 0;
 	int fill_prealloc = 0;
 	int level;
+	int update_block_group = 0;
+	struct btrfs_block_group_cache *hint_block_group;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	/* find search start here */
+	if (0 && search_start && num_blocks) {
+		u64 used;
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&hint_block_group,
+					     search_start, 1);
+		if (ret) {
+			used = btrfs_block_group_used(&hint_block_group->item);
+			if (used > (hint_block_group->key.offset * 9) / 10)
+				search_start = 0;
+			else if (search_start < hint_block_group->last_alloc)
+				search_start = hint_block_group->last_alloc;
+		} else {
+			search_start = 0;
+		}
+	}
 	if (num_blocks == 0) {
 		fill_prealloc = 1;
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
-	find_search_start(root, 0);
-	if (info->block_group_cache &&
-	    info->block_group_cache->last_alloc > search_start)
-		search_start = info->block_group_cache->last_alloc;
-
+	if (1 || !search_start) {
+		trans->block_group = btrfs_find_block_group(root,
+							    trans->block_group,
+							    0);
+		if (trans->block_group->last_alloc > search_start)
+			search_start = trans->block_group->last_alloc;
+		update_block_group = 1;
+	}
 check_failed:
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -662,11 +710,13 @@ check_pending:
 		}
 		info->extent_tree_prealloc_nr = total_found;
 	}
-	ret = radix_tree_gang_lookup(&info->block_group_radix,
-				     (void **)&info->block_group_cache,
-				     ins->objectid, 1);
-	if (ret) {
-		info->block_group_cache->last_alloc = ins->objectid;
+	if (update_block_group) {
+		ret = radix_tree_gang_lookup(&info->block_group_radix,
+					     (void **)&trans->block_group,
+					     ins->objectid, 1);
+		if (ret) {
+			trans->block_group->last_alloc = ins->objectid;
+		}
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -747,14 +797,14 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root)
+					   struct btrfs_root *root, u64 hint)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, 0, (unsigned long)-1, &ins);
+				 1, hint, (unsigned long)-1, &ins);
 	if (ret) {
 		BUG();
 		return NULL;
@@ -975,6 +1025,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key found_key;
 	struct btrfs_leaf *leaf;
 	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
+	u64 used;
 
 	root = root->fs_info->extent_root;
 	key.objectid = 0;
@@ -1005,8 +1056,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				    struct btrfs_block_group_item);
 		memcpy(&cache->item, bi, sizeof(*bi));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
-		cache->last_alloc = 0;
-		cache->first_free = 0;
+		cache->last_alloc = cache->key.objectid;
+		cache->first_free = cache->key.objectid;
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		ret = radix_tree_insert(&root->fs_info->block_group_radix,
@@ -1014,6 +1065,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					found_key.offset - 1,
 					(void *)cache);
 		BUG_ON(ret);
+		used = btrfs_block_group_used(bi);
+		if (used < (key.offset * 2) / 3) {
+			radix_tree_tag_set(&root->fs_info->block_group_radix,
+					   found_key.objectid +
+					   found_key.offset - 1,
+					   BTRFS_BLOCK_GROUP_AVAIL);
+		}
 		if (key.objectid >=
 		    btrfs_super_total_blocks(root->fs_info->disk_super))
 			break;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1890e8648db..7ecbe7c8618 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,6 +52,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
+	struct btrfs_block_group_cache *alloc_group;
+	u64 alloc_group_block;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -82,6 +84,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
+	alloc_group_block = btrfs_inode_block_group(inode_item);
+	ret = radix_tree_gang_lookup(&root->fs_info->block_group_radix,
+				     (void **)&alloc_group,
+				     alloc_group_block, 1);
+	BUG_ON(!ret);
+	BTRFS_I(inode)->block_group = alloc_group;
 
 	btrfs_free_path(path);
 	inode_item = NULL;
@@ -136,6 +144,8 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
+	btrfs_set_inode_block_group(item,
+				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
 
@@ -237,6 +247,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -262,6 +273,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
 	key.flags = (u32)-1;
@@ -429,6 +441,7 @@ static void btrfs_delete_inode(struct inode *inode)
 	inode->i_size = 0;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	if (S_ISREG(inode->i_mode)) {
 		ret = btrfs_truncate_in_trans(trans, root, inode);
 		BUG_ON(ret);
@@ -731,6 +744,7 @@ static int btrfs_write_inode(struct inode *inode, int wait)
 	if (wait) {
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
 	}
@@ -744,6 +758,7 @@ static void btrfs_dirty_inode(struct inode *inode)
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -751,7 +766,9 @@ static void btrfs_dirty_inode(struct inode *inode)
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
-				     u64 objectid, int mode)
+				     u64 objectid,
+				     struct btrfs_block_group_cache *group,
+				     int mode)
 {
 	struct inode *inode;
 	struct btrfs_inode_item inode_item;
@@ -763,6 +780,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = root;
+	group = btrfs_find_block_group(root, group, 0);
+	BTRFS_I(inode)->block_group = group;
 
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
@@ -832,6 +851,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
@@ -839,11 +859,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid, mode);
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
-	// FIXME mark the inode dirty
+
+	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode);
 	if (err)
 		drop_inode = 1;
@@ -853,6 +875,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 	}
 	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -904,6 +928,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_unlock;
@@ -915,7 +940,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid, S_IFDIR | mode);
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -923,6 +949,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	drop_on_err = 1;
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
+	btrfs_set_trans_block_group(trans, inode);
 
 	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
 	if (err)
@@ -938,6 +965,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
 	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
 
 out_fail:
 	btrfs_end_transaction(trans, root);
@@ -1349,6 +1378,7 @@ static void btrfs_truncate(struct inode *inode)
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	BUG_ON(ret);
 	ret = btrfs_end_transaction(trans, root);
@@ -1445,6 +1475,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
 
 		bh = page_buffers(pages[i]);
 		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
@@ -1481,6 +1512,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			kunmap(pages[i]);
 		}
 		SetPageChecked(pages[i]);
+		btrfs_update_inode_block_group(trans, inode);
 		ret = btrfs_end_transaction(trans, root);
 		BUG_ON(ret);
 		mutex_unlock(&root->fs_info->fs_mutex);
@@ -1821,6 +1853,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		mutex_unlock(&root->fs_info->fs_mutex);
 		goto out_unlock;
 	}
+	btrfs_set_trans_block_group(trans, inode);
 	/* FIXME blocksize != 4096 */
 	inode->i_blocks += num_blocks << 3;
 	if (start_pos < inode->i_size) {
@@ -1845,6 +1878,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 	BUG_ON(ret);
 	alloc_extent_start = ins.objectid;
+	btrfs_update_inode_block_group(trans, inode);
 	ret = btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
@@ -2017,6 +2051,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_leaf *leaf;
 	struct btrfs_root *new_root;
 	struct inode *inode;
+	struct inode *dir;
 	int ret;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
@@ -2025,7 +2060,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	subvol = btrfs_alloc_free_block(trans, root);
+	subvol = btrfs_alloc_free_block(trans, root, 0);
 	if (subvol == NULL)
 		return -ENOSPC;
 	leaf = btrfs_buffer_leaf(subvol);
@@ -2069,10 +2104,9 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	 * insert the directory item
 	 */
 	key.offset = (u64)-1;
+	dir = root->fs_info->sb->s_root->d_inode;
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen,
-				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, 0);
+				    name, namelen, dir->i_ino, &key, 0);
 	BUG_ON(ret);
 
 	ret = btrfs_commit_transaction(trans, root);
@@ -2084,7 +2118,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	inode = btrfs_new_inode(trans, new_root, new_dirid, S_IFDIR | 0700);
+	inode = btrfs_new_inode(trans, new_root, new_dirid,
+				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f9b8864dcc4..bdbf514c111 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -77,6 +77,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
+	h->block_group = NULL;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	h->magic = h->magic2 = TRANS_MAGIC;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index afe42d167ce..b5378119e76 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,5 +1,6 @@
 #ifndef __TRANSACTION__
 #define __TRANSACTION__
+#include "btrfs_inode.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -20,10 +21,24 @@ struct btrfs_trans_handle {
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
+	struct btrfs_block_group_cache *block_group;
 	int magic2;
 };
 
 
+static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
+					       struct inode *inode)
+{
+	trans->block_group = BTRFS_I(inode)->block_group; /* inode's hint */
+}
+
+static inline void btrfs_update_inode_block_group(struct
+						  btrfs_trans_handle *trans,
+						  struct inode *inode)
+{
+	BTRFS_I(inode)->block_group = trans->block_group; /* copy back */
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-- 
cgit v1.2.3


From 090d18753c7fb73f7d846c3a89a50cd35136d144 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 May 2007 08:53:32 -0400
Subject: Btrfs: directory readahead

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 29 ++++++++++++++++++++++++++++-
 fs/btrfs/disk-io.h     |  6 ++++++
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/super.c       | 28 ++++++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e1b6e13a5ae..354524adf98 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,6 +5,7 @@
 #include <linux/scatterlist.h>
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
+#include <linux/file.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -253,6 +254,29 @@ static struct address_space_operations btree_aops = {
 	.sync_page	= block_sync_page,
 };
 
+int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
+{
+	struct buffer_head *bh = NULL;
+	/* start reading blocknr without waiting for it; always returns 0 */
+	bh = btrfs_find_create_tree_block(root, blocknr);
+	if (!bh)
+		return 0;
+	if (buffer_uptodate(bh))	/* already read, nothing to do */
+		goto done;
+	if (test_set_buffer_locked(bh))	/* lock is held elsewhere, skip */
+		goto done;
+	if (!buffer_uptodate(bh)) {	/* recheck under the lock */
+		get_bh(bh);		/* ref for the read in flight */
+		bh->b_end_io = end_buffer_read_sync;
+		submit_bh(READ, bh);
+	} else {
+		unlock_buffer(bh);
+	}
+done:
+	brelse(bh);	/* drop our reference */
+	return 0;
+}
+
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
 	struct buffer_head *bh = NULL;
@@ -270,11 +294,14 @@ struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh))
 			goto fail;
-		csum_tree_block(root, bh, 1);
 	} else {
 		unlock_buffer(bh);
 	}
 uptodate:
+	if (!buffer_checked(bh)) {
+		csum_tree_block(root, bh, 1);
+		set_buffer_checked(bh);
+	}
 	if (check_tree_block(root, bh))
 		BUG();
 	return bh;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 444ebb0141a..1ee7d2a55b5 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -5,6 +5,11 @@
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 
+enum btrfs_bh_state_bits {
+	BH_Checked = BH_PrivateStart,	/* read_tree_block() skips the csum */
+};
+BUFFER_FNS(Checked, checked);	/* buffer_checked(), set_buffer_checked() */
+
 static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh)
 {
 	return (struct btrfs_node *)bh->b_data;
@@ -21,6 +26,7 @@ static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh)
 }
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr);
+int readahead_tree_block(struct btrfs_root *root, u64 blocknr);
 struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr);
 int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b8cbe25fff..2837fddf7b1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -812,6 +812,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	buf = btrfs_find_create_tree_block(root, ins.objectid);
 	set_buffer_uptodate(buf);
+	set_buffer_checked(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
 	return buf;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7ecbe7c8618..a29a781b86c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -585,6 +585,31 @@ printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_r
 	return d_splice_alias(inode, dentry);
 }
 
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
+{
+	struct btrfs_node *node;
+	int i;
+	int nritems;
+	u64 objectid;		/* objectid of the key at the current slot */
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	/* read ahead the blocks under nodes[1] that share the slot's objectid */
+	if (!path->nodes[1])	/* no level 1 node in this path */
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1];
+	objectid = btrfs_disk_key_objectid(&node->ptrs[slot].key);
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot; i < nritems; i++) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid != objectid)	/* key of another object */
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		readahead_tree_block(root, blocknr);
+	}
+}
+
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct inode *inode = filp->f_path.dentry->d_inode;
@@ -619,6 +644,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (ret < 0)
 		goto err;
 	advance = 0;
+	reada_leaves(root, path);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
@@ -631,6 +657,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				leaf = btrfs_buffer_leaf(path->nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path->slots[0];
+				if (path->slots[1] == 0)
+					reada_leaves(root, path);
 			} else {
 				slot++;
 				path->slots[0]++;
-- 
cgit v1.2.3


From 35b7e476107e3d54f03384e0f2fa3dfd68933353 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 May 2007 15:53:43 -0400
Subject: Btrfs: fix page cache memory leak

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |  2 --
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/dir-item.c    | 11 +++++-----
 fs/btrfs/disk-io.c     |  8 ++++++--
 fs/btrfs/disk-io.h     |  2 ++
 fs/btrfs/extent-tree.c | 12 +----------
 fs/btrfs/hash.c        |  1 +
 fs/btrfs/print-tree.c  |  1 +
 fs/btrfs/super.c       | 56 ++++++++++++++++++++++++++++++--------------------
 fs/btrfs/transaction.c |  8 +++++---
 10 files changed, 59 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 9fc42e99c7d..e9ace32d7ba 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -22,9 +22,7 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 			return -ENOMEM;
 		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
 		bits[0] = slot;
-		radix_tree_preload(GFP_NOFS);
 		ret = radix_tree_insert(radix, slot, bits);
-		radix_tree_preload_end();
 		if (ret)
 			return ret;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e6bf9919536..b5855a5365e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -8,6 +8,9 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+extern struct kmem_cache *btrfs_trans_handle_cachep;
+extern struct kmem_cache *btrfs_transaction_cachep;
+extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
 #define BTRFS_MAGIC "_BtRfS_M"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7a7e9846860..00a28d90fea 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -4,11 +4,12 @@
 #include "hash.h"
 #include "transaction.h"
 
-struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    struct btrfs_path *path,
-					    struct btrfs_key *cpu_key,
-					    u32 data_size)
+static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
+						   *trans,
+						   struct btrfs_root *root,
+						   struct btrfs_path *path,
+						   struct btrfs_key *cpu_key,
+						   u32 data_size)
 {
 	int ret;
 	char *ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 354524adf98..5828a104dfe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -5,7 +5,7 @@
 #include <linux/scatterlist.h>
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
-#include <linux/file.h>
+#include <linux/writeback.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -694,7 +694,7 @@ static int free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	return 0;
 }
 
-int del_fs_roots(struct btrfs_fs_info *fs_info)
+static int del_fs_roots(struct btrfs_fs_info *fs_info)
 {
 	int ret;
 	struct btrfs_root *gang[8];
@@ -781,3 +781,7 @@ void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 	brelse(buf);
 }
 
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
+{	/* apply dirty-page balancing to the btree inode's mapping */
+	balance_dirty_pages_ratelimited(root->fs_info->btree_inode->i_mapping);
+}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 1ee7d2a55b5..822ccb8aa4a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -55,4 +55,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   u64 num_blocks);
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical);
+int btrfs_releasepage(struct page *page, gfp_t flags);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2837fddf7b1..0e20d1c42fc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -322,18 +322,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int try_remove_page(struct address_space *mapping, unsigned long index)
-{
-	int ret;
-	ret = invalidate_mapping_pages(mapping, index, index);
-	return ret;
-}
-
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
 	unsigned long gang[8];
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 first = 0;
 	int ret;
 	int i;
@@ -348,9 +340,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
-			try_remove_page(btree_inode->i_mapping,
-					gang[i] << (PAGE_CACHE_SHIFT -
-						    btree_inode->i_blkbits));
 		}
 	}
 	return 0;
@@ -983,6 +972,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
+		btrfs_btree_balance_dirty(root);
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 22519b8e0cf..32de1ea1b64 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/types.h>
+#include "hash.h"
 #define DELTA 0x9E3779B9
 
 static void TEA_transform(__u32 buf[2], __u32 const in[])
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 2f95fc67a03..28813411de6 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -1,6 +1,7 @@
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
+#include "print-tree.h"
 
 void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a29a781b86c..130a1d3d9f5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -18,14 +18,14 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 
-void btrfs_fsinfo_release(struct kobject *obj)
+static void btrfs_fsinfo_release(struct kobject *obj)
 {
 	struct btrfs_fs_info *fsinfo = container_of(obj,
 					    struct btrfs_fs_info, kobj);
 	kfree(fsinfo);
 }
 
-struct kobj_type btrfs_fsinfo_ktype = {
+static struct kobj_type btrfs_fsinfo_ktype = {
 	.release = btrfs_fsinfo_release,
 };
 
@@ -148,7 +148,6 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
-
 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
@@ -251,6 +250,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 	return ret;
 }
 
@@ -324,6 +324,7 @@ out:
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -449,6 +450,7 @@ static void btrfs_delete_inode(struct inode *inode)
 	btrfs_free_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 	return;
 no_delete:
 	clear_inode(inode);
@@ -481,7 +483,7 @@ out:
 	return ret;
 }
 
-int fixup_tree_root_location(struct btrfs_root *root,
+static int fixup_tree_root_location(struct btrfs_root *root,
 			     struct btrfs_key *location,
 			     struct btrfs_root **sub_root)
 {
@@ -512,7 +514,7 @@ int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_init_locked_inode(struct inode *inode, void *p)
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
@@ -520,15 +522,15 @@ int btrfs_init_locked_inode(struct inode *inode, void *p)
 	return 0;
 }
 
-int btrfs_find_actor(struct inode *inode, void *opaque)
+static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
 	return (args->ino == inode->i_ino &&
 		args->root == BTRFS_I(inode)->root);
 }
 
-struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
@@ -790,6 +792,7 @@ static void btrfs_dirty_inode(struct inode *inode)
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -913,6 +916,7 @@ out_unlock:
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -1002,6 +1006,7 @@ out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_on_err)
 		iput(inode);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -1099,7 +1104,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
 		extent_end = 0;
 		extent_start = 0;
-		btrfs_release_path(root, path);
 		goto out;
 	}
 	found_type = btrfs_file_extent_type(item);
@@ -1135,7 +1139,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		btrfs_map_bh_to_logical(root, result, 0);
 	}
 out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return err;
 }
@@ -1231,13 +1234,13 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
 		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
 			WARN_ON(bh->b_size != blocksize);
 			err = btrfs_get_block(inode, block, bh, 0);
-			if (err)
+			if (err) {
+printk("writepage going to recovery err %d\n", err);
 				goto recover;
+			}
 			if (buffer_new(bh)) {
 				/* blockdev mappings never come here */
 				clear_buffer_new(bh);
-				unmap_underlying_metadata(bh->b_bdev,
-							bh->b_blocknr);
 			}
 		}
 		bh = bh->b_this_page;
@@ -1303,11 +1306,6 @@ done:
 		if (uptodate)
 			SetPageUptodate(page);
 		end_page_writeback(page);
-		/*
-		 * The page and buffer_heads can be released at any time from
-		 * here on.
-		 */
-		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 
@@ -1409,10 +1407,11 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	BUG_ON(ret);
+	btrfs_update_inode(trans, root, inode);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	mark_inode_dirty(inode);
+	btrfs_btree_balance_dirty(root);
 }
 
 /*
@@ -1777,10 +1776,15 @@ static int prepare_pages(struct btrfs_root *root,
 			err = -ENOMEM;
 			goto failed_release;
 		}
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+		wait_on_page_writeback(pages[i]);
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
-		create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
-				     (1 << BH_Uptodate));
+		if (!page_has_buffers(pages[i])) {
+			create_empty_buffers(pages[i],
+					     root->fs_info->sb->s_blocksize,
+					     (1 << BH_Uptodate));
+		}
 		head = page_buffers(pages[i]);
 		bh = head;
 		do {
@@ -1820,7 +1824,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *pages[8];
-	struct page *pinned[2] = { NULL, NULL };
+	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
 	u64 start_pos;
@@ -1829,6 +1833,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 
+	pinned[0] = NULL;
+	pinned[1] = NULL;
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
 	pos = *ppos;
@@ -1858,6 +1864,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (!PageUptodate(pinned[0])) {
 			ret = mpage_readpage(pinned[0], btrfs_get_block);
 			BUG_ON(ret);
+			wait_on_page_locked(pinned[0]);
 		} else {
 			unlock_page(pinned[0]);
 		}
@@ -1869,6 +1876,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (!PageUptodate(pinned[1])) {
 			ret = mpage_readpage(pinned[1], btrfs_get_block);
 			BUG_ON(ret);
+			wait_on_page_locked(pinned[1]);
 		} else {
 			unlock_page(pinned[1]);
 		}
@@ -1940,6 +1948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		num_written += write_bytes;
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_btree_balance_dirty(root);
 		cond_resched();
 	}
 out_unlock:
@@ -2165,6 +2174,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	iput(inode);
 
 	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 	return 0;
 }
 
@@ -2220,6 +2230,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 	return 0;
 }
 
@@ -2295,6 +2306,7 @@ out:
 	mutex_unlock(&root->fs_info->fs_mutex);
 out_nolock:
 	btrfs_free_path(path);
+	btrfs_btree_balance_dirty(root);
 
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bdbf514c111..8bbe9107ff7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -199,8 +199,9 @@ struct dirty_root {
 	struct btrfs_root *root;
 };
 
-int add_dirty_roots(struct btrfs_trans_handle *trans,
-		    struct radix_tree_root *radix, struct list_head *list)
+static int add_dirty_roots(struct btrfs_trans_handle *trans,
+			   struct radix_tree_root *radix,
+			   struct list_head *list)
 {
 	struct dirty_root *dirty;
 	struct btrfs_root *gang[8];
@@ -246,7 +247,8 @@ int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int drop_dirty_roots(struct btrfs_root *tree_root, struct list_head *list)
+static int drop_dirty_roots(struct btrfs_root *tree_root,
+			    struct list_head *list)
 {
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
-- 
cgit v1.2.3


From be08c1b9f8e679d45e086728445ac36cf250e92e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 May 2007 09:06:49 -0400
Subject: Btrfs: early metadata/data split

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c | 95 +++++++++++++++++++++++++-------------------------
 fs/btrfs/super.c       |  2 +-
 3 files changed, 50 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b5855a5365e..179a046ce5a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1080,7 +1080,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
 		       u64 num_blocks, u64 search_start,
-		       u64 search_end, struct btrfs_key *ins);
+		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0e20d1c42fc..c5ae51893f7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6,7 +6,7 @@
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, struct btrfs_key *ins);
+			    search_end, struct btrfs_key *ins, int data);
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -25,7 +25,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int i;
 	int ret;
 	int full_search = 0;
-	if (hint) {
+	if (!data && hint) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used < (hint->key.offset * 2) / 3) {
 			return hint;
@@ -47,6 +47,14 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
+			last = cache[i]->key.objectid +
+				cache[i]->key.offset;
+			if (!full_search && !data &&
+			   (cache[i]->key.objectid & cache[i]->key.offset))
+				continue;
+			if (!full_search && data &&
+			   (cache[i]->key.objectid & cache[i]->key.offset) == 0)
+				continue;
 			used = btrfs_block_group_used(&cache[i]->item);
 			if (used < (cache[i]->key.offset * 2) / 3) {
 				info->block_group_cache = cache[i];
@@ -57,8 +65,6 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 					   cache[i]->key.objectid +
 					   cache[i]->key.offset - 1,
 					   BTRFS_BLOCK_GROUP_AVAIL);
-			last = cache[i]->key.objectid +
-				cache[i]->key.offset;
 		}
 	}
 	last = hint_last;
@@ -70,6 +76,14 @@ again:
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
+			last = cache[i]->key.objectid +
+				cache[i]->key.offset;
+			if (!full_search && !data &&
+			   (cache[i]->key.objectid & cache[i]->key.offset))
+				continue;
+			if (!full_search && data &&
+			   (cache[i]->key.objectid & cache[i]->key.offset) == 0)
+				continue;
 			used = btrfs_block_group_used(&cache[i]->item);
 			if (used < cache[i]->key.offset) {
 				info->block_group_cache = cache[i];
@@ -80,8 +94,6 @@ again:
 					   cache[i]->key.objectid +
 					   cache[i]->key.offset - 1,
 					   BTRFS_BLOCK_GROUP_AVAIL);
-			last = cache[i]->key.objectid +
-				cache[i]->key.offset;
 		}
 	}
 	info->block_group_cache = NULL;
@@ -112,7 +124,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	u32 refs;
 
 	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1,
-			 &ins);
+			 &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
@@ -225,7 +237,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	find_free_extent(trans, extent_root, 0, 0, (u64)-1, &ins);
+	find_free_extent(trans, extent_root, 0, 0, (u64)-1, &ins, 0);
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 	BUG_ON(ret);
 	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
@@ -322,10 +334,18 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int try_remove_page(struct address_space *mapping, unsigned long index)
+{	/* invalidate the one-index range [index, index] of mapping */
+	int ret;
+	ret = invalidate_mapping_pages(mapping, index, index);
+	return ret;	/* result of invalidate_mapping_pages(), passed through */
+}
+
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *root)
 {
 	unsigned long gang[8];
+	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 first = 0;
 	int ret;
 	int i;
@@ -340,6 +360,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
+			try_remove_page(btree_inode->i_mapping,
+					gang[i] << (PAGE_CACHE_SHIFT -
+						    btree_inode->i_blkbits));
 		}
 	}
 	return 0;
@@ -424,7 +447,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
-	find_free_extent(trans, root, 0, 0, (u64)-1, &ins);
+	find_free_extent(trans, root, 0, 0, (u64)-1, &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
@@ -531,7 +554,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, struct btrfs_key *ins)
+			    search_end, struct btrfs_key *ins, int data)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -548,43 +571,21 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int total_found = 0;
 	int fill_prealloc = 0;
 	int level;
-	int update_block_group = 0;
-	struct btrfs_block_group_cache *hint_block_group;
+	struct btrfs_block_group_cache *block_group;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
-	/* find search start here */
-	if (0 && search_start && num_blocks) {
-		u64 used;
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
-					     (void **)&hint_block_group,
-					     search_start, 1);
-		if (ret) {
-			used = btrfs_block_group_used(&hint_block_group->item);
-			if (used > (hint_block_group->key.offset * 9) / 10)
-				search_start = 0;
-			else if (search_start < hint_block_group->last_alloc)
-				search_start = hint_block_group->last_alloc;
-		} else {
-			search_start = 0;
-		}
-	}
 	if (num_blocks == 0) {
 		fill_prealloc = 1;
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
-	if (1 || !search_start) {
-		trans->block_group = btrfs_find_block_group(root,
-							    trans->block_group,
-							    0);
-		if (trans->block_group->last_alloc > search_start)
-			search_start = trans->block_group->last_alloc;
-		update_block_group = 1;
-	}
+	block_group = btrfs_find_block_group(root, trans->block_group, data);
+	if (block_group->last_alloc > search_start)
+		search_start = block_group->last_alloc;
 check_failed:
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -699,13 +700,13 @@ check_pending:
 		}
 		info->extent_tree_prealloc_nr = total_found;
 	}
-	if (update_block_group) {
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
-					     (void **)&trans->block_group,
-					     ins->objectid, 1);
-		if (ret) {
-			trans->block_group->last_alloc = ins->objectid;
-		}
+	ret = radix_tree_gang_lookup(&info->block_group_radix,
+				     (void **)&block_group,
+				     ins->objectid, 1);
+	if (ret) {
+		block_group->last_alloc = ins->objectid;
+		if (!data)
+			trans->block_group = block_group;
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -725,7 +726,7 @@ error:
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
 		       u64 num_blocks, u64 search_start,
-		       u64 search_end, struct btrfs_key *ins)
+		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
 	int pending_ret;
@@ -755,13 +756,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 	/* do the real allocation */
 	ret = find_free_extent(trans, root, num_blocks, search_start,
-			       search_end, ins);
+			       search_end, ins, data);
 	if (ret)
 		return ret;
 
 	/* then do prealloc for the extent tree */
 	ret = find_free_extent(trans, root, 0, ins->objectid + ins->offset,
-			       search_end, &prealloc_key);
+			       search_end, &prealloc_key, 0);
 	if (ret)
 		return ret;
 
@@ -793,7 +794,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, hint, (unsigned long)-1, &ins);
+				 1, 0, (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG();
 		return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 130a1d3d9f5..23ba4e979fc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1903,7 +1903,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 1, (u64)-1, &ins);
+					 num_blocks, 1, (u64)-1, &ins, 1);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
-- 
cgit v1.2.3


From be74417553f4b2ee46be2088007a674ef2f02330 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 6 May 2007 10:15:01 -0400
Subject: Btrfs: more allocator enhancements

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   |   6 +-
 fs/btrfs/ctree.h       |   7 +-
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 317 +++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/super.c       |   2 +-
 5 files changed, 249 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index e9ace32d7ba..1a0271445df 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -26,8 +26,10 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 		if (ret)
 			return ret;
 	}
-	set_bit(bit_slot, bits + 1);
-	return 0;
+	ret = test_and_set_bit(bit_slot, bits + 1);
+	if (ret < 0)
+		ret = 1;
+	return ret;
 }
 
 int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 179a046ce5a..086e7dea3c9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -257,6 +257,8 @@ struct btrfs_block_group_cache {
 	struct btrfs_block_group_item item;
 	u64 first_free;
 	u64 last_alloc;
+	u64 pinned;
+	int data;
 };
 
 struct crypto_hash;
@@ -264,12 +266,12 @@ struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *dev_root;
-	struct btrfs_block_group_cache *block_group_cache;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root dev_radix;
 	struct radix_tree_root block_group_radix;
+	struct radix_tree_root block_group_data_radix;
 
 	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
 	int extent_tree_insert_nr;
@@ -1072,7 +1074,8 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 /* extent-tree.c */
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
-						 *hint, int data);
+						 *hint, u64 search_start,
+						 int data);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5828a104dfe..7930458c227 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -554,6 +554,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
+	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
@@ -582,7 +583,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
-	fs_info->block_group_cache = NULL;
 
 	__setup_root(sb->s_blocksize, dev_root,
 		     fs_info, BTRFS_DEV_TREE_OBJECTID);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c5ae51893f7..2937fd9aba7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,36 +12,88 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
+static struct btrfs_block_group_cache *lookup_block_group(struct
+							  btrfs_fs_info *info,
+							  u64 blocknr)
+{	/* return the group whose [objectid, objectid + offset) holds blocknr, or NULL */
+	struct btrfs_block_group_cache *block_group;
+	int ret;
+
+	ret = radix_tree_gang_lookup(&info->block_group_radix,
+				     (void **)&block_group,
+				     blocknr, 1);	/* try block_group_radix first */
+	if (ret) {	/* a candidate was returned: accept it only if it spans blocknr */
+		if (block_group->key.objectid <= blocknr && blocknr <
+		    block_group->key.objectid + block_group->key.offset)
+			return block_group;
+	}
+	ret = radix_tree_gang_lookup(&info->block_group_data_radix,
+				     (void **)&block_group,
+				     blocknr, 1);	/* then block_group_data_radix */
+	if (ret) {	/* same range check as above */
+		if (block_group->key.objectid <= blocknr && blocknr <
+		    block_group->key.objectid + block_group->key.offset)
+			return block_group;
+	}
+printk("lookup_block_group fails for blocknr %Lu\n", blocknr);
+	return NULL;	/* neither radix holds a group covering blocknr */
+}
+
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
-						 *hint, int data)
+						 *hint, u64 search_start,
+						 int data)
 {
 	struct btrfs_block_group_cache *cache[8];
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct radix_tree_root *radix;
 	u64 used;
 	u64 last = 0;
 	u64 hint_last;
 	int i;
 	int ret;
 	int full_search = 0;
-	if (!data && hint) {
+	/* Selection order: search_start's group, then hint, then scans of the radix. */
+	if (data)	/* data groups are searched in their own radix tree */
+		radix = &info->block_group_data_radix;
+	else
+		radix = &info->block_group_radix;
+
+	if (search_start) {	/* first choice: the group containing search_start */
+		struct btrfs_block_group_cache *shint;
+		shint = lookup_block_group(info, search_start);
+		if (shint && shint->data == data) {	/* lookup_block_group() can return NULL */
+			used = btrfs_block_group_used(&shint->item);
+			if (used + shint->pinned <
+			    (shint->key.offset * 8) / 10) {	/* used + pinned under 80% of the group */
+				return shint;
+			}
+		}
+	}
+	if (hint && hint->data == data) {	/* second choice: the caller's hint */
 		used = btrfs_block_group_used(&hint->item);
-		if (used < (hint->key.offset * 2) / 3) {
+		if (used + hint->pinned < (hint->key.offset * 8) / 10) {
 			return hint;
 		}
-		radix_tree_tag_clear(&info->block_group_radix,
-				     hint->key.objectid + hint->key.offset - 1,
-				     BTRFS_BLOCK_GROUP_AVAIL);
-		last = hint->key.objectid + hint->key.offset;
+		if (used >= (hint->key.offset * 8) / 10) {	/* AVAIL is dropped only when used alone hits 80% */
+			radix_tree_tag_clear(radix,
+					     hint->key.objectid +
+					     hint->key.offset - 1,
+					     BTRFS_BLOCK_GROUP_AVAIL);
+		}
+		last = hint->key.offset * 2;	/* start the scan two group-lengths before the hint when possible */
+		if (hint->key.objectid >= last)
+			last = max(search_start, hint->key.objectid - last);
+		else
+			last = hint->key.objectid + hint->key.offset;
 		hint_last = last;
 	} else {
-		hint_last = 0;
-		last = 0;
+		hint_last = search_start;
+		last = search_start;
 	}
 	while(1) {
-		ret = radix_tree_gang_lookup_tag(&info->block_group_radix,
-						 (void **)cache,
+		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
 						 last, ARRAY_SIZE(cache),
 						 BTRFS_BLOCK_GROUP_AVAIL);
 		if (!ret)
@@ -49,65 +101,54 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		for (i = 0; i < ret; i++) {
 			last = cache[i]->key.objectid +
 				cache[i]->key.offset;
-			if (!full_search && !data &&
-			   (cache[i]->key.objectid & cache[i]->key.offset))
-				continue;
-			if (!full_search && data &&
-			   (cache[i]->key.objectid & cache[i]->key.offset) == 0)
-				continue;
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < (cache[i]->key.offset * 2) / 3) {
-				info->block_group_cache = cache[i];
+			if (used + cache[i]->pinned <
+			    (cache[i]->key.offset * 8) / 10) {
 				found_group = cache[i];
 				goto found;
 			}
-			radix_tree_tag_clear(&info->block_group_radix,
-					   cache[i]->key.objectid +
-					   cache[i]->key.offset - 1,
-					   BTRFS_BLOCK_GROUP_AVAIL);
+			if (used >= (cache[i]->key.offset * 8) / 10) {
+				radix_tree_tag_clear(radix,
+						     cache[i]->key.objectid +
+						     cache[i]->key.offset - 1,
+						     BTRFS_BLOCK_GROUP_AVAIL);
+			}
 		}
 	}
 	last = hint_last;
 again:
 	while(1) {
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
-						 (void **)cache,
-						 last, ARRAY_SIZE(cache));
+		ret = radix_tree_gang_lookup(radix, (void **)cache,
+					     last, ARRAY_SIZE(cache));	/* this pass ignores the AVAIL tag */
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
 			last = cache[i]->key.objectid +
 				cache[i]->key.offset;
-			if (!full_search && !data &&
-			   (cache[i]->key.objectid & cache[i]->key.offset))
-				continue;
-			if (!full_search && data &&
-			   (cache[i]->key.objectid & cache[i]->key.offset) == 0)
-				continue;
 			used = btrfs_block_group_used(&cache[i]->item);
-			if (used < cache[i]->key.offset) {
-				info->block_group_cache = cache[i];
+			if (used + cache[i]->pinned < cache[i]->key.offset) {	/* any group not yet full */
 				found_group = cache[i];
 				goto found;
 			}
-			radix_tree_tag_clear(&info->block_group_radix,
-					   cache[i]->key.objectid +
-					   cache[i]->key.offset - 1,
-					   BTRFS_BLOCK_GROUP_AVAIL);
+			if (used >= cache[i]->key.offset) {
+				radix_tree_tag_clear(radix,
+						     cache[i]->key.objectid +
+						     cache[i]->key.offset - 1,
+						     BTRFS_BLOCK_GROUP_AVAIL);
+			}
 		}
 	}
-	info->block_group_cache = NULL;
 	if (!full_search) {
-		last = 0;
+		last = search_start;	/* retry the untagged scan once, starting from search_start */
 		full_search = 1;
 		goto again;
 	}
-found:
 	if (!found_group) {
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
+		ret = radix_tree_gang_lookup(radix,	/* last resort: first group in this radix */
 					     (void **)&found_group, 0, 1);
 		BUG_ON(ret != 1);
 	}
+found:
 	return found_group;
 }
 
@@ -252,18 +293,20 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 		return ret;
 	if (pending_ret)
 		return pending_ret;
+	if (cache->data)
+		cache->last_alloc = cache->first_free;
 	return 0;
 
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root)
+static int write_dirty_block_radix(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct radix_tree_root *radix)
 {
 	struct btrfs_block_group_cache *cache[8];
 	int ret;
 	int err = 0;
 	int werr = 0;
-	struct radix_tree_root *radix = &root->fs_info->block_group_radix;
 	int i;
 	struct btrfs_path *path;
 
@@ -285,35 +328,74 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 						    path, cache[i]);
 			if (err)
 				werr = err;
-			cache[i]->last_alloc = cache[i]->first_free;
 		}
 	}
 	btrfs_free_path(path);
 	return werr;
 }
 
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{	/* run write_dirty_block_radix() over both block group radix trees */
+	int ret;
+	int ret2;
+	ret = write_dirty_block_radix(trans, root,
+				      &root->fs_info->block_group_radix);
+	ret2 = write_dirty_block_radix(trans, root,
+				      &root->fs_info->block_group_data_radix);	/* called even if the first call failed */
+	if (ret)	/* an error from the first tree takes precedence over ret2 */
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0;
+}
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 blocknr, u64 num, int alloc)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct radix_tree_root *radix;
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
 	int ret;
+	if (num != 1)
+		radix = &info->block_group_data_radix;
+	else
+		radix = &info->block_group_radix;
 	while(total) {
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
-					     (void **)&cache, blocknr, 1);
+		ret = radix_tree_gang_lookup(radix, (void **)&cache,
+					     blocknr, 1);
 		if (!ret) {
 			printk(KERN_CRIT "blocknr %Lu lookup failed\n",
 			       blocknr);
 			return -1;
 		}
 		block_in_group = blocknr - cache->key.objectid;
+		if (block_in_group > cache->key.offset || cache->key.objectid >
+		    blocknr) {
+			if (radix == &info->block_group_data_radix)
+				radix = &info->block_group_radix;
+			else
+				radix = &info->block_group_data_radix;
+			ret = radix_tree_gang_lookup(radix, (void **)&cache,
+						     blocknr, 1);
+			if (!ret) {
+				printk(KERN_CRIT "blocknr %Lu lookup failed\n",
+				       blocknr);
+				return -1;
+			}
+			block_in_group = blocknr - cache->key.objectid;
+			if (block_in_group > cache->key.offset ||
+			    cache->key.objectid > blocknr) {
+				BUG();
+			}
+		}
 		WARN_ON(block_in_group > cache->key.offset);
-		radix_tree_tag_set(&info->block_group_radix,
-				   cache->key.objectid + cache->key.offset - 1,
+		radix_tree_tag_set(radix, cache->key.objectid +
+				   cache->key.offset - 1,
 				   BTRFS_BLOCK_GROUP_DIRTY);
 
 		old_val = btrfs_block_group_used(&cache->item);
@@ -346,6 +428,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 {
 	unsigned long gang[8];
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct btrfs_block_group_cache *block_group;
 	u64 first = 0;
 	int ret;
 	int i;
@@ -360,6 +443,14 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
+			block_group = lookup_block_group(root->fs_info,
+							 gang[i]);
+			if (block_group) {
+				WARN_ON(block_group->pinned == 0);
+				block_group->pinned--;
+				if (gang[i] < block_group->last_alloc)
+					block_group->last_alloc = gang[i];
+			}
 			try_remove_page(btree_inode->i_mapping,
 					gang[i] << (PAGE_CACHE_SHIFT -
 						    btree_inode->i_blkbits));
@@ -420,10 +511,16 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 			btrfs_block_release(root, bh);
 		}
 		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
+		if (!err) {
+			struct btrfs_block_group_cache *cache;
+			cache = lookup_block_group(root->fs_info, blocknr);
+			if (cache)
+				cache->pinned++;
+		}
 	} else {
 		err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr);
 	}
-	BUG_ON(err);
+	BUG_ON(err < 0);
 	return 0;
 }
 
@@ -502,6 +599,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int i;
 	struct radix_tree_root *pending_radix;
 	struct radix_tree_root *pinned_radix;
+	struct btrfs_block_group_cache *cache;
 
 	pending_radix = &extent_root->fs_info->pending_del_radix;
 	pinned_radix = &extent_root->fs_info->pinned_radix;
@@ -513,7 +611,17 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			break;
 		for (i = 0; i < ret; i++) {
 			wret = set_radix_bit(pinned_radix, gang[i]);
-			BUG_ON(wret);
+			if (wret == 0) {
+				cache = lookup_block_group(extent_root->fs_info,
+							   gang[i]);
+				if (cache)
+					cache->pinned++;
+			}
+			if (wret < 0) {
+				printk(KERN_CRIT "set_radix_bit, err %d\n",
+				       wret);
+				BUG_ON(wret < 0);
+			}
 			wret = clear_radix_bit(pending_radix, gang[i]);
 			BUG_ON(wret);
 			wret = __free_extent(trans, extent_root,
@@ -563,6 +671,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int slot = 0;
 	u64 last_block = 0;
 	u64 test_block;
+	u64 orig_search_start = search_start;
 	int start_found;
 	struct btrfs_leaf *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
@@ -572,6 +681,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int fill_prealloc = 0;
 	int level;
 	struct btrfs_block_group_cache *block_group;
+	int full_scan = 0;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
@@ -583,10 +693,21 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
-	block_group = btrfs_find_block_group(root, trans->block_group, data);
+	if (search_start) {
+		block_group = lookup_block_group(info, search_start);
+		block_group = btrfs_find_block_group(root, block_group,
+						     search_start, data);
+	} else {
+		block_group = btrfs_find_block_group(root,
+						     trans->block_group, 0,
+						     data);
+	}
+
+check_failed:
+	if (block_group->data != data)
+		WARN_ON(1);
 	if (block_group->last_alloc > search_start)
 		search_start = block_group->last_alloc;
-check_failed:
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
@@ -639,6 +760,13 @@ check_failed:
 		}
 		start_found = 1;
 		last_block = key.objectid + key.offset;
+		if (last_block >= block_group->key.objectid +
+		    block_group->key.offset) {
+			btrfs_release_path(root, path);
+			search_start = block_group->key.objectid +
+				block_group->key.offset * 2;
+			goto new_group;
+		}
 next:
 		path->slots[0]++;
 	}
@@ -650,16 +778,17 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
 	if (ins->objectid >= btrfs_super_total_blocks(info->disk_super)) {
-		if (search_start == 0)
+		if (full_scan)
 			return -ENOSPC;
-		search_start = 0;
-		goto check_failed;
+		search_start = orig_search_start;
+		full_scan = 1;
+		goto new_group;
 	}
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + num_blocks; test_block++) {
 		if (test_radix_bit(&info->pinned_radix, test_block)) {
 			search_start = test_block + 1;
-			goto check_failed;
+			goto new_group;
 		}
 	}
 	if (!fill_prealloc && info->extent_tree_insert_nr) {
@@ -670,7 +799,7 @@ check_pending:
 		    ins->objectid <= last) {
 			search_start = last + 1;
 			WARN_ON(1);
-			goto check_failed;
+			goto new_group;
 		}
 	}
 	if (!fill_prealloc && info->extent_tree_prealloc_nr) {
@@ -680,7 +809,7 @@ check_pending:
 		    ins->objectid <= info->extent_tree_prealloc[0]) {
 			search_start = info->extent_tree_prealloc[0] + 1;
 			WARN_ON(1);
-			goto check_failed;
+			goto new_group;
 		}
 	}
 	if (fill_prealloc) {
@@ -696,14 +825,12 @@ check_pending:
 		}
 		if (total_found < total_needed) {
 			search_start = test_block;
-			goto check_failed;
+			goto new_group;
 		}
 		info->extent_tree_prealloc_nr = total_found;
 	}
-	ret = radix_tree_gang_lookup(&info->block_group_radix,
-				     (void **)&block_group,
-				     ins->objectid, 1);
-	if (ret) {
+	block_group = lookup_block_group(info, ins->objectid);
+	if (block_group) {
 		block_group->last_alloc = ins->objectid;
 		if (!data)
 			trans->block_group = block_group;
@@ -711,6 +838,18 @@ check_pending:
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
 	return 0;
+
+new_group:
+	if (search_start >= btrfs_super_total_blocks(info->disk_super)) {
+		search_start = orig_search_start;
+		full_scan = 1;
+	}
+	block_group = lookup_block_group(info, search_start);
+	if (!full_scan)
+		block_group = btrfs_find_block_group(root, block_group,
+						     search_start, data);
+	goto check_failed;
+
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -794,7 +933,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, 0, (unsigned long)-1, &ins, 0);
+				 1, hint, (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG();
 		return NULL;
@@ -984,21 +1123,19 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
+static int free_block_group_radix(struct radix_tree_root *radix)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache[8];
 	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup(&info->block_group_radix,
-					     (void **)cache, 0,
+		ret = radix_tree_gang_lookup(radix, (void **)cache, 0,
 					     ARRAY_SIZE(cache));
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			radix_tree_delete(&info->block_group_radix,
-					  cache[i]->key.objectid +
+			radix_tree_delete(radix, cache[i]->key.objectid +
 					  cache[i]->key.offset - 1);
 			kfree(cache[i]);
 		}
@@ -1006,6 +1143,20 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	return 0;
 }
 
+int btrfs_free_block_groups(struct btrfs_fs_info *info) /* release the cached block groups held in both radix trees of info */
+{
+	int ret;
+	int ret2;
+	/* both trees are always processed, even if the first call reports an error */
+	ret = free_block_group_radix(&info->block_group_radix);
+	ret2 = free_block_group_radix(&info->block_group_data_radix);
+	if (ret) /* the result for block_group_radix takes priority */
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0; /* 0 only when both calls returned 0 */
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -1013,13 +1164,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	int err = 0;
 	struct btrfs_block_group_item *bi;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct radix_tree_root *radix;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_leaf *leaf;
 	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
 	u64 used;
+	u64 nr = 0;
 
-	root = root->fs_info->extent_root;
+	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = group_size_blocks;
 	key.flags = 0;
@@ -1030,7 +1184,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		return -ENOMEM;
 
 	while(1) {
-		ret = btrfs_search_slot(NULL, root->fs_info->extent_root,
+		ret = btrfs_search_slot(NULL, info->extent_root,
 					&key, path, 0, 0);
 		if (ret != 0) {
 			err = ret;
@@ -1050,23 +1204,28 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
 		cache->first_free = cache->key.objectid;
+		cache->pinned = 0;
+		cache->data = (nr & 1);
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
-		ret = radix_tree_insert(&root->fs_info->block_group_radix,
-					found_key.objectid +
+		if (nr & 1)
+			radix = &info->block_group_data_radix;
+		else
+			radix = &info->block_group_radix;
+		ret = radix_tree_insert(radix, found_key.objectid +
 					found_key.offset - 1,
 					(void *)cache);
 		BUG_ON(ret);
 		used = btrfs_block_group_used(bi);
-		if (used < (key.offset * 2) / 3) {
-			radix_tree_tag_set(&root->fs_info->block_group_radix,
-					   found_key.objectid +
+		if (used < (key.offset * 8) / 10) {
+			radix_tree_tag_set(radix, found_key.objectid +
 					   found_key.offset - 1,
 					   BTRFS_BLOCK_GROUP_AVAIL);
 		}
 		if (key.objectid >=
-		    btrfs_super_total_blocks(root->fs_info->disk_super))
+		    btrfs_super_total_blocks(info->disk_super))
 			break;
+		nr++;
 	}
 
 	btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 23ba4e979fc..d5ac0d8ee3c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -811,7 +811,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = root;
-	group = btrfs_find_block_group(root, group, 0);
+	group = btrfs_find_block_group(root, group, 0, 0);
 	BTRFS_I(inode)->block_group = group;
 
 	inode->i_uid = current->fsuid;
-- 
cgit v1.2.3


From 3e1ad54fe2839319c1aa66b954da0753f5b1f906 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 7 May 2007 20:03:49 -0400
Subject: Btrfs: allocator and tuning

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c | 75 ++++++++++++++++++++++----------------------------
 2 files changed, 34 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 086e7dea3c9..cdb7c23c41f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -255,6 +255,7 @@ struct btrfs_block_group_item {
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct radix_tree_root *radix;
 	u64 first_free;
 	u64 last_alloc;
 	u64 pinned;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2937fd9aba7..3edfc300289 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,7 @@ static struct btrfs_block_group_cache *lookup_block_group(struct
 				     (void **)&block_group,
 				     blocknr, 1);
 	if (ret) {
-		if (block_group->key.objectid <= blocknr && blocknr <
+		if (block_group->key.objectid <= blocknr && blocknr <=
 		    block_group->key.objectid + block_group->key.offset)
 			return block_group;
 	}
@@ -31,11 +31,16 @@ static struct btrfs_block_group_cache *lookup_block_group(struct
 				     (void **)&block_group,
 				     blocknr, 1);
 	if (ret) {
-		if (block_group->key.objectid <= blocknr && blocknr <
+		if (block_group->key.objectid <= blocknr && blocknr <=
 		    block_group->key.objectid + block_group->key.offset)
 			return block_group;
 	}
-printk("lookup_block_group fails for blocknr %Lu\n", blocknr);
+	WARN_ON(1);
+	printk("lookup_block_group fails for blocknr %Lu\n", blocknr);
+	printk("last ret was %d\n", ret);
+	if (ret) {
+		printk("last block group was %Lu %Lu\n", block_group->key.objectid, block_group->key.offset);
+	}
 	return NULL;
 }
 
@@ -356,45 +361,20 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct radix_tree_root *radix;
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
-	int ret;
-	if (num != 1)
-		radix = &info->block_group_data_radix;
-	else
-		radix = &info->block_group_radix;
+
 	while(total) {
-		ret = radix_tree_gang_lookup(radix, (void **)&cache,
-					     blocknr, 1);
-		if (!ret) {
+		cache = lookup_block_group(info, blocknr);
+		if (!cache) {
 			printk(KERN_CRIT "blocknr %Lu lookup failed\n",
 			       blocknr);
 			return -1;
 		}
 		block_in_group = blocknr - cache->key.objectid;
-		if (block_in_group > cache->key.offset || cache->key.objectid >
-		    blocknr) {
-			if (radix == &info->block_group_data_radix)
-				radix = &info->block_group_radix;
-			else
-				radix = &info->block_group_data_radix;
-			ret = radix_tree_gang_lookup(radix, (void **)&cache,
-						     blocknr, 1);
-			if (!ret) {
-				printk(KERN_CRIT "blocknr %Lu lookup failed\n",
-				       blocknr);
-				return -1;
-			}
-			block_in_group = blocknr - cache->key.objectid;
-			if (block_in_group > cache->key.offset ||
-			    cache->key.objectid > blocknr) {
-				BUG();
-			}
-		}
 		WARN_ON(block_in_group > cache->key.offset);
-		radix_tree_tag_set(radix, cache->key.objectid +
+		radix_tree_tag_set(cache->radix, cache->key.objectid +
 				   cache->key.offset - 1,
 				   BTRFS_BLOCK_GROUP_DIRTY);
 
@@ -693,6 +673,8 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
+	if (search_end == (u64)-1)
+		search_end = btrfs_super_total_blocks(info->disk_super);
 	if (search_start) {
 		block_group = lookup_block_group(info, search_start);
 		block_group = btrfs_find_block_group(root, block_group,
@@ -704,7 +686,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 check_failed:
-	if (block_group->data != data)
+	if (!full_scan && block_group->data != data)
 		WARN_ON(1);
 	if (block_group->last_alloc > search_start)
 		search_start = block_group->last_alloc;
@@ -734,13 +716,13 @@ check_failed:
 				goto error;
 			if (!start_found) {
 				ins->objectid = search_start;
-				ins->offset = (u64)-1 - search_start;
+				ins->offset = search_end - search_start;
 				start_found = 1;
 				goto check_pending;
 			}
 			ins->objectid = last_block > search_start ?
 					last_block : search_start;
-			ins->offset = (u64)-1 - ins->objectid;
+			ins->offset = search_end - ins->objectid;
 			goto check_pending;
 		}
 		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
@@ -777,7 +759,7 @@ check_pending:
 	 */
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
-	if (ins->objectid >= btrfs_super_total_blocks(info->disk_super)) {
+	if (ins->objectid + num_blocks >= search_end) {
 		if (full_scan)
 			return -ENOSPC;
 		search_start = orig_search_start;
@@ -840,7 +822,7 @@ check_pending:
 	return 0;
 
 new_group:
-	if (search_start >= btrfs_super_total_blocks(info->disk_super)) {
+	if (search_start + num_blocks >= search_end) {
 		search_start = orig_search_start;
 		full_scan = 1;
 	}
@@ -900,7 +882,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		return ret;
 
 	/* then do prealloc for the extent tree */
-	ret = find_free_extent(trans, root, 0, ins->objectid + ins->offset,
+	if (ins->objectid + ins->offset >= search_end)
+		search_end = ins->objectid - 1;
+	else
+		search_start = ins->objectid + ins->offset;
+
+	ret = find_free_extent(trans, root, 0, search_start,
 			       search_end, &prealloc_key, 0);
 	if (ret)
 		return ret;
@@ -1198,6 +1185,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			err = -1;
 			break;
 		}
+
+		if (nr & 1)
+			radix = &info->block_group_data_radix;
+		else
+			radix = &info->block_group_radix;
+
 		bi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_block_group_item);
 		memcpy(&cache->item, bi, sizeof(*bi));
@@ -1206,12 +1199,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache->first_free = cache->key.objectid;
 		cache->pinned = 0;
 		cache->data = (nr & 1);
+		cache->radix = radix;
+
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
-		if (nr & 1)
-			radix = &info->block_group_data_radix;
-		else
-			radix = &info->block_group_radix;
 		ret = radix_tree_insert(radix, found_key.objectid +
 					found_key.offset - 1,
 					(void *)cache);
-- 
cgit v1.2.3


From e37c9e6921207cf503634b06bee37ecb7904408d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 May 2007 20:13:14 -0400
Subject: Btrfs: many allocator fixes, pretty solid

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |   1 +
 fs/btrfs/bit-radix.c   |  10 +-
 fs/btrfs/bit-radix.h   |   2 +-
 fs/btrfs/ctree.h       |   3 +
 fs/btrfs/disk-io.c     |   1 +
 fs/btrfs/extent-tree.c | 375 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/super.c       |   4 +-
 fs/btrfs/transaction.c |   3 +-
 8 files changed, 340 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 6a8c8cd03ca..f6df246f26c 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -7,6 +7,7 @@
 * Get rid of struct ctree_path, limiting tree levels held at one time
 * Add generation number to key pointer in nodes
 * Add generation number to inode
+* Add ability to switch a block group from data to metadata or vice versa
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 1a0271445df..8f9cd427723 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -77,7 +77,7 @@ int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
 }
 
 int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 int nr)
+			 unsigned long start, int nr)
 {
 	unsigned long *bits;
 	unsigned long *gang[4];
@@ -85,10 +85,13 @@ int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
 	int ret;
 	int i;
 	int total_found = 0;
+	unsigned long slot;
 
-	ret = radix_tree_gang_lookup(radix, (void **)gang, 0, ARRAY_SIZE(gang));
+	slot = start / BIT_RADIX_BITS_PER_ARRAY;
+	ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
+				     ARRAY_SIZE(gang));
+	found = start % BIT_RADIX_BITS_PER_ARRAY;
 	for (i = 0; i < ret && nr > 0; i++) {
-		found = 0;
 		bits = gang[i];
 		while(nr > 0) {
 			found = find_next_bit(bits + 1,
@@ -104,6 +107,7 @@ int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
 			} else
 				break;
 		}
+		found = 0;
 	}
 	return total_found;
 }
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
index 56aad4c7d7f..4e717e30db4 100644
--- a/fs/btrfs/bit-radix.h
+++ b/fs/btrfs/bit-radix.h
@@ -6,7 +6,7 @@ int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
 int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
 int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
 int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 int nr);
+			 unsigned long start, int nr);
 
 static inline void init_bit_radix(struct radix_tree_root *radix)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cdb7c23c41f..92a6078de82 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -259,7 +259,9 @@ struct btrfs_block_group_cache {
 	u64 first_free;
 	u64 last_alloc;
 	u64 pinned;
+	u64 last_prealloc;
 	int data;
+	int cached;
 };
 
 struct crypto_hash;
@@ -273,6 +275,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root dev_radix;
 	struct radix_tree_root block_group_radix;
 	struct radix_tree_root block_group_data_radix;
+	struct radix_tree_root extent_map_radix;
 
 	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
 	int extent_tree_insert_nr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7930458c227..2dbf422a2b9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -551,6 +551,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
+	init_bit_radix(&fs_info->extent_map_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3edfc300289..3ac9da45347 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,6 +12,97 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
+static int cache_block_group(struct btrfs_root *root, /* record the gaps between extent items of block_group as set bits in extent_map_radix */
+			     struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	struct radix_tree_root *extent_radix;
+	int slot;
+	u64 i;
+	u64 last = 0; /* first block after the most recent extent item seen */
+	u64 hole_size;
+	int found = 0; /* becomes 1 once the first extent item has been seen */
+	/* the scan always runs against the extent root, whatever root was passed in */
+	root = root->fs_info->extent_root;
+	extent_radix = &root->fs_info->extent_map_radix;
+	/* nothing to do for groups already cached or flagged as data: return 0 */
+	if (block_group->cached)
+		return 0;
+	if (block_group->data)
+		return 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+printk("cache block group %Lu\n", block_group->key.objectid);
+	key.objectid = block_group->key.objectid; /* search key: (group start, EXTENT_ITEM type, offset 0) */
+	key.flags = 0;
+	key.offset = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret; /* NOTE(review): path is not passed to btrfs_free_path() on this return - looks like a leak, confirm */
+	if (ret && path->slots[0] > 0) /* nonzero ret: step back one slot so the scan starts at the preceding item */
+		path->slots[0]--;
+	while(1) {
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(&leaf->header)) { /* walked past the last item of this leaf */
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			else { /* no further leaf: mark the tail of the group (or the whole group if no extent was seen) and stop */
+				if (found) {
+					hole_size = block_group->key.objectid +
+						block_group->key.offset - last;
+				} else {
+					last = block_group->key.objectid;
+					hole_size = block_group->key.offset;
+				}
+				for (i = 0; i < hole_size; i++) {
+					set_radix_bit(extent_radix,
+						      last + i);
+				}
+				break;
+			}
+		}
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset) { /* item lies at or beyond the end of the group: same tail marking as above, then stop */
+			if (found) {
+				hole_size = block_group->key.objectid +
+					block_group->key.offset - last;
+			} else {
+				last = block_group->key.objectid;
+				hole_size = block_group->key.offset;
+			}
+			for (i = 0; i < hole_size; i++) {
+				set_radix_bit(extent_radix, last + i);
+			}
+			break;
+		}
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { /* items of other key types are stepped over */
+			if (!found) { /* first extent item: only remember where it ends; no bits are set for anything before it */
+				last = key.objectid + key.offset;
+				found = 1;
+			} else { /* set a bit for every block between the previous extent's end and this extent's start */
+				hole_size = key.objectid - last;
+				for (i = 0; i < hole_size; i++) {
+					set_radix_bit(extent_radix, last + i);
+				}
+				last = key.objectid + key.offset;
+			}
+		}
+		path->slots[0]++;
+	}
+	/* mark the group as cached so later calls return early */
+	block_group->cached = 1;
+	btrfs_free_path(path);
+	return 0;
+}
+
 static struct btrfs_block_group_cache *lookup_block_group(struct
 							  btrfs_fs_info *info,
 							  u64 blocknr)
@@ -44,6 +135,63 @@ static struct btrfs_block_group_cache *lookup_block_group(struct
 	return NULL;
 }
 
+static u64 leaf_range(struct btrfs_root *root) /* how many (btrfs_item + btrfs_extent_item) pairs fit in one leaf's data area */
+{
+	u64 size = BTRFS_LEAF_DATA_SIZE(root);
+	size = size / (sizeof(struct btrfs_extent_item) + /* integer division: any remainder is dropped */
+		       sizeof(struct btrfs_item));
+	return size;
+}
+
+static u64 find_search_start(struct btrfs_root *root, /* choose the block number at which an extent search should begin */
+			     struct btrfs_block_group_cache **cache_ret, /* in: group to start with; out: replaced if another group is selected */
+			     u64 search_start, int num) /* num > 1 switches on the clustering checks below */
+{
+	unsigned long gang[8];
+	int ret;
+	struct btrfs_block_group_cache *cache = *cache_ret;
+	u64 last = max(search_start, cache->key.objectid); /* never start before the beginning of the group */
+	/* groups flagged as data skip the bitmap lookup entirely */
+	if (cache->data)
+		goto out;
+	if (num > 1) {
+		last = max(last, cache->last_prealloc);
+	}
+again:
+	cache_block_group(root, cache); /* NOTE(review): the return value (can be -ENOMEM or a search error) is ignored - confirm intended */
+	while(1) {
+		ret = find_first_radix_bit(&root->fs_info->extent_map_radix,
+					   gang, last, ARRAY_SIZE(gang));
+		if (!ret) /* no set bits found from last onward */
+			goto out;
+		last = gang[ret-1] + 1; /* a further lookup resumes just past the last bit found */
+		if (num > 1) {
+			if (ret != ARRAY_SIZE(gang)) { /* fewer than a full gang of set bits: try another group */
+				goto new_group;
+			}
+			if (gang[ret-1] - gang[0] > leaf_range(root)) { /* found bits span more than leaf_range() blocks: keep looking */
+				continue;
+			}
+		}
+		if (gang[0] >= cache->key.objectid + cache->key.offset) { /* first hit lies past the end of this group */
+			goto new_group;
+		}
+		return gang[0];
+	}
+out:
+	return max(cache->last_alloc, search_start); /* fallback: the larger of the group's last_alloc and the caller's hint */
+
+new_group:
+	cache = lookup_block_group(root->fs_info, last + cache->key.offset - 1); /* look up the group containing a block nearly one group length past last */
+	if (!cache) { /* lookup failed: fall back using the group the caller originally passed */
+		return max((*cache_ret)->last_alloc, search_start);
+	}
+	cache = btrfs_find_block_group(root, cache,
+				       last + cache->key.offset - 1, 0);
+	*cache_ret = cache; /* report the newly selected group back to the caller */
+	goto again;
+}
+
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
@@ -89,13 +237,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		}
 		last = hint->key.offset * 2;
 		if (hint->key.objectid >= last)
-			last = max(search_start, hint->key.objectid - last);
+			last = max(search_start + hint->key.offset - 1,
+				   hint->key.objectid - last);
 		else
 			last = hint->key.objectid + hint->key.offset;
 		hint_last = last;
 	} else {
-		hint_last = search_start;
-		last = search_start;
+		if (hint)
+			hint_last = max(hint->key.objectid, search_start);
+		else
+			hint_last = search_start;
+
+		last = hint_last;
 	}
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
@@ -357,13 +510,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 blocknr, u64 num, int alloc)
+			      u64 blocknr, u64 num, int alloc, int mark_free)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
+	u64 i;
 
 	while(total) {
 		cache = lookup_block_group(info, blocknr);
@@ -380,18 +534,38 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 		old_val = btrfs_block_group_used(&cache->item);
 		num = min(total, cache->key.offset - block_in_group);
-		total -= num;
-		blocknr += num;
 		if (alloc) {
 			old_val += num;
 			if (blocknr > cache->last_alloc)
 				cache->last_alloc = blocknr;
+			if (!cache->data) {
+				for (i = 0; i < num; i++) {
+					clear_radix_bit(&info->extent_map_radix,
+						        blocknr + i);
+				}
+			}
 		} else {
 			old_val -= num;
 			if (blocknr < cache->first_free)
 				cache->first_free = blocknr;
+			if (!cache->data && mark_free) {
+				for (i = 0; i < num; i++) {
+					set_radix_bit(&info->extent_map_radix,
+						      blocknr + i);
+				}
+			}
+			if (old_val < (cache->key.offset * 8) / 10 &&
+			    old_val + num >= (cache->key.offset * 8) / 10) {
+printk("group %Lu now available\n", cache->key.objectid);
+				radix_tree_tag_set(cache->radix,
+						   cache->key.objectid +
+						   cache->key.offset - 1,
+						   BTRFS_BLOCK_GROUP_AVAIL);
+			}
 		}
 		btrfs_set_block_group_used(&cache->item, old_val);
+		total -= num;
+		blocknr += num;
 	}
 	return 0;
 }
@@ -413,9 +587,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 	int ret;
 	int i;
 	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
+	struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
 
 	while(1) {
-		ret = find_first_radix_bit(pinned_radix, gang,
+		ret = find_first_radix_bit(pinned_radix, gang, 0,
 					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
@@ -430,6 +605,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 				block_group->pinned--;
 				if (gang[i] < block_group->last_alloc)
 					block_group->last_alloc = gang[i];
+				if (gang[i] < block_group->last_prealloc)
+					block_group->last_prealloc = gang[i];
+				if (!block_group->data)
+					set_radix_bit(extent_radix, gang[i]);
 			}
 			try_remove_page(btree_inode->i_mapping,
 					gang[i] << (PAGE_CACHE_SHIFT -
@@ -508,7 +687,8 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr, u64 num_blocks, int pin)
+			 *root, u64 blocknr, u64 num_blocks, int pin,
+			 int mark_free)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -556,10 +736,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret)
 			BUG();
-		ret = update_block_group(trans, root, blocknr, num_blocks, 0);
+		ret = update_block_group(trans, root, blocknr, num_blocks, 0,
+					 mark_free);
 		BUG_ON(ret);
 	}
-	btrfs_release_path(extent_root, path);
 	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root);
 	return ret;
@@ -585,7 +765,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	pinned_radix = &extent_root->fs_info->pinned_radix;
 
 	while(1) {
-		ret = find_first_radix_bit(pending_radix, gang,
+		ret = find_first_radix_bit(pending_radix, gang, 0,
 					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
@@ -605,7 +785,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			wret = clear_radix_bit(pending_radix, gang[i]);
 			BUG_ON(wret);
 			wret = __free_extent(trans, extent_root,
-					     gang[i], 1, 0);
+					     gang[i], 1, 0, 0);
 			if (wret)
 				err = wret;
 		}
@@ -627,7 +807,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		pin_down_block(root, blocknr, 1);
 		return 0;
 	}
-	ret = __free_extent(trans, root, blocknr, num_blocks, pin);
+	ret = __free_extent(trans, root, blocknr, num_blocks, pin, pin == 0);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
@@ -688,18 +868,45 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 check_failed:
 	if (!full_scan && block_group->data != data)
 		WARN_ON(1);
-	if (block_group->last_alloc > search_start)
-		search_start = block_group->last_alloc;
+
+	if (!data)
+		search_start = find_search_start(root, &block_group,
+						 search_start, total_needed);
+	else
+		search_start = max(block_group->last_alloc, search_start);
+
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
 	start_found = 0;
+
 	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
 	if (ret < 0)
 		goto error;
 
-	if (path->slots[0] > 0)
+	if (path->slots[0] > 0) {
 		path->slots[0]--;
+	}
+
+	l = btrfs_buffer_leaf(path->nodes[0]);
+	btrfs_disk_key_to_cpu(&key, &l->items[path->slots[0]].key);
+	/*
+	 * a rare case, go back one key if we hit a block group item
+	 * instead of an extent item
+	 */
+	if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY &&
+	    key.objectid + key.offset >= search_start) {
+		ins->objectid = key.objectid;
+		ins->offset = key.offset - 1;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
+		if (ret < 0)
+			goto error;
+
+		if (path->slots[0] > 0) {
+			path->slots[0]--;
+		}
+	}
 
 	while (1) {
 		l = btrfs_buffer_leaf(path->nodes[0]);
@@ -725,21 +932,23 @@ check_failed:
 			ins->offset = search_end - ins->objectid;
 			goto check_pending;
 		}
+
 		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
-			goto next;
-		if (key.objectid >= search_start) {
-			if (start_found) {
-				if (last_block < search_start)
-					last_block = search_start;
-				hole_size = key.objectid - last_block;
-				if (hole_size >= num_blocks) {
-					ins->objectid = last_block;
-					ins->offset = hole_size;
-					goto check_pending;
-				}
+		if (key.objectid >= search_start && key.objectid > last_block &&
+		    start_found) {
+			if (last_block < search_start)
+				last_block = search_start;
+			hole_size = key.objectid - last_block;
+			if (hole_size >= num_blocks) {
+				ins->objectid = last_block;
+				ins->offset = hole_size;
+				goto check_pending;
 			}
 		}
+
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+			goto next;
+
 		start_found = 1;
 		last_block = key.objectid + key.offset;
 		if (last_block >= block_group->key.objectid +
@@ -759,6 +968,7 @@ check_pending:
 	 */
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
+
 	if (ins->objectid + num_blocks >= search_end) {
 		if (full_scan)
 			return -ENOSPC;
@@ -780,7 +990,7 @@ check_pending:
 		    info->extent_tree_insert[0] &&
 		    ins->objectid <= last) {
 			search_start = last + 1;
-			WARN_ON(1);
+			WARN_ON(!full_scan);
 			goto new_group;
 		}
 	}
@@ -790,13 +1000,18 @@ check_pending:
 		if (ins->objectid + num_blocks > first &&
 		    ins->objectid <= info->extent_tree_prealloc[0]) {
 			search_start = info->extent_tree_prealloc[0] + 1;
-			WARN_ON(1);
+			WARN_ON(!full_scan);
 			goto new_group;
 		}
 	}
 	if (fill_prealloc) {
 		int nr;
 		test_block = ins->objectid;
+		if (test_block - info->extent_tree_prealloc[total_needed - 1] >=
+		    leaf_range(root)) {
+			total_found = 0;
+			info->extent_tree_prealloc_nr = total_found;
+		}
 		while(test_block < ins->objectid + ins->offset &&
 		      total_found < total_needed) {
 			nr = total_needed - total_found - 1;
@@ -811,11 +1026,15 @@ check_pending:
 		}
 		info->extent_tree_prealloc_nr = total_found;
 	}
-	block_group = lookup_block_group(info, ins->objectid);
-	if (block_group) {
-		block_group->last_alloc = ins->objectid;
-		if (!data)
-			trans->block_group = block_group;
+	if (!data) {
+		block_group = lookup_block_group(info, ins->objectid);
+		if (block_group) {
+			if (fill_prealloc)
+				block_group->last_prealloc =
+				     info->extent_tree_prealloc[total_needed-1];
+			else
+				trans->block_group = block_group;
+		}
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -824,6 +1043,7 @@ check_pending:
 new_group:
 	if (search_start + num_blocks >= search_end) {
 		search_start = orig_search_start;
+printk("doing full scan!\n");
 		full_scan = 1;
 	}
 	block_group = lookup_block_group(info, search_start);
@@ -871,26 +1091,57 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		info->extent_tree_insert[info->extent_tree_insert_nr++] =
 			ins->objectid;
 		ret = update_block_group(trans, root,
-					 ins->objectid, ins->offset, 1);
+					 ins->objectid, ins->offset, 1, 0);
 		BUG_ON(ret);
 		return 0;
 	}
+
+	/*
+	 * if we're doing a data allocation, preallocate room in the
+	 * extent tree first.  This way the extent tree blocks end up
+	 * in the correct block group.
+	 */
+	if (data) {
+		ret = find_free_extent(trans, root, 0, search_start,
+				       search_end, &prealloc_key, 0);
+		if (ret) {
+			return ret;
+		}
+		if (prealloc_key.objectid + prealloc_key.offset >= search_end) {
+			int nr = info->extent_tree_prealloc_nr;
+			search_end = info->extent_tree_prealloc[nr - 1] - 1;
+		} else {
+			search_start = info->extent_tree_prealloc[0] + 1;
+		}
+	}
 	/* do the real allocation */
 	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, ins, data);
-	if (ret)
+	if (ret) {
 		return ret;
+	}
 
-	/* then do prealloc for the extent tree */
-	if (ins->objectid + ins->offset >= search_end)
-		search_end = ins->objectid - 1;
-	else
-		search_start = ins->objectid + ins->offset;
+	/*
+	 * if we're doing a metadata allocation, preallocate space in the
+	 * extent tree second.  This way, we don't create a tiny hole
+	 * in the allocation map between any unused preallocation blocks
+	 * and the metadata block we're actually allocating.  On disk,
+	 * it'll go:
+	 * [block we've allocated], [used prealloc 1], [ unused prealloc ]
+	 * The unused prealloc will get reused the next time around.
+	 */
+	if (!data) {
+		if (ins->objectid + ins->offset >= search_end)
+			search_end = ins->objectid - 1;
+		else
+			search_start = ins->objectid + ins->offset;
 
-	ret = find_free_extent(trans, root, 0, search_start,
-			       search_end, &prealloc_key, 0);
-	if (ret)
-		return ret;
+		ret = find_free_extent(trans, root, 0, search_start,
+				       search_end, &prealloc_key, 0);
+		if (ret) {
+			return ret;
+		}
+	}
 
 	super_blocks_used = btrfs_super_blocks_used(info->disk_super);
 	btrfs_set_super_blocks_used(info->disk_super, super_blocks_used +
@@ -900,11 +1151,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
-	if (ret)
+	if (ret) {
 		return ret;
-	if (pending_ret)
+	}
+	if (pending_ret) {
 		return pending_ret;
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
+	}
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
 	return 0;
 }
 
@@ -920,7 +1173,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, hint, (unsigned long)-1, &ins, 0);
+				 1, 0, (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG();
 		return NULL;
@@ -1134,6 +1387,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	int ret;
 	int ret2;
+	unsigned long gang[16];
+	int i;
 
 	ret = free_block_group_radix(&info->block_group_radix);
 	ret2 = free_block_group_radix(&info->block_group_data_radix);
@@ -1141,6 +1396,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		return ret;
 	if (ret2)
 		return ret2;
+
+	while(1) {
+		ret = find_first_radix_bit(&info->extent_map_radix,
+					   gang, 0, ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			clear_radix_bit(&info->extent_map_radix, gang[i]);
+		}
+	}
 	return 0;
 }
 
@@ -1186,7 +1451,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		}
 
-		if (nr & 1)
+		if (nr % 3)
 			radix = &info->block_group_data_radix;
 		else
 			radix = &info->block_group_radix;
@@ -1197,8 +1462,14 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
 		cache->first_free = cache->key.objectid;
+		cache->last_prealloc = cache->key.objectid;
 		cache->pinned = 0;
-		cache->data = (nr & 1);
+		cache->cached = 0;
+
+		if (nr % 3)
+			cache->data = 1;
+		else
+			cache->data = 0;
 		cache->radix = radix;
 
 		key.objectid = found_key.objectid + found_key.offset;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index d5ac0d8ee3c..4c2870e2d12 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1539,7 +1539,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			kunmap(pages[i]);
 		}
 		SetPageChecked(pages[i]);
-		btrfs_update_inode_block_group(trans, inode);
+		// btrfs_update_inode_block_group(trans, inode);
 		ret = btrfs_end_transaction(trans, root);
 		BUG_ON(ret);
 		mutex_unlock(&root->fs_info->fs_mutex);
@@ -1914,7 +1914,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 	BUG_ON(ret);
 	alloc_extent_start = ins.objectid;
-	btrfs_update_inode_block_group(trans, inode);
+	// btrfs_update_inode_block_group(trans, inode);
 	ret = btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8bbe9107ff7..f0f03121b7b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -122,7 +122,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	}
 	dirty_pages = &trans->transaction->dirty_pages;
 	while(1) {
-		ret = find_first_radix_bit(dirty_pages, gang, ARRAY_SIZE(gang));
+		ret = find_first_radix_bit(dirty_pages, gang,
+					   0, ARRAY_SIZE(gang));
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-- 
cgit v1.2.3


From 8d7be552a733e53135a91d2d28f57ab87dc0e889 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 May 2007 11:24:42 -0400
Subject: Btrfs: fix check_node and check_leaf to use less cpu

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 44 ++++++++++++++++++++++++++------------------
 fs/btrfs/extent-tree.c |  6 +++---
 2 files changed, 29 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dbd3f636dd3..990d297a7a1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -130,15 +130,17 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
-	int i;
 	struct btrfs_node *parent = NULL;
 	struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]);
 	int parent_slot;
+	int slot;
+	struct btrfs_key cpukey;
 	u32 nritems = btrfs_header_nritems(&node->header);
 
 	if (path->nodes[level + 1])
 		parent = btrfs_buffer_node(path->nodes[level + 1]);
 	parent_slot = path->slots[level + 1];
+	slot = path->slots[level];
 	BUG_ON(nritems == 0);
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
@@ -149,10 +151,13 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		       btrfs_header_blocknr(&node->header));
 	}
 	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		struct btrfs_key cpukey;
-		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[i + 1].key);
-		BUG_ON(comp_keys(&node->ptrs[i].key, &cpukey) >= 0);
+	if (slot != 0) {
+		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot - 1].key);
+		BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) <= 0);
+	}
+	if (slot < nritems - 1) {
+		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot + 1].key);
+		BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) >= 0);
 	}
 	return 0;
 }
@@ -160,10 +165,12 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
-	int i;
 	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[level]);
 	struct btrfs_node *parent = NULL;
 	int parent_slot;
+	int slot = path->slots[0];
+	struct btrfs_key cpukey;
+
 	u32 nritems = btrfs_header_nritems(&leaf->header);
 
 	if (path->nodes[level + 1])
@@ -182,19 +189,20 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_blocknr(&leaf->header));
 	}
-	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		struct btrfs_key cpukey;
-		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[i + 1].key);
-		BUG_ON(comp_keys(&leaf->items[i].key,
-		                 &cpukey) >= 0);
-		BUG_ON(btrfs_item_offset(leaf->items + i) !=
-			btrfs_item_end(leaf->items + i + 1));
-		if (i == 0) {
-			BUG_ON(btrfs_item_offset(leaf->items + i) +
-			       btrfs_item_size(leaf->items + i) !=
-			       BTRFS_LEAF_DATA_SIZE(root));
-		}
+	if (slot != 0) {
+		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot - 1].key);
+		BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) <= 0);
+		BUG_ON(btrfs_item_offset(leaf->items + slot - 1) !=
+			btrfs_item_end(leaf->items + slot));
+	}
+	if (slot < nritems - 1) {
+		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot + 1].key);
+		BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) >= 0);
+		BUG_ON(btrfs_item_offset(leaf->items + slot) !=
+			btrfs_item_end(leaf->items + slot + 1));
 	}
+	BUG_ON(btrfs_item_offset(leaf->items) +
+	       btrfs_item_size(leaf->items) != BTRFS_LEAF_DATA_SIZE(root));
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3ac9da45347..1497ff98f0d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -235,7 +235,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 					     hint->key.offset - 1,
 					     BTRFS_BLOCK_GROUP_AVAIL);
 		}
-		last = hint->key.offset * 2;
+		last = hint->key.offset * 3;
 		if (hint->key.objectid >= last)
 			last = max(search_start + hint->key.offset - 1,
 				   hint->key.objectid - last);
@@ -554,8 +554,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						      blocknr + i);
 				}
 			}
-			if (old_val < (cache->key.offset * 8) / 10 &&
-			    old_val + num >= (cache->key.offset * 8) / 10) {
+			if (old_val < (cache->key.offset * 6) / 10 &&
+			    old_val + num >= (cache->key.offset * 6) / 10) {
 printk("group %Lu now available\n", cache->key.objectid);
 				radix_tree_tag_set(cache->radix,
 						   cache->key.objectid +
-- 
cgit v1.2.3


From 509659cde578d891445afd67d87121dd13e71596 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 May 2007 12:36:17 -0400
Subject: Btrfs: switch to crc32c instead of sha256

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  4 +++-
 fs/btrfs/disk-io.c   | 12 ++++++------
 fs/btrfs/file-item.c | 35 +++++++++++++++++++----------------
 3 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 92a6078de82..d8caeba5d75 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,8 @@ extern struct kmem_cache *btrfs_path_cachep;
 
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
+/* four bytes for CRC32 */
+#define BTRFS_CRC32_SIZE 4
 
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
@@ -235,7 +237,7 @@ struct btrfs_file_extent_item {
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
-	u8 csum[BTRFS_CSUM_SIZE];
+	u8 csum;
 } __attribute__ ((__packed__));
 
 struct btrfs_device_item {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2dbf422a2b9..678456d1e2f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -196,14 +196,14 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 	ret = crypto_hash_digest(&desc, &sg, 1, result);
 	spin_unlock(&root->fs_info->hash_lock);
 	if (ret) {
-		printk("sha256 digest failed\n");
+		printk("digest failed\n");
 	}
 	return ret;
 }
 static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 			   int verify)
 {
-	char result[BTRFS_CSUM_SIZE];
+	char result[BTRFS_CRC32_SIZE];
 	int ret;
 	struct btrfs_node *node;
 
@@ -212,14 +212,14 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 	if (ret)
 		return ret;
 	if (verify) {
-		if (memcmp(bh->b_data, result, BTRFS_CSUM_SIZE)) {
+		if (memcmp(bh->b_data, result, BTRFS_CRC32_SIZE)) {
 			printk("checksum verify failed on %Lu\n",
 			       bh_blocknr(bh));
 			return 1;
 		}
 	} else {
 		node = btrfs_buffer_node(bh);
-		memcpy(node->header.csum, result, BTRFS_CSUM_SIZE);
+		memcpy(node->header.csum, result, BTRFS_CRC32_SIZE);
 	}
 	return 0;
 }
@@ -576,10 +576,10 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	       sizeof(struct btrfs_key));
 	insert_inode_hash(fs_info->btree_inode);
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-	fs_info->hash_tfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
+	fs_info->hash_tfm = crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC);
 	spin_lock_init(&fs_info->hash_lock);
 	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
-		printk("failed to allocate sha256 hash\n");
+		printk("failed to allocate digest hash\n");
 		return NULL;
 	}
 	mutex_init(&fs_info->trans_mutex);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 10e4cf08e9e..a66709e6d03 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -4,8 +4,8 @@
 #include "transaction.h"
 
 #define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
-				 sizeof(struct btrfs_item) * 2) / \
-				sizeof(struct btrfs_csum_item)) - 1))
+			       sizeof(struct btrfs_item) * 2) / \
+			       BTRFS_CRC32_SIZE) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos,
@@ -78,7 +78,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		csum_offset = (offset - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size(leaf->items + path->slots[0]);
-		csums_in_item /= sizeof(struct btrfs_csum_item);
+		csums_in_item /= BTRFS_CRC32_SIZE;
 
 		if (csum_offset >= csums_in_item) {
 			ret = -EFBIG;
@@ -86,7 +86,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		}
 	}
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-	item += csum_offset;
+	item = (struct btrfs_csum_item *)((unsigned char *)item +
+					  csum_offset * BTRFS_CRC32_SIZE);
 	return item;
 fail:
 	if (ret > 0)
@@ -143,8 +144,7 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		/* we found one, but it isn't big enough yet */
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		item_size = btrfs_item_size(leaf->items + path->slots[0]);
-		if ((item_size / sizeof(struct btrfs_csum_item)) >=
-		    MAX_CSUM_ITEMS(root)) {
+		if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
 			/* already at max size, make a new one */
 			goto insert;
 		}
@@ -159,7 +159,7 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_release_path(root, path);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
-				sizeof(struct btrfs_csum_item), 1);
+				BTRFS_CRC32_SIZE, 1);
 	if (ret < 0)
 		goto fail;
 	if (ret == 0) {
@@ -180,10 +180,10 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
-	    sizeof(struct btrfs_csum_item)) {
-		u32 diff = (csum_offset + 1) * sizeof(struct btrfs_csum_item);
+	    BTRFS_CRC32_SIZE) {
+		u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
 		diff = diff - btrfs_item_size(leaf->items + path->slots[0]);
-		WARN_ON(diff != sizeof(struct btrfs_csum_item));
+		WARN_ON(diff != BTRFS_CRC32_SIZE);
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
 		goto csum;
@@ -193,7 +193,7 @@ insert:
 	btrfs_release_path(root, path);
 	csum_offset = 0;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
-				      sizeof(struct btrfs_csum_item));
+				      BTRFS_CRC32_SIZE);
 	if (ret != 0) {
 		printk("at insert for %Lu %u %Lu ret is %d\n", file_key.objectid, file_key.flags, file_key.offset, ret);
 		WARN_ON(1);
@@ -203,10 +203,13 @@ csum:
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_csum_item);
 	ret = 0;
-	item += csum_offset;
+	item = (struct btrfs_csum_item *)((unsigned char *)item +
+					  csum_offset * BTRFS_CRC32_SIZE);
 found:
-	btrfs_check_bounds(item->csum, BTRFS_CSUM_SIZE, path->nodes[0]->b_data, root->fs_info->sb->s_blocksize);
-	ret = btrfs_csum_data(root, data, len, item->csum);
+	btrfs_check_bounds(&item->csum, BTRFS_CRC32_SIZE,
+			   path->nodes[0]->b_data,
+			   root->fs_info->sb->s_blocksize);
+	ret = btrfs_csum_data(root, data, len, &item->csum);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
@@ -222,7 +225,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 	struct btrfs_key file_key;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
-	char result[BTRFS_CSUM_SIZE];
+	char result[BTRFS_CRC32_SIZE];
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -244,7 +247,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 
 	ret = btrfs_csum_data(root, data, len, result);
 	WARN_ON(ret);
-	if (memcmp(result, item->csum, BTRFS_CSUM_SIZE))
+	if (memcmp(result, &item->csum, BTRFS_CRC32_SIZE))
 		ret = 1;
 fail:
 	btrfs_release_path(root, path);
-- 
cgit v1.2.3


From 098f59c2512426926722a96f82af127bd91fb5e4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 May 2007 11:33:21 -0400
Subject: Btrfs: patch queue: fix corruption when splitting large items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 10 ++++++++--
 fs/btrfs/super.c |  8 ++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 990d297a7a1..20e01f95ce6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -871,6 +871,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_node_blockptr(lower, slot, blocknr);
 	btrfs_set_header_nritems(&lower->header, nritems + 1);
 	btrfs_mark_buffer_dirty(path->nodes[level]);
+	check_node(root, path, level);
 	return 0;
 }
 
@@ -1101,6 +1102,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else {
 		btrfs_block_release(root, right_buf);
 	}
+	if (path->nodes[1])
+		check_node(root, path, 1);
 	return 0;
 }
 /*
@@ -1216,6 +1219,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	btrfs_mark_buffer_dirty(t);
 	btrfs_mark_buffer_dirty(right_buf);
+
 	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
 	if (wret)
 		ret = wret;
@@ -1231,6 +1235,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
+	if (path->nodes[1])
+		check_node(root, path, 1);
 	return ret;
 }
 
@@ -1326,13 +1332,12 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
 						  bh_blocknr(right_buffer),
-						  path->slots[1] - 1, 1);
+						  path->slots[1], 1);
 				if (wret)
 					ret = wret;
 				btrfs_block_release(root, path->nodes[0]);
 				path->nodes[0] = right_buffer;
 				path->slots[0] = 0;
-				path->slots[1] -= 1;
 				if (path->slots[1] == 0) {
 					wret = fixup_low_keys(trans, root,
 					           path, &disk_key, 1);
@@ -1379,6 +1384,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else
 		btrfs_block_release(root, right_buffer);
 	BUG_ON(path->slots[0] < 0);
+	check_node(root, path, 1);
 
 	if (!double_split)
 		return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4c2870e2d12..eaa48f09d1c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1529,7 +1529,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			btrfs_set_file_extent_type(ei,
 						   BTRFS_FILE_EXTENT_INLINE);
 			ptr = btrfs_file_extent_inline_start(ei);
-			memcpy(ptr, bh->b_data, offset + write_bytes);
+			btrfs_memcpy(root, path->nodes[0]->b_data,
+				     ptr, bh->b_data, offset + write_bytes);
 			mark_buffer_dirty(path->nodes[0]);
 			btrfs_free_path(path);
 		} else {
@@ -1686,9 +1687,9 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
 			btrfs_release_path(root, path);
+			extent = NULL;
 			if (found_extent) {
-				inode->i_blocks -=
-				btrfs_file_extent_num_blocks(extent) << 3;
+				inode->i_blocks -= extent_num_blocks << 3;
 				ret = btrfs_free_extent(trans, root,
 							disk_blocknr,
 							disk_num_blocks, 0);
@@ -1832,7 +1833,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	u64 alloc_extent_start;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
-
 	pinned[0] = NULL;
 	pinned[1] = NULL;
 	if (file->f_flags & O_DIRECT)
-- 
cgit v1.2.3


From de428b63b16f9c74a24228a517713e8c65f79475 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 May 2007 13:28:27 -0400
Subject: Btrfs: allocator optimizations, truncate readahead

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/disk-io.c     | 12 ++++++--
 fs/btrfs/extent-tree.c | 76 ++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/super.c       | 76 +++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 129 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d8caeba5d75..5830b4333d8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1081,7 +1081,7 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
-						 int data);
+						 int data, int owner);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 678456d1e2f..4c0262b5766 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -257,24 +257,30 @@ static struct address_space_operations btree_aops = {
 int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
 {
 	struct buffer_head *bh = NULL;
+	int ret = 0; /* stays 0 if a read is submitted below; set to 1 on every path that submits none */
 
 	bh = btrfs_find_create_tree_block(root, blocknr);
 	if (!bh)
 		return 0;
-	if (buffer_uptodate(bh))
+	if (buffer_uptodate(bh)) {
+		ret = 1; /* buffer is already uptodate, nothing to read */
 		goto done;
-	if (test_set_buffer_locked(bh))
+	}
+	if (test_set_buffer_locked(bh)) {
+		ret = 1; /* test_set_buffer_locked() was non-zero, so no read is issued from here */
 		goto done;
+	}
 	if (!buffer_uptodate(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh);
 	} else {
 		unlock_buffer(bh);
+		ret = 1; /* buffer turned out to be uptodate once we held the lock */
 	}
 done:
 	brelse(bh);
-	return 0;
+	return ret; /* the reada_* helpers added in this patch stop walking on a non-zero value */
 }
 
 struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1497ff98f0d..e3c6bfea375 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -12,6 +12,33 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
+static void reada_extent_leaves(struct btrfs_root *root, /* call readahead_tree_block() on blocks referenced by the level-1 node slots after the current one */
+				struct btrfs_path *path, u64 limit) /* limit: stop at the first slot whose key objectid is greater than this */
+{
+	struct btrfs_node *node;
+	int i;
+	int nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	if (!path->nodes[1]) /* the path holds no level-1 node, so there are no sibling pointers to walk */
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1] + 1; /* begin with the pointer right after the one the path currently uses */
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot; i < nritems && i < slot + 8; i++) { /* visit at most 8 pointers and never run past the node's item count */
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid > limit) /* this key lies beyond the range the caller asked about */
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret) /* non-zero return from readahead_tree_block() ends the walk early */
+			break;
+	}
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -24,6 +51,7 @@ static int cache_block_group(struct btrfs_root *root,
 	u64 i;
 	u64 last = 0;
 	u64 hole_size;
+	u64 limit;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
@@ -46,14 +74,17 @@ printk("cache block group %Lu\n", block_group->key.objectid);
 		return ret;
 	if (ret && path->slots[0] > 0)
 		path->slots[0]--;
+	limit = block_group->key.objectid + block_group->key.offset;
+	reada_extent_leaves(root, path, limit);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&leaf->header)) {
+			reada_extent_leaves(root, path, limit);
 			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
+			if (ret == 0) {
 				continue;
-			else {
+			} else {
 				if (found) {
 					hole_size = block_group->key.objectid +
 						block_group->key.offset - last;
@@ -187,7 +218,7 @@ new_group:
 		return max((*cache_ret)->last_alloc, search_start);
 	}
 	cache = btrfs_find_block_group(root, cache,
-				       last + cache->key.offset - 1, 0);
+				       last + cache->key.offset - 1, 0, 0);
 	*cache_ret = cache;
 	goto again;
 }
@@ -195,7 +226,7 @@ new_group:
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
-						 int data)
+						 int data, int owner)
 {
 	struct btrfs_block_group_cache *cache[8];
 	struct btrfs_block_group_cache *found_group = NULL;
@@ -207,6 +238,10 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int i;
 	int ret;
 	int full_search = 0;
+	int factor = 8;
+
+	if (!owner)
+		factor = 5;
 
 	if (data)
 		radix = &info->block_group_data_radix;
@@ -219,14 +254,14 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		if (shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
-			    (shint->key.offset * 8) / 10) {
+			    (shint->key.offset * factor) / 10) {
 				return shint;
 			}
 		}
 	}
 	if (hint && hint->data == data) {
 		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned < (hint->key.offset * 8) / 10) {
+		if (used + hint->pinned < (hint->key.offset * factor) / 10) {
 			return hint;
 		}
 		if (used >= (hint->key.offset * 8) / 10) {
@@ -261,7 +296,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 				cache[i]->key.offset;
 			used = btrfs_block_group_used(&cache[i]->item);
 			if (used + cache[i]->pinned <
-			    (cache[i]->key.offset * 8) / 10) {
+			    (cache[i]->key.offset * factor) / 10) {
 				found_group = cache[i];
 				goto found;
 			}
@@ -272,6 +307,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						     BTRFS_BLOCK_GROUP_AVAIL);
 			}
 		}
+		cond_resched();
 	}
 	last = hint_last;
 again:
@@ -295,13 +331,16 @@ again:
 						     BTRFS_BLOCK_GROUP_AVAIL);
 			}
 		}
+		cond_resched();
 	}
 	if (!full_search) {
+printk("find block group doing full search data %d start %Lu\n", data, search_start);
 		last = search_start;
 		full_search = 1;
 		goto again;
 	}
 	if (!found_group) {
+printk("find block group bailing to zero data %d\n", data);
 		ret = radix_tree_gang_lookup(radix,
 					     (void **)&found_group, 0, 1);
 		BUG_ON(ret != 1);
@@ -554,8 +593,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						      blocknr + i);
 				}
 			}
-			if (old_val < (cache->key.offset * 6) / 10 &&
-			    old_val + num >= (cache->key.offset * 6) / 10) {
+			if (old_val < (cache->key.offset * 5) / 10 &&
+			    old_val + num >= (cache->key.offset * 5) / 10) {
 printk("group %Lu now available\n", cache->key.objectid);
 				radix_tree_tag_set(cache->radix,
 						   cache->key.objectid +
@@ -842,6 +881,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
+	u64 limit;
 
 	path = btrfs_alloc_path();
 	ins->flags = 0;
@@ -858,11 +898,11 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (search_start) {
 		block_group = lookup_block_group(info, search_start);
 		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data);
+						     search_start, data, 1);
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group, 0,
-						     data);
+						     data, 1);
 	}
 
 check_failed:
@@ -916,6 +956,12 @@ check_failed:
 				info->extent_tree_prealloc_nr = 0;
 				total_found = 0;
 			}
+			if (start_found)
+				limit = last_block +
+					block_group->key.offset / 2;
+			else
+				limit = search_start +
+					block_group->key.offset / 2;
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -960,6 +1006,7 @@ check_failed:
 		}
 next:
 		path->slots[0]++;
+		cond_resched();
 	}
 	// FIXME -ENOSPC
 check_pending:
@@ -1049,7 +1096,8 @@ printk("doing full scan!\n");
 	block_group = lookup_block_group(info, search_start);
 	if (!full_scan)
 		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data);
+						     search_start, data, 0);
+	cond_resched();
 	goto check_failed;
 
 error:
@@ -1102,7 +1150,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	 * in the correct block group.
 	 */
 	if (data) {
-		ret = find_free_extent(trans, root, 0, search_start,
+		ret = find_free_extent(trans, root, 0, 0,
 				       search_end, &prealloc_key, 0);
 		if (ret) {
 			return ret;
@@ -1173,7 +1221,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, 0, (unsigned long)-1, &ins, 0);
+				 1, hint, (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG();
 		return NULL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eaa48f09d1c..0f79490123c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -351,6 +351,35 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 objectid)
+{
+	struct btrfs_node *node;
+	int i;
+	int nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1];
+	if (slot == 0)
+		return;
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot - 1; i >= 0; i--) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid != objectid)
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct inode *inode)
@@ -386,6 +415,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			BUG_ON(path->slots[0] == 0);
 			path->slots[0]--;
 		}
+		reada_truncate(root, path, inode->i_ino);
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		found_key = &leaf->items[path->slots[0]].key;
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
@@ -587,28 +617,30 @@ printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_r
 	return d_splice_alias(inode, dentry);
 }
 
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path)
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
+			 u64 objectid)
 {
 	struct btrfs_node *node;
 	int i;
-	int nritems;
-	u64 objectid;
+	u32 nritems;
 	u64 item_objectid;
 	u64 blocknr;
 	int slot;
+	int ret;
 
 	if (!path->nodes[1])
 		return;
 	node = btrfs_buffer_node(path->nodes[1]);
 	slot = path->slots[1];
-	objectid = btrfs_disk_key_objectid(&node->ptrs[slot].key);
 	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot; i < nritems; i++) {
+	for (i = slot + 1; i < nritems; i++) {
 		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
 		if (item_objectid != objectid)
 			break;
 		blocknr = btrfs_node_blockptr(node, i);
-		readahead_tree_block(root, blocknr);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
 	}
 }
 
@@ -646,21 +678,20 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (ret < 0)
 		goto err;
 	advance = 0;
-	reada_leaves(root, path);
+	reada_leaves(root, path, inode->i_ino);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
 		slot = path->slots[0];
 		if (advance || slot >= nritems) {
 			if (slot >= nritems -1) {
+				reada_leaves(root, path, inode->i_ino);
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
 				leaf = btrfs_buffer_leaf(path->nodes[0]);
 				nritems = btrfs_header_nritems(&leaf->header);
 				slot = path->slots[0];
-				if (path->slots[1] == 0)
-					reada_leaves(root, path);
 			} else {
 				slot++;
 				path->slots[0]++;
@@ -805,13 +836,18 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_inode_item inode_item;
 	struct btrfs_key *location;
 	int ret;
+	int owner;
 
 	inode = new_inode(root->fs_info->sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
 	BTRFS_I(inode)->root = root;
-	group = btrfs_find_block_group(root, group, 0, 0);
+	if (mode & S_IFDIR)
+		owner = 0;
+	else
+		owner = 1;
+	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
 
 	inode->i_uid = current->fsuid;
@@ -1562,7 +1598,7 @@ failed:
 static int drop_extents(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct inode *inode,
-			  u64 start, u64 end)
+			  u64 start, u64 end, u64 *hint_block)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1659,17 +1695,14 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 				new_num = (start - key.offset) >>
 					inode->i_blkbits;
 				old_num = btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
 				inode->i_blocks -= (old_num - new_num) << 3;
 				btrfs_set_file_extent_num_blocks(extent,
 								 new_num);
 				mark_buffer_dirty(path->nodes[0]);
 			} else {
 				WARN_ON(1);
-				/*
-				ret = btrfs_truncate_item(trans, root, path,
-							  start - key.offset);
-				BUG_ON(ret);
-				*/
 			}
 		}
 		if (!keep) {
@@ -1683,6 +1716,8 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 				      btrfs_file_extent_disk_num_blocks(extent);
 				extent_num_blocks =
 				      btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
 			}
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
@@ -1831,6 +1866,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	u64 start_pos;
 	u64 num_blocks;
 	u64 alloc_extent_start;
+	u64 hint_block;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 	pinned[0] = NULL;
@@ -1871,6 +1907,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 	if (first_index != last_index &&
 	    (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
+	    pos + count < inode->i_size &&
 	    (count & (PAGE_CACHE_SIZE - 1))) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
@@ -1892,18 +1929,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	btrfs_set_trans_block_group(trans, inode);
 	/* FIXME blocksize != 4096 */
 	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
 	if (start_pos < inode->i_size) {
 		/* FIXME blocksize != pagesize */
 		ret = drop_extents(trans, root, inode,
 				   start_pos,
 				   (pos + count + root->blocksize -1) &
-				   ~((u64)root->blocksize - 1));
+				   ~((u64)root->blocksize - 1), &hint_block);
 		BUG_ON(ret);
 	}
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 1, (u64)-1, &ins, 1);
+					 num_blocks, hint_block, (u64)-1,
+					 &ins, 1);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset);
@@ -2455,7 +2494,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 			   btrfs_fill_super, mnt);
 }
 
-
 static int btrfs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
-- 
cgit v1.2.3


From f9f3c6b666f717510b67036c314ec915b9059eaa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 May 2007 14:05:12 -0400
Subject: Btrfs: 2.6.21-git fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 --
 fs/btrfs/disk-io.c |  3 ---
 fs/btrfs/super.c   | 21 +--------------------
 3 files changed, 1 insertion(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5830b4333d8..da12d827581 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1197,6 +1197,4 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  u64 objectid, u64 offset,
 					  int cow);
 /* super.c */
-extern struct subsystem btrfs_subsys;
-
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4c0262b5766..3da057f78bc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -649,9 +649,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
-	kobj_set_kset_s(fs_info, btrfs_subsys);
-	kobject_set_name(&fs_info->kobj, "%s", sb->s_id);
-	kobject_register(&fs_info->kobj);
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0f79490123c..0220c82841a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -18,24 +18,11 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 
-static void btrfs_fsinfo_release(struct kobject *obj)
-{
-	struct btrfs_fs_info *fsinfo = container_of(obj,
-					    struct btrfs_fs_info, kobj);
-	kfree(fsinfo);
-}
-
-static struct kobj_type btrfs_fsinfo_ktype = {
-	.release = btrfs_fsinfo_release,
-};
-
 struct btrfs_iget_args {
 	u64 ino;
 	struct btrfs_root *root;
 };
 
-decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
-
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
 static struct inode_operations btrfs_dir_inode_operations;
@@ -2438,7 +2425,7 @@ static void init_once(void * foo, struct kmem_cache * cachep,
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	if ((flags & (SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		inode_init_once(&ei->vfs_inode);
 	}
@@ -2588,12 +2575,7 @@ static int __init init_btrfs_fs(void)
 	err = init_inodecache();
 	if (err)
 		return err;
-	kset_set_kset_s(&btrfs_subsys, fs_subsys);
-	err = subsystem_register(&btrfs_subsys);
-	if (err)
-		goto out;
 	return register_filesystem(&btrfs_fs_type);
-out:
 	destroy_inodecache();
 	return err;
 }
@@ -2602,7 +2584,6 @@ static void __exit exit_btrfs_fs(void)
 {
 	destroy_inodecache();
 	unregister_filesystem(&btrfs_fs_type);
-	subsystem_unregister(&btrfs_subsys);
 	printk("btrfs unloaded\n");
 }
 
-- 
cgit v1.2.3


From e06afa839e726959be2166bec4cb85c117e213f1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 May 2007 15:44:28 -0400
Subject: Btrfs: rename

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO       |   1 +
 fs/btrfs/ctree.h    |   1 +
 fs/btrfs/dir-item.c |  27 ++++++++---
 fs/btrfs/super.c    | 132 +++++++++++++++++++++++++++++++++++++++++++++++-----
 4 files changed, 143 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index f6df246f26c..861e34ae097 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -8,6 +8,7 @@
 * Add generation number to key pointer in nodes
 * Add generation number to inode
 * Add ability to switch a block group from data to metadata or vice versa
+* forbid cross subvolume renames and hardlinks
 * Release
 * Do real tree locking
 * Add extent mirroring (backup copies of blocks)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index da12d827581..3330004af1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -32,6 +32,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_CSUM_SIZE 32
 /* four bytes for CRC32 */
 #define BTRFS_CRC32_SIZE 4
+#define BTRFS_EMPTY_DIR_SIZE 6
 
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 00a28d90fea..b408a3d20c7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -9,7 +9,9 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 						   struct btrfs_root *root,
 						   struct btrfs_path *path,
 						   struct btrfs_key *cpu_key,
-						   u32 data_size)
+						   u32 data_size,
+						   const char *name,
+						   int name_len)
 {
 	int ret;
 	char *ptr;
@@ -18,6 +20,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (ret == -EEXIST) {
+		struct btrfs_dir_item *di;
+		di = btrfs_match_dir_item_name(root, path, name, name_len);
+		if (di)
+			return ERR_PTR(-EEXIST);
 		ret = btrfs_extend_item(trans, root, path, data_size);
 		WARN_ON(ret > 0);
 		if (ret)
@@ -37,6 +43,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  struct btrfs_key *location, u8 type)
 {
 	int ret = 0;
+	int ret2 = 0;
 	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
 	char *name_ptr;
@@ -51,9 +58,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
-	dir_item = insert_with_overflow(trans, root, path, &key, data_size);
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+					name, name_len);
 	if (IS_ERR(dir_item)) {
 		ret = PTR_ERR(dir_item);
+		if (ret == -EEXIST)
+			goto second_insert;
 		goto out;
 	}
 
@@ -66,19 +76,20 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
+second_insert:
 	/* FIXME, use some real flag for selecting the extra index */
 	if (root == root->fs_info->tree_root) {
 		ret = 0;
 		goto out;
 	}
-
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
 	key.offset = location->objectid;
-	dir_item = insert_with_overflow(trans, root, path, &key, data_size);
+	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+					name, name_len);
 	if (IS_ERR(dir_item)) {
-		ret = PTR_ERR(dir_item);
+		ret2 = PTR_ERR(dir_item);
 		goto out;
 	}
 	btrfs_cpu_key_to_disk(&dir_item->location, location);
@@ -90,7 +101,11 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_free_path(path);
-	return ret;
+	if (ret)
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0;
 }
 
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0220c82841a..f49cad603ee 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -375,6 +375,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_disk_key *found_key;
+	u32 found_type;
 	struct btrfs_leaf *leaf;
 	struct btrfs_file_extent_item *fi = NULL;
 	u64 extent_start = 0;
@@ -386,12 +387,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
-	key.flags = 0;
-	/*
-	 * use BTRFS_CSUM_ITEM_KEY because it is larger than inline keys
-	 * or extent data
-	 */
-	btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
+	key.flags = (u32)-1;
 	while(1) {
 		btrfs_init_path(path);
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -405,10 +401,13 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		reada_truncate(root, path, inode->i_ino);
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		found_key = &leaf->items[path->slots[0]].key;
+		found_type = btrfs_disk_key_type(found_key);
 		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
 			break;
-		if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
-		    btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
+		if (found_type != BTRFS_CSUM_ITEM_KEY &&
+		    found_type != BTRFS_DIR_ITEM_KEY &&
+		    found_type != BTRFS_DIR_INDEX_KEY &&
+		    found_type != BTRFS_EXTENT_DATA_KEY)
 			break;
 		if (btrfs_disk_key_offset(found_key) < inode->i_size)
 			break;
@@ -460,10 +459,8 @@ static void btrfs_delete_inode(struct inode *inode)
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
-	if (S_ISREG(inode->i_mode)) {
-		ret = btrfs_truncate_in_trans(trans, root, inode);
-		BUG_ON(ret);
-	}
+	ret = btrfs_truncate_in_trans(trans, root, inode);
+	BUG_ON(ret);
 	btrfs_free_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -2504,6 +2501,116 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
+			   struct inode * new_dir,struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct timespec ctime = CURRENT_TIME;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	int ret;
+
+	if (S_ISDIR(old_inode->i_mode) && new_inode &&
+	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+		return -ENOTEMPTY;
+	}
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, new_dir);
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	old_dentry->d_inode->i_nlink++;
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	old_inode->i_ctime = ctime;
+	if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
+		struct btrfs_key *location = &BTRFS_I(new_dir)->location;
+		u64 old_parent_oid;
+		di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino,
+					   "..", 2, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out_fail;
+		}
+		if (!di) {
+			ret = -ENOENT;
+			goto out_fail;
+		}
+		old_parent_oid = btrfs_disk_key_objectid(&di->location);
+		ret = btrfs_del_item(trans, root, path);
+		if (ret) {
+			ret = -EIO;
+			goto out_fail;
+		}
+		btrfs_release_path(root, path);
+
+		di = btrfs_lookup_dir_index_item(trans, root, path,
+						 old_inode->i_ino,
+						 old_parent_oid,
+						 "..", 2, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out_fail;
+		}
+		if (!di) {
+			ret = -ENOENT;
+			goto out_fail;
+		}
+		ret = btrfs_del_item(trans, root, path);
+		if (ret) {
+			ret = -EIO;
+			goto out_fail;
+		}
+		btrfs_release_path(root, path);
+
+		ret = btrfs_insert_dir_item(trans, root, "..", 2,
+					    old_inode->i_ino, location, 0);
+		if (ret)
+			goto out_fail;
+	}
+
+
+	ret = btrfs_add_link(trans, new_dentry, old_inode);
+	if (ret == -EEXIST && new_inode)
+		ret = 0;
+	else if (ret)
+		goto out_fail;
+
+	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
+	if (ret)
+		goto out_fail;
+
+	if (new_inode) {
+		new_inode->i_ctime = CURRENT_TIME;
+		di = btrfs_lookup_dir_index_item(trans, root, path,
+						 new_dir->i_ino,
+						 new_inode->i_ino,
+						 new_dentry->d_name.name,
+						 new_dentry->d_name.len, -1);
+		if (di && !IS_ERR(di)) {
+			btrfs_del_item(trans, root, path);
+			btrfs_release_path(root, path);
+		}
+		if (S_ISDIR(new_inode->i_mode))
+			clear_nlink(new_inode);
+		else
+			drop_nlink(new_inode);
+		btrfs_update_inode(trans, root, new_inode);
+	}
+out_fail:
+	btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2531,6 +2638,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.unlink		= btrfs_unlink,
 	.mkdir		= btrfs_mkdir,
 	.rmdir		= btrfs_rmdir,
+	.rename		= btrfs_rename,
 };
 
 static struct inode_operations btrfs_dir_ro_inode_operations = {
-- 
cgit v1.2.3


From 2b8d99a723a3186a69b5bd941a272dbecae9205b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 May 2007 08:34:10 -0400
Subject: Btrfs: symlinks and hard links

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 163 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 147 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f49cad603ee..c1a0adde9b1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,11 +26,13 @@ struct btrfs_iget_args {
 #define BTRFS_SUPER_MAGIC 0x9123682E
 
 static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_symlink_inode_operations;
 static struct inode_operations btrfs_dir_ro_inode_operations;
 static struct super_operations btrfs_super_ops;
 static struct file_operations btrfs_dir_file_operations;
 static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
+static struct address_space_operations btrfs_symlink_aops;
 static struct file_operations btrfs_file_operations;
 
 static void btrfs_read_locked_inode(struct inode *inode)
@@ -103,7 +105,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 			inode->i_op = &btrfs_dir_inode_operations;
 		break;
 	case S_IFLNK:
-		// inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &btrfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &btrfs_symlink_aops;
 		break;
 	}
 	return;
@@ -940,6 +943,41 @@ out_unlock:
 	return err;
 }
 
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode = old_dentry->d_inode;
+	int err;
+	int drop_inode = 0;
+
+	if (inode->i_nlink == 0)
+		return -ENOENT;
+
+	inc_nlink(inode);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+	atomic_inc(&inode->i_count);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, dir);
+	btrfs_update_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 objectid, u64 dirid)
@@ -2577,33 +2615,25 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	}
 
 
-	ret = btrfs_add_link(trans, new_dentry, old_inode);
-	if (ret == -EEXIST && new_inode)
-		ret = 0;
-	else if (ret)
-		goto out_fail;
-
 	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
 	if (ret)
 		goto out_fail;
 
 	if (new_inode) {
 		new_inode->i_ctime = CURRENT_TIME;
-		di = btrfs_lookup_dir_index_item(trans, root, path,
-						 new_dir->i_ino,
-						 new_inode->i_ino,
-						 new_dentry->d_name.name,
-						 new_dentry->d_name.len, -1);
-		if (di && !IS_ERR(di)) {
-			btrfs_del_item(trans, root, path);
-			btrfs_release_path(root, path);
-		}
+		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
+		if (ret)
+			goto out_fail;
 		if (S_ISDIR(new_inode->i_mode))
 			clear_nlink(new_inode);
 		else
 			drop_nlink(new_inode);
 		btrfs_update_inode(trans, root, new_inode);
 	}
+	ret = btrfs_add_link(trans, new_dentry, old_inode);
+	if (ret)
+		goto out_fail;
+
 out_fail:
 	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
@@ -2611,6 +2641,94 @@ out_fail:
 	return ret;
 }
 
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct inode *inode;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+	int name_len;
+	int datasize;
+	char *ptr;
+	struct btrfs_file_extent_item *ei;
+
+	name_len = strlen(symname) + 1;
+	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+		return -ENAMETOOLONG;
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_unlock;
+
+	btrfs_set_trans_block_group(trans, inode);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	else {
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+	}
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+	if (drop_inode)
+		goto out_unlock;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	key.objectid = inode->i_ino;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	datasize = btrfs_file_extent_calc_inline_size(name_len);
+	err = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	BUG_ON(err);
+	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+	       path->slots[0], struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(ei, trans->transid);
+	btrfs_set_file_extent_type(ei,
+				   BTRFS_FILE_EXTENT_INLINE);
+	ptr = btrfs_file_extent_inline_start(ei);
+	btrfs_memcpy(root, path->nodes[0]->b_data,
+		     ptr, symname, name_len);
+	mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+	inode->i_op = &btrfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_symlink_aops;
+	inode->i_size = name_len - 1;
+	btrfs_update_inode(trans, root, inode);
+	err = 0;
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2636,9 +2754,11 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.create		= btrfs_create,
 	.unlink		= btrfs_unlink,
+	.link		= btrfs_link,
 	.mkdir		= btrfs_mkdir,
 	.rmdir		= btrfs_rmdir,
 	.rename		= btrfs_rename,
+	.symlink	= btrfs_symlink,
 };
 
 static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -2660,6 +2780,11 @@ static struct address_space_operations btrfs_aops = {
 	.commit_write	= btrfs_commit_write,
 };
 
+static struct address_space_operations btrfs_symlink_aops = {
+	.readpage	= btrfs_readpage,
+	.writepage	= btrfs_writepage,
+};
+
 static struct inode_operations btrfs_file_inode_operations = {
 	.truncate	= btrfs_truncate,
 	.getattr	= btrfs_getattr,
@@ -2676,6 +2801,12 @@ static struct file_operations btrfs_file_operations = {
 	.fsync		= btrfs_sync_file,
 };
 
+static struct inode_operations btrfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+
 static int __init init_btrfs_fs(void)
 {
 	int err;
-- 
cgit v1.2.3


From 3a686375629da5d2e2ad019265b66ef113c87455 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 May 2007 13:35:57 -0400
Subject: Btrfs: sparse files!

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/extent-tree.c |  14 ++++--
 fs/btrfs/file-item.c   |  11 +++--
 fs/btrfs/super.c       | 121 ++++++++++++++++++++++++++++++++++++++++---------
 4 files changed, 116 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3330004af1a..e8f741167c6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1180,6 +1180,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos, u64 offset,
+			       u64 disk_num_blocks,
 			       u64 num_blocks);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3c6bfea375..a366415e03a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -443,6 +443,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	buf_leaf = btrfs_buffer_leaf(buf);
 	for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) {
 		if (leaf) {
+			u64 disk_blocknr;
 			key = &buf_leaf->items[i].key;
 			if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
@@ -451,8 +452,10 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (btrfs_file_extent_type(fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			ret = btrfs_inc_extent_ref(trans, root,
-				    btrfs_file_extent_disk_blocknr(fi),
+			disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+			if (disk_blocknr == 0)
+				continue;
+			ret = btrfs_inc_extent_ref(trans, root, disk_blocknr,
 				    btrfs_file_extent_disk_num_blocks(fi));
 			BUG_ON(ret);
 		} else {
@@ -1248,6 +1251,7 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 	leaf = btrfs_buffer_leaf(cur);
 	nritems = btrfs_header_nritems(&leaf->header);
 	for (i = 0; i < nritems; i++) {
+		u64 disk_blocknr;
 		key = &leaf->items[i].key;
 		if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
@@ -1258,8 +1262,10 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 		 * FIXME make sure to insert a trans record that
 		 * repeats the snapshot del on crash
 		 */
-		ret = btrfs_free_extent(trans, root,
-					btrfs_file_extent_disk_blocknr(fi),
+		disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+		if (disk_blocknr == 0)
+			continue;
+		ret = btrfs_free_extent(trans, root, disk_blocknr,
 					btrfs_file_extent_disk_num_blocks(fi),
 					0);
 		BUG_ON(ret);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a66709e6d03..7990b574211 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,7 +9,8 @@
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos,
-			       u64 offset, u64 num_blocks)
+			       u64 offset, u64 disk_num_blocks,
+			       u64 num_blocks)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -30,7 +31,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
 	btrfs_set_file_extent_disk_blocknr(item, offset);
-	btrfs_set_file_extent_disk_num_blocks(item, num_blocks);
+	btrfs_set_file_extent_disk_num_blocks(item, disk_num_blocks);
 	btrfs_set_file_extent_offset(item, 0);
 	btrfs_set_file_extent_num_blocks(item, num_blocks);
 	btrfs_set_file_extent_generation(item, trans->transid);
@@ -176,14 +177,14 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
 	    csum_offset >= MAX_CSUM_ITEMS(root)) {
-		WARN_ON(1);
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
 	    BTRFS_CRC32_SIZE) {
 		u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
 		diff = diff - btrfs_item_size(leaf->items + path->slots[0]);
-		WARN_ON(diff != BTRFS_CRC32_SIZE);
+		if (diff != BTRFS_CRC32_SIZE)
+			goto insert;
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
 		goto csum;
@@ -241,7 +242,7 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 		ret = PTR_ERR(item);
 		/* a csum that isn't present is a preallocated region. */
 		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 1;
+			ret = -ENOENT;
 		goto fail;
 	}
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c1a0adde9b1..5b87c4e9d49 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -17,6 +17,7 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "ioctl.h"
+#include "print-tree.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -421,14 +422,17 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(fi) !=
 			    BTRFS_FILE_EXTENT_INLINE) {
+				u64 num_dec;
 				extent_start =
 					btrfs_file_extent_disk_blocknr(fi);
 				extent_num_blocks =
 					btrfs_file_extent_disk_num_blocks(fi);
 				/* FIXME blocksize != 4096 */
-				inode->i_blocks -=
-					btrfs_file_extent_num_blocks(fi) << 3;
-				found_extent = 1;
+				num_dec = btrfs_file_extent_num_blocks(fi) << 3;
+				if (extent_start != 0) {
+					found_extent = 1;
+					inode->i_blocks -= num_dec;
+				}
 			}
 		}
 		ret = btrfs_del_item(trans, root, path);
@@ -448,6 +452,43 @@ error:
 	return ret;
 }
 
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (S_ISREG(inode->i_mode) &&
+	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 mask = root->blocksize - 1;
+		u64 pos = (inode->i_size + mask) & ~mask;
+		u64 hole_size;
+
+		if (attr->ia_size < pos)
+			goto out;
+		hole_size = (attr->ia_size - pos + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
+					       pos, 0, 0, hole_size);
+		BUG_ON(err);
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+	}
+
+	err = inode_setattr(inode, attr);
+
+out:
+	return err;
+}
 static void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
@@ -1169,8 +1210,10 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_start = extent_start >> inode->i_blkbits;
 		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
+		err = 0;
+		if (blocknr == 0)
+			goto out;
 		if (iblock >= extent_start && iblock < extent_end) {
-			err = 0;
 			btrfs_map_bh_to_logical(root, result, blocknr +
 						iblock - extent_start);
 			goto out;
@@ -1591,7 +1634,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				     ptr, bh->b_data, offset + write_bytes);
 			mark_buffer_dirty(path->nodes[0]);
 			btrfs_free_path(path);
-		} else {
+		} else if (buffer_mapped(bh)) {
 			btrfs_csum_file_block(trans, root, inode->i_ino,
 				      pages[i]->index << PAGE_CACHE_SHIFT,
 				      kmap(pages[i]), PAGE_CACHE_SIZE);
@@ -1693,15 +1736,24 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			goto out;
 		}
 
-		search_start = extent_end;
+		if (found_inline) {
+			u64 mask = root->blocksize - 1;
+			search_start = (extent_end + mask) & ~mask;
+		} else
+			search_start = extent_end;
 
 		if (end < extent_end && end >= key.offset) {
 			if (found_extent) {
+				u64 disk_blocknr =
+					btrfs_file_extent_disk_blocknr(extent);
+				u64 disk_num_blocks =
+				      btrfs_file_extent_disk_num_blocks(extent);
 				memcpy(&old, extent, sizeof(old));
-				ret = btrfs_inc_extent_ref(trans, root,
-				      btrfs_file_extent_disk_blocknr(&old),
-				      btrfs_file_extent_disk_num_blocks(&old));
-				BUG_ON(ret);
+				if (disk_blocknr != 0) {
+					ret = btrfs_inc_extent_ref(trans, root,
+					         disk_blocknr, disk_num_blocks);
+					BUG_ON(ret);
+				}
 			}
 			WARN_ON(found_inline);
 			bookend = 1;
@@ -1719,7 +1771,10 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 				old_num = btrfs_file_extent_num_blocks(extent);
 				*hint_block =
 					btrfs_file_extent_disk_blocknr(extent);
-				inode->i_blocks -= (old_num - new_num) << 3;
+				if (btrfs_file_extent_disk_blocknr(extent)) {
+					inode->i_blocks -=
+						(old_num - new_num) << 3;
+				}
 				btrfs_set_file_extent_num_blocks(extent,
 								 new_num);
 				mark_buffer_dirty(path->nodes[0]);
@@ -1745,7 +1800,7 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			btrfs_release_path(root, path);
 			extent = NULL;
-			if (found_extent) {
+			if (found_extent && disk_blocknr != 0) {
 				inode->i_blocks -= extent_num_blocks << 3;
 				ret = btrfs_free_extent(trans, root,
 							disk_blocknr,
@@ -1785,18 +1840,19 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 				    btrfs_file_extent_offset(&old) +
 				    ((end - key.offset) >> inode->i_blkbits));
 			WARN_ON(btrfs_file_extent_num_blocks(&old) <
-				(end - key.offset) >> inode->i_blkbits);
+				(extent_end - end) >> inode->i_blkbits);
 			btrfs_set_file_extent_num_blocks(extent,
-				    btrfs_file_extent_num_blocks(&old) -
-				    ((end - key.offset) >> inode->i_blkbits));
+				    (extent_end - end) >> inode->i_blkbits);
 
 			btrfs_set_file_extent_type(extent,
 						   BTRFS_FILE_EXTENT_REG);
 			btrfs_set_file_extent_generation(extent,
 				    btrfs_file_extent_generation(&old));
 			btrfs_mark_buffer_dirty(path->nodes[0]);
-			inode->i_blocks +=
-				btrfs_file_extent_num_blocks(extent) << 3;
+			if (btrfs_file_extent_disk_blocknr(&old) != 0) {
+				inode->i_blocks +=
+				      btrfs_file_extent_num_blocks(extent) << 3;
+			}
 			ret = 0;
 			goto out;
 		}
@@ -1960,6 +2016,21 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 				   ~((u64)root->blocksize - 1), &hint_block);
 		BUG_ON(ret);
 	}
+	if (inode->i_size < start_pos) {
+		u64 last_pos_in_file;
+		u64 hole_size;
+		u64 mask = root->blocksize - 1;
+		last_pos_in_file = (inode->i_size + mask) & ~mask;
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+		if (last_pos_in_file < start_pos) {
+			ret = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       last_pos_in_file,
+						       0, 0, hole_size);
+		}
+		BUG_ON(ret);
+	}
 	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
 	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
@@ -1967,7 +2038,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 					 &ins, 1);
 		BUG_ON(ret);
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset);
+				       start_pos, ins.objectid, ins.offset,
+				       ins.offset);
 		BUG_ON(ret);
 	} else {
 		ins.offset = 0;
@@ -2044,10 +2116,13 @@ static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 				  page->index << PAGE_CACHE_SHIFT,
 				  kmap(page), PAGE_CACHE_SIZE);
 		if (ret) {
-			printk("failed to verify ino %lu page %lu\n",
-			       page->mapping->host->i_ino,
-			       page->index);
-			memset(page_address(page), 0, PAGE_CACHE_SIZE);
+			if (ret != -ENOENT) {
+				printk("failed to verify ino %lu page %lu\n",
+				       page->mapping->host->i_ino,
+				       page->index);
+				memset(page_address(page), 0, PAGE_CACHE_SIZE);
+				flush_dcache_page(page);
+			}
 		}
 		SetPageChecked(page);
 		kunmap(page);
@@ -2759,6 +2834,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.rmdir		= btrfs_rmdir,
 	.rename		= btrfs_rename,
 	.symlink	= btrfs_symlink,
+	.setattr	= btrfs_setattr,
 };
 
 static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -2788,6 +2864,7 @@ static struct address_space_operations btrfs_symlink_aops = {
 static struct inode_operations btrfs_file_inode_operations = {
 	.truncate	= btrfs_truncate,
 	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
 };
 
 static struct file_operations btrfs_file_operations = {
-- 
cgit v1.2.3


From 1de037a43edf67f3a9f66dd197195b3c08febb16 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 May 2007 15:17:08 -0400
Subject: Btrfs: fixup various fsx failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |   3 +
 fs/btrfs/file-item.c |  30 ++++++++
 fs/btrfs/super.c     | 213 +++++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 214 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e8f741167c6..756ea9d0123 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1198,5 +1198,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path,
 					  u64 objectid, u64 offset,
 					  int cow);
+int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct btrfs_path *path,
+			u64 isize);
 /* super.c */
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7990b574211..0782e924dde 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -2,6 +2,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "print-tree.h"
 
 #define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 			       sizeof(struct btrfs_item) * 2) / \
@@ -27,6 +28,10 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
+	if (ret) {
+printk("failed to insert %Lu %Lu ret %d\n", objectid, pos, ret);
+btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
+	}
 	BUG_ON(ret);
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
@@ -218,6 +223,31 @@ fail:
 	return ret;
 }
 
+int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct btrfs_path *path,
+			u64 isize)
+{
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[0]);
+	int slot = path->slots[0];
+	int ret;
+	u32 new_item_size;
+	u64 new_item_span;
+	u64 blocks;
+
+	btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+	if (isize <= key.offset)
+		return 0;
+	new_item_span = isize - key.offset;
+	blocks = (new_item_span + root->blocksize - 1) / root->blocksize;
+	new_item_size = blocks * BTRFS_CRC32_SIZE;
+	if (new_item_size >= btrfs_item_size(leaf->items + slot))
+		return 0;
+	ret = btrfs_truncate_item(trans, root, path, new_item_size);
+	BUG_ON(ret);
+	return ret;
+}
+
 int btrfs_csum_verify_file_block(struct btrfs_root *root,
 				 u64 objectid, u64 offset,
 				 char *data, size_t len)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5b87c4e9d49..0eb64d6eaf9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -36,6 +36,13 @@ static struct address_space_operations btrfs_aops;
 static struct address_space_operations btrfs_symlink_aops;
 static struct file_operations btrfs_file_operations;
 
+static int drop_extents(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct inode *inode,
+			  u64 start, u64 end, u64 *hint_block);
+static int btrfs_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create);
+
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -381,10 +388,12 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key *found_key;
 	u32 found_type;
 	struct btrfs_leaf *leaf;
-	struct btrfs_file_extent_item *fi = NULL;
+	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
 	u64 extent_num_blocks = 0;
+	u64 item_end = 0;
 	int found_extent;
+	int del_item;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -394,6 +403,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	key.flags = (u32)-1;
 	while(1) {
 		btrfs_init_path(path);
+		fi = NULL;
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0) {
 			goto error;
@@ -413,16 +423,52 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		    found_type != BTRFS_DIR_INDEX_KEY &&
 		    found_type != BTRFS_EXTENT_DATA_KEY)
 			break;
-		if (btrfs_disk_key_offset(found_key) < inode->i_size)
-			break;
-		found_extent = 0;
-		if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
+		item_end = btrfs_disk_key_offset(found_key);
+		if (found_type == BTRFS_EXTENT_DATA_KEY) {
 			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 					    path->slots[0],
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(fi) !=
 			    BTRFS_FILE_EXTENT_INLINE) {
-				u64 num_dec;
+				item_end += btrfs_file_extent_num_blocks(fi) <<
+						inode->i_blkbits;
+			}
+		}
+		if (found_type == BTRFS_CSUM_ITEM_KEY) {
+			ret = btrfs_csum_truncate(trans, root, path,
+						  inode->i_size);
+			BUG_ON(ret);
+		}
+		if (item_end < inode->i_size) {
+			if (found_type) {
+				btrfs_set_key_type(&key, found_type - 1);
+				continue;
+			}
+			break;
+		}
+		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
+			del_item = 1;
+		else
+			del_item = 0;
+		found_extent = 0;
+
+		if (found_type == BTRFS_EXTENT_DATA_KEY &&
+			   btrfs_file_extent_type(fi) !=
+			   BTRFS_FILE_EXTENT_INLINE) {
+			u64 num_dec;
+			if (!del_item) {
+				u64 orig_num_blocks =
+					btrfs_file_extent_num_blocks(fi);
+				extent_num_blocks = inode->i_size -
+					btrfs_disk_key_offset(found_key) +
+					root->blocksize - 1;
+				extent_num_blocks >>= inode->i_blkbits;
+				btrfs_set_file_extent_num_blocks(fi,
+							 extent_num_blocks);
+				inode->i_blocks -= (orig_num_blocks -
+					extent_num_blocks) << 3;
+				mark_buffer_dirty(path->nodes[0]);
+			} else {
 				extent_start =
 					btrfs_file_extent_disk_blocknr(fi);
 				extent_num_blocks =
@@ -435,8 +481,12 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				}
 			}
 		}
-		ret = btrfs_del_item(trans, root, path);
-		BUG_ON(ret);
+		if (del_item) {
+			ret = btrfs_del_item(trans, root, path);
+			BUG_ON(ret);
+		} else {
+			break;
+		}
 		btrfs_release_path(root, path);
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
@@ -452,6 +502,68 @@ error:
 	return ret;
 }
 
+static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+{
+	struct inode *inode = mapping->host;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	struct page *page;
+	char *kaddr;
+	int ret = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint;
+	struct btrfs_key ins;
+	struct btrfs_trans_handle *trans;
+
+	if ((offset & (blocksize - 1)) == 0)
+		goto out;
+
+	ret = -ENOMEM;
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		goto out;
+
+	if (!PageUptodate(page)) {
+		ret = mpage_readpage(page, btrfs_get_block);
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+			goto out;
+		}
+	}
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = drop_extents(trans, root, inode, page->index << PAGE_CACHE_SHIFT,
+			   (page->index + 1) << PAGE_CACHE_SHIFT, &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+				 alloc_hint, (u64)-1, &ins, 1);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       page->index << PAGE_CACHE_SHIFT,
+				       ins.objectid, 1, 1);
+	BUG_ON(ret);
+	SetPageChecked(page);
+	kaddr = kmap(page);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	btrfs_csum_file_block(trans, root, inode->i_ino,
+			      page->index << PAGE_CACHE_SHIFT,
+			      kaddr, PAGE_CACHE_SIZE);
+	kunmap(page);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	set_page_dirty(page);
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -469,8 +581,11 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		u64 pos = (inode->i_size + mask) & ~mask;
 		u64 hole_size;
 
-		if (attr->ia_size < pos)
+		if (attr->ia_size <= pos)
 			goto out;
+
+		btrfs_truncate_page(inode->i_mapping, inode->i_size);
+
 		hole_size = (attr->ia_size - pos + mask) & ~mask;
 		hole_size >>= inode->i_blkbits;
 
@@ -483,10 +598,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
 	}
-
+out:
 	err = inode_setattr(inode, attr);
 
-out:
 	return err;
 }
 static void btrfs_delete_inode(struct inode *inode)
@@ -1161,17 +1275,30 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	u32 found_type;
+	u64 alloc_hint = 0;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *item;
 	struct btrfs_leaf *leaf;
 	struct btrfs_disk_key *found_key;
+	struct btrfs_trans_handle *trans = NULL;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
 	if (create) {
 		WARN_ON(1);
+		/* this almost but not quite works */
+		trans = btrfs_start_transaction(root, 1);
+		if (!trans) {
+			err = -ENOMEM;
+			goto out;
+		}
+		ret = drop_extents(trans, root, inode,
+				   iblock << inode->i_blkbits,
+				   (iblock + 1) << inode->i_blkbits,
+				   &alloc_hint);
+		BUG_ON(ret);
 	}
 
 	ret = btrfs_lookup_file_extent(NULL, root, path,
@@ -1185,7 +1312,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	if (ret != 0) {
 		if (path->slots[0] == 0) {
 			btrfs_release_path(root, path);
-			goto out;
+			goto not_found;
 		}
 		path->slots[0]--;
 	}
@@ -1203,7 +1330,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
 		extent_end = 0;
 		extent_start = 0;
-		goto out;
+		goto not_found;
 	}
 	found_type = btrfs_file_extent_type(item);
 	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
@@ -1211,7 +1338,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		extent_start = extent_start >> inode->i_blkbits;
 		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
 		err = 0;
-		if (blocknr == 0)
+		if (btrfs_file_extent_disk_blocknr(item) == 0)
 			goto out;
 		if (iblock >= extent_start && iblock < extent_end) {
 			btrfs_map_bh_to_logical(root, result, blocknr +
@@ -1227,7 +1354,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		extent_end = (extent_start + size) >> inode->i_blkbits;
 		extent_start >>= inode->i_blkbits;
 		if (iblock < extent_start || iblock > extent_end) {
-			goto out;
+			goto not_found;
 		}
 		ptr = btrfs_file_extent_inline_start(item);
 		map = kmap(result->b_page);
@@ -1239,7 +1366,24 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		SetPageChecked(result->b_page);
 		btrfs_map_bh_to_logical(root, result, 0);
 	}
+not_found:
+	if (create) {
+		struct btrfs_key ins;
+		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 1, alloc_hint, (u64)-1,
+					 &ins, 1);
+		BUG_ON(ret);
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+					       iblock << inode->i_blkbits,
+					       ins.objectid, ins.offset,
+					       ins.offset);
+		BUG_ON(ret);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, ins.objectid);
+	}
 out:
+	if (trans)
+		err = btrfs_end_transaction(trans, root);
 	btrfs_free_path(path);
 	return err;
 }
@@ -1258,7 +1402,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	return nobh_prepare_write(page, from, to, btrfs_get_block);
+	return block_prepare_write(page, from, to, btrfs_get_block);
 }
 
 static void btrfs_write_super(struct super_block *sb)
@@ -1500,12 +1644,13 @@ static void btrfs_truncate(struct inode *inode)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 
-	nobh_truncate_page(inode->i_mapping, inode->i_size);
+	btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-	/* FIXME, add redo link to tree so we don't leak on crash */
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
+
+	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	BUG_ON(ret);
 	btrfs_update_inode(trans, root, inode);
@@ -1515,10 +1660,6 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_btree_balance_dirty(root);
 }
 
-/*
- * Make sure any changes to nobh_commit_write() are reflected in
- * nobh_truncate_page(), since it doesn't call commit_write().
- */
 static int btrfs_commit_write(struct file *file, struct page *page,
 			      unsigned from, unsigned to)
 {
@@ -1528,6 +1669,7 @@ static int btrfs_commit_write(struct file *file, struct page *page,
 
 	SetPageUptodate(page);
 	bh = page_buffers(page);
+	set_buffer_uptodate(bh);
 	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
 		set_page_dirty(page);
 	}
@@ -1972,8 +2114,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
-	if ((first_index << PAGE_CACHE_SHIFT) < inode->i_size &&
-	    (pos & (PAGE_CACHE_SIZE - 1))) {
+	if ((pos & (PAGE_CACHE_SIZE - 1))) {
 		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
 		if (!PageUptodate(pinned[0])) {
 			ret = mpage_readpage(pinned[0], btrfs_get_block);
@@ -1983,10 +2124,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			unlock_page(pinned[0]);
 		}
 	}
-	if (first_index != last_index &&
-	    (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
-	    pos + count < inode->i_size &&
-	    (count & (PAGE_CACHE_SIZE - 1))) {
+	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
 			ret = mpage_readpage(pinned[1], btrfs_get_block);
@@ -2110,23 +2248,34 @@ static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
 	if (!PageChecked(page)) {
 		/* FIXME, do it per block */
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		int ret;
+		struct buffer_head *bh;
+
+		if (page_has_buffers(page)) {
+			bh = page_buffers(page);
+			if (!buffer_mapped(bh)) {
+				SetPageChecked(page);
+				goto checked;
+			}
+		}
 
-		int ret = btrfs_csum_verify_file_block(root,
+		ret = btrfs_csum_verify_file_block(root,
 				  page->mapping->host->i_ino,
 				  page->index << PAGE_CACHE_SHIFT,
 				  kmap(page), PAGE_CACHE_SIZE);
 		if (ret) {
 			if (ret != -ENOENT) {
-				printk("failed to verify ino %lu page %lu\n",
+				printk("failed to verify ino %lu page %lu ret %d\n",
 				       page->mapping->host->i_ino,
-				       page->index);
-				memset(page_address(page), 0, PAGE_CACHE_SIZE);
+				       page->index, ret);
+				memset(page_address(page), 1, PAGE_CACHE_SIZE);
 				flush_dcache_page(page);
 			}
 		}
 		SetPageChecked(page);
 		kunmap(page);
 	}
+checked:
 	/*
 	 * Faults on the destination of a read are common, so do it before
 	 * taking the kmap.
-- 
cgit v1.2.3


From 1e2677e000fbc7f1892b1ea9f952f1667f4443f1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 May 2007 16:52:18 -0400
Subject: Btrfs: block group switching

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  5 ++-
 fs/btrfs/extent-tree.c | 85 ++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 65 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 756ea9d0123..1bb62867ded 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -249,10 +249,13 @@ struct btrfs_device_item {
 /* tag for the radix tree of block groups in ram */
 #define BTRFS_BLOCK_GROUP_DIRTY 0
 #define BTRFS_BLOCK_GROUP_AVAIL 1
-#define BTRFS_BLOCK_GROUP_HINTS 8
 #define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
+
+
+#define BTRFS_BLOCK_GROUP_DATA 1
 struct btrfs_block_group_item {
 	__le64 used;
+	u8 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_block_group_cache {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a366415e03a..b6dc020bdde 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -232,6 +232,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct radix_tree_root *radix;
+	struct radix_tree_root *swap_radix;
 	u64 used;
 	u64 last = 0;
 	u64 hint_last;
@@ -239,14 +240,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int ret;
 	int full_search = 0;
 	int factor = 8;
+	int data_swap = 0;
 
 	if (!owner)
 		factor = 5;
 
-	if (data)
+	if (data) {
 		radix = &info->block_group_data_radix;
-	else
+		swap_radix = &info->block_group_radix;
+	} else {
 		radix = &info->block_group_radix;
+		swap_radix = &info->block_group_data_radix;
+	}
 
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
@@ -334,15 +339,27 @@ again:
 		cond_resched();
 	}
 	if (!full_search) {
-printk("find block group doing full search data %d start %Lu\n", data, search_start);
 		last = search_start;
 		full_search = 1;
 		goto again;
 	}
+	if (!data_swap) {
+		struct radix_tree_root *tmp = radix;
+		data_swap = 1;
+		radix = swap_radix;
+		swap_radix = tmp;
+		last = search_start;
+		goto again;
+	}
 	if (!found_group) {
 printk("find block group bailing to zero data %d\n", data);
 		ret = radix_tree_gang_lookup(radix,
 					     (void **)&found_group, 0, 1);
+		if (ret == 0) {
+			ret = radix_tree_gang_lookup(swap_radix,
+						     (void **)&found_group,
+						     0, 1);
+		}
 		BUG_ON(ret != 1);
 	}
 found:
@@ -552,7 +569,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 blocknr, u64 num, int alloc, int mark_free)
+			      u64 blocknr, u64 num, int alloc, int mark_free,
+			      int data)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -560,6 +578,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 block_in_group;
 	u64 i;
+	int ret;
 
 	while(total) {
 		cache = lookup_block_group(info, blocknr);
@@ -577,7 +596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		old_val = btrfs_block_group_used(&cache->item);
 		num = min(total, cache->key.offset - block_in_group);
 		if (alloc) {
-			old_val += num;
 			if (blocknr > cache->last_alloc)
 				cache->last_alloc = blocknr;
 			if (!cache->data) {
@@ -586,6 +604,30 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						        blocknr + i);
 				}
 			}
+			if (cache->data != data &&
+			    old_val < cache->key.offset / 2) {
+printk("changing block group %Lu from %d to %d\n", cache->key.objectid, cache->data, data);
+				cache->data = data;
+				radix_tree_delete(cache->radix,
+						  cache->key.objectid +
+						  cache->key.offset - 1);
+
+				if (data) {
+					cache->radix =
+						&info->block_group_data_radix;
+					cache->item.flags |=
+						BTRFS_BLOCK_GROUP_DATA;
+				} else {
+					cache->radix = &info->block_group_radix;
+					cache->item.flags &=
+						~BTRFS_BLOCK_GROUP_DATA;
+				}
+				ret = radix_tree_insert(cache->radix,
+							cache->key.objectid +
+							cache->key.offset - 1,
+							(void *)cache);
+			}
+			old_val += num;
 		} else {
 			old_val -= num;
 			if (blocknr < cache->first_free)
@@ -596,8 +638,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						      blocknr + i);
 				}
 			}
-			if (old_val < (cache->key.offset * 5) / 10 &&
-			    old_val + num >= (cache->key.offset * 5) / 10) {
+			if (old_val < cache->key.offset / 2 &&
+			    old_val + num >= cache->key.offset / 2) {
 printk("group %Lu now available\n", cache->key.objectid);
 				radix_tree_tag_set(cache->radix,
 						   cache->key.objectid +
@@ -779,7 +821,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (ret)
 			BUG();
 		ret = update_block_group(trans, root, blocknr, num_blocks, 0,
-					 mark_free);
+					 mark_free, 0);
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
@@ -909,10 +951,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 check_failed:
-	if (!full_scan && block_group->data != data)
-		WARN_ON(1);
-
-	if (!data)
+	if (!block_group->data)
 		search_start = find_search_start(root, &block_group,
 						 search_start, total_needed);
 	else
@@ -1142,7 +1181,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		info->extent_tree_insert[info->extent_tree_insert_nr++] =
 			ins->objectid;
 		ret = update_block_group(trans, root,
-					 ins->objectid, ins->offset, 1, 0);
+					 ins->objectid, ins->offset, 1, 0, 0);
 		BUG_ON(ret);
 		return 0;
 	}
@@ -1208,7 +1247,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (pending_ret) {
 		return pending_ret;
 	}
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
+				 data);
 	return 0;
 }
 
@@ -1477,7 +1517,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_leaf *leaf;
 	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
 	u64 used;
-	u64 nr = 0;
 
 	root = info->extent_root;
 	key.objectid = 0;
@@ -1505,13 +1544,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		}
 
-		if (nr % 3)
+		bi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_block_group_item);
+		if (bi->flags & BTRFS_BLOCK_GROUP_DATA) {
 			radix = &info->block_group_data_radix;
-		else
+			cache->data = 1;
+		} else {
 			radix = &info->block_group_radix;
+			cache->data = 0;
+		}
 
-		bi = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_block_group_item);
 		memcpy(&cache->item, bi, sizeof(*bi));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
@@ -1520,10 +1562,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		cache->pinned = 0;
 		cache->cached = 0;
 
-		if (nr % 3)
-			cache->data = 1;
-		else
-			cache->data = 0;
 		cache->radix = radix;
 
 		key.objectid = found_key.objectid + found_key.offset;
@@ -1541,7 +1579,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		if (key.objectid >=
 		    btrfs_super_total_blocks(info->disk_super))
 			break;
-		nr++;
 	}
 
 	btrfs_free_path(path);
-- 
cgit v1.2.3


From fbdc762b4e1833b5d75cada5aabeadccd8379792 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 May 2007 10:22:12 -0400
Subject: Btrfs: use a separate flag for search_start vs a hint in
 find_free_extent

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 69 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b6dc020bdde..85616b458e1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5,8 +5,9 @@
 #include "transaction.h"
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, struct btrfs_key *ins, int data);
+			    *orig_root, u64 num_blocks, u64 search_start,
+			    u64 search_end, u64 hint_block,
+			    struct btrfs_key *ins, int data);
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -157,12 +158,6 @@ static struct btrfs_block_group_cache *lookup_block_group(struct
 		    block_group->key.objectid + block_group->key.offset)
 			return block_group;
 	}
-	WARN_ON(1);
-	printk("lookup_block_group fails for blocknr %Lu\n", blocknr);
-	printk("last ret was %d\n", ret);
-	if (ret) {
-		printk("last block group was %Lu %Lu\n", block_group->key.objectid, block_group->key.offset);
-	}
 	return NULL;
 }
 
@@ -378,7 +373,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_key ins;
 	u32 refs;
 
-	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1,
+	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1, 0,
 			 &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -495,7 +490,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	find_free_extent(trans, extent_root, 0, 0, (u64)-1, &ins, 0);
+	find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins, 0);
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 	BUG_ON(ret);
 	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
@@ -788,7 +783,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
-	find_free_extent(trans, root, 0, 0, (u64)-1, &ins, 0);
+	find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
@@ -906,7 +901,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, struct btrfs_key *ins, int data)
+			    search_end, u64 hint_block,
+			    struct btrfs_key *ins, int data)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -926,6 +922,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
+	int wrapped = 0;
 	u64 limit;
 
 	path = btrfs_alloc_path();
@@ -940,10 +937,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_blocks(info->disk_super);
-	if (search_start) {
-		block_group = lookup_block_group(info, search_start);
+	if (hint_block) {
+		block_group = lookup_block_group(info, hint_block);
 		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data, 1);
+						     hint_block, data, 1);
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group, 0,
@@ -954,7 +951,7 @@ check_failed:
 	if (!block_group->data)
 		search_start = find_search_start(root, &block_group,
 						 search_start, total_needed);
-	else
+	else if (!full_scan)
 		search_start = max(block_group->last_alloc, search_start);
 
 	btrfs_init_path(path);
@@ -1039,7 +1036,7 @@ check_failed:
 
 		start_found = 1;
 		last_block = key.objectid + key.offset;
-		if (last_block >= block_group->key.objectid +
+		if (!full_scan && last_block >= block_group->key.objectid +
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
 			search_start = block_group->key.objectid +
@@ -1059,10 +1056,15 @@ check_pending:
 	BUG_ON(ins->objectid < search_start);
 
 	if (ins->objectid + num_blocks >= search_end) {
-		if (full_scan)
-			return -ENOSPC;
+		if (full_scan) {
+			ret = -ENOSPC;
+			goto error;
+		}
 		search_start = orig_search_start;
-		full_scan = 1;
+		if (wrapped)
+			full_scan = 1;
+		else
+			wrapped = 1;
 		goto new_group;
 	}
 	for (test_block = ins->objectid;
@@ -1132,14 +1134,20 @@ check_pending:
 new_group:
 	if (search_start + num_blocks >= search_end) {
 		search_start = orig_search_start;
-printk("doing full scan!\n");
-		full_scan = 1;
+		if (full_scan) {
+			ret = -ENOSPC;
+			goto error;
+		}
+		if (wrapped)
+			full_scan = 1;
+		else
+			wrapped = 1;
 	}
 	block_group = lookup_block_group(info, search_start);
+	cond_resched();
 	if (!full_scan)
 		block_group = btrfs_find_block_group(root, block_group,
 						     search_start, data, 0);
-	cond_resched();
 	goto check_failed;
 
 error:
@@ -1156,12 +1164,13 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 search_start,
+		       u64 num_blocks, u64 hint_block,
 		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
 	int pending_ret;
 	u64 super_blocks_used;
+	u64 search_start = 0;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
@@ -1193,7 +1202,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	 */
 	if (data) {
 		ret = find_free_extent(trans, root, 0, 0,
-				       search_end, &prealloc_key, 0);
+				       search_end, 0, &prealloc_key, 0);
 		if (ret) {
 			return ret;
 		}
@@ -1204,9 +1213,11 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			search_start = info->extent_tree_prealloc[0] + 1;
 		}
 	}
+	if (hint_block < search_start)
+		hint_block = search_start;
 	/* do the real allocation */
 	ret = find_free_extent(trans, root, num_blocks, search_start,
-			       search_end, ins, data);
+			       search_end, hint_block, ins, data);
 	if (ret) {
 		return ret;
 	}
@@ -1226,8 +1237,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		else
 			search_start = ins->objectid + ins->offset;
 
+		if (hint_block < search_start)
+			hint_block = search_start;
+
 		ret = find_free_extent(trans, root, 0, search_start,
-				       search_end, &prealloc_key, 0);
+				       search_end, hint_block,
+				       &prealloc_key, 0);
 		if (ret) {
 			return ret;
 		}
-- 
cgit v1.2.3


From fabb568183de7996257080260d3537fa75b3667e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 7 Jun 2007 22:13:21 -0400
Subject: Btrfs: d_type optimization

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 10 ++++++++
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/super.c       | 66 +++++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 68 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1bb62867ded..a1c95c980fd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,16 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 6
 
+#define BTRFS_FT_UNKNOWN	0
+#define BTRFS_FT_REG_FILE	1
+#define BTRFS_FT_DIR		2
+#define BTRFS_FT_CHRDEV		3
+#define BTRFS_FT_BLKDEV		4
+#define BTRFS_FT_FIFO		5
+#define BTRFS_FT_SOCK		6
+#define BTRFS_FT_SYMLINK	7
+#define BTRFS_FT_MAX		8
+
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
  * block layout.  objectid corresonds to the inode number.  The flags
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 85616b458e1..645a4228bfe 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1264,6 +1264,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
 				 data);
+	BUG_ON(ret);
 	return 0;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0eb64d6eaf9..b58b4cf6676 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -43,6 +43,18 @@ static int drop_extents(struct btrfs_trans_handle *trans,
 static int btrfs_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create);
 
+
+#define S_SHIFT 12
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
+};
+
 static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -785,6 +797,9 @@ static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
 			break;
 	}
 }
+static unsigned char btrfs_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
 
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
@@ -799,7 +814,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct btrfs_leaf *leaf;
 	int slot;
 	int advance;
-	unsigned char d_type = DT_UNKNOWN;
+	unsigned char d_type;
 	int over = 0;
 	u32 di_cur;
 	u32 di_total;
@@ -853,6 +868,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		di_cur = 0;
 		di_total = btrfs_item_size(leaf->items + slot);
 		while(di_cur < di_total) {
+			d_type = btrfs_filetype_table[btrfs_dir_type(di)];
 			over = filldir(dirent, (const char *)(di + 1),
 				       btrfs_dir_name_len(di),
 				       btrfs_disk_key_offset(&item->key),
@@ -1012,6 +1028,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	return inode;
 }
 
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
+}
+
 static int btrfs_add_link(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode)
 {
@@ -1026,7 +1047,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
-				    &key, 0);
+				    &key, btrfs_inode_type(inode));
 	if (ret == 0) {
 		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
 		ret = btrfs_update_inode(trans, root,
@@ -1150,12 +1171,12 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 
 	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
-				    &key, 1);
+				    &key, BTRFS_FT_DIR);
 	if (ret)
 		goto error;
 	key.objectid = dirid;
 	ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
-				    &key, 1);
+				    &key, BTRFS_FT_DIR);
 	if (ret)
 		goto error;
 error:
@@ -1265,6 +1286,10 @@ printk("btrfs sync_fs\n");
 	return 0;
 }
 
+#define BTRFS_GET_BLOCK_NO_CREATE 0
+#define BTRFS_GET_BLOCK_CREATE 1
+#define BTRFS_GET_BLOCK_NO_DIRECT 2
+
 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -1286,7 +1311,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	btrfs_init_path(path);
-	if (create) {
+	if (create & BTRFS_GET_BLOCK_CREATE) {
 		WARN_ON(1);
 		/* this almost but not quite works */
 		trans = btrfs_start_transaction(root, 1);
@@ -1349,6 +1374,11 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		char *ptr;
 		char *map;
 		u32 size;
+
+		if (create & BTRFS_GET_BLOCK_NO_DIRECT) {
+			err = -EINVAL;
+			goto out;
+		}
 		size = btrfs_file_extent_inline_len(leaf->items +
 						    path->slots[0]);
 		extent_end = (extent_start + size) >> inode->i_blkbits;
@@ -1367,7 +1397,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 		btrfs_map_bh_to_logical(root, result, 0);
 	}
 not_found:
-	if (create) {
+	if (create & BTRFS_GET_BLOCK_CREATE) {
 		struct btrfs_key ins;
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
 					 1, alloc_hint, (u64)-1,
@@ -1399,6 +1429,21 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	return err;
 }
 
+static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	mutex_lock(&root->fs_info->fs_mutex);
+	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return 0;
+}
+
+static sector_t btrfs_bmap(struct address_space *as, sector_t block)
+{
+	return generic_block_bmap(as, block, btrfs_get_block_bmap);
+}
+
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
@@ -2428,7 +2473,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	key.offset = (u64)-1;
 	dir = root->fs_info->sb->s_root->d_inode;
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen, dir->i_ino, &key, 0);
+				    name, namelen, dir->i_ino, &key,
+				    BTRFS_FT_DIR);
 	BUG_ON(ret);
 
 	ret = btrfs_commit_transaction(trans, root);
@@ -2505,7 +2551,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    name, namelen,
 				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, 0);
+				    &key, BTRFS_FT_DIR);
 
 	BUG_ON(ret);
 
@@ -2833,7 +2879,8 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		btrfs_release_path(root, path);
 
 		ret = btrfs_insert_dir_item(trans, root, "..", 2,
-					    old_inode->i_ino, location, 0);
+					    old_inode->i_ino, location,
+					    BTRFS_FT_DIR);
 		if (ret)
 			goto out_fail;
 	}
@@ -3003,6 +3050,7 @@ static struct address_space_operations btrfs_aops = {
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
 	.commit_write	= btrfs_commit_write,
+	.bmap		= btrfs_bmap,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
-- 
cgit v1.2.3


From 08607c1b182b3b8b54d7141a0c012cda17d201e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 8 Jun 2007 15:33:54 -0400
Subject: Btrfs: add compat ioctl

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  1 -
 fs/btrfs/ctree.h       |  5 +++--
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/super.c       | 27 +++++++++++++++++++++--
 fs/btrfs/transaction.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.h |  8 +++++++
 6 files changed, 98 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 861e34ae097..61d25b5de28 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -7,7 +7,6 @@
 * Get rid of struct ctree_path, limiting tree levels held at one time
 * Add generation number to key pointer in nodes
 * Add generation number to inode
-* Add ability to switch a block group from data to metadata or vice versa
 * forbid cross subvolume renames and hardlinks
 * Release
 * Do real tree locking
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a1c95c980fd..74e6aadf6bb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3,7 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
-#include <linux/kobject.h>
+#include <linux/workqueue.h>
 #include "bit-radix.h"
 
 struct btrfs_trans_handle;
@@ -281,6 +281,7 @@ struct btrfs_block_group_cache {
 };
 
 struct crypto_hash;
+
 struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
@@ -308,9 +309,9 @@ struct btrfs_fs_info {
 	struct mutex fs_mutex;
 	struct list_head trans_list;
 	struct crypto_hash *hash_tfm;
+	struct delayed_work trans_work;
 	spinlock_t hash_lock;
 	int do_barriers;
-	struct kobject kobj;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3da057f78bc..048282f06af 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -577,6 +577,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->do_barriers = 1;
 	fs_info->extent_tree_insert_nr = 0;
 	fs_info->extent_tree_prealloc_nr = 0;
+	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -648,7 +649,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
-	memset(&fs_info->kobj, 0, sizeof(fs_info->kobj));
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 }
@@ -746,6 +746,7 @@ int close_ctree(struct btrfs_root *root)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
+	btrfs_transaction_flush_work(root);
 	mutex_lock(&fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
@@ -776,7 +777,6 @@ int close_ctree(struct btrfs_root *root)
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
-	kobject_unregister(&fs_info->kobj);
 	return 0;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b58b4cf6676..5864917953a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -12,6 +12,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
+#include <linux/compat.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -950,7 +951,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 		return -ENOMEM;
 	}
 	sb->s_root = root_dentry;
-
+	btrfs_transaction_queue_work(tree_root, HZ * 30);
 	return 0;
 }
 
@@ -1452,7 +1453,7 @@ static int btrfs_prepare_write(struct file *file, struct page *page,
 
 static void btrfs_write_super(struct super_block *sb)
 {
-	btrfs_sync_fs(sb, 1);
+	sb->s_dirt = 0;
 }
 
 static int btrfs_readpage(struct file *file, struct page *page)
@@ -2698,6 +2699,20 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+static long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int ret;
+	lock_kernel();
+	ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+
+}
+#endif
+
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
@@ -3042,6 +3057,9 @@ static struct file_operations btrfs_dir_file_operations = {
 	.read		= generic_read_dir,
 	.readdir	= btrfs_readdir,
 	.ioctl		= btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_compat_ioctl,
+#endif
 };
 
 static struct address_space_operations btrfs_aops = {
@@ -3073,6 +3091,9 @@ static struct file_operations btrfs_file_operations = {
 	.open		= generic_file_open,
 	.ioctl		= btrfs_ioctl,
 	.fsync		= btrfs_sync_file,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_compat_ioctl,
+#endif
 };
 
 static struct inode_operations btrfs_symlink_inode_operations = {
@@ -3085,6 +3106,7 @@ static int __init init_btrfs_fs(void)
 {
 	int err;
 	printk("btrfs loaded!\n");
+	btrfs_init_transaction_sys();
 	err = init_inodecache();
 	if (err)
 		return err;
@@ -3095,6 +3117,7 @@ static int __init init_btrfs_fs(void)
 
 static void __exit exit_btrfs_fs(void)
 {
+	btrfs_exit_transaction_sys();
 	destroy_inodecache();
 	unregister_filesystem(&btrfs_fs_type);
 	printk("btrfs unloaded\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f0f03121b7b..bf7eef67ba0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -8,6 +8,8 @@ static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
+static struct workqueue_struct *trans_wq;
+
 #define BTRFS_ROOT_TRANS_TAG 0
 
 #define TRANS_MAGIC 0xE1E10E
@@ -44,6 +46,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->in_commit = 0;
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
+		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		init_bit_radix(&cur_trans->dirty_pages);
 	}
@@ -350,3 +353,60 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+void btrfs_transaction_cleaner(struct work_struct *work)
+{
+	struct btrfs_fs_info *fs_info = container_of(work,
+						     struct btrfs_fs_info,
+						     trans_work.work);
+
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_transaction *cur;
+	struct btrfs_trans_handle *trans;
+	unsigned long now;
+	unsigned long delay = HZ * 30;
+	int ret;
+
+printk("btrfs transaction cleaner\n");
+	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->trans_mutex);
+	cur = root->fs_info->running_transaction;
+	if (!cur) {
+		mutex_unlock(&root->fs_info->trans_mutex);
+		goto out;
+	}
+	now = get_seconds();
+	if (now < cur->start_time || now - cur->start_time < 30) {
+		mutex_unlock(&root->fs_info->trans_mutex);
+		delay = HZ * 5;
+		goto out;
+	}
+	mutex_unlock(&root->fs_info->trans_mutex);
+printk("forcing commit\n");
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_commit_transaction(trans, root);
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_transaction_queue_work(root, delay);
+}
+
+void btrfs_transaction_queue_work(struct btrfs_root *root, int delay)
+{
+	queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
+}
+
+void btrfs_transaction_flush_work(struct btrfs_root *root)
+{
+	cancel_rearming_delayed_workqueue(trans_wq, &root->fs_info->trans_work);
+	flush_workqueue(trans_wq);
+}
+
+void __init btrfs_init_transaction_sys(void)
+{
+	trans_wq = create_workqueue("btrfs");
+}
+
+void __exit btrfs_exit_transaction_sys(void)
+{
+	destroy_workqueue(trans_wq);
+}
+
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index b5378119e76..0b08208be85 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -11,6 +11,7 @@ struct btrfs_transaction {
 	int magic;
 	struct list_head list;
 	struct radix_tree_root dirty_pages;
+	unsigned long start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
@@ -47,4 +48,11 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
+
+void btrfs_transaction_cleaner(struct work_struct *work);
+void btrfs_transaction_flush_work(struct btrfs_root *root);
+void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
+void btrfs_init_transaction_sys(void);
+void btrfs_exit_transaction_sys(void);
+
 #endif
-- 
cgit v1.2.3


From facda1e787d43191a3368c322f682054991c41b8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 8 Jun 2007 18:11:48 -0400
Subject: Btrfs: get forced transaction commits via workqueue

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  4 +++-
 fs/btrfs/disk-io.c     |  4 ++++
 fs/btrfs/transaction.c | 30 ++++++++++++++++++++----------
 fs/btrfs/transaction.h |  3 ---
 4 files changed, 27 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 74e6aadf6bb..e93ba1a5c81 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -283,6 +283,7 @@ struct btrfs_block_group_cache {
 struct crypto_hash;
 
 struct btrfs_fs_info {
+	spinlock_t hash_lock;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *dev_root;
@@ -308,10 +309,11 @@ struct btrfs_fs_info {
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
 	struct list_head trans_list;
+	struct list_head dead_roots;
 	struct crypto_hash *hash_tfm;
 	struct delayed_work trans_work;
-	spinlock_t hash_lock;
 	int do_barriers;
+	int closing;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 048282f06af..751069c0e9f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -563,6 +563,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
+	INIT_LIST_HEAD(&fs_info->dead_roots);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
@@ -577,6 +578,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->do_barriers = 1;
 	fs_info->extent_tree_insert_nr = 0;
 	fs_info->extent_tree_prealloc_nr = 0;
+	fs_info->closing = 0;
+
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
@@ -746,6 +749,7 @@ int close_ctree(struct btrfs_root *root)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
+	fs_info->closing = 1;
 	btrfs_transaction_flush_work(root);
 	mutex_lock(&fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bf7eef67ba0..b859db395fd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -12,12 +12,10 @@ static struct workqueue_struct *trans_wq;
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
-#define TRANS_MAGIC 0xE1E10E
 static void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(transaction->use_count == 0);
 	transaction->use_count--;
-	WARN_ON(transaction->magic != TRANS_MAGIC);
 	if (transaction->use_count == 0) {
 		WARN_ON(total_trans == 0);
 		total_trans--;
@@ -42,7 +40,6 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->transid = root->fs_info->generation;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
-		cur_trans->magic = TRANS_MAGIC;
 		cur_trans->in_commit = 0;
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
@@ -83,7 +80,6 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	h->block_group = NULL;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
-	h->magic = h->magic2 = TRANS_MAGIC;
 	return h;
 }
 
@@ -92,8 +88,6 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans;
 
-	WARN_ON(trans->magic != TRANS_MAGIC);
-	WARN_ON(trans->magic2 != TRANS_MAGIC);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
 	WARN_ON(cur_trans->num_writers < 1);
@@ -257,8 +251,8 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
 	int ret;
-
 	while(!list_empty(list)) {
+		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 		trans = btrfs_start_transaction(tree_root, 1);
@@ -271,6 +265,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 		kfree(dirty);
+		mutex_unlock(&tree_root->fs_info->fs_mutex);
 	}
 	return 0;
 }
@@ -346,10 +341,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
+	if (root->fs_info->closing)
+		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
+	else
+		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
+	if (root->fs_info->closing) {
+		mutex_unlock(&root->fs_info->fs_mutex);
+		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
+		mutex_lock(&root->fs_info->fs_mutex);
+	}
 	return ret;
 }
 
@@ -362,11 +365,19 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_transaction *cur;
 	struct btrfs_trans_handle *trans;
+	struct list_head dirty_roots;
 	unsigned long now;
 	unsigned long delay = HZ * 30;
 	int ret;
 
-printk("btrfs transaction cleaner\n");
+	INIT_LIST_HEAD(&dirty_roots);
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	if (!list_empty(&dirty_roots)) {
+		drop_dirty_roots(root, &dirty_roots);
+	}
 	mutex_lock(&root->fs_info->fs_mutex);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur = root->fs_info->running_transaction;
@@ -381,7 +392,6 @@ printk("btrfs transaction cleaner\n");
 		goto out;
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
-printk("forcing commit\n");
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 out:
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b08208be85..f25b4900db4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -8,7 +8,6 @@ struct btrfs_transaction {
 	int in_commit;
 	int use_count;
 	int commit_done;
-	int magic;
 	struct list_head list;
 	struct radix_tree_root dirty_pages;
 	unsigned long start_time;
@@ -17,13 +16,11 @@ struct btrfs_transaction {
 };
 
 struct btrfs_trans_handle {
-	int magic;
 	u64 transid;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_group_cache *block_group;
-	int magic2;
 };
 
 
-- 
cgit v1.2.3


From ad693af684757478e5fcb225aef319cab7ba6c75 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 9 Jun 2007 08:19:57 -0400
Subject: Btrfs: reap dead roots right after commit

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b859db395fd..a2c5820f1d3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -371,13 +371,6 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	int ret;
 
 	INIT_LIST_HEAD(&dirty_roots);
-	mutex_lock(&root->fs_info->trans_mutex);
-	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
-
-	if (!list_empty(&dirty_roots)) {
-		drop_dirty_roots(root, &dirty_roots);
-	}
 	mutex_lock(&root->fs_info->fs_mutex);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur = root->fs_info->running_transaction;
@@ -396,6 +389,14 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	ret = btrfs_commit_transaction(trans, root);
 out:
 	mutex_unlock(&root->fs_info->fs_mutex);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	if (!list_empty(&dirty_roots)) {
+		drop_dirty_roots(root, &dirty_roots);
+	}
 	btrfs_transaction_queue_work(root, delay);
 }
 
-- 
cgit v1.2.3


From 0cf6c620176a294bdf1bedddc492f6ae857e0019 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 9 Jun 2007 09:22:25 -0400
Subject: Btrfs: remove device tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 105 +----------------------
 fs/btrfs/disk-io.c     | 224 ++-----------------------------------------------
 fs/btrfs/super.c       |  88 -------------------
 fs/btrfs/transaction.c |   6 --
 4 files changed, 10 insertions(+), 413 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e93ba1a5c81..5ab25a0cb16 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,11 +16,10 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_MAGIC "_BtRfS_M"
 
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
-#define BTRFS_DEV_TREE_OBJECTID 2ULL
-#define BTRFS_EXTENT_TREE_OBJECTID 3ULL
-#define BTRFS_FS_TREE_OBJECTID 4ULL
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 5ULL
-#define BTRFS_FIRST_FREE_OBJECTID 6ULL
+#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+#define BTRFS_FS_TREE_OBJECTID 3ULL
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
+#define BTRFS_FIRST_FREE_OBJECTID 5ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
@@ -111,12 +110,6 @@ struct btrfs_super_block {
 	__le64 total_blocks;
 	__le64 blocks_used;
 	__le64 root_dir_objectid;
-	__le64 last_device_id;
-	/* fields below here vary with the underlying disk */
-	__le64 device_block_start;
-	__le64 device_num_blocks;
-	__le64 device_root;
-	__le64 device_id;
 } __attribute__ ((__packed__));
 
 /*
@@ -251,11 +244,6 @@ struct btrfs_csum_item {
 	u8 csum;
 } __attribute__ ((__packed__));
 
-struct btrfs_device_item {
-	__le16 pathlen;
-	__le64 device_id;
-} __attribute__ ((__packed__));
-
 /* tag for the radix tree of block groups in ram */
 #define BTRFS_BLOCK_GROUP_DIRTY 0
 #define BTRFS_BLOCK_GROUP_AVAIL 1
@@ -286,11 +274,9 @@ struct btrfs_fs_info {
 	spinlock_t hash_lock;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
-	struct btrfs_root *dev_root;
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
-	struct radix_tree_root dev_radix;
 	struct radix_tree_root block_group_radix;
 	struct radix_tree_root block_group_data_radix;
 	struct radix_tree_root extent_map_radix;
@@ -385,11 +371,6 @@ struct btrfs_root {
  */
 #define BTRFS_BLOCK_GROUP_ITEM_KEY 34
 
-/*
- * dev items list the devices that make up the FS
- */
-#define BTRFS_DEV_ITEM_KEY	35
-
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -880,62 +861,6 @@ static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64
 	s->root_dir_objectid = cpu_to_le64(val);
 }
 
-static inline u64 btrfs_super_last_device_id(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->last_device_id);
-}
-
-static inline void btrfs_set_super_last_device_id(struct btrfs_super_block *s,
-						  u64 val)
-{
-	s->last_device_id = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_super_device_id(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->device_id);
-}
-
-static inline void btrfs_set_super_device_id(struct btrfs_super_block *s,
-						  u64 val)
-{
-	s->device_id = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_super_device_block_start(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->device_block_start);
-}
-
-static inline void btrfs_set_super_device_block_start(struct btrfs_super_block
-						      *s, u64 val)
-{
-	s->device_block_start = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_super_device_num_blocks(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->device_num_blocks);
-}
-
-static inline void btrfs_set_super_device_num_blocks(struct btrfs_super_block
-						     *s, u64 val)
-{
-	s->device_num_blocks = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_super_device_root(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->device_root);
-}
-
-static inline void btrfs_set_super_device_root(struct btrfs_super_block
-						      *s, u64 val)
-{
-	s->device_root = cpu_to_le64(val);
-}
-
-
 static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
 {
 	return (u8 *)l->items;
@@ -1031,28 +956,6 @@ static inline void btrfs_set_file_extent_num_blocks(struct
 	e->num_blocks = cpu_to_le64(val);
 }
 
-static inline u16 btrfs_device_pathlen(struct btrfs_device_item *d)
-{
-	return le16_to_cpu(d->pathlen);
-}
-
-static inline void btrfs_set_device_pathlen(struct btrfs_device_item *d,
-						u16 val)
-{
-	d->pathlen = cpu_to_le16(val);
-}
-
-static inline u64 btrfs_device_id(struct btrfs_device_item *d)
-{
-	return le64_to_cpu(d->device_id);
-}
-
-static inline void btrfs_set_device_id(struct btrfs_device_item *d,
-						u64 val)
-{
-	d->device_id = cpu_to_le64(val);
-}
-
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 751069c0e9f..8a88404525e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -11,47 +11,9 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 
-struct dev_lookup {
-	u64 block_start;
-	u64 num_blocks;
-	u64 device_id;
-	struct block_device *bdev;
-};
-
-int btrfs_insert_dev_radix(struct btrfs_root *root,
-			   struct block_device *bdev,
-			   u64 device_id,
-			   u64 block_start,
-			   u64 num_blocks)
-{
-	struct dev_lookup *lookup;
-	int ret;
-
-	lookup = kmalloc(sizeof(*lookup), GFP_NOFS);
-	if (!lookup)
-		return -ENOMEM;
-	lookup->block_start = block_start;
-	lookup->num_blocks = num_blocks;
-	lookup->bdev = bdev;
-	lookup->device_id = device_id;
-
-	ret = radix_tree_insert(&root->fs_info->dev_radix, block_start +
-				num_blocks - 1, lookup);
-	return ret;
-}
-
 u64 bh_blocknr(struct buffer_head *bh)
 {
-	int blkbits = bh->b_page->mapping->host->i_blkbits;
-	u64 blocknr = bh->b_page->index << (PAGE_CACHE_SHIFT - blkbits);
-	unsigned long offset;
-
-	if (PageHighMem(bh->b_page))
-		offset = (unsigned long)bh->b_data;
-	else
-		offset = bh->b_data - (char *)page_address(bh->b_page);
-	blocknr += offset >> (PAGE_CACHE_SHIFT - blkbits);
-	return blocknr;
+	return bh->b_blocknr;
 }
 
 static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
@@ -102,32 +64,14 @@ out_unlock:
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical)
 {
-	struct dev_lookup *lookup[2];
-
-	int ret;
-
 	if (logical == 0) {
 		bh->b_bdev = NULL;
 		bh->b_blocknr = 0;
 		set_buffer_mapped(bh);
-		return 0;
-	}
-	root = root->fs_info->dev_root;
-	ret = radix_tree_gang_lookup(&root->fs_info->dev_radix,
-				     (void **)lookup,
-				     (unsigned long)logical,
-				     ARRAY_SIZE(lookup));
-	if (ret == 0 || lookup[0]->block_start > logical ||
-	    lookup[0]->block_start + lookup[0]->num_blocks <= logical) {
-		ret = -ENOENT;
-		goto out;
+	} else {
+		map_bh(bh, root->fs_info->sb, logical);
 	}
-	bh->b_bdev = lookup[0]->bdev;
-	bh->b_blocknr = logical - lookup[0]->block_start;
-	set_buffer_mapped(bh);
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
 
 struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -382,24 +326,18 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	u64 highest_inode;
 	int ret = 0;
 
-printk("read_fs_root looking for %Lu %Lu %u\n", location->objectid, location->offset, location->flags);
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
-	if (root) {
-printk("found %p in cache\n", root);
+	if (root)
 		return root;
-	}
 	root = kmalloc(sizeof(*root), GFP_NOFS);
-	if (!root) {
-printk("failed1\n");
+	if (!root)
 		return ERR_PTR(-ENOMEM);
-	}
 	if (location->offset == (u64)-1) {
 		ret = find_and_setup_root(fs_info->sb->s_blocksize,
 					  fs_info->tree_root, fs_info,
 					  location->objectid, root);
 		if (ret) {
-printk("failed2\n");
 			kfree(root);
 			return ERR_PTR(ret);
 		}
@@ -413,7 +351,6 @@ printk("failed2\n");
 	BUG_ON(!path);
 	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
 	if (ret != 0) {
-printk("internal search_slot gives us %d\n", ret);
 		if (ret > 0)
 			ret = -ENOENT;
 		goto out;
@@ -435,13 +372,11 @@ out:
 				     btrfs_root_blocknr(&root->root_item));
 	BUG_ON(!root->node);
 insert:
-printk("inserting %p\n", root);
 	root->ref_cows = 1;
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
 	if (ret) {
-printk("radix_tree_insert gives us %d\n", ret);
 		brelse(root->node);
 		kfree(root);
 		return ERR_PTR(ret);
@@ -450,116 +385,25 @@ printk("radix_tree_insert gives us %d\n", ret);
 	if (ret == 0) {
 		root->highest_inode = highest_inode;
 		root->last_inode_alloc = highest_inode;
-printk("highest inode is %Lu\n", highest_inode);
 	}
-printk("all worked\n");
 	return root;
 }
 
-static int btrfs_open_disk(struct btrfs_root *root, u64 device_id,
-			   u64 block_start, u64 num_blocks,
-			   char *filename, int name_len)
-{
-	char *null_filename;
-	struct block_device *bdev;
-	int ret;
-
-	null_filename = kmalloc(name_len + 1, GFP_NOFS);
-	if (!null_filename)
-		return -ENOMEM;
-	memcpy(null_filename, filename, name_len);
-	null_filename[name_len] = '\0';
-
-	bdev = open_bdev_excl(null_filename, O_RDWR, root->fs_info->sb);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		goto out;
-	}
-	set_blocksize(bdev, root->fs_info->sb->s_blocksize);
-	ret = btrfs_insert_dev_radix(root, bdev, device_id,
-				     block_start, num_blocks);
-	BUG_ON(ret);
-	ret = 0;
-out:
-	kfree(null_filename);
-	return ret;
-}
-
-static int read_device_info(struct btrfs_root *root)
-{
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
-	struct btrfs_device_item *dev_item;
-	int nritems;
-	int slot;
-
-	root = root->fs_info->dev_root;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	key.objectid = 0;
-	key.offset = 0;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	nritems = btrfs_header_nritems(&leaf->header);
-	while(1) {
-		slot = path->slots[0];
-		if (slot >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				break;
-			leaf = btrfs_buffer_leaf(path->nodes[0]);
-			nritems = btrfs_header_nritems(&leaf->header);
-			slot = path->slots[0];
-		}
-		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
-		if (btrfs_key_type(&key) != BTRFS_DEV_ITEM_KEY) {
-			path->slots[0]++;
-			continue;
-		}
-		dev_item = btrfs_item_ptr(leaf, slot, struct btrfs_device_item);
-printk("found key %Lu %Lu\n", key.objectid, key.offset);
-		if (btrfs_device_id(dev_item) !=
-		    btrfs_super_device_id(root->fs_info->disk_super)) {
-			ret = btrfs_open_disk(root, btrfs_device_id(dev_item),
-					      key.objectid, key.offset,
-					      (char *)(dev_item + 1),
-					      btrfs_device_pathlen(dev_item));
-			BUG_ON(ret);
-		}
-		path->slots[0]++;
-	}
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return 0;
-}
-
 struct btrfs_root *open_ctree(struct super_block *sb)
 {
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
-	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	int ret;
 	struct btrfs_super_block *disk_super;
-	struct dev_lookup *dev_lookup;
 
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
 	init_bit_radix(&fs_info->extent_map_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
-	INIT_RADIX_TREE(&fs_info->dev_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
@@ -568,7 +412,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->running_transaction = NULL;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
-	fs_info->dev_root = dev_root;
 	fs_info->sb = sb;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -595,19 +438,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
-	__setup_root(sb->s_blocksize, dev_root,
-		     fs_info, BTRFS_DEV_TREE_OBJECTID);
-
 	__setup_root(sb->s_blocksize, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-	dev_lookup = kmalloc(sizeof(*dev_lookup), GFP_NOFS);
-	dev_lookup->block_start = 0;
-	dev_lookup->num_blocks = (u32)-2;
-	dev_lookup->bdev = sb->s_bdev;
-	dev_lookup->device_id = 0;
-	ret = radix_tree_insert(&fs_info->dev_radix, (u32)-2, dev_lookup);
-	BUG_ON(ret);
 	fs_info->sb_buffer = read_tree_block(tree_root,
 					     BTRFS_SUPER_INFO_OFFSET /
 					     sb->s_blocksize);
@@ -622,24 +455,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		     btrfs_super_total_blocks(disk_super) <<
 		     fs_info->btree_inode->i_blkbits);
 
-	radix_tree_delete(&fs_info->dev_radix, (u32)-2);
-	dev_lookup->block_start = btrfs_super_device_block_start(disk_super);
-	dev_lookup->num_blocks = btrfs_super_device_num_blocks(disk_super);
-	dev_lookup->device_id = btrfs_super_device_id(disk_super);
-
-	ret = radix_tree_insert(&fs_info->dev_radix,
-				dev_lookup->block_start +
-				dev_lookup->num_blocks - 1, dev_lookup);
-	BUG_ON(ret);
-
 	fs_info->disk_super = disk_super;
-
-	dev_root->node = read_tree_block(tree_root,
-					  btrfs_super_device_root(disk_super));
-
-	ret = read_device_info(dev_root);
-	BUG_ON(ret);
-
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
 	BUG_ON(!tree_root->node);
@@ -719,30 +535,6 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-static int free_dev_radix(struct btrfs_fs_info *fs_info)
-{
-	struct dev_lookup *lookup[8];
-	struct block_device *super_bdev = fs_info->sb->s_bdev;
-	int ret;
-	int i;
-	while(1) {
-		ret = radix_tree_gang_lookup(&fs_info->dev_radix,
-					     (void **)lookup, 0,
-					     ARRAY_SIZE(lookup));
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			if (lookup[i]->bdev != super_bdev)
-				close_bdev_excl(lookup[i]->bdev);
-			radix_tree_delete(&fs_info->dev_radix,
-					  lookup[i]->block_start +
-					  lookup[i]->num_blocks - 1);
-			kfree(lookup[i]);
-		}
-	}
-	return 0;
-}
-
 int close_ctree(struct btrfs_root *root)
 {
 	int ret;
@@ -765,9 +557,6 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->extent_root->node)
 		btrfs_block_release(fs_info->extent_root,
 				    fs_info->extent_root->node);
-	if (fs_info->dev_root->node)
-		btrfs_block_release(fs_info->dev_root,
-				    fs_info->dev_root->node);
 	if (fs_info->tree_root->node)
 		btrfs_block_release(fs_info->tree_root,
 				    fs_info->tree_root->node);
@@ -776,7 +565,6 @@ int close_ctree(struct btrfs_root *root)
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
 
-	free_dev_radix(fs_info);
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 	kfree(fs_info->extent_root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5864917953a..2b24a5a2be6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2566,83 +2566,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	return 0;
 }
 
-static int add_disk(struct btrfs_root *root, char *name, int namelen)
-{
-	struct block_device *bdev;
-	struct btrfs_path *path;
-	struct super_block *sb = root->fs_info->sb;
-	struct btrfs_root *dev_root = root->fs_info->dev_root;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_device_item *dev_item;
-	struct btrfs_key key;
-	u16 item_size;
-	u64 num_blocks;
-	u64 new_blocks;
-	u64 device_id;
-	int ret;
-
-printk("adding disk %s\n", name);
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	num_blocks = btrfs_super_total_blocks(root->fs_info->disk_super);
-	bdev = open_bdev_excl(name, O_RDWR, sb);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-printk("open bdev excl failed ret %d\n", ret);
-		goto out_nolock;
-	}
-	set_blocksize(bdev, sb->s_blocksize);
-	new_blocks = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
-	key.objectid = num_blocks;
-	key.offset = new_blocks;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
-
-	mutex_lock(&dev_root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(dev_root, 1);
-	item_size = sizeof(*dev_item) + namelen;
-printk("insert empty on %Lu %Lu %u size %d\n", num_blocks, new_blocks, key.flags, item_size);
-	ret = btrfs_insert_empty_item(trans, dev_root, path, &key, item_size);
-	if (ret) {
-printk("insert failed %d\n", ret);
-		close_bdev_excl(bdev);
-		if (ret > 0)
-			ret = -EEXIST;
-		goto out;
-	}
-	dev_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0], struct btrfs_device_item);
-	btrfs_set_device_pathlen(dev_item, namelen);
-	memcpy(dev_item + 1, name, namelen);
-
-	device_id = btrfs_super_last_device_id(root->fs_info->disk_super) + 1;
-	btrfs_set_super_last_device_id(root->fs_info->disk_super, device_id);
-	btrfs_set_device_id(dev_item, device_id);
-	mark_buffer_dirty(path->nodes[0]);
-
-	ret = btrfs_insert_dev_radix(root, bdev, device_id, num_blocks,
-				     new_blocks);
-
-	if (!ret) {
-		btrfs_set_super_total_blocks(root->fs_info->disk_super,
-					     num_blocks + new_blocks);
-		i_size_write(root->fs_info->btree_inode,
-			     (num_blocks + new_blocks) <<
-			     root->fs_info->btree_inode->i_blkbits);
-	}
-
-out:
-	ret = btrfs_commit_transaction(trans, dev_root);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
-out_nolock:
-	btrfs_free_path(path);
-	btrfs_btree_balance_dirty(root);
-
-	return ret;
-}
-
 static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		       cmd, unsigned long arg)
 {
@@ -2682,17 +2605,6 @@ static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 			ret = create_snapshot(root, vol_args.name, namelen);
 		WARN_ON(ret);
 		break;
-	case BTRFS_IOC_ADD_DISK:
-		if (copy_from_user(&vol_args,
-				   (struct btrfs_ioctl_vol_args __user *)arg,
-				   sizeof(vol_args)))
-			return -EFAULT;
-		namelen = strlen(vol_args.name);
-		if (namelen > BTRFS_VOL_NAME_MAX)
-			return -EINVAL;
-		vol_args.name[namelen] = '\0';
-		ret = add_disk(root, vol_args.name, namelen);
-		break;
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a2c5820f1d3..4f3c8ddcb40 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -150,13 +150,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_root *dev_root = fs_info->dev_root;
 
-	if (btrfs_super_device_root(fs_info->disk_super) !=
-	    bh_blocknr(dev_root->node)) {
-		btrfs_set_super_device_root(fs_info->disk_super,
-					    bh_blocknr(dev_root->node));
-	}
 	btrfs_write_dirty_block_groups(trans, extent_root);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-- 
cgit v1.2.3


From 5276aedab0baacfb3c5483208b8be85a8416bd5f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 11 Jun 2007 21:33:38 -0400
Subject: Btrfs: fix oops after block group lookup

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/TODO          |  1 -
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/extent-tree.c | 29 ++++++++++++++++-------------
 fs/btrfs/super.c       |  8 ++------
 4 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 61d25b5de28..d9b6d38c603 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -16,6 +16,5 @@
 * Use relocation to try and fix write errors
 * Make allocator much smarter
 * xattrs (directory streams for regular files)
-* fsck
 * Scrub & defrag
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ab25a0cb16..4e136b7b03a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -998,6 +998,9 @@ static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
+							 btrfs_fs_info *info,
+							 u64 blocknr);
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 645a4228bfe..f509ffa38d0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -135,9 +135,9 @@ printk("cache block group %Lu\n", block_group->key.objectid);
 	return 0;
 }
 
-static struct btrfs_block_group_cache *lookup_block_group(struct
-							  btrfs_fs_info *info,
-							  u64 blocknr)
+struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
+							 btrfs_fs_info *info,
+							 u64 blocknr)
 {
 	struct btrfs_block_group_cache *block_group;
 	int ret;
@@ -208,7 +208,8 @@ out:
 	return max(cache->last_alloc, search_start);
 
 new_group:
-	cache = lookup_block_group(root->fs_info, last + cache->key.offset - 1);
+	cache = btrfs_lookup_block_group(root->fs_info,
+					 last + cache->key.offset - 1);
 	if (!cache) {
 		return max((*cache_ret)->last_alloc, search_start);
 	}
@@ -250,7 +251,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
-		shint = lookup_block_group(info, search_start);
+		shint = btrfs_lookup_block_group(info, search_start);
 		if (shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
@@ -576,7 +577,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	int ret;
 
 	while(total) {
-		cache = lookup_block_group(info, blocknr);
+		cache = btrfs_lookup_block_group(info, blocknr);
 		if (!cache) {
 			printk(KERN_CRIT "blocknr %Lu lookup failed\n",
 			       blocknr);
@@ -677,8 +678,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
-			block_group = lookup_block_group(root->fs_info,
-							 gang[i]);
+			block_group = btrfs_lookup_block_group(root->fs_info,
+							       gang[i]);
 			if (block_group) {
 				WARN_ON(block_group->pinned == 0);
 				block_group->pinned--;
@@ -751,7 +752,8 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
 		if (!err) {
 			struct btrfs_block_group_cache *cache;
-			cache = lookup_block_group(root->fs_info, blocknr);
+			cache = btrfs_lookup_block_group(root->fs_info,
+							 blocknr);
 			if (cache)
 				cache->pinned++;
 		}
@@ -851,7 +853,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 		for (i = 0; i < ret; i++) {
 			wret = set_radix_bit(pinned_radix, gang[i]);
 			if (wret == 0) {
-				cache = lookup_block_group(extent_root->fs_info,
+				cache =
+				  btrfs_lookup_block_group(extent_root->fs_info,
 							   gang[i]);
 				if (cache)
 					cache->pinned++;
@@ -938,7 +941,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_blocks(info->disk_super);
 	if (hint_block) {
-		block_group = lookup_block_group(info, hint_block);
+		block_group = btrfs_lookup_block_group(info, hint_block);
 		block_group = btrfs_find_block_group(root, block_group,
 						     hint_block, data, 1);
 	} else {
@@ -1118,7 +1121,7 @@ check_pending:
 		info->extent_tree_prealloc_nr = total_found;
 	}
 	if (!data) {
-		block_group = lookup_block_group(info, ins->objectid);
+		block_group = btrfs_lookup_block_group(info, ins->objectid);
 		if (block_group) {
 			if (fill_prealloc)
 				block_group->last_prealloc =
@@ -1143,7 +1146,7 @@ new_group:
 		else
 			wrapped = 1;
 	}
-	block_group = lookup_block_group(info, search_start);
+	block_group = btrfs_lookup_block_group(info, search_start);
 	cond_resched();
 	if (!full_scan)
 		block_group = btrfs_find_block_group(root, block_group,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2b24a5a2be6..9749090cfbb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -62,7 +62,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
-	struct btrfs_block_group_cache *alloc_group;
 	u64 alloc_group_block;
 	int ret;
 
@@ -95,11 +94,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
 	alloc_group_block = btrfs_inode_block_group(inode_item);
-	ret = radix_tree_gang_lookup(&root->fs_info->block_group_radix,
-				     (void **)&alloc_group,
-				     alloc_group_block, 1);
-	BUG_ON(!ret);
-	BTRFS_I(inode)->block_group = alloc_group;
+	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
+						       alloc_group_block);
 
 	btrfs_free_path(path);
 	inode_item = NULL;
-- 
cgit v1.2.3


From 39279cc3d2704cfbf9c35dcb5bdd392159ae4625 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 06:35:45 -0400
Subject: Btrfs: split up super.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile  |    4 +-
 fs/btrfs/TODO      |    1 +
 fs/btrfs/ctree.h   |   25 +-
 fs/btrfs/disk-io.c |   46 +-
 fs/btrfs/file.c    |  783 ++++++++++++++
 fs/btrfs/inode.c   | 2210 ++++++++++++++++++++++++++++++++++++++
 fs/btrfs/super.c   | 2979 ++--------------------------------------------------
 7 files changed, 3120 insertions(+), 2928 deletions(-)
 create mode 100644 fs/btrfs/file.c
 create mode 100644 fs/btrfs/inode.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 5346f706b2c..5af2426116d 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o
+	   transaction.o bit-radix.o inode.o file.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
@@ -16,7 +16,7 @@ else
 
 KERNELDIR := /lib/modules/`uname -r`/build
 all::
-	$(MAKE) C=1 -C $(KERNELDIR) M=`pwd` modules
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
 clean::
 	rm *.o btrfs.ko
 endif
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index d9b6d38c603..4a48a2365d5 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,3 +1,4 @@
+* fix printk warnings
 * cleanup, add more error checking, get rid of BUG_ONs
 * Fix ENOSPC handling
 * Make allocator smarter
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4e136b7b03a..24410d42e3c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1123,5 +1123,28 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
-/* super.c */
+/* inode.c */
+void btrfs_delete_inode(struct inode *inode);
+void btrfs_read_locked_inode(struct inode *inode);
+int btrfs_write_inode(struct inode *inode, int wait);
+void btrfs_dirty_inode(struct inode *inode);
+struct inode *btrfs_alloc_inode(struct super_block *sb);
+void btrfs_destroy_inode(struct inode *inode);
+int btrfs_init_cachep(void);
+void btrfs_destroy_cachep(void);
+int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+		unsigned long arg);
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg);
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root);
+int btrfs_commit_write(struct file *file, struct page *page,
+		       unsigned from, unsigned to);
+int btrfs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *result, int create);
+/* file.c */
+extern struct file_operations btrfs_file_operations;
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode,
+		       u64 start, u64 end, u64 *hint_block);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8a88404525e..96bf3ef3a79 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -22,7 +22,7 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 	if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) {
 		printk(KERN_CRIT "bh_blocknr(buf) is %Lu, header is %Lu\n",
 		       bh_blocknr(buf), btrfs_header_blocknr(&node->header));
-		BUG();
+		return 1;
 	}
 	return 0;
 }
@@ -253,7 +253,7 @@ uptodate:
 		set_buffer_checked(bh);
 	}
 	if (check_tree_block(root, bh))
-		BUG();
+		goto fail;
 	return bh;
 fail:
 	brelse(bh);
@@ -398,8 +398,13 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	int ret;
+	int err = -EIO;
 	struct btrfs_super_block *disk_super;
 
+	if (!extent_root || !tree_root || !fs_info) {
+		err = -ENOMEM;
+		goto fail;
+	}
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
 	init_bit_radix(&fs_info->extent_map_radix);
@@ -431,9 +436,11 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->hash_tfm = crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC);
 	spin_lock_init(&fs_info->hash_lock);
+
 	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
-		printk("failed to allocate digest hash\n");
-		return NULL;
+		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
+		err = -ENOMEM;
+		goto fail_iput;
 	}
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
@@ -446,30 +453,53 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 					     sb->s_blocksize);
 
 	if (!fs_info->sb_buffer)
-		return NULL;
+		goto fail_iput;
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
+
 	if (!btrfs_super_root(disk_super))
-		return NULL;
+		goto fail_sb_buffer;
 
 	i_size_write(fs_info->btree_inode,
 		     btrfs_super_total_blocks(disk_super) <<
 		     fs_info->btree_inode->i_blkbits);
 
 	fs_info->disk_super = disk_super;
+
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic))) {
+		printk("btrfs: valid FS not found on %s\n", sb->s_id);
+		goto fail_sb_buffer;
+	}
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super));
-	BUG_ON(!tree_root->node);
+	if (!tree_root->node)
+		goto fail_sb_buffer;
 
 	mutex_lock(&fs_info->fs_mutex);
 	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-	BUG_ON(ret);
+	if (ret) {
+		mutex_unlock(&fs_info->fs_mutex);
+		goto fail_tree_root;
+	}
 
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
+
+fail_tree_root:
+	btrfs_block_release(tree_root, tree_root->node);
+fail_sb_buffer:
+	btrfs_block_release(tree_root, fs_info->sb_buffer);
+fail_iput:
+	iput(fs_info->btree_inode);
+fail:
+	kfree(extent_root);
+	kfree(tree_root);
+	kfree(fs_info);
+	return ERR_PTR(err);
 }
 
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
new file mode 100644
index 00000000000..7f8e3035d99
--- /dev/null
+++ b/fs/btrfs/file.c
@@ -0,0 +1,783 @@
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+
+
+static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
+				struct page **prepared_pages,
+				const char __user * buf)
+{
+	/* only the first page can start part way in */
+	int page_off = pos & (PAGE_CACHE_SIZE - 1);
+	long uncopied = 0;
+	int nr = 0;
+
+	/* stop after the first short copy, or once everything is copied */
+	while (!uncopied && nr < num_pages && write_bytes > 0) {
+		struct page *cur = prepared_pages[nr++];
+		size_t chunk = min_t(size_t,
+				     PAGE_CACHE_SIZE - page_off, write_bytes);
+
+		/* touch the user buffer before the page gets mapped */
+		fault_in_pages_readable(buf, chunk);
+		kmap(cur);
+		uncopied = __copy_from_user(page_address(cur) + page_off,
+					    buf, chunk);
+		/* flush the dcache for the page that was just written */
+		flush_dcache_page(cur);
+		kunmap(cur);
+		buf += chunk;
+		write_bytes -= chunk;
+		page_off = 0;
+	}
+	return uncopied ? -EFAULT : 0;
+}
+
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+{
+	size_t idx = 0;
+	/* unlock and release each page, stopping at the first empty slot */
+	while (idx < num_pages && pages[idx]) {
+		struct page *pg = pages[idx++];
+		unlock_page(pg);
+		mark_page_accessed(pg);
+		page_cache_release(pg);
+	}
+}
+
+/*
+ * Per page: start a transaction, insert an inline extent item (first buffer
+ * mapped to block 0) or checksum the data (mapped elsewhere), then call
+ * btrfs_commit_write(), whose first error is returned.  Page indexes are
+ * widened to u64 before shifting so offsets past 4GB survive on 32-bit.
+ */
+static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root, struct file *file,
+				   struct page **pages, size_t num_pages,
+				   loff_t pos, size_t write_bytes)
+{
+	int i, offset, this_write;
+	int err = 0, ret;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct buffer_head *bh;
+	struct btrfs_file_extent_item *ei;
+
+	for (i = 0; i < num_pages; i++) {
+		offset = pos & (PAGE_CACHE_SIZE -1);
+		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		/* FIXME, one block at a time */
+
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_set_trans_block_group(trans, inode);
+
+		bh = page_buffers(pages[i]);
+
+		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
+			struct btrfs_key key;
+			struct btrfs_path *path;
+			char *ptr;
+			u32 datasize;
+
+			/* create an inline extent, and copy the data in */
+			path = btrfs_alloc_path();
+			BUG_ON(!path);
+			key.objectid = inode->i_ino;
+			key.offset = (u64)pages[i]->index << PAGE_CACHE_SHIFT;
+			key.flags = 0;
+			btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+			BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
+			datasize = offset +
+				btrfs_file_extent_calc_inline_size(write_bytes);
+
+			ret = btrfs_insert_empty_item(trans, root, path, &key,
+						      datasize);
+			BUG_ON(ret);
+			ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+			       path->slots[0], struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(ei, trans->transid);
+			btrfs_set_file_extent_type(ei,
+						   BTRFS_FILE_EXTENT_INLINE);
+			ptr = btrfs_file_extent_inline_start(ei);
+			btrfs_memcpy(root, path->nodes[0]->b_data,
+				     ptr, bh->b_data, offset + write_bytes);
+			mark_buffer_dirty(path->nodes[0]);
+			btrfs_free_path(path);
+		} else if (buffer_mapped(bh)) {
+			/* csum the file data */
+			btrfs_csum_file_block(trans, root, inode->i_ino,
+				      (u64)pages[i]->index << PAGE_CACHE_SHIFT,
+				      kmap(pages[i]), PAGE_CACHE_SIZE);
+			kunmap(pages[i]);
+		}
+		SetPageChecked(pages[i]);
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->fs_mutex);
+
+		ret = btrfs_commit_write(file, pages[i], offset,
+					 offset + this_write);
+		pos += this_write;
+		if (ret) {
+			err = ret;
+			goto failed;
+		}
+		WARN_ON(this_write > write_bytes);
+		write_bytes -= this_write;
+	}
+failed:
+	return err;
+}
+
+/*
+ * Drop every file extent item of the inode that overlaps the byte range
+ * [start, end).  An extent that only partly overlaps is truncated (its
+ * head is kept) and/or split (a bookend item is inserted at end for its
+ * tail); anything entirely inside the range is deleted from the tree.
+ * *hint_block is set to the disk block of a truncated or deleted regular
+ * extent, as a hint for the block allocator.  Returns 0, -ENOMEM if no
+ * path could be allocated, or the negative error from the extent lookup.
+ */
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode,
+		       u64 start, u64 end, u64 *hint_block)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_leaf *leaf;
+	int slot;
+	struct btrfs_file_extent_item *extent;
+	u64 extent_end = 0;
+	int keep;
+	struct btrfs_file_extent_item old;
+	struct btrfs_path *path;
+	u64 search_start = start;
+	int bookend;
+	int found_type;
+	int found_extent;
+	int found_inline;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	while(1) {
+		btrfs_release_path(root, path);
+		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+					       search_start, -1);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;	/* step back one slot */
+		}
+		keep = 0;
+		bookend = 0;
+		found_extent = 0;
+		found_inline = 0;
+		extent = NULL;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		slot = path->slots[0];
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		if (key.offset >= end || key.objectid != inode->i_ino) {
+			ret = 0;
+			goto out;
+		}
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
+			ret = 0;
+			goto out;
+		}
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(extent);
+		if (found_type == BTRFS_FILE_EXTENT_REG) {
+			extent_end = key.offset +
+				(btrfs_file_extent_num_blocks(extent) <<
+				 inode->i_blkbits);
+			found_extent = 1;
+		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+			found_inline = 1;
+			extent_end = key.offset +
+			     btrfs_file_extent_inline_len(leaf->items + slot);
+		}
+
+		/* we found nothing we can drop */
+		if (!found_extent && !found_inline) {
+			ret = 0;
+			goto out;
+		}
+
+		/* we found nothing inside the range */
+		if (search_start >= extent_end) {
+			ret = 0;
+			goto out;
+		}
+
+		/* FIXME, there's only one inline extent allowed right now */
+		if (found_inline) {
+			u64 mask = root->blocksize - 1;
+			search_start = (extent_end + mask) & ~mask;
+		} else
+			search_start = extent_end;
+		/* end falls inside this extent, so its tail has to be kept */
+		if (end < extent_end && end >= key.offset) {
+			if (found_extent) {
+				u64 disk_blocknr =
+					btrfs_file_extent_disk_blocknr(extent);
+				u64 disk_num_blocks =
+				      btrfs_file_extent_disk_num_blocks(extent);
+				memcpy(&old, extent, sizeof(old));
+				if (disk_blocknr != 0) {
+					ret = btrfs_inc_extent_ref(trans, root,
+					         disk_blocknr, disk_num_blocks);
+					BUG_ON(ret);
+				}
+			}
+			WARN_ON(found_inline);
+			bookend = 1;
+		}
+
+		/* truncate existing extent */
+		if (start > key.offset) {
+			u64 new_num;
+			u64 old_num;
+			keep = 1;	/* head of the extent stays */
+			WARN_ON(start & (root->blocksize - 1));
+			if (found_extent) {
+				new_num = (start - key.offset) >>
+					inode->i_blkbits;
+				old_num = btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
+				if (btrfs_file_extent_disk_blocknr(extent)) {
+					inode->i_blocks -=
+						(old_num - new_num) << 3;
+				}
+				btrfs_set_file_extent_num_blocks(extent,
+								 new_num);
+				mark_buffer_dirty(path->nodes[0]);
+			} else {
+				WARN_ON(1);
+			}
+		}
+		/* delete the entire extent */
+		if (!keep) {
+			u64 disk_blocknr = 0;
+			u64 disk_num_blocks = 0;
+			u64 extent_num_blocks = 0;
+			if (found_extent) {
+				disk_blocknr =
+				      btrfs_file_extent_disk_blocknr(extent);
+				disk_num_blocks =
+				      btrfs_file_extent_disk_num_blocks(extent);
+				extent_num_blocks =
+				      btrfs_file_extent_num_blocks(extent);
+				*hint_block =
+					btrfs_file_extent_disk_blocknr(extent);
+			}
+			ret = btrfs_del_item(trans, root, path);
+			BUG_ON(ret);
+			btrfs_release_path(root, path);
+			extent = NULL;
+			if (found_extent && disk_blocknr != 0) {
+				inode->i_blocks -= extent_num_blocks << 3;
+				ret = btrfs_free_extent(trans, root,
+							disk_blocknr,
+							disk_num_blocks, 0);
+			}
+
+			BUG_ON(ret);
+			if (!bookend && search_start >= end) {
+				ret = 0;
+				goto out;
+			}
+			if (!bookend)
+				continue;
+		}
+		/* create bookend, splitting the extent in two */
+		if (bookend && found_extent) {
+			struct btrfs_key ins;
+			ins.objectid = inode->i_ino;
+			ins.offset = end;
+			ins.flags = 0;
+			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+
+			btrfs_release_path(root, path);
+			ret = btrfs_insert_empty_item(trans, root, path, &ins,
+						      sizeof(*extent));
+			BUG_ON(ret);
+			extent = btrfs_item_ptr(
+				    btrfs_buffer_leaf(path->nodes[0]),
+				    path->slots[0],
+				    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_disk_blocknr(extent,
+				    btrfs_file_extent_disk_blocknr(&old));
+			btrfs_set_file_extent_disk_num_blocks(extent,
+				    btrfs_file_extent_disk_num_blocks(&old));
+
+			btrfs_set_file_extent_offset(extent,
+				    btrfs_file_extent_offset(&old) +
+				    ((end - key.offset) >> inode->i_blkbits));
+			WARN_ON(btrfs_file_extent_num_blocks(&old) <
+				(extent_end - end) >> inode->i_blkbits);
+			btrfs_set_file_extent_num_blocks(extent,
+				    (extent_end - end) >> inode->i_blkbits);
+
+			btrfs_set_file_extent_type(extent,
+						   BTRFS_FILE_EXTENT_REG);
+			btrfs_set_file_extent_generation(extent,
+				    btrfs_file_extent_generation(&old));
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+			if (btrfs_file_extent_disk_blocknr(&old) != 0) {
+				inode->i_blocks +=
+				      btrfs_file_extent_num_blocks(extent) << 3;
+			}
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * this gets pages into the page cache and locks them down.  Each page gets
+ * buffer heads if it has none, and every buffer is passed to
+ * btrfs_map_bh_to_logical() with alloc_extent_start, which is advanced per
+ * buffer when non-zero.  Returns 0, -ENOMEM if a page could not be grabbed,
+ * or the mapping error; on failure the grabbed pages are dropped again.
+ */
+static int prepare_pages(struct btrfs_root *root, struct file *file,
+			 struct page **pages, size_t num_pages,
+			 loff_t pos, unsigned long first_index,
+			 unsigned long last_index, size_t write_bytes,
+			 u64 alloc_extent_start)
+{
+	int i;
+	unsigned long index = pos >> PAGE_CACHE_SHIFT;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int offset;
+	int err = 0;
+	int this_write;
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	loff_t isize = i_size_read(inode);
+
+	memset(pages, 0, num_pages * sizeof(struct page *));
+
+	for (i = 0; i < num_pages; i++) {
+		pages[i] = grab_cache_page(inode->i_mapping, index + i);
+		if (!pages[i]) {
+			err = -ENOMEM;
+			goto failed_release;
+		}
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+		wait_on_page_writeback(pages[i]);
+		offset = pos & (PAGE_CACHE_SIZE -1);
+		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		if (!page_has_buffers(pages[i])) {
+			create_empty_buffers(pages[i],
+					     root->fs_info->sb->s_blocksize,
+					     (1 << BH_Uptodate));
+		}
+		head = page_buffers(pages[i]);
+		bh = head;
+		do {
+			err = btrfs_map_bh_to_logical(root, bh,
+						      alloc_extent_start);
+			/* unwind through failed_truncate instead of BUG() */
+			if (err)
+				goto failed_truncate;
+			bh = bh->b_this_page;
+			if (alloc_extent_start)
+				alloc_extent_start++;
+		} while (bh != head);
+		pos += this_write;
+		WARN_ON(this_write > write_bytes);
+		write_bytes -= this_write;
+	}
+	return 0;
+
+failed_release:
+	btrfs_drop_pages(pages, num_pages);
+	return err;
+
+failed_truncate:
+	btrfs_drop_pages(pages, num_pages);
+	if (pos > isize)
+		vmtruncate(inode, isize);
+	return err;
+}
+
+/*
+ * Buffered write: drop old extents, fill any hole and allocate the new
+ * extent in one transaction, then copy the data in a page at a time.
+ */
+static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	loff_t pos;
+	size_t num_written = 0;
+	int err = 0, ret = 0;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *pages[8];
+	struct page *pinned[2] = { NULL, NULL };
+	unsigned long first_index, last_index;
+	u64 start_pos, num_blocks;
+	u64 alloc_extent_start, hint_block;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key ins;
+
+	if (file->f_flags & O_DIRECT)
+		return -EINVAL;
+	pos = *ppos;
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	current->backing_dev_info = inode->i_mapping->backing_dev_info;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err || count == 0)
+		goto out;
+	err = remove_suid(file->f_path.dentry);
+	if (err)
+		goto out;
+	file_update_time(file);
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
+
+	mutex_lock(&inode->i_mutex);
+	first_index = pos >> PAGE_CACHE_SHIFT;
+	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+
+	/*
+	 * make sure the first and last page in the file range are up to date
+	 * and ready for cow; fail with -ENOMEM if one cannot be grabbed
+	 */
+	if ((pos & (PAGE_CACHE_SIZE - 1))) {
+		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
+		if (!pinned[0]) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+		if (!PageUptodate(pinned[0])) {
+			ret = mpage_readpage(pinned[0], btrfs_get_block);
+			BUG_ON(ret);
+			wait_on_page_locked(pinned[0]);
+		} else {
+			unlock_page(pinned[0]);
+		}
+	}
+	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
+		if (!pinned[1]) {
+			err = -ENOMEM;
+			goto out_unlock;
+		}
+		if (!PageUptodate(pinned[1])) {
+			ret = mpage_readpage(pinned[1], btrfs_get_block);
+			BUG_ON(ret);
+			wait_on_page_locked(pinned[1]);
+		} else {
+			unlock_page(pinned[1]);
+		}
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		mutex_unlock(&root->fs_info->fs_mutex);
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, inode);
+	/* FIXME blocksize != 4096 */
+	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
+
+	/* FIXME...EIEIO, ENOSPC and more */
+	/* step one, delete the existing extents in this range */
+	if (start_pos < inode->i_size) {
+		/* FIXME blocksize != pagesize */
+		ret = btrfs_drop_extents(trans, root, inode,
+					 start_pos,
+					 (pos + count + root->blocksize -1) &
+					 ~((u64)root->blocksize - 1),
+					 &hint_block);
+		BUG_ON(ret);
+	}
+
+	/* insert any holes we need to create */
+	if (inode->i_size < start_pos) {
+		u64 last_pos_in_file, hole_size;
+		u64 mask = root->blocksize - 1;
+		last_pos_in_file = (inode->i_size + mask) & ~mask;
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+		if (last_pos_in_file < start_pos) {
+			ret = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       last_pos_in_file,
+						       0, 0, hole_size);
+		}
+		BUG_ON(ret);
+	}
+
+	/*
+	 * either allocate an extent for the new bytes or setup the key
+	 * to show we are doing inline data in the extent
+	 */
+	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
+	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 num_blocks, hint_block, (u64)-1,
+					 &ins, 1);
+		BUG_ON(ret);
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       start_pos, ins.objectid, ins.offset,
+				       ins.offset);
+		BUG_ON(ret);
+	} else {
+		ins.offset = 0;
+		ins.objectid = 0;
+	}
+	BUG_ON(ret);
+	alloc_extent_start = ins.objectid;
+	ret = btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	while(count > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
+		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+
+		memset(pages, 0, sizeof(pages));
+		ret = prepare_pages(root, file, pages, num_pages,
+				    pos, first_index, last_index,
+				    write_bytes, alloc_extent_start);
+		BUG_ON(ret);
+		/* FIXME blocks != pagesize */
+		if (alloc_extent_start)
+			alloc_extent_start += num_pages;
+		ret = btrfs_copy_from_user(pos, num_pages,
+					   write_bytes, pages, buf);
+		BUG_ON(ret);
+
+		ret = dirty_and_release_pages(NULL, root, file, pages,
+					      num_pages, pos, write_bytes);
+		BUG_ON(ret);
+		btrfs_drop_pages(pages, num_pages);
+
+		buf += write_bytes;
+		count -= write_bytes;
+		pos += write_bytes;
+		num_written += write_bytes;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		btrfs_btree_balance_dirty(root);
+		cond_resched();
+	}
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+out:
+	if (pinned[0])
+		page_cache_release(pinned[0]);
+	if (pinned[1])
+		page_cache_release(pinned[1]);
+	*ppos = pos;
+	current->backing_dev_info = NULL;
+	mark_inode_dirty(inode);
+	return num_written ? num_written : err;
+}
+
+/*
+ * Verify the page's csum on first read (PageChecked marks it done), then
+ * copy to userspace.  FIXME, stuff the csum in page->private instead.
+ */
+static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
+			unsigned long offset, unsigned long size)
+{
+	char *kaddr;
+	unsigned long left, count = desc->count;
+	struct inode *inode = page->mapping->host;
+
+	if (size > count)
+		size = count;
+
+	if (!PageChecked(page)) {
+		/* FIXME, do it per block */
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		int ret;
+		struct buffer_head *bh;
+
+		if (page_has_buffers(page)) {
+			bh = page_buffers(page);
+			if (!buffer_mapped(bh)) {
+				SetPageChecked(page);
+				goto checked;
+			}
+		}
+		/* widen index first: offsets past 4GB must survive on 32-bit */
+		ret = btrfs_csum_verify_file_block(root,
+				  page->mapping->host->i_ino,
+				  (u64)page->index << PAGE_CACHE_SHIFT,
+				  kmap(page), PAGE_CACHE_SIZE);
+		if (ret) {
+			if (ret != -ENOENT) {
+				printk("failed to verify ino %lu page %lu ret %d\n",
+				       page->mapping->host->i_ino,
+				       page->index, ret);
+				memset(page_address(page), 1, PAGE_CACHE_SIZE);
+				flush_dcache_page(page);
+			}
+		}
+		SetPageChecked(page);
+		kunmap(page);
+	}
+checked:
+	/*
+	 * Faults on the destination of a read are common, so do it before
+	 * taking the kmap.
+	 */
+	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
+		kaddr = kmap_atomic(page, KM_USER0);
+		left = __copy_to_user_inatomic(desc->arg.buf,
+						kaddr + offset, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (left == 0)
+			goto success;
+	}
+
+	/* Do it the slow way */
+	kaddr = kmap(page);
+	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
+	kunmap(page);
+
+	if (left) {
+		size -= left;
+		desc->error = -EFAULT;
+	}
+success:
+	desc->count = count - size;
+	desc->written += size;
+	desc->arg.buf += size;
+	return size;
+}
+
+/**
+ * btrfs_file_aio_read - read into an iovec through the csum checking actor
+ * @iocb:	kernel I/O control block, supplies the file and its position
+ * @iov:	array of user buffers to fill
+ * @nr_segs:	number of entries in @iov
+ * @pos:	not used here, the position is taken from @iocb->ki_pos
+ */
+static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+				   unsigned long nr_segs, loff_t pos)
+{
+	struct file *filp = iocb->ki_filp;
+	loff_t *ppos = &iocb->ki_pos;
+	ssize_t done = 0;
+	size_t total = 0;
+	unsigned long i;
+
+	/* pass one: validate the segments and add up the usable length */
+	for (i = 0; i < nr_segs; i++) {
+		size_t len = iov[i].iov_len;
+
+		/*
+		 * A segment with a negative length, or a running total that
+		 * wraps negative, is rejected with -EINVAL.
+		 */
+		total += len;
+		if (unlikely((ssize_t)(total|len) < 0))
+			return -EINVAL;
+		if (!access_ok(VERIFY_WRITE, iov[i].iov_base, len)) {
+			if (i == 0)
+				return -EFAULT;
+			/* This segment is no good, keep the earlier ones */
+			nr_segs = i;
+			total -= len;
+			break;
+		}
+	}
+	if (!total)
+		return 0;
+	/* pass two: read each non-empty segment until one reports an error */
+	for (i = 0; i < nr_segs; i++) {
+		read_descriptor_t desc;
+
+		if (iov[i].iov_len == 0)
+			continue;
+		desc.written = 0;
+		desc.arg.buf = iov[i].iov_base;
+		desc.count = iov[i].iov_len;
+		desc.error = 0;
+		do_generic_file_read(filp, ppos, &desc, btrfs_read_actor);
+		done += desc.written;
+		if (desc.error) {
+			done = done ?: desc.error;
+			break;
+		}
+	}
+	return done;
+}
+
+/*
+ * fsync: start and commit a transaction under fs_mutex.  A failure to
+ * start one gives -ENOMEM; a positive commit result is reported as -EIO.
+ */
+static int btrfs_sync_file(struct file *file,
+			   struct dentry *dentry, int datasync)
+{
+	struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	/* FIXME, use inode generation number to skip the commit if we can */
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	ret = btrfs_commit_transaction(trans, root);
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret > 0 ? -EIO : ret;
+}
+
+struct file_operations btrfs_file_operations = {	/* for regular files */
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.aio_read       = btrfs_file_aio_read,	/* csum verifying read */
+	.write		= btrfs_file_write,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.ioctl		= btrfs_ioctl,
+	.fsync		= btrfs_sync_file,	/* commits a transaction */
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_compat_ioctl,	/* CONFIG_COMPAT builds only */
+#endif
+};
+
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
new file mode 100644
index 00000000000..62a3a778d37
--- /dev/null
+++ b/fs/btrfs/inode.c
@@ -0,0 +1,2210 @@
+#include <linux/module.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+
+struct btrfs_iget_args {
+	u64 ino;		/* presumably the inode number to find - confirm */
+	struct btrfs_root *root;	/* presumably the root it is in - confirm */
+};
+
+static struct inode_operations btrfs_dir_inode_operations;
+static struct inode_operations btrfs_symlink_inode_operations;
+static struct inode_operations btrfs_dir_ro_inode_operations;
+static struct inode_operations btrfs_file_inode_operations;
+static struct address_space_operations btrfs_aops;
+static struct address_space_operations btrfs_symlink_aops;
+static struct file_operations btrfs_dir_file_operations;
+/* kmem caches; only btrfs_inode_cachep is private to this file */
+static struct kmem_cache *btrfs_inode_cachep;
+struct kmem_cache *btrfs_trans_handle_cachep;
+struct kmem_cache *btrfs_transaction_cachep;
+struct kmem_cache *btrfs_bit_radix_cachep;
+struct kmem_cache *btrfs_path_cachep;
+
+#define S_SHIFT 12	/* shifts the S_IFMT type bits down to a table index */
+static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
+};
+
+/*
+ * Fill in a locked in-core inode from its inode item (looked up under
+ * fs_mutex) and pick its ops by file type.  A failed lookup makes it bad.
+ */
+void btrfs_read_locked_inode(struct inode *inode)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key location;
+	u64 alloc_group_block;
+	int ret;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	mutex_lock(&root->fs_info->fs_mutex);
+	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
+	if (ret)
+		goto make_bad;	/* make_bad releases and frees the path */
+	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+				  path->slots[0],
+				  struct btrfs_inode_item);
+
+	inode->i_mode = btrfs_inode_mode(inode_item);
+	inode->i_nlink = btrfs_inode_nlink(inode_item);
+	inode->i_uid = btrfs_inode_uid(inode_item);
+	inode->i_gid = btrfs_inode_gid(inode_item);
+	inode->i_size = btrfs_inode_size(inode_item);
+	inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
+	inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
+	inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
+	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
+	inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
+	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
+	inode->i_blocks = btrfs_inode_nblocks(inode_item);
+	inode->i_generation = btrfs_inode_generation(inode_item);
+	alloc_group_block = btrfs_inode_block_group(inode_item);
+	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
+						       alloc_group_block);
+
+	btrfs_free_path(path);
+	inode_item = NULL;
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	switch (inode->i_mode & S_IFMT) {
+#if 0
+	default:
+		init_special_inode(inode, inode->i_mode,
+				   btrfs_inode_rdev(inode_item));
+		break;
+#endif
+	case S_IFREG:
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+		break;
+	case S_IFDIR:
+		inode->i_fop = &btrfs_dir_file_operations;
+		if (root == root->fs_info->tree_root)
+			inode->i_op = &btrfs_dir_ro_inode_operations;
+		else
+			inode->i_op = &btrfs_dir_inode_operations;
+		break;
+	case S_IFLNK:
+		inode->i_op = &btrfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		break;
+	}
+	return;
+
+make_bad:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	make_bad_inode(inode);
+}
+
+static void fill_inode_item(struct btrfs_inode_item *item,
+			    struct inode *inode)
+{
+	btrfs_set_inode_uid(item, inode->i_uid);	/* in-core -> item */
+	btrfs_set_inode_gid(item, inode->i_gid);
+	btrfs_set_inode_size(item, inode->i_size);
+	btrfs_set_inode_mode(item, inode->i_mode);
+	btrfs_set_inode_nlink(item, inode->i_nlink);
+	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
+	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
+	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
+	btrfs_set_inode_nblocks(item, inode->i_blocks);
+	btrfs_set_inode_generation(item, inode->i_generation);
+	/* the block group is stored by the objectid of its key */
+	btrfs_set_inode_block_group(item,
+				    BTRFS_I(inode)->block_group->key.objectid);
+}
+
+/*
+ * copy the in-core inode fields into the inode item stored in the tree
+ */
+static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode)
+{
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_inode_item *disk_inode;
+	int err;
+
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	/* a positive lookup result is reported to the caller as -ENOENT */
+	err = btrfs_lookup_inode(trans, root, path,
+				 &BTRFS_I(inode)->location, 1);
+	if (err > 0)
+		err = -ENOENT;
+	if (err == 0) {
+		/* fill the item from the inode and dirty the leaf it is in */
+		disk_inode = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+					    path->slots[0],
+					    struct btrfs_inode_item);
+		fill_inode_item(disk_inode, inode);
+		btrfs_mark_buffer_dirty(path->nodes[0]);
+	}
+
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	return err;
+}
+
+/* delete both dir items for dentry's name and update dir and inode */
+static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *dir,
+			      struct dentry *dentry)
+{
+	struct btrfs_path *path;
+	const char *name = dentry->d_name.name;
+	int name_len = dentry->d_name.len;
+	int ret = 0;
+	u64 objectid;
+	struct btrfs_dir_item *di;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				    name, name_len, -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto err;
+	}
+	if (!di) {
+		ret = -ENOENT;
+		goto err;
+	}
+	objectid = btrfs_disk_key_objectid(&di->location);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+	/* now the dir index item, found through the objectid saved above */
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 objectid, name, name_len, -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto err;
+	}
+	if (!di) {
+		ret = -ENOENT;
+		goto err;
+	}
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	BUG_ON(ret);
+
+	dentry->d_inode->i_ctime = dir->i_ctime;
+err:
+	btrfs_free_path(path);
+	if (!ret) {
+		dir->i_size -= name_len * 2;	/* presumably one per item */
+		btrfs_update_inode(trans, root, dir);
+		drop_nlink(dentry->d_inode);
+		btrfs_update_inode(trans, root, dentry->d_inode);
+		dir->i_sb->s_dirt = 1;
+	}
+	return ret;
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *handle;
+	int err;
+
+	/* the whole unlink runs as one transaction under fs_mutex */
+	mutex_lock(&root->fs_info->fs_mutex);
+	handle = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(handle, dir);
+	err = btrfs_unlink_trans(handle, root, dir, dentry);
+	btrfs_end_transaction(handle, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * remove an empty directory: delete its "." and ".." items, then unlink it
+ */
+static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int err, ret;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key, found_key;
+	struct btrfs_trans_handle *trans;
+	int found_type;
+	struct btrfs_leaf *leaf;
+	char *goodnames = "..";
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+	key.objectid = inode->i_ino;
+	key.offset = (u64)-1;
+	key.flags = (u32)-1;
+	while(1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		BUG_ON(ret == 0);
+		if (path->slots[0] == 0) {
+			err = -ENOENT;
+			goto out;
+		}
+		path->slots[0]--;
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		btrfs_disk_key_to_cpu(&found_key,
+				      &leaf->items[path->slots[0]].key);
+		found_type = btrfs_key_type(&found_key);
+		if (found_key.objectid != inode->i_ino) {
+			err = -ENOENT;
+			goto out;
+		}
+		if ((found_type != BTRFS_DIR_ITEM_KEY &&
+		     found_type != BTRFS_DIR_INDEX_KEY) ||
+	            (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
+	            !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
+			err = -ENOTEMPTY;
+			goto out;
+		}
+		ret = btrfs_del_item(trans, root, path);
+		BUG_ON(ret);
+
+		if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
+			break;
+		btrfs_release_path(root, path);
+	}
+	btrfs_release_path(root, path);
+
+	/* now the directory is empty */
+	err = btrfs_unlink_trans(trans, root, dir, dentry);
+	if (!err)
+		inode->i_size = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	/* end the transaction while fs_mutex is still held */
+	ret = btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
+/* clear_inode() the in-core inode, then delete its item from the tree */
+static int btrfs_free_inode(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_path *p;
+	int err;
+
+	clear_inode(inode);
+	p = btrfs_alloc_path();
+	BUG_ON(!p);
+	btrfs_init_path(p);
+	/* a failed lookup or delete trips BUG_ON, so 0 is all that returns */
+	err = btrfs_lookup_inode(trans, root, p,
+				 &BTRFS_I(inode)->location, -1);
+	BUG_ON(err);
+	err = btrfs_del_item(trans, root, p);
+	BUG_ON(err);
+	btrfs_free_path(p);
+	return err;
+}
+
+/*
+ * truncates go from a high offset to a low offset.  So, walk from hi to
+ * lo in the level 1 node, starting just below the current slot, and issue
+ * readahead for each block pointer.  Stop at the first key from a
+ * different objectid or the first readahead that fails.  Nothing is done
+ * when the path has no level 1 node or already sits in slot 0.
+ */
+static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 objectid)
+{
+	struct btrfs_node *node;
+	int i;
+	u64 item_objectid;
+	u64 blocknr;
+	int slot;
+	int ret;
+
+	if (!path->nodes[1])
+		return;
+	node = btrfs_buffer_node(path->nodes[1]);
+	slot = path->slots[1];
+	if (slot == 0)
+		return;
+	for (i = slot - 1; i >= 0; i--) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		if (item_objectid != objectid)
+			break;
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
+/*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+ * any higher than i_size.
+ *
+ * csum items that cross the new i_size are truncated to the new size
+ * as well.
+ *
+ * Each pass searches for the key (inode->i_ino, flags -1, offset -1) and
+ * steps back one slot when there is no exact match, so it looks at the
+ * last remaining item of the inode.  The walk ends when the item found
+ * belongs to another objectid or is not a csum, dir item, dir index or
+ * extent data item.
+ *
+ * An item that ends below i_size is skipped by retrying the search with
+ * the key type lowered to found_type - 1 (the walk ends if found_type
+ * is 0).  An item whose key offset is >= i_size is deleted.  Any other
+ * item ends the walk, after a non-inline extent has had its num_blocks
+ * trimmed to stop at i_size.  When a deleted non-inline extent has a
+ * non-zero disk block number, it is handed to btrfs_free_extent() once
+ * the path has been released.
+ *
+ * Returns 0, or the negative value btrfs_search_slot() returned.  Every
+ * other failure trips BUG_ON().  s_dirt is set on the superblock on both
+ * the success and the error path.
+ */
+static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct inode *inode)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_disk_key *found_key;
+	u32 found_type;
+	struct btrfs_leaf *leaf;
+	struct btrfs_file_extent_item *fi;
+	u64 extent_start = 0;
+	u64 extent_num_blocks = 0;
+	u64 item_end = 0;
+	int found_extent;
+	int del_item;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	/* FIXME, add redo link to tree so we don't leak on crash */
+	key.objectid = inode->i_ino;
+	key.offset = (u64)-1;
+	key.flags = (u32)-1;
+	while(1) {
+		btrfs_init_path(path);
+		fi = NULL;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			goto error;
+		}
+		if (ret > 0) {
+			BUG_ON(path->slots[0] == 0);
+			path->slots[0]--;
+		}
+		reada_truncate(root, path, inode->i_ino);
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		found_key = &leaf->items[path->slots[0]].key;
+		found_type = btrfs_disk_key_type(found_key);
+
+		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
+			break;
+		if (found_type != BTRFS_CSUM_ITEM_KEY &&
+		    found_type != BTRFS_DIR_ITEM_KEY &&
+		    found_type != BTRFS_DIR_INDEX_KEY &&
+		    found_type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		item_end = btrfs_disk_key_offset(found_key);
+		if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+					    path->slots[0],
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(fi) !=
+			    BTRFS_FILE_EXTENT_INLINE) {
+				item_end += btrfs_file_extent_num_blocks(fi) <<
+						inode->i_blkbits;
+			}
+		}
+		if (found_type == BTRFS_CSUM_ITEM_KEY) {
+			ret = btrfs_csum_truncate(trans, root, path,
+						  inode->i_size);
+			BUG_ON(ret);
+		}
+		if (item_end < inode->i_size) {
+			if (found_type) {
+				btrfs_set_key_type(&key, found_type - 1);
+				continue;
+			}
+			break;
+		}
+		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
+			del_item = 1;
+		else
+			del_item = 0;
+		found_extent = 0;
+
+		/* FIXME, shrink the extent if the ref count is only 1 */
+		if (found_type == BTRFS_EXTENT_DATA_KEY &&
+			   btrfs_file_extent_type(fi) !=
+			   BTRFS_FILE_EXTENT_INLINE) {
+			u64 num_dec;
+			if (!del_item) {
+				u64 orig_num_blocks =
+					btrfs_file_extent_num_blocks(fi);
+				extent_num_blocks = inode->i_size -
+					btrfs_disk_key_offset(found_key) +
+					root->blocksize - 1;
+				extent_num_blocks >>= inode->i_blkbits;
+				btrfs_set_file_extent_num_blocks(fi,
+							 extent_num_blocks);
+				inode->i_blocks -= (orig_num_blocks -
+					extent_num_blocks) << 3;
+				mark_buffer_dirty(path->nodes[0]);
+			} else {
+				extent_start =
+					btrfs_file_extent_disk_blocknr(fi);
+				extent_num_blocks =
+					btrfs_file_extent_disk_num_blocks(fi);
+				/*
+				 * FIXME blocksize != 4096
+				 * (the << 3 presumably converts 4096-byte
+				 * blocks into 512-byte i_blocks units)
+				 */
+				num_dec = btrfs_file_extent_num_blocks(fi) << 3;
+				if (extent_start != 0) {
+					found_extent = 1;
+					inode->i_blocks -= num_dec;
+				}
+			}
+		}
+		if (del_item) {
+			ret = btrfs_del_item(trans, root, path);
+			BUG_ON(ret);
+		} else {
+			break;
+		}
+		btrfs_release_path(root, path);
+		if (found_extent) {
+			ret = btrfs_free_extent(trans, root, extent_start,
+						extent_num_blocks, 0);
+			BUG_ON(ret);
+		}
+	}
+	ret = 0;
+error:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	inode->i_sb->s_dirt = 1;
+	return ret;
+}
+
+/*
+ * taken from block_truncate_page, but does cow as it zeros out
+ * any bytes left in the last page in the file.
+ *
+ * @from is the byte offset where zeroing starts.  Nothing is done when
+ * @from is block aligned.  Otherwise the page holding @from is read in if
+ * needed, the extents covering that page are dropped and replaced by one
+ * freshly allocated block, the bytes from @from to the end of the page are
+ * zeroed, the page is checksummed and then marked dirty.
+ *
+ * Returns 0 on success, -ENOMEM if the page cannot be grabbed and -EIO if
+ * it cannot be brought uptodate.  Tree update failures trip BUG_ON().
+ */
+static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+{
+	struct inode *inode = mapping->host;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	struct page *page;
+	char *kaddr;
+	int ret = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint = 0;
+	struct btrfs_key ins;
+	struct btrfs_trans_handle *trans;
+
+	if ((offset & (blocksize - 1)) == 0)
+		goto out;
+
+	ret = -ENOMEM;
+	page = grab_cache_page(mapping, index);
+	if (!page)
+		goto out;
+
+	if (!PageUptodate(page)) {
+		ret = mpage_readpage(page, btrfs_get_block);
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			/*
+			 * the page is locked and we still hold the reference
+			 * from grab_cache_page(); both must be dropped
+			 */
+			ret = -EIO;
+			goto out_unlock;
+		}
+	}
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = btrfs_drop_extents(trans, root, inode,
+				 page->index << PAGE_CACHE_SHIFT,
+				 (page->index + 1) << PAGE_CACHE_SHIFT,
+				 &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+				 alloc_hint, (u64)-1, &ins, 1);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       page->index << PAGE_CACHE_SHIFT,
+				       ins.objectid, 1, 1);
+	BUG_ON(ret);
+	SetPageChecked(page);
+	kaddr = kmap(page);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	btrfs_csum_file_block(trans, root, inode->i_ino,
+			      page->index << PAGE_CACHE_SHIFT,
+			      kaddr, PAGE_CACHE_SIZE);
+	kunmap(page);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	set_page_dirty(page);
+out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+/*
+ * setattr for btrfs inodes.  Once inode_change_ok() has approved the
+ * change, growing a regular file beyond the block that holds the current
+ * i_size zeroes the tail of the last page (btrfs_truncate_page) and
+ * inserts a hole-sized file extent item for the new range.  The attributes
+ * themselves are applied by inode_setattr(), whose result is returned.
+ */
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
+
+	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) &&
+	    attr->ia_size > inode->i_size) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 mask = root->blocksize - 1;
+		u64 pos = (inode->i_size + mask) & ~mask;
+
+		/* no hole is needed while the new size stays in the last block */
+		if (attr->ia_size > pos) {
+			struct btrfs_trans_handle *trans;
+			u64 hole_blocks;
+
+			btrfs_truncate_page(inode->i_mapping, inode->i_size);
+
+			hole_blocks = (attr->ia_size - pos + mask) & ~mask;
+			hole_blocks >>= inode->i_blkbits;
+
+			mutex_lock(&root->fs_info->fs_mutex);
+			trans = btrfs_start_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			err = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino, pos,
+						       0, 0, hole_blocks);
+			BUG_ON(err);
+			btrfs_end_transaction(trans, root);
+			mutex_unlock(&root->fs_info->fs_mutex);
+		}
+	}
+
+	return inode_setattr(inode, attr);
+}
+/*
+ * Final teardown of an inode: drop its page cache, then (unless it is a
+ * bad inode) set i_size to 0, truncate its items and delete the inode item
+ * inside one transaction.  A bad inode only gets clear_inode().
+ */
+void btrfs_delete_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int err;
+
+	truncate_inode_pages(&inode->i_data, 0);
+	if (is_bad_inode(inode)) {
+		clear_inode(inode);
+		return;
+	}
+
+	inode->i_size = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	err = btrfs_truncate_in_trans(trans, root, inode);
+	BUG_ON(err);
+	btrfs_free_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	btrfs_btree_balance_dirty(root);
+}
+
+/*
+ * this returns the key found in the dir entry in the location pointer.
+ * If no dir entries were found, location->objectid is 0.
+ *
+ * Returns 0 both when an entry was found and when there is none.  When
+ * btrfs_lookup_dir_item() hands back an ERR_PTR, location->objectid is
+ * set to 0 and the encoded negative error is returned.
+ */
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+			       struct btrfs_key *location)
+{
+	const char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	/* must start at 0: the success path never assigns it */
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+				    namelen, 0);
+	if (IS_ERR(di))
+		ret = PTR_ERR(di);
+	if (!di || IS_ERR(di)) {
+		location->objectid = 0;
+		goto out;
+	}
+	btrfs_disk_key_to_cpu(location, &di->location);
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * when we hit a tree root in a directory, the btrfs part of the inode
+ * needs to be changed to reflect the root directory of the tree root.  This
+ * is kind of like crossing a mount point.
+ *
+ * Nothing is changed unless @location is a root item key for something
+ * other than BTRFS_ROOT_TREE_OBJECTID.  Otherwise *sub_root is set to the
+ * root read for @location and @location is rewritten to the inode item key
+ * of that root's directory.
+ *
+ * Returns 0, or the negative error encoded by btrfs_read_fs_root(); in the
+ * error case *sub_root still holds the ERR_PTR value.
+ */
+static int fixup_tree_root_location(struct btrfs_root *root,
+			     struct btrfs_key *location,
+			     struct btrfs_root **sub_root)
+{
+	struct btrfs_root_item *ri;
+	int ret = 0;
+
+	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
+		return 0;
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	*sub_root = btrfs_read_fs_root(root->fs_info, location);
+	if (IS_ERR(*sub_root)) {
+		/* leave through the unlock so fs_mutex is not left held */
+		ret = PTR_ERR(*sub_root);
+		goto out_unlock;
+	}
+
+	ri = &(*sub_root)->root_item;
+	location->objectid = btrfs_root_dirid(ri);
+	location->flags = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+	location->offset = 0;
+
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * Set callback handed to iget5_locked(): copy the inode number and the
+ * root from the btrfs_iget_args in @p into the inode.  Always returns 0.
+ */
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	const struct btrfs_iget_args *iget = p;
+
+	BTRFS_I(inode)->root = iget->root;
+	inode->i_ino = iget->ino;
+	return 0;
+}
+
+/*
+ * Test callback handed to iget5_locked(): returns 1 when @inode has both
+ * the inode number and the root named in the btrfs_iget_args, 0 otherwise.
+ */
+static int btrfs_find_actor(struct inode *inode, void *opaque)
+{
+	struct btrfs_iget_args *wanted = opaque;
+
+	if (wanted->ino != inode->i_ino)
+		return 0;
+	return wanted->root == BTRFS_I(inode)->root;
+}
+
+/*
+ * Look up, or create, the inode for (@objectid, @root) on super block @s
+ * through iget5_locked(), which compares on both the inode number and the
+ * root.  Whatever iget5_locked() returns is passed straight back.
+ */
+struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
+				struct btrfs_root *root)
+{
+	struct btrfs_iget_args args = {
+		.ino = objectid,
+		.root = root,
+	};
+
+	return iget5_locked(s, objectid, btrfs_find_actor,
+			    btrfs_init_locked_inode, &args);
+}
+
+/*
+ * Directory lookup.  The name is resolved to a key under fs_mutex; when
+ * nothing is found a NULL inode is spliced in.  Otherwise the key is run
+ * through fixup_tree_root_location() (it may point at another root), the
+ * inode is fetched with btrfs_iget_locked() and, if it is new, filled in
+ * with btrfs_read_locked_inode() before being unlocked.
+ */
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_root *sub_root = root;
+	struct btrfs_key location;
+	struct inode *inode;
+	int ret;
+
+	if (dentry->d_name.len > BTRFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_inode_by_name(dir, dentry, &location);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	/* objectid 0 means there is no entry by that name */
+	if (!location.objectid)
+		return d_splice_alias(NULL, dentry);
+
+	ret = fixup_tree_root_location(root, &location, &sub_root);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return ERR_PTR(-ENOENT);
+
+	inode = btrfs_iget_locked(dir->i_sb, location.objectid, sub_root);
+	if (!inode)
+		return ERR_PTR(-EACCES);
+
+	if (inode->i_state & I_NEW) {
+		/* the inode and parent dir are two different roots */
+		if (sub_root != root) {
+			igrab(inode);
+			sub_root->inode = inode;
+		}
+		BTRFS_I(inode)->root = sub_root;
+		memcpy(&BTRFS_I(inode)->location, &location,
+		       sizeof(location));
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+	}
+
+	return d_splice_alias(inode, dentry);
+}
+
+/*
+ * Issue readahead for the blocks that follow the current slot in the
+ * level 1 node of @path, for as long as their keys carry @objectid.
+ * The walk also stops when readahead_tree_block() returns non-zero, and
+ * nothing happens when the path has no level 1 node.
+ */
+static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
+			 u64 objectid)
+{
+	struct btrfs_node *parent;
+	u64 blocknr;
+	u32 count;
+	int idx;
+
+	if (!path->nodes[1])
+		return;
+
+	parent = btrfs_buffer_node(path->nodes[1]);
+	count = btrfs_header_nritems(&parent->header);
+	idx = path->slots[1] + 1;
+
+	while (idx < count) {
+		if (btrfs_disk_key_objectid(&parent->ptrs[idx].key) !=
+		    objectid)
+			break;
+		blocknr = btrfs_node_blockptr(parent, idx);
+		if (readahead_tree_block(root, blocknr))
+			break;
+		idx++;
+	}
+}
+/* DT_* value for each directory item type; btrfs_readdir indexes it by btrfs_dir_type() */
+static const unsigned char btrfs_filetype_table[] = {
+	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+/*
+ * readdir: walk the directory items of the inode starting at the key
+ * offset stored in filp->f_pos and feed every name to @filldir.
+ *
+ * Directories in the tree root are walked by BTRFS_DIR_ITEM_KEY, all
+ * others by BTRFS_DIR_INDEX_KEY.  f_pos tracks the key offset of the item
+ * being emitted; it is bumped by one once the walk runs off the end so the
+ * last item is not returned again.  When filldir reports that it is full,
+ * f_pos is left on the current item.
+ *
+ * Returns 0, -ENOMEM when no path can be allocated, or the negative value
+ * returned by btrfs_search_slot().
+ */
+static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_item *item;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret;
+	u32 nritems;
+	struct btrfs_leaf *leaf;
+	int slot;
+	int advance;
+	unsigned int type;
+	unsigned char d_type;
+	int over = 0;
+	u32 di_cur;
+	u32 di_total;
+	u32 di_len;
+	int key_type = BTRFS_DIR_INDEX_KEY;
+
+	/* allocate before taking fs_mutex so failure needs no unwinding */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	btrfs_init_path(path);
+
+	/* FIXME, use a real flag for deciding about the key type */
+	if (root->fs_info->tree_root == root)
+		key_type = BTRFS_DIR_ITEM_KEY;
+	mutex_lock(&root->fs_info->fs_mutex);
+	key.objectid = inode->i_ino;
+	key.flags = 0;
+	btrfs_set_key_type(&key, key_type);
+	key.offset = filp->f_pos;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+	advance = 0;
+	reada_leaves(root, path, inode->i_ino);
+	while(1) {
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		nritems = btrfs_header_nritems(&leaf->header);
+		slot = path->slots[0];
+		if (advance || slot >= nritems) {
+			/*
+			 * written as slot + 1 so that an empty leaf
+			 * (nritems == 0) cannot wrap the unsigned compare
+			 */
+			if (slot + 1 >= nritems) {
+				reada_leaves(root, path, inode->i_ino);
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+				leaf = btrfs_buffer_leaf(path->nodes[0]);
+				nritems = btrfs_header_nritems(&leaf->header);
+				slot = path->slots[0];
+			} else {
+				slot++;
+				path->slots[0]++;
+			}
+		}
+		advance = 1;
+		item = leaf->items + slot;
+		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
+			break;
+		if (btrfs_disk_key_type(&item->key) != key_type)
+			break;
+		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
+			continue;
+		filp->f_pos = btrfs_disk_key_offset(&item->key);
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		di_cur = 0;
+		di_total = btrfs_item_size(leaf->items + slot);
+		/* one item can pack several names back to back */
+		while(di_cur < di_total) {
+			/* the type comes from disk: never index past the table */
+			type = btrfs_dir_type(di);
+			if (type < sizeof(btrfs_filetype_table))
+				d_type = btrfs_filetype_table[type];
+			else
+				d_type = DT_UNKNOWN;
+			over = filldir(dirent, (const char *)(di + 1),
+				       btrfs_dir_name_len(di),
+				       btrfs_disk_key_offset(&item->key),
+				       btrfs_disk_key_objectid(&di->location),
+				       d_type);
+			if (over)
+				goto nopos;
+			di_len = btrfs_dir_name_len(di) + sizeof(*di);
+			di_cur += di_len;
+			di = (struct btrfs_dir_item *)((char *)di + di_len);
+		}
+	}
+	filp->f_pos++;
+nopos:
+	ret = 0;
+err:
+	btrfs_release_path(root, path);
+	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * write_inode: only a waiting caller triggers any work, in which case a
+ * transaction is started and committed under fs_mutex and the result of
+ * the commit is returned.  Without @wait this returns 0 immediately.
+ */
+int btrfs_write_inode(struct inode *inode, int wait)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	if (!wait)
+		return 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	ret = btrfs_commit_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	return ret;
+}
+
+/*
+ * Updating the tree on every inode change is somewhat expensive, but the
+ * inode is most likely to be found in cache.
+ * FIXME, needs more benchmarking...performance is the only reason to
+ * keep or drop this code.
+ */
+void btrfs_dirty_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct mutex *fs_mutex = &root->fs_info->fs_mutex;
+	struct btrfs_trans_handle *trans;
+
+	mutex_lock(fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(fs_mutex);
+
+	btrfs_btree_balance_dirty(root);
+}
+
+/*
+ * Allocate a new in-memory inode on @root's super block, fill it in for
+ * @objectid and @mode, insert its inode item into @root and hash it.
+ *
+ * @group is the starting point handed to btrfs_find_block_group(); the
+ * group that call returns is stored in the inode.  Directories ask for it
+ * with owner 0, everything else with owner 1.
+ *
+ * Returns the inode, or ERR_PTR(-ENOMEM) when new_inode() fails.  A failed
+ * btrfs_insert_inode() trips BUG_ON().
+ */
+static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 objectid,
+				     struct btrfs_block_group_cache *group,
+				     int mode)
+{
+	struct inode *inode;
+	struct btrfs_inode_item inode_item;
+	struct btrfs_key *location;
+	int ret;
+	int owner;
+
+	inode = new_inode(root->fs_info->sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	BTRFS_I(inode)->root = root;
+	/*
+	 * S_ISDIR() compares the whole type field; a bare "mode & S_IFDIR"
+	 * is also true for block devices and sockets
+	 */
+	if (S_ISDIR(mode))
+		owner = 0;
+	else
+		owner = 1;
+	group = btrfs_find_block_group(root, group, 0, 0, owner);
+	BTRFS_I(inode)->block_group = group;
+
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+	inode->i_ino = objectid;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	fill_inode_item(&inode_item, inode);
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->flags = 0;
+	location->offset = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
+	BUG_ON(ret);
+
+	insert_inode_hash(inode);
+	return inode;
+}
+
+/* map the file type bits of inode->i_mode through btrfs_type_by_mode[] */
+static inline u8 btrfs_inode_type(struct inode *inode)
+{
+	unsigned int fmt = inode->i_mode & S_IFMT;
+
+	return btrfs_type_by_mode[fmt >> S_SHIFT];
+}
+
+/*
+ * Insert a directory item for @dentry in its parent directory, pointing at
+ * the inode item key of @inode.  On success the parent's i_size grows by
+ * twice the name length and the parent inode is written back; the result
+ * of that update is returned.  Otherwise the value from
+ * btrfs_insert_dir_item() is returned unchanged.
+ */
+static int btrfs_add_link(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	struct inode *parent = dentry->d_parent->d_inode;
+	struct btrfs_root *root = BTRFS_I(parent)->root;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = inode->i_ino;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+
+	ret = btrfs_insert_dir_item(trans, root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent->i_ino,
+				    &key, btrfs_inode_type(inode));
+	if (ret)
+		return ret;
+
+	parent->i_size += dentry->d_name.len * 2;
+	return btrfs_update_inode(trans, root, parent);
+}
+
+/*
+ * Link @inode under @dentry and instantiate the dentry on success.
+ * A positive result from btrfs_add_link() is reported as -EEXIST; a
+ * negative one is passed through.
+ */
+static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
+			    struct dentry *dentry, struct inode *inode)
+{
+	int err = btrfs_add_link(trans, dentry, inode);
+
+	if (err == 0) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	return err > 0 ? -EEXIST : err;
+}
+
+/*
+ * create: pick a free objectid, build a new inode for it and link it under
+ * @dentry, all inside one transaction under fs_mutex.  On a successful
+ * link the inode gets the regular file operations.  If linking fails, the
+ * new inode's link count is dropped and it is released after the
+ * transaction has ended.
+ *
+ * Any failure of btrfs_find_free_objectid() is reported as -ENOSPC.
+ */
+static int btrfs_create(struct inode *dir, struct dentry *dentry,
+			int mode, struct nameidata *nd)
+{
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode;
+	u64 objectid;
+	int drop_inode = 0;
+	int err;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	if (btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid)) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_unlock;
+	}
+
+	btrfs_set_trans_block_group(trans, inode);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (!err) {
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+	} else {
+		drop_inode = 1;
+	}
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * link: add another name, @dentry in @dir, for the inode behind
+ * @old_dentry.  The link count and the reference count are raised up
+ * front; if adding the name fails both are dropped again once the
+ * transaction has ended.  An inode with no links left yields -ENOENT.
+ */
+static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_trans_handle *trans;
+	int err;
+
+	if (inode->i_nlink == 0)
+		return -ENOENT;
+
+	inc_nlink(inode);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	atomic_inc(&inode->i_count);
+	err = btrfs_add_nondir(trans, dentry, inode);
+
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, dir);
+	btrfs_update_inode(trans, root, inode);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	/* undo the early link and reference bumps when the name was not added */
+	if (err) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * Populate directory @objectid with its two initial entries: "." keyed to
+ * @objectid itself and ".." keyed to @dirid.  Returns the result of the
+ * first btrfs_insert_dir_item() call that fails, or 0.
+ */
+static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 objectid, u64 dirid)
+{
+	char dots[2] = { '.', '.' };
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = objectid;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+
+	/* "." is the first byte of the buffer, ".." is both bytes */
+	ret = btrfs_insert_dir_item(trans, root, dots, 1, objectid,
+				    &key, BTRFS_FT_DIR);
+	if (ret)
+		return ret;
+
+	key.objectid = dirid;
+	return btrfs_insert_dir_item(trans, root, dots, 2, objectid,
+				     &key, BTRFS_FT_DIR);
+}
+
+/*
+ * mkdir: allocate an objectid, create the directory inode, insert its "."
+ * and ".." entries and link it under @dentry, all in one transaction
+ * under fs_mutex.
+ *
+ * Returns 0 or a negative error.  Any failure of
+ * btrfs_find_free_objectid() is reported as -ENOSPC.  If a step fails
+ * after the inode exists, the inode is released with iput() once
+ * fs_mutex has been dropped.
+ */
+static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	int err = 0;
+	int drop_on_err = 0;
+	u64 objectid;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	/* check the handle before anything is allowed to touch it */
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_fail;
+	}
+	drop_on_err = 1;
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+	btrfs_set_trans_block_group(trans, inode);
+
+	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
+	if (err)
+		goto out_fail;
+
+	inode->i_size = 6;
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_fail;
+	err = btrfs_add_link(trans, dentry, inode);
+	if (err)
+		goto out_fail;
+	d_instantiate(dentry, inode);
+	drop_on_err = 0;
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+
+out_fail:
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (drop_on_err)
+		iput(inode);
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
+/*
+ * FIBMAP and others want to pass in a fake buffer head.  They need to
+ * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
+ * any packed file data into the fake bh
+ *
+ * These values are passed as the 'create' argument of
+ * btrfs_get_block_lock(), which tests the CREATE and NO_DIRECT bits
+ * with '&'.
+ */
+#define BTRFS_GET_BLOCK_NO_CREATE 0
+#define BTRFS_GET_BLOCK_CREATE 1
+#define BTRFS_GET_BLOCK_NO_DIRECT 2
+
+/*
+ * FIXME create==1 does not work.
+ *
+ * Map file block @iblock of @inode into the buffer head @result.  This
+ * function takes no lock itself; the wrappers btrfs_get_block() and
+ * btrfs_get_block_bmap() call it with fs_mutex held.
+ *
+ * @create is a mask of BTRFS_GET_BLOCK_* values:
+ *  - with BTRFS_GET_BLOCK_CREATE a warning is printed, a transaction is
+ *    started and the extents covering the block are dropped before the
+ *    lookup; if the lookup then reaches not_found, one block is allocated,
+ *    a file extent is inserted for it and @result is mapped to it.
+ *  - with BTRFS_GET_BLOCK_NO_DIRECT an inline extent yields -EINVAL
+ *    instead of being copied into result->b_page.
+ *
+ * The lookup steps back one slot when there is no exact match.  A regular
+ * extent containing @iblock is mapped through btrfs_map_bh_to_logical(); a
+ * regular extent whose disk block number is 0 leaves @result unmapped.
+ * An inline extent is copied into the page, the rest of the page is
+ * zeroed and the buffer is marked uptodate.
+ *
+ * Returns 0 or a negative error.  When a transaction was started, the
+ * return value is whatever btrfs_end_transaction() returns.
+ */
+static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
+				struct buffer_head *result, int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	u32 found_type;
+	u64 alloc_hint = 0;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	btrfs_init_path(path);
+	if (create & BTRFS_GET_BLOCK_CREATE) {
+		WARN_ON(1);
+		/* this almost but not quite works */
+		trans = btrfs_start_transaction(root, 1);
+		if (!trans) {
+			err = -ENOMEM;
+			goto out;
+		}
+		ret = btrfs_drop_extents(trans, root, inode,
+					 iblock << inode->i_blkbits,
+					 (iblock + 1) << inode->i_blkbits,
+					 &alloc_hint);
+		BUG_ON(ret);
+	}
+
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       inode->i_ino,
+				       iblock << inode->i_blkbits, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	if (ret != 0) {
+		if (path->slots[0] == 0) {
+			btrfs_release_path(root, path);
+			goto not_found;
+		}
+		path->slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		extent_end = 0;
+		extent_start = 0;
+		goto not_found;
+	}
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_start = extent_start >> inode->i_blkbits;
+		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
+		err = 0;
+		if (btrfs_file_extent_disk_blocknr(item) == 0)
+			goto out;
+		if (iblock >= extent_start && iblock < extent_end) {
+			btrfs_map_bh_to_logical(root, result, blocknr +
+						iblock - extent_start);
+			goto out;
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+
+		if (create & BTRFS_GET_BLOCK_NO_DIRECT) {
+			err = -EINVAL;
+			goto out;
+		}
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = (extent_start + size) >> inode->i_blkbits;
+		extent_start >>= inode->i_blkbits;
+		if (iblock < extent_start || iblock > extent_end) {
+			goto not_found;
+		}
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(result->b_page);
+		memcpy(map, ptr, size);
+		memset(map + size, 0, PAGE_CACHE_SIZE - size);
+		flush_dcache_page(result->b_page);
+		kunmap(result->b_page);
+		set_buffer_uptodate(result);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, 0);
+	}
+not_found:
+	if (create & BTRFS_GET_BLOCK_CREATE) {
+		struct btrfs_key ins;
+		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 1, alloc_hint, (u64)-1,
+					 &ins, 1);
+		BUG_ON(ret);
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+					       iblock << inode->i_blkbits,
+					       ins.objectid, ins.offset,
+					       ins.offset);
+		BUG_ON(ret);
+		SetPageChecked(result->b_page);
+		btrfs_map_bh_to_logical(root, result, ins.objectid);
+	}
+out:
+	if (trans)
+		err = btrfs_end_transaction(trans, root);
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * get_block callback: run btrfs_get_block_lock() with fs_mutex held and
+ * return its result.
+ */
+int btrfs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *result, int create)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct mutex *fs_mutex = &root->fs_info->fs_mutex;
+	int err;
+
+	mutex_lock(fs_mutex);
+	err = btrfs_get_block_lock(inode, iblock, result, create);
+	mutex_unlock(fs_mutex);
+
+	return err;
+}
+
+/*
+ * get_block callback used by btrfs_bmap().  @create is ignored: the lookup
+ * is always done with BTRFS_GET_BLOCK_NO_DIRECT.  The result of the lookup
+ * is discarded and 0 is returned unconditionally.
+ */
+static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
+			   struct buffer_head *result, int create)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct mutex *fs_mutex = &root->fs_info->fs_mutex;
+
+	mutex_lock(fs_mutex);
+	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
+	mutex_unlock(fs_mutex);
+
+	return 0;
+}
+
+/* bmap: generic_block_bmap() driven by btrfs_get_block_bmap() */
+static sector_t btrfs_bmap(struct address_space *as, sector_t block)
+{
+	sector_t mapped;
+
+	mapped = generic_block_bmap(as, block, btrfs_get_block_bmap);
+	return mapped;
+}
+
+/* prepare_write: block_prepare_write() with btrfs_get_block(); @file is unused */
+static int btrfs_prepare_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
+{
+	int err;
+
+	err = block_prepare_write(page, from, to, btrfs_get_block);
+	return err;
+}
+
+/* readpage: mpage_readpage() with btrfs_get_block(); @file is unused */
+static int btrfs_readpage(struct file *file, struct page *page)
+{
+	int err;
+
+	err = mpage_readpage(page, btrfs_get_block);
+	return err;
+}
+
+/*
+ * Aside from a tiny bit of packed file data handling, this is the
+ * same as the generic code.
+ *
+ * While block_write_full_page is writing back the dirty buffers under
+ * the page lock, whoever dirtied the buffers may decide to clean them
+ * again at any time.  We handle that by only looking at the buffer
+ * state inside lock_buffer().
+ *
+ * If block_write_full_page() is called for regular writeback
+ * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
+ * locked buffer.   This only can happen if someone has written the buffer
+ * directly, with submit_bh().  At the address_space level PageWriteback
+ * prevents this contention from occurring.
+ *
+ * In this version dirty, unmapped buffers are mapped with
+ * btrfs_get_block(inode, block, bh, 0), and a buffer is only queued for
+ * async write when its b_blocknr is non-zero.
+ *
+ * The page must be locked on entry (BUG_ON otherwise) and is unlocked
+ * before returning.  Returns 0, or the error from btrfs_get_block() after
+ * the recover path has submitted the buffers that were already mapped.
+ */
+static int __btrfs_write_full_page(struct inode *inode, struct page *page,
+				   struct writeback_control *wbc)
+{
+	int err;
+	sector_t block;
+	sector_t last_block;
+	struct buffer_head *bh, *head;
+	const unsigned blocksize = 1 << inode->i_blkbits;
+	int nr_underway = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
+
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, blocksize,
+					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	}
+
+	/*
+	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
+	 * here, and the (potentially unmapped) buffers may become dirty at
+	 * any time.  If a buffer becomes dirty here after we've inspected it
+	 * then we just miss that fact, and the page stays dirty.
+	 *
+	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+	 * handle that here by just cleaning them.
+	 */
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	head = page_buffers(page);
+	bh = head;
+
+	/*
+	 * Get all the dirty buffers mapped to disk addresses and
+	 * handle any aliases from the underlying blockdev's mapping.
+	 */
+	do {
+		if (block > last_block) {
+			/*
+			 * mapped buffers outside i_size will occur, because
+			 * this page can be outside i_size when there is a
+			 * truncate in progress.
+			 */
+			/*
+			 * The buffer was zeroed by block_write_full_page()
+			 */
+			clear_buffer_dirty(bh);
+			set_buffer_uptodate(bh);
+		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+			WARN_ON(bh->b_size != blocksize);
+			err = btrfs_get_block(inode, block, bh, 0);
+			if (err) {
+				goto recover;
+			}
+			if (buffer_new(bh)) {
+				/* blockdev mappings never come here */
+				clear_buffer_new(bh);
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else if (test_set_buffer_locked(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+
+	err = 0;
+done:
+	if (nr_underway == 0) {
+		/*
+		 * The page was marked dirty, but the buffers were
+		 * clean.  Someone wrote them back by hand with
+		 * ll_rw_block/submit_bh.  A rare case.
+		 */
+		int uptodate = 1;
+		do {
+			if (!buffer_uptodate(bh)) {
+				uptodate = 0;
+				break;
+			}
+			bh = bh->b_this_page;
+		} while (bh != head);
+		if (uptodate)
+			SetPageUptodate(page);
+		end_page_writeback(page);
+	}
+	return err;
+
+recover:
+	/*
+	 * ENOSPC, or some other error.  We may already have added some
+	 * blocks to the file, so we need to write these out to avoid
+	 * exposing stale data.
+	 * The page is currently locked and not marked for writeback
+	 */
+	bh = head;
+	/* Recovery: lock and submit the mapped buffers */
+	do {
+		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+			lock_buffer(bh);
+			mark_buffer_async_write(bh);
+		} else {
+			/*
+			 * The buffer may have been set dirty during
+			 * attachment to a dirty page.
+			 */
+			clear_buffer_dirty(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(WRITE, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
+	goto done;
+}
+/* ->writepage: write a page, discarding or zeroing what lies past i_size. */
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct inode * const inode = page->mapping->host;
+	loff_t i_size = i_size_read(inode);
+	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned offset;
+	void *kaddr;
+
+	/* Is the page fully inside i_size? */
+	if (page->index < end_index)
+		return __btrfs_write_full_page(inode, page, wbc);
+
+	/* Is the page fully outside i_size? (truncate in progress) */
+	offset = i_size & (PAGE_CACHE_SIZE-1); /* bytes valid in the EOF page */
+	if (page->index >= end_index+1 || !offset) {
+		/*
+		 * The page may have dirty, unmapped buffers.  For example,
+		 * they may have been added in ext3_writepage().  Make them
+		 * freeable here, so the page does not leak.
+		 */
+		block_invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size.  It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped.  "A file is mapped
+	 * in multiples of the page size.  For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	return __btrfs_write_full_page(inode, page, wbc);
+}
+/* ->truncate: trim a regular file's items down to i_size in a transaction. */
+static void btrfs_truncate(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+
+	btrfs_truncate_page(inode->i_mapping, inode->i_size); /* ret unused */
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	/* FIXME, add redo link to tree so we don't leak on crash */
+	ret = btrfs_truncate_in_trans(trans, root, inode);
+	BUG_ON(ret);
+	btrfs_update_inode(trans, root, inode);	/* ret unused */
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+}
+/* ->commit_write: mark the page uptodate, dirty it if mapped, extend i_size. */
+int btrfs_commit_write(struct file *file, struct page *page,
+		       unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	struct buffer_head *bh;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	SetPageUptodate(page);
+	bh = page_buffers(page);	/* only this buffer_head is examined */
+	set_buffer_uptodate(bh);
+	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+		set_page_dirty(page);
+	}
+	if (pos > inode->i_size) {	/* the write ended past the old EOF */
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
+}
+/* Create empty subvolume 'name'; returns 0, or -ENOSPC if block alloc fails. */
+static int create_subvol(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct buffer_head *subvol;
+	struct btrfs_leaf *leaf;
+	struct btrfs_root *new_root;
+	struct inode *inode;
+	struct inode *dir;
+	int ret;
+	u64 objectid;
+	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	subvol = btrfs_alloc_free_block(trans, root, 0);
+	if (subvol == NULL) {	/* undo: end the transaction, drop fs_mutex */
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		return -ENOSPC;
+	}
+	leaf = btrfs_buffer_leaf(subvol);
+	btrfs_set_header_nritems(&leaf->header, 0);
+	btrfs_set_header_level(&leaf->header, 0);
+	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
+	btrfs_set_header_generation(&leaf->header, trans->transid);
+	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
+	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
+	       sizeof(leaf->header.fsid));
+	mark_buffer_dirty(subvol);
+
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	btrfs_set_inode_generation(inode_item, 1);
+	btrfs_set_inode_size(inode_item, 3);
+	btrfs_set_inode_nlink(inode_item, 1);
+	btrfs_set_inode_nblocks(inode_item, 1);
+	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
+	btrfs_set_root_refs(&root_item, 1);
+	brelse(subvol);
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	BUG_ON(ret);
+
+	btrfs_set_root_dirid(&root_item, new_dirid);
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&root_item);
+	BUG_ON(ret);
+
+	/* insert the directory item */
+	key.offset = (u64)-1;
+	dir = root->fs_info->sb->s_root->d_inode;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen, dir->i_ino, &key,
+				    BTRFS_FT_DIR);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+
+	new_root = btrfs_read_fs_root(root->fs_info, &key);
+	BUG_ON(!new_root);
+
+	trans = btrfs_start_transaction(new_root, 1);
+	BUG_ON(!trans);
+	/* NOTE(review): btrfs_new_inode() result is not IS_ERR()-checked */
+	inode = btrfs_new_inode(trans, new_root, new_dirid,
+				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
+
+	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
+	BUG_ON(ret);
+
+	inode->i_nlink = 1;
+	inode->i_size = 6;
+	ret = btrfs_update_inode(trans, new_root, inode);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, new_root);
+	BUG_ON(ret);
+
+	iput(inode);
+
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	return 0;
+}
+/* Snapshot 'root': add a root item that points at root's current top node. */
+static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item new_root_item;
+	int ret;
+	u64 objectid;
+
+	if (!root->ref_cows)
+		return -EINVAL;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_update_inode(trans, root, root->inode);
+	BUG_ON(ret);
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	BUG_ON(ret);
+
+	memcpy(&new_root_item, &root->root_item,
+	       sizeof(new_root_item));
+
+	key.objectid = objectid;
+	key.offset = 1;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&new_root_item);
+	BUG_ON(ret);
+
+	/*
+	 * insert the directory item naming the snapshot into the tree root
+	 */
+	key.offset = (u64)-1;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen,
+				    root->fs_info->sb->s_root->d_inode->i_ino,
+				    &key, BTRFS_FT_DIR);
+
+	BUG_ON(ret);
+
+	ret = btrfs_inc_root_ref(trans, root);
+	BUG_ON(ret);
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
+	return 0;
+}
+/* Directory ioctl: BTRFS_IOC_SNAP_CREATE makes a subvolume or a snapshot. */
+int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
+		cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ioctl_vol_args vol_args;
+	int ret = 0;
+	struct btrfs_dir_item *di;
+	int namelen;
+	struct btrfs_path *path;
+	u64 root_dirid;
+
+	switch (cmd) {
+	case BTRFS_IOC_SNAP_CREATE:
+		if (copy_from_user(&vol_args,
+				   (struct btrfs_ioctl_vol_args __user *)arg,
+				   sizeof(vol_args)))
+			return -EFAULT;
+		namelen = strnlen(vol_args.name, sizeof(vol_args.name));
+		if (namelen > BTRFS_VOL_NAME_MAX) /* too long (or no NUL) */
+			return -EINVAL;
+		path = btrfs_alloc_path();
+		if (!path)
+			return -ENOMEM;
+		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino;
+		mutex_lock(&root->fs_info->fs_mutex);
+		di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+				    path, root_dirid,
+				    vol_args.name, namelen, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		btrfs_free_path(path);
+		if (di && !IS_ERR(di))	/* name already taken */
+			return -EEXIST;
+
+		if (root == root->fs_info->tree_root)
+			ret = create_subvol(root, vol_args.name, namelen);
+		else
+			ret = create_snapshot(root, vol_args.name, namelen);
+		WARN_ON(ret);
+		break;
+	default:
+		return -ENOTTY;
+	}
+	return ret;
+}
+/* Compat ioctl entry: convert arg with compat_ptr(), call btrfs_ioctl(). */
+#ifdef CONFIG_COMPAT
+long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	int ret;
+	lock_kernel();	/* btrfs_ioctl() runs between lock/unlock_kernel() */
+	ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
+	unlock_kernel();
+	return ret;
+
+}
+#endif
+
+/* Allocate a btrfs_inode and return its embedded vfs_inode, or NULL.
+ * Called inside transaction, so use GFP_NOFS
+ */
+struct inode *btrfs_alloc_inode(struct super_block *sb)
+{
+	struct btrfs_inode *ei;
+
+	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+/* Free a btrfs_inode; warns if it still has dentries or cached pages. */
+void btrfs_destroy_inode(struct inode *inode)
+{
+	WARN_ON(!list_empty(&inode->i_dentry));
+	WARN_ON(inode->i_data.nrpages);
+
+	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+/* Constructor passed to btrfs_inode_cachep: set up the embedded VFS inode. */
+static void init_once(void * foo, struct kmem_cache * cachep,
+		      unsigned long flags)
+{
+	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
+
+	inode_init_once(&ei->vfs_inode);
+}
+/* Destroy each btrfs slab cache whose pointer is non-NULL. */
+void btrfs_destroy_cachep(void)
+{
+	if (btrfs_inode_cachep)
+		kmem_cache_destroy(btrfs_inode_cachep);
+	if (btrfs_trans_handle_cachep)
+		kmem_cache_destroy(btrfs_trans_handle_cachep);
+	if (btrfs_transaction_cachep)
+		kmem_cache_destroy(btrfs_transaction_cachep);
+	if (btrfs_bit_radix_cachep)
+		kmem_cache_destroy(btrfs_bit_radix_cachep);
+	if (btrfs_path_cachep)
+		kmem_cache_destroy(btrfs_path_cachep);
+}
+/* Create all btrfs slab caches; returns 0, or -ENOMEM after undoing them. */
+int btrfs_init_cachep(void)
+{
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+					     sizeof(struct btrfs_inode),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     init_once, NULL);
+	if (!btrfs_inode_cachep)
+		goto fail;
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+					     sizeof(struct btrfs_trans_handle),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_trans_handle_cachep)
+		goto fail;
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+					     sizeof(struct btrfs_transaction),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_transaction_cachep)
+		goto fail;
+	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+					     sizeof(struct btrfs_path),
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					     NULL, NULL);
+	if (!btrfs_path_cachep)
+		goto fail;
+	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
+					     256,
+					     0, (SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD |
+						SLAB_DESTROY_BY_RCU),
+					     NULL, NULL);
+	if (!btrfs_bit_radix_cachep)
+		goto fail;
+	return 0;
+fail:
+	btrfs_destroy_cachep();
+	return -ENOMEM;
+}
+/* ->getattr: generic attributes, with stat->blksize overridden to 256K. */
+static int btrfs_getattr(struct vfsmount *mnt,
+			 struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	generic_fillattr(inode, stat);
+	stat->blksize = 256 * 1024;
+	return 0;
+}
+/* ->rename: repoint ".." for a moved dir, unlink old (and target), link new. */
+static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
+			   struct inode * new_dir,struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct timespec ctime = CURRENT_TIME;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
+	int ret;
+
+	if (S_ISDIR(old_inode->i_mode) && new_inode &&
+	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+		return -ENOTEMPTY;
+	}
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, new_dir);
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out_fail; /* NOTE(review): out_fail frees NULL path */
+	}
+
+	old_dentry->d_inode->i_nlink++; /* NOTE(review): not undone on error */
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	old_inode->i_ctime = ctime;
+	if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
+		struct btrfs_key *location = &BTRFS_I(new_dir)->location;
+		u64 old_parent_oid;
+		di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino,
+					   "..", 2, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out_fail;
+		}
+		if (!di) {
+			ret = -ENOENT;
+			goto out_fail;
+		}
+		old_parent_oid = btrfs_disk_key_objectid(&di->location);
+		ret = btrfs_del_item(trans, root, path);
+		if (ret) {
+			ret = -EIO;
+			goto out_fail;
+		}
+		btrfs_release_path(root, path);
+
+		di = btrfs_lookup_dir_index_item(trans, root, path,
+						 old_inode->i_ino,
+						 old_parent_oid,
+						 "..", 2, -1);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out_fail;
+		}
+		if (!di) {
+			ret = -ENOENT;
+			goto out_fail;
+		}
+		ret = btrfs_del_item(trans, root, path);
+		if (ret) {
+			ret = -EIO;
+			goto out_fail;
+		}
+		btrfs_release_path(root, path);
+
+		ret = btrfs_insert_dir_item(trans, root, "..", 2,
+					    old_inode->i_ino, location,
+					    BTRFS_FT_DIR);
+		if (ret)
+			goto out_fail;
+	}
+
+
+	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
+	if (ret)
+		goto out_fail;
+
+	if (new_inode) {
+		new_inode->i_ctime = CURRENT_TIME;
+		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
+		if (ret)
+			goto out_fail;
+		if (S_ISDIR(new_inode->i_mode))
+			clear_nlink(new_inode);
+		else
+			drop_nlink(new_inode);
+		btrfs_update_inode(trans, root, new_inode); /* ret unused */
+	}
+	ret = btrfs_add_link(trans, new_dentry, old_inode);
+	if (ret)
+		goto out_fail;
+
+out_fail:
+	btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+/* ->symlink: create the inode and store the target as an inline file extent. */
+static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct inode *inode;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+	int name_len;
+	int datasize;
+	char *ptr;
+	struct btrfs_file_extent_item *ei;
+
+	name_len = strlen(symname) + 1;	/* counts the trailing NUL */
+	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
+		return -ENAMETOOLONG;
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_unlock;
+
+	btrfs_set_trans_block_group(trans, inode);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	else {
+		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_fop = &btrfs_file_operations;
+		inode->i_op = &btrfs_file_inode_operations;
+	}
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+	if (drop_inode)
+		goto out_unlock;
+	/* store the target string, NUL included, as an inline extent item */
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	key.objectid = inode->i_ino;
+	key.offset = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	datasize = btrfs_file_extent_calc_inline_size(name_len);
+	err = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	BUG_ON(err);
+	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+	       path->slots[0], struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(ei, trans->transid);
+	btrfs_set_file_extent_type(ei,
+				   BTRFS_FILE_EXTENT_INLINE);
+	ptr = btrfs_file_extent_inline_start(ei);
+	btrfs_memcpy(root, path->nodes[0]->b_data,
+		     ptr, symname, name_len);
+	mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+	inode->i_op = &btrfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_symlink_aops;
+	inode->i_size = name_len - 1;	/* length without the NUL */
+	btrfs_update_inode(trans, root, inode);
+	err = 0;
+
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+/* Inode operations for directories: the full set of namespace handlers. */
+static struct inode_operations btrfs_dir_inode_operations = {
+	.lookup		= btrfs_lookup,
+	.create		= btrfs_create,
+	.unlink		= btrfs_unlink,
+	.link		= btrfs_link,
+	.mkdir		= btrfs_mkdir,
+	.rmdir		= btrfs_rmdir,
+	.rename		= btrfs_rename,
+	.symlink	= btrfs_symlink,
+	.setattr	= btrfs_setattr,
+};
+/* Lookup-only directory inode operations (no create/unlink/rename/...). */
+static struct inode_operations btrfs_dir_ro_inode_operations = {
+	.lookup		= btrfs_lookup,
+};
+/* File operations for directories: readdir plus the btrfs ioctl handlers. */
+static struct file_operations btrfs_dir_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= btrfs_readdir,
+	.ioctl		= btrfs_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= btrfs_compat_ioctl,
+#endif
+};
+/* Address space operations: page read/write and prepare/commit_write hooks. */
+static struct address_space_operations btrfs_aops = {
+	.readpage	= btrfs_readpage,
+	.writepage	= btrfs_writepage,
+	.sync_page	= block_sync_page,
+	.prepare_write	= btrfs_prepare_write,
+	.commit_write	= btrfs_commit_write,
+	.bmap		= btrfs_bmap,
+};
+/* Address space operations for symlinks: only readpage and writepage. */
+static struct address_space_operations btrfs_symlink_aops = {
+	.readpage	= btrfs_readpage,
+	.writepage	= btrfs_writepage,
+};
+/* Inode operations installed on regular files: truncate, getattr, setattr. */
+static struct inode_operations btrfs_file_inode_operations = {
+	.truncate	= btrfs_truncate,
+	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
+};
+/* Inode operations for symlinks, built from the generic page-based helpers. */
+static struct inode_operations btrfs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9749090cfbb..94b3535a2a1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -20,2686 +20,101 @@
 #include "ioctl.h"
 #include "print-tree.h"
 
-struct btrfs_iget_args {
-	u64 ino;
-	struct btrfs_root *root;
-};
-
-#define BTRFS_SUPER_MAGIC 0x9123682E
-
-static struct inode_operations btrfs_dir_inode_operations;
-static struct inode_operations btrfs_symlink_inode_operations;
-static struct inode_operations btrfs_dir_ro_inode_operations;
-static struct super_operations btrfs_super_ops;
-static struct file_operations btrfs_dir_file_operations;
-static struct inode_operations btrfs_file_inode_operations;
-static struct address_space_operations btrfs_aops;
-static struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_file_operations;
-
-static int drop_extents(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct inode *inode,
-			  u64 start, u64 end, u64 *hint_block);
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create);
-
-
-#define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
-	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
-	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
-};
-
-static void btrfs_read_locked_inode(struct inode *inode)
-{
-	struct btrfs_path *path;
-	struct btrfs_inode_item *inode_item;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_key location;
-	u64 alloc_group_block;
-	int ret;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	mutex_lock(&root->fs_info->fs_mutex);
-
-	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
-	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-	if (ret) {
-		btrfs_free_path(path);
-		goto make_bad;
-	}
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_inode_item);
-
-	inode->i_mode = btrfs_inode_mode(inode_item);
-	inode->i_nlink = btrfs_inode_nlink(inode_item);
-	inode->i_uid = btrfs_inode_uid(inode_item);
-	inode->i_gid = btrfs_inode_gid(inode_item);
-	inode->i_size = btrfs_inode_size(inode_item);
-	inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
-	inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
-	inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
-	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
-	inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
-	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
-	inode->i_blocks = btrfs_inode_nblocks(inode_item);
-	inode->i_generation = btrfs_inode_generation(inode_item);
-	alloc_group_block = btrfs_inode_block_group(inode_item);
-	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
-						       alloc_group_block);
-
-	btrfs_free_path(path);
-	inode_item = NULL;
-
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	switch (inode->i_mode & S_IFMT) {
-#if 0
-	default:
-		init_special_inode(inode, inode->i_mode,
-				   btrfs_inode_rdev(inode_item));
-		break;
-#endif
-	case S_IFREG:
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
-		break;
-	case S_IFDIR:
-		inode->i_fop = &btrfs_dir_file_operations;
-		if (root == root->fs_info->tree_root)
-			inode->i_op = &btrfs_dir_ro_inode_operations;
-		else
-			inode->i_op = &btrfs_dir_inode_operations;
-		break;
-	case S_IFLNK:
-		inode->i_op = &btrfs_symlink_inode_operations;
-		inode->i_mapping->a_ops = &btrfs_symlink_aops;
-		break;
-	}
-	return;
-
-make_bad:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	make_bad_inode(inode);
-}
-
-static void fill_inode_item(struct btrfs_inode_item *item,
-			    struct inode *inode)
-{
-	btrfs_set_inode_uid(item, inode->i_uid);
-	btrfs_set_inode_gid(item, inode->i_gid);
-	btrfs_set_inode_size(item, inode->i_size);
-	btrfs_set_inode_mode(item, inode->i_mode);
-	btrfs_set_inode_nlink(item, inode->i_nlink);
-	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
-	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
-	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
-	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
-	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
-	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
-	btrfs_set_inode_nblocks(item, inode->i_blocks);
-	btrfs_set_inode_generation(item, inode->i_generation);
-	btrfs_set_inode_block_group(item,
-				    BTRFS_I(inode)->block_group->key.objectid);
-}
-
-static int btrfs_update_inode(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *inode)
-{
-	struct btrfs_inode_item *inode_item;
-	struct btrfs_path *path;
-	int ret;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, 1);
-	if (ret) {
-		if (ret > 0)
-			ret = -ENOENT;
-		goto failed;
-	}
-
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_inode_item);
-
-	fill_inode_item(inode_item, inode);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-	ret = 0;
-failed:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
-
-static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *dir,
-			      struct dentry *dentry)
-{
-	struct btrfs_path *path;
-	const char *name = dentry->d_name.name;
-	int name_len = dentry->d_name.len;
-	int ret = 0;
-	u64 objectid;
-	struct btrfs_dir_item *di;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
-				    name, name_len, -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto err;
-	}
-	if (!di) {
-		ret = -ENOENT;
-		goto err;
-	}
-	objectid = btrfs_disk_key_objectid(&di->location);
-	ret = btrfs_delete_one_dir_name(trans, root, path, di);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-
-	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-					 objectid, name, name_len, -1);
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto err;
-	}
-	if (!di) {
-		ret = -ENOENT;
-		goto err;
-	}
-	ret = btrfs_delete_one_dir_name(trans, root, path, di);
-	BUG_ON(ret);
-
-	dentry->d_inode->i_ctime = dir->i_ctime;
-err:
-	btrfs_free_path(path);
-	if (!ret) {
-		dir->i_size -= name_len * 2;
-		btrfs_update_inode(trans, root, dir);
-		drop_nlink(dentry->d_inode);
-		btrfs_update_inode(trans, root, dentry->d_inode);
-		dir->i_sb->s_dirt = 1;
-	}
-	return ret;
-}
-
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct btrfs_root *root;
-	struct btrfs_trans_handle *trans;
-	int ret;
-
-	root = BTRFS_I(dir)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-	ret = btrfs_unlink_trans(trans, root, dir, dentry);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
-	return ret;
-}
-
-static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key found_key;
-	int found_type;
-	struct btrfs_leaf *leaf;
-	char *goodnames = "..";
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-	key.objectid = inode->i_ino;
-	key.offset = (u64)-1;
-	key.flags = (u32)-1;
-	while(1) {
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0) {
-			err = ret;
-			goto out;
-		}
-		BUG_ON(ret == 0);
-		if (path->slots[0] == 0) {
-			err = -ENOENT;
-			goto out;
-		}
-		path->slots[0]--;
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[path->slots[0]].key);
-		found_type = btrfs_key_type(&found_key);
-		if (found_key.objectid != inode->i_ino) {
-			err = -ENOENT;
-			goto out;
-		}
-		if ((found_type != BTRFS_DIR_ITEM_KEY &&
-		     found_type != BTRFS_DIR_INDEX_KEY) ||
-	            (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
-	            !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
-			err = -ENOTEMPTY;
-			goto out;
-		}
-		ret = btrfs_del_item(trans, root, path);
-		BUG_ON(ret);
-
-		if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
-			break;
-		btrfs_release_path(root, path);
-	}
-	ret = 0;
-	btrfs_release_path(root, path);
-
-	/* now the directory is empty */
-	err = btrfs_unlink_trans(trans, root, dir, dentry);
-	if (!err) {
-		inode->i_size = 0;
-	}
-out:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	ret = btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
-	if (ret && !err)
-		err = ret;
-	return err;
-}
-
-static int btrfs_free_inode(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct inode *inode)
-{
-	struct btrfs_path *path;
-	int ret;
-
-	clear_inode(inode);
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, -1);
-	BUG_ON(ret);
-	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
-	btrfs_free_path(path);
-	return ret;
-}
-
-static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
-			   u64 objectid)
-{
-	struct btrfs_node *node;
-	int i;
-	int nritems;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-	int ret;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1];
-	if (slot == 0)
-		return;
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot - 1; i >= 0; i--) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid != objectid)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
-		if (ret)
-			break;
-	}
-}
-
-static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct inode *inode)
-{
-	int ret;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct btrfs_disk_key *found_key;
-	u32 found_type;
-	struct btrfs_leaf *leaf;
-	struct btrfs_file_extent_item *fi;
-	u64 extent_start = 0;
-	u64 extent_num_blocks = 0;
-	u64 item_end = 0;
-	int found_extent;
-	int del_item;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	/* FIXME, add redo link to tree so we don't leak on crash */
-	key.objectid = inode->i_ino;
-	key.offset = (u64)-1;
-	key.flags = (u32)-1;
-	while(1) {
-		btrfs_init_path(path);
-		fi = NULL;
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0) {
-			goto error;
-		}
-		if (ret > 0) {
-			BUG_ON(path->slots[0] == 0);
-			path->slots[0]--;
-		}
-		reada_truncate(root, path, inode->i_ino);
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		found_key = &leaf->items[path->slots[0]].key;
-		found_type = btrfs_disk_key_type(found_key);
-		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
-			break;
-		if (found_type != BTRFS_CSUM_ITEM_KEY &&
-		    found_type != BTRFS_DIR_ITEM_KEY &&
-		    found_type != BTRFS_DIR_INDEX_KEY &&
-		    found_type != BTRFS_EXTENT_DATA_KEY)
-			break;
-		item_end = btrfs_disk_key_offset(found_key);
-		if (found_type == BTRFS_EXTENT_DATA_KEY) {
-			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-					    path->slots[0],
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(fi) !=
-			    BTRFS_FILE_EXTENT_INLINE) {
-				item_end += btrfs_file_extent_num_blocks(fi) <<
-						inode->i_blkbits;
-			}
-		}
-		if (found_type == BTRFS_CSUM_ITEM_KEY) {
-			ret = btrfs_csum_truncate(trans, root, path,
-						  inode->i_size);
-			BUG_ON(ret);
-		}
-		if (item_end < inode->i_size) {
-			if (found_type) {
-				btrfs_set_key_type(&key, found_type - 1);
-				continue;
-			}
-			break;
-		}
-		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
-			del_item = 1;
-		else
-			del_item = 0;
-		found_extent = 0;
-
-		if (found_type == BTRFS_EXTENT_DATA_KEY &&
-			   btrfs_file_extent_type(fi) !=
-			   BTRFS_FILE_EXTENT_INLINE) {
-			u64 num_dec;
-			if (!del_item) {
-				u64 orig_num_blocks =
-					btrfs_file_extent_num_blocks(fi);
-				extent_num_blocks = inode->i_size -
-					btrfs_disk_key_offset(found_key) +
-					root->blocksize - 1;
-				extent_num_blocks >>= inode->i_blkbits;
-				btrfs_set_file_extent_num_blocks(fi,
-							 extent_num_blocks);
-				inode->i_blocks -= (orig_num_blocks -
-					extent_num_blocks) << 3;
-				mark_buffer_dirty(path->nodes[0]);
-			} else {
-				extent_start =
-					btrfs_file_extent_disk_blocknr(fi);
-				extent_num_blocks =
-					btrfs_file_extent_disk_num_blocks(fi);
-				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_blocks(fi) << 3;
-				if (extent_start != 0) {
-					found_extent = 1;
-					inode->i_blocks -= num_dec;
-				}
-			}
-		}
-		if (del_item) {
-			ret = btrfs_del_item(trans, root, path);
-			BUG_ON(ret);
-		} else {
-			break;
-		}
-		btrfs_release_path(root, path);
-		if (found_extent) {
-			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_blocks, 0);
-			BUG_ON(ret);
-		}
-	}
-	ret = 0;
-error:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	inode->i_sb->s_dirt = 1;
-	return ret;
-}
-
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
-{
-	struct inode *inode = mapping->host;
-	unsigned blocksize = 1 << inode->i_blkbits;
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
-	struct page *page;
-	char *kaddr;
-	int ret = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 alloc_hint;
-	struct btrfs_key ins;
-	struct btrfs_trans_handle *trans;
-
-	if ((offset & (blocksize - 1)) == 0)
-		goto out;
-
-	ret = -ENOMEM;
-	page = grab_cache_page(mapping, index);
-	if (!page)
-		goto out;
-
-	if (!PageUptodate(page)) {
-		ret = mpage_readpage(page, btrfs_get_block);
-		lock_page(page);
-		if (!PageUptodate(page)) {
-			ret = -EIO;
-			goto out;
-		}
-	}
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
-	ret = drop_extents(trans, root, inode, page->index << PAGE_CACHE_SHIFT,
-			   (page->index + 1) << PAGE_CACHE_SHIFT, &alloc_hint);
-	BUG_ON(ret);
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
-				 alloc_hint, (u64)-1, &ins, 1);
-	BUG_ON(ret);
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       page->index << PAGE_CACHE_SHIFT,
-				       ins.objectid, 1, 1);
-	BUG_ON(ret);
-	SetPageChecked(page);
-	kaddr = kmap(page);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	btrfs_csum_file_block(trans, root, inode->i_ino,
-			      page->index << PAGE_CACHE_SHIFT,
-			      kaddr, PAGE_CACHE_SIZE);
-	kunmap(page);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	set_page_dirty(page);
-	unlock_page(page);
-	page_cache_release(page);
-out:
-	return ret;
-}
-
-static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = inode_change_ok(inode, attr);
-	if (err)
-		return err;
-
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-		u64 mask = root->blocksize - 1;
-		u64 pos = (inode->i_size + mask) & ~mask;
-		u64 hole_size;
-
-		if (attr->ia_size <= pos)
-			goto out;
-
-		btrfs_truncate_page(inode->i_mapping, inode->i_size);
-
-		hole_size = (attr->ia_size - pos + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
-
-		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
-		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       pos, 0, 0, hole_size);
-		BUG_ON(err);
-		btrfs_end_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
-	}
-out:
-	err = inode_setattr(inode, attr);
-
-	return err;
-}
-static void btrfs_delete_inode(struct inode *inode)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-
-	truncate_inode_pages(&inode->i_data, 0);
-	if (is_bad_inode(inode)) {
-		goto no_delete;
-	}
-	inode->i_size = 0;
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-	ret = btrfs_truncate_in_trans(trans, root, inode);
-	BUG_ON(ret);
-	btrfs_free_inode(trans, root, inode);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
-	return;
-no_delete:
-	clear_inode(inode);
-}
-
-static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
-			       struct btrfs_key *location)
-{
-	const char *name = dentry->d_name.name;
-	int namelen = dentry->d_name.len;
-	struct btrfs_dir_item *di;
-	struct btrfs_path *path;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	int ret;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
-				    namelen, 0);
-	if (!di || IS_ERR(di)) {
-		location->objectid = 0;
-		ret = 0;
-		goto out;
-	}
-	btrfs_disk_key_to_cpu(location, &di->location);
-out:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
-static int fixup_tree_root_location(struct btrfs_root *root,
-			     struct btrfs_key *location,
-			     struct btrfs_root **sub_root)
-{
-	struct btrfs_path *path;
-	struct btrfs_root_item *ri;
-
-	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
-		return 0;
-	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return 0;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
-
-	*sub_root = btrfs_read_fs_root(root->fs_info, location);
-	if (IS_ERR(*sub_root))
-		return PTR_ERR(*sub_root);
-
-	ri = &(*sub_root)->root_item;
-	location->objectid = btrfs_root_dirid(ri);
-	location->flags = 0;
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-	location->offset = 0;
-
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return 0;
-}
-
-static int btrfs_init_locked_inode(struct inode *inode, void *p)
-{
-	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->ino;
-	BTRFS_I(inode)->root = args->root;
-	return 0;
-}
-
-static int btrfs_find_actor(struct inode *inode, void *opaque)
-{
-	struct btrfs_iget_args *args = opaque;
-	return (args->ino == inode->i_ino &&
-		args->root == BTRFS_I(inode)->root);
-}
-
-static struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				       struct btrfs_root *root)
-{
-	struct inode *inode;
-	struct btrfs_iget_args args;
-	args.ino = objectid;
-	args.root = root;
-
-	inode = iget5_locked(s, objectid, btrfs_find_actor,
-			     btrfs_init_locked_inode,
-			     (void *)&args);
-	return inode;
-}
-
-static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
-{
-	struct inode * inode;
-	struct btrfs_inode *bi = BTRFS_I(dir);
-	struct btrfs_root *root = bi->root;
-	struct btrfs_root *sub_root = root;
-	struct btrfs_key location;
-	int ret;
-
-	if (dentry->d_name.len > BTRFS_NAME_LEN)
-		return ERR_PTR(-ENAMETOOLONG);
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_inode_by_name(dir, dentry, &location);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	if (ret < 0)
-		return ERR_PTR(ret);
-	inode = NULL;
-	if (location.objectid) {
-		ret = fixup_tree_root_location(root, &location, &sub_root);
-		if (ret < 0)
-			return ERR_PTR(ret);
-		if (ret > 0)
-			return ERR_PTR(-ENOENT);
-		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
-					  sub_root);
-		if (!inode)
-			return ERR_PTR(-EACCES);
-		if (inode->i_state & I_NEW) {
-			if (sub_root != root) {
-printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
-				igrab(inode);
-				sub_root->inode = inode;
-			}
-			BTRFS_I(inode)->root = sub_root;
-			memcpy(&BTRFS_I(inode)->location, &location,
-			       sizeof(location));
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
-		}
-	}
-	return d_splice_alias(inode, dentry);
-}
-
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
-			 u64 objectid)
-{
-	struct btrfs_node *node;
-	int i;
-	u32 nritems;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-	int ret;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1];
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot + 1; i < nritems; i++) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid != objectid)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
-		if (ret)
-			break;
-	}
-}
-static unsigned char btrfs_filetype_table[] = {
-	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
-
-static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
-{
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_item *item;
-	struct btrfs_dir_item *di;
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	int ret;
-	u32 nritems;
-	struct btrfs_leaf *leaf;
-	int slot;
-	int advance;
-	unsigned char d_type;
-	int over = 0;
-	u32 di_cur;
-	u32 di_total;
-	u32 di_len;
-	int key_type = BTRFS_DIR_INDEX_KEY;
-
-	/* FIXME, use a real flag for deciding about the key type */
-	if (root->fs_info->tree_root == root)
-		key_type = BTRFS_DIR_ITEM_KEY;
-	mutex_lock(&root->fs_info->fs_mutex);
-	key.objectid = inode->i_ino;
-	key.flags = 0;
-	btrfs_set_key_type(&key, key_type);
-	key.offset = filp->f_pos;
-	path = btrfs_alloc_path();
-	btrfs_init_path(path);
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto err;
-	advance = 0;
-	reada_leaves(root, path, inode->i_ino);
-	while(1) {
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		nritems = btrfs_header_nritems(&leaf->header);
-		slot = path->slots[0];
-		if (advance || slot >= nritems) {
-			if (slot >= nritems -1) {
-				reada_leaves(root, path, inode->i_ino);
-				ret = btrfs_next_leaf(root, path);
-				if (ret)
-					break;
-				leaf = btrfs_buffer_leaf(path->nodes[0]);
-				nritems = btrfs_header_nritems(&leaf->header);
-				slot = path->slots[0];
-			} else {
-				slot++;
-				path->slots[0]++;
-			}
-		}
-		advance = 1;
-		item = leaf->items + slot;
-		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
-			break;
-		if (btrfs_disk_key_type(&item->key) != key_type)
-			break;
-		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
-			continue;
-		filp->f_pos = btrfs_disk_key_offset(&item->key);
-		advance = 1;
-		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-		di_cur = 0;
-		di_total = btrfs_item_size(leaf->items + slot);
-		while(di_cur < di_total) {
-			d_type = btrfs_filetype_table[btrfs_dir_type(di)];
-			over = filldir(dirent, (const char *)(di + 1),
-				       btrfs_dir_name_len(di),
-				       btrfs_disk_key_offset(&item->key),
-				       btrfs_disk_key_objectid(&di->location),
-				       d_type);
-			if (over)
-				goto nopos;
-			di_len = btrfs_dir_name_len(di) + sizeof(*di);
-			di_cur += di_len;
-			di = (struct btrfs_dir_item *)((char *)di + di_len);
-		}
-	}
-	filp->f_pos++;
-nopos:
-	ret = 0;
-err:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
-static void btrfs_put_super (struct super_block * sb)
-{
-	struct btrfs_root *root = btrfs_sb(sb);
-	int ret;
-
-	ret = close_ctree(root);
-	if (ret) {
-		printk("close ctree returns %d\n", ret);
-	}
-	sb->s_fs_info = NULL;
-}
-
-static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
-{
-	struct inode * inode;
-	struct dentry * root_dentry;
-	struct btrfs_super_block *disk_super;
-	struct btrfs_root *tree_root;
-	struct btrfs_inode *bi;
-
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_magic = BTRFS_SUPER_MAGIC;
-	sb->s_op = &btrfs_super_ops;
-	sb->s_time_gran = 1;
-
-	tree_root = open_ctree(sb);
-
-	if (!tree_root) {
-		printk("btrfs: open_ctree failed\n");
-		return -EIO;
-	}
-	sb->s_fs_info = tree_root;
-	disk_super = tree_root->fs_info->disk_super;
-	printk("read in super total blocks %Lu root %Lu\n",
-	       btrfs_super_total_blocks(disk_super),
-	       btrfs_super_root_dir(disk_super));
-
-	inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
-				  tree_root);
-	bi = BTRFS_I(inode);
-	bi->location.objectid = inode->i_ino;
-	bi->location.offset = 0;
-	bi->location.flags = 0;
-	bi->root = tree_root;
-	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
-
-	if (!inode)
-		return -ENOMEM;
-	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
-	}
-
-	root_dentry = d_alloc_root(inode);
-	if (!root_dentry) {
-		iput(inode);
-		return -ENOMEM;
-	}
-	sb->s_root = root_dentry;
-	btrfs_transaction_queue_work(tree_root, HZ * 30);
-	return 0;
-}
-
-static int btrfs_write_inode(struct inode *inode, int wait)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-	int ret = 0;
-
-	if (wait) {
-		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
-		ret = btrfs_commit_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
-	}
-	return ret;
-}
-
-static void btrfs_dirty_inode(struct inode *inode)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-	btrfs_update_inode(trans, root, inode);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
-}
-
-static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root,
-				     u64 objectid,
-				     struct btrfs_block_group_cache *group,
-				     int mode)
-{
-	struct inode *inode;
-	struct btrfs_inode_item inode_item;
-	struct btrfs_key *location;
-	int ret;
-	int owner;
-
-	inode = new_inode(root->fs_info->sb);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	BTRFS_I(inode)->root = root;
-	if (mode & S_IFDIR)
-		owner = 0;
-	else
-		owner = 1;
-	group = btrfs_find_block_group(root, group, 0, 0, owner);
-	BTRFS_I(inode)->block_group = group;
-
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
-	inode->i_mode = mode;
-	inode->i_ino = objectid;
-	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	fill_inode_item(&inode_item, inode);
-	location = &BTRFS_I(inode)->location;
-	location->objectid = objectid;
-	location->flags = 0;
-	location->offset = 0;
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
-	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
-	BUG_ON(ret);
-
-	insert_inode_hash(inode);
-	return inode;
-}
-
-static inline u8 btrfs_inode_type(struct inode *inode)
-{
-	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
-}
-
-static int btrfs_add_link(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode)
-{
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
-	key.objectid = inode->i_ino;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-	key.offset = 0;
-
-	ret = btrfs_insert_dir_item(trans, root,
-				    dentry->d_name.name, dentry->d_name.len,
-				    dentry->d_parent->d_inode->i_ino,
-				    &key, btrfs_inode_type(inode));
-	if (ret == 0) {
-		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
-		ret = btrfs_update_inode(trans, root,
-					 dentry->d_parent->d_inode);
-	}
-	return ret;
-}
-
-static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode)
-{
-	int err = btrfs_add_link(trans, dentry, inode);
-	if (!err) {
-		d_instantiate(dentry, inode);
-		return 0;
-	}
-	if (err > 0)
-		err = -EEXIST;
-	return err;
-}
-
-static int btrfs_create(struct inode *dir, struct dentry *dentry,
-			int mode, struct nameidata *nd)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct inode *inode;
-	int err;
-	int drop_inode = 0;
-	u64 objectid;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
-	inode = btrfs_new_inode(trans, root, objectid,
-				BTRFS_I(dir)->block_group, mode);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out_unlock;
-
-	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode);
-	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
-	}
-	dir->i_sb->s_dirt = 1;
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
-out_unlock:
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	if (drop_inode) {
-		inode_dec_link_count(inode);
-		iput(inode);
-	}
-	btrfs_btree_balance_dirty(root);
-	return err;
-}
-
-static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
-		      struct dentry *dentry)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct inode *inode = old_dentry->d_inode;
-	int err;
-	int drop_inode = 0;
-
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
-	inc_nlink(inode);
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-	atomic_inc(&inode->i_count);
-	err = btrfs_add_nondir(trans, dentry, inode);
-	if (err)
-		drop_inode = 1;
-	dir->i_sb->s_dirt = 1;
-	btrfs_update_inode_block_group(trans, dir);
-	btrfs_update_inode(trans, root, inode);
-
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	if (drop_inode) {
-		inode_dec_link_count(inode);
-		iput(inode);
-	}
-	btrfs_btree_balance_dirty(root);
-	return err;
-}
-
-static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 objectid, u64 dirid)
-{
-	int ret;
-	char buf[2];
-	struct btrfs_key key;
-
-	buf[0] = '.';
-	buf[1] = '.';
-
-	key.objectid = objectid;
-	key.offset = 0;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-
-	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
-				    &key, BTRFS_FT_DIR);
-	if (ret)
-		goto error;
-	key.objectid = dirid;
-	ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
-				    &key, BTRFS_FT_DIR);
-	if (ret)
-		goto error;
-error:
-	return ret;
-}
-
-static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
-	struct inode *inode;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	int err = 0;
-	int drop_on_err = 0;
-	u64 objectid;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
-		goto out_unlock;
-	}
-
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
-	inode = btrfs_new_inode(trans, root, objectid,
-				BTRFS_I(dir)->block_group, S_IFDIR | mode);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		goto out_fail;
-	}
-	drop_on_err = 1;
-	inode->i_op = &btrfs_dir_inode_operations;
-	inode->i_fop = &btrfs_dir_file_operations;
-	btrfs_set_trans_block_group(trans, inode);
-
-	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
-	if (err)
-		goto out_fail;
-
-	inode->i_size = 6;
-	err = btrfs_update_inode(trans, root, inode);
-	if (err)
-		goto out_fail;
-	err = btrfs_add_link(trans, dentry, inode);
-	if (err)
-		goto out_fail;
-	d_instantiate(dentry, inode);
-	drop_on_err = 0;
-	dir->i_sb->s_dirt = 1;
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
-
-out_fail:
-	btrfs_end_transaction(trans, root);
-out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	if (drop_on_err)
-		iput(inode);
-	btrfs_btree_balance_dirty(root);
-	return err;
-}
-
-static int btrfs_sync_file(struct file *file,
-			   struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-	struct btrfs_trans_handle *trans;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	ret = btrfs_commit_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-out:
-	return ret > 0 ? EIO : ret;
-}
-
-static int btrfs_sync_fs(struct super_block *sb, int wait)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root;
-	int ret;
-	root = btrfs_sb(sb);
-
-	sb->s_dirt = 0;
-	if (!wait) {
-		filemap_flush(root->fs_info->btree_inode->i_mapping);
-		return 0;
-	}
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_commit_transaction(trans, root);
-	sb->s_dirt = 0;
-	BUG_ON(ret);
-printk("btrfs sync_fs\n");
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return 0;
-}
-
-#define BTRFS_GET_BLOCK_NO_CREATE 0
-#define BTRFS_GET_BLOCK_CREATE 1
-#define BTRFS_GET_BLOCK_NO_DIRECT 2
-
-static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create)
-{
-	int ret;
-	int err = 0;
-	u64 blocknr;
-	u64 extent_start = 0;
-	u64 extent_end = 0;
-	u64 objectid = inode->i_ino;
-	u32 found_type;
-	u64 alloc_hint = 0;
-	struct btrfs_path *path;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_file_extent_item *item;
-	struct btrfs_leaf *leaf;
-	struct btrfs_disk_key *found_key;
-	struct btrfs_trans_handle *trans = NULL;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	btrfs_init_path(path);
-	if (create & BTRFS_GET_BLOCK_CREATE) {
-		WARN_ON(1);
-		/* this almost but not quite works */
-		trans = btrfs_start_transaction(root, 1);
-		if (!trans) {
-			err = -ENOMEM;
-			goto out;
-		}
-		ret = drop_extents(trans, root, inode,
-				   iblock << inode->i_blkbits,
-				   (iblock + 1) << inode->i_blkbits,
-				   &alloc_hint);
-		BUG_ON(ret);
-	}
-
-	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       inode->i_ino,
-				       iblock << inode->i_blkbits, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-
-	if (ret != 0) {
-		if (path->slots[0] == 0) {
-			btrfs_release_path(root, path);
-			goto not_found;
-		}
-		path->slots[0]--;
-	}
-
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			      struct btrfs_file_extent_item);
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	blocknr = btrfs_file_extent_disk_blocknr(item);
-	blocknr += btrfs_file_extent_offset(item);
-
-	/* are we inside the extent that was found? */
-	found_key = &leaf->items[path->slots[0]].key;
-	found_type = btrfs_disk_key_type(found_key);
-	if (btrfs_disk_key_objectid(found_key) != objectid ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		extent_end = 0;
-		extent_start = 0;
-		goto not_found;
-	}
-	found_type = btrfs_file_extent_type(item);
-	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		extent_start = extent_start >> inode->i_blkbits;
-		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
-		err = 0;
-		if (btrfs_file_extent_disk_blocknr(item) == 0)
-			goto out;
-		if (iblock >= extent_start && iblock < extent_end) {
-			btrfs_map_bh_to_logical(root, result, blocknr +
-						iblock - extent_start);
-			goto out;
-		}
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		char *ptr;
-		char *map;
-		u32 size;
-
-		if (create & BTRFS_GET_BLOCK_NO_DIRECT) {
-			err = -EINVAL;
-			goto out;
-		}
-		size = btrfs_file_extent_inline_len(leaf->items +
-						    path->slots[0]);
-		extent_end = (extent_start + size) >> inode->i_blkbits;
-		extent_start >>= inode->i_blkbits;
-		if (iblock < extent_start || iblock > extent_end) {
-			goto not_found;
-		}
-		ptr = btrfs_file_extent_inline_start(item);
-		map = kmap(result->b_page);
-		memcpy(map, ptr, size);
-		memset(map + size, 0, PAGE_CACHE_SIZE - size);
-		flush_dcache_page(result->b_page);
-		kunmap(result->b_page);
-		set_buffer_uptodate(result);
-		SetPageChecked(result->b_page);
-		btrfs_map_bh_to_logical(root, result, 0);
-	}
-not_found:
-	if (create & BTRFS_GET_BLOCK_CREATE) {
-		struct btrfs_key ins;
-		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 1, alloc_hint, (u64)-1,
-					 &ins, 1);
-		BUG_ON(ret);
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       iblock << inode->i_blkbits,
-					       ins.objectid, ins.offset,
-					       ins.offset);
-		BUG_ON(ret);
-		SetPageChecked(result->b_page);
-		btrfs_map_bh_to_logical(root, result, ins.objectid);
-	}
-out:
-	if (trans)
-		err = btrfs_end_transaction(trans, root);
-	btrfs_free_path(path);
-	return err;
-}
-
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create)
-{
-	int err;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	err = btrfs_get_block_lock(inode, iblock, result, create);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return err;
-}
-
-static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return 0;
-}
-
-static sector_t btrfs_bmap(struct address_space *as, sector_t block)
-{
-	return generic_block_bmap(as, block, btrfs_get_block_bmap);
-}
-
-static int btrfs_prepare_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	return block_prepare_write(page, from, to, btrfs_get_block);
-}
-
-static void btrfs_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
-static int btrfs_readpage(struct file *file, struct page *page)
-{
-	return mpage_readpage(page, btrfs_get_block);
-}
-
-/*
- * While block_write_full_page is writing back the dirty buffers under
- * the page lock, whoever dirtied the buffers may decide to clean them
- * again at any time.  We handle that by only looking at the buffer
- * state inside lock_buffer().
- *
- * If block_write_full_page() is called for regular writeback
- * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
- * locked buffer.   This only can happen if someone has written the buffer
- * directly, with submit_bh().  At the address_space level PageWriteback
- * prevents this contention from occurring.
- */
-static int __btrfs_write_full_page(struct inode *inode, struct page *page,
-				   struct writeback_control *wbc)
-{
-	int err;
-	sector_t block;
-	sector_t last_block;
-	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
-	int nr_underway = 0;
-
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-
-	/*
-	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
-	 * here, and the (potentially unmapped) buffers may become dirty at
-	 * any time.  If a buffer becomes dirty here after we've inspected it
-	 * then we just miss that fact, and the page stays dirty.
-	 *
-	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
-	 * handle that here by just cleaning them.
-	 */
-
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
-	bh = head;
-
-	/*
-	 * Get all the dirty buffers mapped to disk addresses and
-	 * handle any aliases from the underlying blockdev's mapping.
-	 */
-	do {
-		if (block > last_block) {
-			/*
-			 * mapped buffers outside i_size will occur, because
-			 * this page can be outside i_size when there is a
-			 * truncate in progress.
-			 */
-			/*
-			 * The buffer was zeroed by block_write_full_page()
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
-			WARN_ON(bh->b_size != blocksize);
-			err = btrfs_get_block(inode, block, bh, 0);
-			if (err) {
-printk("writepage going to recovery err %d\n", err);
-				goto recover;
-			}
-			if (buffer_new(bh)) {
-				/* blockdev mappings never come here */
-				clear_buffer_new(bh);
-			}
-		}
-		bh = bh->b_this_page;
-		block++;
-	} while (bh != head);
-
-	do {
-		if (!buffer_mapped(bh))
-			continue;
-		/*
-		 * If it's a fully non-blocking write attempt and we cannot
-		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
-		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
-			lock_buffer(bh);
-		} else if (test_set_buffer_locked(bh)) {
-			redirty_page_for_writepage(wbc, page);
-			continue;
-		}
-		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
-			mark_buffer_async_write(bh);
-		} else {
-			unlock_buffer(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	/*
-	 * The page and its buffers are protected by PageWriteback(), so we can
-	 * drop the bh refcounts early.
-	 */
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-
-	err = 0;
-done:
-	if (nr_underway == 0) {
-		/*
-		 * The page was marked dirty, but the buffers were
-		 * clean.  Someone wrote them back by hand with
-		 * ll_rw_block/submit_bh.  A rare case.
-		 */
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (uptodate)
-			SetPageUptodate(page);
-		end_page_writeback(page);
-	}
-	return err;
-
-recover:
-	/*
-	 * ENOSPC, or some other error.  We may already have added some
-	 * blocks to the file, so we need to write these out to avoid
-	 * exposing stale data.
-	 * The page is currently locked and not marked for writeback
-	 */
-	bh = head;
-	/* Recovery: lock and submit the mapped buffers */
-	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
-			lock_buffer(bh);
-			mark_buffer_async_write(bh);
-		} else {
-			/*
-			 * The buffer may have been set dirty during
-			 * attachment to a dirty page.
-			 */
-			clear_buffer_dirty(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-	SetPageError(page);
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-	goto done;
-}
-
-/*
- * The generic ->writepage function for buffer-backed address_spaces
- */
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct inode * const inode = page->mapping->host;
-	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset;
-	void *kaddr;
-
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		return __btrfs_write_full_page(inode, page, wbc);
-
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
-		/*
-		 * The page may have dirty, unmapped buffers.  For example,
-		 * they may have been added in ext3_writepage().  Make them
-		 * freeable here, so the page does not leak.
-		 */
-		block_invalidatepage(page, 0);
-		unlock_page(page);
-		return 0; /* don't care */
-	}
-
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invokation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	return __btrfs_write_full_page(inode, page, wbc);
-}
-
-static void btrfs_truncate(struct inode *inode)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-	struct btrfs_trans_handle *trans;
-
-	if (!S_ISREG(inode->i_mode))
-		return;
-	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-		return;
-
-	btrfs_truncate_page(inode->i_mapping, inode->i_size);
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
-	/* FIXME, add redo link to tree so we don't leak on crash */
-	ret = btrfs_truncate_in_trans(trans, root, inode);
-	BUG_ON(ret);
-	btrfs_update_inode(trans, root, inode);
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
-}
-
-static int btrfs_commit_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *bh;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	SetPageUptodate(page);
-	bh = page_buffers(page);
-	set_buffer_uptodate(bh);
-	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-		set_page_dirty(page);
-	}
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
-}
-
-static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
-				struct page **prepared_pages,
-				const char __user * buf)
-{
-	long page_fault = 0;
-	int i;
-	int offset = pos & (PAGE_CACHE_SIZE - 1);
-
-	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
-		size_t count = min_t(size_t,
-				     PAGE_CACHE_SIZE - offset, write_bytes);
-		struct page *page = prepared_pages[i];
-		fault_in_pages_readable(buf, count);
-
-		/* Copy data from userspace to the current page */
-		kmap(page);
-		page_fault = __copy_from_user(page_address(page) + offset,
-					      buf, count);
-		/* Flush processor's dcache for this page */
-		flush_dcache_page(page);
-		kunmap(page);
-		buf += count;
-		write_bytes -= count;
-
-		if (page_fault)
-			break;
-	}
-	return page_fault ? -EFAULT : 0;
-}
-
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
-{
-	size_t i;
-	for (i = 0; i < num_pages; i++) {
-		if (!pages[i])
-			break;
-		unlock_page(pages[i]);
-		mark_page_accessed(pages[i]);
-		page_cache_release(pages[i]);
-	}
-}
-static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct file *file,
-				   struct page **pages,
-				   size_t num_pages,
-				   loff_t pos,
-				   size_t write_bytes)
-{
-	int i;
-	int offset;
-	int err = 0;
-	int ret;
-	int this_write;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct buffer_head *bh;
-	struct btrfs_file_extent_item *ei;
-
-	for (i = 0; i < num_pages; i++) {
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
-		/* FIXME, one block at a time */
-
-		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
-
-		bh = page_buffers(pages[i]);
-		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-			struct btrfs_key key;
-			struct btrfs_path *path;
-			char *ptr;
-			u32 datasize;
-
-			path = btrfs_alloc_path();
-			BUG_ON(!path);
-			key.objectid = inode->i_ino;
-			key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
-			key.flags = 0;
-			btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-			BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
-			datasize = offset +
-				btrfs_file_extent_calc_inline_size(write_bytes);
-			ret = btrfs_insert_empty_item(trans, root, path, &key,
-						      datasize);
-			BUG_ON(ret);
-			ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-			       path->slots[0], struct btrfs_file_extent_item);
-			btrfs_set_file_extent_generation(ei, trans->transid);
-			btrfs_set_file_extent_type(ei,
-						   BTRFS_FILE_EXTENT_INLINE);
-			ptr = btrfs_file_extent_inline_start(ei);
-			btrfs_memcpy(root, path->nodes[0]->b_data,
-				     ptr, bh->b_data, offset + write_bytes);
-			mark_buffer_dirty(path->nodes[0]);
-			btrfs_free_path(path);
-		} else if (buffer_mapped(bh)) {
-			btrfs_csum_file_block(trans, root, inode->i_ino,
-				      pages[i]->index << PAGE_CACHE_SHIFT,
-				      kmap(pages[i]), PAGE_CACHE_SIZE);
-			kunmap(pages[i]);
-		}
-		SetPageChecked(pages[i]);
-		// btrfs_update_inode_block_group(trans, inode);
-		ret = btrfs_end_transaction(trans, root);
-		BUG_ON(ret);
-		mutex_unlock(&root->fs_info->fs_mutex);
+#define BTRFS_SUPER_MAGIC 0x9123682E
 
-		ret = btrfs_commit_write(file, pages[i], offset,
-					 offset + this_write);
-		pos += this_write;
-		if (ret) {
-			err = ret;
-			goto failed;
-		}
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
-	}
-failed:
-	return err;
-}
+static struct super_operations btrfs_super_ops;
 
-static int drop_extents(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct inode *inode,
-			  u64 start, u64 end, u64 *hint_block)
+static void btrfs_put_super (struct super_block * sb)
 {
+	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
-	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
-	int slot;
-	struct btrfs_file_extent_item *extent;
-	u64 extent_end = 0;
-	int keep;
-	struct btrfs_file_extent_item old;
-	struct btrfs_path *path;
-	u64 search_start = start;
-	int bookend;
-	int found_type;
-	int found_extent;
-	int found_inline;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	while(1) {
-		btrfs_release_path(root, path);
-		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
-					       search_start, -1);
-		if (ret < 0)
-			goto out;
-		if (ret > 0) {
-			if (path->slots[0] == 0) {
-				ret = 0;
-				goto out;
-			}
-			path->slots[0]--;
-		}
-		keep = 0;
-		bookend = 0;
-		found_extent = 0;
-		found_inline = 0;
-		extent = NULL;
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		slot = path->slots[0];
-		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
-		if (key.offset >= end || key.objectid != inode->i_ino) {
-			ret = 0;
-			goto out;
-		}
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
-			ret = 0;
-			goto out;
-		}
-		extent = btrfs_item_ptr(leaf, slot,
-					struct btrfs_file_extent_item);
-		found_type = btrfs_file_extent_type(extent);
-		if (found_type == BTRFS_FILE_EXTENT_REG) {
-			extent_end = key.offset +
-				(btrfs_file_extent_num_blocks(extent) <<
-				 inode->i_blkbits);
-			found_extent = 1;
-		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-			found_inline = 1;
-			extent_end = key.offset +
-			     btrfs_file_extent_inline_len(leaf->items + slot);
-		}
-
-		if (!found_extent && !found_inline) {
-			ret = 0;
-			goto out;
-		}
-
-		if (search_start >= extent_end) {
-			ret = 0;
-			goto out;
-		}
-
-		if (found_inline) {
-			u64 mask = root->blocksize - 1;
-			search_start = (extent_end + mask) & ~mask;
-		} else
-			search_start = extent_end;
 
-		if (end < extent_end && end >= key.offset) {
-			if (found_extent) {
-				u64 disk_blocknr =
-					btrfs_file_extent_disk_blocknr(extent);
-				u64 disk_num_blocks =
-				      btrfs_file_extent_disk_num_blocks(extent);
-				memcpy(&old, extent, sizeof(old));
-				if (disk_blocknr != 0) {
-					ret = btrfs_inc_extent_ref(trans, root,
-					         disk_blocknr, disk_num_blocks);
-					BUG_ON(ret);
-				}
-			}
-			WARN_ON(found_inline);
-			bookend = 1;
-		}
-
-		if (start > key.offset) {
-			u64 new_num;
-			u64 old_num;
-			/* truncate existing extent */
-			keep = 1;
-			WARN_ON(start & (root->blocksize - 1));
-			if (found_extent) {
-				new_num = (start - key.offset) >>
-					inode->i_blkbits;
-				old_num = btrfs_file_extent_num_blocks(extent);
-				*hint_block =
-					btrfs_file_extent_disk_blocknr(extent);
-				if (btrfs_file_extent_disk_blocknr(extent)) {
-					inode->i_blocks -=
-						(old_num - new_num) << 3;
-				}
-				btrfs_set_file_extent_num_blocks(extent,
-								 new_num);
-				mark_buffer_dirty(path->nodes[0]);
-			} else {
-				WARN_ON(1);
-			}
-		}
-		if (!keep) {
-			u64 disk_blocknr = 0;
-			u64 disk_num_blocks = 0;
-			u64 extent_num_blocks = 0;
-			if (found_extent) {
-				disk_blocknr =
-				      btrfs_file_extent_disk_blocknr(extent);
-				disk_num_blocks =
-				      btrfs_file_extent_disk_num_blocks(extent);
-				extent_num_blocks =
-				      btrfs_file_extent_num_blocks(extent);
-				*hint_block =
-					btrfs_file_extent_disk_blocknr(extent);
-			}
-			ret = btrfs_del_item(trans, root, path);
-			BUG_ON(ret);
-			btrfs_release_path(root, path);
-			extent = NULL;
-			if (found_extent && disk_blocknr != 0) {
-				inode->i_blocks -= extent_num_blocks << 3;
-				ret = btrfs_free_extent(trans, root,
-							disk_blocknr,
-							disk_num_blocks, 0);
-			}
-
-			BUG_ON(ret);
-			if (!bookend && search_start >= end) {
-				ret = 0;
-				goto out;
-			}
-			if (!bookend)
-				continue;
-		}
-		if (bookend && found_extent) {
-			/* create bookend */
-			struct btrfs_key ins;
-			ins.objectid = inode->i_ino;
-			ins.offset = end;
-			ins.flags = 0;
-			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
-
-			btrfs_release_path(root, path);
-			ret = btrfs_insert_empty_item(trans, root, path, &ins,
-						      sizeof(*extent));
-			BUG_ON(ret);
-			extent = btrfs_item_ptr(
-				    btrfs_buffer_leaf(path->nodes[0]),
-				    path->slots[0],
-				    struct btrfs_file_extent_item);
-			btrfs_set_file_extent_disk_blocknr(extent,
-				    btrfs_file_extent_disk_blocknr(&old));
-			btrfs_set_file_extent_disk_num_blocks(extent,
-				    btrfs_file_extent_disk_num_blocks(&old));
-
-			btrfs_set_file_extent_offset(extent,
-				    btrfs_file_extent_offset(&old) +
-				    ((end - key.offset) >> inode->i_blkbits));
-			WARN_ON(btrfs_file_extent_num_blocks(&old) <
-				(extent_end - end) >> inode->i_blkbits);
-			btrfs_set_file_extent_num_blocks(extent,
-				    (extent_end - end) >> inode->i_blkbits);
-
-			btrfs_set_file_extent_type(extent,
-						   BTRFS_FILE_EXTENT_REG);
-			btrfs_set_file_extent_generation(extent,
-				    btrfs_file_extent_generation(&old));
-			btrfs_mark_buffer_dirty(path->nodes[0]);
-			if (btrfs_file_extent_disk_blocknr(&old) != 0) {
-				inode->i_blocks +=
-				      btrfs_file_extent_num_blocks(extent) << 3;
-			}
-			ret = 0;
-			goto out;
-		}
-	}
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-static int prepare_pages(struct btrfs_root *root,
-			 struct file *file,
-			 struct page **pages,
-			 size_t num_pages,
-			 loff_t pos,
-			 unsigned long first_index,
-			 unsigned long last_index,
-			 size_t write_bytes,
-			 u64 alloc_extent_start)
-{
-	int i;
-	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int offset;
-	int err = 0;
-	int this_write;
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	loff_t isize = i_size_read(inode);
-
-	memset(pages, 0, num_pages * sizeof(struct page *));
-
-	for (i = 0; i < num_pages; i++) {
-		pages[i] = grab_cache_page(inode->i_mapping, index + i);
-		if (!pages[i]) {
-			err = -ENOMEM;
-			goto failed_release;
-		}
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-		wait_on_page_writeback(pages[i]);
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
-		if (!page_has_buffers(pages[i])) {
-			create_empty_buffers(pages[i],
-					     root->fs_info->sb->s_blocksize,
-					     (1 << BH_Uptodate));
-		}
-		head = page_buffers(pages[i]);
-		bh = head;
-		do {
-			err = btrfs_map_bh_to_logical(root, bh,
-						      alloc_extent_start);
-			BUG_ON(err);
-			if (err)
-				goto failed_truncate;
-			bh = bh->b_this_page;
-			if (alloc_extent_start)
-				alloc_extent_start++;
-		} while (bh != head);
-		pos += this_write;
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
+	ret = close_ctree(root);
+	if (ret) {
+		printk("close ctree returns %d\n", ret);
 	}
-	return 0;
-
-failed_release:
-	btrfs_drop_pages(pages, num_pages);
-	return err;
-
-failed_truncate:
-	btrfs_drop_pages(pages, num_pages);
-	if (pos > isize)
-		vmtruncate(inode, isize);
-	return err;
+	sb->s_fs_info = NULL;
 }
 
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
+static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 {
-	loff_t pos;
-	size_t num_written = 0;
-	int err = 0;
-	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pages[8];
-	struct page *pinned[2];
-	unsigned long first_index;
-	unsigned long last_index;
-	u64 start_pos;
-	u64 num_blocks;
-	u64 alloc_extent_start;
-	u64 hint_block;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key ins;
-	pinned[0] = NULL;
-	pinned[1] = NULL;
-	if (file->f_flags & O_DIRECT)
-		return -EINVAL;
-	pos = *ppos;
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-	current->backing_dev_info = inode->i_mapping->backing_dev_info;
-	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-	if (err)
-		goto out;
-	if (count == 0)
-		goto out;
-	err = remove_suid(file->f_path.dentry);
-	if (err)
-		goto out;
-	file_update_time(file);
+	struct inode * inode;
+	struct dentry * root_dentry;
+	struct btrfs_super_block *disk_super;
+	struct btrfs_root *tree_root;
+	struct btrfs_inode *bi;
+	int err;
 
-	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
-	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
-			inode->i_blkbits;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_magic = BTRFS_SUPER_MAGIC;
+	sb->s_op = &btrfs_super_ops;
+	sb->s_time_gran = 1;
 
-	mutex_lock(&inode->i_mutex);
-	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+	tree_root = open_ctree(sb);
 
-	if ((pos & (PAGE_CACHE_SIZE - 1))) {
-		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-		if (!PageUptodate(pinned[0])) {
-			ret = mpage_readpage(pinned[0], btrfs_get_block);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[0]);
-		} else {
-			unlock_page(pinned[0]);
-		}
-	}
-	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
-		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-		if (!PageUptodate(pinned[1])) {
-			ret = mpage_readpage(pinned[1], btrfs_get_block);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[1]);
-		} else {
-			unlock_page(pinned[1]);
-		}
+	if (!tree_root || IS_ERR(tree_root)) {
+		printk("btrfs: open_ctree failed\n");
+		return -EIO;
 	}
+	sb->s_fs_info = tree_root;
+	disk_super = tree_root->fs_info->disk_super;
+	inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
+				  tree_root);
+	bi = BTRFS_I(inode);
+	bi->location.objectid = inode->i_ino;
+	bi->location.offset = 0;
+	bi->location.flags = 0;
+	bi->root = tree_root;
+	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
+	if (!inode) {
 		err = -ENOMEM;
-		mutex_unlock(&root->fs_info->fs_mutex);
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	/* FIXME blocksize != 4096 */
-	inode->i_blocks += num_blocks << 3;
-	hint_block = 0;
-	if (start_pos < inode->i_size) {
-		/* FIXME blocksize != pagesize */
-		ret = drop_extents(trans, root, inode,
-				   start_pos,
-				   (pos + count + root->blocksize -1) &
-				   ~((u64)root->blocksize - 1), &hint_block);
-		BUG_ON(ret);
-	}
-	if (inode->i_size < start_pos) {
-		u64 last_pos_in_file;
-		u64 hole_size;
-		u64 mask = root->blocksize - 1;
-		last_pos_in_file = (inode->i_size + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
-		if (last_pos_in_file < start_pos) {
-			ret = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       last_pos_in_file,
-						       0, 0, hole_size);
-		}
-		BUG_ON(ret);
-	}
-	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
-	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, hint_block, (u64)-1,
-					 &ins, 1);
-		BUG_ON(ret);
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset,
-				       ins.offset);
-		BUG_ON(ret);
-	} else {
-		ins.offset = 0;
-		ins.objectid = 0;
-	}
-	BUG_ON(ret);
-	alloc_extent_start = ins.objectid;
-	// btrfs_update_inode_block_group(trans, inode);
-	ret = btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	while(count > 0) {
-		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
-		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
-
-		memset(pages, 0, sizeof(pages));
-		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes, alloc_extent_start);
-		BUG_ON(ret);
-
-		/* FIXME blocks != pagesize */
-		if (alloc_extent_start)
-			alloc_extent_start += num_pages;
-		ret = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, buf);
-		BUG_ON(ret);
-
-		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, pos, write_bytes);
-		BUG_ON(ret);
-		btrfs_drop_pages(pages, num_pages);
-
-		buf += write_bytes;
-		count -= write_bytes;
-		pos += write_bytes;
-		num_written += write_bytes;
-
-		balance_dirty_pages_ratelimited(inode->i_mapping);
-		btrfs_btree_balance_dirty(root);
-		cond_resched();
-	}
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
-out:
-	if (pinned[0])
-		page_cache_release(pinned[0]);
-	if (pinned[1])
-		page_cache_release(pinned[1]);
-	*ppos = pos;
-	current->backing_dev_info = NULL;
-	mark_inode_dirty(inode);
-	return num_written ? num_written : err;
-}
-
-static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
-			unsigned long offset, unsigned long size)
-{
-	char *kaddr;
-	unsigned long left, count = desc->count;
-	struct inode *inode = page->mapping->host;
-
-	if (size > count)
-		size = count;
-
-	if (!PageChecked(page)) {
-		/* FIXME, do it per block */
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-		int ret;
-		struct buffer_head *bh;
-
-		if (page_has_buffers(page)) {
-			bh = page_buffers(page);
-			if (!buffer_mapped(bh)) {
-				SetPageChecked(page);
-				goto checked;
-			}
-		}
-
-		ret = btrfs_csum_verify_file_block(root,
-				  page->mapping->host->i_ino,
-				  page->index << PAGE_CACHE_SHIFT,
-				  kmap(page), PAGE_CACHE_SIZE);
-		if (ret) {
-			if (ret != -ENOENT) {
-				printk("failed to verify ino %lu page %lu ret %d\n",
-				       page->mapping->host->i_ino,
-				       page->index, ret);
-				memset(page_address(page), 1, PAGE_CACHE_SIZE);
-				flush_dcache_page(page);
-			}
-		}
-		SetPageChecked(page);
-		kunmap(page);
-	}
-checked:
-	/*
-	 * Faults on the destination of a read are common, so do it before
-	 * taking the kmap.
-	 */
-	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		left = __copy_to_user_inatomic(desc->arg.buf,
-						kaddr + offset, size);
-		kunmap_atomic(kaddr, KM_USER0);
-		if (left == 0)
-			goto success;
-	}
-
-	/* Do it the slow way */
-	kaddr = kmap(page);
-	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
-	kunmap(page);
-
-	if (left) {
-		size -= left;
-		desc->error = -EFAULT;
+		goto fail_close;
 	}
-success:
-	desc->count = count - size;
-	desc->written += size;
-	desc->arg.buf += size;
-	return size;
-}
-
-/**
- * btrfs_file_aio_read - filesystem read routine
- * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
- * @pos:	current file position
- */
-static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-				   unsigned long nr_segs, loff_t pos)
-{
-	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg;
-	size_t count;
-	loff_t *ppos = &iocb->ki_pos;
-
-	count = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		count += iv->iov_len;
-		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		count -= iv->iov_len;	/* This segment is no good */
-		break;
+	if (inode->i_state & I_NEW) {
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
 	}
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp, ppos, &desc,
-					     btrfs_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-		}
+	root_dentry = d_alloc_root(inode);
+	if (!root_dentry) {
+		iput(inode);
+		err = -ENOMEM;
+		goto fail_close;
 	}
-	return retval;
-}
-
-static int create_subvol(struct btrfs_root *root, char *name, int namelen)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key key;
-	struct btrfs_root_item root_item;
-	struct btrfs_inode_item *inode_item;
-	struct buffer_head *subvol;
-	struct btrfs_leaf *leaf;
-	struct btrfs_root *new_root;
-	struct inode *inode;
-	struct inode *dir;
-	int ret;
-	u64 objectid;
-	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	subvol = btrfs_alloc_free_block(trans, root, 0);
-	if (subvol == NULL)
-		return -ENOSPC;
-	leaf = btrfs_buffer_leaf(subvol);
-	btrfs_set_header_nritems(&leaf->header, 0);
-	btrfs_set_header_level(&leaf->header, 0);
-	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
-	btrfs_set_header_generation(&leaf->header, trans->transid);
-	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
-	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(leaf->header.fsid));
-	mark_buffer_dirty(subvol);
-
-	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	btrfs_set_inode_generation(inode_item, 1);
-	btrfs_set_inode_size(inode_item, 3);
-	btrfs_set_inode_nlink(inode_item, 1);
-	btrfs_set_inode_nblocks(inode_item, 1);
-	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
-
-	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
-	btrfs_set_root_refs(&root_item, 1);
-	brelse(subvol);
-	subvol = NULL;
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	BUG_ON(ret);
-
-	btrfs_set_root_dirid(&root_item, new_dirid);
-
-	key.objectid = objectid;
-	key.offset = 1;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				&root_item);
-	BUG_ON(ret);
-
-	/*
-	 * insert the directory item
-	 */
-	key.offset = (u64)-1;
-	dir = root->fs_info->sb->s_root->d_inode;
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen, dir->i_ino, &key,
-				    BTRFS_FT_DIR);
-	BUG_ON(ret);
-
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
-
-	new_root = btrfs_read_fs_root(root->fs_info, &key);
-	BUG_ON(!new_root);
-
-	trans = btrfs_start_transaction(new_root, 1);
-	BUG_ON(!trans);
-
-	inode = btrfs_new_inode(trans, new_root, new_dirid,
-				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
-	inode->i_op = &btrfs_dir_inode_operations;
-	inode->i_fop = &btrfs_dir_file_operations;
-
-	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
-	BUG_ON(ret);
-
-	inode->i_nlink = 1;
-	inode->i_size = 6;
-	ret = btrfs_update_inode(trans, new_root, inode);
-	BUG_ON(ret);
-
-	ret = btrfs_commit_transaction(trans, new_root);
-	BUG_ON(ret);
-
-	iput(inode);
-
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	sb->s_root = root_dentry;
+	btrfs_transaction_queue_work(tree_root, HZ * 30);
 	return 0;
+
+fail_close:
+	close_ctree(tree_root);
+	return err;
 }
 
-static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+static int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_key key;
-	struct btrfs_root_item new_root_item;
+	struct btrfs_root *root;
 	int ret;
-	u64 objectid;
-
-	if (!root->ref_cows)
-		return -EINVAL;
+	root = btrfs_sb(sb);
 
+	sb->s_dirt = 0;
+	if (!wait) {
+		filemap_flush(root->fs_info->btree_inode->i_mapping);
+		return 0;
+	}
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_update_inode(trans, root, root->inode);
-	BUG_ON(ret);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	BUG_ON(ret);
-
-	memcpy(&new_root_item, &root->root_item,
-	       sizeof(new_root_item));
-
-	key.objectid = objectid;
-	key.offset = 1;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
-
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				&new_root_item);
-	BUG_ON(ret);
-
-	/*
-	 * insert the directory item
-	 */
-	key.offset = (u64)-1;
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen,
-				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, BTRFS_FT_DIR);
-
-	BUG_ON(ret);
-
-	ret = btrfs_inc_root_ref(trans, root);
-	BUG_ON(ret);
-
 	ret = btrfs_commit_transaction(trans, root);
+	sb->s_dirt = 0;
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
-	return 0;
-}
-
-static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
-		       cmd, unsigned long arg)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_ioctl_vol_args vol_args;
-	int ret = 0;
-	struct btrfs_dir_item *di;
-	int namelen;
-	struct btrfs_path *path;
-	u64 root_dirid;
-
-	switch (cmd) {
-	case BTRFS_IOC_SNAP_CREATE:
-		if (copy_from_user(&vol_args,
-				   (struct btrfs_ioctl_vol_args __user *)arg,
-				   sizeof(vol_args)))
-			return -EFAULT;
-		namelen = strlen(vol_args.name);
-		if (namelen > BTRFS_VOL_NAME_MAX)
-			return -EINVAL;
-		path = btrfs_alloc_path();
-		if (!path)
-			return -ENOMEM;
-		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-		mutex_lock(&root->fs_info->fs_mutex);
-		di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
-				    path, root_dirid,
-				    vol_args.name, namelen, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
-		btrfs_free_path(path);
-		if (di && !IS_ERR(di))
-			return -EEXIST;
-
-		if (root == root->fs_info->tree_root)
-			ret = create_subvol(root, vol_args.name, namelen);
-		else
-			ret = create_snapshot(root, vol_args.name, namelen);
-		WARN_ON(ret);
-		break;
-	default:
-		return -ENOTTY;
-	}
-	return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
-			       unsigned long arg)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-	lock_kernel();
-	ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
-
-}
-#endif
-
-static struct kmem_cache *btrfs_inode_cachep;
-struct kmem_cache *btrfs_trans_handle_cachep;
-struct kmem_cache *btrfs_transaction_cachep;
-struct kmem_cache *btrfs_bit_radix_cachep;
-struct kmem_cache *btrfs_path_cachep;
-
-/*
- * Called inside transaction, so use GFP_NOFS
- */
-static struct inode *btrfs_alloc_inode(struct super_block *sb)
-{
-	struct btrfs_inode *ei;
-
-	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
-	if (!ei)
-		return NULL;
-	return &ei->vfs_inode;
-}
-
-static void btrfs_destroy_inode(struct inode *inode)
-{
-	WARN_ON(!list_empty(&inode->i_dentry));
-	WARN_ON(inode->i_data.nrpages);
-
-	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
-}
-
-static void init_once(void * foo, struct kmem_cache * cachep,
-		      unsigned long flags)
-{
-	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
-
-	if ((flags & (SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		inode_init_once(&ei->vfs_inode);
-	}
-}
-
-static int init_inodecache(void)
-{
-	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
-					     sizeof(struct btrfs_inode),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once, NULL);
-	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
-					     sizeof(struct btrfs_trans_handle),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
-					     sizeof(struct btrfs_transaction),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
-	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
-					     sizeof(struct btrfs_transaction),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
-	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
-					     256,
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD |
-						SLAB_DESTROY_BY_RCU),
-					     NULL, NULL);
-	if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL ||
-	    btrfs_transaction_cachep == NULL || btrfs_bit_radix_cachep == NULL)
-		return -ENOMEM;
 	return 0;
 }
 
-static void destroy_inodecache(void)
+static void btrfs_write_super(struct super_block *sb)
 {
-	kmem_cache_destroy(btrfs_inode_cachep);
-	kmem_cache_destroy(btrfs_trans_handle_cachep);
-	kmem_cache_destroy(btrfs_transaction_cachep);
-	kmem_cache_destroy(btrfs_bit_radix_cachep);
-	kmem_cache_destroy(btrfs_path_cachep);
+	sb->s_dirt = 0;
 }
 
 static int btrfs_get_sb(struct file_system_type *fs_type,
@@ -2709,15 +124,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 			   btrfs_fill_super, mnt);
 }
 
-static int btrfs_getattr(struct vfsmount *mnt,
-			 struct dentry *dentry, struct kstat *stat)
-{
-	struct inode *inode = dentry->d_inode;
-	generic_fillattr(inode, stat);
-	stat->blksize = 256 * 1024;
-	return 0;
-}
-
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -2732,197 +138,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(old_dir)->root;
-	struct inode *new_inode = new_dentry->d_inode;
-	struct inode *old_inode = old_dentry->d_inode;
-	struct timespec ctime = CURRENT_TIME;
-	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
-	int ret;
-
-	if (S_ISDIR(old_inode->i_mode) && new_inode &&
-	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
-		return -ENOTEMPTY;
-	}
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, new_dir);
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out_fail;
-	}
-
-	old_dentry->d_inode->i_nlink++;
-	old_dir->i_ctime = old_dir->i_mtime = ctime;
-	new_dir->i_ctime = new_dir->i_mtime = ctime;
-	old_inode->i_ctime = ctime;
-	if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
-		struct btrfs_key *location = &BTRFS_I(new_dir)->location;
-		u64 old_parent_oid;
-		di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino,
-					   "..", 2, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out_fail;
-		}
-		if (!di) {
-			ret = -ENOENT;
-			goto out_fail;
-		}
-		old_parent_oid = btrfs_disk_key_objectid(&di->location);
-		ret = btrfs_del_item(trans, root, path);
-		if (ret) {
-			ret = -EIO;
-			goto out_fail;
-		}
-		btrfs_release_path(root, path);
-
-		di = btrfs_lookup_dir_index_item(trans, root, path,
-						 old_inode->i_ino,
-						 old_parent_oid,
-						 "..", 2, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out_fail;
-		}
-		if (!di) {
-			ret = -ENOENT;
-			goto out_fail;
-		}
-		ret = btrfs_del_item(trans, root, path);
-		if (ret) {
-			ret = -EIO;
-			goto out_fail;
-		}
-		btrfs_release_path(root, path);
-
-		ret = btrfs_insert_dir_item(trans, root, "..", 2,
-					    old_inode->i_ino, location,
-					    BTRFS_FT_DIR);
-		if (ret)
-			goto out_fail;
-	}
-
-
-	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
-	if (ret)
-		goto out_fail;
-
-	if (new_inode) {
-		new_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
-		if (ret)
-			goto out_fail;
-		if (S_ISDIR(new_inode->i_mode))
-			clear_nlink(new_inode);
-		else
-			drop_nlink(new_inode);
-		btrfs_update_inode(trans, root, new_inode);
-	}
-	ret = btrfs_add_link(trans, new_dentry, old_inode);
-	if (ret)
-		goto out_fail;
-
-out_fail:
-	btrfs_free_path(path);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
-static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
-			 const char *symname)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct inode *inode;
-	int err;
-	int drop_inode = 0;
-	u64 objectid;
-	int name_len;
-	int datasize;
-	char *ptr;
-	struct btrfs_file_extent_item *ei;
-
-	name_len = strlen(symname) + 1;
-	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
-		return -ENAMETOOLONG;
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-
-	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-	if (err) {
-		err = -ENOSPC;
-		goto out_unlock;
-	}
-
-	inode = btrfs_new_inode(trans, root, objectid,
-				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out_unlock;
-
-	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode);
-	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_fop = &btrfs_file_operations;
-		inode->i_op = &btrfs_file_inode_operations;
-	}
-	dir->i_sb->s_dirt = 1;
-	btrfs_update_inode_block_group(trans, inode);
-	btrfs_update_inode_block_group(trans, dir);
-	if (drop_inode)
-		goto out_unlock;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	key.objectid = inode->i_ino;
-	key.offset = 0;
-	key.flags = 0;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	datasize = btrfs_file_extent_calc_inline_size(name_len);
-	err = btrfs_insert_empty_item(trans, root, path, &key,
-				      datasize);
-	BUG_ON(err);
-	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-	       path->slots[0], struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(ei, trans->transid);
-	btrfs_set_file_extent_type(ei,
-				   BTRFS_FILE_EXTENT_INLINE);
-	ptr = btrfs_file_extent_inline_start(ei);
-	btrfs_memcpy(root, path->nodes[0]->b_data,
-		     ptr, symname, name_len);
-	mark_buffer_dirty(path->nodes[0]);
-	btrfs_free_path(path);
-	inode->i_op = &btrfs_symlink_inode_operations;
-	inode->i_mapping->a_ops = &btrfs_symlink_aops;
-	inode->i_size = name_len - 1;
-	btrfs_update_inode(trans, root, inode);
-	err = 0;
-
-out_unlock:
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	if (drop_inode) {
-		inode_dec_link_count(inode);
-		iput(inode);
-	}
-	btrfs_btree_balance_dirty(root);
-	return err;
-}
-
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -2944,91 +159,21 @@ static struct super_operations btrfs_super_ops = {
 	.statfs		= btrfs_statfs,
 };
 
-static struct inode_operations btrfs_dir_inode_operations = {
-	.lookup		= btrfs_lookup,
-	.create		= btrfs_create,
-	.unlink		= btrfs_unlink,
-	.link		= btrfs_link,
-	.mkdir		= btrfs_mkdir,
-	.rmdir		= btrfs_rmdir,
-	.rename		= btrfs_rename,
-	.symlink	= btrfs_symlink,
-	.setattr	= btrfs_setattr,
-};
-
-static struct inode_operations btrfs_dir_ro_inode_operations = {
-	.lookup		= btrfs_lookup,
-};
-
-static struct file_operations btrfs_dir_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= generic_read_dir,
-	.readdir	= btrfs_readdir,
-	.ioctl		= btrfs_ioctl,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_compat_ioctl,
-#endif
-};
-
-static struct address_space_operations btrfs_aops = {
-	.readpage	= btrfs_readpage,
-	.writepage	= btrfs_writepage,
-	.sync_page	= block_sync_page,
-	.prepare_write	= btrfs_prepare_write,
-	.commit_write	= btrfs_commit_write,
-	.bmap		= btrfs_bmap,
-};
-
-static struct address_space_operations btrfs_symlink_aops = {
-	.readpage	= btrfs_readpage,
-	.writepage	= btrfs_writepage,
-};
-
-static struct inode_operations btrfs_file_inode_operations = {
-	.truncate	= btrfs_truncate,
-	.getattr	= btrfs_getattr,
-	.setattr	= btrfs_setattr,
-};
-
-static struct file_operations btrfs_file_operations = {
-	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.aio_read       = btrfs_file_aio_read,
-	.write		= btrfs_file_write,
-	.mmap		= generic_file_mmap,
-	.open		= generic_file_open,
-	.ioctl		= btrfs_ioctl,
-	.fsync		= btrfs_sync_file,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_compat_ioctl,
-#endif
-};
-
-static struct inode_operations btrfs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-};
-
 static int __init init_btrfs_fs(void)
 {
 	int err;
-	printk("btrfs loaded!\n");
 	btrfs_init_transaction_sys();
-	err = init_inodecache();
+	err = btrfs_init_cachep();
 	if (err)
 		return err;
 	return register_filesystem(&btrfs_fs_type);
-	destroy_inodecache();
-	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
 {
 	btrfs_exit_transaction_sys();
-	destroy_inodecache();
+	btrfs_destroy_cachep();
 	unregister_filesystem(&btrfs_fs_type);
-	printk("btrfs unloaded\n");
 }
 
 module_init(init_btrfs_fs)
-- 
cgit v1.2.3


From 84f54cfa78c81991e087309a9b379f25f1ffdb10 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 07:43:08 -0400
Subject: Btrfs: 64 bit div fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 43 ++++++++++++++++++++++++-------------------
 fs/btrfs/file-item.c   |  3 ++-
 fs/btrfs/file.c        |  7 ++++---
 fs/btrfs/root-tree.c   |  2 +-
 4 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f509ffa38d0..6949eebc9dc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -65,7 +65,6 @@ static int cache_block_group(struct btrfs_root *root,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-printk("cache block group %Lu\n", block_group->key.objectid);
 	key.objectid = block_group->key.objectid;
 	key.flags = 0;
 	key.offset = 0;
@@ -164,8 +163,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 static u64 leaf_range(struct btrfs_root *root)
 {
 	u64 size = BTRFS_LEAF_DATA_SIZE(root);
-	size = size / (sizeof(struct btrfs_extent_item) +
-		       sizeof(struct btrfs_item));
+	do_div(size, sizeof(struct btrfs_extent_item) +
+		sizeof(struct btrfs_item));
 	return size;
 }
 
@@ -219,6 +218,13 @@ new_group:
 	goto again;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
@@ -255,17 +261,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		if (shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
-			    (shint->key.offset * factor) / 10) {
+			    div_factor(shint->key.offset, factor)) {
 				return shint;
 			}
 		}
 	}
 	if (hint && hint->data == data) {
 		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned < (hint->key.offset * factor) / 10) {
+		if (used + hint->pinned <
+		    div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
-		if (used >= (hint->key.offset * 8) / 10) {
+		if (used >= div_factor(hint->key.offset, 8)) {
 			radix_tree_tag_clear(radix,
 					     hint->key.objectid +
 					     hint->key.offset - 1,
@@ -297,11 +304,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 				cache[i]->key.offset;
 			used = btrfs_block_group_used(&cache[i]->item);
 			if (used + cache[i]->pinned <
-			    (cache[i]->key.offset * factor) / 10) {
+			    div_factor(cache[i]->key.offset, factor)) {
 				found_group = cache[i];
 				goto found;
 			}
-			if (used >= (cache[i]->key.offset * 8) / 10) {
+			if (used >= div_factor(cache[i]->key.offset, 8)) {
 				radix_tree_tag_clear(radix,
 						     cache[i]->key.objectid +
 						     cache[i]->key.offset - 1,
@@ -348,7 +355,6 @@ again:
 		goto again;
 	}
 	if (!found_group) {
-printk("find block group bailing to zero data %d\n", data);
 		ret = radix_tree_gang_lookup(radix,
 					     (void **)&found_group, 0, 1);
 		if (ret == 0) {
@@ -386,7 +392,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
 	if (ret != 0) {
-printk("can't find block %Lu %Lu\n", blocknr, num_blocks);
 		BUG();
 	}
 	BUG_ON(ret != 0);
@@ -601,8 +606,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				}
 			}
 			if (cache->data != data &&
-			    old_val < cache->key.offset / 2) {
-printk("changing block group %Lu from %d to %d\n", cache->key.objectid, cache->data, data);
+			    old_val < (cache->key.offset >> 1)) {
 				cache->data = data;
 				radix_tree_delete(cache->radix,
 						  cache->key.objectid +
@@ -634,9 +638,8 @@ printk("changing block group %Lu from %d to %d\n", cache->key.objectid, cache->d
 						      blocknr + i);
 				}
 			}
-			if (old_val < cache->key.offset / 2 &&
-			    old_val + num >= cache->key.offset / 2) {
-printk("group %Lu now available\n", cache->key.objectid);
+			if (old_val < (cache->key.offset >> 1) &&
+			    old_val + num >= (cache->key.offset >> 1)) {
 				radix_tree_tag_set(cache->radix,
 						   cache->key.objectid +
 						   cache->key.offset - 1,
@@ -1000,10 +1003,10 @@ check_failed:
 			}
 			if (start_found)
 				limit = last_block +
-					block_group->key.offset / 2;
+					(block_group->key.offset >> 1);
 			else
 				limit = search_start +
-					block_group->key.offset / 2;
+					(block_group->key.offset >> 1);
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -1534,9 +1537,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_leaf *leaf;
-	u64 group_size_blocks = BTRFS_BLOCK_GROUP_SIZE / root->blocksize;
+	u64 group_size_blocks;
 	u64 used;
 
+	group_size_blocks = BTRFS_BLOCK_GROUP_SIZE >>
+		root->fs_info->sb->s_blocksize_bits;
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = group_size_blocks;
@@ -1590,7 +1595,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					(void *)cache);
 		BUG_ON(ret);
 		used = btrfs_block_group_used(bi);
-		if (used < (key.offset * 8) / 10) {
+		if (used < div_factor(key.offset, 8)) {
 			radix_tree_tag_set(radix, found_key.objectid +
 					   found_key.offset - 1,
 					   BTRFS_BLOCK_GROUP_AVAIL);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 0782e924dde..fb8c214160c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -239,7 +239,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	if (isize <= key.offset)
 		return 0;
 	new_item_span = isize - key.offset;
-	blocks = (new_item_span + root->blocksize - 1) / root->blocksize;
+	blocks = (new_item_span + root->blocksize - 1) >>
+		root->fs_info->sb->s_blocksize_bits;
 	new_item_size = blocks * BTRFS_CRC32_SIZE;
 	if (new_item_size >= btrfs_item_size(leaf->items + slot))
 		return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7f8e3035d99..0325dc03859 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -82,7 +82,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	for (i = 0; i < num_pages; i++) {
 		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
 		/* FIXME, one block at a time */
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -395,7 +395,7 @@ static int prepare_pages(struct btrfs_root *root,
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
 		wait_on_page_writeback(pages[i]);
 		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
+		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
 		if (!page_has_buffers(pages[i])) {
 			create_empty_buffers(pages[i],
 					     root->fs_info->sb->s_blocksize,
@@ -567,7 +567,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 	while(count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
+		size_t write_bytes = min(count,
+					 (size_t)PAGE_CACHE_SIZE - offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a42943bd917..0564a73bb2e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -102,7 +102,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ret = btrfs_del_item(trans, root, path);
 	} else {
 		btrfs_set_root_refs(ri, refs - 1);
-printk("ref now %u root %Lu %Lu %u\n", refs -1, key->objectid, key->offset, key->flags);
+printk("ref now %u root %llu %Lu %u\n", refs -1, key->objectid, key->offset, key->flags);
 		mark_buffer_dirty(path->nodes[0]);
 	}
 out:
-- 
cgit v1.2.3


From 5af3981c1878b0657b9babd2ef7ec98c2008cf2c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 07:50:13 -0400
Subject: Btrfs: printk fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 10 ++++++----
 fs/btrfs/extent-tree.c |  5 -----
 fs/btrfs/file-item.c   |  5 -----
 fs/btrfs/print-tree.c  | 42 ++++++++++++++++++++++--------------------
 fs/btrfs/root-tree.c   |  2 +-
 5 files changed, 29 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 96bf3ef3a79..87810117254 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -20,8 +20,9 @@ static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
 {
 	struct btrfs_node *node = btrfs_buffer_node(buf);
 	if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) {
-		printk(KERN_CRIT "bh_blocknr(buf) is %Lu, header is %Lu\n",
-		       bh_blocknr(buf), btrfs_header_blocknr(&node->header));
+		printk(KERN_CRIT "bh_blocknr(buf) is %llu, header is %llu\n",
+		       (unsigned long long)bh_blocknr(buf),
+		       (unsigned long long)btrfs_header_blocknr(&node->header));
 		return 1;
 	}
 	return 0;
@@ -157,8 +158,9 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 		return ret;
 	if (verify) {
 		if (memcmp(bh->b_data, result, BTRFS_CRC32_SIZE)) {
-			printk("checksum verify failed on %Lu\n",
-			       bh_blocknr(bh));
+			printk("btrfs: %s checksum verify failed on %llu\n",
+			       root->fs_info->sb->s_id,
+			       (unsigned long long)bh_blocknr(bh));
 			return 1;
 		}
 	} else {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6949eebc9dc..fe02fbfa6d5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -584,8 +584,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	while(total) {
 		cache = btrfs_lookup_block_group(info, blocknr);
 		if (!cache) {
-			printk(KERN_CRIT "blocknr %Lu lookup failed\n",
-			       blocknr);
 			return -1;
 		}
 		block_in_group = blocknr - cache->key.objectid;
@@ -795,9 +793,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret) {
-		printk("failed to find %Lu\n", key.objectid);
-		btrfs_print_tree(extent_root, extent_root->node);
-		printk("failed to find %Lu\n", key.objectid);
 		BUG();
 	}
 	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index fb8c214160c..d5a98827e38 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -28,10 +28,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
-	if (ret) {
-printk("failed to insert %Lu %Lu ret %d\n", objectid, pos, ret);
-btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
-	}
 	BUG_ON(ret);
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
@@ -201,7 +197,6 @@ insert:
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      BTRFS_CRC32_SIZE);
 	if (ret != 0) {
-		printk("at insert for %Lu %u %Lu ret is %d\n", file_key.objectid, file_key.flags, file_key.offset, ret);
 		WARN_ON(1);
 		goto fail;
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 28813411de6..21791f03756 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -15,31 +15,32 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_block_group_item *bi;
 	u32 type;
 
-	printk("leaf %Lu total ptrs %d free space %d\n",
-		btrfs_header_blocknr(&l->header), nr,
+	printk("leaf %llu total ptrs %d free space %d\n",
+		(unsigned long long)btrfs_header_blocknr(&l->header), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = l->items + i;
 		type = btrfs_disk_key_type(&item->key);
-		printk("\titem %d key (%Lu %x %Lu) itemoff %d itemsize %d\n",
+		printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
 			i,
-			btrfs_disk_key_objectid(&item->key),
+			(unsigned long long)btrfs_disk_key_objectid(&item->key),
 			btrfs_disk_key_flags(&item->key),
-			btrfs_disk_key_offset(&item->key),
+			(unsigned long long)btrfs_disk_key_offset(&item->key),
 			btrfs_item_offset(item),
 			btrfs_item_size(item));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-			printk("\t\tinode generation %Lu size %Lu mode %o\n",
-			       btrfs_inode_generation(ii),
-			       btrfs_inode_size(ii),
+			printk("\t\tinode generation %llu size %llu mode %o\n",
+			       (unsigned long long)btrfs_inode_generation(ii),
+			       (unsigned long long)btrfs_inode_size(ii),
 			       btrfs_inode_mode(ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
-			printk("\t\tdir oid %Lu flags %u type %u\n",
-				btrfs_disk_key_objectid(&di->location),
+			printk("\t\tdir oid %llu flags %u type %u\n",
+				(unsigned long long)btrfs_disk_key_objectid(
+							    &di->location),
 				btrfs_dir_flags(di),
 				btrfs_dir_type(di));
 			printk("\t\tname %.*s\n",
@@ -47,8 +48,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printk("\t\troot data blocknr %Lu refs %u\n",
-				btrfs_root_blocknr(ri), btrfs_root_refs(ri));
+			printk("\t\troot data blocknr %llu refs %u\n",
+				(unsigned long long)btrfs_root_blocknr(ri),
+				btrfs_root_refs(ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
@@ -58,8 +60,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-			printk("\t\tblock group used %Lu\n",
-			       btrfs_block_group_used(bi));
+			printk("\t\tblock group used %llu\n",
+			       (unsigned long long)btrfs_block_group_used(bi));
 			break;
 		case BTRFS_STRING_ITEM_KEY:
 			printk("\t\titem data %.*s\n", btrfs_item_size(item),
@@ -83,17 +85,17 @@ void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t)
 		btrfs_print_leaf(root, (struct btrfs_leaf *)c);
 		return;
 	}
-	printk("node %Lu level %d total ptrs %d free spc %u\n",
-	       btrfs_header_blocknr(&c->header),
+	printk("node %llu level %d total ptrs %d free spc %u\n",
+	       (unsigned long long)btrfs_header_blocknr(&c->header),
 	       btrfs_header_level(&c->header), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
-		printk("\tkey %d (%Lu %u %Lu) block %Lu\n",
+		printk("\tkey %d (%llu %u %llu) block %llu\n",
 		       i,
-		       c->ptrs[i].key.objectid,
+		       (unsigned long long)c->ptrs[i].key.objectid,
 		       c->ptrs[i].key.flags,
-		       c->ptrs[i].key.offset,
-		       btrfs_node_blockptr(c, i));
+		       (unsigned long long)c->ptrs[i].key.offset,
+		       (unsigned long long)btrfs_node_blockptr(c, i));
 	}
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *next_buf = read_tree_block(root,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0564a73bb2e..13c30284fd8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -102,7 +102,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ret = btrfs_del_item(trans, root, path);
 	} else {
 		btrfs_set_root_refs(ri, refs - 1);
-printk("ref now %u root %llu %Lu %u\n", refs -1, key->objectid, key->offset, key->flags);
+		WARN_ON(1);
 		mark_buffer_dirty(path->nodes[0]);
 	}
 out:
-- 
cgit v1.2.3


From 8a712645c3fbbf43293af8640af337b7cf1fef01 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 08:21:35 -0400
Subject: Btrfs: no slashes in subvolume names

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 62a3a778d37..1cabc1951b0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1831,6 +1831,8 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		namelen = strlen(vol_args.name);
 		if (namelen > BTRFS_VOL_NAME_MAX)
 			return -EINVAL;
+		if (strchr(vol_args.name, '/'))
+			return -EINVAL;
 		path = btrfs_alloc_path();
 		if (!path)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 6cbd55707802b98843f953d1ae6d8f5bcd9a76c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 09:07:21 -0400
Subject: Btrfs: add GPLv2

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/COPYING       | 356 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/TODO          |   1 -
 fs/btrfs/bit-radix.c   |  18 +++
 fs/btrfs/bit-radix.h   |  18 +++
 fs/btrfs/btrfs_inode.h |  18 +++
 fs/btrfs/ctree.c       |  18 +++
 fs/btrfs/ctree.h       |  18 +++
 fs/btrfs/dir-item.c    |  18 +++
 fs/btrfs/disk-io.c     |  18 +++
 fs/btrfs/disk-io.h     |  18 +++
 fs/btrfs/extent-tree.c |  18 +++
 fs/btrfs/file-item.c   |  18 +++
 fs/btrfs/file.c        |  18 +++
 fs/btrfs/hash.c        |  18 +++
 fs/btrfs/hash.h        |  18 +++
 fs/btrfs/inode-item.c  |  18 +++
 fs/btrfs/inode-map.c   |  18 +++
 fs/btrfs/inode.c       |  18 +++
 fs/btrfs/ioctl.h       |  18 +++
 fs/btrfs/print-tree.c  |  18 +++
 fs/btrfs/print-tree.h  |  18 +++
 fs/btrfs/root-tree.c   |  18 +++
 fs/btrfs/super.c       |  18 +++
 fs/btrfs/sysfs.c       |  18 +++
 fs/btrfs/transaction.c |  18 +++
 fs/btrfs/transaction.h |  18 +++
 26 files changed, 788 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/COPYING

(limited to 'fs')

diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
new file mode 100644
index 00000000000..ca442d313d8
--- /dev/null
+++ b/fs/btrfs/COPYING
@@ -0,0 +1,356 @@
+
+   NOTE! This copyright does *not* cover user programs that use kernel
+ services by normal system calls - this is merely considered normal use
+ of the kernel, and does *not* fall under the heading of "derived work".
+ Also note that the GPL below is copyrighted by the Free Software
+ Foundation, but the instance of code that it refers to (the Linux
+ kernel) is copyrighted by me and others who actually wrote it.
+
+ Also note that the only valid version of the GPL as far as the kernel
+ is concerned is _this_ particular version of the license (ie v2, not
+ v2.2 or v3.x or whatever), unless explicitly otherwise stated.
+
+			Linus Torvalds
+
+----------------------------------------
+
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                       51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
index 4a48a2365d5..d9b6d38c603 100644
--- a/fs/btrfs/TODO
+++ b/fs/btrfs/TODO
@@ -1,4 +1,3 @@
-* fix printk warnings
 * cleanup, add more error checking, get rid of BUG_ONs
 * Fix ENOSPC handling
 * Make allocator smarter
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index 8f9cd427723..a4ddf4de979 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "bit-radix.h"
 
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
index 4e717e30db4..c100f54d5c3 100644
--- a/fs/btrfs/bit-radix.h
+++ b/fs/btrfs/bit-radix.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __BIT_RADIX__
 #define __BIT_RADIX__
 #include <linux/radix-tree.h>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6b50076b02d..783f06b6325 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 20e01f95ce6..4736155af41 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 24410d42e3c..4bd648d68e8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __BTRFS__
 #define __BTRFS__
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index b408a3d20c7..0d179a44ebf 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87810117254..8c68a64c0ce 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 822ccb8aa4a..7b76ccc4875 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __DISKIO__
 #define __DISKIO__
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe02fbfa6d5..975a0eb9241 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d5a98827e38..37dd3801a10 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0325dc03859..566e526b60f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 32de1ea1b64..e5c76903d72 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 /*
  *  Original copy from:
  *  linux/fs/ext3/hash.c
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index e56c4670093..d3be0267058 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __HASH__
 #define __HASH__
 int btrfs_name_hash(const char *name, int len, u64 *hash_result);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index c3b990b661c..449e9dbdf95 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 318e27a6378..58930fc48be 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1cabc1951b0..46f869d6d0f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 497fadd98c3..23bed48c51a 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __IOCTL_
 #define __IOCTL_
 #include <linux/ioctl.h>
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 21791f03756..1fa70568300 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 396041a05cf..205cd03601a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 13c30284fd8..7a2d63c1ee8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 94b3535a2a1..ef8ad18b275 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ed8b7e20cfb..ead9dc8051b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4f3c8ddcb40..30914521cf0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #include <linux/module.h>
 #include <linux/fs.h>
 #include "ctree.h"
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f25b4900db4..5fb1d322aa1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
 #ifndef __TRANSACTION__
 #define __TRANSACTION__
 #include "btrfs_inode.h"
-- 
cgit v1.2.3


From 340887809d92a4dd6c71caa97d658cef32c098c3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 12 Jun 2007 11:36:58 -0400
Subject: Btrfs: i386 fixes from axboe

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/INSTALL       | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/file.c        |  7 +++++--
 fs/btrfs/inode.c       |  3 +--
 fs/btrfs/transaction.c |  1 +
 5 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 fs/btrfs/INSTALL

(limited to 'fs')

diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
new file mode 100644
index 00000000000..e83ff6e8c8d
--- /dev/null
+++ b/fs/btrfs/INSTALL
@@ -0,0 +1,51 @@
+Install Instructions
+
+Btrfs puts snapshots and subvolumes into the root directory of the FS.  This
+directory can only be changed by btrfsctl right now, and normal filesystem
+operations do not work on it.  The default subvolume is called 'default',
+and you can create files and directories in mount_point/default.
+
+Btrfs uses the crypto manager interface in the kernel for file and
+metadata checksums.  You need to compile the kernel with:
+
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_CRC32C=m
+
+cryptomanager and crc32c can be static as well.  Once your kernel is
+set up, typing make in the btrfs module sources will build against the
+running kernel.  When the build is complete:
+
+modprobe crc32c
+modprobe cryptomgr
+insmod btrfs.ko
+
+The Btrfs utility programs require libuuid to build.  This can be found
+in the e2fsprogs sources, and is usually available as libuuid or
+e2fsprogs-devel from various distros.
+
+Building the utilities is just make ; make install.  The programs go
+into /usr/local/bin.  The commands available are:
+
+mkfs.btrfs: create a filesystem
+
+btrfsctl: control program to create snapshots and subvolumes:
+
+	mount /dev/sda2 /mnt
+	btrfsctl -s new_subvol_name /mnt
+	btrfsctl -s snapshot_of_default /mnt/default
+	btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
+	btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
+	ls /mnt
+	default snapshot_of_a_snapshot snapshot_of_new_subvol
+	new_subvol_name snapshot_of_default
+
+	Snapshots and subvolumes cannot be deleted right now, but you can
+	rm -rf all the files and directories inside them.
+
+btrfsck: do a limited check of the FS extent trees.
+
+debug-tree: print all of the FS metadata in text form.  Example:
+
+	debug-tree /dev/sda2 >& big_output_file
+
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c68a64c0ce..b0ebba6f2e4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -106,7 +106,7 @@ struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
 	int err;
 	u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits);
 
-	page = grab_cache_page(mapping, index);
+	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page)
 		return NULL;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 566e526b60f..83836fb3d4e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -112,7 +112,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
 			struct btrfs_key key;
 			struct btrfs_path *path;
-			char *ptr;
+			char *ptr, *kaddr;
 			u32 datasize;
 
 			/* create an inline extent, and copy the data in */
@@ -135,8 +135,11 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			btrfs_set_file_extent_type(ei,
 						   BTRFS_FILE_EXTENT_INLINE);
 			ptr = btrfs_file_extent_inline_start(ei);
+			kaddr = kmap_atomic(bh->b_page, KM_USER0);
 			btrfs_memcpy(root, path->nodes[0]->b_data,
-				     ptr, bh->b_data, offset + write_bytes);
+				     ptr, kaddr + bh_offset(bh),
+				     offset + write_bytes);
+			kunmap_atomic(kaddr, KM_USER0);
 			mark_buffer_dirty(path->nodes[0]);
 			btrfs_free_path(path);
 		} else if (buffer_mapped(bh)) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 46f869d6d0f..d3d5a4cdb6a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1754,6 +1754,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
+	new_root->inode = inode;
 
 	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
 	BUG_ON(ret);
@@ -1766,8 +1767,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_commit_transaction(trans, new_root);
 	BUG_ON(ret);
 
-	iput(inode);
-
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root);
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 30914521cf0..cd4f1d14038 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
-- 
cgit v1.2.3


From f1ace244c8c1e16eaa5c8b3b5339849651e31ede Mon Sep 17 00:00:00 2001
From: Aneesh <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 Jun 2007 16:18:26 -0400
Subject: btrfs: Code cleanup Attaching below is some of the code cleanups that
 i came across while reading the code.

a) alloc_path already calls init_path.
b) Mention that btrfs_inode is the in-memory copy.  Ext4 has ext4_inode_info
as the in-memory copy and ext4_inode as the on-disk copy.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/dir-item.c    |  1 -
 fs/btrfs/extent-tree.c |  4 ----
 fs/btrfs/file-item.c   |  2 --
 fs/btrfs/inode-item.c  |  1 -
 fs/btrfs/inode.c       | 10 +---------
 fs/btrfs/root-tree.c   |  3 ---
 8 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 783f06b6325..6bce46be49f 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+/* in memory btrfs inode */
 struct btrfs_inode {
 	struct btrfs_root *root;
 	struct btrfs_block_group_cache *block_group;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4736155af41..498dea0854c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1647,7 +1647,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (!ret) {
 		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 0d179a44ebf..ff10cf5a842 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -74,7 +74,6 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	data_size = sizeof(*dir_item) + name_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 975a0eb9241..5f4eaba8b85 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -402,7 +402,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -437,7 +436,6 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	key.objectid = blocknr;
 	key.offset = num_blocks;
 	key.flags = 0;
@@ -807,7 +805,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret) {
@@ -1463,7 +1460,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 
 	level = btrfs_header_level(btrfs_buffer_header(snap));
 	orig_level = level;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 37dd3801a10..3c140cc1dd0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -38,7 +38,6 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = pos;
 	file_key.flags = 0;
@@ -274,7 +273,6 @@ int btrfs_csum_verify_file_block(struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	file_key.flags = 0;
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 449e9dbdf95..e8b5a9f7c0c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -35,7 +35,6 @@ int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_insert_item(trans, root, &key, inode_item,
 				sizeof(*inode_item));
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3d5a4cdb6a..e639cc9e089 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,7 +79,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
@@ -177,7 +176,6 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, 1);
 	if (ret) {
@@ -214,7 +212,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -289,7 +286,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
@@ -360,7 +356,6 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, -1);
 	BUG_ON(ret);
@@ -687,7 +682,6 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (!di || IS_ERR(di)) {
@@ -874,7 +868,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -1263,7 +1256,6 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	if (create & BTRFS_GET_BLOCK_CREATE) {
 		WARN_ON(1);
 		/* this almost but not quite works */
@@ -1280,7 +1272,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	}
 
 	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       inode->i_ino,
+				       objectid,
 				       iblock << inode->i_blkbits, 0);
 	if (ret < 0) {
 		err = ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7a2d63c1ee8..8e1b9046d5e 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -36,7 +36,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -70,7 +69,6 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
 	if (ret < 0)
 		goto out;
@@ -106,7 +104,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	btrfs_init_path(path);
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-- 
cgit v1.2.3


From 9ebefb180bad4914a31c4e1748ba187a30e1e990 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Jun 2007 13:50:00 -0400
Subject: Btrfs: patch queue: page_mkwrite

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |   2 +
 fs/btrfs/disk-io.c   |   4 +-
 fs/btrfs/file-item.c |   2 +-
 fs/btrfs/file.c      | 176 +++++-------------------------
 fs/btrfs/inode.c     | 303 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 328 insertions(+), 159 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4bd648d68e8..1e83ad720b1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1142,6 +1142,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ebba6f2e4..602b63dc76c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -155,9 +155,9 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 	desc.tfm = tfm;
 	desc.flags = 0;
 	sg_init_one(&sg, data, len);
-	spin_lock(&root->fs_info->hash_lock);
+	spin_lock_irq(&root->fs_info->hash_lock);
 	ret = crypto_hash_digest(&desc, &sg, 1, result);
-	spin_unlock(&root->fs_info->hash_lock);
+	spin_unlock_irq(&root->fs_info->hash_lock);
 	if (ret) {
 		printk("digest failed\n");
 	}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3c140cc1dd0..1068993ab1c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -228,6 +228,7 @@ found:
 			   path->nodes[0]->b_data,
 			   root->fs_info->sb->s_blocksize);
 	ret = btrfs_csum_data(root, data, len, &item->csum);
+// printk("file %lu offset %llu csum %X\n", objectid, (unsigned long long)offset, *(int *)(&item->csum));
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
@@ -298,4 +299,3 @@ fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
-
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 83836fb3d4e..de8d47b44e1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -103,10 +103,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
 		/* FIXME, one block at a time */
 
-		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
-
 		bh = page_buffers(pages[i]);
 
 		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
@@ -115,6 +111,10 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			char *ptr, *kaddr;
 			u32 datasize;
 
+			mutex_lock(&root->fs_info->fs_mutex);
+			trans = btrfs_start_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+
 			/* create an inline extent, and copy the data in */
 			path = btrfs_alloc_path();
 			BUG_ON(!path);
@@ -135,24 +135,19 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			btrfs_set_file_extent_type(ei,
 						   BTRFS_FILE_EXTENT_INLINE);
 			ptr = btrfs_file_extent_inline_start(ei);
+
 			kaddr = kmap_atomic(bh->b_page, KM_USER0);
 			btrfs_memcpy(root, path->nodes[0]->b_data,
 				     ptr, kaddr + bh_offset(bh),
 				     offset + write_bytes);
 			kunmap_atomic(kaddr, KM_USER0);
+
 			mark_buffer_dirty(path->nodes[0]);
 			btrfs_free_path(path);
-		} else if (buffer_mapped(bh)) {
-			/* csum the file data */
-			btrfs_csum_file_block(trans, root, inode->i_ino,
-				      pages[i]->index << PAGE_CACHE_SHIFT,
-				      kmap(pages[i]), PAGE_CACHE_SIZE);
-			kunmap(pages[i]);
+			ret = btrfs_end_transaction(trans, root);
+			BUG_ON(ret);
+			mutex_unlock(&root->fs_info->fs_mutex);
 		}
-		SetPageChecked(pages[i]);
-		ret = btrfs_end_transaction(trans, root);
-		BUG_ON(ret);
-		mutex_unlock(&root->fs_info->fs_mutex);
 
 		ret = btrfs_commit_write(file, pages[i], offset,
 					 offset + this_write);
@@ -503,7 +498,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if ((pos & (PAGE_CACHE_SIZE - 1))) {
 		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
 		if (!PageUptodate(pinned[0])) {
-			ret = mpage_readpage(pinned[0], btrfs_get_block);
+			ret = btrfs_readpage(NULL, pinned[0]);
 			BUG_ON(ret);
 			wait_on_page_locked(pinned[0]);
 		} else {
@@ -513,7 +508,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
-			ret = mpage_readpage(pinned[1], btrfs_get_block);
+			ret = btrfs_readpage(NULL, pinned[1]);
 			BUG_ON(ret);
 			wait_on_page_locked(pinned[1]);
 		} else {
@@ -633,138 +628,6 @@ out:
 	return num_written ? num_written : err;
 }
 
-/*
- * FIXME, do this by stuffing the csum we want in the info hanging off
- * page->private.  For now, verify file csums on read
- */
-static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
-			unsigned long offset, unsigned long size)
-{
-	char *kaddr;
-	unsigned long left, count = desc->count;
-	struct inode *inode = page->mapping->host;
-
-	if (size > count)
-		size = count;
-
-	if (!PageChecked(page)) {
-		/* FIXME, do it per block */
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-		int ret;
-		struct buffer_head *bh;
-
-		if (page_has_buffers(page)) {
-			bh = page_buffers(page);
-			if (!buffer_mapped(bh)) {
-				SetPageChecked(page);
-				goto checked;
-			}
-		}
-
-		ret = btrfs_csum_verify_file_block(root,
-				  page->mapping->host->i_ino,
-				  page->index << PAGE_CACHE_SHIFT,
-				  kmap(page), PAGE_CACHE_SIZE);
-		if (ret) {
-			if (ret != -ENOENT) {
-				printk("failed to verify ino %lu page %lu ret %d\n",
-				       page->mapping->host->i_ino,
-				       page->index, ret);
-				memset(page_address(page), 1, PAGE_CACHE_SIZE);
-				flush_dcache_page(page);
-			}
-		}
-		SetPageChecked(page);
-		kunmap(page);
-	}
-checked:
-	/*
-	 * Faults on the destination of a read are common, so do it before
-	 * taking the kmap.
-	 */
-	if (!fault_in_pages_writeable(desc->arg.buf, size)) {
-		kaddr = kmap_atomic(page, KM_USER0);
-		left = __copy_to_user_inatomic(desc->arg.buf,
-						kaddr + offset, size);
-		kunmap_atomic(kaddr, KM_USER0);
-		if (left == 0)
-			goto success;
-	}
-
-	/* Do it the slow way */
-	kaddr = kmap(page);
-	left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
-	kunmap(page);
-
-	if (left) {
-		size -= left;
-		desc->error = -EFAULT;
-	}
-success:
-	desc->count = count - size;
-	desc->written += size;
-	desc->arg.buf += size;
-	return size;
-}
-
-/**
- * btrfs_file_aio_read - filesystem read routine, with a mod to csum verify
- * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
- * @pos:	current file position
- */
-static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-				   unsigned long nr_segs, loff_t pos)
-{
-	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg;
-	size_t count;
-	loff_t *ppos = &iocb->ki_pos;
-
-	count = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		count += iv->iov_len;
-		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		count -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
-
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp, ppos, &desc,
-					     btrfs_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-		}
-	}
-	return retval;
-}
-
 static int btrfs_sync_file(struct file *file,
 			   struct dentry *dentry, int datasync)
 {
@@ -789,12 +652,25 @@ out:
 	return ret > 0 ? EIO : ret;
 }
 
+static struct vm_operations_struct btrfs_file_vm_ops = {
+	.nopage		= filemap_nopage,
+	.populate	= filemap_populate,
+	.page_mkwrite	= btrfs_page_mkwrite,
+};
+
+static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &btrfs_file_vm_ops;
+	file_accessed(filp);
+	return 0;
+}
+
 struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
-	.aio_read       = btrfs_file_aio_read,
+	.aio_read       = generic_file_aio_read,
 	.write		= btrfs_file_write,
-	.mmap		= generic_file_mmap,
+	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
 	.ioctl		= btrfs_ioctl,
 	.fsync		= btrfs_sync_file,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e639cc9e089..def33ac90d7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/bit_spinlock.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -561,7 +562,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 		goto out;
 
 	if (!PageUptodate(page)) {
-		ret = mpage_readpage(page, btrfs_get_block);
+		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
 		if (!PageUptodate(page)) {
 			ret = -EIO;
@@ -1257,8 +1258,10 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	if (create & BTRFS_GET_BLOCK_CREATE) {
-		WARN_ON(1);
-		/* this almost but not quite works */
+		/*
+		 * danger!, this only works if the page is properly up
+		 * to date somehow
+		 */
 		trans = btrfs_start_transaction(root, 1);
 		if (!trans) {
 			err = -ENOMEM;
@@ -1353,7 +1356,6 @@ not_found:
 					       ins.objectid, ins.offset,
 					       ins.offset);
 		BUG_ON(ret);
-		SetPageChecked(result->b_page);
 		btrfs_map_bh_to_logical(root, result, ins.objectid);
 	}
 out:
@@ -1374,6 +1376,40 @@ int btrfs_get_block(struct inode *inode, sector_t iblock,
 	return err;
 }
 
+int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
+		    struct buffer_head *result, int create)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *page = result->b_page;
+	u64 offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(result);
+	struct btrfs_csum_item *item;
+	struct btrfs_path *path = NULL;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_get_block_lock(inode, iblock, result, create);
+	if (ret)
+		goto out;
+
+	path = btrfs_alloc_path();
+	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0);
+	if (IS_ERR(item)) {
+		ret = PTR_ERR(item);
+		/* a csum that isn't present is a preallocated region. */
+		if (ret == -ENOENT || ret == -EFBIG)
+			ret = 0;
+		result->b_private = 0;
+		goto out;
+	}
+	memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
+printk("get_block_sum file %lu offset %llu csum %X\n", inode->i_ino, (unsigned long long)offset, *(int *)(&item->csum));
+out:
+	if (path)
+		btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -1395,9 +1431,198 @@ static int btrfs_prepare_write(struct file *file, struct page *page,
 	return block_prepare_write(page, from, to, btrfs_get_block);
 }
 
-static int btrfs_readpage(struct file *file, struct page *page)
+static void buffer_io_error(struct buffer_head *bh)
+{
+	char b[BDEVNAME_SIZE];
+
+	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+			bdevname(bh->b_bdev, b),
+			(unsigned long long)bh->b_blocknr);
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+	unsigned long flags;
+	struct buffer_head *first;
+	struct buffer_head *tmp;
+	struct page *page;
+	int page_uptodate = 1;
+	struct inode *inode;
+	int ret;
+
+	BUG_ON(!buffer_async_read(bh));
+
+	page = bh->b_page;
+	inode = page->mapping->host;
+	if (uptodate) {
+		void *kaddr;
+		struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+		if (bh->b_private) {
+			char csum[BTRFS_CRC32_SIZE];
+			kaddr = kmap_atomic(page, KM_IRQ0);
+			ret = btrfs_csum_data(root, kaddr + bh_offset(bh),
+					      bh->b_size, csum);
+			BUG_ON(ret);
+			if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) {
+				u64 offset;
+				offset = (page->index << PAGE_CACHE_SHIFT) +
+					bh_offset(bh);
+				printk("btrfs csum failed ino %lu off %llu\n",
+				       page->mapping->host->i_ino,
+				       (unsigned long long)offset);
+				memset(kaddr + bh_offset(bh), 1, bh->b_size);
+				flush_dcache_page(page);
+printk("bad verify file %lu offset %llu bh_private %lX csum %X\n", inode->i_ino, (unsigned long long)offset, (unsigned long)(bh->b_private), *(int *)csum);
+			}
+			kunmap_atomic(kaddr, KM_IRQ0);
+		}
+		set_buffer_uptodate(bh);
+	} else {
+		clear_buffer_uptodate(bh);
+		if (printk_ratelimit())
+			buffer_io_error(bh);
+		SetPageError(page);
+	}
+
+	/*
+	 * Be _very_ careful from here on. Bad things can happen if
+	 * two buffer heads end IO at almost the same time and both
+	 * decide that the page is now completely done.
+	 */
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	clear_buffer_async_read(bh);
+	unlock_buffer(bh);
+	tmp = bh;
+	do {
+		if (!buffer_uptodate(tmp))
+			page_uptodate = 0;
+		if (buffer_async_read(tmp)) {
+			BUG_ON(!buffer_locked(tmp));
+			goto still_busy;
+		}
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
+
+	/*
+	 * If none of the buffers had errors and they are all
+	 * uptodate then we can set the page uptodate.
+	 */
+	if (page_uptodate && !PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	return;
+
+still_busy:
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
+	return;
+}
+
+/*
+ * Generic "read page" function for block devices that have the normal
+ * get_block functionality. This is most of the block device filesystems.
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ */
+int btrfs_readpage(struct file *file, struct page *page)
 {
-	return mpage_readpage(page, btrfs_get_block);
+	struct inode *inode = page->mapping->host;
+	sector_t iblock, lblock;
+	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	unsigned int blocksize;
+	int nr, i;
+	int fully_mapped = 1;
+
+	BUG_ON(!PageLocked(page));
+	blocksize = 1 << inode->i_blkbits;
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, blocksize, 0);
+	head = page_buffers(page);
+
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	bh = head;
+	nr = 0;
+	i = 0;
+
+	do {
+		if (buffer_uptodate(bh))
+			continue;
+
+		if (!buffer_mapped(bh)) {
+			int err = 0;
+
+			fully_mapped = 0;
+			if (iblock < lblock) {
+				WARN_ON(bh->b_size != blocksize);
+				err = btrfs_get_block_csum(inode, iblock,
+							   bh, 0);
+				if (err)
+					SetPageError(page);
+			}
+			if (!buffer_mapped(bh)) {
+				void *kaddr = kmap_atomic(page, KM_USER0);
+				memset(kaddr + i * blocksize, 0, blocksize);
+				flush_dcache_page(page);
+				kunmap_atomic(kaddr, KM_USER0);
+				if (!err)
+					set_buffer_uptodate(bh);
+				continue;
+			}
+			/*
+			 * get_block() might have updated the buffer
+			 * synchronously
+			 */
+			if (buffer_uptodate(bh))
+				continue;
+		}
+		arr[nr++] = bh;
+	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+	if (fully_mapped)
+		SetPageMappedToDisk(page);
+
+	if (!nr) {
+		/*
+		 * All buffers are uptodate - we can set the page uptodate
+		 * as well. But not if get_block() returned an error.
+		 */
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Stage two: lock the buffers */
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		lock_buffer(bh);
+		bh->b_end_io = btrfs_end_buffer_async_read;
+		set_buffer_async_read(bh);
+	}
+
+	/*
+	 * Stage 3: start the IO.  Check for uptodateness
+	 * inside the buffer lock in case another process reading
+	 * the underlying blockdev brought it uptodate (the sct fix).
+	 */
+	for (i = 0; i < nr; i++) {
+		bh = arr[i];
+		if (buffer_uptodate(bh))
+			btrfs_end_buffer_async_read(bh, 1);
+		else
+			submit_bh(READ, bh);
+	}
+	return 0;
 }
 
 /*
@@ -1424,6 +1649,7 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	BUG_ON(!PageLocked(page));
 
@@ -1496,6 +1722,24 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
 			continue;
 		}
 		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
+			struct btrfs_trans_handle *trans;
+			int ret;
+			u64 off = page->index << PAGE_CACHE_SHIFT;
+			char *kaddr;
+
+			off += bh_offset(bh);
+			mutex_lock(&root->fs_info->fs_mutex);
+			trans = btrfs_start_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			kaddr = kmap(page);
+			ret = btrfs_csum_file_block(trans, root, inode->i_ino,
+						    off, kaddr + bh_offset(bh),
+						    bh->b_size);
+			kunmap(page);
+			BUG_ON(ret);
+			ret = btrfs_end_transaction(trans, root);
+			BUG_ON(ret);
+			mutex_unlock(&root->fs_info->fs_mutex);
 			mark_buffer_async_write(bh);
 		} else {
 			unlock_buffer(bh);
@@ -1617,6 +1861,53 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return __btrfs_write_full_page(inode, page, wbc);
 }
 
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	unsigned long end;
+	loff_t size;
+	int ret = -EINVAL;
+
+	lock_page(page);
+	wait_on_page_writeback(page);
+printk("btrfs_page_mkwrite %lu %lu\n", page->mapping->host->i_ino, page->index);
+	size = i_size_read(inode);
+	if ((page->mapping != inode->i_mapping) ||
+	    ((page->index << PAGE_CACHE_SHIFT) > size)) {
+		/* page got truncated out from underneath us */
+		goto out_unlock;
+	}
+
+	/* page is wholly or partially inside EOF */
+	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+		end = size & ~PAGE_CACHE_MASK;
+	else
+		end = PAGE_CACHE_SIZE;
+
+	ret = btrfs_prepare_write(NULL, page, 0, end);
+	if (!ret)
+		ret = btrfs_commit_write(NULL, page, 0, end);
+
+out_unlock:
+	unlock_page(page);
+	return ret;
+}
+
 static void btrfs_truncate(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-- 
cgit v1.2.3


From 8c2383c3dd2cb5bb39598ce4fa97154bc591020a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 Jun 2007 09:57:58 -0400
Subject: Rework btrfs_file_write to only allocate while page locks
 are held

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/extent-tree.c |   1 -
 fs/btrfs/file-item.c   |   1 -
 fs/btrfs/file.c        | 244 +++++++++++++++++++++++++++----------------------
 fs/btrfs/inode.c       |  11 ++-
 fs/btrfs/print-tree.c  |  18 ++++
 fs/btrfs/transaction.c |   1 +
 7 files changed, 161 insertions(+), 117 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 602b63dc76c..15cc9ec9230 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -541,6 +541,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	else
 		ret = submit_bh(WRITE, bh);
 	if (ret == -EOPNOTSUPP) {
+		get_bh(bh);
+		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		root->fs_info->do_barriers = 0;
 		ret = submit_bh(WRITE, bh);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5f4eaba8b85..e0738c65bc5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1477,7 +1477,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		btrfs_btree_balance_dirty(root);
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1068993ab1c..cf894f09f6c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -228,7 +228,6 @@ found:
 			   path->nodes[0]->b_data,
 			   root->fs_info->sb->s_blocksize);
 	ret = btrfs_csum_data(root, data, len, &item->csum);
-// printk("file %lu offset %llu csum %X\n", objectid, (unsigned long long)offset, *(int *)(&item->csum));
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index de8d47b44e1..6b455c2b3f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -207,6 +207,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			}
 			path->slots[0]--;
 		}
+next_slot:
 		keep = 0;
 		bookend = 0;
 		found_extent = 0;
@@ -214,39 +215,48 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		extent = NULL;
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
+		ret = 0;
 		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
 		if (key.offset >= end || key.objectid != inode->i_ino) {
-			ret = 0;
 			goto out;
 		}
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
-			ret = 0;
+		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
 			goto out;
 		}
-		extent = btrfs_item_ptr(leaf, slot,
-					struct btrfs_file_extent_item);
-		found_type = btrfs_file_extent_type(extent);
-		if (found_type == BTRFS_FILE_EXTENT_REG) {
-			extent_end = key.offset +
-				(btrfs_file_extent_num_blocks(extent) <<
-				 inode->i_blkbits);
-			found_extent = 1;
-		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-			found_inline = 1;
-			extent_end = key.offset +
-			     btrfs_file_extent_inline_len(leaf->items + slot);
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				extent_end = key.offset +
+					(btrfs_file_extent_num_blocks(extent) <<
+					 inode->i_blkbits);
+				found_extent = 1;
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				found_inline = 1;
+				extent_end = key.offset +
+				     btrfs_file_extent_inline_len(leaf->items +
+								  slot);
+			}
+		} else {
+			extent_end = search_start;
 		}
 
 		/* we found nothing we can drop */
-		if (!found_extent && !found_inline) {
-			ret = 0;
-			goto out;
-		}
-
-		/* we found nothing inside the range */
-		if (search_start >= extent_end) {
-			ret = 0;
-			goto out;
+		if ((!found_extent && !found_inline) ||
+		    search_start >= extent_end) {
+			int nextret;
+			u32 nritems;
+			nritems = btrfs_header_nritems(
+					btrfs_buffer_header(path->nodes[0]));
+			if (slot >= nritems - 1) {
+				nextret = btrfs_next_leaf(root, path);
+				if (nextret)
+					goto out;
+			} else {
+				path->slots[0]++;
+			}
+			goto next_slot;
 		}
 
 		/* FIXME, there's only one inline extent allowed right now */
@@ -272,7 +282,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			WARN_ON(found_inline);
 			bookend = 1;
 		}
-
 		/* truncate existing extent */
 		if (start > key.offset) {
 			u64 new_num;
@@ -337,10 +346,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			ins.offset = end;
 			ins.flags = 0;
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
-
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
+
+			if (ret) {
+				btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
+				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end);
+			}
 			BUG_ON(ret);
 			extent = btrfs_item_ptr(
 				    btrfs_buffer_leaf(path->nodes[0]),
@@ -387,8 +400,7 @@ static int prepare_pages(struct btrfs_root *root,
 			 loff_t pos,
 			 unsigned long first_index,
 			 unsigned long last_index,
-			 size_t write_bytes,
-			 u64 alloc_extent_start)
+			 size_t write_bytes)
 {
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
@@ -399,6 +411,16 @@ static int prepare_pages(struct btrfs_root *root,
 	struct buffer_head *bh;
 	struct buffer_head *head;
 	loff_t isize = i_size_read(inode);
+	struct btrfs_trans_handle *trans;
+	u64 hint_block;
+	u64 num_blocks;
+	u64 alloc_extent_start;
+	u64 start_pos;
+	struct btrfs_key ins;
+
+	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
 
@@ -408,6 +430,72 @@ static int prepare_pages(struct btrfs_root *root,
 			err = -ENOMEM;
 			goto failed_release;
 		}
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		mutex_unlock(&root->fs_info->fs_mutex);
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, inode);
+	/* FIXME blocksize != 4096 */
+	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
+
+	/* FIXME...EIEIO, ENOSPC and more */
+
+	/* step one, delete the existing extents in this range */
+	/* FIXME blocksize != pagesize */
+	if (start_pos < inode->i_size) {
+		err = btrfs_drop_extents(trans, root, inode,
+			 start_pos, (pos + write_bytes + root->blocksize -1) &
+			 ~((u64)root->blocksize - 1), &hint_block);
+		BUG_ON(err);
+	}
+
+	/* insert any holes we need to create */
+	if (inode->i_size < start_pos) {
+		u64 last_pos_in_file;
+		u64 hole_size;
+		u64 mask = root->blocksize - 1;
+		last_pos_in_file = (isize + mask) & ~mask;
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+		if (last_pos_in_file < start_pos) {
+			err = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       last_pos_in_file,
+						       0, 0, hole_size);
+		}
+		BUG_ON(err);
+	}
+
+	/*
+	 * either allocate an extent for the new bytes or setup the key
+	 * to show we are doing inline data in the extent
+	 */
+	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
+	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+		err = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 num_blocks, hint_block, (u64)-1,
+					 &ins, 1);
+		BUG_ON(err);
+		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       start_pos, ins.objectid, ins.offset,
+				       ins.offset);
+		BUG_ON(err);
+	} else {
+		ins.offset = 0;
+		ins.objectid = 0;
+	}
+	BUG_ON(err);
+	alloc_extent_start = ins.objectid;
+	err = btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	for (i = 0; i < num_pages; i++) {
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
 		wait_on_page_writeback(pages[i]);
 		offset = pos & (PAGE_CACHE_SIZE -1);
@@ -444,6 +532,11 @@ failed_truncate:
 	if (pos > isize)
 		vmtruncate(inode, isize);
 	return err;
+
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	goto failed_release;
+
 }
 
 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
@@ -455,16 +548,14 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pages[8];
+	struct page **pages = NULL;
+	int nrptrs;
 	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
-	u64 start_pos;
-	u64 num_blocks;
-	u64 alloc_extent_start;
-	u64 hint_block;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key ins;
+
+	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
+		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	pinned[0] = NULL;
 	pinned[1] = NULL;
 	if (file->f_flags & O_DIRECT)
@@ -482,9 +573,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;
 	file_update_time(file);
 
-	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
-	num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
-			inode->i_blkbits;
+	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
@@ -516,87 +605,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		mutex_unlock(&root->fs_info->fs_mutex);
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	/* FIXME blocksize != 4096 */
-	inode->i_blocks += num_blocks << 3;
-	hint_block = 0;
-
-	/* FIXME...EIEIO, ENOSPC and more */
-
-	/* step one, delete the existing extents in this range */
-	if (start_pos < inode->i_size) {
-		/* FIXME blocksize != pagesize */
-		ret = btrfs_drop_extents(trans, root, inode,
-					 start_pos,
-					 (pos + count + root->blocksize -1) &
-					 ~((u64)root->blocksize - 1),
-					 &hint_block);
-		BUG_ON(ret);
-	}
-
-	/* insert any holes we need to create */
-	if (inode->i_size < start_pos) {
-		u64 last_pos_in_file;
-		u64 hole_size;
-		u64 mask = root->blocksize - 1;
-		last_pos_in_file = (inode->i_size + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
-		if (last_pos_in_file < start_pos) {
-			ret = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       last_pos_in_file,
-						       0, 0, hole_size);
-		}
-		BUG_ON(ret);
-	}
-
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
-	 */
-	if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
-	    pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, hint_block, (u64)-1,
-					 &ins, 1);
-		BUG_ON(ret);
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset,
-				       ins.offset);
-		BUG_ON(ret);
-	} else {
-		ins.offset = 0;
-		ins.objectid = 0;
-	}
-	BUG_ON(ret);
-	alloc_extent_start = ins.objectid;
-	ret = btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	while(count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count,
-					 (size_t)PAGE_CACHE_SIZE - offset);
+		size_t write_bytes = min(count, nrptrs * PAGE_CACHE_SIZE -
+					 offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
 
+		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(pages));
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
-				    write_bytes, alloc_extent_start);
+				    write_bytes);
 		BUG_ON(ret);
 
-		/* FIXME blocks != pagesize */
-		if (alloc_extent_start)
-			alloc_extent_start += num_pages;
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		BUG_ON(ret);
@@ -611,13 +633,13 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		pos += write_bytes;
 		num_written += write_bytes;
 
-		balance_dirty_pages_ratelimited(inode->i_mapping);
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
 		btrfs_btree_balance_dirty(root);
 		cond_resched();
 	}
-out_unlock:
 	mutex_unlock(&inode->i_mutex);
 out:
+	kfree(pages);
 	if (pinned[0])
 		page_cache_release(pinned[0]);
 	if (pinned[1])
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index def33ac90d7..94f1c28c25b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -962,7 +962,6 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -1402,7 +1401,6 @@ int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
 		goto out;
 	}
 	memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
-printk("get_block_sum file %lu offset %llu csum %X\n", inode->i_ino, (unsigned long long)offset, *(int *)(&item->csum));
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -1476,7 +1474,6 @@ static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 				       (unsigned long long)offset);
 				memset(kaddr + bh_offset(bh), 1, bh->b_size);
 				flush_dcache_page(page);
-printk("bad verify file %lu offset %llu bh_private %lX csum %X\n", inode->i_ino, (unsigned long long)offset, (unsigned long)(bh->b_private), *(int *)csum);
 			}
 			kunmap_atomic(kaddr, KM_IRQ0);
 		}
@@ -1655,6 +1652,13 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
 
 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
 
+	/* no csumming allowed when from PF_MEMALLOC */
+	if (current->flags & PF_MEMALLOC) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
+
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, blocksize,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1885,7 +1889,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 
 	lock_page(page);
 	wait_on_page_writeback(page);
-printk("btrfs_page_mkwrite %lu %lu\n", page->mapping->host->i_ino, page->index);
 	size = i_size_read(inode);
 	if ((page->mapping != inode->i_mapping) ||
 	    ((page->index << PAGE_CACHE_SHIFT) > size)) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1fa70568300..e66acd4e274 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -31,6 +31,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
 	struct btrfs_block_group_item *bi;
+	struct btrfs_file_extent_item *fi;
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
@@ -75,6 +76,23 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 			printk("\t\textent data refs %u\n",
 				btrfs_extent_refs(ei));
 			break;
+
+		case BTRFS_EXTENT_DATA_KEY:
+			fi = btrfs_item_ptr(l, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(fi) ==
+			    BTRFS_FILE_EXTENT_INLINE) {
+				printk("\t\tinline extent data size %u\n",
+			           btrfs_file_extent_inline_len(l->items + i));
+				break;
+			}
+			printk("\t\textent data disk block %llu nr %llu\n",
+			       (unsigned long long)btrfs_file_extent_disk_blocknr(fi),
+			       (unsigned long long)btrfs_file_extent_disk_num_blocks(fi));
+			printk("\t\textent data offset %llu nr %llu\n",
+			  (unsigned long long)btrfs_file_extent_offset(fi),
+			  (unsigned long long)btrfs_file_extent_num_blocks(fi));
+			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cd4f1d14038..b5b99a85763 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -279,6 +279,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		BUG_ON(ret);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
+		btrfs_btree_balance_dirty(tree_root);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 85e55b13e4e318672f669747e5e010d1b707b198 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 19 Jun 2007 15:50:51 -0400
Subject: Btrfs: cache the extent tree preallocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e0738c65bc5..47e91184fff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -741,7 +741,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 		BUG_ON(ret);
 	}
 	extent_root->fs_info->extent_tree_insert_nr = 0;
-	extent_root->fs_info->extent_tree_prealloc_nr = 0;
 	return 0;
 }
 
@@ -951,6 +950,18 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_blocks = 1;
 		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
 	}
+	if (fill_prealloc) {
+		u64 first;
+		int nr = info->extent_tree_prealloc_nr;
+		first = info->extent_tree_prealloc[nr - 1];
+		if (info->extent_tree_prealloc_nr >= total_needed &&
+		    first >= search_start) {
+			ins->objectid = info->extent_tree_prealloc[0];
+			ins->offset = 1;
+			return 0;
+		}
+		info->extent_tree_prealloc_nr = 0;
+	}
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_blocks(info->disk_super);
 	if (hint_block) {
@@ -1107,7 +1118,6 @@ check_pending:
 		if (ins->objectid + num_blocks > first &&
 		    ins->objectid <= info->extent_tree_prealloc[0]) {
 			search_start = info->extent_tree_prealloc[0] + 1;
-			WARN_ON(!full_scan);
 			goto new_group;
 		}
 	}
-- 
cgit v1.2.3


From e011599b0f375683499bf1a9954703f4959a8d00 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 19 Jun 2007 16:23:05 -0400
Subject: Btrfs: reada while dropping snapshots

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47e91184fff..b38c3e92f0c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -940,7 +940,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int wrapped = 0;
 	u64 limit;
 
-	path = btrfs_alloc_path();
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
@@ -974,6 +973,8 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 						     data, 1);
 	}
 
+	path = btrfs_alloc_path();
+
 check_failed:
 	if (!block_group->data)
 		search_start = find_search_start(root, &block_group,
@@ -1355,6 +1356,28 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void reada_walk_down(struct btrfs_root *root,
+			    struct btrfs_node *node)
+{
+	int i;
+	u32 nritems;
+	u64 blocknr;
+	int ret;
+	u32 refs;
+
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = 0; i < nritems; i++) {
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = lookup_extent_ref(NULL, root, blocknr, 1, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -1375,6 +1398,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
+
 	/*
 	 * walk down to the last node level and free all the leaves
 	 */
@@ -1382,8 +1406,13 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
+
+		if (*level > 0 && path->slots[*level] == 0)
+			reada_walk_down(root, btrfs_buffer_node(cur));
+
 		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
 			WARN_ON(1);
+
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(btrfs_buffer_header(cur)))
 			break;
-- 
cgit v1.2.3


From 11bd143fc8243cf48c934dc1c4479a5aacf58ce3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 22 Jun 2007 14:16:24 -0400
Subject: Btrfs: Switch to libcrc32c to avoid problems with cryptomgr on
 highmem machines

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  4 ----
 fs/btrfs/disk-io.c | 30 ++++++------------------------
 fs/btrfs/file.c    |  3 ++-
 3 files changed, 8 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e83ad720b1..19a1d998fca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -286,10 +286,7 @@ struct btrfs_block_group_cache {
 	int cached;
 };
 
-struct crypto_hash;
-
 struct btrfs_fs_info {
-	spinlock_t hash_lock;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct radix_tree_root fs_roots_radix;
@@ -314,7 +311,6 @@ struct btrfs_fs_info {
 	struct mutex fs_mutex;
 	struct list_head trans_list;
 	struct list_head dead_roots;
-	struct crypto_hash *hash_tfm;
 	struct delayed_work trans_work;
 	int do_barriers;
 	int closing;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 15cc9ec9230..3712e946f9c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -19,7 +19,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
-#include <linux/crypto.h>
+#include <linux/crc32c.h>
 #include <linux/scatterlist.h>
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
@@ -147,22 +147,12 @@ static int btree_get_block(struct inode *inode, sector_t iblock,
 int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result)
 {
-	struct scatterlist sg;
-	struct crypto_hash *tfm = root->fs_info->hash_tfm;
-	struct hash_desc desc;
-	int ret;
-
-	desc.tfm = tfm;
-	desc.flags = 0;
-	sg_init_one(&sg, data, len);
-	spin_lock_irq(&root->fs_info->hash_lock);
-	ret = crypto_hash_digest(&desc, &sg, 1, result);
-	spin_unlock_irq(&root->fs_info->hash_lock);
-	if (ret) {
-		printk("digest failed\n");
-	}
-	return ret;
+	u32 crc;
+	crc = crc32c(0, data, len);
+	memcpy(result, &crc, BTRFS_CRC32_SIZE);
+	return 0;
 }
+
 static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 			   int verify)
 {
@@ -454,14 +444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	       sizeof(struct btrfs_key));
 	insert_inode_hash(fs_info->btree_inode);
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
-	fs_info->hash_tfm = crypto_alloc_hash("crc32c", 0, CRYPTO_ALG_ASYNC);
-	spin_lock_init(&fs_info->hash_lock);
 
-	if (!fs_info->hash_tfm || IS_ERR(fs_info->hash_tfm)) {
-		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
-		err = -ENOMEM;
-		goto fail_iput;
-	}
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
@@ -613,7 +596,6 @@ int close_ctree(struct btrfs_root *root)
 		btrfs_block_release(fs_info->tree_root,
 				    fs_info->tree_root->node);
 	btrfs_block_release(root, fs_info->sb_buffer);
-	crypto_free_hash(fs_info->hash_tfm);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6b455c2b3f0..23f02935968 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -607,7 +607,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 	while(count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, nrptrs * PAGE_CACHE_SIZE -
+		size_t write_bytes = min(count, nrptrs *
+					(size_t)PAGE_CACHE_SIZE -
 					 offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
-- 
cgit v1.2.3


From 54aa1f4dfdacd60a19c4471220b24e581be6f774 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 22 Jun 2007 14:16:25 -0400
Subject: Btrfs: Audit callers and return codes to make sure -ENOSPC gets up
 the stack

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 106 ++++++++++++++++++++++++-------
 fs/btrfs/ctree.h       |   3 -
 fs/btrfs/dir-item.c    |   6 +-
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 141 ++++++++++++++++++++++++++++++++++--------
 fs/btrfs/file-item.c   |  46 ++------------
 fs/btrfs/file.c        | 131 ++++++++++++++++++++++++---------------
 fs/btrfs/inode.c       | 165 ++++++++++++++++++++++++++++++++-----------------
 fs/btrfs/root-tree.c   |   1 -
 fs/btrfs/super.c       |   3 +-
 fs/btrfs/transaction.c |  21 ++++---
 11 files changed, 410 insertions(+), 215 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 498dea0854c..606a19b5916 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -73,6 +73,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct buffer_head *cow;
 	struct btrfs_node *cow_node;
+	int ret;
 
 	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
 				    trans->transid) {
@@ -80,6 +81,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 0;
 	}
 	cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
 	cow_node = btrfs_buffer_node(cow);
 	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
 		WARN_ON(1);
@@ -87,7 +90,9 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
 	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
-	btrfs_inc_ref(trans, root, buf);
+	ret = btrfs_inc_ref(trans, root, buf);
+	if (ret)
+		return ret;
 	if (buf == root->node) {
 		root->node = cow;
 		get_bh(cow);
@@ -320,6 +325,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	int wret;
 	int pslot;
 	int orig_slot = path->slots[level];
+	int err_on_enospc = 0;
 	u64 orig_ptr;
 
 	if (level == 0)
@@ -363,29 +369,43 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
+	if (btrfs_header_nritems(&mid->header) < 2)
+		err_on_enospc = 1;
+
 	left_buf = read_node_slot(root, parent_buf, pslot - 1);
 	right_buf = read_node_slot(root, parent_buf, pslot + 1);
 
 	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
-		btrfs_cow_block(trans, root, left_buf, parent_buf, pslot - 1,
-				&left_buf);
+		wret = btrfs_cow_block(trans, root, left_buf,
+				       parent_buf, pslot - 1, &left_buf);
+		if (wret) {
+			ret = wret;
+			goto enospc;
+		}
 		left = btrfs_buffer_node(left_buf);
 		orig_slot += btrfs_header_nritems(&left->header);
 		wret = push_node_left(trans, root, left_buf, mid_buf);
 		if (wret < 0)
 			ret = wret;
+		if (btrfs_header_nritems(&mid->header) < 2)
+			err_on_enospc = 1;
 	}
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right_buf) {
-		btrfs_cow_block(trans, root, right_buf, parent_buf, pslot + 1,
-				&right_buf);
+		wret = btrfs_cow_block(trans, root, right_buf,
+				       parent_buf, pslot + 1, &right_buf);
+		if (wret) {
+			ret = wret;
+			goto enospc;
+		}
+
 		right = btrfs_buffer_node(right_buf);
 		wret = push_node_left(trans, root, mid_buf, right_buf);
-		if (wret < 0)
+		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(&right->header) == 0) {
 			u64 blocknr = bh_blocknr(right_buf);
@@ -421,8 +441,10 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		 */
 		BUG_ON(!left_buf);
 		wret = balance_node_right(trans, root, mid_buf, left_buf);
-		if (wret < 0)
+		if (wret < 0) {
 			ret = wret;
+			goto enospc;
+		}
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(&mid->header) == 0) {
@@ -467,7 +489,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	    btrfs_node_blockptr(btrfs_buffer_node(path->nodes[level]),
 				path->slots[level]))
 		BUG();
-
+enospc:
 	if (right_buf)
 		btrfs_block_release(root, right_buf);
 	if (left_buf)
@@ -519,10 +541,15 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
 		} else {
-			btrfs_cow_block(trans, root, left_buf, parent_buf,
-					pslot - 1, &left_buf);
-			left = btrfs_buffer_node(left_buf);
-			wret = push_node_left(trans, root, left_buf, mid_buf);
+			ret = btrfs_cow_block(trans, root, left_buf, parent_buf,
+					      pslot - 1, &left_buf);
+			if (ret)
+				wret = 1;
+			else {
+				left = btrfs_buffer_node(left_buf);
+				wret = push_node_left(trans, root,
+						      left_buf, mid_buf);
+			}
 		}
 		if (wret < 0)
 			ret = wret;
@@ -561,11 +588,16 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
 		} else {
-			btrfs_cow_block(trans, root, right_buf,
-					parent_buf, pslot + 1, &right_buf);
-			right = btrfs_buffer_node(right_buf);
-			wret = balance_node_right(trans, root,
-						  right_buf, mid_buf);
+			ret = btrfs_cow_block(trans, root, right_buf,
+					      parent_buf, pslot + 1,
+					      &right_buf);
+			if (ret)
+				wret = 1;
+			else {
+				right = btrfs_buffer_node(right_buf);
+				wret = balance_node_right(trans, root,
+							  right_buf, mid_buf);
+			}
 		}
 		if (wret < 0)
 			ret = wret;
@@ -631,6 +663,10 @@ again:
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
 					       &cow_buf);
+			if (wret) {
+				btrfs_block_release(root, cow_buf);
+				return wret;
+			}
 			b = cow_buf;
 			c = btrfs_buffer_node(b);
 		}
@@ -737,6 +773,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	src_nritems = btrfs_header_nritems(&src->header);
 	dst_nritems = btrfs_header_nritems(&dst->header);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+
 	if (push_items <= 0) {
 		return 1;
 	}
@@ -827,6 +864,8 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(path->nodes[level-1] != root->node);
 
 	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+	if (IS_ERR(t))
+		return PTR_ERR(t);
 	c = btrfs_buffer_node(t);
 	memset(c, 0, root->blocksize);
 	btrfs_set_header_nritems(&c->header, 1);
@@ -929,10 +968,15 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		    btrfs_header_nritems(&c->header) <
 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 1)
 			return 0;
+		if (ret < 0)
+			return ret;
 	}
 
 	c_nritems = btrfs_header_nritems(&c->header);
 	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+	if (IS_ERR(split_buffer))
+		return PTR_ERR(split_buffer);
+
 	split = btrfs_buffer_node(split_buffer);
 	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
 	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
@@ -1022,6 +1066,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_item *item;
 	u32 left_nritems;
 	u32 right_nritems;
+	int ret;
 
 	slot = path->slots[1];
 	if (!path->nodes[1]) {
@@ -1041,7 +1086,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 	/* cow and double check */
-	btrfs_cow_block(trans, root, right_buf, upper, slot + 1, &right_buf);
+	ret = btrfs_cow_block(trans, root, right_buf, upper,
+			      slot + 1, &right_buf);
+	if (ret) {
+		btrfs_block_release(root, right_buf);
+		return 1;
+	}
 	right = btrfs_buffer_leaf(right_buf);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -1162,7 +1212,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	/* cow and double check */
-	btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
+	ret = btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
+	if (ret) {
+		/* we hit -ENOSPC, but it isn't fatal here */
+		return 1;
+	}
 	left = btrfs_buffer_leaf(t);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
@@ -1309,8 +1363,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
+
 	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
-	BUG_ON(!right_buffer);
+	if (IS_ERR(right_buffer))
+		return PTR_ERR(right_buffer);
+
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
 	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
@@ -1407,7 +1464,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!double_split)
 		return ret;
 	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
-	BUG_ON(!right_buffer);
+	if (IS_ERR(right_buffer))
+		return PTR_ERR(right_buffer);
+
 	right = btrfs_buffer_leaf(right_buffer);
 	memset(&right->header, 0, sizeof(right->header));
 	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
@@ -1655,7 +1714,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			     ptr, data, data_size);
 		btrfs_mark_buffer_dirty(path->nodes[0]);
 	}
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1775,12 +1833,12 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			get_bh(leaf_buf);
 			wret = push_leaf_left(trans, root, path, 1);
-			if (wret < 0)
+			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 			if (path->nodes[0] == leaf_buf &&
 			    btrfs_header_nritems(&leaf->header)) {
 				wret = push_leaf_right(trans, root, path, 1);
-				if (wret < 0)
+				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
 			if (btrfs_header_nritems(&leaf->header) == 0) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 19a1d998fca..77071f27397 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1126,9 +1126,6 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
 			  char *data, size_t len);
-int btrfs_csum_verify_file_block(struct btrfs_root *root,
-				 u64 objectid, u64 offset,
-				 char *data, size_t len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index ff10cf5a842..d5db8bdea3d 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -47,6 +47,8 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 		if (ret)
 			return ERR_PTR(ret);
 	}
+	if (ret < 0)
+		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	item = leaf->items + path->slots[0];
@@ -225,14 +227,13 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	struct btrfs_leaf *leaf;
 	u32 sub_item_len;
 	u32 item_len;
-	int ret;
+	int ret = 0;
 
 	leaf = btrfs_buffer_leaf(path->nodes[0]);
 	sub_item_len = sizeof(*di) + btrfs_dir_name_len(di);
 	item_len = btrfs_item_size(leaf->items + path->slots[0]);
 	if (sub_item_len == btrfs_item_size(leaf->items + path->slots[0])) {
 		ret = btrfs_del_item(trans, root, path);
-		BUG_ON(ret);
 	} else {
 		char *ptr = (char *)di;
 		char *start = btrfs_item_ptr(leaf, path->slots[0], char);
@@ -240,7 +241,6 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			item_len - (ptr + sub_item_len - start));
 		ret = btrfs_truncate_item(trans, root, path,
 					  item_len - sub_item_len);
-		BUG_ON(ret);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3712e946f9c..865a284aa06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -580,7 +580,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_transaction_flush_work(root);
 	mutex_lock(&fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b38c3e92f0c..8025e9f8ef1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -100,6 +100,8 @@ static int cache_block_group(struct btrfs_root *root,
 		if (slot >= btrfs_header_nritems(&leaf->header)) {
 			reada_extent_leaves(root, path, limit);
 			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto err;
 			if (ret == 0) {
 				continue;
 			} else {
@@ -148,6 +150,7 @@ static int cache_block_group(struct btrfs_root *root,
 	}
 
 	block_group->cached = 1;
+err:
 	btrfs_free_path(path);
 	return 0;
 }
@@ -201,7 +204,9 @@ static u64 find_search_start(struct btrfs_root *root,
 		last = max(last, cache->last_prealloc);
 	}
 again:
-	cache_block_group(root, cache);
+	ret = cache_block_group(root, cache);
+	if (ret)
+		goto out;
 	while(1) {
 		ret = find_first_radix_bit(&root->fs_info->extent_map_radix,
 					   gang, last, ARRAY_SIZE(gang));
@@ -398,16 +403,23 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_key ins;
 	u32 refs;
 
-	find_free_extent(trans, root->fs_info->extent_root, 0, 0, (u64)-1, 0,
-			 &ins, 0);
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+			       (u64)-1, 0, &ins, 0);
+	if (ret) {
+		btrfs_free_path(path);
+		return ret;
+	}
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
+	if (ret < 0)
+		return ret;
 	if (ret != 0) {
 		BUG();
 	}
@@ -442,12 +454,14 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 0);
+	if (ret < 0)
+		goto out;
 	if (ret != 0)
 		BUG();
 	l = btrfs_buffer_leaf(path->nodes[0]);
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
 	*refs = btrfs_extent_refs(item);
-	btrfs_release_path(root->fs_info->extent_root, path);
+out:
 	btrfs_free_path(path);
 	return 0;
 }
@@ -469,6 +483,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int i;
 	int leaf;
 	int ret;
+	int faili;
+	int err;
 
 	if (!root->ref_cows)
 		return 0;
@@ -491,14 +507,45 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 			ret = btrfs_inc_extent_ref(trans, root, disk_blocknr,
 				    btrfs_file_extent_disk_num_blocks(fi));
-			BUG_ON(ret);
+			if (ret) {
+				faili = i;
+				goto fail;
+			}
 		} else {
 			blocknr = btrfs_node_blockptr(buf_node, i);
 			ret = btrfs_inc_extent_ref(trans, root, blocknr, 1);
-			BUG_ON(ret);
+			if (ret) {
+				faili = i;
+				goto fail;
+			}
 		}
 	}
 	return 0;
+fail:
+	for (i =0; i < faili; i++) {
+		if (leaf) {
+			u64 disk_blocknr;
+			key = &buf_leaf->items[i].key;
+			if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf_leaf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+			if (disk_blocknr == 0)
+				continue;
+			err = btrfs_free_extent(trans, root, disk_blocknr,
+				    btrfs_file_extent_disk_num_blocks(fi), 0);
+			BUG_ON(err);
+		} else {
+			blocknr = btrfs_node_blockptr(buf_node, i);
+			err = btrfs_free_extent(trans, root, blocknr, 1, 0);
+			BUG_ON(err);
+		}
+	}
+	return ret;
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -512,15 +559,20 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins, 0);
+	ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins, 0);
+	/* FIXME, set bit to recalc cache groups on next mount */
+	if (ret)
+		return ret;
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
+	if (ret < 0)
+		goto fail;
 	BUG_ON(ret);
 	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_block_group_item);
 	memcpy(bi, &cache->item, sizeof(*bi));
 	mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(extent_root, path);
-
+fail:
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 	if (ret)
@@ -543,6 +595,7 @@ static int write_dirty_block_radix(struct btrfs_trans_handle *trans,
 	int werr = 0;
 	int i;
 	struct btrfs_path *path;
+	unsigned long off = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -550,18 +603,28 @@ static int write_dirty_block_radix(struct btrfs_trans_handle *trans,
 
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
-						 0, ARRAY_SIZE(cache),
+						 off, ARRAY_SIZE(cache),
 						 BTRFS_BLOCK_GROUP_DIRTY);
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++) {
-			radix_tree_tag_clear(radix, cache[i]->key.objectid +
-					     cache[i]->key.offset - 1,
-					     BTRFS_BLOCK_GROUP_DIRTY);
 			err = write_one_cache_group(trans, root,
 						    path, cache[i]);
-			if (err)
+			/*
+			 * if we fail to write the cache group, we want
+			 * to keep it marked dirty in hopes that a later
+			 * write will work
+			 */
+			if (err) {
 				werr = err;
+				off = cache[i]->key.objectid +
+					cache[i]->key.offset;
+				continue;
+			}
+
+			radix_tree_tag_clear(radix, cache[i]->key.objectid +
+					     cache[i]->key.offset - 1,
+					     BTRFS_BLOCK_GROUP_DIRTY);
 		}
 	}
 	btrfs_free_path(path);
@@ -801,14 +864,20 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
-	find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
-	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+	ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
 	if (ret) {
-		BUG();
+		btrfs_free_path(path);
+		return ret;
 	}
+
+	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(ret);
 	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_extent_item);
 	BUG_ON(ei->refs == 0);
@@ -827,8 +896,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_set_super_blocks_used(info->disk_super,
 					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, path);
-		if (ret)
-			BUG();
+		if (ret) {
+			return ret;
+		}
 		ret = update_block_group(trans, root, blocknr, num_blocks, 0,
 					 mark_free, 0);
 		BUG_ON(ret);
@@ -1075,7 +1145,6 @@ next:
 		path->slots[0]++;
 		cond_resched();
 	}
-	// FIXME -ENOSPC
 check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
@@ -1246,7 +1315,15 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, hint_block, ins, data);
 	if (ret) {
-		return ret;
+		if (search_start == 0)
+			return ret;
+		search_end = search_start - 1;
+		search_start = 0;
+		hint_block = search_start;
+		ret = find_free_extent(trans, root, num_blocks, search_start,
+				       search_end, hint_block, ins, data);
+		if (ret)
+			return ret;
 	}
 
 	/*
@@ -1271,7 +1348,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				       search_end, hint_block,
 				       &prealloc_key, 0);
 		if (ret) {
-			return ret;
+			if (search_start == 0)
+				return ret;
+			search_end = search_start - 1;
+			search_start = 0;
+			hint_block = search_start;
+			ret = find_free_extent(trans, root, 0, search_start,
+					       search_end, hint_block,
+					       &prealloc_key, 0);
+			if (ret)
+				return ret;
 		}
 	}
 
@@ -1309,11 +1395,14 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
 				 1, hint, (unsigned long)-1, &ins, 0);
 	if (ret) {
-		BUG();
-		return NULL;
+		BUG_ON(ret > 0);
+		return ERR_PTR(ret);
 	}
-	BUG_ON(ret);
 	buf = btrfs_find_create_tree_block(root, ins.objectid);
+	if (!buf) {
+		btrfs_free_extent(trans, root, ins.objectid, 1, 0);
+		return ERR_PTR(-ENOMEM);
+	}
 	set_buffer_uptodate(buf);
 	set_buffer_checked(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cf894f09f6c..68859934ae2 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -45,6 +45,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      sizeof(*item));
+	if (ret < 0)
+		goto out;
 	BUG_ON(ret);
 	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			      struct btrfs_file_extent_item);
@@ -55,10 +57,9 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_generation(item, trans->transid);
 	btrfs_set_file_extent_type(item, BTRFS_FILE_EXTENT_REG);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-
-	btrfs_release_path(root, path);
+out:
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
@@ -213,6 +214,8 @@ insert:
 	csum_offset = 0;
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      BTRFS_CRC32_SIZE);
+	if (ret < 0)
+		goto fail;
 	if (ret != 0) {
 		WARN_ON(1);
 		goto fail;
@@ -261,40 +264,3 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_csum_verify_file_block(struct btrfs_root *root,
-				 u64 objectid, u64 offset,
-				 char *data, size_t len)
-{
-	int ret;
-	struct btrfs_key file_key;
-	struct btrfs_path *path;
-	struct btrfs_csum_item *item;
-	char result[BTRFS_CRC32_SIZE];
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	file_key.flags = 0;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
-	mutex_lock(&root->fs_info->fs_mutex);
-
-	item = btrfs_lookup_csum(NULL, root, path, objectid, offset, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = -ENOENT;
-		goto fail;
-	}
-
-	ret = btrfs_csum_data(root, data, len, result);
-	WARN_ON(ret);
-	if (memcmp(result, &item->csum, BTRFS_CRC32_SIZE))
-		ret = 1;
-fail:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 23f02935968..fef7ba1e707 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -81,6 +81,62 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
+static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
+				u64 offset, ssize_t size,
+				struct buffer_head *bh)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	char *ptr, *kaddr;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_file_extent_item *ei;
+	u32 datasize;
+	int err = 0;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	key.objectid = inode->i_ino;
+	key.offset = offset;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	BUG_ON(size >= PAGE_CACHE_SIZE);
+	datasize = btrfs_file_extent_calc_inline_size(size);
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	if (ret) {
+		err = ret;
+		goto fail;
+	}
+	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
+	       path->slots[0], struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(ei, trans->transid);
+	btrfs_set_file_extent_type(ei,
+				   BTRFS_FILE_EXTENT_INLINE);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	btrfs_memcpy(root, path->nodes[0]->b_data,
+		     ptr, kaddr + bh_offset(bh),
+		     size);
+	kunmap_atomic(kaddr, KM_USER0);
+	mark_buffer_dirty(path->nodes[0]);
+fail:
+	btrfs_free_path(path);
+	ret = btrfs_end_transaction(trans, root);
+	if (ret && !err)
+		err = ret;
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return err;
+}
+
 static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
@@ -96,57 +152,22 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int this_write;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct buffer_head *bh;
-	struct btrfs_file_extent_item *ei;
 
 	for (i = 0; i < num_pages; i++) {
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
-		/* FIXME, one block at a time */
 
+		/* FIXME, one block at a time */
 		bh = page_buffers(pages[i]);
 
 		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-			struct btrfs_key key;
-			struct btrfs_path *path;
-			char *ptr, *kaddr;
-			u32 datasize;
-
-			mutex_lock(&root->fs_info->fs_mutex);
-			trans = btrfs_start_transaction(root, 1);
-			btrfs_set_trans_block_group(trans, inode);
-
-			/* create an inline extent, and copy the data in */
-			path = btrfs_alloc_path();
-			BUG_ON(!path);
-			key.objectid = inode->i_ino;
-			key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
-			key.flags = 0;
-			btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-			BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
-			datasize = offset +
-				btrfs_file_extent_calc_inline_size(write_bytes);
-
-			ret = btrfs_insert_empty_item(trans, root, path, &key,
-						      datasize);
-			BUG_ON(ret);
-			ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-			       path->slots[0], struct btrfs_file_extent_item);
-			btrfs_set_file_extent_generation(ei, trans->transid);
-			btrfs_set_file_extent_type(ei,
-						   BTRFS_FILE_EXTENT_INLINE);
-			ptr = btrfs_file_extent_inline_start(ei);
-
-			kaddr = kmap_atomic(bh->b_page, KM_USER0);
-			btrfs_memcpy(root, path->nodes[0]->b_data,
-				     ptr, kaddr + bh_offset(bh),
-				     offset + write_bytes);
-			kunmap_atomic(kaddr, KM_USER0);
-
-			mark_buffer_dirty(path->nodes[0]);
-			btrfs_free_path(path);
-			ret = btrfs_end_transaction(trans, root);
-			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->fs_mutex);
+			ret = insert_inline_extent(root, inode,
+					pages[i]->index << PAGE_CACHE_SHIFT,
+					offset + this_write, bh);
+			if (ret) {
+				err = ret;
+				goto failed;
+			}
 		}
 
 		ret = btrfs_commit_write(file, pages[i], offset,
@@ -321,6 +342,7 @@ next_slot:
 					btrfs_file_extent_disk_blocknr(extent);
 			}
 			ret = btrfs_del_item(trans, root, path);
+			/* TODO update progress marker and return */
 			BUG_ON(ret);
 			btrfs_release_path(root, path);
 			extent = NULL;
@@ -452,7 +474,8 @@ static int prepare_pages(struct btrfs_root *root,
 		err = btrfs_drop_extents(trans, root, inode,
 			 start_pos, (pos + write_bytes + root->blocksize -1) &
 			 ~((u64)root->blocksize - 1), &hint_block);
-		BUG_ON(err);
+		if (err)
+			goto failed_release;
 	}
 
 	/* insert any holes we need to create */
@@ -469,7 +492,8 @@ static int prepare_pages(struct btrfs_root *root,
 						       last_pos_in_file,
 						       0, 0, hole_size);
 		}
-		BUG_ON(err);
+		if (err)
+			goto failed_release;
 	}
 
 	/*
@@ -481,11 +505,13 @@ static int prepare_pages(struct btrfs_root *root,
 		err = btrfs_alloc_extent(trans, root, inode->i_ino,
 					 num_blocks, hint_block, (u64)-1,
 					 &ins, 1);
-		BUG_ON(err);
+		if (err)
+			goto failed_truncate;
 		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       start_pos, ins.objectid, ins.offset,
 				       ins.offset);
-		BUG_ON(err);
+		if (err)
+			goto failed_truncate;
 	} else {
 		ins.offset = 0;
 		ins.objectid = 0;
@@ -618,16 +644,21 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_drop_pages(pages, num_pages);
+			goto out;
+		}
 
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
-		BUG_ON(ret);
 		btrfs_drop_pages(pages, num_pages);
+		if (ret)
+			goto out;
 
 		buf += write_bytes;
 		count -= write_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 94f1c28c25b..6d031daa777 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -212,7 +212,11 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	struct btrfs_dir_item *di;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
 				    name, name_len, -1);
 	if (IS_ERR(di)) {
@@ -225,7 +229,8 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	}
 	objectid = btrfs_disk_key_objectid(&di->location);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
-	BUG_ON(ret);
+	if (ret)
+		goto err;
 	btrfs_release_path(root, path);
 
 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
@@ -239,7 +244,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		goto err;
 	}
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
-	BUG_ON(ret);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
 err:
@@ -248,7 +252,7 @@ err:
 		dir->i_size -= name_len * 2;
 		btrfs_update_inode(trans, root, dir);
 		drop_nlink(dentry->d_inode);
-		btrfs_update_inode(trans, root, dentry->d_inode);
+		ret = btrfs_update_inode(trans, root, dentry->d_inode);
 		dir->i_sb->s_dirt = 1;
 	}
 	return ret;
@@ -359,9 +363,10 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	ret = btrfs_lookup_inode(trans, root, path,
 				 &BTRFS_I(inode)->location, -1);
-	BUG_ON(ret);
-	ret = btrfs_del_item(trans, root, path);
-	BUG_ON(ret);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (!ret)
+		ret = btrfs_del_item(trans, root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -516,7 +521,8 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		}
 		if (del_item) {
 			ret = btrfs_del_item(trans, root, path);
-			BUG_ON(ret);
+			if (ret)
+				goto error;
 		} else {
 			break;
 		}
@@ -577,19 +583,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 				 page->index << PAGE_CACHE_SHIFT,
 				 (page->index + 1) << PAGE_CACHE_SHIFT,
 				 &alloc_hint);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
 				 alloc_hint, (u64)-1, &ins, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 				       page->index << PAGE_CACHE_SHIFT,
 				       ins.objectid, 1, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 	SetPageChecked(page);
 	kaddr = kmap(page);
 	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
 	flush_dcache_page(page);
-	btrfs_csum_file_block(trans, root, inode->i_ino,
+	ret = btrfs_csum_file_block(trans, root, inode->i_ino,
 			      page->index << PAGE_CACHE_SHIFT,
 			      kaddr, PAGE_CACHE_SIZE);
 	kunmap(page);
@@ -633,9 +642,10 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       pos, 0, 0, hole_size);
-		BUG_ON(err);
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
+		if (err)
+			return err;
 	}
 out:
 	err = inode_setattr(inode, attr);
@@ -657,12 +667,20 @@ void btrfs_delete_inode(struct inode *inode)
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
-	BUG_ON(ret);
-	btrfs_free_inode(trans, root, inode);
+	if (ret)
+		goto no_delete_lock;
+	ret = btrfs_free_inode(trans, root, inode);
+	if (ret)
+		goto no_delete_lock;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root);
 	return;
+
+no_delete_lock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root);
 no_delete:
 	clear_inode(inode);
 }
@@ -946,7 +964,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 }
 
 /*
- * This is somewhat expense, updating the tree every time the
+ * This is somewhat expensive, updating the tree every time the
  * inode changes.  But, it is most likely to find the inode in cache.
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
@@ -1002,8 +1020,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
 	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
-	BUG_ON(ret);
-
+	if (ret)
+		return ERR_PTR(ret);
 	insert_inode_hash(inode);
 	return inode;
 }
@@ -1121,7 +1139,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, dir);
-	btrfs_update_inode(trans, root, inode);
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		drop_inode = 1;
 
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -1349,17 +1369,26 @@ not_found:
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
 					 1, alloc_hint, (u64)-1,
 					 &ins, 1);
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       iblock << inode->i_blkbits,
 					       ins.objectid, ins.offset,
 					       ins.offset);
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
 		btrfs_map_bh_to_logical(root, result, ins.objectid);
 	}
 out:
-	if (trans)
-		err = btrfs_end_transaction(trans, root);
+	if (trans) {
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
 	btrfs_free_path(path);
 	return err;
 }
@@ -1375,8 +1404,8 @@ int btrfs_get_block(struct inode *inode, sector_t iblock,
 	return err;
 }
 
-int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
-		    struct buffer_head *result, int create)
+static int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
+				struct buffer_head *result, int create)
 {
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1397,7 +1426,7 @@ int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
 		/* a csum that isn't present is a preallocated region. */
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
-		result->b_private = 0;
+		result->b_private = NULL;
 		goto out;
 	}
 	memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
@@ -1736,11 +1765,10 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
 			trans = btrfs_start_transaction(root, 1);
 			btrfs_set_trans_block_group(trans, inode);
 			kaddr = kmap(page);
-			ret = btrfs_csum_file_block(trans, root, inode->i_ino,
+			btrfs_csum_file_block(trans, root, inode->i_ino,
 						    off, kaddr + bh_offset(bh),
 						    bh->b_size);
 			kunmap(page);
-			BUG_ON(ret);
 			ret = btrfs_end_transaction(trans, root);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->fs_mutex);
@@ -1930,7 +1958,6 @@ static void btrfs_truncate(struct inode *inode)
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode);
-	BUG_ON(ret);
 	btrfs_update_inode(trans, root, inode);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
@@ -1970,6 +1997,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	struct inode *inode;
 	struct inode *dir;
 	int ret;
+	int err;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 
@@ -1978,8 +2006,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	BUG_ON(!trans);
 
 	subvol = btrfs_alloc_free_block(trans, root, 0);
-	if (subvol == NULL)
-		return -ENOSPC;
+	if (IS_ERR(subvol))
+		return PTR_ERR(subvol);
 	leaf = btrfs_buffer_leaf(subvol);
 	btrfs_set_header_nritems(&leaf->header, 0);
 	btrfs_set_header_level(&leaf->header, 0);
@@ -2005,7 +2033,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	btrfs_set_root_dirid(&root_item, new_dirid);
 
@@ -2015,7 +2044,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	/*
 	 * insert the directory item
@@ -2025,10 +2055,12 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    name, namelen, dir->i_ino, &key,
 				    BTRFS_FT_DIR);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	if (ret)
+		goto fail_commit;
 
 	new_root = btrfs_read_fs_root(root->fs_info, &key);
 	BUG_ON(!new_root);
@@ -2038,24 +2070,29 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	inode = btrfs_new_inode(trans, new_root, new_dirid,
 				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
+	if (IS_ERR(inode))
+		goto fail;
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 	new_root->inode = inode;
 
 	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	inode->i_nlink = 1;
 	inode->i_size = 6;
 	ret = btrfs_update_inode(trans, new_root, inode);
-	BUG_ON(ret);
-
-	ret = btrfs_commit_transaction(trans, new_root);
-	BUG_ON(ret);
-
+	if (ret)
+		goto fail;
+fail:
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+fail_commit:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root);
-	return 0;
+	return ret;
 }
 
 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
@@ -2064,6 +2101,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_key key;
 	struct btrfs_root_item new_root_item;
 	int ret;
+	int err;
 	u64 objectid;
 
 	if (!root->ref_cows)
@@ -2074,11 +2112,13 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	BUG_ON(!trans);
 
 	ret = btrfs_update_inode(trans, root, root->inode);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
@@ -2091,7 +2131,8 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	/*
 	 * insert the directory item
@@ -2102,16 +2143,20 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 				    root->fs_info->sb->s_root->d_inode->i_ino,
 				    &key, BTRFS_FT_DIR);
 
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	ret = btrfs_inc_root_ref(trans, root);
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
-	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+fail:
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root);
-	return 0;
+	return ret;
 }
 
 int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
@@ -2148,12 +2193,13 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		btrfs_free_path(path);
 		if (di && !IS_ERR(di))
 			return -EEXIST;
+		if (IS_ERR(di))
+			return PTR_ERR(di);
 
 		if (root == root->fs_info->tree_root)
 			ret = create_subvol(root, vol_args.name, namelen);
 		else
 			ret = create_snapshot(root, vol_args.name, namelen);
-		WARN_ON(ret);
 		break;
 	default:
 		return -ENOTTY;
@@ -2316,7 +2362,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		old_parent_oid = btrfs_disk_key_objectid(&di->location);
 		ret = btrfs_del_item(trans, root, path);
 		if (ret) {
-			ret = -EIO;
 			goto out_fail;
 		}
 		btrfs_release_path(root, path);
@@ -2335,7 +2380,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		}
 		ret = btrfs_del_item(trans, root, path);
 		if (ret) {
-			ret = -EIO;
 			goto out_fail;
 		}
 		btrfs_release_path(root, path);
@@ -2361,7 +2405,9 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 			clear_nlink(new_inode);
 		else
 			drop_nlink(new_inode);
-		btrfs_update_inode(trans, root, new_inode);
+		ret = btrfs_update_inode(trans, root, new_inode);
+		if (ret)
+			goto out_fail;
 	}
 	ret = btrfs_add_link(trans, new_dentry, old_inode);
 	if (ret)
@@ -2433,7 +2479,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	datasize = btrfs_file_extent_calc_inline_size(name_len);
 	err = btrfs_insert_empty_item(trans, root, path, &key,
 				      datasize);
-	BUG_ON(err);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
 	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
 	       path->slots[0], struct btrfs_file_extent_item);
 	btrfs_set_file_extent_generation(ei, trans->transid);
@@ -2447,13 +2496,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_size = name_len - 1;
-	btrfs_update_inode(trans, root, inode);
-	err = 0;
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		drop_inode = 1;
 
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 8e1b9046d5e..ac0fae7780f 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -90,7 +90,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	int ret;
 	ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
-	BUG_ON(ret);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ef8ad18b275..c11ecf50020 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -125,9 +125,8 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
-	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	return 0;
+	return ret;
 }
 
 static void btrfs_write_super(struct super_block *sb)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b5b99a85763..321f8852755 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -219,7 +219,8 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root;
 	int i;
 	int ret;
-	int err;
+	int err = 0;
+
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 						 ARRAY_SIZE(gang),
@@ -251,11 +252,12 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
-			BUG_ON(err);
+			if (err)
+				break;
 			list_add(&dirty->list, list);
 		}
 	}
-	return 0;
+	return err;
 }
 
 static int drop_dirty_roots(struct btrfs_root *tree_root,
@@ -263,7 +265,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 {
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
-	int ret;
+	int ret = 0;
 	while(!list_empty(list)) {
 		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
@@ -274,14 +276,15 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		BUG_ON(ret);
 
 		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
-		BUG_ON(ret);
+		if (ret)
+			break;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 		btrfs_btree_balance_dirty(tree_root);
 	}
-	return 0;
+	return ret;
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -321,9 +324,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 	finish_wait(&trans->transaction->writer_wait, &wait);
 	WARN_ON(cur_trans != trans->transaction);
-	add_dirty_roots(trans, &root->fs_info->fs_roots_radix, &dirty_fs_roots);
+	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
+			      &dirty_fs_roots);
+	BUG_ON(ret);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
+
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
-- 
cgit v1.2.3


From 5eda7b5e9b0bed864dd18284c7df9b3c8207dad7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 22 Jun 2007 14:16:25 -0400
Subject: Btrfs: Add the ability to find and remove dead roots after a crash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 17 ++++++++++--
 fs/btrfs/disk-io.c     | 44 +++++++++++++++++++----------
 fs/btrfs/disk-io.h     |  3 ++
 fs/btrfs/inode.c       |  2 ++
 fs/btrfs/root-tree.c   | 75 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/transaction.c | 54 +++++++++++++++++++++++++++++++-----
 fs/btrfs/transaction.h |  1 +
 7 files changed, 164 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 77071f27397..fb6fffb71dd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -122,12 +122,12 @@ struct btrfs_super_block {
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 blocknr; /* this block number */
 	__le64 magic;
-	__le32 blocksize;
 	__le64 generation;
 	__le64 root;
 	__le64 total_blocks;
 	__le64 blocks_used;
 	__le64 root_dir_objectid;
+	__le32 blocksize;
 } __attribute__ ((__packed__));
 
 /*
@@ -226,10 +226,12 @@ struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 root_dirid;
 	__le64 blocknr;
-	__le32 flags;
 	__le64 block_limit;
 	__le64 blocks_used;
+	__le32 flags;
 	__le32 refs;
+	struct btrfs_disk_key drop_progress;
+	u8 drop_level;
 } __attribute__ ((__packed__));
 
 #define BTRFS_FILE_EXTENT_REG 0
@@ -800,6 +802,16 @@ static inline void btrfs_set_root_refs(struct btrfs_root_item *item, u32 val)
 	item->refs = cpu_to_le32(val);
 }
 
+static inline u32 btrfs_root_flags(struct btrfs_root_item *item)
+{
+	return le32_to_cpu(item->flags); /* field is stored as __le32 */
+}
+
+static inline void btrfs_set_root_flags(struct btrfs_root_item *item, u32 val)
+{
+	item->flags = cpu_to_le32(val); /* field is stored as __le32 */
+}
+
 static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s)
 {
 	return le64_to_cpu(s->blocknr);
@@ -1076,6 +1088,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_find_dead_roots(struct btrfs_root *root);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 865a284aa06..d7615e1578c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -326,8 +326,8 @@ static int find_and_setup_root(int blocksize,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-				      struct btrfs_key *location)
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
+					       struct btrfs_key *location)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
@@ -336,11 +336,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	u64 highest_inode;
 	int ret = 0;
 
-	root = radix_tree_lookup(&fs_info->fs_roots_radix,
-				 (unsigned long)location->objectid);
-	if (root)
-		return root;
-	root = kmalloc(sizeof(*root), GFP_NOFS);
+	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 	if (location->offset == (u64)-1) {
@@ -383,6 +379,28 @@ out:
 	BUG_ON(!root->node);
 insert:
 	root->ref_cows = 1;
+	ret = btrfs_find_highest_inode(root, &highest_inode);
+	if (ret == 0) {
+		root->highest_inode = highest_inode;
+		root->last_inode_alloc = highest_inode;
+	}
+	return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+				      struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+	int ret;
+
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)location->objectid);
+	if (root)
+		return root;
+
+	root = btrfs_read_fs_root_no_radix(fs_info, location);
+	if (IS_ERR(root))
+		return root;
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
@@ -391,11 +409,6 @@ insert:
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-	ret = btrfs_find_highest_inode(root, &highest_inode);
-	if (ret == 0) {
-		root->highest_inode = highest_inode;
-		root->last_inode_alloc = highest_inode;
-	}
 	return root;
 }
 
@@ -489,6 +502,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
+	ret = btrfs_find_dead_roots(tree_root);
+	if (ret)
+		goto fail_tree_root;
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
@@ -538,7 +554,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
-static int free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
@@ -565,7 +581,7 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 		if (!ret)
 			break;
 		for (i = 0; i < ret; i++)
-			free_fs_root(fs_info, gang[i]);
+			btrfs_free_fs_root(fs_info, gang[i]);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 7b76ccc4875..c4a695ac44f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -65,6 +65,8 @@ int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
+					       struct btrfs_key *location);
 u64 bh_blocknr(struct buffer_head *bh);
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
@@ -75,4 +77,5 @@ int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical);
 int btrfs_releasepage(struct page *page, gfp_t flags);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
+int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d031daa777..9d2a0a3f674 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2028,6 +2028,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
 	btrfs_set_root_refs(&root_item, 1);
+	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+	root_item.drop_level = 0;
 	brelse(subvol);
 	subvol = NULL;
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ac0fae7780f..737e5a38d17 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -18,6 +18,7 @@
 
 #include <linux/module.h>
 #include "ctree.h"
+#include "transaction.h"
 #include "disk-io.h"
 #include "print-tree.h"
 
@@ -32,7 +33,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 
 	search_key.objectid = objectid;
 	search_key.flags = (u32)-1;
-	search_key.offset = (u32)-1;
+	search_key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -50,6 +51,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item),
 		sizeof(*item));
 	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
+printk("find last finds key %Lu %u %Lu slot %d search for obj %Lu\n", key->objectid, key->flags, key->offset, slot, objectid);
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
@@ -93,6 +95,67 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+/*
+ * Scan the items of @root starting at the lowest ROOT_ITEM key; every root
+ * item with a zero ref count is read in (bypassing the fs_roots radix) and
+ * queued on fs_info->dead_roots.  Returns 0 or a negative errno.
+ */
+int btrfs_find_dead_roots(struct btrfs_root *root)
+{
+	struct btrfs_root *dead_root;
+	struct btrfs_root_item *ri;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	int ret;
+	u32 nritems;
+	struct btrfs_leaf *leaf;
+	int slot;
+
+	key.objectid = 0;
+	key.flags = 0;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.offset = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+	while(1) {
+		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		nritems = btrfs_header_nritems(&leaf->header);
+		slot = path->slots[0];
+		if (slot >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				break;
+			continue; /* re-read and bounds-check the new leaf */
+		}
+		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
+			goto next;
+		ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
+		if (btrfs_root_refs(ri) != 0)
+			goto next;
+		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key);
+		if (IS_ERR(dead_root)) {
+			ret = PTR_ERR(dead_root);
+			goto err;
+		}
+printk("found dead root %Lu %u %Lu\n", key.objectid, key.flags, key.offset);
+		ret = btrfs_add_dead_root(dead_root,
+					  &root->fs_info->dead_roots);
+		if (ret)
+			goto err; /* NOTE(review): dead_root is not freed here */
+next:
+		path->slots[0]++;
+	}
+	ret = 0; /* any non-zero btrfs_next_leaf() result ends the walk as success */
+err:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key)
 {
@@ -111,14 +174,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			    path->slots[0], struct btrfs_root_item);
 
 	refs = btrfs_root_refs(ri);
-	BUG_ON(refs == 0);
-	if (refs == 1) {
-		ret = btrfs_del_item(trans, root, path);
-	} else {
-		btrfs_set_root_refs(ri, refs - 1);
-		WARN_ON(1);
-		mark_buffer_dirty(path->nodes[0]);
-	}
+	BUG_ON(refs != 0);
+	ret = btrfs_del_item(trans, root, path);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 321f8852755..85a2a5e2714 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -85,11 +85,15 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 
 	if (root != root->fs_info->tree_root && root->last_trans <
 	    running_trans_id) {
-		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-				   (unsigned long)root->root_key.objectid,
-				   BTRFS_ROOT_TRANS_TAG);
-		root->commit_root = root->node;
-		get_bh(root->node);
+		if (root->root_item.refs != 0) {
+			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+					   (unsigned long)root->root_key.objectid,
+					   BTRFS_ROOT_TRANS_TAG);
+			root->commit_root = root->node;
+			get_bh(root->node);
+		} else {
+			WARN_ON(1);
+		}
 	}
 	root->last_trans = running_trans_id;
 	h->transid = running_trans_id;
@@ -208,8 +212,24 @@ struct dirty_root {
 	struct btrfs_key snap_key;
 	struct buffer_head *commit_root;
 	struct btrfs_root *root;
+	int free_on_drop;
 };
 
+/* Queue @root on @dead_list as a dirty_root entry; returns 0 or -ENOMEM. */
+int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
+{
+	struct dirty_root *entry = kmalloc(sizeof(*entry), GFP_NOFS);
+
+	if (entry == NULL)
+		return -ENOMEM;
+	entry->root = root;
+	entry->commit_root = root->node;
+	entry->free_on_drop = 1; /* drop_dirty_roots() then kfree()s root */
+	memcpy(&entry->snap_key, &root->root_key, sizeof(root->root_key));
+	list_add(&entry->list, dead_list);
+	return 0;
+}
+
 static int add_dirty_roots(struct btrfs_trans_handle *trans,
 			   struct radix_tree_root *radix,
 			   struct list_head *list)
@@ -217,9 +237,11 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 	struct dirty_root *dirty;
 	struct btrfs_root *gang[8];
 	struct btrfs_root *root;
+	struct btrfs_root_item tmp_item;
 	int i;
 	int ret;
 	int err = 0;
+	u32 refs;
 
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
@@ -246,6 +268,9 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 			dirty->commit_root = root->commit_root;
 			root->commit_root = NULL;
 			dirty->root = root;
+			dirty->free_on_drop = 0;
+			memcpy(&tmp_item, &root->root_item, sizeof(tmp_item));
+
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
 					       bh_blocknr(root->node));
@@ -254,7 +279,18 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 						&root->root_item);
 			if (err)
 				break;
-			list_add(&dirty->list, list);
+
+			refs = btrfs_root_refs(&tmp_item);
+			btrfs_set_root_refs(&tmp_item, refs - 1);
+			err = btrfs_update_root(trans, root->fs_info->tree_root,
+						&dirty->snap_key,
+						&tmp_item);
+
+			BUG_ON(err);
+			if (refs == 1)
+				list_add(&dirty->list, list);
+			else
+				kfree(dirty);
 		}
 	}
 	return err;
@@ -270,16 +306,20 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
+
 		trans = btrfs_start_transaction(tree_root, 1);
+printk("deleting root %Lu %u %Lu\n", dirty->snap_key.objectid, dirty->snap_key.flags, dirty->snap_key.offset);
 		ret = btrfs_drop_snapshot(trans, dirty->root,
 					  dirty->commit_root);
 		BUG_ON(ret);
-
 		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
 		if (ret)
 			break;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
+
+		if (dirty->free_on_drop)
+			kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 		btrfs_btree_balance_dirty(tree_root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5fb1d322aa1..ebf44f3e111 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -69,5 +69,6 @@ void btrfs_transaction_flush_work(struct btrfs_root *root);
 void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
+int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
 
 #endif
-- 
cgit v1.2.3


From 22bb92f376b38154dc0c3688a01a16525b4caab0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 22 Jun 2007 14:49:31 -0400
Subject: Btrfs: Documentation update

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/INSTALL       | 17 +++++++----------
 fs/btrfs/root-tree.c   |  2 --
 fs/btrfs/transaction.c |  1 -
 3 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
index e83ff6e8c8d..16b45a56878 100644
--- a/fs/btrfs/INSTALL
+++ b/fs/btrfs/INSTALL
@@ -5,19 +5,16 @@ directory can only be changed by btrfsctl right now, and normal filesystem
 operations do not work on it.  The default subvolume is called 'default',
 and you can create files and directories in mount_point/default
 
-Btrfs uses the crypto manager interface in the kernel for file and
-metadata checksums.  You need to compile the kernel with:
+Btrfs uses libcrc32c in the kernel for file and metadata checksums.  You need
+to compile the kernel with:
 
-CONFIG_CRYPTO=y
-CONFIG_CRYPTO_MANAGER=m
-CONFIG_CRYPTO_CRC32C=m
+CONFIG_LIBCRC32C=m
 
-cryptomanager and crc32c can be static as well.  Once your kernel is
-setup, typing make in the btrfs module sources will build against the
-running kernel.  When the build is complete:
+libcrc32c can be static as well.  Once your kernel is setup, typing make in the
+btrfs module sources will build against the running kernel.  When the build is
+complete:
 
-modprobe crc32c
-modprobe cryptomgr
+modprobe libcrc32c
 insmod btrfs.ko
 
 The Btrfs utility programs require libuuid to build.  This can be found
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 737e5a38d17..8b0eca042ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -51,7 +51,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item),
 		sizeof(*item));
 	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
-printk("find last finds key %Lu %u %Lu slot %d search for obj %Lu\n", key->objectid, key->flags, key->offset, slot, objectid);
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
@@ -141,7 +140,6 @@ int btrfs_find_dead_roots(struct btrfs_root *root)
 			ret = PTR_ERR(root);
 			goto err;
 		}
-printk("found dead root %Lu %u %Lu\n", key.objectid, key.flags, key.offset);
 		ret = btrfs_add_dead_root(dead_root,
 					  &root->fs_info->dead_roots);
 		if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 85a2a5e2714..a5a63d471e4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -308,7 +308,6 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		list_del_init(&dirty->list);
 
 		trans = btrfs_start_transaction(tree_root, 1);
-printk("deleting root %Lu %u %Lu\n", dirty->snap_key.objectid, dirty->snap_key.flags, dirty->snap_key.offset);
 		ret = btrfs_drop_snapshot(trans, dirty->root,
 					  dirty->commit_root);
 		BUG_ON(ret);
-- 
cgit v1.2.3


From 79c44584ea346024e929ac0f2a3577aac33b7b3f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 25 Jun 2007 10:09:33 -0400
Subject: Btrfs: Fix mtime and ctime updates on parent dirs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d2a0a3f674..eba06e7cf41 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -250,6 +250,7 @@ err:
 	btrfs_free_path(path);
 	if (!ret) {
 		dir->i_size -= name_len * 2;
+		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 		btrfs_update_inode(trans, root, dir);
 		drop_nlink(dentry->d_inode);
 		ret = btrfs_update_inode(trans, root, dentry->d_inode);
@@ -1037,6 +1038,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
+	struct inode *parent_inode;
 	key.objectid = inode->i_ino;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1047,7 +1049,9 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode));
 	if (ret == 0) {
-		dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
+		parent_inode = dentry->d_parent->d_inode;
+		parent_inode->i_size += dentry->d_name.len * 2;
+		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
 	}
-- 
cgit v1.2.3


From 4b52dff6d371b9b93bc99f64c32831ea9a8ec3ac Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 26 Jun 2007 10:06:50 -0400
Subject: Btrfs: Fix super block updates during transaction commit

The super block written during commit was not consistent with the state of
the trees.  This change adds an in-memory copy of the super so that we can
make sure to write out consistent data during a commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  5 ++---
 fs/btrfs/extent-tree.c | 16 ++++++++--------
 fs/btrfs/super.c       |  2 +-
 fs/btrfs/transaction.c |  8 ++++++--
 5 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fb6fffb71dd..1998f86df08 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -306,6 +306,7 @@ struct btrfs_fs_info {
 	u64 generation;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
+	struct btrfs_super_block super_copy;
 	struct buffer_head *sb_buffer;
 	struct super_block *sb;
 	struct inode *btree_inode;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d7615e1578c..7081729d5b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -471,6 +471,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!fs_info->sb_buffer)
 		goto fail_iput;
 	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
+	fs_info->disk_super = disk_super;
+	memcpy(&fs_info->super_copy, disk_super, sizeof(fs_info->super_copy));
 
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
@@ -479,7 +481,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		     btrfs_super_total_blocks(disk_super) <<
 		     fs_info->btree_inode->i_blkbits);
 
-	fs_info->disk_super = disk_super;
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
@@ -527,8 +528,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	struct buffer_head *bh = root->fs_info->sb_buffer;
 
-	btrfs_set_super_root(root->fs_info->disk_super,
-			     bh_blocknr(root->fs_info->tree_root->node));
 	lock_buffer(bh);
 	WARN_ON(atomic_read(&bh->b_count) < 1);
 	clear_buffer_dirty(bh);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8025e9f8ef1..7e550343aee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -796,8 +796,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 
 	for (i = 0; i < extent_root->fs_info->extent_tree_insert_nr; i++) {
 		ins.objectid = extent_root->fs_info->extent_tree_insert[i];
-		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
-		btrfs_set_super_blocks_used(info->disk_super,
+		super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
+		btrfs_set_super_blocks_used(&info->super_copy,
 					    super_blocks_used + 1);
 		ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item,
 					sizeof(extent_item));
@@ -892,8 +892,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			BUG_ON(ret);
 		}
 
-		super_blocks_used = btrfs_super_blocks_used(info->disk_super);
-		btrfs_set_super_blocks_used(info->disk_super,
+		super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
+		btrfs_set_super_blocks_used(&info->super_copy,
 					    super_blocks_used - num_blocks);
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret) {
@@ -1032,7 +1032,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		info->extent_tree_prealloc_nr = 0;
 	}
 	if (search_end == (u64)-1)
-		search_end = btrfs_super_total_blocks(info->disk_super);
+		search_end = btrfs_super_total_blocks(&info->super_copy);
 	if (hint_block) {
 		block_group = btrfs_lookup_block_group(info, hint_block);
 		block_group = btrfs_find_block_group(root, block_group,
@@ -1361,8 +1361,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	super_blocks_used = btrfs_super_blocks_used(info->disk_super);
-	btrfs_set_super_blocks_used(info->disk_super, super_blocks_used +
+	super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
+	btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used +
 				    num_blocks);
 	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
 				sizeof(extent_item));
@@ -1737,7 +1737,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					   BTRFS_BLOCK_GROUP_AVAIL);
 		}
 		if (key.objectid >=
-		    btrfs_super_total_blocks(info->disk_super))
+		    btrfs_super_total_blocks(&info->super_copy))
 			break;
 	}
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c11ecf50020..2e797d5fb28 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -144,7 +144,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_super_block *disk_super = root->fs_info->disk_super;
+	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_blocks(disk_super);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a5a63d471e4..3b2face593e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -380,6 +380,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		else
 			prev_trans->use_count++;
 	}
+	btrfs_set_super_generation(&root->fs_info->super_copy,
+				   cur_trans->transid);
+	btrfs_set_super_root(&root->fs_info->super_copy,
+			     bh_blocknr(root->fs_info->tree_root->node));
+	memcpy(root->fs_info->disk_super, &root->fs_info->super_copy,
+	       sizeof(root->fs_info->super_copy));
 	mutex_unlock(&root->fs_info->trans_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
@@ -389,8 +395,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		put_transaction(prev_trans);
 		mutex_unlock(&root->fs_info->trans_mutex);
 	}
-	btrfs_set_super_generation(root->fs_info->disk_super,
-				   cur_trans->transid);
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
 
-- 
cgit v1.2.3


From f2654de42a759127cb1f1e8a626ec94178732e20 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 26 Jun 2007 12:20:46 -0400
Subject: Btrfs: Allow find_free_extent callers to pass in an exclusion range

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 79 +++++++++++++++++++-------------------------------
 1 file changed, 30 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7e550343aee..01dc3057928 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,7 +25,8 @@
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start,
 			    u64 search_end, u64 hint_block,
-			    struct btrfs_key *ins, int data);
+			    struct btrfs_key *ins, u64 exclude_start,
+			    u64 exclude_nr, int data);
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -407,7 +408,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
-			       (u64)-1, 0, &ins, 0);
+			       (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
@@ -559,7 +560,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins, 0);
+	ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+			       0, 0, 0);
 	/* FIXME, set bit to recalc cache groups on next mount */
 	if (ret)
 		return ret;
@@ -868,7 +870,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0);
+	ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
@@ -987,7 +989,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *orig_root, u64 num_blocks, u64 search_start, u64
 			    search_end, u64 hint_block,
-			    struct btrfs_key *ins, int data)
+			    struct btrfs_key *ins, u64 exclude_start,
+			    u64 exclude_nr, int data)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1191,6 +1194,11 @@ check_pending:
 			goto new_group;
 		}
 	}
+	if (exclude_nr > 0 && (ins->objectid + num_blocks > exclude_start &&
+	    ins->objectid < exclude_start + exclude_nr)) {
+		search_start = exclude_start + exclude_nr;
+		goto new_group;
+	}
 	if (fill_prealloc) {
 		int nr;
 		test_block = ins->objectid;
@@ -1267,6 +1275,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	int pending_ret;
 	u64 super_blocks_used;
 	u64 search_start = 0;
+	u64 exclude_start = 0;
+	u64 exclude_nr = 0;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
@@ -1298,33 +1308,19 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	 */
 	if (data) {
 		ret = find_free_extent(trans, root, 0, 0,
-				       search_end, 0, &prealloc_key, 0);
-		if (ret) {
+				       search_end, 0, &prealloc_key, 0, 0, 0);
+		if (ret)
 			return ret;
-		}
-		if (prealloc_key.objectid + prealloc_key.offset >= search_end) {
-			int nr = info->extent_tree_prealloc_nr;
-			search_end = info->extent_tree_prealloc[nr - 1] - 1;
-		} else {
-			search_start = info->extent_tree_prealloc[0] + 1;
-		}
+		exclude_nr = info->extent_tree_prealloc_nr;
+		exclude_start = info->extent_tree_prealloc[exclude_nr - 1];
 	}
-	if (hint_block < search_start)
-		hint_block = search_start;
+
 	/* do the real allocation */
 	ret = find_free_extent(trans, root, num_blocks, search_start,
-			       search_end, hint_block, ins, data);
-	if (ret) {
-		if (search_start == 0)
-			return ret;
-		search_end = search_start - 1;
-		search_start = 0;
-		hint_block = search_start;
-		ret = find_free_extent(trans, root, num_blocks, search_start,
-				       search_end, hint_block, ins, data);
-		if (ret)
-			return ret;
-	}
+			       search_end, hint_block, ins,
+			       exclude_start, exclude_nr, data);
+	if (ret)
+		return ret;
 
 	/*
 	 * if we're doing a metadata allocation, preallocate space in the
@@ -1336,29 +1332,14 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	 * The unused prealloc will get reused the next time around.
 	 */
 	if (!data) {
-		if (ins->objectid + ins->offset >= search_end)
-			search_end = ins->objectid - 1;
-		else
-			search_start = ins->objectid + ins->offset;
-
-		if (hint_block < search_start)
-			hint_block = search_start;
-
+		exclude_start = ins->objectid;
+		exclude_nr = ins->offset;
 		ret = find_free_extent(trans, root, 0, search_start,
 				       search_end, hint_block,
-				       &prealloc_key, 0);
-		if (ret) {
-			if (search_start == 0)
-				return ret;
-			search_end = search_start - 1;
-			search_start = 0;
-			hint_block = search_start;
-			ret = find_free_extent(trans, root, 0, search_start,
-					       search_end, hint_block,
-					       &prealloc_key, 0);
-			if (ret)
-				return ret;
-		}
+				       &prealloc_key, exclude_start,
+				       exclude_nr, 0);
+		if (ret)
+			return ret;
 	}
 
 	super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
-- 
cgit v1.2.3


From ccd467d60e81b48cdbecae93532b66bcdedca91d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Jun 2007 15:57:36 -0400
Subject: Btrfs: crash recovery fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 13 +++++++++++-
 fs/btrfs/ctree.h       | 12 ++++-------
 fs/btrfs/disk-io.c     | 22 ++++++++++++--------
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/extent-tree.c | 37 ++++++++++++++++++++++++++++++----
 fs/btrfs/file.c        | 15 ++++++++++----
 fs/btrfs/inode.c       |  6 +++---
 fs/btrfs/transaction.c | 54 +++++++++++++++++++++++++++++++-------------------
 8 files changed, 112 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 606a19b5916..9eb646529ed 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -75,6 +75,17 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_node *cow_node;
 	int ret;
 
+	WARN_ON(!buffer_uptodate(buf));
+	if (trans->transaction != root->fs_info->running_transaction) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->running_transaction->transid);
+		WARN_ON(1);
+	}
+	if (trans->transid != root->fs_info->generation) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->generation);
+		WARN_ON(1);
+	}
 	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
 				    trans->transid) {
 		*cow_ret = buf;
@@ -107,7 +118,7 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
 	}
 	btrfs_block_release(root, buf);
-	mark_buffer_dirty(cow);
+	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1998f86df08..0287bd51d87 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1013,18 +1013,13 @@ static inline void btrfs_memmove(struct btrfs_root *root,
 	memmove(dst, src, nr);
 }
 
-static inline void btrfs_mark_buffer_dirty(struct buffer_head *bh)
-{
-	WARN_ON(!atomic_read(&bh->b_count));
-	mark_buffer_dirty(bh);
-}
-
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 blocknr);
@@ -1044,8 +1039,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *root);
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct radix_tree_root *unpin_radix);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7081729d5b1..d1bf5bc1bc1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -270,14 +270,6 @@ fail:
 	return NULL;
 }
 
-int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf)
-{
-	WARN_ON(atomic_read(&buf->b_count) == 0);
-	mark_buffer_dirty(buf);
-	return 0;
-}
-
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf)
 {
@@ -621,6 +613,20 @@ int close_ctree(struct btrfs_root *root)
 	return 0;
 }
 
+void btrfs_mark_buffer_dirty(struct buffer_head *bh)
+{
+	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
+	u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+	WARN_ON(!atomic_read(&bh->b_count));
+	if (transid != root->fs_info->generation) {
+		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
+			(unsigned long long)bh->b_blocknr,
+			transid, root->fs_info->generation);
+		WARN_ON(1);
+	}
+	mark_buffer_dirty(bh);
+}
+
 void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 {
 	brelse(buf);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c4a695ac44f..9e2c261b41a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -78,4 +78,5 @@ int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 int btrfs_releasepage(struct page *page, gfp_t flags);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_mark_buffer_dirty(struct buffer_head *bh);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01dc3057928..14b93268920 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -523,6 +523,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	}
 	return 0;
 fail:
+	WARN_ON(1);
 	for (i =0; i < faili; i++) {
 		if (leaf) {
 			u64 disk_blocknr;
@@ -572,7 +573,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
 			    struct btrfs_block_group_item);
 	memcpy(bi, &cache->item, sizeof(*bi));
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(extent_root, path);
 fail:
 	finish_current_insert(trans, extent_root);
@@ -739,8 +740,30 @@ static int try_remove_page(struct address_space *mapping, unsigned long index)
 	return ret;
 }
 
-int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *root)
+int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy)
+{
+	unsigned long gang[8];
+	u64 last = 0;
+	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
+	int ret;
+	int i;
+
+	while(1) {
+		ret = find_first_radix_bit(pinned_radix, gang, last,
+					   ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0 ; i < ret; i++) {
+			set_radix_bit(copy, gang[i]);
+			last = gang[i] + 1;
+		}
+	}
+	return 0;
+}
+
+int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct radix_tree_root *unpin_radix)
 {
 	unsigned long gang[8];
 	struct inode *btree_inode = root->fs_info->btree_inode;
@@ -752,7 +775,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 	struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
 
 	while(1) {
-		ret = find_first_radix_bit(pinned_radix, gang, 0,
+		ret = find_first_radix_bit(unpin_radix, gang, 0,
 					   ARRAY_SIZE(gang));
 		if (!ret)
 			break;
@@ -760,6 +783,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct
 			first = gang[0];
 		for (i = 0; i < ret; i++) {
 			clear_radix_bit(pinned_radix, gang[i]);
+			clear_radix_bit(unpin_radix, gang[i]);
 			block_group = btrfs_lookup_block_group(root->fs_info,
 							       gang[i]);
 			if (block_group) {
@@ -1309,6 +1333,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (data) {
 		ret = find_free_extent(trans, root, 0, 0,
 				       search_end, 0, &prealloc_key, 0, 0, 0);
+		BUG_ON(ret);
 		if (ret)
 			return ret;
 		exclude_nr = info->extent_tree_prealloc_nr;
@@ -1319,6 +1344,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	ret = find_free_extent(trans, root, num_blocks, search_start,
 			       search_end, hint_block, ins,
 			       exclude_start, exclude_nr, data);
+	BUG_ON(ret);
 	if (ret)
 		return ret;
 
@@ -1334,10 +1360,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (!data) {
 		exclude_start = ins->objectid;
 		exclude_nr = ins->offset;
+		hint_block = exclude_start + exclude_nr;
 		ret = find_free_extent(trans, root, 0, search_start,
 				       search_end, hint_block,
 				       &prealloc_key, exclude_start,
 				       exclude_nr, 0);
+		BUG_ON(ret);
 		if (ret)
 			return ret;
 	}
@@ -1348,6 +1376,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
 				sizeof(extent_item));
 
+	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 	if (ret) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fef7ba1e707..2456cc3e1cf 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -127,7 +127,7 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
 		     ptr, kaddr + bh_offset(bh),
 		     size);
 	kunmap_atomic(kaddr, KM_USER0);
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_free_path(path);
 	ret = btrfs_end_transaction(trans, root);
@@ -211,11 +211,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int found_type;
 	int found_extent;
 	int found_inline;
+	int recow;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	while(1) {
+		recow = 0;
 		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 					       search_start, -1);
@@ -244,6 +246,10 @@ next_slot:
 		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
 			goto out;
 		}
+		if (recow) {
+			search_start = key.offset;
+			continue;
+		}
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
@@ -274,6 +280,7 @@ next_slot:
 				nextret = btrfs_next_leaf(root, path);
 				if (nextret)
 					goto out;
+				recow = 1;
 			} else {
 				path->slots[0]++;
 			}
@@ -321,7 +328,7 @@ next_slot:
 				}
 				btrfs_set_file_extent_num_blocks(extent,
 								 new_num);
-				mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(path->nodes[0]);
 			} else {
 				WARN_ON(1);
 			}
@@ -452,6 +459,8 @@ static int prepare_pages(struct btrfs_root *root,
 			err = -ENOMEM;
 			goto failed_release;
 		}
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+		wait_on_page_writeback(pages[i]);
 	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -522,8 +531,6 @@ static int prepare_pages(struct btrfs_root *root,
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	for (i = 0; i < num_pages; i++) {
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-		wait_on_page_writeback(pages[i]);
 		offset = pos & (PAGE_CACHE_SIZE -1);
 		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
 		if (!page_has_buffers(pages[i])) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index eba06e7cf41..4fc0367d54f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -506,7 +506,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 							 extent_num_blocks);
 				inode->i_blocks -= (orig_num_blocks -
 					extent_num_blocks) << 3;
-				mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(path->nodes[0]);
 			} else {
 				extent_start =
 					btrfs_file_extent_disk_blocknr(fi);
@@ -2020,7 +2020,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
 	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
 	       sizeof(leaf->header.fsid));
-	mark_buffer_dirty(subvol);
+	btrfs_mark_buffer_dirty(subvol);
 
 	inode_item = &root_item.inode;
 	memset(inode_item, 0, sizeof(*inode_item));
@@ -2497,7 +2497,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	ptr = btrfs_file_extent_inline_start(ei);
 	btrfs_memcpy(root, path->nodes[0]->b_data,
 		     ptr, symname, name_len);
-	mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b2face593e..bec38ae8aa1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -85,6 +85,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 
 	if (root != root->fs_info->tree_root && root->last_trans <
 	    running_trans_id) {
+		WARN_ON(root == root->fs_info->extent_root);
+		WARN_ON(root->ref_cows != 1);
 		if (root->root_item.refs != 0) {
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 					   (unsigned long)root->root_key.objectid,
@@ -113,10 +115,11 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
+	WARN_ON(cur_trans != trans->transaction);
 	WARN_ON(cur_trans->num_writers < 1);
+	cur_trans->num_writers--;
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-	cur_trans->num_writers--;
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	memset(trans, 0, sizeof(*trans));
@@ -194,6 +197,7 @@ static int wait_for_commit(struct btrfs_root *root,
 			   struct btrfs_transaction *commit)
 {
 	DEFINE_WAIT(wait);
+	mutex_lock(&root->fs_info->trans_mutex);
 	while(!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
@@ -203,6 +207,7 @@ static int wait_for_commit(struct btrfs_root *root,
 		schedule();
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
+	mutex_unlock(&root->fs_info->trans_mutex);
 	finish_wait(&commit->commit_wait, &wait);
 	return 0;
 }
@@ -279,7 +284,6 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 						&root->root_item);
 			if (err)
 				break;
-
 			refs = btrfs_root_refs(&tmp_item);
 			btrfs_set_root_refs(&tmp_item, refs - 1);
 			err = btrfs_update_root(trans, root->fs_info->tree_root,
@@ -333,31 +337,53 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
+	struct radix_tree_root pinned_copy;
 	DEFINE_WAIT(wait);
 
+	init_bit_radix(&pinned_copy);
 	INIT_LIST_HEAD(&dirty_fs_roots);
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 		cur_trans = trans->transaction;
 		trans->transaction->use_count++;
+		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
+
+		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
 		put_transaction(cur_trans);
-		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
-	cur_trans = trans->transaction;
 	trans->transaction->in_commit = 1;
+	cur_trans = trans->transaction;
+	if (cur_trans->list.prev != &root->fs_info->trans_list) {
+		prev_trans = list_entry(cur_trans->list.prev,
+					struct btrfs_transaction, list);
+		if (!prev_trans->commit_done) {
+			prev_trans->use_count++;
+			mutex_unlock(&root->fs_info->fs_mutex);
+			mutex_unlock(&root->fs_info->trans_mutex);
+
+			wait_for_commit(root, prev_trans);
+			put_transaction(prev_trans);
+
+			mutex_lock(&root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->trans_mutex);
+		}
+	}
 	while (trans->transaction->num_writers > 1) {
 		WARN_ON(cur_trans != trans->transaction);
 		prepare_to_wait(&trans->transaction->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (trans->transaction->num_writers <= 1)
 			break;
+		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
 		schedule();
+		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&trans->transaction->writer_wait, &wait);
 	}
@@ -372,34 +398,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	cur_trans = root->fs_info->running_transaction;
 	root->fs_info->running_transaction = NULL;
-	if (cur_trans->list.prev != &root->fs_info->trans_list) {
-		prev_trans = list_entry(cur_trans->list.prev,
-					struct btrfs_transaction, list);
-		if (prev_trans->commit_done)
-			prev_trans = NULL;
-		else
-			prev_trans->use_count++;
-	}
 	btrfs_set_super_generation(&root->fs_info->super_copy,
 				   cur_trans->transid);
 	btrfs_set_super_root(&root->fs_info->super_copy,
 			     bh_blocknr(root->fs_info->tree_root->node));
 	memcpy(root->fs_info->disk_super, &root->fs_info->super_copy,
 	       sizeof(root->fs_info->super_copy));
+
+	btrfs_copy_pinned(root, &pinned_copy);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
-	if (prev_trans) {
-		mutex_lock(&root->fs_info->trans_mutex);
-		wait_for_commit(root, prev_trans);
-		put_transaction(prev_trans);
-		mutex_unlock(&root->fs_info->trans_mutex);
-	}
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
-
 	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_finish_extent_commit(trans, root);
+	btrfs_finish_extent_commit(trans, root, &pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
 	wake_up(&cur_trans->commit_wait);
-- 
cgit v1.2.3


From ec6b910fb330f29e169c9f502c3ac209515af8d1 Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Wed, 11 Jul 2007 10:00:37 -0400
Subject: Btrfs: trivial include fixups

Almost none of the files that include module.h actually need it,
so remove those includes.

Include sched.h in extent-tree.c to silence a warning about cond_resched()
being undeclared.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/bit-radix.c   | 1 -
 fs/btrfs/ctree.c       | 1 -
 fs/btrfs/dir-item.c    | 1 -
 fs/btrfs/disk-io.c     | 1 -
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/file-item.c   | 1 -
 fs/btrfs/file.c        | 1 -
 fs/btrfs/inode-item.c  | 1 -
 fs/btrfs/inode-map.c   | 1 -
 fs/btrfs/inode.c       | 1 -
 fs/btrfs/print-tree.c  | 1 -
 fs/btrfs/root-tree.c   | 1 -
 fs/btrfs/sysfs.c       | 1 -
 fs/btrfs/transaction.c | 1 -
 14 files changed, 1 insertion(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
index a4ddf4de979..e8bf876db39 100644
--- a/fs/btrfs/bit-radix.c
+++ b/fs/btrfs/bit-radix.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "bit-radix.h"
 
 #define BIT_ARRAY_BYTES 256
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9eb646529ed..eace2ee7644 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index d5db8bdea3d..49db5fa7ced 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d1bf5bc1bc1..60db85bc4ba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/crc32c.h>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 14b93268920..5ace2c33f1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -16,7 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
+#include <linux/sched.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 68859934ae2..ea4dd4c5fce 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2456cc3e1cf..f327525f541 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index e8b5a9f7c0c..b7834617753 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 58930fc48be..40547086625 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4fc0367d54f..d75ca5cf59b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index e66acd4e274..bafa78603bd 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 8b0eca042ea..271dd2b5c32 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ead9dc8051b..db8917e5b25 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bec38ae8aa1..0f494fe365c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include "ctree.h"
-- 
cgit v1.2.3


From a1f396304fb7e5f18e4ea81c294415375f1c814c Mon Sep 17 00:00:00 2001
From: Aneesh <aneesh.kumar@gmail.com>
Date: Wed, 11 Jul 2007 10:03:27 -0400
Subject: Btrfs: Some code cleanups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 8 ++++++--
 fs/btrfs/root-tree.c | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eace2ee7644..19a30b7c3a2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -172,11 +172,13 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (path->nodes[level + 1])
 		parent = btrfs_buffer_node(path->nodes[level + 1]);
-	parent_slot = path->slots[level + 1];
+
 	slot = path->slots[level];
 	BUG_ON(nritems == 0);
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
+
+		parent_slot = path->slots[level + 1];
 		parent_key = &parent->ptrs[parent_slot].key;
 		BUG_ON(memcmp(parent_key, &node->ptrs[0].key,
 			      sizeof(struct btrfs_disk_key)));
@@ -208,7 +210,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (path->nodes[level + 1])
 		parent = btrfs_buffer_node(path->nodes[level + 1]);
-	parent_slot = path->slots[level + 1];
+
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
 
 	if (nritems == 0)
@@ -216,6 +218,8 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
+
+		parent_slot = path->slots[level + 1];
 		parent_key = &parent->ptrs[parent_slot].key;
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct btrfs_disk_key)));
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 271dd2b5c32..402f67821c1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -135,8 +135,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root)
 		if (btrfs_root_refs(ri) != 0)
 			goto next;
 		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key);
-		if (IS_ERR(root)) {
-			ret = PTR_ERR(root);
+		if (IS_ERR(dead_root)) {
+			ret = PTR_ERR(dead_root);
 			goto err;
 		}
 		ret = btrfs_add_dead_root(dead_root,
-- 
cgit v1.2.3


From 618e21d5957c88c5ea346dfd7d1dbac96410fc5b Mon Sep 17 00:00:00 2001
From: Josef Bacik <jwhiter@redhat.com>
Date: Wed, 11 Jul 2007 10:18:17 -0400
Subject: Btrfs: Implement mknod

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d75ca5cf59b..f95cf5d4a69 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -46,6 +46,7 @@ struct btrfs_iget_args {
 static struct inode_operations btrfs_dir_inode_operations;
 static struct inode_operations btrfs_symlink_inode_operations;
 static struct inode_operations btrfs_dir_ro_inode_operations;
+static struct inode_operations btrfs_special_inode_operations;
 static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct address_space_operations btrfs_symlink_aops;
@@ -75,6 +76,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
 	u64 alloc_group_block;
+	u32 rdev;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -104,6 +106,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
 	inode->i_blocks = btrfs_inode_nblocks(inode_item);
 	inode->i_generation = btrfs_inode_generation(inode_item);
+	inode->i_rdev = 0;
+	rdev = btrfs_inode_rdev(inode_item);
 	alloc_group_block = btrfs_inode_block_group(inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
@@ -114,12 +118,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	switch (inode->i_mode & S_IFMT) {
-#if 0
-	default:
-		init_special_inode(inode, inode->i_mode,
-				   btrfs_inode_rdev(inode_item));
-		break;
-#endif
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
@@ -136,6 +134,9 @@ void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
 		break;
+	default:
+		init_special_inode(inode, inode->i_mode, rdev);
+		break;
 	}
 	return;
 
@@ -162,6 +163,7 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
 	btrfs_set_inode_nblocks(item, inode->i_blocks);
 	btrfs_set_inode_generation(item, inode->i_generation);
+	btrfs_set_inode_rdev(item, inode->i_rdev);
 	btrfs_set_inode_block_group(item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
@@ -1070,6 +1072,58 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 	return err;
 }
 
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+			int mode, dev_t rdev)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct inode *inode;
+	int err;
+	int drop_inode = 0;
+	u64 objectid;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, dir);
+
+	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+	if (err) {
+		err = -ENOSPC;
+		goto out_unlock;
+	}
+
+	inode = btrfs_new_inode(trans, root, objectid,
+				BTRFS_I(dir)->block_group, mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_unlock;
+
+	btrfs_set_trans_block_group(trans, inode);
+	err = btrfs_add_nondir(trans, dentry, inode);
+	if (err)
+		drop_inode = 1;
+	else {
+		inode->i_op = &btrfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, rdev);
+	}
+	dir->i_sb->s_dirt = 1;
+	btrfs_update_inode_block_group(trans, inode);
+	btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+	if (drop_inode) {
+		inode_dec_link_count(inode);
+		iput(inode);
+	}
+	btrfs_btree_balance_dirty(root);
+	return err;
+}
+
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
 			int mode, struct nameidata *nd)
 {
@@ -2526,6 +2580,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.rename		= btrfs_rename,
 	.symlink	= btrfs_symlink,
 	.setattr	= btrfs_setattr,
+	.mknod		= btrfs_mknod,
 };
 
 static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -2562,6 +2617,11 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.setattr	= btrfs_setattr,
 };
 
+static struct inode_operations btrfs_special_inode_operations = {
+	.getattr	= btrfs_getattr,
+	.setattr	= btrfs_setattr,
+};
+
 static struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
-- 
cgit v1.2.3


From 92fee66d49b812ea57eae53c5baaf8942ac9af06 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jul 2007 12:31:35 -0400
Subject: Btrfs: deal with api changes in 2.6.23-rc1

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  |  9 +++++++--
 fs/btrfs/inode.c | 50 ++++++++++++++++++++++++++------------------------
 2 files changed, 33 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f327525f541..1fe38fe8415 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -30,6 +30,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/version.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -713,8 +714,12 @@ out:
 }
 
 static struct vm_operations_struct btrfs_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+	.nopage         = filemap_nopage,
+	.populate       = filemap_populate,
+#else
+	.fault		= filemap_fault,
+#endif
 	.page_mkwrite	= btrfs_page_mkwrite,
 };
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f95cf5d4a69..fa9c531c86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
+#include <linux/version.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -2323,42 +2324,43 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 }
 
+static struct kmem_cache *cache_create(const char *name, size_t size,
+				       unsigned long extra_flags,
+				       void (*ctor)(void *, struct kmem_cache *,
+						    unsigned long))
+{
+	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
+				 SLAB_MEM_SPREAD | extra_flags), ctor
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+				 ,NULL
+#endif
+				);
+}
+
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
-					     sizeof(struct btrfs_inode),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     init_once, NULL);
+	btrfs_inode_cachep = cache_create("btrfs_inode_cache",
+					  sizeof(struct btrfs_inode),
+					  0, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
-	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+	btrfs_trans_handle_cachep = cache_create("btrfs_trans_handle_cache",
 					     sizeof(struct btrfs_trans_handle),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
+					     0, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+	btrfs_transaction_cachep = cache_create("btrfs_transaction_cache",
 					     sizeof(struct btrfs_transaction),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
+					     0, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
-	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
-					     sizeof(struct btrfs_transaction),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
-					     NULL, NULL);
+	btrfs_path_cachep = cache_create("btrfs_path_cache",
+					 sizeof(struct btrfs_transaction),
+					 0, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
-	btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
-					     256,
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD |
-						SLAB_DESTROY_BY_RCU),
-					     NULL, NULL);
+	btrfs_bit_radix_cachep = cache_create("btrfs_radix", 256,
+					      SLAB_DESTROY_BY_RCU, NULL);
 	if (!btrfs_bit_radix_cachep)
 		goto fail;
 	return 0;
-- 
cgit v1.2.3


From 8578f0f1fda41f8429a1037022b720275102ca65 Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Wed, 25 Jul 2007 14:04:51 -0400
Subject: Btrfs: cleaner make clean

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 5af2426116d..a4e2df66bdb 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -18,5 +18,5 @@ KERNELDIR := /lib/modules/`uname -r`/build
 all::
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
 clean::
-	rm *.o btrfs.ko
+	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
 endif
-- 
cgit v1.2.3


From 9f3a742736cecda5a8778be70faa2f779458839f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 7 Aug 2007 15:52:19 -0400
Subject: Btrfs: Do snapshot deletion in smaller chunks.

Before, snapshot deletion was a single atomic unit.  This caused considerable
lock contention and required an unbounded amount of space.  Now,
the drop_progress field in the root item is used to indicate how far along
snapshot deletion is, and to resume where it left off.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 10 ++++++++
 fs/btrfs/ctree.h       |  4 ++--
 fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++----
 fs/btrfs/transaction.c | 63 ++++++++++++++++++++++++++++++--------------------
 4 files changed, 91 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19a30b7c3a2..aa824e2c521 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -659,9 +659,16 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *b;
 	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
+	struct btrfs_root_item *root_item = &root->root_item;
 	int slot;
 	int ret;
 	int level;
+	u8 lowest_level = 0;
+
+	if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
+		lowest_level = root_item->drop_level;
+		WARN_ON(ins_len || cow);
+	}
 
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
@@ -718,6 +725,9 @@ again:
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(&c->header) == 1);
 			}
+			/* this is only true while dropping a snapshot */
+			if (level == lowest_level)
+				break;
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 		} else {
 			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0287bd51d87..73c2e75a136 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -333,10 +333,10 @@ struct btrfs_root {
 	u64 objectid;
 	u64 last_trans;
 	u32 blocksize;
-	int ref_cows;
 	u32 type;
 	u64 highest_inode;
 	u64 last_inode_alloc;
+	int ref_cows;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -1073,7 +1073,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct buffer_head *snap);
+			*root);
 /* root-item.c */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5ace2c33f1a..9455974dabe 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1561,12 +1561,21 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 	int i;
 	int slot;
 	int ret;
+	struct btrfs_root_item *root_item = &root->root_item;
+
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(
 		    btrfs_buffer_header(path->nodes[i])) - 1) {
+			struct btrfs_node *node;
+			node = btrfs_buffer_node(path->nodes[i]);
 			path->slots[i]++;
 			*level = i;
+			WARN_ON(*level == 0);
+			memcpy(&root_item->drop_progress,
+			       &node->ptrs[path->slots[i]].key,
+			       sizeof(root_item->drop_progress));
+			root_item->drop_level = i;
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
@@ -1587,7 +1596,7 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
  * decremented.
  */
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct buffer_head *snap)
+			*root)
 {
 	int ret = 0;
 	int wret;
@@ -1595,14 +1604,33 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
+	int num_walks = 0;
+	struct btrfs_root_item *root_item = &root->root_item;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	level = btrfs_header_level(btrfs_buffer_header(snap));
+	level = btrfs_header_level(btrfs_buffer_header(root->node));
 	orig_level = level;
-	path->nodes[level] = snap;
-	path->slots[level] = 0;
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		path->nodes[level] = root->node;
+		path->slots[level] = 0;
+	} else {
+		struct btrfs_key key;
+		struct btrfs_disk_key *found_key;
+		struct btrfs_node *node;
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			ret = wret;
+			goto out;
+		}
+		level = root_item->drop_level;
+		node = btrfs_buffer_node(path->nodes[level]);
+		found_key = &node->ptrs[path->slots[level]].key;
+		WARN_ON(memcmp(found_key, &root_item->drop_progress,
+			       sizeof(*found_key)));
+	}
 	while(1) {
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
@@ -1615,12 +1643,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
+		num_walks++;
+		if (num_walks > 10) {
+			struct btrfs_key key;
+			btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+			ret = -EAGAIN;
+			get_bh(root->node);
+			break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			btrfs_block_release(root, path->nodes[i]);
 		}
 	}
+out:
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0f494fe365c..498626470a0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -213,10 +213,7 @@ static int wait_for_commit(struct btrfs_root *root,
 
 struct dirty_root {
 	struct list_head list;
-	struct btrfs_key snap_key;
-	struct buffer_head *commit_root;
 	struct btrfs_root *root;
-	int free_on_drop;
 };
 
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
@@ -226,10 +223,7 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
-	memcpy(&dirty->snap_key, &root->root_key, sizeof(root->root_key));
-	dirty->commit_root = root->node;
 	dirty->root = root;
-	dirty->free_on_drop = 1;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -241,7 +235,6 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 	struct dirty_root *dirty;
 	struct btrfs_root *gang[8];
 	struct btrfs_root *root;
-	struct btrfs_root_item tmp_item;
 	int i;
 	int ret;
 	int err = 0;
@@ -267,13 +260,16 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 			}
 			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 			BUG_ON(!dirty);
-			memcpy(&dirty->snap_key, &root->root_key,
-			       sizeof(root->root_key));
-			dirty->commit_root = root->commit_root;
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			memset(&root->root_item.drop_progress, 0,
+			       sizeof(struct btrfs_disk_key));
+			root->root_item.drop_level = 0;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			dirty->root->node = root->commit_root;
 			root->commit_root = NULL;
-			dirty->root = root;
-			dirty->free_on_drop = 0;
-			memcpy(&tmp_item, &root->root_item, sizeof(tmp_item));
 
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
@@ -283,17 +279,21 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 						&root->root_item);
 			if (err)
 				break;
-			refs = btrfs_root_refs(&tmp_item);
-			btrfs_set_root_refs(&tmp_item, refs - 1);
+
+			refs = btrfs_root_refs(&dirty->root->root_item);
+			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
 			err = btrfs_update_root(trans, root->fs_info->tree_root,
-						&dirty->snap_key,
-						&tmp_item);
+						&dirty->root->root_key,
+						&dirty->root->root_item);
 
 			BUG_ON(err);
-			if (refs == 1)
+			if (refs == 1) {
 				list_add(&dirty->list, list);
-			else
+			} else {
+				WARN_ON(1);
+				kfree(dirty->root);
 				kfree(dirty);
+			}
 		}
 	}
 	return err;
@@ -305,23 +305,36 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	int err;
+
 	while(!list_empty(list)) {
 		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
-		trans = btrfs_start_transaction(tree_root, 1);
-		ret = btrfs_drop_snapshot(trans, dirty->root,
-					  dirty->commit_root);
+		while(1) {
+			trans = btrfs_start_transaction(tree_root, 1);
+			ret = btrfs_drop_snapshot(trans, dirty->root);
+			if (ret != -EAGAIN) {
+				break;
+			}
+			err = btrfs_update_root(trans,
+					tree_root,
+					&dirty->root->root_key,
+					&dirty->root->root_item);
+			if (err)
+				ret = err;
+			ret = btrfs_end_transaction(trans, tree_root);
+			BUG_ON(ret);
+		}
 		BUG_ON(ret);
-		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
+		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret)
 			break;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		if (dirty->free_on_drop)
-			kfree(dirty->root);
+		kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 		btrfs_btree_balance_dirty(tree_root);
-- 
cgit v1.2.3


From 3c69faecb8d83cb2ef085a98b196a3fecea67725 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 7 Aug 2007 15:52:22 -0400
Subject: Btrfs: Fold some btree readahead routines into something more
 generic.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c | 30 +-------------------
 fs/btrfs/inode.c       | 69 ++------------------------------------------
 4 files changed, 81 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aa824e2c521..7a08491e208 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -639,6 +639,73 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	return 1;
 }
 
+/*
+ * readahead one full node of leaves
+ */
+static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
+			     int slot)
+{
+	struct btrfs_node *node;
+	int i;
+	u32 nritems;
+	u64 item_objectid;
+	u64 blocknr;
+	u64 search;
+	u64 cluster_start;
+	int ret;
+	int nread = 0;
+	int direction = path->reada;
+	struct radix_tree_root found;
+	unsigned long gang[8];
+	struct buffer_head *bh;
+
+	if (!path->nodes[1])
+		return;
+
+	node = btrfs_buffer_node(path->nodes[1]);
+	search = btrfs_node_blockptr(node, slot);
+	bh = btrfs_find_tree_block(root, search);
+	if (bh) {
+		brelse(bh);
+		return;
+	}
+
+	init_bit_radix(&found);
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = slot; i < nritems; i++) {
+		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
+		blocknr = btrfs_node_blockptr(node, i);
+		set_radix_bit(&found, blocknr);
+	}
+	if (direction > 0) {
+		cluster_start = search - 4;
+		if (cluster_start > search)
+			cluster_start = 0;
+	} else
+		cluster_start = search + 4;
+	while(1) {
+		ret = find_first_radix_bit(&found, gang, 0, ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++) {
+			blocknr = gang[i];
+			clear_radix_bit(&found, blocknr);
+			if (nread > 64)
+				continue;
+			if (direction > 0 && cluster_start <= blocknr &&
+			    cluster_start + 8 > blocknr) {
+				cluster_start = blocknr;
+				readahead_tree_block(root, blocknr);
+				nread++;
+			} else if (direction < 0 && cluster_start >= blocknr &&
+				   blocknr + 8 > cluster_start) {
+				cluster_start = blocknr;
+				readahead_tree_block(root, blocknr);
+				nread++;
+			}
+		}
+	}
+}
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -660,9 +727,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
 	struct btrfs_root_item *root_item = &root->root_item;
+	u64 blocknr;
 	int slot;
 	int ret;
 	int level;
+	int should_reada = p->reada;
 	u8 lowest_level = 0;
 
 	if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
@@ -728,7 +797,11 @@ again:
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level)
 				break;
+			blocknr = btrfs_node_blockptr(c, slot);
+			if (level == 1 && should_reada)
+				reada_for_search(root, p, slot);
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
+
 		} else {
 			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
 			p->slots[level] = slot;
@@ -1915,6 +1988,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		blocknr = btrfs_node_blockptr(c_node, slot);
 		if (next)
 			btrfs_block_release(root, next);
+		if (level == 1 && path->reada)
+			reada_for_search(root, path, slot);
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -1927,6 +2002,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		path->slots[level] = 0;
 		if (!level)
 			break;
+		if (level == 1 && path->reada)
+			reada_for_search(root, path, slot);
 		next = read_tree_block(root,
 		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 73c2e75a136..c5a18d5d7f7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -177,6 +177,7 @@ struct btrfs_node {
 struct btrfs_path {
 	struct buffer_head *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
+	int reada;
 };
 
 /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9455974dabe..5d4d5d8db8e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -32,33 +32,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
 
-static void reada_extent_leaves(struct btrfs_root *root,
-				struct btrfs_path *path, u64 limit)
-{
-	struct btrfs_node *node;
-	int i;
-	int nritems;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-	int ret;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1] + 1;
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot; i < nritems && i < slot + 8; i++) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid > limit)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
-		if (ret)
-			break;
-	}
-}
-
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -84,6 +57,7 @@ static int cache_block_group(struct btrfs_root *root,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	path->reada = 1;
 	key.objectid = block_group->key.objectid;
 	key.flags = 0;
 	key.offset = 0;
@@ -94,12 +68,10 @@ static int cache_block_group(struct btrfs_root *root,
 	if (ret && path->slots[0] > 0)
 		path->slots[0]--;
 	limit = block_group->key.objectid + block_group->key.offset;
-	reada_extent_leaves(root, path, limit);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&leaf->header)) {
-			reada_extent_leaves(root, path, limit);
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fa9c531c86b..3889032fc44 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -374,40 +374,6 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-/*
- * truncates go from a high offset to a low offset.  So, walk
- * from hi to lo in the node and issue readas.  Stop when you find
- * keys from a different objectid
- */
-static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
-			   u64 objectid)
-{
-	struct btrfs_node *node;
-	int i;
-	int nritems;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-	int ret;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1];
-	if (slot == 0)
-		return;
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot - 1; i >= 0; i--) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid != objectid)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
-		if (ret)
-			break;
-	}
-}
-
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -434,6 +400,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int del_item;
 
 	path = btrfs_alloc_path();
+	path->reada = -1;
 	BUG_ON(!path);
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
@@ -450,7 +417,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			BUG_ON(path->slots[0] == 0);
 			path->slots[0]--;
 		}
-		reada_truncate(root, path, inode->i_ino);
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		found_key = &leaf->items[path->slots[0]].key;
 		found_type = btrfs_disk_key_type(found_key);
@@ -827,36 +793,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	return d_splice_alias(inode, dentry);
 }
 
-/*
- * readahead one full node of leaves as long as their keys include
- * the objectid supplied
- */
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
-			 u64 objectid)
-{
-	struct btrfs_node *node;
-	int i;
-	u32 nritems;
-	u64 item_objectid;
-	u64 blocknr;
-	int slot;
-	int ret;
-
-	if (!path->nodes[1])
-		return;
-	node = btrfs_buffer_node(path->nodes[1]);
-	slot = path->slots[1];
-	nritems = btrfs_header_nritems(&node->header);
-	for (i = slot + 1; i < nritems; i++) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-		if (item_objectid != objectid)
-			break;
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
-		if (ret)
-			break;
-	}
-}
 static unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
@@ -890,18 +826,17 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
+	path->reada = 1;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
 	advance = 0;
-	reada_leaves(root, path, inode->i_ino);
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		nritems = btrfs_header_nritems(&leaf->header);
 		slot = path->slots[0];
 		if (advance || slot >= nritems) {
 			if (slot >= nritems -1) {
-				reada_leaves(root, path, inode->i_ino);
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
-- 
cgit v1.2.3


From 6702ed490ca0bb44e17131818a5a18b773957c5a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 7 Aug 2007 16:15:09 -0400
Subject: Btrfs: Add run time btree defrag, and an ioctl to force btree defrag

This adds two types of btree defrag: a run-time form that tries to
defrag recently allocated blocks in the btree while they are still in RAM,
and an ioctl that forces a defrag of all btree blocks.

File data blocks are not defragged yet, but this can make a huge difference
in sequential btree reads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.c       | 195 +++++++++++++++++++++++++++++++++----------
 fs/btrfs/ctree.h       |  19 ++++-
 fs/btrfs/disk-io.c     |   8 ++
 fs/btrfs/extent-tree.c |  53 +++++++-----
 fs/btrfs/file.c        |   2 +-
 fs/btrfs/inode.c       |  33 +++++++-
 fs/btrfs/ioctl.h       |   2 +-
 fs/btrfs/transaction.c | 105 +++++++++++++++++++----
 fs/btrfs/transaction.h |   1 +
 fs/btrfs/tree-defrag.c | 222 +++++++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 553 insertions(+), 89 deletions(-)
 create mode 100644 fs/btrfs/tree-defrag.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a4e2df66bdb..9321438c5ee 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o
+	   transaction.o bit-radix.o inode.o file.o tree-defrag.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7a08491e208..c7e47e77723 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -65,44 +65,44 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
-static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 			   *root, struct buffer_head *buf, struct buffer_head
 			   *parent, int parent_slot, struct buffer_head
-			   **cow_ret)
+			   **cow_ret, u64 search_start, u64 empty_size)
 {
 	struct buffer_head *cow;
 	struct btrfs_node *cow_node;
-	int ret;
+	int ret = 0;
+	int different_trans = 0;
 
+	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 	WARN_ON(!buffer_uptodate(buf));
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->running_transaction->transid);
-		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
-		WARN_ON(1);
-	}
-	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
-				    trans->transid) {
-		*cow_ret = buf;
-		return 0;
-	}
-	cow = btrfs_alloc_free_block(trans, root, buf->b_blocknr);
+	cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
+
 	cow_node = btrfs_buffer_node(cow);
 	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
 		WARN_ON(1);
+
 	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
 	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
 	btrfs_set_header_generation(&cow_node->header, trans->transid);
 	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
-	ret = btrfs_inc_ref(trans, root, buf);
-	if (ret)
-		return ret;
+
+	WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
+		trans->transid);
+	if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+				    trans->transid) {
+		different_trans = 1;
+		ret = btrfs_inc_ref(trans, root, buf);
+		if (ret)
+			return ret;
+	} else {
+		WARN_ON(!root->ref_cows);
+		clean_tree_block(trans, root, buf);
+	}
+
 	if (buf == root->node) {
 		root->node = cow;
 		get_bh(cow);
@@ -114,6 +114,8 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
 					bh_blocknr(cow));
 		btrfs_mark_buffer_dirty(parent);
+		WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
+				    trans->transid);
 		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
 	}
 	btrfs_block_release(root, buf);
@@ -122,6 +124,115 @@ static int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 	return 0;
 }
 
+/*
+ * Copy-on-write entry point for a single tree block.
+ *
+ * Complains (printk + WARN_ON) when the handle's transaction is not the
+ * running transaction, or when its transid differs from
+ * fs_info->generation, and then carries on regardless.
+ *
+ * A buffer whose header generation already equals trans->transid is handed
+ * back untouched through cow_ret.  Any other buffer is passed on to
+ * __btrfs_cow_block() with an allocation hint equal to the buffer's own
+ * block number rounded down to a multiple of 65536, and with no extra
+ * empty space requested.
+ *
+ * Returns 0 when no copy is needed, otherwise whatever
+ * __btrfs_cow_block() returns.
+ */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct buffer_head *buf, struct buffer_head
+			   *parent, int parent_slot, struct buffer_head
+			   **cow_ret)
+{
+	u64 alloc_hint;
+
+	if (root->fs_info->running_transaction != trans->transaction) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->running_transaction->transid);
+		WARN_ON(1);
+	}
+	if (root->fs_info->generation != trans->transid) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->generation);
+		WARN_ON(1);
+	}
+
+	if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
+	    trans->transid) {
+		/* hint: start of the 65536-block aligned region holding buf */
+		alloc_hint = bh_blocknr(buf) & ~((u64)65535);
+		return __btrfs_cow_block(trans, root, buf, parent,
+					 parent_slot, cow_ret, alloc_hint, 0);
+	}
+
+	/* the block already carries this transaction's generation */
+	*cow_ret = buf;
+	return 0;
+}
+
+/*
+ * Return 1 when the two block numbers are distinct and fewer than 8
+ * blocks apart (in either direction), 0 otherwise.  Equal block numbers
+ * yield 0.
+ */
+static int close_blocks(u64 blocknr, u64 other)
+{
+	u64 distance;
+
+	if (blocknr == other)
+		return 0;
+
+	distance = blocknr < other ? other - blocknr : blocknr - other;
+	return distance < 8;
+}
+
+/*
+ * Btree defrag helper: walk every block pointer in 'parent' and push the
+ * children that are not close (see close_blocks()) to their neighbouring
+ * pointers through __btrfs_cow_block(), so they get reallocated.
+ *
+ * trans:      transaction handle; a mismatch with the running transaction
+ *             or with fs_info->generation is only warned about
+ * root:       tree the node belongs to
+ * parent:     node whose child pointers are examined
+ * cache_only: when non-zero, a child that is not found by
+ *             btrfs_find_tree_block(), is not uptodate, or is locked is
+ *             skipped instead of being read with read_tree_block()
+ *
+ * The first reallocation uses the child's block number rounded down to a
+ * multiple of 65536 as its search start; each later one starts from the
+ * block returned by the previous copy.  Up to 8 blocks of extra empty
+ * space are requested (fewer near the end of the node).
+ *
+ * Returns 0 on success or when the node holds a single pointer, -EIO when
+ * a child cannot be read, or the error returned by __btrfs_cow_block().
+ */
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct buffer_head *parent,
+		       int cache_only)
+{
+	struct btrfs_node *parent_node;
+	struct buffer_head *cur_bh;
+	struct buffer_head *tmp_bh;
+	u64 blocknr;
+	u64 search_start = 0;
+	u64 other;
+	u32 parent_nritems;
+	int start_slot;
+	int end_slot;
+	int i;
+	int err = 0;
+
+	if (trans->transaction != root->fs_info->running_transaction) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->running_transaction->transid);
+		WARN_ON(1);
+	}
+	if (trans->transid != root->fs_info->generation) {
+		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		       root->fs_info->generation);
+		WARN_ON(1);
+	}
+	parent_node = btrfs_buffer_node(parent);
+	parent_nritems = btrfs_header_nritems(&parent_node->header);
+
+	start_slot = 0;
+	end_slot = parent_nritems;
+
+	/* a lone pointer has no neighbour to be placed next to */
+	if (parent_nritems == 1)
+		return 0;
+
+	for (i = start_slot; i < end_slot; i++) {
+		int close = 1;
+
+		/* leave the child alone if it sits near both neighbours */
+		blocknr = btrfs_node_blockptr(parent_node, i);
+		if (i > 0) {
+			other = btrfs_node_blockptr(parent_node, i - 1);
+			close = close_blocks(blocknr, other);
+		}
+		if (close && i < end_slot - 1) {
+			other = btrfs_node_blockptr(parent_node, i + 1);
+			close = close_blocks(blocknr, other);
+		}
+		if (close)
+			continue;
+
+		cur_bh = btrfs_find_tree_block(root, blocknr);
+		if (!cur_bh || !buffer_uptodate(cur_bh) ||
+		    buffer_locked(cur_bh)) {
+			/* cur_bh may be NULL here, as in the original code */
+			brelse(cur_bh);
+			if (cache_only)
+				continue;
+			cur_bh = read_tree_block(root, blocknr);
+			/* do not dereference a failed read below */
+			if (!cur_bh) {
+				err = -EIO;
+				break;
+			}
+		}
+		if (search_start == 0)
+			search_start = bh_blocknr(cur_bh) & ~((u64)65535);
+
+		err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
+					&tmp_bh, search_start,
+					min(8, end_slot - i));
+		if (err) {
+			/*
+			 * __btrfs_cow_block() only drops its reference on
+			 * the source buffer when it succeeds, so release
+			 * ours here instead of leaking it.
+			 */
+			brelse(cur_bh);
+			break;
+		}
+		search_start = bh_blocknr(tmp_bh);
+		brelse(tmp_bh);
+	}
+	return err;
+}
+
 /*
  * The leaf data grows from end-to-front in the node.
  * this returns the address of the start of the last item,
@@ -221,6 +332,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 
 		parent_slot = path->slots[level + 1];
 		parent_key = &parent->ptrs[parent_slot].key;
+
 		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
 		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
@@ -643,7 +755,7 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves
  */
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-			     int slot)
+			     int level, int slot)
 {
 	struct btrfs_node *node;
 	int i;
@@ -659,10 +771,13 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	unsigned long gang[8];
 	struct buffer_head *bh;
 
-	if (!path->nodes[1])
+	if (level == 0)
+		return;
+
+	if (!path->nodes[level])
 		return;
 
-	node = btrfs_buffer_node(path->nodes[1]);
+	node = btrfs_buffer_node(path->nodes[level]);
 	search = btrfs_node_blockptr(node, slot);
 	bh = btrfs_find_tree_block(root, search);
 	if (bh) {
@@ -690,7 +805,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 		for (i = 0; i < ret; i++) {
 			blocknr = gang[i];
 			clear_radix_bit(&found, blocknr);
-			if (nread > 64)
+			if (nread > 32)
 				continue;
 			if (direction > 0 && cluster_start <= blocknr &&
 			    cluster_start + 8 > blocknr) {
@@ -726,7 +841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct buffer_head *b;
 	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
-	struct btrfs_root_item *root_item = &root->root_item;
 	u64 blocknr;
 	int slot;
 	int ret;
@@ -734,11 +848,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int should_reada = p->reada;
 	u8 lowest_level = 0;
 
-	if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
-		lowest_level = root_item->drop_level;
-		WARN_ON(ins_len || cow);
-	}
-
+	lowest_level = p->lowest_level;
+	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
@@ -798,8 +909,8 @@ again:
 			if (level == lowest_level)
 				break;
 			blocknr = btrfs_node_blockptr(c, slot);
-			if (level == 1 && should_reada)
-				reada_for_search(root, p, slot);
+			if (should_reada)
+				reada_for_search(root, p, level, slot);
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 
 		} else {
@@ -960,7 +1071,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr);
+	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 	c = btrfs_buffer_node(t);
@@ -1070,7 +1181,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr);
+	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
 	if (IS_ERR(split_buffer))
 		return PTR_ERR(split_buffer);
 
@@ -1461,7 +1572,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	nritems = btrfs_header_nritems(&l->header);
 	mid = (nritems + 1)/ 2;
 
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
 	if (IS_ERR(right_buffer))
 		return PTR_ERR(right_buffer);
 
@@ -1560,7 +1671,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (!double_split)
 		return ret;
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr);
+	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
 	if (IS_ERR(right_buffer))
 		return PTR_ERR(right_buffer);
 
@@ -1988,8 +2099,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		blocknr = btrfs_node_blockptr(c_node, slot);
 		if (next)
 			btrfs_block_release(root, next);
-		if (level == 1 && path->reada)
-			reada_for_search(root, path, slot);
+		if (path->reada)
+			reada_for_search(root, path, level, slot);
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -2002,8 +2113,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		path->slots[level] = 0;
 		if (!level)
 			break;
-		if (level == 1 && path->reada)
-			reada_for_search(root, path, slot);
+		if (path->reada)
+			reada_for_search(root, path, level, slot);
 		next = read_tree_block(root,
 		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c5a18d5d7f7..42aa20301bc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -178,6 +178,7 @@ struct btrfs_path {
 	struct buffer_head *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
 	int reada;
+	int lowest_level;
 };
 
 /*
@@ -338,6 +339,9 @@ struct btrfs_root {
 	u64 highest_inode;
 	u64 last_inode_alloc;
 	int ref_cows;
+	struct btrfs_key defrag_progress;
+	int defrag_running;
+	int defrag_level;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -1031,10 +1035,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root, u64 hint);
+					    struct btrfs_root *root, u64 hint,
+					    u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 search_start,
+		       u64 num_blocks, u64 empty_size, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct buffer_head *buf);
@@ -1051,6 +1056,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
+int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
+			   *root, struct buffer_head *buf, struct buffer_head
+			   *parent, int parent_slot, struct buffer_head
+			   **cow_ret);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1060,6 +1069,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
+int btrfs_realloc_node(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct buffer_head *parent,
+		       int cache_only);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
@@ -1171,4 +1183,7 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 *hint_block);
+/* tree-defrag.c */
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, int cache_only);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60db85bc4ba..c948416fea3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -273,7 +273,9 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct buffer_head *buf)
 {
 	WARN_ON(atomic_read(&buf->b_count) == 0);
+	lock_buffer(buf);
 	clear_buffer_dirty(buf);
+	unlock_buffer(buf);
 	return 0;
 }
 
@@ -294,6 +296,9 @@ static int __setup_root(int blocksize,
 	root->last_inode_alloc = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
+	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+	root->defrag_running = 0;
+	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
 	return 0;
 }
@@ -585,6 +590,7 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	btrfs_transaction_flush_work(root);
 	mutex_lock(&fs_info->fs_mutex);
+	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -616,7 +622,9 @@ void btrfs_mark_buffer_dirty(struct buffer_head *bh)
 {
 	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+
 	WARN_ON(!atomic_read(&bh->b_count));
+
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
 			(unsigned long long)bh->b_blocknr,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5d4d5d8db8e..26b8d340649 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,8 @@
 #include "transaction.h"
 
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 search_start,
+			    *orig_root, u64 num_blocks, u64 empty_size,
+			    u64 search_start,
 			    u64 search_end, u64 hint_block,
 			    struct btrfs_key *ins, u64 exclude_start,
 			    u64 exclude_nr, int data);
@@ -379,7 +380,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0,
+	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
 			       (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
@@ -533,7 +534,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_item *bi;
 	struct btrfs_key ins;
 
-	ret = find_free_extent(trans, extent_root, 0, 0, (u64)-1, 0, &ins,
+	ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
 			       0, 0, 0);
 	/* FIXME, set bit to recalc cache groups on next mount */
 	if (ret)
@@ -708,6 +709,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 static int try_remove_page(struct address_space *mapping, unsigned long index)
 {
 	int ret;
+	return 0;
 	ret = invalidate_mapping_pages(mapping, index, index);
 	return ret;
 }
@@ -866,7 +868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_extent(trans, root, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
+	ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
 	if (ret) {
 		btrfs_free_path(path);
 		return ret;
@@ -983,8 +985,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 search_start, u64
-			    search_end, u64 hint_block,
+			    *orig_root, u64 num_blocks, u64 empty_size,
+			    u64 search_start, u64 search_end, u64 hint_block,
 			    struct btrfs_key *ins, u64 exclude_start,
 			    u64 exclude_nr, int data)
 {
@@ -1042,6 +1044,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 						     data, 1);
 	}
 
+	total_needed += empty_size;
 	path = btrfs_alloc_path();
 
 check_failed:
@@ -1157,9 +1160,11 @@ check_pending:
 			goto error;
 		}
 		search_start = orig_search_start;
-		if (wrapped)
+		if (wrapped) {
+			if (!full_scan)
+				total_needed -= empty_size;
 			full_scan = 1;
-		else
+		} else
 			wrapped = 1;
 		goto new_group;
 	}
@@ -1238,9 +1243,11 @@ new_group:
 			ret = -ENOSPC;
 			goto error;
 		}
-		if (wrapped)
+		if (wrapped) {
+			if (!full_scan)
+				total_needed -= empty_size;
 			full_scan = 1;
-		else
+		} else
 			wrapped = 1;
 	}
 	block_group = btrfs_lookup_block_group(info, search_start);
@@ -1264,7 +1271,7 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 hint_block,
+		       u64 num_blocks, u64 empty_size, u64 hint_block,
 		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
@@ -1303,7 +1310,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	 * in the correct block group.
 	 */
 	if (data) {
-		ret = find_free_extent(trans, root, 0, 0,
+		ret = find_free_extent(trans, root, 0, 0, 0,
 				       search_end, 0, &prealloc_key, 0, 0, 0);
 		BUG_ON(ret);
 		if (ret)
@@ -1313,8 +1320,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 
 	/* do the real allocation */
-	ret = find_free_extent(trans, root, num_blocks, search_start,
-			       search_end, hint_block, ins,
+	ret = find_free_extent(trans, root, num_blocks, empty_size,
+			       search_start, search_end, hint_block, ins,
 			       exclude_start, exclude_nr, data);
 	BUG_ON(ret);
 	if (ret)
@@ -1333,7 +1340,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		exclude_start = ins->objectid;
 		exclude_nr = ins->offset;
 		hint_block = exclude_start + exclude_nr;
-		ret = find_free_extent(trans, root, 0, search_start,
+		ret = find_free_extent(trans, root, 0, 0, search_start,
 				       search_end, hint_block,
 				       &prealloc_key, exclude_start,
 				       exclude_nr, 0);
@@ -1368,14 +1375,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root, u64 hint)
+					   struct btrfs_root *root, u64 hint,
+					   u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, hint, (unsigned long)-1, &ins, 0);
+				 1, empty_size, hint,
+				 (unsigned long)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -1385,6 +1394,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		btrfs_free_extent(trans, root, ins.objectid, 1, 0);
 		return ERR_PTR(-ENOMEM);
 	}
+	WARN_ON(buffer_dirty(buf));
 	set_buffer_uptodate(buf);
 	set_buffer_checked(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
@@ -1591,13 +1601,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		struct btrfs_key key;
 		struct btrfs_disk_key *found_key;
 		struct btrfs_node *node;
+
 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		level = root_item->drop_level;
+		path->lowest_level = level;
 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0) {
+		if (wret < 0) {
 			ret = wret;
 			goto out;
 		}
-		level = root_item->drop_level;
 		node = btrfs_buffer_node(path->nodes[level]);
 		found_key = &node->ptrs[path->slots[level]].key;
 		WARN_ON(memcmp(found_key, &root_item->drop_progress,
@@ -1617,8 +1629,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 		num_walks++;
 		if (num_walks > 10) {
-			struct btrfs_key key;
-			btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
 			ret = -EAGAIN;
 			get_bh(root->node);
 			break;
@@ -1627,6 +1637,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			btrfs_block_release(root, path->nodes[i]);
+			path->nodes[i] = 0;
 		}
 	}
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1fe38fe8415..00b118a2db6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -512,7 +512,7 @@ static int prepare_pages(struct btrfs_root *root,
 	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
 	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		err = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, hint_block, (u64)-1,
+					 num_blocks, 0, hint_block, (u64)-1,
 					 &ins, 1);
 		if (err)
 			goto failed_truncate;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3889032fc44..12aa043b9f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -554,7 +554,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 				 &alloc_hint);
 	if (ret)
 		goto out;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
 	if (ret)
 		goto out;
@@ -1360,7 +1360,7 @@ not_found:
 	if (create & BTRFS_GET_BLOCK_CREATE) {
 		struct btrfs_key ins;
 		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 1, alloc_hint, (u64)-1,
+					 1, 0, alloc_hint, (u64)-1,
 					 &ins, 1);
 		if (ret) {
 			err = ret;
@@ -1998,7 +1998,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	subvol = btrfs_alloc_free_block(trans, root, 0);
+	subvol = btrfs_alloc_free_block(trans, root, 0, 0);
 	if (IS_ERR(subvol))
 		return PTR_ERR(subvol);
 	leaf = btrfs_buffer_leaf(subvol);
@@ -2159,7 +2159,9 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
+	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	int err;
 	struct btrfs_dir_item *di;
 	int namelen;
 	struct btrfs_path *path;
@@ -2196,6 +2198,31 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		else
 			ret = create_snapshot(root, vol_args.name, namelen);
 		break;
+
+	case BTRFS_IOC_DEFRAG:
+		mutex_lock(&root->fs_info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		memset(&root->defrag_progress, 0,
+		       sizeof(root->defrag_progress));
+		while (1) {
+			root->defrag_running = 1;
+			err = btrfs_defrag_leaves(trans, root, 0);
+
+			btrfs_end_transaction(trans, root);
+			mutex_unlock(&root->fs_info->fs_mutex);
+
+			btrfs_btree_balance_dirty(root);
+
+			mutex_lock(&root->fs_info->fs_mutex);
+			trans = btrfs_start_transaction(root, 1);
+			if (err != -EAGAIN)
+				break;
+		}
+		root->defrag_running = 0;
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		ret = 0;
+		break;
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 23bed48c51a..8bc47dec286 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -28,6 +28,6 @@ struct btrfs_ioctl_vol_args {
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
-#define BTRFS_IOC_ADD_DISK _IOW(BTRFS_IOCTL_MAGIC, 2, \
+#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
 				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 498626470a0..338a7199363 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -29,6 +29,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 static struct workqueue_struct *trans_wq;
 
 #define BTRFS_ROOT_TRANS_TAG 0
+#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -69,35 +70,41 @@ static int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
+static int record_root_in_trans(struct btrfs_root *root)
+{
+	u64 running_trans_id = root->fs_info->running_transaction->transid;
+	if (root->ref_cows && root->last_trans < running_trans_id) {
+		WARN_ON(root == root->fs_info->extent_root);
+		if (root->root_item.refs != 0) {
+			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+				   (unsigned long)root->root_key.objectid,
+				   BTRFS_ROOT_TRANS_TAG);
+			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+				   (unsigned long)root->root_key.objectid,
+				   BTRFS_ROOT_DEFRAG_TAG);
+			root->commit_root = root->node;
+			get_bh(root->node);
+		} else {
+			WARN_ON(1);
+		}
+		root->last_trans = running_trans_id;
+	}
+	return 0;
+}
+
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
-	u64 running_trans_id;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	ret = join_transaction(root);
 	BUG_ON(ret);
-	running_trans_id = root->fs_info->running_transaction->transid;
 
-	if (root != root->fs_info->tree_root && root->last_trans <
-	    running_trans_id) {
-		WARN_ON(root == root->fs_info->extent_root);
-		WARN_ON(root->ref_cows != 1);
-		if (root->root_item.refs != 0) {
-			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-					   (unsigned long)root->root_key.objectid,
-					   BTRFS_ROOT_TRANS_TAG);
-			root->commit_root = root->node;
-			get_bh(root->node);
-		} else {
-			WARN_ON(1);
-		}
-	}
-	root->last_trans = running_trans_id;
-	h->transid = running_trans_id;
+	record_root_in_trans(root);
+	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
@@ -155,6 +162,15 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 					      gang[i]);
 			if (!page)
 				continue;
+			if (PageWriteback(page)) {
+				if (PageDirty(page))
+					wait_on_page_writeback(page);
+				else {
+					unlock_page(page);
+					page_cache_release(page);
+					continue;
+				}
+			}
 			err = write_one_page(page, 0);
 			if (err)
 				werr = err;
@@ -299,6 +315,58 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return err;
 }
 
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
+{
+	struct btrfs_root *gang[1];
+	struct btrfs_root *root;
+	struct btrfs_root *tree_root = info->tree_root;
+	struct btrfs_trans_handle *trans;
+	int i;
+	int ret;
+	int err = 0;
+	u64 last = 0;
+
+	trans = btrfs_start_transaction(tree_root, 1);
+	while(1) {
+		ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
+						 (void **)gang, last,
+						 ARRAY_SIZE(gang),
+						 BTRFS_ROOT_DEFRAG_TAG);
+		if (ret == 0)
+			break;
+		for (i = 0; i < ret; i++) {
+			root = gang[i];
+			last = root->root_key.objectid + 1;
+			radix_tree_tag_clear(&info->fs_roots_radix,
+				     (unsigned long)root->root_key.objectid,
+				     BTRFS_ROOT_DEFRAG_TAG);
+			if (root->defrag_running)
+				continue;
+
+			while (1) {
+				mutex_lock(&root->fs_info->trans_mutex);
+				record_root_in_trans(root);
+				mutex_unlock(&root->fs_info->trans_mutex);
+
+				root->defrag_running = 1;
+				err = btrfs_defrag_leaves(trans, root, 1);
+				btrfs_end_transaction(trans, tree_root);
+				mutex_unlock(&info->fs_mutex);
+
+				btrfs_btree_balance_dirty(root);
+
+				mutex_lock(&info->fs_mutex);
+				trans = btrfs_start_transaction(tree_root, 1);
+				if (err != -EAGAIN)
+					break;
+			}
+			root->defrag_running = 0;
+		}
+	}
+	btrfs_end_transaction(trans, tree_root);
+	return err;
+}
+
 static int drop_dirty_roots(struct btrfs_root *tree_root,
 			    struct list_head *list)
 {
@@ -475,6 +543,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 		goto out;
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
+	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 out:
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ebf44f3e111..8b2714e6556 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -70,5 +70,6 @@ void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
new file mode 100644
index 00000000000..15d0a486fb5
--- /dev/null
+++ b/fs/btrfs/tree-defrag.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "print-tree.h"
+#include "transaction.h"
+
+static void reada_defrag(struct btrfs_root *root,
+			 struct btrfs_node *node)
+{
+	int i;
+	u32 nritems;
+	u64 blocknr;
+	int ret;
+
+	nritems = btrfs_header_nritems(&node->header);
+	for (i = 0; i < nritems; i++) {
+		blocknr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, blocknr);
+		if (ret)
+			break;
+	}
+}
+
+static int defrag_walk_down(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, int *level,
+			    int cache_only)
+{
+	struct buffer_head *next;
+	struct buffer_head *cur;
+	u64 blocknr;
+	int ret = 0;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while(*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+		cur = path->nodes[*level];
+
+		if (!cache_only && *level > 1 && path->slots[*level] == 0)
+			reada_defrag(root, btrfs_buffer_node(cur));
+
+		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+			WARN_ON(1);
+
+		if (path->slots[*level] >=
+		    btrfs_header_nritems(btrfs_buffer_header(cur)))
+			break;
+
+		if (*level == 1) {
+			ret = btrfs_realloc_node(trans, root,
+						 path->nodes[*level],
+						 cache_only);
+			break;
+		}
+		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
+					      path->slots[*level]);
+
+		if (cache_only) {
+			next = btrfs_find_tree_block(root, blocknr);
+			if (!next || !buffer_uptodate(next) ||
+			   buffer_locked(next)) {
+				brelse(next);
+				path->slots[*level]++;
+				continue;
+			}
+		} else {
+			next = read_tree_block(root, blocknr);
+		}
+		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
+				      path->slots[*level], &next);
+		BUG_ON(ret);
+		ret = btrfs_realloc_node(trans, root, next, cache_only);
+		BUG_ON(ret);
+		WARN_ON(*level <= 0);
+		if (path->nodes[*level-1])
+			btrfs_block_release(root, path->nodes[*level-1]);
+		path->nodes[*level-1] = next;
+		*level = btrfs_header_level(btrfs_buffer_header(next));
+		path->slots[*level] = 0;
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+	btrfs_block_release(root, path->nodes[*level]);
+	path->nodes[*level] = NULL;
+	*level += 1;
+	WARN_ON(ret);
+	return 0;
+}
+
+static int defrag_walk_up(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path, int *level,
+			  int cache_only)
+{
+	int i;
+	int slot;
+	struct btrfs_node *node;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(
+		    btrfs_buffer_header(path->nodes[i])) - 1) {
+			path->slots[i]++;
+			*level = i;
+			node = btrfs_buffer_node(path->nodes[i]);
+			WARN_ON(i == 0);
+			btrfs_disk_key_to_cpu(&root->defrag_progress,
+					      &node->ptrs[path->slots[i]].key);
+			root->defrag_level = i;
+			return 0;
+		} else {
+			btrfs_block_release(root, path->nodes[*level]);
+			path->nodes[*level] = NULL;
+			*level = i + 1;
+		}
+	}
+	return 1;
+}
+
+int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, int cache_only)
+{
+	struct btrfs_path *path = NULL;
+	struct buffer_head *tmp;
+	int ret = 0;
+	int wret;
+	int level;
+	int orig_level;
+	int i;
+	int num_runs = 0;
+
+	if (root->ref_cows == 0) {
+		goto out;
+	}
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	orig_level = level;
+	if (level == 0) {
+		goto out;
+	}
+	if (root->defrag_progress.objectid == 0) {
+		get_bh(root->node);
+		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+		BUG_ON(ret);
+		ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+		BUG_ON(ret);
+		path->nodes[level] = root->node;
+		path->slots[level] = 0;
+	} else {
+		level = root->defrag_level;
+		path->lowest_level = level;
+		wret = btrfs_search_slot(trans, root, &root->defrag_progress,
+					 path, 0, 1);
+
+		if (wret < 0) {
+			ret = wret;
+			goto out;
+		}
+		while(level > 0 && !path->nodes[level])
+			level--;
+		if (!path->nodes[level]) {
+			ret = 0;
+			goto out;
+		}
+	}
+
+	while(1) {
+		wret = defrag_walk_down(trans, root, path, &level, cache_only);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = defrag_walk_up(trans, root, path, &level, cache_only);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+		if (num_runs++ > 8) {
+			ret = -EAGAIN;
+			break;
+		}
+	}
+	for (i = 0; i <= orig_level; i++) {
+		if (path->nodes[i]) {
+			btrfs_block_release(root, path->nodes[i]);
+			path->nodes[i] = 0;
+		}
+	}
+out:
+	if (path)
+		btrfs_free_path(path);
+	if (ret != -EAGAIN) {
+		memset(&root->defrag_progress, 0,
+		       sizeof(root->defrag_progress));
+	}
+	return ret;
+}
-- 
cgit v1.2.3


From f4468e94c86c2031f447788c4bfe7dfd2fcdc93a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 8 Aug 2007 10:08:58 -0400
Subject: Btrfs: Let some locks go during defrag and snapshot dropping

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 4 ++--
 fs/btrfs/extent-tree.c | 6 ++++--
 fs/btrfs/transaction.c | 8 ++++++++
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 42aa20301bc..8b7d01597f4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -300,9 +300,9 @@ struct btrfs_fs_info {
 	struct radix_tree_root block_group_data_radix;
 	struct radix_tree_root extent_map_radix;
 
-	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 3];
+	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 6];
 	int extent_tree_insert_nr;
-	u64 extent_tree_prealloc[BTRFS_MAX_LEVEL * 3];
+	u64 extent_tree_prealloc[BTRFS_MAX_LEVEL * 6];
 	int extent_tree_prealloc_nr;
 
 	u64 generation;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 26b8d340649..f7d76d34fc3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1018,7 +1018,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (num_blocks == 0) {
 		fill_prealloc = 1;
 		num_blocks = 1;
-		total_needed = (min(level + 1, BTRFS_MAX_LEVEL) + 2) * 3;
+		total_needed = (min(level + 1, BTRFS_MAX_LEVEL)) * 6;
 	}
 	if (fill_prealloc) {
 		u64 first;
@@ -1300,6 +1300,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			ins->objectid;
 		ret = update_block_group(trans, root,
 					 ins->objectid, ins->offset, 1, 0, 0);
+		WARN_ON(info->extent_tree_insert_nr >
+			ARRAY_SIZE(info->extent_tree_insert));
 		BUG_ON(ret);
 		return 0;
 	}
@@ -1628,7 +1630,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			ret = wret;
 		num_walks++;
-		if (num_walks > 10) {
+		if (num_walks > 2) {
 			ret = -EAGAIN;
 			get_bh(root->node);
 			break;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 338a7199363..88b0752550b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -354,6 +354,7 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 				mutex_unlock(&info->fs_mutex);
 
 				btrfs_btree_balance_dirty(root);
+				cond_resched();
 
 				mutex_lock(&info->fs_mutex);
 				trans = btrfs_start_transaction(tree_root, 1);
@@ -394,6 +395,12 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 				ret = err;
 			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
+			mutex_unlock(&tree_root->fs_info->fs_mutex);
+
+			btrfs_btree_balance_dirty(tree_root);
+			schedule();
+
+			mutex_lock(&tree_root->fs_info->fs_mutex);
 		}
 		BUG_ON(ret);
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
@@ -406,6 +413,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 		btrfs_btree_balance_dirty(tree_root);
+		schedule();
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 26b8003f10569a9155b7539ef5a7379ee0c6b050 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 8 Aug 2007 20:17:12 -0400
Subject: Btrfs: Replace extent tree preallocation code with some bit radix
 magic.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   7 +-
 fs/btrfs/disk-io.c     |   3 +-
 fs/btrfs/extent-tree.c | 230 +++++++++----------------------------------------
 fs/btrfs/transaction.c |   2 +
 fs/btrfs/transaction.h |   2 +
 5 files changed, 49 insertions(+), 195 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b7d01597f4..59e09e37ab9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -299,12 +299,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root block_group_radix;
 	struct radix_tree_root block_group_data_radix;
 	struct radix_tree_root extent_map_radix;
-
-	u64 extent_tree_insert[BTRFS_MAX_LEVEL * 6];
-	int extent_tree_insert_nr;
-	u64 extent_tree_prealloc[BTRFS_MAX_LEVEL * 6];
-	int extent_tree_prealloc_nr;
-
+	struct radix_tree_root extent_ins_radix;
 	u64 generation;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c948416fea3..192211274ce 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -427,6 +427,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
 	init_bit_radix(&fs_info->extent_map_radix);
+	init_bit_radix(&fs_info->extent_ins_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
 	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
@@ -443,8 +444,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->do_barriers = 1;
-	fs_info->extent_tree_insert_nr = 0;
-	fs_info->extent_tree_prealloc_nr = 0;
 	fs_info->closing = 0;
 
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f7d76d34fc3..ba50bd7b9a7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,12 +22,6 @@
 #include "print-tree.h"
 #include "transaction.h"
 
-static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 empty_size,
-			    u64 search_start,
-			    u64 search_end, u64 hint_block,
-			    struct btrfs_key *ins, u64 exclude_start,
-			    u64 exclude_nr, int data);
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -174,9 +168,6 @@ static u64 find_search_start(struct btrfs_root *root,
 
 	if (cache->data)
 		goto out;
-	if (num > 1) {
-		last = max(last, cache->last_prealloc);
-	}
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
@@ -374,18 +365,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_leaf *l;
 	struct btrfs_extent_item *item;
-	struct btrfs_key ins;
 	u32 refs;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	ret = find_free_extent(trans, root->fs_info->extent_root, 0, 0, 0,
-			       (u64)-1, 0, &ins, 0, 0, 0);
-	if (ret) {
-		btrfs_free_path(path);
-		return ret;
-	}
+
 	key.objectid = blocknr;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -532,13 +517,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	int pending_ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_block_group_item *bi;
-	struct btrfs_key ins;
 
-	ret = find_free_extent(trans, extent_root, 0, 0, 0, (u64)-1, 0, &ins,
-			       0, 0, 0);
-	/* FIXME, set bit to recalc cache groups on next mount */
-	if (ret)
-		return ret;
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 	if (ret < 0)
 		goto fail;
@@ -706,14 +685,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int try_remove_page(struct address_space *mapping, unsigned long index)
-{
-	int ret;
-	return 0;
-	ret = invalidate_mapping_pages(mapping, index, index);
-	return ret;
-}
-
 int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy)
 {
 	unsigned long gang[8];
@@ -732,6 +703,9 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy)
 			last = gang[i] + 1;
 		}
 	}
+	ret = find_first_radix_bit(&root->fs_info->extent_ins_radix, gang, 0,
+				   ARRAY_SIZE(gang));
+	WARN_ON(ret);
 	return 0;
 }
 
@@ -740,7 +714,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct radix_tree_root *unpin_radix)
 {
 	unsigned long gang[8];
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct btrfs_block_group_cache *block_group;
 	u64 first = 0;
 	int ret;
@@ -765,14 +738,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 				block_group->pinned--;
 				if (gang[i] < block_group->last_alloc)
 					block_group->last_alloc = gang[i];
-				if (gang[i] < block_group->last_prealloc)
-					block_group->last_prealloc = gang[i];
 				if (!block_group->data)
 					set_radix_bit(extent_radix, gang[i]);
 			}
-			try_remove_page(btree_inode->i_mapping,
-					gang[i] << (PAGE_CACHE_SHIFT -
-						    btree_inode->i_blkbits));
 		}
 	}
 	return 0;
@@ -785,7 +753,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	struct btrfs_extent_item extent_item;
 	int i;
 	int ret;
-	u64 super_blocks_used;
+	int err;
+	unsigned long gang[8];
 	struct btrfs_fs_info *info = extent_root->fs_info;
 
 	btrfs_set_extent_refs(&extent_item, 1);
@@ -794,16 +763,21 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid);
 
-	for (i = 0; i < extent_root->fs_info->extent_tree_insert_nr; i++) {
-		ins.objectid = extent_root->fs_info->extent_tree_insert[i];
-		super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
-		btrfs_set_super_blocks_used(&info->super_copy,
-					    super_blocks_used + 1);
-		ret = btrfs_insert_item(trans, extent_root, &ins, &extent_item,
-					sizeof(extent_item));
-		BUG_ON(ret);
+	while(1) {
+		ret = find_first_radix_bit(&info->extent_ins_radix, gang, 0,
+					   ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+
+		for (i = 0; i < ret; i++) {
+			ins.objectid = gang[i];
+			err = btrfs_insert_item(trans, extent_root, &ins,
+						&extent_item,
+						sizeof(extent_item));
+			clear_radix_bit(&info->extent_ins_radix, gang[i]);
+			WARN_ON(err);
+		}
 	}
-	extent_root->fs_info->extent_tree_insert_nr = 0;
 	return 0;
 }
 
@@ -856,7 +830,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_root *extent_root = info->extent_root;
 	int ret;
 	struct btrfs_extent_item *ei;
-	struct btrfs_key ins;
 	u32 refs;
 
 	key.objectid = blocknr;
@@ -868,12 +841,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_extent(trans, root, 0, 0, 0, (u64)-1, 0, &ins, 0, 0, 0);
-	if (ret) {
-		btrfs_free_path(path);
-		return ret;
-	}
-
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret < 0)
 		return ret;
@@ -1003,35 +970,17 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	int total_needed = num_blocks;
-	int total_found = 0;
-	int fill_prealloc = 0;
 	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
 	u64 limit;
 
+	WARN_ON(num_blocks < 1);
 	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(btrfs_buffer_header(root->node));
-	if (num_blocks == 0) {
-		fill_prealloc = 1;
-		num_blocks = 1;
-		total_needed = (min(level + 1, BTRFS_MAX_LEVEL)) * 6;
-	}
-	if (fill_prealloc) {
-		u64 first;
-		int nr = info->extent_tree_prealloc_nr;
-		first = info->extent_tree_prealloc[nr - 1];
-		if (info->extent_tree_prealloc_nr >= total_needed &&
-		    first >= search_start) {
-			ins->objectid = info->extent_tree_prealloc[0];
-			ins->offset = 1;
-			return 0;
-		}
-		info->extent_tree_prealloc_nr = 0;
-	}
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_blocks(&info->super_copy);
 	if (hint_block) {
@@ -1091,10 +1040,6 @@ check_failed:
 		l = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
-			if (fill_prealloc) {
-				info->extent_tree_prealloc_nr = 0;
-				total_found = 0;
-			}
 			if (start_found)
 				limit = last_block +
 					(block_group->key.offset >> 1);
@@ -1170,67 +1115,21 @@ check_pending:
 	}
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + num_blocks; test_block++) {
-		if (test_radix_bit(&info->pinned_radix, test_block)) {
+		if (test_radix_bit(&info->pinned_radix, test_block) ||
+		    test_radix_bit(&info->extent_ins_radix, test_block)) {
 			search_start = test_block + 1;
 			goto new_group;
 		}
 	}
-	if (!fill_prealloc && info->extent_tree_insert_nr) {
-		u64 last =
-		  info->extent_tree_insert[info->extent_tree_insert_nr - 1];
-		if (ins->objectid + num_blocks >
-		    info->extent_tree_insert[0] &&
-		    ins->objectid <= last) {
-			search_start = last + 1;
-			WARN_ON(!full_scan);
-			goto new_group;
-		}
-	}
-	if (!fill_prealloc && info->extent_tree_prealloc_nr) {
-		u64 first =
-		  info->extent_tree_prealloc[info->extent_tree_prealloc_nr - 1];
-		if (ins->objectid + num_blocks > first &&
-		    ins->objectid <= info->extent_tree_prealloc[0]) {
-			search_start = info->extent_tree_prealloc[0] + 1;
-			goto new_group;
-		}
-	}
 	if (exclude_nr > 0 && (ins->objectid + num_blocks > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
 	}
-	if (fill_prealloc) {
-		int nr;
-		test_block = ins->objectid;
-		if (test_block - info->extent_tree_prealloc[total_needed - 1] >=
-		    leaf_range(root)) {
-			total_found = 0;
-			info->extent_tree_prealloc_nr = total_found;
-		}
-		while(test_block < ins->objectid + ins->offset &&
-		      total_found < total_needed) {
-			nr = total_needed - total_found - 1;
-			BUG_ON(nr < 0);
-			info->extent_tree_prealloc[nr] = test_block;
-			total_found++;
-			test_block++;
-		}
-		if (total_found < total_needed) {
-			search_start = test_block;
-			goto new_group;
-		}
-		info->extent_tree_prealloc_nr = total_found;
-	}
 	if (!data) {
 		block_group = btrfs_lookup_block_group(info, ins->objectid);
-		if (block_group) {
-			if (fill_prealloc)
-				block_group->last_prealloc =
-				     info->extent_tree_prealloc[total_needed-1];
-			else
-				trans->block_group = block_group;
-		}
+		if (block_group)
+			trans->block_group = block_group;
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
@@ -1278,85 +1177,41 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	int pending_ret;
 	u64 super_blocks_used;
 	u64 search_start = 0;
-	u64 exclude_start = 0;
-	u64 exclude_nr = 0;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
-	struct btrfs_key prealloc_key;
 
 	btrfs_set_extent_refs(&extent_item, 1);
 	btrfs_set_extent_owner(&extent_item, owner);
 
-	if (root == extent_root) {
-		int nr;
-		BUG_ON(info->extent_tree_prealloc_nr == 0);
-		BUG_ON(num_blocks != 1);
-		ins->offset = 1;
-		info->extent_tree_prealloc_nr--;
-		nr = info->extent_tree_prealloc_nr;
-		ins->objectid = info->extent_tree_prealloc[nr];
-		info->extent_tree_insert[info->extent_tree_insert_nr++] =
-			ins->objectid;
-		ret = update_block_group(trans, root,
-					 ins->objectid, ins->offset, 1, 0, 0);
-		WARN_ON(info->extent_tree_insert_nr >
-			ARRAY_SIZE(info->extent_tree_insert));
-		BUG_ON(ret);
-		return 0;
-	}
-
-	/*
-	 * if we're doing a data allocation, preallocate room in the
-	 * extent tree first.  This way the extent tree blocks end up
-	 * in the correct block group.
-	 */
-	if (data) {
-		ret = find_free_extent(trans, root, 0, 0, 0,
-				       search_end, 0, &prealloc_key, 0, 0, 0);
-		BUG_ON(ret);
-		if (ret)
-			return ret;
-		exclude_nr = info->extent_tree_prealloc_nr;
-		exclude_start = info->extent_tree_prealloc[exclude_nr - 1];
-	}
-
-	/* do the real allocation */
+	WARN_ON(num_blocks < 1);
 	ret = find_free_extent(trans, root, num_blocks, empty_size,
 			       search_start, search_end, hint_block, ins,
-			       exclude_start, exclude_nr, data);
+			       trans->alloc_exclude_start,
+			       trans->alloc_exclude_nr, data);
 	BUG_ON(ret);
 	if (ret)
 		return ret;
 
-	/*
-	 * if we're doing a metadata allocation, preallocate space in the
-	 * extent tree second.  This way, we don't create a tiny hole
-	 * in the allocation map between any unused preallocation blocks
-	 * and the metadata block we're actually allocating.  On disk,
-	 * it'll go:
-	 * [block we've allocated], [used prealloc 1], [ unused prealloc ]
-	 * The unused prealloc will get reused the next time around.
-	 */
-	if (!data) {
-		exclude_start = ins->objectid;
-		exclude_nr = ins->offset;
-		hint_block = exclude_start + exclude_nr;
-		ret = find_free_extent(trans, root, 0, 0, search_start,
-				       search_end, hint_block,
-				       &prealloc_key, exclude_start,
-				       exclude_nr, 0);
-		BUG_ON(ret);
-		if (ret)
-			return ret;
-	}
-
 	super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
 	btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used +
 				    num_blocks);
+
+	if (root == extent_root) {
+		BUG_ON(num_blocks != 1);
+		set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid);
+		goto update_block;
+	}
+
+	WARN_ON(trans->alloc_exclude_nr);
+	trans->alloc_exclude_start = ins->objectid;
+	trans->alloc_exclude_nr = ins->offset;
 	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
 				sizeof(extent_item));
 
+	trans->alloc_exclude_start = 0;
+	trans->alloc_exclude_nr = 0;
+
 	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
@@ -1366,6 +1221,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (pending_ret) {
 		return pending_ret;
 	}
+
+update_block:
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
 				 data);
 	BUG_ON(ret);
@@ -1750,7 +1607,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
 		cache->first_free = cache->key.objectid;
-		cache->last_prealloc = cache->key.objectid;
 		cache->pinned = 0;
 		cache->cached = 0;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88b0752550b..e64ecec3f52 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -109,6 +109,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
 	h->block_group = NULL;
+	h->alloc_exclude_nr = 0;
+	h->alloc_exclude_start = 0;
 	root->fs_info->running_transaction->use_count++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 8b2714e6556..65a395eeca9 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -39,6 +39,8 @@ struct btrfs_trans_handle {
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_group_cache *block_group;
+	u64 alloc_exclude_start;
+	u64 alloc_exclude_nr;
 };
 
 
-- 
cgit v1.2.3


From 409eb95d7f6632d5af32b795244ce68a29e49319 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 8 Aug 2007 20:17:12 -0400
Subject: Btrfs: Further reduce the concurrency penalty of defrag and
 drop_snapshot

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 13 ++++++-------
 fs/btrfs/transaction.c |  3 +++
 fs/btrfs/tree-defrag.c |  7 ++-----
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ba50bd7b9a7..120c448f58f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1312,7 +1312,10 @@ static void reada_walk_down(struct btrfs_root *root,
 		BUG_ON(ret);
 		if (refs != 1)
 			continue;
+		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = readahead_tree_block(root, blocknr);
+		cond_resched();
+		mutex_lock(&root->fs_info->fs_mutex);
 		if (ret)
 			break;
 	}
@@ -1445,7 +1448,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
-	int num_walks = 0;
 	struct btrfs_root_item *root_item = &root->root_item;
 
 	path = btrfs_alloc_path();
@@ -1486,12 +1488,9 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		num_walks++;
-		if (num_walks > 2) {
-			ret = -EAGAIN;
-			get_bh(root->node);
-			break;
-		}
+		ret = -EAGAIN;
+		get_bh(root->node);
+		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e64ecec3f52..204337c5ca0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -364,6 +364,9 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 					break;
 			}
 			root->defrag_running = 0;
+			radix_tree_tag_clear(&info->fs_roots_radix,
+				     (unsigned long)root->root_key.objectid,
+				     BTRFS_ROOT_DEFRAG_TAG);
 		}
 	}
 	btrfs_end_transaction(trans, tree_root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 15d0a486fb5..7ea66b4aa5c 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -148,7 +148,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int level;
 	int orig_level;
 	int i;
-	int num_runs = 0;
 
 	if (root->ref_cows == 0) {
 		goto out;
@@ -200,10 +199,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			break;
 		if (wret < 0)
 			ret = wret;
-		if (num_runs++ > 8) {
-			ret = -EAGAIN;
-			break;
-		}
+		ret = -EAGAIN;
+		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
-- 
cgit v1.2.3


From e9d0b13b5bbb58c9b840e407a8d181442f799966 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 10 Aug 2007 14:06:19 -0400
Subject: Btrfs: Btree defrag on the extent-mapping tree as well

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 18 ++++++----
 fs/btrfs/ctree.h       |  4 ++-
 fs/btrfs/extent-tree.c | 28 +++++++++++++++-
 fs/btrfs/inode.c       | 23 ++-----------
 fs/btrfs/super.c       |  2 ++
 fs/btrfs/transaction.c | 91 +++++++++++++++++++++++++++-----------------------
 fs/btrfs/transaction.h |  2 ++
 fs/btrfs/tree-defrag.c | 36 ++++++++++++++++----
 8 files changed, 125 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c7e47e77723..ee1ae00d282 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -99,7 +99,6 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (ret)
 			return ret;
 	} else {
-		WARN_ON(!root->ref_cows);
 		clean_tree_block(trans, root, buf);
 	}
 
@@ -162,13 +161,14 @@ static int close_blocks(u64 blocknr, u64 other)
 
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct buffer_head *parent,
-		       int cache_only)
+		       int cache_only, u64 *last_ret)
 {
 	struct btrfs_node *parent_node;
 	struct buffer_head *cur_bh;
 	struct buffer_head *tmp_bh;
 	u64 blocknr;
-	u64 search_start = 0;
+	u64 search_start = *last_ret;
+	u64 last_block = 0;
 	u64 other;
 	u32 parent_nritems;
 	int start_slot;
@@ -198,6 +198,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 		blocknr = btrfs_node_blockptr(parent_node, i);
+		if (last_block == 0)
+			last_block = blocknr;
 		if (i > 0) {
 			other = btrfs_node_blockptr(parent_node, i - 1);
 			close = close_blocks(blocknr, other);
@@ -206,8 +208,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			other = btrfs_node_blockptr(parent_node, i + 1);
 			close = close_blocks(blocknr, other);
 		}
-		if (close)
+		if (close) {
+			last_block = blocknr;
 			continue;
+		}
 
 		cur_bh = btrfs_find_tree_block(root, blocknr);
 		if (!cur_bh || !buffer_uptodate(cur_bh) ||
@@ -219,9 +223,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			brelse(cur_bh);
 			cur_bh = read_tree_block(root, blocknr);
 		}
-		if (search_start == 0) {
-			search_start = bh_blocknr(cur_bh) & ~((u64)65535);
-		}
+		if (search_start == 0)
+			search_start = last_block & ~((u64)65535);
+
 		err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
 					&tmp_bh, search_start,
 					min(8, end_slot - i));
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 59e09e37ab9..d3cd564b3b3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1019,6 +1019,8 @@ static inline void btrfs_memmove(struct btrfs_root *root,
 	btrfs_item_offset((leaf)->items + (slot))))
 
 /* extent-tree.c */
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
@@ -1066,7 +1068,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct buffer_head *parent,
-		       int cache_only);
+		       int cache_only, u64 *last_ret);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 120c448f58f..3418bb62b99 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -396,6 +396,14 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root)
+{
+	finish_current_insert(trans, root->fs_info->extent_root);
+	del_pending_extents(trans, root->fs_info->extent_root);
+	return 0;
+}
+
 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 blocknr,
 			     u64 num_blocks, u32 *refs)
@@ -1374,7 +1382,25 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			BUG_ON(ret);
 			continue;
 		}
-		next = read_tree_block(root, blocknr);
+		next = btrfs_find_tree_block(root, blocknr);
+		if (!next || !buffer_uptodate(next)) {
+			brelse(next);
+			mutex_unlock(&root->fs_info->fs_mutex);
+			next = read_tree_block(root, blocknr);
+			mutex_lock(&root->fs_info->fs_mutex);
+
+			/* we dropped the lock, check one more time */
+			ret = lookup_extent_ref(trans, root, blocknr, 1, &refs);
+			BUG_ON(ret);
+			if (refs != 1) {
+				path->slots[*level]++;
+				brelse(next);
+				ret = btrfs_free_extent(trans, root,
+							blocknr, 1, 1);
+				BUG_ON(ret);
+				continue;
+			}
+		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			btrfs_block_release(root, path->nodes[*level-1]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 12aa043b9f6..5c05ecbc572 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2159,9 +2159,7 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
-	struct btrfs_trans_handle *trans;
 	int ret = 0;
-	int err;
 	struct btrfs_dir_item *di;
 	int namelen;
 	struct btrfs_path *path;
@@ -2201,25 +2199,8 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 
 	case BTRFS_IOC_DEFRAG:
 		mutex_lock(&root->fs_info->fs_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		memset(&root->defrag_progress, 0,
-		       sizeof(root->defrag_progress));
-		while (1) {
-			root->defrag_running = 1;
-			err = btrfs_defrag_leaves(trans, root, 0);
-
-			btrfs_end_transaction(trans, root);
-			mutex_unlock(&root->fs_info->fs_mutex);
-
-			btrfs_btree_balance_dirty(root);
-
-			mutex_lock(&root->fs_info->fs_mutex);
-			trans = btrfs_start_transaction(root, 1);
-			if (err != -EAGAIN)
-				break;
-		}
-		root->defrag_running = 0;
-		btrfs_end_transaction(trans, root);
+		btrfs_defrag_root(root, 0);
+		btrfs_defrag_root(root->fs_info->extent_root, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = 0;
 		break;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e797d5fb28..74f3de47423 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -121,7 +121,9 @@ static int btrfs_sync_fs(struct super_block *sb, int wait)
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
+	btrfs_clean_old_snapshots(root);
 	mutex_lock(&root->fs_info->fs_mutex);
+	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 204337c5ca0..c9d52dc83e4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -317,18 +317,47 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return err;
 }
 
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	if (root->defrag_running)
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	while (1) {
+		root->defrag_running = 1;
+		ret = btrfs_defrag_leaves(trans, root, cacheonly);
+		btrfs_end_transaction(trans, root);
+		mutex_unlock(&info->fs_mutex);
+
+		btrfs_btree_balance_dirty(root);
+		cond_resched();
+
+		mutex_lock(&info->fs_mutex);
+		trans = btrfs_start_transaction(root, 1);
+		if (ret != -EAGAIN)
+			break;
+	}
+	root->defrag_running = 0;
+	radix_tree_tag_clear(&info->fs_roots_radix,
+		     (unsigned long)root->root_key.objectid,
+		     BTRFS_ROOT_DEFRAG_TAG);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 {
 	struct btrfs_root *gang[1];
 	struct btrfs_root *root;
-	struct btrfs_root *tree_root = info->tree_root;
-	struct btrfs_trans_handle *trans;
 	int i;
 	int ret;
 	int err = 0;
 	u64 last = 0;
 
-	trans = btrfs_start_transaction(tree_root, 1);
 	while(1) {
 		ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
 						 (void **)gang, last,
@@ -339,37 +368,10 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
 			last = root->root_key.objectid + 1;
-			radix_tree_tag_clear(&info->fs_roots_radix,
-				     (unsigned long)root->root_key.objectid,
-				     BTRFS_ROOT_DEFRAG_TAG);
-			if (root->defrag_running)
-				continue;
-
-			while (1) {
-				mutex_lock(&root->fs_info->trans_mutex);
-				record_root_in_trans(root);
-				mutex_unlock(&root->fs_info->trans_mutex);
-
-				root->defrag_running = 1;
-				err = btrfs_defrag_leaves(trans, root, 1);
-				btrfs_end_transaction(trans, tree_root);
-				mutex_unlock(&info->fs_mutex);
-
-				btrfs_btree_balance_dirty(root);
-				cond_resched();
-
-				mutex_lock(&info->fs_mutex);
-				trans = btrfs_start_transaction(tree_root, 1);
-				if (err != -EAGAIN)
-					break;
-			}
-			root->defrag_running = 0;
-			radix_tree_tag_clear(&info->fs_roots_radix,
-				     (unsigned long)root->root_key.objectid,
-				     BTRFS_ROOT_DEFRAG_TAG);
+			btrfs_defrag_root(root, 1);
 		}
 	}
-	btrfs_end_transaction(trans, tree_root);
+	btrfs_defrag_root(info->extent_root, 1);
 	return err;
 }
 
@@ -527,6 +529,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_clean_old_snapshots(struct btrfs_root *root)
+{
+	struct list_head dirty_roots;
+	INIT_LIST_HEAD(&dirty_roots);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	if (!list_empty(&dirty_roots)) {
+		drop_dirty_roots(root, &dirty_roots);
+	}
+	return 0;
+}
 void btrfs_transaction_cleaner(struct work_struct *work)
 {
 	struct btrfs_fs_info *fs_info = container_of(work,
@@ -536,12 +552,10 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_transaction *cur;
 	struct btrfs_trans_handle *trans;
-	struct list_head dirty_roots;
 	unsigned long now;
 	unsigned long delay = HZ * 30;
 	int ret;
 
-	INIT_LIST_HEAD(&dirty_roots);
 	mutex_lock(&root->fs_info->fs_mutex);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur = root->fs_info->running_transaction;
@@ -561,14 +575,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	ret = btrfs_commit_transaction(trans, root);
 out:
 	mutex_unlock(&root->fs_info->fs_mutex);
-
-	mutex_lock(&root->fs_info->trans_mutex);
-	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
-
-	if (!list_empty(&dirty_roots)) {
-		drop_dirty_roots(root, &dirty_roots);
-	}
+	btrfs_clean_old_snapshots(root);
 	btrfs_transaction_queue_work(root, delay);
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 65a395eeca9..d5f491d3757 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -73,5 +73,7 @@ void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
+int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
+int btrfs_clean_old_snapshots(struct btrfs_root *root);
 
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 7ea66b4aa5c..a09064a9a41 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -42,16 +42,20 @@ static void reada_defrag(struct btrfs_root *root,
 static int defrag_walk_down(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct btrfs_path *path, int *level,
-			    int cache_only)
+			    int cache_only, u64 *last_ret)
 {
 	struct buffer_head *next;
 	struct buffer_head *cur;
 	u64 blocknr;
 	int ret = 0;
+	int is_extent = 0;
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
+	if (root->fs_info->extent_root == root)
+		is_extent = 1;
+
 	while(*level > 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
@@ -70,7 +74,10 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		if (*level == 1) {
 			ret = btrfs_realloc_node(trans, root,
 						 path->nodes[*level],
-						 cache_only);
+						 cache_only, last_ret);
+			if (is_extent)
+				btrfs_extent_post_op(trans, root);
+
 			break;
 		}
 		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
@@ -90,8 +97,13 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
 		BUG_ON(ret);
-		ret = btrfs_realloc_node(trans, root, next, cache_only);
+		ret = btrfs_realloc_node(trans, root, next, cache_only,
+					 last_ret);
 		BUG_ON(ret);
+
+		if (is_extent)
+			btrfs_extent_post_op(trans, root);
+
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			btrfs_block_release(root, path->nodes[*level-1]);
@@ -148,10 +160,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int level;
 	int orig_level;
 	int i;
+	int is_extent = 0;
+	u64 last_ret = 0;
+
+	if (root->fs_info->extent_root == root)
+		is_extent = 1;
 
-	if (root->ref_cows == 0) {
+	if (root->ref_cows == 0 && !is_extent)
 		goto out;
-	}
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -165,16 +181,21 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		get_bh(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 		BUG_ON(ret);
-		ret = btrfs_realloc_node(trans, root, root->node, cache_only);
+		ret = btrfs_realloc_node(trans, root, root->node, cache_only,
+					 &last_ret);
 		BUG_ON(ret);
 		path->nodes[level] = root->node;
 		path->slots[level] = 0;
+		if (is_extent)
+			btrfs_extent_post_op(trans, root);
 	} else {
 		level = root->defrag_level;
 		path->lowest_level = level;
 		wret = btrfs_search_slot(trans, root, &root->defrag_progress,
 					 path, 0, 1);
 
+		if (is_extent)
+			btrfs_extent_post_op(trans, root);
 		if (wret < 0) {
 			ret = wret;
 			goto out;
@@ -188,7 +209,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	}
 
 	while(1) {
-		wret = defrag_walk_down(trans, root, path, &level, cache_only);
+		wret = defrag_walk_down(trans, root, path, &level, cache_only,
+					&last_ret);
 		if (wret > 0)
 			break;
 		if (wret < 0)
-- 
cgit v1.2.3


From f2183bde1a918d338337955c8e8ba29bd8f5e7b1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 10 Aug 2007 14:42:37 -0400
Subject: Btrfs: Add BH_Defrag to mark buffers that are in need of defragging

This allows the tree walking code to defrag only the newly allocated
buffers. It seems to be a good balance between perfect defragging and the
performance hit of repeatedly reallocating blocks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 22 ++++++++++++----------
 fs/btrfs/disk-io.h     |  2 ++
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/tree-defrag.c |  3 ++-
 4 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ee1ae00d282..7cf43da5e78 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -175,6 +175,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int end_slot;
 	int i;
 	int err = 0;
+	int parent_level;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
@@ -188,6 +189,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	}
 	parent_node = btrfs_buffer_node(parent);
 	parent_nritems = btrfs_header_nritems(&parent_node->header);
+	parent_level = btrfs_header_level(&parent_node->header);
 
 	start_slot = 0;
 	end_slot = parent_nritems;
@@ -215,13 +217,16 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 		cur_bh = btrfs_find_tree_block(root, blocknr);
 		if (!cur_bh || !buffer_uptodate(cur_bh) ||
-		    buffer_locked(cur_bh)) {
+		    buffer_locked(cur_bh) || !buffer_defrag(cur_bh)) {
 			if (cache_only) {
 				brelse(cur_bh);
 				continue;
 			}
-			brelse(cur_bh);
-			cur_bh = read_tree_block(root, blocknr);
+			if (!cur_bh || !buffer_uptodate(cur_bh) ||
+			    buffer_locked(cur_bh)) {
+				brelse(cur_bh);
+				cur_bh = read_tree_block(root, blocknr);
+			}
 		}
 		if (search_start == 0)
 			search_start = last_block & ~((u64)65535);
@@ -232,6 +237,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (err)
 			break;
 		search_start = bh_blocknr(tmp_bh);
+		*last_ret = search_start;
+		if (parent_level == 1)
+			clear_buffer_defrag(tmp_bh);
 		brelse(tmp_bh);
 	}
 	return err;
@@ -811,16 +819,10 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			clear_radix_bit(&found, blocknr);
 			if (nread > 32)
 				continue;
-			if (direction > 0 && cluster_start <= blocknr &&
-			    cluster_start + 8 > blocknr) {
-				cluster_start = blocknr;
+			if (close_blocks(cluster_start, blocknr)) {
 				readahead_tree_block(root, blocknr);
 				nread++;
-			} else if (direction < 0 && cluster_start >= blocknr &&
-				   blocknr + 8 > cluster_start) {
 				cluster_start = blocknr;
-				readahead_tree_block(root, blocknr);
-				nread++;
 			}
 		}
 	}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 9e2c261b41a..81fd18cbd82 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,8 +25,10 @@
 
 enum btrfs_bh_state_bits {
 	BH_Checked = BH_PrivateStart,
+	BH_Defrag,
 };
 BUFFER_FNS(Checked, checked);
+BUFFER_FNS(Defrag, defrag);
 
 static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3418bb62b99..b3641234473 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1015,6 +1015,7 @@ check_failed:
 	ins->objectid = search_start;
 	ins->offset = 0;
 	start_found = 0;
+	path->reada = 1;
 
 	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
 	if (ret < 0)
@@ -1264,6 +1265,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	WARN_ON(buffer_dirty(buf));
 	set_buffer_uptodate(buf);
 	set_buffer_checked(buf);
+	set_buffer_defrag(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
 	return buf;
 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a09064a9a41..35fd20d2464 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -86,7 +86,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		if (cache_only) {
 			next = btrfs_find_tree_block(root, blocknr);
 			if (!next || !buffer_uptodate(next) ||
-			   buffer_locked(next)) {
+			   buffer_locked(next) || !buffer_defrag(next)) {
 				brelse(next);
 				path->slots[*level]++;
 				continue;
@@ -142,6 +142,7 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			root->defrag_level = i;
 			return 0;
 		} else {
+			clear_buffer_defrag(path->nodes[*level]);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-- 
cgit v1.2.3


From 15ee9bc7ed171248d1405df5854da5fa91bfdc39 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jwhiter@redhat.com>
Date: Fri, 10 Aug 2007 16:22:09 -0400
Subject: Btrfs: delay commits during fsync to allow more writers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  5 +++++
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/file.c        | 22 ++++++++++++++++++----
 fs/btrfs/inode.c       |  2 ++
 fs/btrfs/transaction.c | 44 ++++++++++++++++++++++++++++++++------------
 fs/btrfs/transaction.h |  7 +++++++
 7 files changed, 66 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6bce46be49f..f446f161f73 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,6 +25,11 @@ struct btrfs_inode {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct inode vfs_inode;
+
+	/*
+	 * transid of the trans_handle that last modified this inode
+	 */
+	u64 last_trans;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d3cd564b3b3..c7f5161271d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -301,6 +301,7 @@ struct btrfs_fs_info {
 	struct radix_tree_root extent_map_radix;
 	struct radix_tree_root extent_ins_radix;
 	u64 generation;
+	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_super_block super_copy;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 192211274ce..b2f79878d51 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -435,6 +435,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
+	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 00b118a2db6..6933ab11a5c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -694,22 +694,36 @@ static int btrfs_sync_file(struct file *file,
 {
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
+	int ret = 0;
 	struct btrfs_trans_handle *trans;
 
 	/*
-	 * FIXME, use inode generation number to check if we can skip the
-	 * commit
+	 * check the transaction that last modified this inode
+	 * and see if its already been committed
 	 */
 	mutex_lock(&root->fs_info->fs_mutex);
+	if (!BTRFS_I(inode)->last_trans)
+		goto out;
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (BTRFS_I(inode)->last_trans <=
+	    root->fs_info->last_trans_committed) {
+		BTRFS_I(inode)->last_trans = 0;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		goto out;
+	}
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	/*
+ 	 * ok we haven't committed the transaction yet, lets do a commit
+ 	 */
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	ret = btrfs_commit_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 out:
+	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret > 0 ? EIO : ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5c05ecbc572..398484179d8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -193,6 +193,7 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
 
 	fill_inode_item(inode_item, inode);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
 	btrfs_release_path(root, path);
@@ -2234,6 +2235,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+	ei->last_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c9d52dc83e4..18abea80279 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,7 +55,8 @@ static int join_transaction(struct btrfs_root *root)
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
-		cur_trans->num_writers = 0;
+		cur_trans->num_writers = 1;
+		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
@@ -65,8 +66,11 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		init_bit_radix(&cur_trans->dirty_pages);
+	} else {
+		cur_trans->num_writers++;
+		cur_trans->num_joined++;
 	}
-	cur_trans->num_writers++;
+
 	return 0;
 }
 
@@ -428,12 +432,14 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
-	int ret = 0;
+	unsigned long joined = 0;
+	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
 	struct radix_tree_root pinned_copy;
 	DEFINE_WAIT(wait);
+	int ret;
 
 	init_bit_radix(&pinned_copy);
 	INIT_LIST_HEAD(&dirty_fs_roots);
@@ -448,7 +454,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
+
+		mutex_lock(&root->fs_info->trans_mutex);
 		put_transaction(cur_trans);
+		mutex_unlock(&root->fs_info->trans_mutex);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
@@ -463,26 +473,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			mutex_unlock(&root->fs_info->trans_mutex);
 
 			wait_for_commit(root, prev_trans);
-			put_transaction(prev_trans);
 
 			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->trans_mutex);
+			put_transaction(prev_trans);
 		}
 	}
-	while (trans->transaction->num_writers > 1) {
+
+	do {
+		joined = cur_trans->num_joined;
 		WARN_ON(cur_trans != trans->transaction);
-		prepare_to_wait(&trans->transaction->writer_wait, &wait,
+		prepare_to_wait(&cur_trans->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (trans->transaction->num_writers <= 1)
-			break;
+
+		if (cur_trans->num_writers > 1)
+			timeout = MAX_SCHEDULE_TIMEOUT;
+		else
+			timeout = 1;
+
 		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		schedule();
+
+		schedule_timeout(timeout);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
-		finish_wait(&trans->transaction->writer_wait, &wait);
-	}
-	finish_wait(&trans->transaction->writer_wait, &wait);
+		finish_wait(&cur_trans->writer_wait, &wait);
+	} while (cur_trans->num_writers > 1 ||
+		 (cur_trans->num_joined != joined));
+
 	WARN_ON(cur_trans != trans->transaction);
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
@@ -511,6 +530,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_finish_extent_commit(trans, root, &pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans->commit_done = 1;
+	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d5f491d3757..e451783a1a4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -23,6 +23,7 @@
 struct btrfs_transaction {
 	u64 transid;
 	unsigned long num_writers;
+	unsigned long num_joined;
 	int in_commit;
 	int use_count;
 	int commit_done;
@@ -57,6 +58,12 @@ static inline void btrfs_update_inode_block_group(struct
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
 
+static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
+					      struct inode *inode)
+{
+	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-- 
cgit v1.2.3


From 320206112895c72f98e57570ae194689dcd7fe56 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 27 Aug 2007 15:17:54 -0400
Subject: fix block readahead in btrfs_next_leaf

Send the correct slot down to reada_for_search

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7cf43da5e78..51eea5ccb64 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2120,7 +2120,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!level)
 			break;
 		if (path->reada)
-			reada_for_search(root, path, level, slot);
+			reada_for_search(root, path, level, 0);
 		next = read_tree_block(root,
 		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
 	}
-- 
cgit v1.2.3


From 2cc58cf24f69be8632a3b29d653c318bf3bd8c84 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Aug 2007 16:49:44 -0400
Subject: Btrfs: Do more extensive readahead during tree searches

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 62 +++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/inode.c       |  2 +-
 3 files changed, 52 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 51eea5ccb64..c0782a5b04c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -43,8 +43,10 @@ struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
 	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
-	if (path)
+	if (path) {
 		btrfs_init_path(path);
+		path->reada = 1;
+	}
 	return path;
 }
 
@@ -159,6 +161,34 @@ static int close_blocks(u64 blocknr, u64 other)
 	return 0;
 }
 
+static int should_defrag_leaf(struct buffer_head *bh)
+{
+	struct btrfs_leaf *leaf = btrfs_buffer_leaf(bh);
+	struct btrfs_disk_key *key;
+	u32 nritems;
+
+	if (buffer_defrag(bh))
+		return 1;
+
+	nritems = btrfs_header_nritems(&leaf->header);
+	if (nritems == 0)
+		return 0;
+
+	key = &leaf->items[0].key;
+	if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+		return 1;
+
+	key = &leaf->items[nritems-1].key;
+	if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+		return 1;
+	if (nritems > 4) {
+		key = &leaf->items[nritems/2].key;
+		if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+			return 1;
+	}
+	return 0;
+}
+
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct buffer_head *parent,
 		       int cache_only, u64 *last_ret)
@@ -217,7 +247,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 		cur_bh = btrfs_find_tree_block(root, blocknr);
 		if (!cur_bh || !buffer_uptodate(cur_bh) ||
-		    buffer_locked(cur_bh) || !buffer_defrag(cur_bh)) {
+		    buffer_locked(cur_bh) ||
+		    (parent_level != 1 && !buffer_defrag(cur_bh)) ||
+		    (parent_level == 1 && !should_defrag_leaf(cur_bh))) {
 			if (cache_only) {
 				brelse(cur_bh);
 				continue;
@@ -297,6 +329,7 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		parent = btrfs_buffer_node(path->nodes[level + 1]);
 
 	slot = path->slots[level];
+	BUG_ON(!buffer_uptodate(path->nodes[level]));
 	BUG_ON(nritems == 0);
 	if (parent) {
 		struct btrfs_disk_key *parent_key;
@@ -511,9 +544,6 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		err_on_enospc = 1;
 
 	left_buf = read_node_slot(root, parent_buf, pslot - 1);
-	right_buf = read_node_slot(root, parent_buf, pslot + 1);
-
-	/* first, try to make some room in the middle buffer */
 	if (left_buf) {
 		wret = btrfs_cow_block(trans, root, left_buf,
 				       parent_buf, pslot - 1, &left_buf);
@@ -521,6 +551,19 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 			goto enospc;
 		}
+	}
+	right_buf = read_node_slot(root, parent_buf, pslot + 1);
+	if (right_buf) {
+		wret = btrfs_cow_block(trans, root, right_buf,
+				       parent_buf, pslot + 1, &right_buf);
+		if (wret) {
+			ret = wret;
+			goto enospc;
+		}
+	}
+
+	/* first, try to make some room in the middle buffer */
+	if (left_buf) {
 		left = btrfs_buffer_node(left_buf);
 		orig_slot += btrfs_header_nritems(&left->header);
 		wret = push_node_left(trans, root, left_buf, mid_buf);
@@ -534,13 +577,6 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right_buf) {
-		wret = btrfs_cow_block(trans, root, right_buf,
-				       parent_buf, pslot + 1, &right_buf);
-		if (wret) {
-			ret = wret;
-			goto enospc;
-		}
-
 		right = btrfs_buffer_node(right_buf);
 		wret = push_node_left(trans, root, mid_buf, right_buf);
 		if (wret < 0 && wret != -ENOSPC)
@@ -817,7 +853,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 		for (i = 0; i < ret; i++) {
 			blocknr = gang[i];
 			clear_radix_bit(&found, blocknr);
-			if (nread > 32)
+			if (path->reada == 1 && nread > 16)
 				continue;
 			if (close_blocks(cluster_start, blocknr)) {
 				readahead_tree_block(root, blocknr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b3641234473..4049aadbeda 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -52,7 +52,7 @@ static int cache_block_group(struct btrfs_root *root,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 1;
+	path->reada = 2;
 	key.objectid = block_group->key.objectid;
 	key.flags = 0;
 	key.offset = 0;
@@ -1015,7 +1015,7 @@ check_failed:
 	ins->objectid = search_start;
 	ins->offset = 0;
 	start_found = 0;
-	path->reada = 1;
+	path->reada = 2;
 
 	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 398484179d8..7e4cf62ada5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -827,7 +827,7 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 	path = btrfs_alloc_path();
-	path->reada = 1;
+	path->reada = 2;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-- 
cgit v1.2.3


From 83df7c1d8b885e216c68c3f0469bec9b93666b74 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Aug 2007 16:49:44 -0400
Subject: Btrfs: Make sure to cow the root during a snapshot

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e4cf62ada5..504d37dfa6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2096,6 +2096,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
 	struct btrfs_root_item new_root_item;
+	struct buffer_head *tmp;
 	int ret;
 	int err;
 	u64 objectid;
@@ -2123,6 +2124,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	key.offset = 1;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
 
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-- 
cgit v1.2.3


From a52d9a8033c454cd9b4697cfafb467509fc1693f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Aug 2007 16:49:44 -0400
Subject: Btrfs: Extent based page cache code.  This uses an rbtree of extents
 and tests instead of buffer heads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    3 +
 fs/btrfs/ctree.h       |    7 +
 fs/btrfs/disk-io.h     |    1 -
 fs/btrfs/extent_map.c  | 1656 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_map.h  |   89 +++
 fs/btrfs/file.c        |  295 ++++-----
 fs/btrfs/inode.c       |  816 +++++++++---------------
 fs/btrfs/super.c       |    2 +
 9 files changed, 2203 insertions(+), 669 deletions(-)
 create mode 100644 fs/btrfs/extent_map.c
 create mode 100644 fs/btrfs/extent_map.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9321438c5ee..ea7a2287035 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,8 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o tree-defrag.o
+	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
+	   extent_map.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f446f161f73..d1d5af471c3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,11 +19,14 @@
 #ifndef __BTRFS_I__
 #define __BTRFS_I__
 
+#include "extent_map.h"
+
 /* in memory btrfs inode */
 struct btrfs_inode {
 	struct btrfs_root *root;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
+	struct extent_map_tree extent_tree;
 	struct inode vfs_inode;
 
 	/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c7f5161271d..947f061ed11 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1176,7 +1176,14 @@ int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 int btrfs_get_block(struct inode *inode, sector_t iblock,
 		    struct buffer_head *result, int create);
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create);
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode);
 /* file.c */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 81fd18cbd82..5261733b873 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -77,7 +77,6 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   u64 num_blocks);
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical);
-int btrfs_releasepage(struct page *page, gfp_t flags);
 void btrfs_btree_balance_dirty(struct btrfs_root *root);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct buffer_head *bh);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
new file mode 100644
index 00000000000..d378edf0964
--- /dev/null
+++ b/fs/btrfs/extent_map.c
@@ -0,0 +1,1656 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include "extent_map.h"
+
+static struct kmem_cache *extent_map_cache;
+static struct kmem_cache *extent_state_cache;
+
+/*
+ * Generic view of an object stored in one of the rb-trees in this file.
+ * tree_insert(), __tree_search() and tree_delete() convert an rb_node to
+ * this type with rb_entry() and only touch start, end and in_tree.
+ *
+ * NOTE(review): the same rb_nodes are also converted to struct extent_map
+ * and struct extent_state elsewhere in this file, so those structs
+ * presumably share this field layout -- confirm against extent_map.h.
+ */
+struct tree_entry {
+	u64 start;
+	u64 end;
+	int in_tree;
+	struct rb_node rb_node;
+};
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+
+/* merge_state() refuses to merge a state that has any of these bits set */
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/*
+ * Every extent_state handed out by alloc_extent_state() is linked on
+ * all_states until free_extent_state() drops the last reference;
+ * extent_map_exit() walks the list to report leaks.  state_lock is taken
+ * around the list updates in alloc_extent_state()/free_extent_state().
+ */
+static LIST_HEAD(all_states);
+spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Create the two slab caches used by this file, one for struct extent_map
+ * and one for struct extent_state.  The results of kmem_cache_create()
+ * are stored without being checked.
+ */
+void __init extent_map_init(void)
+{
+	const unsigned long cache_flags = SLAB_RECLAIM_ACCOUNT |
+					  SLAB_DESTROY_BY_RCU;
+
+	extent_map_cache = kmem_cache_create("extent_map",
+					     sizeof(struct extent_map), 0,
+					     cache_flags, NULL);
+	extent_state_cache = kmem_cache_create("extent_state",
+					       sizeof(struct extent_state), 0,
+					       cache_flags, NULL);
+}
+
+/*
+ * Module teardown.  Any extent_state still linked on all_states is a leak:
+ * report it, unlink it and release it.  Afterwards both slab caches are
+ * destroyed (if they were created).
+ */
+void __exit extent_map_exit(void)
+{
+	struct extent_state *state;
+
+	while (!list_empty(&all_states)) {
+		state = list_entry(all_states.next, struct extent_state, list);
+		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
+		       state->start, state->end, state->state, state->in_tree);
+		list_del(&state->list);
+		/*
+		 * states are allocated from extent_state_cache (see
+		 * alloc_extent_state), so they have to be returned to that
+		 * cache rather than handed to kfree()
+		 */
+		kmem_cache_free(extent_state_cache, state);
+	}
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+}
+
+/*
+ * Prepare an extent_map_tree for use: record the owning address_space,
+ * initialise the rwlock and start with both rb-trees empty.  'mask' is
+ * accepted but not used here.
+ */
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{
+	tree->mapping = mapping;
+	rwlock_init(&tree->lock);
+	tree->state.rb_node = NULL;
+	tree->map.rb_node = NULL;
+}
+EXPORT_SYMBOL(extent_map_tree_init);
+
+/*
+ * Allocate an extent_map from its slab cache.  On success the map is
+ * flagged as not being in a tree and holds a single reference.  A NULL or
+ * error pointer from the allocator is passed back unchanged.
+ */
+struct extent_map *alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *map = kmem_cache_alloc(extent_map_cache, mask);
+
+	if (map && !IS_ERR(map)) {
+		map->in_tree = 0;
+		atomic_set(&map->refs, 1);
+	}
+	return map;
+}
+EXPORT_SYMBOL(alloc_extent_map);
+
+/*
+ * Drop one reference on 'em'.  When the last reference goes away, warn if
+ * the map is still flagged as in a tree and return it to the slab cache.
+ */
+void free_extent_map(struct extent_map *em)
+{
+	if (!atomic_dec_and_test(&em->refs))
+		return;
+
+	WARN_ON(em->in_tree);
+	kmem_cache_free(extent_map_cache, em);
+}
+EXPORT_SYMBOL(free_extent_map);
+
+
+/*
+ * Allocate an extent_state from its slab cache.  On success the state has
+ * no bits set, is flagged as not in a tree, holds a single reference, has
+ * its wait queue initialised and is linked on the all_states leak list.
+ * A NULL or error pointer from the allocator is passed back unchanged.
+ */
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *state;
+	unsigned long flags;
+
+	state = kmem_cache_alloc(extent_state_cache, mask);
+	if (!state || IS_ERR(state))
+		return state;
+	state->state = 0;
+	state->in_tree = 0;
+	atomic_set(&state->refs, 1);
+	init_waitqueue_head(&state->wq);
+	/*
+	 * save/restore the interrupt state instead of forcing interrupts
+	 * back on at unlock time, so this stays safe if a caller already
+	 * has them disabled
+	 */
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &all_states);
+	spin_unlock_irqrestore(&state_lock, flags);
+	return state;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+/*
+ * Drop one reference on 'state'.  When the last reference goes away, warn
+ * if the state is still flagged as in a tree, unlink it from the
+ * all_states leak list and return it to the slab cache.
+ */
+void free_extent_state(struct extent_state *state)
+{
+	unsigned long flags;
+
+	if (atomic_dec_and_test(&state->refs)) {
+		WARN_ON(state->in_tree);
+		/*
+		 * This is reached from merge_state(), clear_state_bit() and
+		 * insert_state() while the tree lock is held with interrupts
+		 * disabled (write_lock_irq).  spin_unlock_irq() would turn
+		 * interrupts back on underneath that lock, so preserve the
+		 * caller's interrupt state instead.
+		 */
+		spin_lock_irqsave(&state_lock, flags);
+		list_del_init(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
+		kmem_cache_free(extent_state_cache, state);
+	}
+}
+EXPORT_SYMBOL(free_extent_state);
+
+/*
+ * Link 'node' into 'root', positioned by 'offset'.
+ *
+ * If an entry already in the tree has a [start, end] range containing
+ * 'offset', nothing is inserted and that entry's rb_node is returned.
+ * Otherwise the node is linked in, its tree_entry is flagged in_tree and
+ * NULL is returned.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (offset < cur->start) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (offset > cur->end) {
+			link = &parent->rb_right;
+			continue;
+		}
+		/* offset falls inside an existing entry */
+		return parent;
+	}
+
+	cur = rb_entry(node, struct tree_entry, rb_node);
+	cur->in_tree = 1;
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * Find the entry whose [start, end] range contains 'offset'.
+ *
+ * Returns that entry's rb_node when there is one.  Otherwise NULL is
+ * returned and, if 'prev_ret' is not NULL, *prev_ret is set to the first
+ * entry at or after the last node visited whose end is >= offset (NULL
+ * when no such entry exists).  *prev_ret is left untouched on an exact hit.
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				   struct rb_node **prev_ret)
+{
+	struct rb_node *cur = root->rb_node;
+	struct rb_node *candidate = NULL;
+	struct tree_entry *entry;
+
+	while (cur) {
+		entry = rb_entry(cur, struct tree_entry, rb_node);
+		candidate = cur;
+
+		if (offset < entry->start)
+			cur = cur->rb_left;
+		else if (offset > entry->end)
+			cur = cur->rb_right;
+		else
+			return cur;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	/* walk forward until an entry ends at or after offset */
+	while (candidate) {
+		entry = rb_entry(candidate, struct tree_entry, rb_node);
+		if (offset <= entry->end)
+			break;
+		candidate = rb_next(candidate);
+	}
+	*prev_ret = candidate;
+	return NULL;
+}
+
+/*
+ * Return the entry containing 'offset', or failing that the fallback
+ * entry reported by __tree_search() (which may be NULL).
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *fallback;
+	struct rb_node *hit = __tree_search(root, offset, &fallback);
+
+	return hit ? hit : fallback;
+}
+
+/*
+ * Erase the entry whose range contains 'offset' from 'root' and clear its
+ * in_tree flag.  Returns 0 on success or -ENOENT when no entry contains
+ * 'offset'.  The entry itself is not freed.
+ */
+static int tree_delete(struct rb_root *root, u64 offset)
+{
+	struct rb_node *victim = __tree_search(root, offset, NULL);
+	struct tree_entry *entry;
+
+	if (!victim)
+		return -ENOENT;
+
+	entry = rb_entry(victim, struct tree_entry, rb_node);
+	entry->in_tree = 0;
+	rb_erase(victim, root);
+	return 0;
+}
+
+/*
+ * Insert 'em' into the map tree.  The struct itself is linked in (no copy
+ * is made) and one extra reference is taken on it.
+ *
+ * After insertion a simple backward merge is attempted: if the previous
+ * mapping ends right before em->start and either both have block_start 0
+ * or em->block_start directly follows prev->block_end, 'em' is grown to
+ * cover the previous mapping, which is erased and has a reference dropped.
+ *
+ * Returns 0 on success, or -EEXIST when an existing mapping already
+ * contains em->end.  Takes the tree lock.
+ */
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em)
+{
+	struct extent_map *prev;
+	struct rb_node *rb;
+	int ret = 0;
+
+	write_lock_irq(&tree->lock);
+	rb = tree_insert(&tree->map, em->end, &em->rb_node);
+	if (rb) {
+		prev = rb_entry(rb, struct extent_map, rb_node);
+		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
+		ret = -EEXIST;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+
+	/* nothing can sit in front of a mapping that starts at zero */
+	if (em->start == 0)
+		goto out;
+
+	rb = rb_prev(&em->rb_node);
+	if (!rb)
+		goto out;
+
+	prev = rb_entry(rb, struct extent_map, rb_node);
+	if (prev->end + 1 != em->start)
+		goto out;
+	if (!(em->block_start == 0 && prev->block_start == 0) &&
+	    em->block_start != prev->block_end + 1)
+		goto out;
+
+	em->start = prev->start;
+	em->block_start = prev->block_start;
+	rb_erase(&prev->rb_node, &tree->map);
+	prev->in_tree = 0;
+	free_extent_map(prev);
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(add_extent_mapping);
+
+/*
+ * Look up the first extent_map in the tree that intersects the inclusive
+ * range [start, end].  More mappings may intersect the range, so callers
+ * need to inspect the returned object and repeat the lookup if necessary.
+ *
+ * On a hit a reference is taken on the returned map.  Returns NULL when
+ * nothing intersects the range.  Takes the tree lock.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end)
+{
+	struct extent_map *em = NULL;
+	struct rb_node *rb_node;
+
+	read_lock_irq(&tree->lock);
+	rb_node = tree_search(&tree->map, start);
+	if (!rb_node)
+		goto out;
+	if (IS_ERR(rb_node)) {
+		em = ERR_PTR(PTR_ERR(rb_node));
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	if (em->start <= end && em->end >= start)
+		atomic_inc(&em->refs);
+	else
+		em = NULL;
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+EXPORT_SYMBOL(lookup_extent_mapping);
+
+/*
+ * Unlink the mapping that contains em->end from the tree.  No reference
+ * counts are dropped and no check is made for users of the range.
+ * Returns the result of tree_delete() (0 or -ENOENT).  Takes the tree lock.
+ */
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int err;
+
+	write_lock_irq(&tree->lock);
+	err = tree_delete(&tree->map, em->end);
+	write_unlock_irq(&tree->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(remove_extent_mapping);
+
+/*
+ * Try to fold 'state' into its immediate neighbours in the state tree.
+ * A neighbour is merged only when it is byte-adjacent and carries exactly
+ * the same bits.  States with any EXTENT_IOBITS bit set are never merged
+ * because the end_io handlers need to be able to do operations on them
+ * without sleeping (or doing allocations/splits).
+ *
+ * When the following neighbour matches, that neighbour takes over the
+ * range and 'state' itself is erased from the tree and has a reference
+ * dropped, so callers must not rely on 'state' after this returns.
+ *
+ * This should be called with the tree lock held.  Always returns 0.
+ */
+static int merge_state(struct extent_map_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *before;
+	struct extent_state *after;
+	struct rb_node *rb;
+
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	/* absorb the previous state into this one */
+	rb = rb_prev(&state->rb_node);
+	if (rb) {
+		before = rb_entry(rb, struct extent_state, rb_node);
+		if (before->end == state->start - 1 &&
+		    before->state == state->state) {
+			state->start = before->start;
+			before->in_tree = 0;
+			rb_erase(&before->rb_node, &tree->state);
+			free_extent_state(before);
+		}
+	}
+
+	/* let the next state absorb this one */
+	rb = rb_next(&state->rb_node);
+	if (rb) {
+		after = rb_entry(rb, struct extent_state, rb_node);
+		if (after->start == state->end + 1 &&
+		    after->state == state->state) {
+			after->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Insert 'state' into the state tree covering [start, end], with 'bits'
+ * OR-ed into its state first.  After a successful insert a merge with the
+ * neighbouring states is attempted.
+ *
+ * Returns 0 on success.  Returns -EEXIST if an existing state already
+ * contains 'end'; in that case a reference on 'state' is dropped.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_map_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct extent_state *found;
+	struct rb_node *clash;
+
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+	state->start = start;
+	state->end = end;
+	state->state |= bits;
+	if (!(end & 4095)) {
+		printk("insert state %Lu %Lu strange end\n", start, end);
+		WARN_ON(1);
+	}
+
+	clash = tree_insert(&tree->state, end, &state->rb_node);
+	if (!clash) {
+		merge_state(tree, state);
+		return 0;
+	}
+
+	found = rb_entry(clash, struct extent_state, rb_node);
+	printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+	free_extent_state(state);
+	return -EEXIST;
+}
+
+/*
+ * Split an extent state in two at offset 'split', which lies inside
+ * 'orig'.  The preallocated struct 'prealloc' becomes the FIRST half and
+ * inherits orig's bits; 'orig' is shrunk to the second half.
+ *
+ * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
+ * a successful call there are two extent state structs in the tree:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * Returns 0 on success.  Returns -EEXIST if the tree already has an entry
+ * containing split - 1; in that case a reference on 'prealloc' is dropped.
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.
+ */
+static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct extent_state *found;
+	struct rb_node *clash;
+
+	prealloc->state = orig->state;
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	orig->start = split;
+	if (!(prealloc->end & 4095)) {
+		printk("insert state %Lu %Lu strange end\n", prealloc->start,
+		       prealloc->end);
+		WARN_ON(1);
+	}
+
+	clash = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+	if (!clash)
+		return 0;
+
+	found = rb_entry(clash, struct extent_state, rb_node);
+	printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+	free_extent_state(prealloc);
+	return -EEXIST;
+}
+
+/*
+ * Clear 'bits' in a single extent state struct.  With wake == 1 anyone
+ * sleeping on the state's wait queue is woken.  With delete == 1, or when
+ * no bits remain set afterwards, the state is erased from the tree and a
+ * reference is dropped (a state not flagged in_tree only triggers a
+ * warning).  Otherwise a merge with the neighbours is attempted.
+ *
+ * Returns the subset of 'bits' that was set before clearing.
+ */
+static int clear_state_bit(struct extent_map_tree *tree,
+			    struct extent_state *state, int bits, int wake,
+			    int delete)
+{
+	int was_set = state->state & bits;
+
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+
+	if (!delete && state->state != 0) {
+		merge_state(tree, state);
+		return was_set;
+	}
+
+	if (!state->in_tree) {
+		WARN_ON(1);
+		return was_set;
+	}
+
+	rb_erase(&state->rb_node, &tree->state);
+	state->in_tree = 0;
+	free_extent_state(state);
+	return was_set;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * or inserting elements in the tree, so the gfp mask is used to
+ * indicate which allocations or sleeping are allowed.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock.  It returns -ENOMEM if the preallocation
+ * fails, otherwise the OR of the requested bits that were found set
+ * (zero if none of them were set).
+ *
+ * NOTE(review): a spare extent_state is only preallocated when the mask
+ * has __GFP_WAIT; without it the split paths below would pass a NULL
+ * prealloc to split_state() -- confirm no caller relies on that.
+ */
+int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err;
+	int set = 0;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find the extents that end after
+	 * our range starts
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip
+	 * bits on second half.
+	 *
+	 * If the extent we found extends past our range, we
+	 * just split and search again.  It'll get split again
+	 * the next time though.
+	 *
+	 * If the extent we found is inside our range, we clear
+	 * the desired bit on it.
+	 */
+
+	if (state->start < start) {
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			start = state->end + 1;
+			set |= clear_state_bit(tree, state, bits,
+					wake, delete);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		/*
+		 * sleepers queued on 'state' before the split; prealloc's
+		 * wait queue is a different one
+		 */
+		if (wake)
+			wake_up(&state->wq);
+		set |= clear_state_bit(tree, prealloc, bits,
+				       wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	start = state->end + 1;
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return set;
+
+search_again:
+	/*
+	 * the range is inclusive, so start == end still leaves one byte to
+	 * process; only stop once start has moved past end (same test as
+	 * set_extent_bit)
+	 */
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+/*
+ * Sleep (uninterruptibly) on the wait queue of 'state'.  The tree read
+ * lock, which the caller must hold, is dropped around schedule() and
+ * re-taken before returning.  Always returns 0.
+ */
+static int wait_on_state(struct extent_map_tree *tree,
+			 struct extent_state *state)
+{
+	DEFINE_WAIT(waiter);
+
+	prepare_to_wait(&state->wq, &waiter, TASK_UNINTERRUPTIBLE);
+	read_unlock_irq(&tree->lock);
+	schedule();
+	read_lock_irq(&tree->lock);
+	finish_wait(&state->wq, &waiter);
+
+	return 0;
+}
+
+/*
+ * Wait until none of 'bits' is set on any state in the inclusive range
+ * [start, end].  Whenever a state with one of the bits is found, a
+ * reference is held on it while sleeping on its wait queue, and the scan
+ * restarts from that state's start afterwards.
+ *
+ * The tree lock is taken by this function.  Always returns 0.
+ */
+int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+	for (;;) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			continue;
+		}
+
+		start = state->end + 1;
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ *
+ * Returns 0 on success, -ENOMEM if the preallocation fails, or -EEXIST
+ * as described above.
+ *
+ * NOTE(review): a spare extent_state is only preallocated when the mask
+ * has __GFP_WAIT; without it the insert/split paths below would be handed
+ * a NULL prealloc -- confirm no caller relies on that.
+ */
+int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	int set;
+	u64 last_start;
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		/* nothing at or after start: one new state covers it all */
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	last_start = state->start;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set = state->state & bits;
+		if (set && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		state->state |= bits;
+		/* merge_state may free 'state', so advance start first */
+		start = state->end + 1;
+		merge_state(tree, state);
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			state->state |= bits;
+			start = state->end + 1;
+			merge_state(tree, state);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 *
+	 * This has to be checked before the "state runs past end" case
+	 * below: a state that starts after 'start' and ends after 'end'
+	 * matches both, and handling the split first would return with
+	 * the hole in front of the state never set.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start -1;
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		prealloc->state |= bits;
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	write_unlock_irq(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irq(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+/* set EXTENT_DIRTY on the inclusive range [start, end], non-exclusively */
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+/* clear EXTENT_DIRTY on [start, end] without waking sleepers or deleting */
+int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+/* set EXTENT_NEW on the inclusive range [start, end], non-exclusively */
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_NEW, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+/* clear EXTENT_NEW on [start, end] without waking sleepers or deleting */
+int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_NEW, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+/* set EXTENT_UPTODATE on the inclusive range [start, end], non-exclusively */
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+/* clear EXTENT_UPTODATE on [start, end] without waking sleepers or deleting */
+int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	const int wake = 0;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, wake,
+				delete, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+/* set EXTENT_WRITEBACK on the inclusive range [start, end], non-exclusively */
+int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	const int exclusive = 0;
+
+	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, exclusive,
+			      NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+/* clear EXTENT_WRITEBACK on [start, end] and wake anyone waiting on it */
+int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	const int wake = 1;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, wake,
+				delete, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+/* sleep until EXTENT_WRITEBACK is clear on every state in [start, end] */
+int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	const int bits = EXTENT_WRITEBACK;
+
+	return wait_extent_bit(tree, start, end, bits);
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * Lock the inclusive range [start, end] in ascending order by setting
+ * EXTENT_LOCKED exclusively.  Each time part of the range is found locked
+ * already and the mask allows waiting, sleep until that part is unlocked
+ * and retry from the point of failure.
+ *
+ * Returns the result of the last set_extent_bit() call.
+ */
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	u64 failed_start;
+	int err;
+
+	for (;;) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		if (err != -EEXIST || !(mask & __GFP_WAIT))
+			break;
+
+		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+		start = failed_start;
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+/* clear EXTENT_LOCKED on [start, end] and wake anyone waiting for the lock */
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	const int wake = 1;
+	const int delete = 0;
+
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, wake, delete,
+				mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * Mark every page cache page backing [start, end] dirty, then set
+ * EXTENT_DIRTY on the same range in the state tree.  Every page in the
+ * range must already be in the page cache (BUG otherwise).
+ * Always returns 0.
+ */
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * Flag every page cache page backing [start, end] as under writeback,
+ * then set EXTENT_WRITEBACK on the same range in the state tree.  Every
+ * page in the range must already be in the page cache (BUG otherwise).
+ * Always returns 0.
+ */
+int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		page_cache_release(page);
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+/*
+ * Lock both the pages and the extent state for [start, end].  Pages are
+ * grabbed (and thereby locked) first, in ascending index order, then the
+ * extent range is locked.
+ *
+ * Returns 0 on success.  If a page cannot be grabbed, the pages taken so
+ * far are unlocked again and a negative errno is returned.
+ *
+ * NOTE(review): the page references obtained from grab_cache_page() are
+ * not dropped here -- confirm who is expected to release them.
+ */
+int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long first = start >> PAGE_CACHE_SHIFT;
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+	int err;
+
+	for (idx = first; idx <= last; idx++) {
+		page = grab_cache_page(tree->mapping, idx);
+		if (!page) {
+			err = -ENOMEM;
+			goto undo;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto undo;
+		}
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+undo:
+	/*
+	 * the page at 'idx' is the one that failed, so unwind everything
+	 * before it
+	 */
+	while (first < idx) {
+		page = find_get_page(tree->mapping, first);
+		unlock_page(page);
+		page_cache_release(page);
+		first++;
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * counterpart of lock_range: unlock every page cache page backing
+ * [start, end], then clear the lock on the extent range.  The pages are
+ * looked up with find_get_page() and the result is not checked, so all
+ * of them have to be present in tree->mapping.  Always returns 0.
+ */
+int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long i = start >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	for (; i <= last; i++) {
+		page = find_get_page(tree->mapping, i);
+		unlock_page(page);
+		/* drop the reference find_get_page took */
+		page_cache_release(page);
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
+ * has the bits set and the extents cover all of [start, end] without
+ * a hole.  Otherwise, 1 is returned if any bit in the
+ * range is found set.
+ *
+ * The tree lock is taken for reading around the walk.
+ */
+static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+			  int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+
+	read_lock_irq(&tree->lock);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		/*
+		 * a state that begins past 'start' leaves a hole in front
+		 * of it.  This must be tested before the end-of-range check
+		 * below, otherwise a hole at the tail of the range would
+		 * keep the bitset = 1 left by an earlier state.
+		 */
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+		/* state->end + 1 would wrap around to 0 */
+		if (state->end == (u64)-1)
+			break;
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+		/* no more states before 'end': the tail is a hole */
+		if (!node && filled)
+			bitset = 0;
+	}
+	read_unlock_irq(&tree->lock);
+	return bitset;
+}
+
+/*
+ * helper function to set a given page up to date if all the
+ * extents in the tree for that page are up to date
+ *
+ * Always returns 0.
+ */
+static int check_page_uptodate(struct extent_map_tree *tree,
+			       struct page *page)
+{
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ *
+ * Always returns 0.
+ */
+static int check_page_locked(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * helper function to end page writeback if all the extents
+ * in the tree for that page are done with writeback
+ *
+ * Always returns 0.
+ */
+static int check_page_writeback(struct extent_map_tree *tree,
+			     struct page *page)
+{
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * bio->bi_private carries the extent_map_tree.  Nothing is done (and 1
+ * is returned) while bio->bi_size is still non-zero; otherwise every
+ * bio_vec is walked from last to first, the bio reference is dropped
+ * and 0 is returned.  'bytes_done' and 'err' are not looked at.
+ */
+static int end_bio_extent_writepage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* widen before shifting, page->index is an unsigned long */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		/* step to the previous vec and warm up its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		/* a partial page may still have other ranges in flight */
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * bio->bi_private carries the extent_map_tree.  Nothing is done (and 1
+ * is returned) while bio->bi_size is still non-zero; otherwise every
+ * bio_vec is walked from last to first, the bio reference is dropped
+ * and 0 is returned.  'bytes_done' and 'err' are not looked at.
+ */
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* widen before shifting, page->index is an unsigned long */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		/* step to the previous vec and warm up its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		/* a partial page may still have other ranges locked */
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ *
+ * bio->bi_private carries the extent_map_tree.  Nothing is done (and 1
+ * is returned) while bio->bi_size is still non-zero; otherwise every
+ * bio_vec is walked from last to first, the bio reference is dropped
+ * and 0 is returned.  The page itself is neither unlocked nor marked
+ * up to date here.
+ */
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_map_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+	if (bio->bi_size)
+		return 1;
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* widen before shifting, page->index is an unsigned long */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		/* step to the previous vec and warm up its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * build a single segment bio for 'size' bytes at 'offset' inside 'page',
+ * aimed at 'sector' on 'bdev', and submit it.  'tree' is stashed in
+ * bi_private for 'end_io_func'.  An extra bio reference is held across
+ * submit_bio() so the BIO_EOPNOTSUPP flag can be looked at afterwards.
+ * Returns -EOPNOTSUPP when that flag is set, 0 otherwise.
+ */
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      bio_end_io_t end_io_func)
+{
+	struct bio *bio = bio_alloc(GFP_NOIO, 1);
+	struct bio_vec *vec = bio->bi_io_vec;
+	int ret;
+
+	/* the one and only segment */
+	vec->bv_page = page;
+	vec->bv_len = size;
+	vec->bv_offset = offset;
+
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = size;
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+
+	bio->bi_private = tree;
+	bio->bi_end_io = end_io_func;
+
+	bio_get(bio);
+	submit_bio(rw, bio);
+	ret = bio_flagged(bio, BIO_EOPNOTSUPP) ? -EOPNOTSUPP : 0;
+	bio_put(bio);
+
+	return ret;
+}
+
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ *
+ * The whole page range is locked in the tree and then walked extent by
+ * extent: bytes at or past i_size and holes (block_start == 0) are
+ * zeroed and marked uptodate, ranges that are already uptodate are just
+ * unlocked, everything else is read with submit_extent_page().  The page
+ * is unlocked here only when no read was submitted; otherwise that is
+ * left to end_bio_extent_readpage.  Always returns 0, failures are
+ * reported through SetPageError.
+ */
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent)
+{
+	struct inode *inode = page->mapping->host;
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	/* tag the page as private once, pinning it with a reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		/* everything from here on is past the end of the file */
+		if (cur >= last_byte) {
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+
+		/* bytes left in this extent or page, rounded up to a block */
+		iosize = min(em->end - cur, end - cur) + 1;
+		cur_end = min(em->end, end);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == 0) {
+			zero_user_page(page, page_offset, iosize, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_readpage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	/* no IO in flight, so nobody else will finish the page for us */
+	if (!nr) {
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ *
+ * A page entirely past i_size only has its dirty bits cleared.  For the
+ * page that straddles i_size the bytes past the end of the file are
+ * zeroed first.  Holes and inline extents are not written, their dirty
+ * bits are simply cleared.  The page is always unlocked before
+ * returning and the return value is always 0; failures are reported
+ * through SetPageError.  'wbc' is not used.
+ */
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+
+	/* the page lies completely beyond the end of the file */
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* last page of the file: zero whatever sits past i_size */
+	if (page->index == end_index) {
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+		zero_user_page(page, offset,
+			       PAGE_CACHE_SIZE - offset, KM_USER0);
+	}
+
+	/* tag the page as private once, pinning it with a reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	end = page_end;
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur, end, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(em->end < cur);
+		BUG_ON(end < cur);
+		/* bytes left in this extent or page, rounded up to a block */
+		iosize = min(em->end - cur, end - cur) + 1;
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* holes and inline data have no blocks to write to */
+		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+				   EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		set_range_writeback(tree, cur, cur + iosize - 1);
+		ret = submit_extent_page(WRITE, tree, page,
+					 sector, iosize, page_offset, bdev,
+					 end_bio_extent_writepage);
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+/*
+ * basic invalidatepage code, this waits on any locked or writeback
+ * ranges corresponding to the page, and then deletes any extent state
+ * records from the tree
+ *
+ * 'offset' is rounded up to the block size first; if nothing of the
+ * page is left after that there is nothing to do.  Always returns 0.
+ */
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+
+	start += (offset + blocksize -1) & ~(blocksize - 1);
+	if (start > end)
+		return 0;
+
+	lock_extent(tree, start, end, GFP_NOFS);
+	wait_on_extent_writeback(tree, start, end);
+	/* the lock taken above goes away together with the dirty bit */
+	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
+			 1, 1, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(extent_invalidatepage);
+
+/*
+ * simple commit_write call: make sure the page carries the private
+ * marker (taking a page reference the first time), dirty the page with
+ * set_page_dirty and push i_size out when the write ended past it.
+ * 'tree' and 'from' are not used.  Always returns 0.
+ */
+int extent_commit_write(struct extent_map_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to)
+{
+	loff_t write_end = (loff_t)page->index << PAGE_CACHE_SHIFT;
+
+	/* file offset just past the last byte written */
+	write_end += to;
+
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	set_page_dirty(page);
+
+	/* the write stayed inside the file, i_size is fine as it is */
+	if (write_end <= inode->i_size)
+		return 0;
+
+	i_size_write(inode, write_end);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(extent_commit_write);
+
+/*
+ * prepare_write helper: get the blocks touched by a write to bytes
+ * [from, to) of 'page' ready to be overwritten.
+ *
+ * The page range is locked in the tree and the blocks covering the
+ * write are walked one extent at a time.  For a new extent the parts of
+ * the block outside [from, to) are zeroed.  For an existing extent that
+ * is only partly overwritten and not uptodate, the block is read in and
+ * the read is waited for before returning.  Everything else is marked
+ * uptodate and unlocked right away.
+ *
+ * Returns 0 on success.  If get_extent() fails, the part of the page
+ * that is still locked is unlocked, reads already submitted are waited
+ * for, and the error is returned (-EIO when get_extent() gave NULL).
+ */
+int extent_prepare_write(struct extent_map_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent)
+{
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 block_start;
+	u64 orig_block_start;
+	u64 block_end;
+	u64 cur_end;
+	struct extent_map *em;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	size_t page_offset = 0;
+	size_t block_off_start;
+	size_t block_off_end;
+	int err = 0;
+	int iocount = 0;
+	int ret = 0;
+	int isnew;
+
+	/* tag the page as private once, pinning it with a reference */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+	/* first and last byte of the blocks the write touches */
+	block_start = (page_start + from) & ~((u64)blocksize - 1);
+	block_end = (page_start + to - 1) | (blocksize - 1);
+	orig_block_start = block_start;
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	while(block_start <= block_end) {
+		em = get_extent(inode, page, page_offset, block_start,
+				block_end, 1);
+		if (IS_ERR(em) || !em) {
+			err = em ? PTR_ERR(em) : -EIO;
+			/*
+			 * ranges before block_start were either unlocked
+			 * already or belong to a read in flight, the rest
+			 * of the page is still ours to unlock
+			 */
+			unlock_extent(tree, block_start, page_end, GFP_NOFS);
+			break;
+		}
+		cur_end = min(block_end, em->end);
+		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+		block_off_end = block_off_start + blocksize;
+		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+		/* new block, partly written: zero what the write skips */
+		if (!PageUptodate(page) && isnew &&
+		    (block_off_end > to || block_off_start < from)) {
+			void *kaddr;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_off_end > to)
+				memset(kaddr + to, 0, block_off_end - to);
+			if (block_off_start < from)
+				memset(kaddr + block_off_start, 0,
+				       from - block_off_start);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		/* old block, partly written and stale: read it in first */
+		if (!isnew && !PageUptodate(page) &&
+		    (block_off_end > to || block_off_start < from) &&
+		    !test_range_bit(tree, block_start, cur_end,
+				    EXTENT_UPTODATE, 1)) {
+			u64 sector;
+			u64 extent_offset = block_start - em->start;
+			size_t iosize;
+			sector = (em->block_start + extent_offset) >> 9;
+			iosize = (cur_end - block_start + blocksize - 1) &
+				~((u64)blocksize - 1);
+			/*
+			 * we've already got the extent locked, but we
+			 * need to split the state such that our end_bio
+			 * handler can clear the lock.
+			 */
+			set_extent_bit(tree, block_start,
+				       block_start + iosize - 1,
+				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, em->bdev,
+					 end_bio_extent_preparewrite);
+			iocount++;
+			block_start = block_start + iosize;
+		} else {
+			set_extent_uptodate(tree, block_start, cur_end,
+					    GFP_NOFS);
+			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			block_start = cur_end + 1;
+		}
+		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+		free_extent_map(em);
+	}
+	/* the end_bio handler drops the lock bit when a read completes */
+	if (iocount) {
+		wait_extent_bit(tree, orig_block_start,
+				block_end, EXTENT_LOCKED);
+	}
+	if (!err)
+		check_page_uptodate(tree, page);
+	/* FIXME, zero out newly allocated blocks on error */
+	return err;
+}
+EXPORT_SYMBOL(extent_prepare_write);
+
+/*
+ * a helper for releasepage.  As long as there are no locked extents
+ * in the range corresponding to the page, both state records and extent
+ * map records are removed
+ *
+ * The walk over the extent maps stops at the first one that still has
+ * a locked range.  Whether or not that happens, the uptodate bits for
+ * the page range are cleared afterwards and 1 is returned.
+ */
+int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
+{
+	struct extent_map *em;
+	/* widen before shifting, page->index is only an unsigned long */
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 orig_start = start;
+
+	while (start <= end) {
+		em = lookup_extent_mapping(tree, start, end);
+		if (!em || IS_ERR(em))
+			break;
+		if (test_range_bit(tree, em->start, em->end,
+				   EXTENT_LOCKED, 0)) {
+			printk("range still locked %Lu %Lu\n",
+			       em->start, em->end);
+			/* drop our reference only after the last use of em */
+			free_extent_map(em);
+			break;
+		}
+		remove_extent_mapping(tree, em);
+		start = em->end + 1;
+		/* once for the rb tree */
+		free_extent_map(em);
+		/* once for us */
+		free_extent_map(em);
+	}
+	WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0));
+	clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
+			 1, 1, GFP_NOFS);
+	return 1;
+}
+EXPORT_SYMBOL(try_release_extent_mapping);
+
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
new file mode 100644
index 00000000000..108944aab4b
--- /dev/null
+++ b/fs/btrfs/extent_map.h
@@ -0,0 +1,89 @@
+#ifndef __EXTENTMAP__
+#define __EXTENTMAP__
+
+#include <linux/rbtree.h>
+
+#define EXTENT_MAP_INLINE (u64)-2	/* block_start/block_end marker for inline data, nothing to write out */
+#define EXTENT_MAP_DELALLOC (u64)-1	/* NOTE(review): presumably marks delayed allocation; no user in the code reviewed here -- confirm */
+
+struct extent_map_tree { /* the tree handle passed to the extent_* helpers */
+	struct rb_root map; /* NOTE(review): presumably holds the struct extent_map records -- confirm in extent_map.c */
+	struct rb_root state; /* struct extent_state records, searched by byte offset */
+	struct address_space *mapping; /* page cache the range helpers look pages up in */
+	rwlock_t lock; /* read-locked (irq variant) while the state tree is searched */
+};
+
+/*
+ * Reference counted mapping of the byte range [start, end] to a byte
+ * range on a block device.
+ *
+ * note, this must start with the same fields as fs/extent_map.c:tree_entry
+ */
+struct extent_map {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	/* block_start and block_end are in bytes */
+	u64 block_start; /* 0 is treated as a hole, EXTENT_MAP_INLINE as inline data */
+	u64 block_end; /* inclusive */
+	struct block_device *bdev; /* device the IO for this range is submitted to */
+	atomic_t refs; /* references are dropped with free_extent_map() */
+};
+
+/*
+ * One record in a tree's 'state' rb-tree: the state bits that apply to
+ * the byte range [start, end].
+ *
+ * note, this must start with the same fields as fs/extent_map.c:tree_entry
+ */
+struct extent_state {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	wait_queue_head_t wq; /* NOTE(review): presumably where waiters on this range sleep -- confirm */
+	atomic_t refs;
+	unsigned long state; /* bit mask, tested against the EXTENT_* flags */
+	struct list_head list;
+};
+
+struct extent_buffer { /* NOTE(review): no user in the code reviewed here, purpose unconfirmed */
+	u64 start;
+	u64 end; /* inclusive */
+	char *addr;
+	struct page *pages[]; /* flexible array member, sized by whoever allocates the struct */
+};
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode, /* lookup callback handed to the extent_* page helpers */
+					  struct page *page,
+					  size_t page_offset, /* offset inside 'page' that goes with 'start' */
+					  u64 start, u64 end, /* byte range the caller wants mapped */
+					  int create); /* the helpers pass 0 when reading, 1 when writing */
+
+void extent_map_tree_init(struct extent_map_tree *tree,
+			  struct address_space *mapping, gfp_t mask);
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 end);
+int add_extent_mapping(struct extent_map_tree *tree,
+		       struct extent_map *em);
+int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page);
+int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+struct extent_map *alloc_extent_map(gfp_t mask);
+void free_extent_map(struct extent_map *em);
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent);
+void __init extent_map_init(void);
+void __exit extent_map_exit(void);
+int extent_clean_all_trees(struct extent_map_tree *tree);
+int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
+			gfp_t mask);
+int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
+		   gfp_t mask);
+int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int extent_invalidatepage(struct extent_map_tree *tree,
+			  struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc);
+int extent_prepare_write(struct extent_map_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_map_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to);
+int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
+#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6933ab11a5c..71a481894ab 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -81,14 +81,14 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
 				u64 offset, ssize_t size,
-				struct buffer_head *bh)
+				struct page *page, size_t page_offset)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	char *ptr, *kaddr;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_file_extent_item *ei;
 	u32 datasize;
 	int err = 0;
@@ -98,8 +98,6 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	key.objectid = inode->i_ino;
@@ -122,18 +120,13 @@ static int insert_inline_extent(struct btrfs_root *root, struct inode *inode,
 				   BTRFS_FILE_EXTENT_INLINE);
 	ptr = btrfs_file_extent_inline_start(ei);
 
-	kaddr = kmap_atomic(bh->b_page, KM_USER0);
+	kaddr = kmap_atomic(page, KM_USER0);
 	btrfs_memcpy(root, path->nodes[0]->b_data,
-		     ptr, kaddr + bh_offset(bh),
-		     size);
+		     ptr, kaddr + page_offset, size);
 	kunmap_atomic(kaddr, KM_USER0);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_free_path(path);
-	ret = btrfs_end_transaction(trans, root);
-	if (ret && !err)
-		err = ret;
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return err;
 }
 
@@ -145,45 +138,143 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   loff_t pos,
 				   size_t write_bytes)
 {
-	int i;
-	int offset;
 	int err = 0;
-	int ret;
-	int this_write;
+	int i;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct buffer_head *bh;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key ins;
+	u64 hint_block;
+	u64 num_blocks;
+	u64 start_pos;
+	u64 end_of_last_block;
+	u64 end_pos = pos + write_bytes;
+	loff_t isize = i_size_read(inode);
 
-	for (i = 0; i < num_pages; i++) {
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
 
-		/* FIXME, one block at a time */
-		bh = page_buffers(pages[i]);
+	em->bdev = inode->i_sb->s_bdev;
 
-		if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-			ret = insert_inline_extent(root, inode,
-					pages[i]->index << PAGE_CACHE_SHIFT,
-					offset + this_write, bh);
-			if (ret) {
-				err = ret;
-				goto failed;
-			}
-		}
+	start_pos = pos & ~((u64)root->blocksize - 1);
+	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+			inode->i_blkbits;
 
-		ret = btrfs_commit_write(file, pages[i], offset,
-					 offset + this_write);
-		pos += this_write;
-		if (ret) {
-			err = ret;
+	end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+	btrfs_set_trans_block_group(trans, inode);
+	inode->i_blocks += num_blocks << 3;
+	hint_block = 0;
+
+	if ((end_of_last_block & 4095) == 0) {
+		printk("strange end of last %Lu %lu %Lu\n", start_pos, write_bytes, end_of_last_block);
+	}
+	set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+
+	/* FIXME...EIEIO, ENOSPC and more */
+
+	/* step one, delete the existing extents in this range */
+	/* FIXME blocksize != pagesize */
+	if (start_pos < inode->i_size) {
+		err = btrfs_drop_extents(trans, root, inode,
+			 start_pos, (pos + write_bytes + root->blocksize -1) &
+			 ~((u64)root->blocksize - 1), &hint_block);
+		if (err)
+			goto failed;
+	}
+
+	/* insert any holes we need to create */
+	if (inode->i_size < start_pos) {
+		u64 last_pos_in_file;
+		u64 hole_size;
+		u64 mask = root->blocksize - 1;
+		last_pos_in_file = (isize + mask) & ~mask;
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size >>= inode->i_blkbits;
+		if (last_pos_in_file < start_pos) {
+			err = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       last_pos_in_file,
+						       0, 0, hole_size);
+		}
+		if (err)
 			goto failed;
+	}
+
+	/*
+	 * either allocate an extent for the new bytes or setup the key
+	 * to show we are doing inline data in the extent
+	 */
+	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
+	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+		err = btrfs_alloc_extent(trans, root, inode->i_ino,
+					 num_blocks, 0, hint_block, (u64)-1,
+					 &ins, 1);
+		BUG_ON(err);
+		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       start_pos, ins.objectid, ins.offset,
+				       ins.offset);
+		BUG_ON(err);
+		em->start = start_pos;
+		em->end = end_of_last_block;
+		em->block_start = ins.objectid << inode->i_blkbits;
+		em->block_end = em->block_start +
+			(ins.offset << inode->i_blkbits) - 1;
+		set_extent_dirty(em_tree, start_pos, end_of_last_block,
+				 GFP_NOFS);
+		err = add_extent_mapping(em_tree, em);
+		for (i = 0; i < num_pages; i++) {
+			struct page *p = pages[i];
+			SetPageUptodate(p);
+			__set_page_dirty_nobuffers(p);
 		}
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
+	} else {
+		struct page *p = pages[0];
+		err = insert_inline_extent(trans, root, inode, start_pos,
+					   end_pos - start_pos, p, 0);
+		BUG_ON(err);
+		em->start = start_pos;
+		em->end = end_pos;
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+		add_extent_mapping(em_tree, em);
+	}
+	if (end_pos > isize) {
+		i_size_write(inode, end_pos);
+		btrfs_update_inode(trans, root, inode);
 	}
 failed:
+	err = btrfs_end_transaction(trans, root);
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	free_extent_map(em);
 	return err;
 }
 
+/*
+ * remove every cached extent_map that lookup_extent_mapping() finds for
+ * [start, end] from the inode's extent tree.  The loop ends as soon as
+ * the lookup comes back empty or with an error pointer.
+ * Always returns 0.
+ */
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+
+	while(1) {
+		em = lookup_extent_mapping(em_tree, start, end);
+		/* same test try_release_extent_mapping applies to a lookup */
+		if (!em || IS_ERR(em))
+			break;
+		remove_extent_mapping(em_tree, em);
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree*/
+		free_extent_map(em);
+	}
+	return 0;
+}
+
 /*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
@@ -213,6 +304,8 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int found_inline;
 	int recow;
 
+	btrfs_drop_extent_cache(inode, start, end - 1);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -434,18 +527,9 @@ static int prepare_pages(struct btrfs_root *root,
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = file->f_path.dentry->d_inode;
-	int offset;
 	int err = 0;
-	int this_write;
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	loff_t isize = i_size_read(inode);
-	struct btrfs_trans_handle *trans;
-	u64 hint_block;
 	u64 num_blocks;
-	u64 alloc_extent_start;
 	u64 start_pos;
-	struct btrfs_key ins;
 
 	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
 	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
@@ -457,119 +541,17 @@ static int prepare_pages(struct btrfs_root *root,
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
 			err = -ENOMEM;
-			goto failed_release;
+			BUG_ON(1);
 		}
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
 		wait_on_page_writeback(pages[i]);
-	}
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		err = -ENOMEM;
-		mutex_unlock(&root->fs_info->fs_mutex);
-		goto out_unlock;
-	}
-	btrfs_set_trans_block_group(trans, inode);
-	/* FIXME blocksize != 4096 */
-	inode->i_blocks += num_blocks << 3;
-	hint_block = 0;
-
-	/* FIXME...EIEIO, ENOSPC and more */
-
-	/* step one, delete the existing extents in this range */
-	/* FIXME blocksize != pagesize */
-	if (start_pos < inode->i_size) {
-		err = btrfs_drop_extents(trans, root, inode,
-			 start_pos, (pos + write_bytes + root->blocksize -1) &
-			 ~((u64)root->blocksize - 1), &hint_block);
-		if (err)
-			goto failed_release;
-	}
-
-	/* insert any holes we need to create */
-	if (inode->i_size < start_pos) {
-		u64 last_pos_in_file;
-		u64 hole_size;
-		u64 mask = root->blocksize - 1;
-		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
-		if (last_pos_in_file < start_pos) {
-			err = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       last_pos_in_file,
-						       0, 0, hole_size);
-		}
-		if (err)
-			goto failed_release;
-	}
-
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
-	 */
-	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
-	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		err = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 0, hint_block, (u64)-1,
-					 &ins, 1);
-		if (err)
-			goto failed_truncate;
-		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset,
-				       ins.offset);
-		if (err)
-			goto failed_truncate;
-	} else {
-		ins.offset = 0;
-		ins.objectid = 0;
-	}
-	BUG_ON(err);
-	alloc_extent_start = ins.objectid;
-	err = btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-	for (i = 0; i < num_pages; i++) {
-		offset = pos & (PAGE_CACHE_SIZE -1);
-		this_write = min((size_t)PAGE_CACHE_SIZE - offset, write_bytes);
-		if (!page_has_buffers(pages[i])) {
-			create_empty_buffers(pages[i],
-					     root->fs_info->sb->s_blocksize,
-					     (1 << BH_Uptodate));
+		if (!PagePrivate(pages[i])) {
+			SetPagePrivate(pages[i]);
+			set_page_private(pages[i], 1);
+			page_cache_get(pages[i]);
 		}
-		head = page_buffers(pages[i]);
-		bh = head;
-		do {
-			err = btrfs_map_bh_to_logical(root, bh,
-						      alloc_extent_start);
-			BUG_ON(err);
-			if (err)
-				goto failed_truncate;
-			bh = bh->b_this_page;
-			if (alloc_extent_start)
-				alloc_extent_start++;
-		} while (bh != head);
-		pos += this_write;
-		WARN_ON(this_write > write_bytes);
-		write_bytes -= this_write;
 	}
 	return 0;
-
-failed_release:
-	btrfs_drop_pages(pages, num_pages);
-	return err;
-
-failed_truncate:
-	btrfs_drop_pages(pages, num_pages);
-	if (pos > isize)
-		vmtruncate(inode, isize);
-	return err;
-
-out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	goto failed_release;
-
 }
 
 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
@@ -685,7 +667,6 @@ out:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 	current->backing_dev_info = NULL;
-	mark_inode_dirty(inode);
 	return num_written ? num_written : err;
 }
 
@@ -714,8 +695,8 @@ static int btrfs_sync_file(struct file *file,
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	/*
- 	 * ok we haven't committed the transaction yet, lets do a commit
- 	 */
+	 * ok we haven't committed the transaction yet, lets do a commit
+	 */
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 504d37dfa6c..fd4cb65fcf1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -121,6 +121,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -169,7 +171,7 @@ static void fill_inode_item(struct btrfs_inode_item *item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
-static int btrfs_update_inode(struct btrfs_trans_handle *trans,
+int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
 {
@@ -400,6 +402,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int found_extent;
 	int del_item;
 
+	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -511,6 +514,64 @@ error:
 	return ret;
 }
 
+/*
+ * Copy-on-write one page of file data inside the caller's transaction.
+ *
+ * Calls btrfs_drop_extents() over the page's byte range, allocates a single
+ * new block and inserts a one-block file extent item for it at the page's
+ * file offset.  On success the page is flagged Checked and the bytes from
+ * zero_start to the end of the page are cleared; pass PAGE_CACHE_SIZE to
+ * zero nothing.
+ *
+ * Returns 0 on success or the first error returned by the extent helpers.
+ */
+static int btrfs_cow_one_page(struct btrfs_trans_handle *trans,
+			      struct inode *inode, struct page *page,
+			      size_t zero_start)
+{
+	char *kaddr;
+	int ret = 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 alloc_hint = 0;
+	/* widen before shifting: page->index is only an unsigned long */
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	struct btrfs_key ins;
+
+	/* mark the page private and hold a reference while it stays so */
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	ret = btrfs_drop_extents(trans, root, inode,
+				 page_start, page_start + PAGE_CACHE_SIZE,
+				 &alloc_hint);
+	if (ret)
+		goto out;
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
+				 alloc_hint, (u64)-1, &ins, 1);
+	if (ret)
+		goto out;
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       page_start, ins.objectid, 1, 1);
+	if (ret)
+		goto out;
+	SetPageChecked(page);
+	/* '<' rather than '!=' so an oversized zero_start cannot underflow */
+	if (zero_start < PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+
+out:
+	return ret;
+}
+
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -518,16 +565,14 @@ error:
 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	unsigned blocksize = 1 << inode->i_blkbits;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
-	char *kaddr;
 	int ret = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 alloc_hint = 0;
-	struct btrfs_key ins;
-	struct btrfs_trans_handle *trans;
+	u64 page_start;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -536,7 +581,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
-
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -545,37 +589,24 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 			goto out;
 		}
 	}
+	page_start = page->index << PAGE_CACHE_SHIFT;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-
-	ret = btrfs_drop_extents(trans, root, inode,
-				 page->index << PAGE_CACHE_SHIFT,
-				 (page->index + 1) << PAGE_CACHE_SHIFT,
-				 &alloc_hint);
-	if (ret)
-		goto out;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
-				 alloc_hint, (u64)-1, &ins, 1);
-	if (ret)
-		goto out;
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       page->index << PAGE_CACHE_SHIFT,
-				       ins.objectid, 1, 1);
-	if (ret)
-		goto out;
-	SetPageChecked(page);
-	kaddr = kmap(page);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	ret = btrfs_csum_file_block(trans, root, inode->i_ino,
-			      page->index << PAGE_CACHE_SHIFT,
-			      kaddr, PAGE_CACHE_SIZE);
-	kunmap(page);
+	ret = btrfs_cow_one_page(trans, inode, page, offset);
+	if (!ret) {
+		char *kaddr = kmap(page);
+		ret = btrfs_csum_file_block(trans, root, inode->i_ino,
+					    page_start, kaddr, PAGE_CACHE_SIZE);
+		kunmap(page);
+	}
+	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+			 page_start, page_start + PAGE_CACHE_SIZE - 1,
+			 GFP_NOFS);
+	set_page_dirty(page);
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
-	set_page_dirty(page);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -1095,6 +1126,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -1238,6 +1271,202 @@ out_unlock:
 	return err;
 }
 
+/*
+ * Find or build the extent_map covering file offset 'start' of 'inode'.
+ *
+ * The inode's cached extent tree is consulted first.  On a miss, the file
+ * extent item at or before 'start' is looked up and turned into a new
+ * mapping: a regular extent, an unmapped range (block_start == 0), or an
+ * inline extent (block_start == EXTENT_MAP_INLINE).  For an inline extent
+ * with a non-NULL 'page', the inline bytes are also copied into the page
+ * at 'page_offset'.  'create' is not used yet.
+ *
+ * Returns the mapping, or an ERR_PTR() value on failure.
+ */
+struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create)
+{
+	int ret;
+	int err = 0;
+	u64 blocknr;
+	u64 extent_start = 0;
+	u64 extent_end = 0;
+	u64 objectid = inode->i_ino;
+	u32 found_type;
+	int failed_insert = 0;
+	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_leaf *leaf;
+	struct btrfs_disk_key *found_key;
+	struct extent_map *em = NULL;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	mutex_lock(&root->fs_info->fs_mutex);
+
+again:
+	em = lookup_extent_mapping(em_tree, start, end);
+	if (em) {
+		goto out;
+	}
+	if (!em) {
+		em = alloc_extent_map(GFP_NOFS);
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		em->start = 0;
+		em->end = 0;
+	}
+	em->bdev = inode->i_sb->s_bdev;
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       objectid, start, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+			      struct btrfs_file_extent_item);
+	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	blocknr = btrfs_file_extent_disk_blocknr(item);
+	blocknr += btrfs_file_extent_offset(item);
+
+	/* are we inside the extent that was found? */
+	found_key = &leaf->items[path->slots[0]].key;
+	found_type = btrfs_disk_key_type(found_key);
+	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		goto not_found;
+	}
+
+	found_type = btrfs_file_extent_type(item);
+	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_end = extent_start +
+		       (btrfs_file_extent_num_blocks(item) << inode->i_blkbits);
+		err = 0;
+		/* extent_end is exclusive: start == extent_end is outside */
+		if (start < extent_start || start >= extent_end) {
+			em->start = start;
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		if (btrfs_file_extent_disk_blocknr(item) == 0) {
+			em->start = extent_start;
+			em->end = extent_end - 1;
+			em->block_start = 0;
+			em->block_end = 0;
+			goto insert;
+		}
+		em->block_start = blocknr << inode->i_blkbits;
+		em->block_end = em->block_start +
+			(btrfs_file_extent_num_blocks(item) <<
+			 inode->i_blkbits) - 1;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		goto insert;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		char *ptr;
+		char *map;
+		u32 size;
+
+		size = btrfs_file_extent_inline_len(leaf->items +
+						    path->slots[0]);
+		extent_end = extent_start + size;
+		/* extent_end is exclusive: start == extent_end is outside */
+		if (start < extent_start || start >= extent_end) {
+			em->start = start;
+			if (start < extent_start) {
+				em->end = extent_end - 1;
+			} else {
+				em->end = end;
+			}
+			goto not_found_em;
+		}
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+		em->start = extent_start;
+		em->end = extent_end - 1;
+		if (!page) {
+			goto insert;
+		}
+		ptr = btrfs_file_extent_inline_start(item);
+		map = kmap(page);
+		memcpy(map + page_offset, ptr, size);
+		flush_dcache_page(page);
+		kunmap(page);
+		set_extent_uptodate(em_tree, extent_start,
+				    extent_end, GFP_NOFS);
+		goto insert;
+	} else {
+		printk("unkknown found_type %d\n", found_type);
+		WARN_ON(1);
+	}
+not_found:
+	em->start = start;
+	em->end = end;
+not_found_em:
+	em->block_start = 0;
+	em->block_end = 0;
+insert:
+	btrfs_release_path(root, path);
+	if (em->start > start || em->end < start) {
+		printk("bad extent! %Lu %Lu start %Lu end %Lu\n", em->start, em->end, start, end);
+		WARN_ON(1);
+		err = -EIO;
+		goto out;
+	}
+	ret = add_extent_mapping(em_tree, em);
+	if (ret == -EEXIST) {
+		/*
+		 * a conflicting mapping is already in the tree; drop ours and
+		 * clear the pointer right away so the error exit below cannot
+		 * free it a second time
+		 */
+		free_extent_map(em);
+		em = NULL;
+		failed_insert++;
+		if (failed_insert > 5) {
+			printk("failing to insert %Lu %Lu\n", start, end);
+			err = -EIO;
+			goto out;
+		}
+		goto again;
+	}
+	err = 0;
+out:
+	btrfs_free_path(path);
+	if (trans) {
+		ret = btrfs_end_transaction(trans, root);
+		if (!err)
+			err = ret;
+	}
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (err) {
+		/* em is NULL if allocation failed or the retries ran out */
+		if (em)
+			free_extent_map(em);
+		WARN_ON(1);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
+
 /*
  * FIBMAP and others want to pass in a fake buffer head.  They need to
  * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
@@ -1398,46 +1607,34 @@ int btrfs_get_block(struct inode *inode, sector_t iblock,
 	return err;
 }
 
-static int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
-				struct buffer_head *result, int create)
-{
-	int ret;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *page = result->b_page;
-	u64 offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(result);
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_get_block_lock(inode, iblock, result, create);
-	if (ret)
-		goto out;
-
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		result->b_private = NULL;
-		goto out;
-	}
-	memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
-out:
-	if (path)
-		btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
+/*
+ * get_block style helper: look up the extent that covers logical block
+ * 'iblock' and, when it has a real disk location, map 'result' to the
+ * matching disk block.  Always returns 0; 'result' is untouched otherwise.
+ */
 static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_get_block_lock(inode, iblock, result, BTRFS_GET_BLOCK_NO_DIRECT);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	/* widen before shifting: sector_t may be narrower than 64 bits */
+	u64 start = (u64)iblock << inode->i_blkbits;
+	u64 end = start + root->blocksize - 1;
+	struct extent_map *em;
+
+	/*
+	 * NOTE(review): the extent_map returned here is never released;
+	 * confirm whether btrfs_get_extent() hands back a reference that
+	 * needs a matching free_extent_map().
+	 */
+	em = btrfs_get_extent(inode, NULL, 0, start, end, 0);
+	/* skip errors, inline extents and ranges with no disk block */
+	if (em && !IS_ERR(em) && em->block_start != EXTENT_MAP_INLINE &&
+	    em->block_start != 0) {
+		u64 offset;
+		offset = start - em->start;
+		start = (em->block_start + offset) >> inode->i_blkbits;
+		btrfs_map_bh_to_logical(root, result, start);
+	}
 	return 0;
 }
 
@@ -1449,442 +1634,50 @@ static sector_t btrfs_bmap(struct address_space *as, sector_t block)
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
-	return block_prepare_write(page, from, to, btrfs_get_block);
+	return extent_prepare_write(&BTRFS_I(page->mapping->host)->extent_tree,
+				    page->mapping->host, page, from, to,
+				    btrfs_get_extent);
 }
 
-static void buffer_io_error(struct buffer_head *bh)
+int btrfs_readpage(struct file *file, struct page *page)
 {
-	char b[BDEVNAME_SIZE];
-
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
-			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_read_full_page(tree, page, btrfs_get_extent);
 }
-
-/*
- * I/O completion handler for block_read_full_page() - pages
- * which come unlocked at the end of I/O.
- */
-static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	unsigned long flags;
-	struct buffer_head *first;
-	struct buffer_head *tmp;
-	struct page *page;
-	int page_uptodate = 1;
-	struct inode *inode;
-	int ret;
-
-	BUG_ON(!buffer_async_read(bh));
-
-	page = bh->b_page;
-	inode = page->mapping->host;
-	if (uptodate) {
-		void *kaddr;
-		struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-		if (bh->b_private) {
-			char csum[BTRFS_CRC32_SIZE];
-			kaddr = kmap_atomic(page, KM_IRQ0);
-			ret = btrfs_csum_data(root, kaddr + bh_offset(bh),
-					      bh->b_size, csum);
-			BUG_ON(ret);
-			if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) {
-				u64 offset;
-				offset = (page->index << PAGE_CACHE_SHIFT) +
-					bh_offset(bh);
-				printk("btrfs csum failed ino %lu off %llu\n",
-				       page->mapping->host->i_ino,
-				       (unsigned long long)offset);
-				memset(kaddr + bh_offset(bh), 1, bh->b_size);
-				flush_dcache_page(page);
-			}
-			kunmap_atomic(kaddr, KM_IRQ0);
-		}
-		set_buffer_uptodate(bh);
-	} else {
-		clear_buffer_uptodate(bh);
-		if (printk_ratelimit())
-			buffer_io_error(bh);
-		SetPageError(page);
-	}
-
-	/*
-	 * Be _very_ careful from here on. Bad things can happen if
-	 * two buffer heads end IO at almost the same time and both
-	 * decide that the page is now completely done.
-	 */
-	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
-	clear_buffer_async_read(bh);
-	unlock_buffer(bh);
-	tmp = bh;
-	do {
-		if (!buffer_uptodate(tmp))
-			page_uptodate = 0;
-		if (buffer_async_read(tmp)) {
-			BUG_ON(!buffer_locked(tmp));
-			goto still_busy;
-		}
-		tmp = tmp->b_this_page;
-	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
-
-	/*
-	 * If none of the buffers had errors and they are all
-	 * uptodate then we can set the page uptodate.
-	 */
-	if (page_uptodate && !PageError(page))
-		SetPageUptodate(page);
-	unlock_page(page);
-	return;
-
-still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
-	return;
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-/*
- * Generic "read page" function for block devices that have the normal
- * get_block functionality. This is most of the block device filesystems.
- * Reads the page asynchronously --- the unlock_buffer() and
- * set/clear_buffer_uptodate() functions propagate buffer state into the
- * page struct once IO has completed.
- */
-int btrfs_readpage(struct file *file, struct page *page)
+static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct inode *inode = page->mapping->host;
-	sector_t iblock, lblock;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
-	int nr, i;
-	int fully_mapped = 1;
-
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
-
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
-	bh = head;
-	nr = 0;
-	i = 0;
-
-	do {
-		if (buffer_uptodate(bh))
-			continue;
-
-		if (!buffer_mapped(bh)) {
-			int err = 0;
-
-			fully_mapped = 0;
-			if (iblock < lblock) {
-				WARN_ON(bh->b_size != blocksize);
-				err = btrfs_get_block_csum(inode, iblock,
-							   bh, 0);
-				if (err)
-					SetPageError(page);
-			}
-			if (!buffer_mapped(bh)) {
-				void *kaddr = kmap_atomic(page, KM_USER0);
-				memset(kaddr + i * blocksize, 0, blocksize);
-				flush_dcache_page(page);
-				kunmap_atomic(kaddr, KM_USER0);
-				if (!err)
-					set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * get_block() might have updated the buffer
-			 * synchronously
-			 */
-			if (buffer_uptodate(bh))
-				continue;
-		}
-		arr[nr++] = bh;
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-	if (fully_mapped)
-		SetPageMappedToDisk(page);
-
-	if (!nr) {
-		/*
-		 * All buffers are uptodate - we can set the page uptodate
-		 * as well. But not if get_block() returned an error.
-		 */
-		if (!PageError(page))
-			SetPageUptodate(page);
-		unlock_page(page);
-		return 0;
-	}
-
-	/* Stage two: lock the buffers */
-	for (i = 0; i < nr; i++) {
-		bh = arr[i];
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_async_read;
-		set_buffer_async_read(bh);
-	}
-
-	/*
-	 * Stage 3: start the IO.  Check for uptodateness
-	 * inside the buffer lock in case another process reading
-	 * the underlying blockdev brought it uptodate (the sct fix).
-	 */
-	for (i = 0; i < nr; i++) {
-		bh = arr[i];
-		if (buffer_uptodate(bh))
-			btrfs_end_buffer_async_read(bh, 1);
-		else
-			submit_bh(READ, bh);
-	}
-	return 0;
-}
-
-/*
- * Aside from a tiny bit of packed file data handling, this is the
- * same as the generic code.
- *
- * While block_write_full_page is writing back the dirty buffers under
- * the page lock, whoever dirtied the buffers may decide to clean them
- * again at any time.  We handle that by only looking at the buffer
- * state inside lock_buffer().
- *
- * If block_write_full_page() is called for regular writeback
- * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
- * locked buffer.   This only can happen if someone has written the buffer
- * directly, with submit_bh().  At the address_space level PageWriteback
- * prevents this contention from occurring.
- */
-static int __btrfs_write_full_page(struct inode *inode, struct page *page,
-				   struct writeback_control *wbc)
-{
-	int err;
-	sector_t block;
-	sector_t last_block;
-	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
-	int nr_underway = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	/* no csumming allowed when from PF_MEMALLOC */
-	if (current->flags & PF_MEMALLOC) {
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
+	struct extent_map_tree *tree;
+	int ret;
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
-					(1 << BH_Dirty)|(1 << BH_Uptodate));
+	if (page->private != 1) {
+		WARN_ON(1);
+		return try_to_free_buffers(page);
 	}
-
-	/*
-	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
-	 * here, and the (potentially unmapped) buffers may become dirty at
-	 * any time.  If a buffer becomes dirty here after we've inspected it
-	 * then we just miss that fact, and the page stays dirty.
-	 *
-	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
-	 * handle that here by just cleaning them.
-	 */
-
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
-	bh = head;
-
-	/*
-	 * Get all the dirty buffers mapped to disk addresses and
-	 * handle any aliases from the underlying blockdev's mapping.
-	 */
-	do {
-		if (block > last_block) {
-			/*
-			 * mapped buffers outside i_size will occur, because
-			 * this page can be outside i_size when there is a
-			 * truncate in progress.
-			 */
-			/*
-			 * The buffer was zeroed by block_write_full_page()
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-		} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
-			WARN_ON(bh->b_size != blocksize);
-			err = btrfs_get_block(inode, block, bh, 0);
-			if (err) {
-				goto recover;
-			}
-			if (buffer_new(bh)) {
-				/* blockdev mappings never come here */
-				clear_buffer_new(bh);
-			}
-		}
-		bh = bh->b_this_page;
-		block++;
-	} while (bh != head);
-
-	do {
-		if (!buffer_mapped(bh))
-			continue;
-		/*
-		 * If it's a fully non-blocking write attempt and we cannot
-		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
-		 */
-		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
-			lock_buffer(bh);
-		} else if (test_set_buffer_locked(bh)) {
-			redirty_page_for_writepage(wbc, page);
-			continue;
-		}
-		if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
-			struct btrfs_trans_handle *trans;
-			int ret;
-			u64 off = page->index << PAGE_CACHE_SHIFT;
-			char *kaddr;
-
-			off += bh_offset(bh);
-			mutex_lock(&root->fs_info->fs_mutex);
-			trans = btrfs_start_transaction(root, 1);
-			btrfs_set_trans_block_group(trans, inode);
-			kaddr = kmap(page);
-			btrfs_csum_file_block(trans, root, inode->i_ino,
-						    off, kaddr + bh_offset(bh),
-						    bh->b_size);
-			kunmap(page);
-			ret = btrfs_end_transaction(trans, root);
-			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->fs_mutex);
-			mark_buffer_async_write(bh);
-		} else {
-			unlock_buffer(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	/*
-	 * The page and its buffers are protected by PageWriteback(), so we can
-	 * drop the bh refcounts early.
-	 */
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-
-	err = 0;
-done:
-	if (nr_underway == 0) {
-		/*
-		 * The page was marked dirty, but the buffers were
-		 * clean.  Someone wrote them back by hand with
-		 * ll_rw_block/submit_bh.  A rare case.
-		 */
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				break;
-			}
-			bh = bh->b_this_page;
-		} while (bh != head);
-		if (uptodate)
-			SetPageUptodate(page);
-		end_page_writeback(page);
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(tree, page);
+	if (ret == 1) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
 	}
-	return err;
-
-recover:
-	/*
-	 * ENOSPC, or some other error.  We may already have added some
-	 * blocks to the file, so we need to write these out to avoid
-	 * exposing stale data.
-	 * The page is currently locked and not marked for writeback
-	 */
-	bh = head;
-	/* Recovery: lock and submit the mapped buffers */
-	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
-			lock_buffer(bh);
-			mark_buffer_async_write(bh);
-		} else {
-			/*
-			 * The buffer may have been set dirty during
-			 * attachment to a dirty page.
-			 */
-			clear_buffer_dirty(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-	SetPageError(page);
-	BUG_ON(PageWriteback(page));
-	set_page_writeback(page);
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			clear_buffer_dirty(bh);
-			submit_bh(WRITE, bh);
-			nr_underway++;
-		}
-		bh = next;
-	} while (bh != head);
-	unlock_page(page);
-	goto done;
+	return ret;
 }
 
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
+static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct inode * const inode = page->mapping->host;
-	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	unsigned offset;
-	void *kaddr;
-
-	/* Is the page fully inside i_size? */
-	if (page->index < end_index)
-		return __btrfs_write_full_page(inode, page, wbc);
-
-	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
-	if (page->index >= end_index+1 || !offset) {
-		/*
-		 * The page may have dirty, unmapped buffers.  For example,
-		 * they may have been added in ext3_writepage().  Make them
-		 * freeable here, so the page does not leak.
-		 */
-		block_invalidatepage(page, 0);
-		unlock_page(page);
-		return 0; /* don't care */
-	}
+	struct extent_map_tree *tree;
 
-	/*
-	 * The page straddles i_size.  It must be zeroed out on each and every
-	 * writepage invokation because it may be mmapped.  "A file is mapped
-	 * in multiples of the page size.  For a file that is not a multiple of
-	 * the  page size, the remaining memory is zeroed when mapped, and
-	 * writes to that region are not written out to the file."
-	 */
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	return __btrfs_write_full_page(inode, page, wbc);
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	extent_invalidatepage(tree, page, offset);
+	btrfs_releasepage(page, GFP_NOFS);
 }
 
 /*
@@ -1905,28 +1698,39 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
 	unsigned long end;
 	loff_t size;
 	int ret = -EINVAL;
+	u64 page_start;
 
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
+	page_start = page->index << PAGE_CACHE_SHIFT;
+
 	if ((page->mapping != inode->i_mapping) ||
-	    ((page->index << PAGE_CACHE_SHIFT) > size)) {
+	    (page_start > size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
 
 	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
+	if (page_start + PAGE_CACHE_SIZE > size)
 		end = size & ~PAGE_CACHE_MASK;
 	else
 		end = PAGE_CACHE_SIZE;
 
-	ret = btrfs_prepare_write(NULL, page, 0, end);
-	if (!ret)
-		ret = btrfs_commit_write(NULL, page, 0, end);
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	ret = btrfs_cow_one_page(trans, inode, page, end);
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
+			 page_start, page_start + PAGE_CACHE_SIZE - 1,
+			 GFP_NOFS);
+	set_page_dirty(page);
 
 out_unlock:
 	unlock_page(page);
@@ -1962,21 +1766,8 @@ static void btrfs_truncate(struct inode *inode)
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to)
 {
-	struct inode *inode = page->mapping->host;
-	struct buffer_head *bh;
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	SetPageUptodate(page);
-	bh = page_buffers(page);
-	set_buffer_uptodate(bh);
-	if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-		set_page_dirty(page);
-	}
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
+	return extent_commit_write(&BTRFS_I(page->mapping->host)->extent_tree,
+				   page->mapping->host, page, from, to);
 }
 
 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
@@ -2471,6 +2262,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+				     inode->i_mapping, GFP_NOFS);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2553,6 +2346,9 @@ static struct address_space_operations btrfs_aops = {
 	.prepare_write	= btrfs_prepare_write,
 	.commit_write	= btrfs_commit_write,
 	.bmap		= btrfs_bmap,
+	.invalidatepage = btrfs_invalidatepage,
+	.releasepage	= btrfs_releasepage,
+	.set_page_dirty	= __set_page_dirty_nobuffers,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74f3de47423..7b7c32c73a2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -185,6 +185,7 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_init_cachep();
 	if (err)
 		return err;
+	extent_map_init();
 	return register_filesystem(&btrfs_fs_type);
 }
 
@@ -192,6 +193,7 @@ static void __exit exit_btrfs_fs(void)
 {
 	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
+	extent_map_exit();
 	unregister_filesystem(&btrfs_fs_type);
 }
 
-- 
cgit v1.2.3


From b888db2bd7b67f190b32934e6a86181f262ac3ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 27 Aug 2007 16:49:44 -0400
Subject: Btrfs: Add delayed allocation to the extent based page tree code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |   9 ---
 fs/btrfs/extent_map.c  | 147 +++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/extent_map.h  |   3 +
 fs/btrfs/file.c        |  50 +++++++----------
 fs/btrfs/inode.c       | 141 ++++++++++++++++++++++++++---------------------
 fs/btrfs/super.c       |   1 +
 6 files changed, 216 insertions(+), 135 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4049aadbeda..c31e84d4265 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -39,7 +39,6 @@ static int cache_block_group(struct btrfs_root *root,
 	u64 i;
 	u64 last = 0;
 	u64 hole_size;
-	u64 limit;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
@@ -62,7 +61,6 @@ static int cache_block_group(struct btrfs_root *root,
 		return ret;
 	if (ret && path->slots[0] > 0)
 		path->slots[0]--;
-	limit = block_group->key.objectid + block_group->key.offset;
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
@@ -982,7 +980,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
-	u64 limit;
 
 	WARN_ON(num_blocks < 1);
 	ins->flags = 0;
@@ -1049,12 +1046,6 @@ check_failed:
 		l = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(&l->header)) {
-			if (start_found)
-				limit = last_block +
-					(block_group->key.offset >> 1);
-			else
-				limit = search_start +
-					(block_group->key.offset >> 1);
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index d378edf0964..a9c7419615b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -30,9 +30,6 @@ struct tree_entry {
 
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
-static LIST_HEAD(all_states);
-spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
-
 void __init extent_map_init(void)
 {
 	extent_map_cache = kmem_cache_create("extent_map",
@@ -49,15 +46,6 @@ void __init extent_map_init(void)
 
 void __exit extent_map_exit(void)
 {
-	while(!list_empty(&all_states)) {
-		struct extent_state *state;
-		struct list_head *cur = all_states.next;
-		state = list_entry(cur, struct extent_state, list);
-		printk("found leaked state %Lu %Lu state %d in_tree %d\n",
-		       state->start, state->end, state->state, state->in_tree);
-		list_del(&state->list);
-		kfree(state);
-	}
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
 	if (extent_state_cache)
@@ -69,6 +57,7 @@ void extent_map_tree_init(struct extent_map_tree *tree,
 {
 	tree->map.rb_node = NULL;
 	tree->state.rb_node = NULL;
+	tree->fill_delalloc = NULL;
 	rwlock_init(&tree->lock);
 	tree->mapping = mapping;
 }
@@ -106,9 +95,6 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 	state->in_tree = 0;
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
-	spin_lock_irq(&state_lock);
-	list_add(&state->list, &all_states);
-	spin_unlock_irq(&state_lock);
 	return state;
 }
 EXPORT_SYMBOL(alloc_extent_state);
@@ -117,9 +103,6 @@ void free_extent_state(struct extent_state *state)
 {
 	if (atomic_dec_and_test(&state->refs)) {
 		WARN_ON(state->in_tree);
-		spin_lock_irq(&state_lock);
-		list_del_init(&state->list);
-		spin_unlock_irq(&state_lock);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -369,7 +352,7 @@ static int insert_state(struct extent_map_tree *tree,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
 		free_extent_state(state);
 		return -EEXIST;
 	}
@@ -408,7 +391,7 @@ static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -792,10 +775,20 @@ int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+/* set both EXTENT_DIRTY and EXTENT_DELALLOC on the given range */
+int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC,
+			      0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_delalloc);
+
 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, mask);
+	return clear_extent_bit(tree, start, end,
+				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
 }
 EXPORT_SYMBOL(clear_extent_dirty);
 
@@ -922,6 +915,62 @@ int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(set_range_writeback);
 
+/*
+ * find a contiguous run of EXTENT_DELALLOC states beginning at start, lock
+ * those at or after lock_start, and store the end of the last one in *end.
+ * returns the number of states found; stops once max_bytes are covered.
+ */
+u64 find_lock_delalloc_range(struct extent_map_tree *tree,
+			     u64 start, u64 lock_start, u64 *end, u64 max_bytes)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = start;
+	u64 found = 0;
+	u64 total_bytes = 0;
+
+	write_lock_irq(&tree->lock);
+search_again:
+	/* this search finds the extents that end after cur_start */
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node))
+		goto out;
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start != cur_start ||
+		    !(state->state & EXTENT_DELALLOC))
+			goto out;
+		if (state->start >= lock_start) {
+			if (state->state & EXTENT_LOCKED) {
+				DEFINE_WAIT(wait);
+				atomic_inc(&state->refs);
+				/* get on the waitqueue before unlocking */
+				prepare_to_wait(&state->wq, &wait,
+						TASK_UNINTERRUPTIBLE);
+				write_unlock_irq(&tree->lock);
+				schedule();
+				write_lock_irq(&tree->lock);
+				finish_wait(&state->wq, &wait);
+				free_extent_state(state);
+				goto search_again;
+			}
+			state->state |= EXTENT_LOCKED;
+		}
+		found++;
+		*end = state->end;
+		cur_start = state->end + 1;
+		node = rb_next(node);
+		if (!node)
+			break;
+		total_bytes += state->end - state->start + 1;
+		if (total_bytes >= max_bytes)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return found;
+}
+
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
@@ -1285,6 +1334,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		set_page_private(page, 1);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
 		page_cache_get(page);
 	}
 
@@ -1384,7 +1434,10 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 	size_t blocksize;
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	u64 nr_delalloc;
+	u64 delalloc_end;
 
+	WARN_ON(!PageLocked(page));
 	if (page->index > end_index) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
 		unlock_page(page);
@@ -1400,11 +1453,34 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		set_page_private(page, 1);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
 		page_cache_get(page);
 	}
 
-	end = page_end;
 	lock_extent(tree, start, page_end, GFP_NOFS);
+	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
+					       &delalloc_end,
+					       128 * 1024 * 1024);
+	if (nr_delalloc) {
+		tree->fill_delalloc(inode, start, delalloc_end);
+		if (delalloc_end >= page_end + 1) {
+			clear_extent_bit(tree, page_end + 1, delalloc_end,
+					 EXTENT_LOCKED | EXTENT_DELALLOC,
+					 1, 0, GFP_NOFS);
+		}
+		clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
+				 0, 0, GFP_NOFS);
+		if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
+			printk("found delalloc bits after clear extent_bit\n");
+		}
+	} else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
+		printk("found delalloc bits after find_delalloc_range returns 0\n");
+	}
+
+	end = page_end;
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
+		printk("found delalloc bits after lock_extent\n");
+	}
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
@@ -1419,7 +1495,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
 			break;
 		}
-		em = get_extent(inode, page, page_offset, cur, end, 1);
+		em = get_extent(inode, page, page_offset, cur, end, 0);
 		if (IS_ERR(em) || !em) {
 			SetPageError(page);
 			break;
@@ -1507,6 +1583,7 @@ int extent_commit_write(struct extent_map_tree *tree,
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		set_page_private(page, 1);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
 		page_cache_get(page);
 	}
 
@@ -1543,6 +1620,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		set_page_private(page, 1);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
 		page_cache_get(page);
 	}
 	block_start = (page_start + from) & ~((u64)blocksize - 1);
@@ -1628,29 +1706,28 @@ int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
 	u64 start = page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	u64 orig_start = start;
+	int ret = 1;
 
 	while (start <= end) {
 		em = lookup_extent_mapping(tree, start, end);
 		if (!em || IS_ERR(em))
 			break;
-		if (test_range_bit(tree, em->start, em->end,
-				   EXTENT_LOCKED, 0)) {
+		if (!test_range_bit(tree, em->start, em->end,
+				    EXTENT_LOCKED, 0)) {
+			remove_extent_mapping(tree, em);
+			/* once for the rb tree */
 			free_extent_map(em);
-			start = em->end + 1;
-printk("range still locked %Lu %Lu\n", em->start, em->end);
-			break;
 		}
-		remove_extent_mapping(tree, em);
 		start = em->end + 1;
-		/* once for the rb tree */
-		free_extent_map(em);
 		/* once for us */
 		free_extent_map(em);
 	}
-	WARN_ON(test_range_bit(tree, orig_start, end, EXTENT_WRITEBACK, 0));
-	clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-			 1, 1, GFP_NOFS);
-	return 1;
+	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
+		ret = 0;
+	else
+		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
+				 1, 1, GFP_NOFS);
+	return ret;
 }
 EXPORT_SYMBOL(try_release_extent_mapping);
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 108944aab4b..e91a2e9619e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -11,6 +11,7 @@ struct extent_map_tree {
 	struct rb_root state;
 	struct address_space *mapping;
 	rwlock_t lock;
+	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 };
 
 /* note, this must start with the same fields as fs/extent_map.c:tree_entry */
@@ -74,6 +75,8 @@ int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int extent_invalidatepage(struct extent_map_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 71a481894ab..d3d39e4a279 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -143,7 +143,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct btrfs_key ins;
 	u64 hint_block;
 	u64 num_blocks;
 	u64 start_pos;
@@ -162,6 +161,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			inode->i_blkbits;
 
 	end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
+	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
@@ -179,16 +179,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	/* FIXME...EIEIO, ENOSPC and more */
 
-	/* step one, delete the existing extents in this range */
-	/* FIXME blocksize != pagesize */
-	if (start_pos < inode->i_size) {
-		err = btrfs_drop_extents(trans, root, inode,
-			 start_pos, (pos + write_bytes + root->blocksize -1) &
-			 ~((u64)root->blocksize - 1), &hint_block);
-		if (err)
-			goto failed;
-	}
-
 	/* insert any holes we need to create */
 	if (inode->i_size < start_pos) {
 		u64 last_pos_in_file;
@@ -213,29 +203,28 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
 	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		err = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 num_blocks, 0, hint_block, (u64)-1,
-					 &ins, 1);
-		BUG_ON(err);
-		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start_pos, ins.objectid, ins.offset,
-				       ins.offset);
-		BUG_ON(err);
-		em->start = start_pos;
-		em->end = end_of_last_block;
-		em->block_start = ins.objectid << inode->i_blkbits;
-		em->block_end = em->block_start +
-			(ins.offset << inode->i_blkbits) - 1;
-		set_extent_dirty(em_tree, start_pos, end_of_last_block,
-				 GFP_NOFS);
-		err = add_extent_mapping(em_tree, em);
+		u64 last_end;
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
-			__set_page_dirty_nobuffers(p);
+			set_page_dirty(p);
 		}
+		last_end = pages[num_pages -1]->index << PAGE_CACHE_SHIFT;
+		last_end += PAGE_CACHE_SIZE - 1;
+		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
+				 GFP_NOFS);
 	} else {
 		struct page *p = pages[0];
+		/* step one, delete the existing extents in this range */
+		/* FIXME blocksize != pagesize */
+		if (start_pos < inode->i_size) {
+			err = btrfs_drop_extents(trans, root, inode, start_pos,
+				 (pos + write_bytes + root->blocksize -1) &
+				 ~((u64)root->blocksize - 1), &hint_block);
+			if (err)
+				goto failed;
+		}
+
 		err = insert_inline_extent(trans, root, inode, start_pos,
 					   end_pos - start_pos, p, 0);
 		BUG_ON(err);
@@ -253,6 +242,7 @@ failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
+	unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	free_extent_map(em);
 	return err;
 }
@@ -531,7 +521,7 @@ static int prepare_pages(struct btrfs_root *root,
 	u64 num_blocks;
 	u64 start_pos;
 
-	start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+	start_pos = pos & ~((u64)root->blocksize - 1);
 	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
 			inode->i_blkbits;
 
@@ -548,8 +538,10 @@ static int prepare_pages(struct btrfs_root *root,
 		if (!PagePrivate(pages[i])) {
 			SetPagePrivate(pages[i]);
 			set_page_private(pages[i], 1);
+			WARN_ON(!pages[i]->mapping->a_ops->invalidatepage);
 			page_cache_get(pages[i]);
 		}
+		WARN_ON(!PageLocked(pages[i]));
 	}
 	return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd4cb65fcf1..dc181089aa7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -70,6 +70,40 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
+/* fill_delalloc hook: replace the extents at start..end with one new extent */
+static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key ins;
+	u64 alloc_hint = 0;
+	u64 num_blocks;
+	int ret;
+	u64 blocksize = 1 << inode->i_blkbits;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	/* check the handle before anything gets to use it */
+	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
+	num_blocks = (end - start + blocksize) & ~(blocksize - 1);
+	ret = btrfs_drop_extents(trans, root, inode,
+				 start, start + num_blocks, &alloc_hint);
+	num_blocks = num_blocks >> inode->i_blkbits;
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_blocks, 0,
+				 alloc_hint, (u64)-1, &ins, 1);
+	if (ret) {
+		WARN_ON(1);
+		goto out;
+	}
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino, start,
+				       ins.objectid, ins.offset, ins.offset);
+out:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -121,8 +155,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
-				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -450,11 +483,17 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		if (item_end < inode->i_size) {
-			if (found_type) {
-				btrfs_set_key_type(&key, found_type - 1);
-				continue;
+			if (found_type == BTRFS_DIR_ITEM_KEY) {
+				found_type = BTRFS_INODE_ITEM_KEY;
+			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
+				found_type = BTRFS_CSUM_ITEM_KEY;
+			} else if (found_type) {
+				found_type--;
+			} else {
+				break;
 			}
-			break;
+			btrfs_set_key_type(&key, found_type - 1);
+			continue;
 		}
 		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
 			del_item = 1;
@@ -514,47 +553,34 @@ error:
 	return ret;
 }
 
-static int btrfs_cow_one_page(struct btrfs_trans_handle *trans,
-			      struct inode *inode, struct page *page,
+static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 			      size_t zero_start)
 {
 	char *kaddr;
 	int ret = 0;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 alloc_hint = 0;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 page_start = page->index << PAGE_CACHE_SHIFT;
-	struct btrfs_key ins;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		set_page_private(page, 1);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
 		page_cache_get(page);
 	}
 
-	btrfs_set_trans_block_group(trans, inode);
-
-	ret = btrfs_drop_extents(trans, root, inode,
-				 page_start, page_start + PAGE_CACHE_SIZE,
-				 &alloc_hint);
-	if (ret)
-		goto out;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
-				 alloc_hint, (u64)-1, &ins, 1);
-	if (ret)
-		goto out;
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       page_start, ins.objectid, 1, 1);
-	if (ret)
-		goto out;
-	SetPageChecked(page);
-	kaddr = kmap(page);
+	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start,
+			    page_end, GFP_NOFS);
 	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
 		flush_dcache_page(page);
+		kunmap(page);
 	}
-	kunmap(page);
+	set_page_dirty(page);
+	unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
 
-out:
 	return ret;
 }
 
@@ -565,8 +591,6 @@ out:
 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	unsigned blocksize = 1 << inode->i_blkbits;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
@@ -591,21 +615,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	}
 	page_start = page->index << PAGE_CACHE_SHIFT;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_cow_one_page(trans, inode, page, offset);
-	if (!ret) {
-		char *kaddr = kmap(page);
-		ret = btrfs_csum_file_block(trans, root, inode->i_ino,
-					    page_start, kaddr, PAGE_CACHE_SIZE);
-		kunmap(page);
-	}
-	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
-			 page_start, page_start + PAGE_CACHE_SIZE - 1,
-			 GFP_NOFS);
-	set_page_dirty(page);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	ret = btrfs_cow_one_page(inode, page, offset);
 
 	unlock_page(page);
 	page_cache_release(page);
@@ -757,6 +767,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+			     inode->i_mapping, GFP_NOFS);
 	return 0;
 }
 
@@ -968,7 +980,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+			     inode->i_mapping, GFP_NOFS);
 	BTRFS_I(inode)->root = root;
+
 	if (mode & S_IFDIR)
 		owner = 0;
 	else
@@ -1128,6 +1143,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -1344,9 +1360,11 @@ again:
 		extent_end = extent_start +
 		       (btrfs_file_extent_num_blocks(item) << inode->i_blkbits);
 		err = 0;
-		if (start < extent_start || start > extent_end) {
+		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
+				if (end < extent_start)
+					goto not_found;
 				em->end = extent_end - 1;
 			} else {
 				em->end = end;
@@ -1375,9 +1393,11 @@ again:
 		size = btrfs_file_extent_inline_len(leaf->items +
 						    path->slots[0]);
 		extent_end = extent_start + size;
-		if (start < extent_start || start > extent_end) {
+		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
+				if (end < extent_start)
+					goto not_found;
 				em->end = extent_end - 1;
 			} else {
 				em->end = end;
@@ -1412,8 +1432,7 @@ not_found_em:
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || em->end < start) {
-		printk("bad extent! %Lu %Lu start %Lu end %Lu\n", em->start, em->end, start, end);
-		WARN_ON(1);
+		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->end, start, end);
 		err = -EIO;
 		goto out;
 	}
@@ -1648,6 +1667,13 @@ int btrfs_readpage(struct file *file, struct page *page)
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_map_tree *tree;
+
+	/* PF_MEMALLOC: don't write from here, redirty and unlock the page */
+	if (current->flags & PF_MEMALLOC) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
 	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
@@ -1698,8 +1724,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	unsigned long end;
 	loff_t size;
 	int ret = -EINVAL;
@@ -1722,15 +1746,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	else
 		end = PAGE_CACHE_SIZE;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_cow_one_page(trans, inode, page, end);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	set_extent_dirty(&BTRFS_I(inode)->extent_tree,
-			 page_start, page_start + PAGE_CACHE_SIZE - 1,
-			 GFP_NOFS);
-	set_page_dirty(page);
+	ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
 	unlock_page(page);
@@ -2264,6 +2280,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7b7c32c73a2..7da95538745 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -83,6 +83,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	bi->location.offset = 0;
 	bi->location.flags = 0;
 	bi->root = tree_root;
+
 	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
 	if (!inode) {
-- 
cgit v1.2.3


From 58176a9604c5db1784d2c979aea472b3be40b6f0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Aug 2007 15:47:34 -0400
Subject: Btrfs: Add per-root block accounting and sysfs entries

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.h       |  54 +++++++++++
 fs/btrfs/disk-io.c     |  31 ++++++-
 fs/btrfs/disk-io.h     |   3 +-
 fs/btrfs/extent-tree.c |  17 +++-
 fs/btrfs/inode.c       |  13 ++-
 fs/btrfs/super.c       |  14 +++
 fs/btrfs/sysfs.c       | 236 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/transaction.c |  34 ++++++-
 9 files changed, 393 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ea7a2287035..ee505c87ecf 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o
+	   extent_map.o sysfs.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 947f061ed11..ffeee546a0d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#include <linux/completion.h>
 #include "bit-radix.h"
 
 struct btrfs_trans_handle;
@@ -313,6 +314,8 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct list_head dead_roots;
 	struct delayed_work trans_work;
+	struct kobject super_kobj;
+	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 };
@@ -328,6 +331,8 @@ struct btrfs_root {
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
 	struct inode *inode;
+	struct kobject root_kobj;
+	struct completion kobj_unregister;
 	u64 objectid;
 	u64 last_trans;
 	u32 blocksize;
@@ -338,6 +343,7 @@ struct btrfs_root {
 	struct btrfs_key defrag_progress;
 	int defrag_running;
 	int defrag_level;
+	char *name;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -814,6 +820,28 @@ static inline void btrfs_set_root_flags(struct btrfs_root_item *item, u32 val)
 	item->flags = cpu_to_le32(val);
 }
 
+static inline void btrfs_set_root_blocks_used(struct btrfs_root_item *item,
+						   u64 val)
+{
+	item->blocks_used = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_root_blocks_used(struct btrfs_root_item *item)
+{
+	return le64_to_cpu(item->blocks_used);
+}
+
+static inline void btrfs_set_root_block_limit(struct btrfs_root_item *item,
+						u64 val)
+{
+	item->block_limit = cpu_to_le64(val);
+}
+
+static inline u64 btrfs_root_block_limit(struct btrfs_root_item *item)
+{
+	return le64_to_cpu(item->block_limit);
+}
+
 static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s)
 {
 	return le64_to_cpu(s->blocknr);
@@ -1014,6 +1042,23 @@ static inline void btrfs_memmove(struct btrfs_root *root,
 	memmove(dst, src, nr);
 }
 
+/* give the root a private, NUL terminated copy of name; 0 or -ENOMEM */
+static inline int btrfs_set_root_name(struct btrfs_root *root,
+				      const char *name, int len)
+{
+	/* drop any previous name, kfree(NULL) is a no-op */
+	kfree(root->name);
+
+	root->name = kmalloc(len + 1, GFP_KERNEL);
+	if (!root->name)
+		return -ENOMEM;
+
+	memcpy(root->name, name, len);
+	root->name[len] = '\0';
+
+	return 0;
+}
+
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
@@ -1191,4 +1236,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only);
+
+/* sysfs.c */
+int btrfs_init_sysfs(void);
+void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
+int btrfs_sysfs_add_root(struct btrfs_root *root);
+void btrfs_sysfs_del_root(struct btrfs_root *root);
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs);
+
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b2f79878d51..c25ef0a68f1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -294,9 +294,12 @@ static int __setup_root(int blocksize,
 	root->last_trans = 0;
 	root->highest_inode = 0;
 	root->last_inode_alloc = 0;
+	root->name = NULL;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
+	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
@@ -384,7 +387,8 @@ insert:
 }
 
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-				      struct btrfs_key *location)
+				      struct btrfs_key *location,
+				      const char *name, int namelen)
 {
 	struct btrfs_root *root;
 	int ret;
@@ -405,6 +409,22 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+
+	ret = btrfs_set_root_name(root, name, namelen);
+	if (ret) {
+		brelse(root->node);
+		kfree(root);
+		return ERR_PTR(ret);
+	}
+
+	ret = btrfs_sysfs_add_root(root);
+	if (ret) {
+		brelse(root->node);
+		kfree(root->name);
+		kfree(root);
+		return ERR_PTR(ret);
+	}
+
 	return root;
 }
 
@@ -433,6 +453,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
+	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
+	init_completion(&fs_info->kobj_unregister);
 	sb_set_blocksize(sb, 4096);
 	fs_info->running_transaction = NULL;
 	fs_info->last_trans_committed = 0;
@@ -500,8 +522,10 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	ret = btrfs_find_dead_roots(tree_root);
-	if (ret)
+	if (ret) {
+		mutex_unlock(&fs_info->fs_mutex);
 		goto fail_tree_root;
+	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
@@ -553,12 +577,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
+	btrfs_sysfs_del_root(root);
 	if (root->inode)
 		iput(root->inode);
 	if (root->node)
 		brelse(root->node);
 	if (root->commit_root)
 		brelse(root->commit_root);
+	if (root->name)
+		kfree(root->name);
 	kfree(root);
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5261733b873..da6bb72750f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -66,7 +66,8 @@ struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
 int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-				      struct btrfs_key *location);
+				      struct btrfs_key *location,
+				      const char *name, int namelen);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 					       struct btrfs_key *location);
 u64 bh_blocknr(struct buffer_head *bh);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c31e84d4265..ff3f7c2be60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -858,16 +858,23 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_extent_refs(ei, refs);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (refs == 0) {
-		u64 super_blocks_used;
+		u64 super_blocks_used, root_blocks_used;
 
 		if (pin) {
 			ret = pin_down_block(root, blocknr, 0);
 			BUG_ON(ret);
 		}
 
+		/* block accounting for super block */
 		super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
 		btrfs_set_super_blocks_used(&info->super_copy,
 					    super_blocks_used - num_blocks);
+
+		/* block accounting for root item */
+		root_blocks_used = btrfs_root_blocks_used(&root->root_item);
+		btrfs_set_root_blocks_used(&root->root_item,
+					   root_blocks_used - num_blocks);
+
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret) {
 			return ret;
@@ -1175,7 +1182,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	int pending_ret;
-	u64 super_blocks_used;
+	u64 super_blocks_used, root_blocks_used;
 	u64 search_start = 0;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
@@ -1193,10 +1200,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
+	/* block accounting for super block */
 	super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
 	btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used +
 				    num_blocks);
 
+	/* block accounting for root item */
+	root_blocks_used = btrfs_root_blocks_used(&root->root_item);
+	btrfs_set_root_blocks_used(&root->root_item, root_blocks_used +
+				   num_blocks);
+
 	if (root == extent_root) {
 		BUG_ON(num_blocks != 1);
 		set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dc181089aa7..2e3918e6049 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -733,7 +733,8 @@ out:
  */
 static int fixup_tree_root_location(struct btrfs_root *root,
 			     struct btrfs_key *location,
-			     struct btrfs_root **sub_root)
+			     struct btrfs_root **sub_root,
+			     struct dentry *dentry)
 {
 	struct btrfs_path *path;
 	struct btrfs_root_item *ri;
@@ -747,7 +748,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
 
-	*sub_root = btrfs_read_fs_root(root->fs_info, location);
+	*sub_root = btrfs_read_fs_root(root->fs_info, location,
+					dentry->d_name.name,
+					dentry->d_name.len);
 	if (IS_ERR(*sub_root))
 		return PTR_ERR(*sub_root);
 
@@ -812,7 +815,8 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		return ERR_PTR(ret);
 	inode = NULL;
 	if (location.objectid) {
-		ret = fixup_tree_root_location(root, &location, &sub_root);
+		ret = fixup_tree_root_location(root, &location, &sub_root,
+						dentry);
 		if (ret < 0)
 			return ERR_PTR(ret);
 		if (ret > 0)
@@ -1829,6 +1833,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
 	btrfs_set_root_refs(&root_item, 1);
+	btrfs_set_root_blocks_used(&root_item, 0);
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
 	root_item.drop_level = 0;
 	brelse(subvol);
@@ -1865,7 +1870,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail_commit;
 
-	new_root = btrfs_read_fs_root(root->fs_info, &key);
+	new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
 	BUG_ON(!new_root);
 
 	trans = btrfs_start_transaction(new_root, 1);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7da95538745..a68101ae1a3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -45,12 +45,14 @@ static struct super_operations btrfs_super_ops;
 static void btrfs_put_super (struct super_block * sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
+	struct btrfs_fs_info *fs = root->fs_info;
 	int ret;
 
 	ret = close_ctree(root);
 	if (ret) {
 		printk("close ctree returns %d\n", ret);
 	}
+	btrfs_sysfs_del_super(fs);
 	sb->s_fs_info = NULL;
 }
 
@@ -101,6 +103,12 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 		err = -ENOMEM;
 		goto fail_close;
 	}
+
+	/* this does the super kobj at the same time */
+	err = btrfs_sysfs_add_super(tree_root->fs_info);
+	if (err)
+		goto fail_close;
+
 	sb->s_root = root_dentry;
 	btrfs_transaction_queue_work(tree_root, HZ * 30);
 	return 0;
@@ -182,6 +190,11 @@ static struct super_operations btrfs_super_ops = {
 static int __init init_btrfs_fs(void)
 {
 	int err;
+
+	err = btrfs_init_sysfs();
+	if (err)
+		return err;
+
 	btrfs_init_transaction_sys();
 	err = btrfs_init_cachep();
 	if (err)
@@ -196,6 +209,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_destroy_cachep();
 	extent_map_exit();
 	unregister_filesystem(&btrfs_fs_type);
+	btrfs_exit_sysfs();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index db8917e5b25..2058783373e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -16,6 +16,242 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+
+static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)btrfs_root_blocks_used(&root->root_item));
+}
+
+static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)btrfs_root_block_limit(&root->root_item));
+}
+
+static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)btrfs_super_blocks_used(fs->disk_super));
+}
+
+static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)btrfs_super_total_blocks(fs->disk_super));
+}
+
+static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long)btrfs_super_blocksize(fs->disk_super));
+}
+
+/* this is for root attrs (subvols/snapshots) */
+struct btrfs_root_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct btrfs_root *, char *);
+	ssize_t (*store)(struct btrfs_root *, const char *, size_t);
+};
+
+#define ROOT_ATTR(name, mode, show, store) \
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
+
+ROOT_ATTR(blocks_used,	0444,	root_blocks_used_show,	NULL);
+ROOT_ATTR(block_limit,	0644,	root_block_limit_show,	NULL);
+
+static struct attribute *btrfs_root_attrs[] = {
+	&btrfs_root_attr_blocks_used.attr,
+	&btrfs_root_attr_block_limit.attr,
+	NULL,
+};
+
+/* this is for super attrs (actual full fs) */
+struct btrfs_super_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct btrfs_fs_info *, char *);
+	ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
+};
+
+#define SUPER_ATTR(name, mode, show, store) \
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
+
+SUPER_ATTR(blocks_used,		0444,	super_blocks_used_show,		NULL);
+SUPER_ATTR(total_blocks,	0444,	super_total_blocks_show,	NULL);
+SUPER_ATTR(blocksize,		0444,	super_blocksize_show,		NULL);
+
+static struct attribute *btrfs_super_attrs[] = {
+	&btrfs_super_attr_blocks_used.attr,
+	&btrfs_super_attr_total_blocks.attr,
+	&btrfs_super_attr_blocksize.attr,
+	NULL,
+};
+
+static ssize_t btrfs_super_attr_show(struct kobject *kobj,
+				    struct attribute *attr, char *buf)
+{
+	struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+						super_kobj);
+	struct btrfs_super_attr *a = container_of(attr,
+						  struct btrfs_super_attr,
+						  attr);
+
+	return a->show ? a->show(fs, buf) : 0;
+}
+
+/* dispatch a sysfs write; -EIO (not 0, which writers retry) if no store */
+static ssize_t btrfs_super_attr_store(struct kobject *kobj,
+				     struct attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+						super_kobj);
+	struct btrfs_super_attr *a = container_of(attr,
+					struct btrfs_super_attr, attr);
+
+	return a->store ? a->store(fs, buf, len) : -EIO;
+}
+
+static ssize_t btrfs_root_attr_show(struct kobject *kobj,
+				    struct attribute *attr, char *buf)
+{
+	struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+						root_kobj);
+	struct btrfs_root_attr *a = container_of(attr,
+						 struct btrfs_root_attr,
+						 attr);
+
+	return a->show ? a->show(root, buf) : 0;
+}
+
+/* dispatch a sysfs write; -EIO (not 0, which writers retry) if no store */
+static ssize_t btrfs_root_attr_store(struct kobject *kobj,
+				     struct attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+						root_kobj);
+	struct btrfs_root_attr *a = container_of(attr,
+					struct btrfs_root_attr, attr);
+	return a->store ? a->store(root, buf, len) : -EIO;
+}
+
+static void btrfs_super_release(struct kobject *kobj)
+{
+	struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
+						super_kobj);
+	complete(&fs->kobj_unregister);
+}
+
+static void btrfs_root_release(struct kobject *kobj)
+{
+	struct btrfs_root *root = container_of(kobj, struct btrfs_root,
+						root_kobj);
+	complete(&root->kobj_unregister);
+}
+
+static struct sysfs_ops btrfs_super_attr_ops = {
+	.show	= btrfs_super_attr_show,
+	.store	= btrfs_super_attr_store,
+};
+
+static struct sysfs_ops btrfs_root_attr_ops = {
+	.show	= btrfs_root_attr_show,
+	.store	= btrfs_root_attr_store,
+};
+
+static struct kobj_type btrfs_root_ktype = {
+	.default_attrs	= btrfs_root_attrs,
+	.sysfs_ops	= &btrfs_root_attr_ops,
+	.release	= btrfs_root_release,
+};
+
+static struct kobj_type btrfs_super_ktype = {
+	.default_attrs	= btrfs_super_attrs,
+	.sysfs_ops	= &btrfs_super_attr_ops,
+	.release	= btrfs_super_release,
+};
+
+static struct kset btrfs_kset = {
+	.kobj	= {.name = "btrfs"},
+};
+
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+{
+	int error;
+
+	fs->super_kobj.kset = &btrfs_kset;
+	fs->super_kobj.ktype = &btrfs_super_ktype;
+
+	error = kobject_set_name(&fs->super_kobj, "%s",
+				 fs->sb->s_id);
+	if (error)
+		goto fail;
+
+	error = kobject_register(&fs->super_kobj);
+	if (error)
+		goto fail;
+
+	return 0;
+
+fail:
+	printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
+	return error;
+}
+
+int btrfs_sysfs_add_root(struct btrfs_root *root)
+{
+	int error;
+
+	root->root_kobj.ktype = &btrfs_root_ktype;
+	root->root_kobj.parent = &root->fs_info->super_kobj;
+
+	error = kobject_set_name(&root->root_kobj, "%s", root->name);
+	if (error) {
+		goto fail;
+	}
+
+	error = kobject_register(&root->root_kobj);
+	if (error)
+		goto fail;
+
+	return 0;
+
+fail:
+	printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
+	return error;
+}
+
+void btrfs_sysfs_del_root(struct btrfs_root *root)
+{
+	kobject_unregister(&root->root_kobj);
+	wait_for_completion(&root->kobj_unregister);
+}
+
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
+{
+	kobject_unregister(&fs->super_kobj);
+	wait_for_completion(&fs->kobj_unregister);
+}
+
+int btrfs_init_sysfs(void)
+{
+	kobj_set_kset_s(&btrfs_kset, fs_subsys);
+	return kset_register(&btrfs_kset); /* nonzero aborts module init */
+}
+
+void btrfs_exit_sysfs(void)
+{
+	kset_unregister(&btrfs_kset);	/* pairs with btrfs_init_sysfs() */
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 18abea80279..2b15daa3a9f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -236,6 +236,7 @@ static int wait_for_commit(struct btrfs_root *root,
 struct dirty_root {
 	struct list_head list;
 	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
 };
 
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
@@ -278,6 +279,15 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 					btrfs_root_blocknr(&root->root_item));
 				brelse(root->commit_root);
 				root->commit_root = NULL;
+
+				/* make sure to update the root on disk
+				 * so we get any updates to the block used
+				 * counts
+				 */
+				err = btrfs_update_root(trans,
+						root->fs_info->tree_root,
+						&root->root_key,
+						&root->root_item);
 				continue;
 			}
 			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
@@ -291,6 +301,7 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 
 			memcpy(dirty->root, root, sizeof(*root));
 			dirty->root->node = root->commit_root;
+			dirty->latest_root = root;
 			root->commit_root = NULL;
 
 			root->root_key.offset = root->fs_info->generation;
@@ -384,20 +395,29 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 {
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
+	u64 num_blocks;
+	u64 blocks_used;
 	int ret = 0;
 	int err;
 
 	while(!list_empty(list)) {
+		struct btrfs_root *root;
+
 		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
+		num_blocks = btrfs_root_blocks_used(&dirty->root->root_item);
+		root = dirty->latest_root;
+
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
+
 			ret = btrfs_drop_snapshot(trans, dirty->root);
 			if (ret != -EAGAIN) {
 				break;
 			}
+
 			err = btrfs_update_root(trans,
 					tree_root,
 					&dirty->root->root_key,
@@ -414,9 +434,19 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 			mutex_lock(&tree_root->fs_info->fs_mutex);
 		}
 		BUG_ON(ret);
+
+		num_blocks -= btrfs_root_blocks_used(&dirty->root->root_item);
+		blocks_used = btrfs_root_blocks_used(&root->root_item);
+		if (num_blocks) {
+			record_root_in_trans(root);
+			btrfs_set_root_blocks_used(&root->root_item,
+						   blocks_used - num_blocks);
+		}
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
-		if (ret)
+		if (ret) {
+			BUG();
 			break;
+		}
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
@@ -534,10 +564,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
+
 	if (root->fs_info->closing)
 		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
 	else
 		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-- 
cgit v1.2.3


From 252c38f0697b6889c03542fa3c96143d3db4d9d8 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 29 Aug 2007 09:11:44 -0400
Subject: Btrfs: ctree.c cleanups

Fixup a few buffer_head release errors, and fix an off by one in
balance_node_right.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c0782a5b04c..89e282c35ce 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -266,8 +266,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
 					&tmp_bh, search_start,
 					min(8, end_slot - i));
-		if (err)
+		if (err) {
+			brelse(cur_bh);
 			break;
+		}
 		search_start = bh_blocknr(tmp_bh);
 		*last_ret = search_start;
 		if (parent_level == 1)
@@ -881,7 +883,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct buffer_head *b;
-	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
 	u64 blocknr;
 	int slot;
@@ -905,12 +906,11 @@ again:
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
-					       &cow_buf);
+					       &b);
 			if (wret) {
-				btrfs_block_release(root, cow_buf);
+				btrfs_block_release(root, b);
 				return wret;
 			}
-			b = cow_buf;
 			c = btrfs_buffer_node(b);
 		}
 		BUG_ON(!cow && ins_len);
@@ -1075,8 +1075,9 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
-	if (max_push > src_nritems)
+	if (max_push >= src_nritems)
 		return 1;
+
 	if (max_push < push_items)
 		push_items = max_push;
 
@@ -1465,6 +1466,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	ret = btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
+		btrfs_block_release(root, t);
 		return 1;
 	}
 	left = btrfs_buffer_leaf(t);
-- 
cgit v1.2.3


From 1b4ab1bb4bbd3045cd041259c30b826654599e96 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 29 Aug 2007 09:11:44 -0400
Subject: Btrfs: Fix mknod to properly send rdev info back to disk

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e3918e6049..483eea69b0a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1095,6 +1095,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	else {
 		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
+		btrfs_update_inode(trans, root, inode);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
-- 
cgit v1.2.3


From 4b82d6e4a592016ac75e756ca519923e0fc18f00 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 29 Aug 2007 09:11:44 -0400
Subject: Btrfs: Add mount into directory support

Modified form of original patch from Christoph Hellwig to make btrfs
mount into the default subvolume by default.

mount /dev/somedevice:subvolumename to get other subvolumes or
mount /dev/somedevice:. to get the root

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 120 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a68101ae1a3..8a94be1a773 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
@@ -26,6 +27,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
+#include <linux/mount.h>
 #include <linux/mpage.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
@@ -145,11 +147,126 @@ static void btrfs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 }
 
+/*
+ * This is almost a copy of get_sb_bdev in fs/super.c.
+ * We need the local copy to allow direct mounting of
+ * subvolumes, but this could be easily integrated back
+ * into the generic version.  --hch
+ */
+
+/* start copy & paste */
+static int set_bdev_super(struct super_block *s, void *data)
+{
+	s->s_bdev = data;
+	s->s_dev = s->s_bdev->bd_dev;
+	return 0;
+}
+
+static int test_bdev_super(struct super_block *s, void *data)
+{
+	return (void *)s->s_bdev == data;
+}
+
+/* like get_sb_bdev(), but mnt_root is the @subvol entry under s_root if set */
+int btrfs_get_sb_bdev(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data,
+	int (*fill_super)(struct super_block *, void *, int),
+	struct vfsmount *mnt, const char *subvol)
+{
+	struct block_device *bdev = NULL;
+	struct super_block *s;
+	struct dentry *root;
+	int error = 0;
+
+	bdev = open_bdev_excl(dev_name, flags, fs_type);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	/*
+	 * once the super is inserted into the list by sget, s_umount
+	 * will protect the lockfs code from trying to start a snapshot
+	 * while we are mounting
+	 */
+	down(&bdev->bd_mount_sem);
+	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
+	up(&bdev->bd_mount_sem);
+	if (IS_ERR(s))
+		goto error_s;
+
+	if (s->s_root) {
+		if ((flags ^ s->s_flags) & MS_RDONLY) {
+			up_write(&s->s_umount);
+			deactivate_super(s);
+			error = -EBUSY;
+			goto error_bdev;
+		}
+
+		close_bdev_excl(bdev);
+	} else {
+		char b[BDEVNAME_SIZE];
+
+		s->s_flags = flags;
+		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		sb_set_blocksize(s, block_size(bdev));
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error)
+			goto error_super;
+
+		s->s_flags |= MS_ACTIVE;
+	}
+
+	if (subvol) {
+		/* lookup_one_len() expects the parent's i_mutex to be held */
+		mutex_lock(&s->s_root->d_inode->i_mutex);
+		root = lookup_one_len(subvol, s->s_root, strlen(subvol));
+		mutex_unlock(&s->s_root->d_inode->i_mutex);
+		if (IS_ERR(root)) {
+			error = PTR_ERR(root);
+			goto error_super;
+		}
+		if (!root->d_inode) {
+			dput(root);
+			error = -ENXIO;
+			goto error_super;
+		}
+	} else {
+		root = dget(s->s_root);
+	}
+
+	mnt->mnt_sb = s;
+	mnt->mnt_root = root;
+	return 0;
+
+error_super:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return error;
+error_s:
+	error = PTR_ERR(s);
+error_bdev:
+	close_bdev_excl(bdev);
+	return error;
+}
+/* end copy & paste */
+
 static int btrfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+	int flags, const char *identifier, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data,
-			   btrfs_fill_super, mnt);
+	int ret;
+	char *_identifier = kstrdup(identifier, GFP_KERNEL);
+	char *subvol_name;
+	const char *dev_name;
+
+	subvol_name = _identifier;
+	dev_name = strsep(&subvol_name, ":");
+	if (!dev_name)
+		return -ENOMEM;
+
+	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
+			btrfs_fill_super, mnt,
+			subvol_name ? subvol_name : "default");
+	kfree(_identifier);
+	return ret;
 }
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
-- 
cgit v1.2.3


From e9fe395e47bef9b948657b788c65cbfbd2433cc4 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 29 Aug 2007 09:11:44 -0400
Subject: Btrfs: Fix oopsen in extent_tree.c during enospc

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ff3f7c2be60..b074ad1416d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -244,7 +244,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint->data == data) {
+		if (shint && shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
@@ -1255,8 +1255,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct buffer_head *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, empty_size, hint,
-				 (unsigned long)-1, &ins, 0);
+				 1, empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
-- 
cgit v1.2.3


From 95e0528919cb0c7635b37496e0ad1c41e4c0dd7c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Aug 2007 09:11:44 -0400
Subject: Btrfs: Use mount -o subvol to select the subvol directory instead of
 dev:

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  6 ++++++
 fs/btrfs/super.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ffeee546a0d..cd75c906048 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1064,6 +1064,12 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 	((type *)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset((leaf)->items + (slot))))
 
+/* mount option defines and helpers */
+#define BTRFS_MOUNT_SUBVOL		0x000001
+#define btrfs_clear_opt(o, opt)		o &= ~BTRFS_MOUNT_##opt
+#define btrfs_set_opt(o, opt)		o |= BTRFS_MOUNT_##opt
+#define btrfs_test_opt(sb, opt)		(BTRFS_SB(sb)->s_mount_opt & \
+					 BTRFS_MOUNT_##opt)
 /* extent-tree.c */
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a94be1a773..66a01cbbbea 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -33,6 +33,7 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
+#include <linux/parser.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -58,6 +59,41 @@ static void btrfs_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 }
 
+enum {
+	Opt_subvol, Opt_err,
+};
+
+static match_table_t tokens = {
+	{Opt_subvol, "subvol=%s"},
+	{Opt_err, NULL}
+};
+
+/* parse mount options; *subvol_name must be NULL or kmalloc'ed on entry */
+static int parse_options (char * options,
+			  struct btrfs_root *root,
+			  char **subvol_name)
+{
+	char * p;
+	substring_t args[MAX_OPT_ARGS];
+	if (!options)
+		return 1;
+
+	while ((p = strsep (&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		switch (match_token(p, tokens, args)) {
+		case Opt_subvol:
+			kfree(*subvol_name);	/* subvol= given twice */
+			*subvol_name = match_strdup(&args[0]);
+			break;
+		default:
+			return 0;
+		}
+	}
+	return 1;
+}
+
 static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 {
 	struct inode * inode;
@@ -250,22 +286,15 @@ error:
 /* end copy & paste */
 
 static int btrfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *identifier, void *data, struct vfsmount *mnt)
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	int ret;
-	char *_identifier = kstrdup(identifier, GFP_KERNEL);
-	char *subvol_name;
-	const char *dev_name;
-
-	subvol_name = _identifier;
-	dev_name = strsep(&subvol_name, ":");
-	if (!dev_name)
-		return -ENOMEM;
+	char *subvol_name = NULL;
 
+	parse_options((char *)data, NULL, &subvol_name);
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
 			btrfs_fill_super, mnt,
 			subvol_name ? subvol_name : "default");
-	kfree(_identifier);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 07157aacb1ecd394a5494910b5f80d7e7d407d37 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Aug 2007 08:50:51 -0400
Subject: Btrfs: Add file data csums back in via hooks in the extent map code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 94 ++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/extent_map.h | 15 +++++++-
 fs/btrfs/inode.c      | 98 +++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 193 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a9c7419615b..6053f9db073 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -57,7 +57,7 @@ void extent_map_tree_init(struct extent_map_tree *tree,
 {
 	tree->map.rb_node = NULL;
 	tree->state.rb_node = NULL;
-	tree->fill_delalloc = NULL;
+	tree->ops = NULL;
 	rwlock_init(&tree->lock);
 	tree->mapping = mapping;
 }
@@ -93,6 +93,7 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 		return state;
 	state->state = 0;
 	state->in_tree = 0;
+	state->private = 0;
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	return state;
@@ -1034,6 +1035,61 @@ int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(unlock_range);
 
+int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state->private = private;
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+
+}
+
+int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 0;
+
+	read_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start != start) {
+		ret = -ENOENT;
+		goto out;
+	}
+	*private = state->private;
+out:
+	read_unlock_irq(&tree->lock);
+	return ret;
+}
+
 /*
  * searches a range in the state tree for a given mask.
  * If 'filled' == 1, this returns 1 only if ever extent in the tree
@@ -1185,12 +1241,13 @@ static int end_bio_extent_writepage(struct bio *bio,
 static int end_bio_extent_readpage(struct bio *bio,
 				   unsigned int bytes_done, int err)
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct extent_map_tree *tree = bio->bi_private;
 	u64 start;
 	u64 end;
 	int whole_page;
+	int ret;
 
 	if (bio->bi_size)
 		return 1;
@@ -1208,6 +1265,11 @@ static int end_bio_extent_readpage(struct bio *bio,
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
 
+		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+			ret = tree->ops->readpage_end_io_hook(page, start, end);
+			if (ret)
+				uptodate = 0;
+		}
 		if (uptodate) {
 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
 			if (whole_page)
@@ -1388,9 +1450,16 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 			continue;
 		}
 
-		ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset, bdev,
-					 end_bio_extent_readpage);
+		ret = 0;
+		if (tree->ops && tree->ops->readpage_io_hook) {
+			ret = tree->ops->readpage_io_hook(page, cur,
+							  cur + iosize - 1);
+		}
+		if (!ret) {
+			ret = submit_extent_page(READ, tree, page,
+						 sector, iosize, page_offset,
+						 bdev, end_bio_extent_readpage);
+		}
 		if (ret)
 			SetPageError(page);
 		cur = cur + iosize;
@@ -1462,7 +1531,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 					       &delalloc_end,
 					       128 * 1024 * 1024);
 	if (nr_delalloc) {
-		tree->fill_delalloc(inode, start, delalloc_end);
+		tree->ops->fill_delalloc(inode, start, delalloc_end);
 		if (delalloc_end >= page_end + 1) {
 			clear_extent_bit(tree, page_end + 1, delalloc_end,
 					 EXTENT_LOCKED | EXTENT_DELALLOC,
@@ -1528,12 +1597,17 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			continue;
 		}
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
-		set_range_writeback(tree, cur, cur + iosize - 1);
-		ret = submit_extent_page(WRITE, tree, page,
-					 sector, iosize, page_offset, bdev,
-					 end_bio_extent_writepage);
+		ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1);
 		if (ret)
 			SetPageError(page);
+		else {
+			set_range_writeback(tree, cur, cur + iosize - 1);
+			ret = submit_extent_page(WRITE, tree, page, sector,
+						 iosize, page_offset, bdev,
+						 end_bio_extent_writepage);
+			if (ret)
+				SetPageError(page);
+		}
 		cur = cur + iosize;
 		page_offset += iosize;
 		nr++;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e91a2e9619e..0eae6c420ac 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -6,12 +6,19 @@
 #define EXTENT_MAP_INLINE (u64)-2
 #define EXTENT_MAP_DELALLOC (u64)-1
 
+struct extent_map_ops {
+	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
+};
+
 struct extent_map_tree {
 	struct rb_root map;
 	struct rb_root state;
 	struct address_space *mapping;
 	rwlock_t lock;
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	struct extent_map_ops *ops;
 };
 
 /* note, this must start with the same fields as fs/extent_map.c:tree_entry */
@@ -36,6 +43,10 @@ struct extent_state {
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
+
+	/* for use by the FS */
+	u64 private;
+
 	struct list_head list;
 };
 
@@ -89,4 +100,6 @@ int extent_commit_write(struct extent_map_tree *tree,
 			struct inode *inode, struct page *page,
 			unsigned from, unsigned to);
 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 483eea69b0a..3ee6b2fadf5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -52,6 +52,7 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct address_space_operations btrfs_symlink_aops;
 static struct file_operations btrfs_dir_file_operations;
+static struct extent_map_ops btrfs_extent_map_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
@@ -103,6 +104,90 @@ out:
 	return ret;
 }
 
+int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	char *kaddr;
+	int ret;
+	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	size_t offset = start - page_start;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	kaddr = kmap(page);
+	btrfs_csum_file_block(trans, root, inode->i_ino,
+			      start, kaddr + offset, end - start + 1);
+	kunmap(page);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
+{
+	int ret = 0;
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_csum_item *item;
+	struct btrfs_path *path = NULL;
+	u64 private;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	path = btrfs_alloc_path();
+	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
+	if (IS_ERR(item)) {
+		ret = PTR_ERR(item);
+		/* a csum that isn't present is a preallocated region. */
+		if (ret == -ENOENT || ret == -EFBIG)
+			ret = 0;
+		private = 0;
+		goto out;
+	}
+	memcpy((char *)&private, &item->csum, BTRFS_CRC32_SIZE);
+	set_state_private(em_tree, start, private);
+out:
+	if (path)
+		btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
+{
+	char csum[BTRFS_CRC32_SIZE];
+	size_t offset = start - (page->index << PAGE_CACHE_SHIFT);
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	char *kaddr;
+	u64 private;
+	int ret;
+
+	ret = get_state_private(em_tree, start, &private);
+	kaddr = kmap_atomic(page, KM_IRQ0);
+	if (ret) {
+		goto zeroit;
+	}
+	ret = btrfs_csum_data(root, kaddr + offset, end - start + 1, csum);
+	BUG_ON(ret);
+	if (memcmp(csum, &private, BTRFS_CRC32_SIZE)) {
+		goto zeroit;
+	}
+	kunmap_atomic(kaddr, KM_IRQ0);
+	return 0;
+
+zeroit:
+	printk("btrfs csum failed ino %lu off %llu\n",
+	       page->mapping->host->i_ino, (unsigned long long)start);
+	memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_IRQ0);
+	return 0;
+}
 
 void btrfs_read_locked_inode(struct inode *inode)
 {
@@ -155,7 +240,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
+		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -1148,7 +1233,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
+		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2286,7 +2371,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.fill_delalloc = run_delalloc_range;
+		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2362,6 +2447,13 @@ static struct file_operations btrfs_dir_file_operations = {
 #endif
 };
 
+static struct extent_map_ops btrfs_extent_map_ops = {
+	.fill_delalloc = run_delalloc_range,
+	.writepage_io_hook = btrfs_writepage_io_hook,
+	.readpage_io_hook = btrfs_readpage_io_hook,
+	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+};
+
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
-- 
cgit v1.2.3


From 2bf5a725a3b82efeaf7b292c085e69a9388a89ea Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Aug 2007 11:54:02 -0400
Subject: Btrfs: fsx delalloc fixes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c |   9 ++-
 fs/btrfs/file.c       |  26 +++++---
 fs/btrfs/inode.c      | 177 ++++----------------------------------------------
 3 files changed, 38 insertions(+), 174 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6053f9db073..8ad6f8efc5a 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -77,6 +77,8 @@ EXPORT_SYMBOL(alloc_extent_map);
 
 void free_extent_map(struct extent_map *em)
 {
+	if (!em)
+		return;
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
 		kmem_cache_free(extent_map_cache, em);
@@ -102,6 +104,8 @@ EXPORT_SYMBOL(alloc_extent_state);
 
 void free_extent_state(struct extent_state *state)
 {
+	if (!state)
+		return;
 	if (atomic_dec_and_test(&state->refs)) {
 		WARN_ON(state->in_tree);
 		kmem_cache_free(extent_state_cache, state);
@@ -1395,8 +1399,8 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		set_page_private(page, 1);
 		WARN_ON(!page->mapping->a_ops->invalidatepage);
+		set_page_private(page, 1);
 		page_cache_get(page);
 	}
 
@@ -1638,7 +1642,8 @@ int extent_invalidatepage(struct extent_map_tree *tree,
 
 	lock_extent(tree, start, end, GFP_NOFS);
 	wait_on_extent_writeback(tree, start, end);
-	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
+	clear_extent_bit(tree, start, end,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
 			 1, 1, GFP_NOFS);
 	return 0;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3d39e4a279..07b121d4bd9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -186,8 +186,16 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		u64 mask = root->blocksize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
+
 		if (last_pos_in_file < start_pos) {
+			err = btrfs_drop_extents(trans, root, inode,
+						 last_pos_in_file,
+						 last_pos_in_file + hole_size,
+						 &hint_block);
+			if (err)
+				goto failed;
+
+			hole_size >>= inode->i_blkbits;
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
@@ -217,13 +225,11 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		struct page *p = pages[0];
 		/* step one, delete the existing extents in this range */
 		/* FIXME blocksize != pagesize */
-		if (start_pos < inode->i_size) {
-			err = btrfs_drop_extents(trans, root, inode, start_pos,
-				 (pos + write_bytes + root->blocksize -1) &
-				 ~((u64)root->blocksize - 1), &hint_block);
-			if (err)
-				goto failed;
-		}
+		err = btrfs_drop_extents(trans, root, inode, start_pos,
+			 (pos + write_bytes + root->blocksize -1) &
+			 ~((u64)root->blocksize - 1), &hint_block);
+		if (err)
+			goto failed;
 
 		err = insert_inline_extent(trans, root, inode, start_pos,
 					   end_pos - start_pos, p, 0);
@@ -400,6 +406,8 @@ next_slot:
 			keep = 1;
 			WARN_ON(start & (root->blocksize - 1));
 			if (found_extent) {
+				btrfs_drop_extent_cache(inode, key.offset,
+							start - 1 );
 				new_num = (start - key.offset) >>
 					inode->i_blkbits;
 				old_num = btrfs_file_extent_num_blocks(extent);
@@ -464,7 +472,7 @@ next_slot:
 
 			if (ret) {
 				btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
-				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end);
+				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end, keep);
 			}
 			BUG_ON(ret);
 			extent = btrfs_item_ptr(
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ee6b2fadf5..64710fa77d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -721,25 +721,35 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
 		struct btrfs_trans_handle *trans;
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+
 		u64 mask = root->blocksize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
+		u64 block_end = attr->ia_size | mask;
 		u64 hole_size;
+		u64 alloc_hint;
 
 		if (attr->ia_size <= pos)
 			goto out;
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
+		lock_extent(em_tree, pos, block_end, GFP_NOFS);
 		hole_size = (attr->ia_size - pos + mask) & ~mask;
-		hole_size >>= inode->i_blkbits;
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
+		err = btrfs_drop_extents(trans, root, inode,
+					 pos, pos + hole_size, &alloc_hint);
+
+		hole_size >>= inode->i_blkbits;
+
 		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       pos, 0, 0, hole_size);
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
+		unlock_extent(em_tree, pos, block_end, GFP_NOFS);
 		if (err)
 			return err;
 	}
@@ -1529,13 +1539,13 @@ insert:
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		free_extent_map(em);
+		em = NULL;
 		failed_insert++;
 		if (failed_insert > 5) {
 			printk("failing to insert %Lu %Lu\n", start, end);
 			err = -EIO;
 			goto out;
 		}
-		em = NULL;
 		goto again;
 	}
 	err = 0;
@@ -1555,167 +1565,6 @@ out:
 	return em;
 }
 
-
-/*
- * FIBMAP and others want to pass in a fake buffer head.  They need to
- * use BTRFS_GET_BLOCK_NO_DIRECT to make sure we don't try to memcpy
- * any packed file data into the fake bh
- */
-#define BTRFS_GET_BLOCK_NO_CREATE 0
-#define BTRFS_GET_BLOCK_CREATE 1
-#define BTRFS_GET_BLOCK_NO_DIRECT 2
-
-/*
- * FIXME create==1 doe not work.
- */
-static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
-				struct buffer_head *result, int create)
-{
-	int ret;
-	int err = 0;
-	u64 blocknr;
-	u64 extent_start = 0;
-	u64 extent_end = 0;
-	u64 objectid = inode->i_ino;
-	u32 found_type;
-	u64 alloc_hint = 0;
-	struct btrfs_path *path;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_file_extent_item *item;
-	struct btrfs_leaf *leaf;
-	struct btrfs_disk_key *found_key;
-	struct btrfs_trans_handle *trans = NULL;
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	if (create & BTRFS_GET_BLOCK_CREATE) {
-		/*
-		 * danger!, this only works if the page is properly up
-		 * to date somehow
-		 */
-		trans = btrfs_start_transaction(root, 1);
-		if (!trans) {
-			err = -ENOMEM;
-			goto out;
-		}
-		ret = btrfs_drop_extents(trans, root, inode,
-					 iblock << inode->i_blkbits,
-					 (iblock + 1) << inode->i_blkbits,
-					 &alloc_hint);
-		BUG_ON(ret);
-	}
-
-	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       objectid,
-				       iblock << inode->i_blkbits, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-
-	if (ret != 0) {
-		if (path->slots[0] == 0) {
-			btrfs_release_path(root, path);
-			goto not_found;
-		}
-		path->slots[0]--;
-	}
-
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			      struct btrfs_file_extent_item);
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	blocknr = btrfs_file_extent_disk_blocknr(item);
-	blocknr += btrfs_file_extent_offset(item);
-
-	/* are we inside the extent that was found? */
-	found_key = &leaf->items[path->slots[0]].key;
-	found_type = btrfs_disk_key_type(found_key);
-	if (btrfs_disk_key_objectid(found_key) != objectid ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		extent_end = 0;
-		extent_start = 0;
-		goto not_found;
-	}
-	found_type = btrfs_file_extent_type(item);
-	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		extent_start = extent_start >> inode->i_blkbits;
-		extent_end = extent_start + btrfs_file_extent_num_blocks(item);
-		err = 0;
-		if (btrfs_file_extent_disk_blocknr(item) == 0)
-			goto out;
-		if (iblock >= extent_start && iblock < extent_end) {
-			btrfs_map_bh_to_logical(root, result, blocknr +
-						iblock - extent_start);
-			goto out;
-		}
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		char *ptr;
-		char *map;
-		u32 size;
-
-		if (create & BTRFS_GET_BLOCK_NO_DIRECT) {
-			err = -EINVAL;
-			goto out;
-		}
-		size = btrfs_file_extent_inline_len(leaf->items +
-						    path->slots[0]);
-		extent_end = (extent_start + size) >> inode->i_blkbits;
-		extent_start >>= inode->i_blkbits;
-		if (iblock < extent_start || iblock > extent_end) {
-			goto not_found;
-		}
-		ptr = btrfs_file_extent_inline_start(item);
-		map = kmap(result->b_page);
-		memcpy(map, ptr, size);
-		memset(map + size, 0, PAGE_CACHE_SIZE - size);
-		flush_dcache_page(result->b_page);
-		kunmap(result->b_page);
-		set_buffer_uptodate(result);
-		SetPageChecked(result->b_page);
-		btrfs_map_bh_to_logical(root, result, 0);
-	}
-not_found:
-	if (create & BTRFS_GET_BLOCK_CREATE) {
-		struct btrfs_key ins;
-		ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-					 1, 0, alloc_hint, (u64)-1,
-					 &ins, 1);
-		if (ret) {
-			err = ret;
-			goto out;
-		}
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       iblock << inode->i_blkbits,
-					       ins.objectid, ins.offset,
-					       ins.offset);
-		if (ret) {
-			err = ret;
-			goto out;
-		}
-		btrfs_map_bh_to_logical(root, result, ins.objectid);
-	}
-out:
-	if (trans) {
-		ret = btrfs_end_transaction(trans, root);
-		if (!err)
-			err = ret;
-	}
-	btrfs_free_path(path);
-	return err;
-}
-
-int btrfs_get_block(struct inode *inode, sector_t iblock,
-		    struct buffer_head *result, int create)
-{
-	int err;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
-	err = btrfs_get_block_lock(inode, iblock, result, create);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return err;
-}
-
 static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
 			   struct buffer_head *result, int create)
 {
@@ -2469,6 +2318,8 @@ static struct address_space_operations btrfs_aops = {
 static struct address_space_operations btrfs_symlink_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
+	.invalidatepage = btrfs_invalidatepage,
+	.releasepage	= btrfs_releasepage,
 };
 
 static struct inode_operations btrfs_file_inode_operations = {
-- 
cgit v1.2.3


From 8e21528f87854314792aaef4d279bc9e5a9be997 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Aug 2007 12:16:51 -0400
Subject: Btrfs: remove extra drop_extent_cache call

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 07b121d4bd9..acef8cb7fb7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -406,8 +406,6 @@ next_slot:
 			keep = 1;
 			WARN_ON(start & (root->blocksize - 1));
 			if (found_extent) {
-				btrfs_drop_extent_cache(inode, key.offset,
-							start - 1 );
 				new_num = (start - key.offset) >>
 					inode->i_blkbits;
 				old_num = btrfs_file_extent_num_blocks(extent);
-- 
cgit v1.2.3


From 86479a04eef8f304a13aeb8b64bcc8e506a68268 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Sep 2007 19:58:16 -0400
Subject: Add support for defragging files via btrfsctl -d.  Avoid OOM on
 extent tree defrag.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  4 +++
 fs/btrfs/disk-io.h     |  2 ++
 fs/btrfs/extent_map.c  | 16 +++++----
 fs/btrfs/inode.c       | 93 +++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/tree-defrag.c |  3 ++
 5 files changed, 100 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 89e282c35ce..b41f48ade41 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -217,6 +217,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
+	if (buffer_defrag_done(parent))
+		return 0;
+
 	parent_node = btrfs_buffer_node(parent);
 	parent_nritems = btrfs_header_nritems(&parent_node->header);
 	parent_level = btrfs_header_level(&parent_node->header);
@@ -274,6 +277,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		*last_ret = search_start;
 		if (parent_level == 1)
 			clear_buffer_defrag(tmp_bh);
+		set_buffer_defrag_done(tmp_bh);
 		brelse(tmp_bh);
 	}
 	return err;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index da6bb72750f..714fa877438 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -26,9 +26,11 @@
 enum btrfs_bh_state_bits {
 	BH_Checked = BH_PrivateStart,
 	BH_Defrag,
+	BH_DefragDone,
 };
 BUFFER_FNS(Checked, checked);
 BUFFER_FNS(Defrag, defrag);
+BUFFER_FNS(DefragDone, defrag_done);
 
 static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh)
 {
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8ad6f8efc5a..33f7a18dddf 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -10,6 +10,12 @@
 #include <linux/blkdev.h>
 #include "extent_map.h"
 
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+				       unsigned long extra_flags,
+				       void (*ctor)(void *, struct kmem_cache *,
+						    unsigned long));
+
 static struct kmem_cache *extent_map_cache;
 static struct kmem_cache *extent_state_cache;
 
@@ -32,14 +38,12 @@ struct tree_entry {
 
 void __init extent_map_init(void)
 {
-	extent_map_cache = kmem_cache_create("extent_map",
-					    sizeof(struct extent_map), 0,
-					    SLAB_RECLAIM_ACCOUNT |
+	extent_map_cache = btrfs_cache_create("extent_map",
+					    sizeof(struct extent_map),
 					    SLAB_DESTROY_BY_RCU,
 					    NULL);
-	extent_state_cache = kmem_cache_create("extent_state",
-					    sizeof(struct extent_state), 0,
-					    SLAB_RECLAIM_ACCOUNT |
+	extent_state_cache = btrfs_cache_create("extent_state",
+					    sizeof(struct extent_state),
 					    SLAB_DESTROY_BY_RCU,
 					    NULL);
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64710fa77d0..6b3e4404dc6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1904,6 +1904,70 @@ fail:
 	return ret;
 }
 
+static unsigned long force_ra(struct address_space *mapping,
+			      struct file_ra_state *ra, struct file *file,
+			      pgoff_t offset, pgoff_t last_index)
+{
+	pgoff_t req_size;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+	req_size = last_index - offset + 1;
+	offset = page_cache_readahead(mapping, ra, file, offset, req_size);
+	return offset;
+#else
+	req_size = min(last_index - offset + 1, (pgoff_t)128);
+	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+	return offset + req_size;
+#endif
+}
+
+int btrfs_defrag_file(struct file *file) {
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct page *page;
+	unsigned long last_index;
+	unsigned long ra_index = 0;
+	u64 page_start;
+	u64 page_end;
+	unsigned long i;
+
+	mutex_lock(&inode->i_mutex);
+	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	for (i = 0; i <= last_index; i++) {
+		if (i == ra_index) {
+			ra_index = force_ra(inode->i_mapping, &file->f_ra,
+					    file, ra_index, last_index);
+		}
+		page = grab_cache_page(inode->i_mapping, i);
+		if (!page)
+			goto out_unlock;
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto out_unlock;
+			}
+		}
+		page_start = page->index << PAGE_CACHE_SHIFT;
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		set_extent_delalloc(em_tree, page_start,
+				    page_end, GFP_NOFS);
+		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+	}
+
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
 int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		cmd, unsigned long arg)
 {
@@ -1948,10 +2012,14 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 		break;
 
 	case BTRFS_IOC_DEFRAG:
-		mutex_lock(&root->fs_info->fs_mutex);
-		btrfs_defrag_root(root, 0);
-		btrfs_defrag_root(root->fs_info->extent_root, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
+		if (S_ISDIR(inode->i_mode)) {
+			mutex_lock(&root->fs_info->fs_mutex);
+			btrfs_defrag_root(root, 0);
+			btrfs_defrag_root(root->fs_info->extent_root, 0);
+			mutex_unlock(&root->fs_info->fs_mutex);
+		} else if (S_ISREG(inode->i_mode)) {
+			btrfs_defrag_file(filp);
+		}
 		ret = 0;
 		break;
 	default:
@@ -2018,7 +2086,7 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 }
 
-static struct kmem_cache *cache_create(const char *name, size_t size,
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 				       unsigned long extra_flags,
 				       void (*ctor)(void *, struct kmem_cache *,
 						    unsigned long))
@@ -2033,27 +2101,28 @@ static struct kmem_cache *cache_create(const char *name, size_t size,
 
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = cache_create("btrfs_inode_cache",
+	btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache",
 					  sizeof(struct btrfs_inode),
 					  0, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
-	btrfs_trans_handle_cachep = cache_create("btrfs_trans_handle_cache",
-					     sizeof(struct btrfs_trans_handle),
-					     0, NULL);
+	btrfs_trans_handle_cachep =
+			btrfs_cache_create("btrfs_trans_handle_cache",
+					   sizeof(struct btrfs_trans_handle),
+					   0, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
-	btrfs_transaction_cachep = cache_create("btrfs_transaction_cache",
+	btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache",
 					     sizeof(struct btrfs_transaction),
 					     0, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
-	btrfs_path_cachep = cache_create("btrfs_path_cache",
+	btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
 					 sizeof(struct btrfs_transaction),
 					 0, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
-	btrfs_bit_radix_cachep = cache_create("btrfs_radix", 256,
+	btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256,
 					      SLAB_DESTROY_BY_RCU, NULL);
 	if (!btrfs_bit_radix_cachep)
 		goto fail;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 35fd20d2464..420597127ed 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -113,6 +113,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	}
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+	clear_buffer_defrag(path->nodes[*level]);
+	clear_buffer_defrag_done(path->nodes[*level]);
 	btrfs_block_release(root, path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -143,6 +145,7 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			clear_buffer_defrag(path->nodes[*level]);
+			clear_buffer_defrag_done(path->nodes[*level]);
 			btrfs_block_release(root, path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-- 
cgit v1.2.3


From 011410bd859a481a335d7db1fb559542c5663fd5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Sep 2007 19:58:36 -0400
Subject: Btrfs: Add more synchronization before creating a snapshot

File data checksums are only done during writepage, so we have to make sure
all pages are written when the snapshot is taken.  This also adds some
locking so that new writes don't race in and add new dirty pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/file.c    |  2 ++
 fs/btrfs/inode.c   | 10 +++++++++-
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cd75c906048..03e1c4ad341 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -333,6 +333,7 @@ struct btrfs_root {
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
+	struct rw_semaphore snap_sem;
 	u64 objectid;
 	u64 last_trans;
 	u32 blocksize;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c25ef0a68f1..6c953a0e0aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -300,6 +300,7 @@ static int __setup_root(int blocksize,
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
 	init_completion(&root->kobj_unregister);
+	init_rwsem(&root->snap_sem);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index acef8cb7fb7..77f1950b0aa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -160,6 +160,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
 			inode->i_blkbits;
 
+	down_read(&BTRFS_I(inode)->root->snap_sem);
 	end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
 	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -250,6 +251,7 @@ out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	free_extent_map(em);
+	up_read(&BTRFS_I(inode)->root->snap_sem);
 	return err;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b3e4404dc6..1ace6d11e09 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -686,6 +686,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
 
+	down_read(&BTRFS_I(inode)->root->snap_sem);
 	ret = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -704,6 +705,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	unlock_page(page);
 	page_cache_release(page);
+	up_read(&BTRFS_I(inode)->root->snap_sem);
 out:
 	return ret;
 }
@@ -1668,6 +1670,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	int ret = -EINVAL;
 	u64 page_start;
 
+	down_read(&BTRFS_I(inode)->root->snap_sem);
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
@@ -1688,6 +1691,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
+	up_read(&BTRFS_I(inode)->root->snap_sem);
 	unlock_page(page);
 	return ret;
 }
@@ -1851,6 +1855,10 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (!root->ref_cows)
 		return -EINVAL;
 
+	down_write(&root->snap_sem);
+	freeze_bdev(root->fs_info->sb->s_bdev);
+	thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -1894,12 +1902,12 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_inc_root_ref(trans, root);
 	if (ret)
 		goto fail;
-
 fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
 	mutex_unlock(&root->fs_info->fs_mutex);
+	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root);
 	return ret;
 }
-- 
cgit v1.2.3


From a8c450b211c010ac55190da23ceb6b39b393f411 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Sep 2007 20:00:27 -0400
Subject: Btrfs: Reorder tests in set_extent_bit to properly find holes

Yan Zheng noticed that set_extent_bit was exiting too early when there
was a hole in the map.  The fix is to reorder the tests to check for the
hole first.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 33f7a18dddf..370ed97e52f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -712,28 +712,6 @@ again:
 		}
 		goto search_again;
 	}
-	/*
-	 * | ---- desired range ---- |
-	 *                        | state |
-	 * We need to split the extent, and set the bit
-	 * on the first half
-	 */
-	if (state->start <= end && state->end > end) {
-		set = state->state & bits;
-		if (exclusive && set) {
-			*failed_start = start;
-			err = -EEXIST;
-			goto out;
-		}
-		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
-
-		prealloc->state |= bits;
-		merge_state(tree, prealloc);
-		prealloc = NULL;
-		goto out;
-	}
-
 	/*
 	 * | ---- desired range ---- |
 	 *     | state | or               | state |
@@ -756,6 +734,28 @@ again:
 		start = this_end + 1;
 		goto search_again;
 	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		prealloc->state |= bits;
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
 	goto search_again;
 
 out:
-- 
cgit v1.2.3


From 9433063bcb4b3c0c9e5ef3c24a811b63084dedb4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Sep 2007 20:02:22 -0400
Subject: [PATCH] btrfs: fix printk format warning

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77f1950b0aa..c26ca54ed4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -174,7 +174,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	hint_block = 0;
 
 	if ((end_of_last_block & 4095) == 0) {
-		printk("strange end of last %Lu %lu %Lu\n", start_pos, write_bytes, end_of_last_block);
+		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
 	}
 	set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 
-- 
cgit v1.2.3


From 90f1c19a9fd2a943adc69d2b9b8c83bcc4bba6f9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Sep 2007 20:02:27 -0400
Subject: Btrfs: [PATCH] extent_map: fix locking for bio completion

The bio completion handlers can be run in any context, e.g. when using
the old ide driver they run in hardirq context with irqs disabled, so
lockdep rightfully warns about the use of write_lock_irq in these
handlers.

This patch switches clear_extent_bit and set_extent_bit to
write_lock_irqsave to fix this problem.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 370ed97e52f..3021564ac62 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -456,6 +456,7 @@ int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
+	unsigned long flags;
 	int err;
 	int set = 0;
 
@@ -466,7 +467,7 @@ again:
 			return -ENOMEM;
 	}
 
-	write_lock_irq(&tree->lock);
+	write_lock_irqsave(&tree->lock, flags);
 	/*
 	 * this search will find the extents that end after
 	 * our range starts
@@ -533,7 +534,7 @@ again:
 	goto search_again;
 
 out:
-	write_unlock_irq(&tree->lock);
+	write_unlock_irqrestore(&tree->lock, flags);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -542,7 +543,7 @@ out:
 search_again:
 	if (start >= end)
 		goto out;
-	write_unlock_irq(&tree->lock);
+	write_unlock_irqrestore(&tree->lock, flags);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
@@ -628,6 +629,7 @@ int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
+	unsigned long flags;
 	int err = 0;
 	int set;
 	u64 last_start;
@@ -639,7 +641,7 @@ again:
 			return -ENOMEM;
 	}
 
-	write_lock_irq(&tree->lock);
+	write_lock_irqsave(&tree->lock, flags);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -759,7 +761,7 @@ again:
 	goto search_again;
 
 out:
-	write_unlock_irq(&tree->lock);
+	write_unlock_irqrestore(&tree->lock, flags);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -768,7 +770,7 @@ out:
 search_again:
 	if (start > end)
 		goto out;
-	write_unlock_irq(&tree->lock);
+	write_unlock_irqrestore(&tree->lock, flags);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
-- 
cgit v1.2.3


From d396c6f554d182b7bce4e2bb2acb47f80430c05f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Sep 2007 20:02:30 -0400
Subject: Btrfs: [PATCH] extent_map: provide generic bmap

generic_bmap is completely trivial, while the extent to bh mapping in
btrfs is rather complex.  So provide an extent_bmap instead that takes
a get_extent callback and can be used by filesystems using the extent_map
code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 19 +++++++++++++++++++
 fs/btrfs/extent_map.h |  2 ++
 fs/btrfs/inode.c      | 23 ++---------------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 3021564ac62..ff2b1dd25bf 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1816,3 +1816,22 @@ int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
 }
 EXPORT_SYMBOL(try_release_extent_mapping);
 
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent)
+{
+	struct inode *inode = mapping->host;
+	u64 start = iblock << inode->i_blkbits;
+	u64 end = start + (1 << inode->i_blkbits) - 1;
+	struct extent_map *em;
+
+	em = get_extent(inode, NULL, 0, start, end, 0);
+	if (!em || IS_ERR(em))
+		return 0;
+
+	// XXX(hch): block 0 is valid in some cases, e.g. XFS RT device
+	if (em->block_start == EXTENT_MAP_INLINE ||
+	    em->block_start == 0)
+	    	return 0;
+
+	return (em->block_start + start - em->start) >> inode->i_blkbits;
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 0eae6c420ac..da41aca74e0 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -99,6 +99,8 @@ int extent_prepare_write(struct extent_map_tree *tree,
 int extent_commit_write(struct extent_map_tree *tree,
 			struct inode *inode, struct page *page,
 			unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent);
 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1ace6d11e09..39c1fe178f4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1567,28 +1567,9 @@ out:
 	return em;
 }
 
-static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
-			   struct buffer_head *result, int create)
+static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 start = iblock << inode->i_blkbits;
-	u64 end = start + root->blocksize -1;
-	struct extent_map *em;
-
-	em = btrfs_get_extent(inode, NULL, 0, start, end, 0);
-	if (em && !IS_ERR(em) && em->block_start != EXTENT_MAP_INLINE &&
-	    em->block_start != 0) {
-		u64 offset;
-		offset = start - em->start;
-		start = (em->block_start + offset) >> inode->i_blkbits;
-		btrfs_map_bh_to_logical(root, result, start);
-	}
-	return 0;
-}
-
-static sector_t btrfs_bmap(struct address_space *as, sector_t block)
-{
-	return generic_block_bmap(as, block, btrfs_get_block_bmap);
+	return extent_bmap(mapping, iblock, btrfs_get_extent);
 }
 
 static int btrfs_prepare_write(struct file *file, struct page *page,
-- 
cgit v1.2.3


From b06355f0fe04a62d08238438654ec453e05304ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Sep 2007 20:02:32 -0400
Subject: Btrfs: [PATCH] extent_map: make the writepage_io hook optional

The writepage_io hook is not mandatory, e.g. my port of xfs to the extent_map
code does not have one for now.  So handle a NULL pointer gracefully
here.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ff2b1dd25bf..162766773a3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1607,7 +1607,12 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			continue;
 		}
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
-		ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1);
+		if (tree->ops && tree->ops->writepage_io_hook) {
+			ret = tree->ops->writepage_io_hook(page, cur,
+						cur + iosize - 1);
+		} else {
+			ret = 0;
+		}
 		if (ret)
 			SetPageError(page);
 		else {
-- 
cgit v1.2.3


From 0e2752a72cb37075b24899f01e9bc6a589de3b6c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 10 Sep 2007 20:02:33 -0400
Subject: Btrfs: [PATCH] extent_map: add writepage_end_io hook

XFS updates the ondisk inode size only after the data I/O has finished,
so it needs a hook when the writepage end_bio handler has finished.

Might not be worth applying as-is as the per-page callback is very
inefficient.  What XFS really wants is a callback when writeout of a
whole extent has completed.  This delayed i_size update scheme might
be worthwhile for btrfs as well, btw.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 2 ++
 fs/btrfs/extent_map.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 162766773a3..ab5bde31b68 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1231,6 +1231,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 			end_page_writeback(page);
 		else
 			check_page_writeback(tree, page);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start, end);
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index da41aca74e0..40b53ee274f 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -11,6 +11,7 @@ struct extent_map_ops {
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
+	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
 };
 
 struct extent_map_tree {
-- 
cgit v1.2.3


From 5ce14bbcdd1b5d9233d26a1e89faf3a26c820c58 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Sep 2007 11:15:39 -0400
Subject: Btrfs: Find and remove dead roots the first time a root is loaded.

Dead roots are trees left over after a crash, and they were either in the
process of being removed or were waiting to be removed when the box crashed.
Before, a search of the entire tree of root pointers was done on mount
looking for dead roots.  Now, the search is done the first time we load
a root.

This makes mount faster when there are a large number of snapshots, and it
enables the block accounting code to properly update the block counts on
the latest root as old versions of the root are reaped after a crash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/disk-io.c     |  9 ++++-----
 fs/btrfs/root-tree.c   | 16 +++++++++++++---
 fs/btrfs/transaction.c |  6 ++++--
 fs/btrfs/transaction.h |  3 ++-
 5 files changed, 25 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03e1c4ad341..6d6e94b6380 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1148,7 +1148,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+			  struct btrfs_root *latest_root);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c953a0e0aa..4296839eea1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -426,6 +426,10 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		return ERR_PTR(ret);
 	}
 
+	ret = btrfs_find_dead_roots(fs_info->tree_root,
+				    root->root_key.objectid, root);
+	BUG_ON(ret);
+
 	return root;
 }
 
@@ -522,11 +526,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
-	ret = btrfs_find_dead_roots(tree_root);
-	if (ret) {
-		mutex_unlock(&fs_info->fs_mutex);
-		goto fail_tree_root;
-	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 402f67821c1..3b5926dfbeb 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -93,7 +93,8 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
-int btrfs_find_dead_roots(struct btrfs_root *root)
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
+			  struct btrfs_root *latest)
 {
 	struct btrfs_root *dead_root;
 	struct btrfs_item *item;
@@ -105,7 +106,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root)
 	struct btrfs_leaf *leaf;
 	int slot;
 
-	key.objectid = 0;
+	key.objectid = objectid;
 	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	key.offset = 0;
@@ -131,15 +132,24 @@ int btrfs_find_dead_roots(struct btrfs_root *root)
 		btrfs_disk_key_to_cpu(&key, &item->key);
 		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
 			goto next;
+
+		if (key.objectid < objectid)
+			goto next;
+
+		if (key.objectid > objectid)
+			break;
+
 		ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
 		if (btrfs_root_refs(ri) != 0)
 			goto next;
+
 		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key);
 		if (IS_ERR(dead_root)) {
 			ret = PTR_ERR(dead_root);
 			goto err;
 		}
-		ret = btrfs_add_dead_root(dead_root,
+
+		ret = btrfs_add_dead_root(dead_root, latest,
 					  &root->fs_info->dead_roots);
 		if (ret)
 			goto err;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2b15daa3a9f..29755593de6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -239,7 +239,9 @@ struct dirty_root {
 	struct btrfs_root *latest_root;
 };
 
-int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
+int btrfs_add_dead_root(struct btrfs_root *root,
+			struct btrfs_root *latest,
+			struct list_head *dead_list)
 {
 	struct dirty_root *dirty;
 
@@ -247,6 +249,7 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
 	if (!dirty)
 		return -ENOMEM;
 	dirty->root = root;
+	dirty->latest_root = latest;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -412,7 +415,6 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
-
 			ret = btrfs_drop_snapshot(trans, dirty->root);
 			if (ret != -EAGAIN) {
 				break;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e451783a1a4..4bc328cbb24 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -78,7 +78,8 @@ void btrfs_transaction_flush_work(struct btrfs_root *root);
 void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
 void btrfs_init_transaction_sys(void);
 void btrfs_exit_transaction_sys(void);
-int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list);
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
+			struct list_head *dead_list);
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
-- 
cgit v1.2.3


From 93a6925ec10c2d5792ffad35af8366179c9db57c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 14 Sep 2007 09:42:31 -0400
Subject: Btrfs: Fix extra link count dec in rename

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 39c1fe178f4..cde8ddde609 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2214,13 +2214,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
 		if (ret)
 			goto out_fail;
-		if (S_ISDIR(new_inode->i_mode))
-			clear_nlink(new_inode);
-		else
-			drop_nlink(new_inode);
-		ret = btrfs_update_inode(trans, root, new_inode);
-		if (ret)
-			goto out_fail;
 	}
 	ret = btrfs_add_link(trans, new_dentry, old_inode);
 	if (ret)
-- 
cgit v1.2.3


From 84a5d5eefd2b31c3cb97dc078589870bf85c7f23 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 14 Sep 2007 09:43:53 -0400
Subject: Btrfs: add modules_install target

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ee505c87ecf..058a2dae65b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -18,6 +18,8 @@ else
 KERNELDIR := /lib/modules/`uname -r`/build
 all::
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+modules_install::
+	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
 clean::
 	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
 endif
-- 
cgit v1.2.3


From 432eba0882a32d6181ba44044f6e576226134784 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@computergmbh.de>
Date: Fri, 14 Sep 2007 10:22:19 -0400
Subject: Btrfs: Simplify makefile

Single-colons will do here.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 058a2dae65b..6341cf7a9f9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -16,10 +16,11 @@ else
 # Normal Makefile
 
 KERNELDIR := /lib/modules/`uname -r`/build
-all::
+all:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
-modules_install::
+modules_install:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-clean::
+clean:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
+
 endif
-- 
cgit v1.2.3


From 34287aa3605d52ef5a4b6e5e2cca98667166508b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Sep 2007 10:22:47 -0400
Subject: Btrfs: use unlocked_ioctl

No reason to grab the BKL before calling into the btrfs ioctl code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  5 +----
 fs/btrfs/file.c  |  4 ++--
 fs/btrfs/inode.c | 23 +++++------------------
 3 files changed, 8 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6d6e94b6380..800a3499cc3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1219,10 +1219,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
-int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
-		unsigned long arg);
-long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
-			       unsigned long arg);
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
 int btrfs_commit_write(struct file *file, struct page *page,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c26ca54ed4d..ca7e5d4474e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -732,10 +732,10 @@ struct file_operations btrfs_file_operations = {
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
-	.ioctl		= btrfs_ioctl,
 	.fsync		= btrfs_sync_file,
+	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_compat_ioctl,
+	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cde8ddde609..3901a7fa094 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1957,9 +1957,10 @@ out_unlock:
 	return 0;
 }
 
-int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
+long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
 	int ret = 0;
@@ -2007,7 +2008,7 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 			btrfs_defrag_root(root->fs_info->extent_root, 0);
 			mutex_unlock(&root->fs_info->fs_mutex);
 		} else if (S_ISREG(inode->i_mode)) {
-			btrfs_defrag_file(filp);
+			btrfs_defrag_file(file);
 		}
 		ret = 0;
 		break;
@@ -2017,20 +2018,6 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
 	return ret;
 }
 
-#ifdef CONFIG_COMPAT
-long btrfs_compat_ioctl(struct file *file, unsigned int cmd,
-			       unsigned long arg)
-{
-	struct inode *inode = file->f_path.dentry->d_inode;
-	int ret;
-	lock_kernel();
-	ret = btrfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg));
-	unlock_kernel();
-	return ret;
-
-}
-#endif
-
 /*
  * Called inside transaction, so use GFP_NOFS
  */
@@ -2341,9 +2328,9 @@ static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= btrfs_readdir,
-	.ioctl		= btrfs_ioctl,
+	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
-	.compat_ioctl	= btrfs_compat_ioctl,
+	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
 
-- 
cgit v1.2.3


From d03581f4345e30db078ea8f81189e8177280bbb8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Sep 2007 10:22:57 -0400
Subject: split up btrfs_ioctl

Add a helper per ioctl function to make the code more readable.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 106 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 59 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3901a7fa094..e24b875327d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1957,65 +1957,77 @@ out_unlock:
 	return 0;
 }
 
-long btrfs_ioctl(struct file *file, unsigned int
-		cmd, unsigned long arg)
+static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ioctl_vol_args vol_args;
-	int ret = 0;
 	struct btrfs_dir_item *di;
-	int namelen;
 	struct btrfs_path *path;
+	int namelen;
 	u64 root_dirid;
 
-	switch (cmd) {
-	case BTRFS_IOC_SNAP_CREATE:
-		if (copy_from_user(&vol_args,
-				   (struct btrfs_ioctl_vol_args __user *)arg,
-				   sizeof(vol_args)))
-			return -EFAULT;
-		namelen = strlen(vol_args.name);
-		if (namelen > BTRFS_VOL_NAME_MAX)
-			return -EINVAL;
-		if (strchr(vol_args.name, '/'))
-			return -EINVAL;
-		path = btrfs_alloc_path();
-		if (!path)
-			return -ENOMEM;
-		root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
+	if (copy_from_user(&vol_args, arg, sizeof(vol_args)))
+		return -EFAULT;
+	
+	namelen = strlen(vol_args.name);
+	if (namelen > BTRFS_VOL_NAME_MAX)
+		return -EINVAL;
+	if (strchr(vol_args.name, '/'))
+		return -EINVAL;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
+	mutex_lock(&root->fs_info->fs_mutex);
+	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+			    path, root_dirid,
+			    vol_args.name, namelen, 0);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_free_path(path);
+	if (di && !IS_ERR(di))
+		return -EEXIST;
+	if (IS_ERR(di))
+		return PTR_ERR(di);
+
+	if (root == root->fs_info->tree_root)
+		return create_subvol(root, vol_args.name, namelen);
+	return create_snapshot(root, vol_args.name, namelen);
+}
+
+static int btrfs_ioctl_defrag(struct file *file)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
 		mutex_lock(&root->fs_info->fs_mutex);
-		di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
-				    path, root_dirid,
-				    vol_args.name, namelen, 0);
+		btrfs_defrag_root(root, 0);
+		btrfs_defrag_root(root->fs_info->extent_root, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		btrfs_free_path(path);
-		if (di && !IS_ERR(di))
-			return -EEXIST;
-		if (IS_ERR(di))
-			return PTR_ERR(di);
-
-		if (root == root->fs_info->tree_root)
-			ret = create_subvol(root, vol_args.name, namelen);
-		else
-			ret = create_snapshot(root, vol_args.name, namelen);
 		break;
+	case S_IFREG:
+		btrfs_defrag_file(file);
+		break;
+	}
+
+	return 0;
+}
 
+long btrfs_ioctl(struct file *file, unsigned int
+		cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(file->f_path.dentry->d_inode)->root;
+
+	switch (cmd) {
+	case BTRFS_IOC_SNAP_CREATE:
+		return btrfs_ioctl_snap_create(root, (void __user *)arg);
 	case BTRFS_IOC_DEFRAG:
-		if (S_ISDIR(inode->i_mode)) {
-			mutex_lock(&root->fs_info->fs_mutex);
-			btrfs_defrag_root(root, 0);
-			btrfs_defrag_root(root->fs_info->extent_root, 0);
-			mutex_unlock(&root->fs_info->fs_mutex);
-		} else if (S_ISREG(inode->i_mode)) {
-			btrfs_defrag_file(file);
-		}
-		ret = 0;
-		break;
-	default:
-		return -ENOTTY;
+		return btrfs_ioctl_defrag(file);
 	}
-	return ret;
+
+	return -ENOTTY;
 }
 
 /*
-- 
cgit v1.2.3


From 6af858b24c0c4e91ce0b40b5dd5fdfc1fe2dda70 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 14 Sep 2007 10:23:29 -0400
Subject: Btrfs: Fix off by one error in dirty_and_release_pages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ca7e5d4474e..26826a54335 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -236,7 +236,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 					   end_pos - start_pos, p, 0);
 		BUG_ON(err);
 		em->start = start_pos;
-		em->end = end_pos;
+		em->end = end_pos - 1;
 		em->block_start = EXTENT_MAP_INLINE;
 		em->block_end = EXTENT_MAP_INLINE;
 		add_extent_mapping(em_tree, em);
-- 
cgit v1.2.3


From 7d7d6068bee9d8004ea7cc8a2ebda2c99a57caec Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 14 Sep 2007 16:15:28 -0400
Subject: Btrfs: Fix cache_block_group to catch holes at the start of the group

cache_block_group was overly complex and missed free blocks at the very start
of the group.  This patch simplifies things significantly.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 61 +++++++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b074ad1416d..9151850266b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -39,6 +39,7 @@ static int cache_block_group(struct btrfs_root *root,
 	u64 i;
 	u64 last = 0;
 	u64 hole_size;
+	u64 first_free;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
@@ -51,16 +52,22 @@ static int cache_block_group(struct btrfs_root *root,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
 	path->reada = 2;
+	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.flags = 0;
 	key.offset = 0;
+
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
 	if (ret < 0)
 		return ret;
+
 	if (ret && path->slots[0] > 0)
 		path->slots[0]--;
+
 	while(1) {
 		leaf = btrfs_buffer_leaf(path->nodes[0]);
 		slot = path->slots[0];
@@ -71,50 +78,48 @@ static int cache_block_group(struct btrfs_root *root,
 			if (ret == 0) {
 				continue;
 			} else {
-				if (found) {
-					hole_size = block_group->key.objectid +
-						block_group->key.offset - last;
-				} else {
-					last = block_group->key.objectid;
-					hole_size = block_group->key.offset;
-				}
-				for (i = 0; i < hole_size; i++) {
-					set_radix_bit(extent_radix,
-						      last + i);
-				}
 				break;
 			}
 		}
+
 		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		if (key.objectid < block_group->key.objectid) {
+			if (key.objectid + key.offset > first_free)
+				first_free = key.objectid + key.offset;
+			goto next;
+		}
+
 		if (key.objectid >= block_group->key.objectid +
 		    block_group->key.offset) {
-			if (found) {
-				hole_size = block_group->key.objectid +
-					block_group->key.offset - last;
-			} else {
-				last = block_group->key.objectid;
-				hole_size = block_group->key.offset;
-			}
-			for (i = 0; i < hole_size; i++) {
-				set_radix_bit(extent_radix, last + i);
-			}
 			break;
 		}
+
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
 			if (!found) {
-				last = key.objectid + key.offset;
+				last = first_free;
 				found = 1;
-			} else {
-				hole_size = key.objectid - last;
-				for (i = 0; i < hole_size; i++) {
-					set_radix_bit(extent_radix, last + i);
-				}
-				last = key.objectid + key.offset;
 			}
+			hole_size = key.objectid - last;
+			for (i = 0; i < hole_size; i++) {
+				set_radix_bit(extent_radix, last + i);
+			}
+			last = key.objectid + key.offset;
 		}
+next:
 		path->slots[0]++;
 	}
 
+	if (!found)
+		last = first_free;
+	if (block_group->key.objectid +
+	    block_group->key.offset > last) {
+		hole_size = block_group->key.objectid +
+			block_group->key.offset - last;
+		for (i = 0; i < hole_size; i++) {
+			set_radix_bit(extent_radix,
+					last + i);
+		}
+	}
 	block_group->cached = 1;
 err:
 	btrfs_free_path(path);
-- 
cgit v1.2.3


From d3c2fdcf7b79079f60ac64e61d886964d4647910 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Sep 2007 10:58:06 -0400
Subject: Btrfs: Use balance_dirty_pages_nr on btree blocks

btrfs_btree_balance_dirty is changed to pass the number of pages dirtied
for more accurate dirty throttling.  This lets the VM make better decisions
about when to force some writeback.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  5 +++--
 fs/btrfs/disk-io.h     |  2 +-
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/file.c        |  2 +-
 fs/btrfs/inode.c       | 47 +++++++++++++++++++++++++++++++++++------------
 fs/btrfs/transaction.c | 14 ++++++++++----
 6 files changed, 51 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4296839eea1..ef6d76f2ec2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -666,7 +666,8 @@ void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
 	brelse(buf);
 }
 
-void btrfs_btree_balance_dirty(struct btrfs_root *root)
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
-	balance_dirty_pages_ratelimited(root->fs_info->btree_inode->i_mapping);
+	balance_dirty_pages_ratelimited_nr(
+			root->fs_info->btree_inode->i_mapping, nr);
 }
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 714fa877438..fc6b520c6e0 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -80,7 +80,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   u64 num_blocks);
 int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
 			     u64 logical);
-void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct buffer_head *bh);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9151850266b..fe772f9b06c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1275,6 +1275,7 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	set_buffer_checked(buf);
 	set_buffer_defrag(buf);
 	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
+	trans->blocks_used++;
 	return buf;
 }
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 26826a54335..698eaea612f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -655,7 +655,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		num_written += write_bytes;
 
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
-		btrfs_btree_balance_dirty(root);
+		btrfs_btree_balance_dirty(root, 1);
 		cond_resched();
 	}
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e24b875327d..bd00df33fb3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -387,15 +387,17 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
 	int ret;
+	unsigned long nr;
 
 	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
@@ -412,6 +414,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int found_type;
 	struct btrfs_leaf *leaf;
 	char *goodnames = "..";
+	unsigned long nr;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -467,8 +470,9 @@ out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->fs_mutex);
+	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -764,6 +768,7 @@ void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long nr;
 	int ret;
 
 	truncate_inode_pages(&inode->i_data, 0);
@@ -780,15 +785,17 @@ void btrfs_delete_inode(struct inode *inode)
 	ret = btrfs_free_inode(trans, root, inode);
 	if (ret)
 		goto no_delete_lock;
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return;
 
 no_delete_lock:
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 no_delete:
 	clear_inode(inode);
 }
@@ -1165,6 +1172,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
+	unsigned long nr;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -1198,6 +1206,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
@@ -1205,7 +1214,7 @@ out_unlock:
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -1217,6 +1226,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err;
 	int drop_inode = 0;
+	unsigned long nr;
 	u64 objectid;
 
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -1251,6 +1261,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
@@ -1258,7 +1269,7 @@ out_unlock:
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -1268,6 +1279,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
+	unsigned long nr;
 	int err;
 	int drop_inode = 0;
 
@@ -1288,6 +1300,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (err)
 		drop_inode = 1;
 
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 
@@ -1295,7 +1308,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -1336,6 +1349,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int err = 0;
 	int drop_on_err = 0;
 	u64 objectid;
+	unsigned long nr = 1;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -1380,12 +1394,13 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	btrfs_update_inode_block_group(trans, dir);
 
 out_fail:
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_on_err)
 		iput(inode);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
@@ -1682,6 +1697,7 @@ static void btrfs_truncate(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 	struct btrfs_trans_handle *trans;
+	unsigned long nr;
 
 	if (!S_ISREG(inode->i_mode))
 		return;
@@ -1697,10 +1713,11 @@ static void btrfs_truncate(struct inode *inode)
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	btrfs_update_inode(trans, root, inode);
+	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 }
 
 int btrfs_commit_write(struct file *file, struct page *page,
@@ -1725,6 +1742,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	int err;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+	unsigned long nr = 1;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -1814,12 +1832,13 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail;
 fail:
+	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
 fail_commit:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
@@ -1832,6 +1851,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	int ret;
 	int err;
 	u64 objectid;
+	unsigned long nr;
 
 	if (!root->ref_cows)
 		return -EINVAL;
@@ -1884,12 +1904,13 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail;
 fail:
+	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
 	mutex_unlock(&root->fs_info->fs_mutex);
 	up_write(&root->snap_sem);
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
@@ -2240,6 +2261,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	int datasize;
 	char *ptr;
 	struct btrfs_file_extent_item *ei;
+	unsigned long nr;
 
 	name_len = strlen(symname) + 1;
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
@@ -2309,13 +2331,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		drop_inode = 1;
 
 out_unlock:
+	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(root, nr);
 	return err;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29755593de6..087074db0bd 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -18,6 +18,7 @@
 
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/writeback.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -340,6 +341,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	struct btrfs_fs_info *info = root->fs_info;
 	int ret;
 	struct btrfs_trans_handle *trans;
+	unsigned long nr;
 
 	if (root->defrag_running)
 		return 0;
@@ -348,10 +350,11 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	while (1) {
 		root->defrag_running = 1;
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
+		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&info->fs_mutex);
 
-		btrfs_btree_balance_dirty(root);
+		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
 		mutex_lock(&info->fs_mutex);
@@ -398,6 +401,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 {
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
+	unsigned long nr;
 	u64 num_blocks;
 	u64 blocks_used;
 	int ret = 0;
@@ -426,11 +430,11 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 					&dirty->root->root_item);
 			if (err)
 				ret = err;
+			nr = trans->blocks_used;
 			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
 			mutex_unlock(&tree_root->fs_info->fs_mutex);
-
-			btrfs_btree_balance_dirty(tree_root);
+			btrfs_btree_balance_dirty(tree_root, nr);
 			schedule();
 
 			mutex_lock(&tree_root->fs_info->fs_mutex);
@@ -449,13 +453,15 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 			BUG();
 			break;
 		}
+		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
 		kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
-		btrfs_btree_balance_dirty(tree_root);
+
+		btrfs_btree_balance_dirty(tree_root, nr);
 		schedule();
 	}
 	return ret;
-- 
cgit v1.2.3


From cf67582bb2ec364d71b3e4f8e49de98d8c3c832e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Sep 2007 11:00:51 -0400
Subject: Btrfs: Fix duplicate ENOSPC checks in find_free_extent

find_free_extent would fail to wrap around to the start of the drive because
it was doing the enospc case checking twice in some cases, causing it
to return -ENOSPC early.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe772f9b06c..f261a8326cd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1111,20 +1111,9 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
 
-	if (ins->objectid + num_blocks >= search_end) {
-		if (full_scan) {
-			ret = -ENOSPC;
-			goto error;
-		}
-		search_start = orig_search_start;
-		if (wrapped) {
-			if (!full_scan)
-				total_needed -= empty_size;
-			full_scan = 1;
-		} else
-			wrapped = 1;
-		goto new_group;
-	}
+	if (ins->objectid + num_blocks >= search_end)
+		goto enospc;
+
 	for (test_block = ins->objectid;
 	     test_block < ins->objectid + num_blocks; test_block++) {
 		if (test_radix_bit(&info->pinned_radix, test_block) ||
@@ -1149,6 +1138,7 @@ check_pending:
 
 new_group:
 	if (search_start + num_blocks >= search_end) {
+enospc:
 		search_start = orig_search_start;
 		if (full_scan) {
 			ret = -ENOSPC;
-- 
cgit v1.2.3


From a61721d5b771d1735415a6b4c75409ab1fc9c845 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 17 Sep 2007 11:08:38 -0400
Subject: fix found_type decrement in btrfs_truncate_in_trans

found_type has already been decremented by the code above this change, so
decrementing it by one again doesn't make sense.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bd00df33fb3..211ca32c65c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -581,7 +581,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			} else {
 				break;
 			}
-			btrfs_set_key_type(&key, found_type - 1);
+			btrfs_set_key_type(&key, found_type);
 			continue;
 		}
 		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
-- 
cgit v1.2.3


From 23223584e44cb1b0b69ba3a9ad74956f292a416b Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 17 Sep 2007 11:08:52 -0400
Subject: create btrfs_path slab with the correct size

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 211ca32c65c..192fdda0d56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2127,7 +2127,7 @@ int btrfs_init_cachep(void)
 	if (!btrfs_transaction_cachep)
 		goto fail;
 	btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache",
-					 sizeof(struct btrfs_transaction),
+					 sizeof(struct btrfs_path),
 					 0, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
-- 
cgit v1.2.3


From bab9fb035faa5119991b64cfee1b14c1d6f6a31b Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 17 Sep 2007 11:13:11 -0400
Subject: Btrfs: truncate: don't update inode->i_blocks when extent is a hole

I think checking whether the extent is a hole before updating
'inode->i_blocks' is unconditionally required. (The original code checks it
only when del_item isn't equal to 0.)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 192fdda0d56..43bf4337859 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -595,6 +595,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			   btrfs_file_extent_type(fi) !=
 			   BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
+			extent_start = btrfs_file_extent_disk_blocknr(fi);
 			if (!del_item) {
 				u64 orig_num_blocks =
 					btrfs_file_extent_num_blocks(fi);
@@ -604,12 +605,13 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				extent_num_blocks >>= inode->i_blkbits;
 				btrfs_set_file_extent_num_blocks(fi,
 							 extent_num_blocks);
-				inode->i_blocks -= (orig_num_blocks -
-					extent_num_blocks) << 3;
+				num_dec = (orig_num_blocks -
+					   extent_num_blocks) << 3;
+				if (extent_start != 0) {
+					inode->i_blocks -= num_dec;
+				}
 				btrfs_mark_buffer_dirty(path->nodes[0]);
 			} else {
-				extent_start =
-					btrfs_file_extent_disk_blocknr(fi);
 				extent_num_blocks =
 					btrfs_file_extent_disk_num_blocks(fi);
 				/* FIXME blocksize != 4096 */
-- 
cgit v1.2.3


From 8e1cd766643f040fd88b9c4221353a3f6b77db1e Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 17 Sep 2007 11:13:12 -0400
Subject: Btrfs: Fix double free and off by one in inode.c

The first change removes a potential double free; the second fixes an
off-by-one error.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43bf4337859..88b3ef20be6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -206,7 +206,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret) {
-		btrfs_free_path(path);
 		goto make_bad;
 	}
 	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
@@ -1536,7 +1535,7 @@ again:
 		flush_dcache_page(result->b_page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, extent_start,
-				    extent_end, GFP_NOFS);
+				    extent_end - 1, GFP_NOFS);
 		goto insert;
 	} else {
 		printk("unkknown found_type %d\n", found_type);
-- 
cgit v1.2.3


From b3cfa35a493866cffd00952438b6c77364d42d3c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Sep 2007 11:25:58 -0400
Subject: Btrfs: factor page private preparations into a helper

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 40 +++++++++++++++-------------------------
 fs/btrfs/extent_map.h |  1 +
 fs/btrfs/file.c       |  7 +------
 fs/btrfs/inode.c      |  7 +------
 4 files changed, 18 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ab5bde31b68..0ab368e091f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1379,6 +1379,16 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree,
 	return ret;
 }
 
+void set_page_extent_mapped(struct page *page)
+{
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		WARN_ON(!page->mapping->a_ops->invalidatepage);
+		set_page_private(page, 1);
+		page_cache_get(page);
+	}
+}
+
 /*
  * basic readpage implementation.  Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
@@ -1405,12 +1415,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 	size_t iosize;
 	size_t blocksize = inode->i_sb->s_blocksize;
 
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		set_page_private(page, 1);
-		page_cache_get(page);
-	}
+	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
@@ -1531,12 +1536,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			       PAGE_CACHE_SIZE - offset, KM_USER0);
 	}
 
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		set_page_private(page, 1);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		page_cache_get(page);
-	}
+	set_page_extent_mapped(page);
 
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
@@ -1672,13 +1672,7 @@ int extent_commit_write(struct extent_map_tree *tree,
 {
 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
 
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		set_page_private(page, 1);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		page_cache_get(page);
-	}
-
+	set_page_extent_mapped(page);
 	set_page_dirty(page);
 
 	if (pos > inode->i_size) {
@@ -1709,12 +1703,8 @@ int extent_prepare_write(struct extent_map_tree *tree,
 	int ret = 0;
 	int isnew;
 
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		set_page_private(page, 1);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		page_cache_get(page);
-	}
+	set_page_extent_mapped(page);
+
 	block_start = (page_start + from) & ~((u64)blocksize - 1);
 	block_end = (page_start + to - 1) | (blocksize - 1);
 	orig_block_start = block_start;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 40b53ee274f..d913ce01248 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -105,4 +105,5 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 698eaea612f..4cc459c943e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -543,12 +543,7 @@ static int prepare_pages(struct btrfs_root *root,
 		}
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
 		wait_on_page_writeback(pages[i]);
-		if (!PagePrivate(pages[i])) {
-			SetPagePrivate(pages[i]);
-			set_page_private(pages[i], 1);
-			WARN_ON(!pages[i]->mapping->a_ops->invalidatepage);
-			page_cache_get(pages[i]);
-		}
+		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 88b3ef20be6..0df325f89d6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -652,12 +652,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 	u64 page_start = page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		set_page_private(page, 1);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		page_cache_get(page);
-	}
+	set_page_extent_mapped(page);
 
 	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
 	set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start,
-- 
cgit v1.2.3


From 50b78c24d594e6985f36a9a8d8e2fb74e1664e90 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Sep 2007 14:14:42 -0400
Subject: btrfs_get_extent should treat inline extents as though they hold a
 whole block

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0df325f89d6..b03d40a907c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1505,13 +1505,13 @@ again:
 
 		size = btrfs_file_extent_inline_len(leaf->items +
 						    path->slots[0]);
-		extent_end = extent_start + size;
+		extent_end = extent_start | ((u64)root->blocksize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
 				if (end < extent_start)
 					goto not_found;
-				em->end = extent_end - 1;
+				em->end = extent_end;
 			} else {
 				em->end = end;
 			}
@@ -1520,17 +1520,19 @@ again:
 		em->block_start = EXTENT_MAP_INLINE;
 		em->block_end = EXTENT_MAP_INLINE;
 		em->start = extent_start;
-		em->end = extent_end - 1;
+		em->end = extent_end;
 		if (!page) {
 			goto insert;
 		}
 		ptr = btrfs_file_extent_inline_start(item);
 		map = kmap(page);
 		memcpy(map + page_offset, ptr, size);
-		flush_dcache_page(result->b_page);
+		memset(map + page_offset + size, 0,
+		       root->blocksize - (page_offset + size));
+		flush_dcache_page(page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, extent_start,
-				    extent_end - 1, GFP_NOFS);
+				    extent_end, GFP_NOFS);
 		goto insert;
 	} else {
 		printk("unkknown found_type %d\n", found_type);
-- 
cgit v1.2.3


From 5f39d397dfbe140a14edecd4e73c34ce23c4f9ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:14:19 -0400
Subject: Btrfs: Create extent_buffer interface for large blocksizes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    2 +
 fs/btrfs/ctree.c       | 1484 ++++++++++++++++++++++++++----------------------
 fs/btrfs/ctree.h       |  849 ++++++++++-----------------
 fs/btrfs/dir-item.c    |  102 ++--
 fs/btrfs/disk-io.c     |  391 ++++++-------
 fs/btrfs/disk-io.h     |   55 +-
 fs/btrfs/extent-tree.c |  294 +++++-----
 fs/btrfs/extent_map.c  |  672 +++++++++++++++++++++-
 fs/btrfs/extent_map.h  |   75 ++-
 fs/btrfs/file-item.c   |   63 +-
 fs/btrfs/file.c        |  129 +++--
 fs/btrfs/inode-item.c  |   22 +-
 fs/btrfs/inode-map.c   |   17 +-
 fs/btrfs/inode.c       |  435 ++++++++------
 fs/btrfs/print-tree.c  |   96 ++--
 fs/btrfs/print-tree.h  |    4 +-
 fs/btrfs/root-tree.c   |   51 +-
 fs/btrfs/super.c       |    7 +-
 fs/btrfs/sysfs.c       |   10 +-
 fs/btrfs/transaction.c |   61 +-
 fs/btrfs/transaction.h |    5 +-
 fs/btrfs/tree-defrag.c |   58 +-
 22 files changed, 2794 insertions(+), 2088 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6341cf7a9f9..a3b51085d7f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -23,4 +23,6 @@ modules_install:
 clean:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
 
+tester:
+	$(MAKE) -C $(KERNELDIR) M=`pwd` tree-defrag.o transaction.o sysfs.o super.o root-tree.o inode-map.o inode-item.o inode.o file-item.o file.o extent_map.o disk-io.o ctree.o dir-item.o extent-tree.o
 endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b41f48ade41..f60920e8a0e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,21 +16,24 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/highmem.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "print-tree.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *ins_key,
 		      struct btrfs_path *path, int data_size);
-static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct buffer_head *dst, struct buffer_head
-			  *src);
-static int balance_node_right(struct btrfs_trans_handle *trans, struct
-			      btrfs_root *root, struct buffer_head *dst_buf,
-			      struct buffer_head *src_buf);
+static int push_node_left(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *dst,
+			  struct extent_buffer *src);
+static int balance_node_right(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *dst_buf,
+			      struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
@@ -62,40 +65,38 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
 			break;
-		btrfs_block_release(root, p->nodes[i]);
+		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
 }
 
-static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct buffer_head *buf, struct buffer_head
-			   *parent, int parent_slot, struct buffer_head
-			   **cow_ret, u64 search_start, u64 empty_size)
+static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct extent_buffer *buf,
+			     struct extent_buffer *parent, int parent_slot,
+			     struct extent_buffer **cow_ret,
+			     u64 search_start, u64 empty_size)
 {
-	struct buffer_head *cow;
-	struct btrfs_node *cow_node;
+	struct extent_buffer *cow;
 	int ret = 0;
 	int different_trans = 0;
 
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
-	WARN_ON(!buffer_uptodate(buf));
+
 	cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
-	cow_node = btrfs_buffer_node(cow);
-	if (buf->b_size != root->blocksize || cow->b_size != root->blocksize)
+	if (buf->len != root->sectorsize || cow->len != root->sectorsize)
 		WARN_ON(1);
 
-	memcpy(cow_node, btrfs_buffer_node(buf), root->blocksize);
-	btrfs_set_header_blocknr(&cow_node->header, bh_blocknr(cow));
-	btrfs_set_header_generation(&cow_node->header, trans->transid);
-	btrfs_set_header_owner(&cow_node->header, root->root_key.objectid);
+	copy_extent_buffer(cow, buf, 0, 0, cow->len);
+	btrfs_set_header_blocknr(cow, extent_buffer_blocknr(cow));
+	btrfs_set_header_generation(cow, trans->transid);
+	btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	WARN_ON(btrfs_header_generation(btrfs_buffer_header(buf)) >
-		trans->transid);
-	if (btrfs_header_generation(btrfs_buffer_header(buf)) !=
-				    trans->transid) {
+	WARN_ON(btrfs_header_generation(buf) > trans->transid);
+	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
 		ret = btrfs_inc_ref(trans, root, buf);
 		if (ret)
@@ -106,29 +107,29 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (buf == root->node) {
 		root->node = cow;
-		get_bh(cow);
+		extent_buffer_get(cow);
 		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
+			btrfs_free_extent(trans, root,
+					  extent_buffer_blocknr(buf), 1, 1);
 		}
-		btrfs_block_release(root, buf);
+		free_extent_buffer(buf);
 	} else {
-		btrfs_set_node_blockptr(btrfs_buffer_node(parent), parent_slot,
-					bh_blocknr(cow));
+		btrfs_set_node_blockptr(parent, parent_slot,
+					extent_buffer_blocknr(cow));
 		btrfs_mark_buffer_dirty(parent);
-		WARN_ON(btrfs_header_generation(btrfs_buffer_header(parent)) !=
-				    trans->transid);
-		btrfs_free_extent(trans, root, bh_blocknr(buf), 1, 1);
+		WARN_ON(btrfs_header_generation(parent) != trans->transid);
+		btrfs_free_extent(trans, root, extent_buffer_blocknr(buf),1,1);
 	}
-	btrfs_block_release(root, buf);
+	free_extent_buffer(buf);
 	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
 	return 0;
 }
 
-int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct buffer_head *buf, struct buffer_head
-			   *parent, int parent_slot, struct buffer_head
-			   **cow_ret)
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct extent_buffer *buf,
+		    struct extent_buffer *parent, int parent_slot,
+		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
 	if (trans->transaction != root->fs_info->running_transaction) {
@@ -141,13 +142,12 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (btrfs_header_generation(btrfs_buffer_header(buf)) ==
-				    trans->transid) {
+	if (btrfs_header_generation(buf) == trans->transid) {
 		*cow_ret = buf;
 		return 0;
 	}
 
-	search_start = bh_blocknr(buf) & ~((u64)65535);
+	search_start = extent_buffer_blocknr(buf) & ~((u64)65535);
 	return __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
 }
@@ -161,9 +161,11 @@ static int close_blocks(u64 blocknr, u64 other)
 	return 0;
 }
 
-static int should_defrag_leaf(struct buffer_head *bh)
+#if 0
+static int should_defrag_leaf(struct extent_buffer *eb)
 {
-	struct btrfs_leaf *leaf = btrfs_buffer_leaf(bh);
+	return 0;
+	struct btrfs_leaf *leaf = btrfs_buffer_leaf(eb);
 	struct btrfs_disk_key *key;
 	u32 nritems;
 
@@ -188,14 +190,17 @@ static int should_defrag_leaf(struct buffer_head *bh)
 	}
 	return 0;
 }
+#endif
 
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct buffer_head *parent,
+		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int cache_only, u64 *last_ret)
 {
+	return 0;
+#if 0
 	struct btrfs_node *parent_node;
-	struct buffer_head *cur_bh;
-	struct buffer_head *tmp_bh;
+	struct extent_buffer *cur_eb;
+	struct extent_buffer *tmp_eb;
 	u64 blocknr;
 	u64 search_start = *last_ret;
 	u64 last_block = 0;
@@ -281,6 +286,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		brelse(tmp_bh);
 	}
 	return err;
+#endif
 }
 
 /*
@@ -289,12 +295,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
  * which is the stop of the leaf data stack
  */
 static inline unsigned int leaf_data_end(struct btrfs_root *root,
-					 struct btrfs_leaf *leaf)
+					 struct extent_buffer *leaf)
 {
-	u32 nr = btrfs_header_nritems(&leaf->header);
+	u32 nr = btrfs_header_nritems(leaf);
 	if (nr == 0)
 		return BTRFS_LEAF_DATA_SIZE(root);
-	return btrfs_item_offset(leaf->items + nr - 1);
+	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
 /*
@@ -310,9 +316,9 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 		return 1;
 	if (k1.objectid < k2->objectid)
 		return -1;
-	if (k1.flags > k2->flags)
+	if (k1.type > k2->type)
 		return 1;
-	if (k1.flags < k2->flags)
+	if (k1.type < k2->type)
 		return -1;
 	if (k1.offset > k2->offset)
 		return 1;
@@ -324,37 +330,39 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
-	struct btrfs_node *parent = NULL;
-	struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]);
+	struct extent_buffer *parent = NULL;
+	struct extent_buffer *node = path->nodes[level];
+	struct btrfs_disk_key parent_key;
+	struct btrfs_disk_key node_key;
 	int parent_slot;
 	int slot;
 	struct btrfs_key cpukey;
-	u32 nritems = btrfs_header_nritems(&node->header);
+	u32 nritems = btrfs_header_nritems(node);
 
 	if (path->nodes[level + 1])
-		parent = btrfs_buffer_node(path->nodes[level + 1]);
+		parent = path->nodes[level + 1];
 
 	slot = path->slots[level];
-	BUG_ON(!buffer_uptodate(path->nodes[level]));
 	BUG_ON(nritems == 0);
 	if (parent) {
-		struct btrfs_disk_key *parent_key;
-
 		parent_slot = path->slots[level + 1];
-		parent_key = &parent->ptrs[parent_slot].key;
-		BUG_ON(memcmp(parent_key, &node->ptrs[0].key,
+		btrfs_node_key(parent, &parent_key, parent_slot);
+		btrfs_node_key(node, &node_key, 0);
+		BUG_ON(memcmp(&parent_key, &node_key,
 			      sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_blocknr(&node->header));
+		       btrfs_header_blocknr(node));
 	}
 	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
 	if (slot != 0) {
-		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot - 1].key);
-		BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) <= 0);
+		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
+		btrfs_node_key(node, &node_key, slot);
+		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
 	}
 	if (slot < nritems - 1) {
-		btrfs_disk_key_to_cpu(&cpukey, &node->ptrs[slot + 1].key);
-		BUG_ON(comp_keys(&node->ptrs[slot].key, &cpukey) >= 0);
+		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
+		btrfs_node_key(node, &node_key, slot);
+		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
 	}
 	return 0;
 }
@@ -362,83 +370,172 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
-	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[level]);
-	struct btrfs_node *parent = NULL;
+	struct extent_buffer *leaf = path->nodes[level];
+	struct extent_buffer *parent = NULL;
 	int parent_slot;
-	int slot = path->slots[0];
 	struct btrfs_key cpukey;
+	struct btrfs_disk_key parent_key;
+	struct btrfs_disk_key leaf_key;
+	int slot = path->slots[0];
 
-	u32 nritems = btrfs_header_nritems(&leaf->header);
+	u32 nritems = btrfs_header_nritems(leaf);
 
 	if (path->nodes[level + 1])
-		parent = btrfs_buffer_node(path->nodes[level + 1]);
-
-	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+		parent = path->nodes[level + 1];
 
 	if (nritems == 0)
 		return 0;
 
 	if (parent) {
-		struct btrfs_disk_key *parent_key;
-
 		parent_slot = path->slots[level + 1];
-		parent_key = &parent->ptrs[parent_slot].key;
+		btrfs_node_key(parent, &parent_key, parent_slot);
+		btrfs_item_key(leaf, &leaf_key, 0);
 
-		BUG_ON(memcmp(parent_key, &leaf->items[0].key,
+		BUG_ON(memcmp(&parent_key, &leaf_key,
 		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_blocknr(&leaf->header));
+		       btrfs_header_blocknr(leaf));
+	}
+#if 0
+	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
+		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
+		btrfs_item_key(leaf, &leaf_key, i);
+		if (comp_keys(&leaf_key, &cpukey) >= 0) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d offset bad key\n", i);
+			BUG_ON(1);
+		}
+		if (btrfs_item_offset_nr(leaf, i) !=
+			btrfs_item_end_nr(leaf, i + 1)) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d offset bad\n", i);
+			BUG_ON(1);
+		}
+		if (i == 0) {
+			if (btrfs_item_offset_nr(leaf, i) +
+			       btrfs_item_size_nr(leaf, i) !=
+			       BTRFS_LEAF_DATA_SIZE(root)) {
+				btrfs_print_leaf(root, leaf);
+				printk("slot %d first offset bad\n", i);
+				BUG_ON(1);
+			}
+		}
 	}
-	if (slot != 0) {
-		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot - 1].key);
-		BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) <= 0);
-		BUG_ON(btrfs_item_offset(leaf->items + slot - 1) !=
-			btrfs_item_end(leaf->items + slot));
+	if (nritems > 0) {
+		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
+				btrfs_print_leaf(root, leaf);
+				printk("slot %d bad size \n", nritems - 1);
+				BUG_ON(1);
+		}
+	}
+#endif
+	if (slot != 0 && slot < nritems - 1) {
+		btrfs_item_key(leaf, &leaf_key, slot);
+		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
+		if (comp_keys(&leaf_key, &cpukey) <= 0) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d offset bad key\n", slot);
+			BUG_ON(1);
+		}
+		if (btrfs_item_offset_nr(leaf, slot - 1) !=
+		       btrfs_item_end_nr(leaf, slot)) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d offset bad\n", slot);
+			BUG_ON(1);
+		}
 	}
 	if (slot < nritems - 1) {
-		btrfs_disk_key_to_cpu(&cpukey, &leaf->items[slot + 1].key);
-		BUG_ON(comp_keys(&leaf->items[slot].key, &cpukey) >= 0);
-		BUG_ON(btrfs_item_offset(leaf->items + slot) !=
-			btrfs_item_end(leaf->items + slot + 1));
+		btrfs_item_key(leaf, &leaf_key, slot);
+		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
+		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
+		if (btrfs_item_offset_nr(leaf, slot) !=
+			btrfs_item_end_nr(leaf, slot + 1)) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d offset bad\n", slot);
+			BUG_ON(1);
+		}
 	}
-	BUG_ON(btrfs_item_offset(leaf->items) +
-	       btrfs_item_size(leaf->items) != BTRFS_LEAF_DATA_SIZE(root));
+	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
+	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
 	return 0;
 }
 
 static int check_block(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
-	struct btrfs_node *node = btrfs_buffer_node(path->nodes[level]);
-	if (memcmp(node->header.fsid, root->fs_info->disk_super->fsid,
-		   sizeof(node->header.fsid)))
-		BUG();
+	struct extent_buffer *buf = path->nodes[level];
+	char fsid[BTRFS_FSID_SIZE];
+
+	read_extent_buffer(buf, fsid, (unsigned long)btrfs_header_fsid(buf),
+			   BTRFS_FSID_SIZE);
+
+	if (memcmp(fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
+		int i = 0;
+		printk("warning bad block %Lu\n", buf->start);
+		if (!btrfs_buffer_uptodate(buf)) {
+			WARN_ON(1);
+		}
+		for (i = 0; i < BTRFS_FSID_SIZE; i++) {
+			printk("%x:%x ", root->fs_info->fsid[i], fsid[i]);
+		}
+		printk("\n");
+		// BUG();
+	}
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
 }
 
 /*
- * search for key in the array p.  items p are item_size apart
- * and there are 'max' items in p
+ * search for key in the extent_buffer.  The items start at offset p,
+ * and they are item_size apart.  There are 'max' items in p.
+ *
  * the slot in the array is returned via slot, and it points to
  * the place where you would insert key if it is not found in
  * the array.
  *
  * slot may point to max if the key is bigger than all of the keys
  */
-static int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
-		       int max, int *slot)
+static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
+			      int item_size, struct btrfs_key *key,
+			      int max, int *slot)
 {
 	int low = 0;
 	int high = max;
 	int mid;
 	int ret;
 	struct btrfs_disk_key *tmp;
+	struct btrfs_disk_key unaligned;
+	unsigned long offset;
+	char *map_token = NULL;
+	char *kaddr = NULL;
+	unsigned long map_start = 0;
+	unsigned long map_len = 0;
 
 	while(low < high) {
 		mid = (low + high) / 2;
-		tmp = (struct btrfs_disk_key *)(p + mid * item_size);
+		offset = p + mid * item_size;
+
+		if (!map_token || offset < map_start ||
+		    (offset + sizeof(struct btrfs_disk_key)) >
+		    map_start + map_len) {
+			if (map_token)
+				unmap_extent_buffer(eb, map_token, KM_USER0);
+			map_extent_buffer(eb, offset, &map_token, &kaddr,
+					  &map_start, &map_len, KM_USER0);
+
+		}
+		if (offset + sizeof(struct btrfs_disk_key) >
+		    map_start + map_len) {
+			unmap_extent_buffer(eb, map_token, KM_USER0);
+			read_extent_buffer(eb, &unaligned,
+					   offset, sizeof(unaligned));
+			map_token = NULL;
+			tmp = &unaligned;
+		} else {
+			tmp = (struct btrfs_disk_key *)(kaddr + offset -
+							map_start);
+		}
 		ret = comp_keys(tmp, key);
 
 		if (ret < 0)
@@ -447,10 +544,13 @@ static int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
 			high = mid;
 		else {
 			*slot = mid;
+			unmap_extent_buffer(eb, map_token, KM_USER0);
 			return 0;
 		}
 	}
 	*slot = low;
+	if (map_token)
+		unmap_extent_buffer(eb, map_token, KM_USER0);
 	return 1;
 }
 
@@ -458,46 +558,42 @@ static int generic_bin_search(char *p, int item_size, struct btrfs_key *key,
  * simple bin_search frontend that does the right thing for
  * leaves vs nodes
  */
-static int bin_search(struct btrfs_node *c, struct btrfs_key *key, int *slot)
+static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		      int level, int *slot)
 {
-	if (btrfs_is_leaf(c)) {
-		struct btrfs_leaf *l = (struct btrfs_leaf *)c;
-		return generic_bin_search((void *)l->items,
+	if (level == 0) {
+		return generic_bin_search(eb,
+					  offsetof(struct btrfs_leaf, items),
 					  sizeof(struct btrfs_item),
-					  key, btrfs_header_nritems(&c->header),
+					  key, btrfs_header_nritems(eb),
 					  slot);
 	} else {
-		return generic_bin_search((void *)c->ptrs,
+		return generic_bin_search(eb,
+					  offsetof(struct btrfs_node, ptrs),
 					  sizeof(struct btrfs_key_ptr),
-					  key, btrfs_header_nritems(&c->header),
+					  key, btrfs_header_nritems(eb),
 					  slot);
 	}
 	return -1;
 }
 
-static struct buffer_head *read_node_slot(struct btrfs_root *root,
-				   struct buffer_head *parent_buf,
-				   int slot)
+static struct extent_buffer *read_node_slot(struct btrfs_root *root,
+				   struct extent_buffer *parent, int slot)
 {
-	struct btrfs_node *node = btrfs_buffer_node(parent_buf);
 	if (slot < 0)
 		return NULL;
-	if (slot >= btrfs_header_nritems(&node->header))
+	if (slot >= btrfs_header_nritems(parent))
 		return NULL;
-	return read_tree_block(root, btrfs_node_blockptr(node, slot));
+	return read_tree_block(root, btrfs_node_blockptr(parent, slot));
 }
 
 static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			 *root, struct btrfs_path *path, int level)
 {
-	struct buffer_head *right_buf;
-	struct buffer_head *mid_buf;
-	struct buffer_head *left_buf;
-	struct buffer_head *parent_buf = NULL;
-	struct btrfs_node *right = NULL;
-	struct btrfs_node *mid;
-	struct btrfs_node *left = NULL;
-	struct btrfs_node *parent = NULL;
+	struct extent_buffer *right = NULL;
+	struct extent_buffer *mid;
+	struct extent_buffer *left = NULL;
+	struct extent_buffer *parent = NULL;
 	int ret = 0;
 	int wret;
 	int pslot;
@@ -508,60 +604,57 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (level == 0)
 		return 0;
 
-	mid_buf = path->nodes[level];
-	mid = btrfs_buffer_node(mid_buf);
+	mid = path->nodes[level];
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
-		parent_buf = path->nodes[level + 1];
+		parent = path->nodes[level + 1];
 	pslot = path->slots[level + 1];
 
 	/*
 	 * deal with the case where there is only one pointer in the root
 	 * by promoting the node below to a root
 	 */
-	if (!parent_buf) {
-		struct buffer_head *child;
-		u64 blocknr = bh_blocknr(mid_buf);
+	if (!parent) {
+		struct extent_buffer *child;
+		u64 blocknr = extent_buffer_blocknr(mid);
 
-		if (btrfs_header_nritems(&mid->header) != 1)
+		if (btrfs_header_nritems(mid) != 1)
 			return 0;
 
 		/* promote the child to a root */
-		child = read_node_slot(root, mid_buf, 0);
+		child = read_node_slot(root, mid, 0);
 		BUG_ON(!child);
 		root->node = child;
 		path->nodes[level] = NULL;
-		clean_tree_block(trans, root, mid_buf);
-		wait_on_buffer(mid_buf);
+		clean_tree_block(trans, root, mid);
+		wait_on_tree_block_writeback(root, mid);
 		/* once for the path */
-		btrfs_block_release(root, mid_buf);
+		free_extent_buffer(mid);
 		/* once for the root ptr */
-		btrfs_block_release(root, mid_buf);
+		free_extent_buffer(mid);
 		return btrfs_free_extent(trans, root, blocknr, 1, 1);
 	}
-	parent = btrfs_buffer_node(parent_buf);
-
-	if (btrfs_header_nritems(&mid->header) >
+	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(&mid->header) < 2)
+	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
-	left_buf = read_node_slot(root, parent_buf, pslot - 1);
-	if (left_buf) {
-		wret = btrfs_cow_block(trans, root, left_buf,
-				       parent_buf, pslot - 1, &left_buf);
+	left = read_node_slot(root, parent, pslot - 1);
+	if (left) {
+		wret = btrfs_cow_block(trans, root, left,
+				       parent, pslot - 1, &left);
 		if (wret) {
 			ret = wret;
 			goto enospc;
 		}
 	}
-	right_buf = read_node_slot(root, parent_buf, pslot + 1);
-	if (right_buf) {
-		wret = btrfs_cow_block(trans, root, right_buf,
-				       parent_buf, pslot + 1, &right_buf);
+	right = read_node_slot(root, parent, pslot + 1);
+	if (right) {
+		wret = btrfs_cow_block(trans, root, right,
+				       parent, pslot + 1, &right);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -569,30 +662,27 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	/* first, try to make some room in the middle buffer */
-	if (left_buf) {
-		left = btrfs_buffer_node(left_buf);
-		orig_slot += btrfs_header_nritems(&left->header);
-		wret = push_node_left(trans, root, left_buf, mid_buf);
+	if (left) {
+		orig_slot += btrfs_header_nritems(left);
+		wret = push_node_left(trans, root, left, mid);
 		if (wret < 0)
 			ret = wret;
-		if (btrfs_header_nritems(&mid->header) < 2)
+		if (btrfs_header_nritems(mid) < 2)
 			err_on_enospc = 1;
 	}
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
-	if (right_buf) {
-		right = btrfs_buffer_node(right_buf);
-		wret = push_node_left(trans, root, mid_buf, right_buf);
+	if (right) {
+		wret = push_node_left(trans, root, mid, right);
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
-		if (btrfs_header_nritems(&right->header) == 0) {
-			u64 blocknr = bh_blocknr(right_buf);
-			clean_tree_block(trans, root, right_buf);
-			wait_on_buffer(right_buf);
-			btrfs_block_release(root, right_buf);
-			right_buf = NULL;
+		if (btrfs_header_nritems(right) == 0) {
+			u64 blocknr = extent_buffer_blocknr(right);
+			clean_tree_block(trans, root, right);
+			wait_on_tree_block_writeback(root, right);
+			free_extent_buffer(right);
 			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
 				       1);
@@ -602,14 +692,13 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			if (wret)
 				ret = wret;
 		} else {
-			btrfs_memcpy(root, parent,
-				     &parent->ptrs[pslot + 1].key,
-				     &right->ptrs[0].key,
-				     sizeof(struct btrfs_disk_key));
-			btrfs_mark_buffer_dirty(parent_buf);
+			struct btrfs_disk_key right_key;
+			btrfs_node_key(right, &right_key, 0);
+			btrfs_set_node_key(parent, &right_key, pslot + 1);
+			btrfs_mark_buffer_dirty(parent);
 		}
 	}
-	if (btrfs_header_nritems(&mid->header) == 1) {
+	if (btrfs_header_nritems(mid) == 1) {
 		/*
 		 * we're not allowed to leave a node with one item in the
 		 * tree during a delete.  A deletion from lower in the tree
@@ -619,21 +708,20 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		 * otherwise we would have pulled some pointers from the
 		 * right
 		 */
-		BUG_ON(!left_buf);
-		wret = balance_node_right(trans, root, mid_buf, left_buf);
+		BUG_ON(!left);
+		wret = balance_node_right(trans, root, mid, left);
 		if (wret < 0) {
 			ret = wret;
 			goto enospc;
 		}
 		BUG_ON(wret == 1);
 	}
-	if (btrfs_header_nritems(&mid->header) == 0) {
+	if (btrfs_header_nritems(mid) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 blocknr = bh_blocknr(mid_buf);
-		clean_tree_block(trans, root, mid_buf);
-		wait_on_buffer(mid_buf);
-		btrfs_block_release(root, mid_buf);
-		mid_buf = NULL;
+		u64 blocknr = extent_buffer_blocknr(mid);
+		clean_tree_block(trans, root, mid);
+		wait_on_tree_block_writeback(root, mid);
+		free_extent_buffer(mid);
 		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
@@ -643,37 +731,36 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 	} else {
 		/* update the parent key to reflect our changes */
-		btrfs_memcpy(root, parent,
-			     &parent->ptrs[pslot].key, &mid->ptrs[0].key,
-			     sizeof(struct btrfs_disk_key));
-		btrfs_mark_buffer_dirty(parent_buf);
+		struct btrfs_disk_key mid_key;
+		btrfs_node_key(mid, &mid_key, 0);
+		btrfs_set_node_key(parent, &mid_key, pslot);
+		btrfs_mark_buffer_dirty(parent);
 	}
 
 	/* update the path */
-	if (left_buf) {
-		if (btrfs_header_nritems(&left->header) > orig_slot) {
-			get_bh(left_buf);
-			path->nodes[level] = left_buf;
+	if (left) {
+		if (btrfs_header_nritems(left) > orig_slot) {
+			extent_buffer_get(left);
+			path->nodes[level] = left;
 			path->slots[level + 1] -= 1;
 			path->slots[level] = orig_slot;
-			if (mid_buf)
-				btrfs_block_release(root, mid_buf);
+			if (mid)
+				free_extent_buffer(mid);
 		} else {
-			orig_slot -= btrfs_header_nritems(&left->header);
+			orig_slot -= btrfs_header_nritems(left);
 			path->slots[level] = orig_slot;
 		}
 	}
 	/* double check we haven't messed things up */
 	check_block(root, path, level);
 	if (orig_ptr !=
-	    btrfs_node_blockptr(btrfs_buffer_node(path->nodes[level]),
-				path->slots[level]))
+	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
 enospc:
-	if (right_buf)
-		btrfs_block_release(root, right_buf);
-	if (left_buf)
-		btrfs_block_release(root, left_buf);
+	if (right)
+		free_extent_buffer(right);
+	if (left)
+		free_extent_buffer(left);
 	return ret;
 }
 
@@ -682,14 +769,10 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	struct buffer_head *right_buf;
-	struct buffer_head *mid_buf;
-	struct buffer_head *left_buf;
-	struct buffer_head *parent_buf = NULL;
-	struct btrfs_node *right = NULL;
-	struct btrfs_node *mid;
-	struct btrfs_node *left = NULL;
-	struct btrfs_node *parent = NULL;
+	struct extent_buffer *right = NULL;
+	struct extent_buffer *mid;
+	struct extent_buffer *left = NULL;
+	struct extent_buffer *parent = NULL;
 	int ret = 0;
 	int wret;
 	int pslot;
@@ -699,107 +782,101 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	if (level == 0)
 		return 1;
 
-	mid_buf = path->nodes[level];
-	mid = btrfs_buffer_node(mid_buf);
+	mid = path->nodes[level];
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
-		parent_buf = path->nodes[level + 1];
+		parent = path->nodes[level + 1];
 	pslot = path->slots[level + 1];
 
-	if (!parent_buf)
+	if (!parent)
 		return 1;
-	parent = btrfs_buffer_node(parent_buf);
 
-	left_buf = read_node_slot(root, parent_buf, pslot - 1);
+	left = read_node_slot(root, parent, pslot - 1);
 
 	/* first, try to make some room in the middle buffer */
-	if (left_buf) {
+	if (left) {
 		u32 left_nr;
-		left = btrfs_buffer_node(left_buf);
-		left_nr = btrfs_header_nritems(&left->header);
+		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
 		} else {
-			ret = btrfs_cow_block(trans, root, left_buf, parent_buf,
-					      pslot - 1, &left_buf);
+			ret = btrfs_cow_block(trans, root, left, parent,
+					      pslot - 1, &left);
 			if (ret)
 				wret = 1;
 			else {
-				left = btrfs_buffer_node(left_buf);
 				wret = push_node_left(trans, root,
-						      left_buf, mid_buf);
+						      left, mid);
 			}
 		}
 		if (wret < 0)
 			ret = wret;
 		if (wret == 0) {
+			struct btrfs_disk_key disk_key;
 			orig_slot += left_nr;
-			btrfs_memcpy(root, parent,
-				     &parent->ptrs[pslot].key,
-				     &mid->ptrs[0].key,
-				     sizeof(struct btrfs_disk_key));
-			btrfs_mark_buffer_dirty(parent_buf);
-			if (btrfs_header_nritems(&left->header) > orig_slot) {
-				path->nodes[level] = left_buf;
+			btrfs_node_key(mid, &disk_key, 0);
+			btrfs_set_node_key(parent, &disk_key, pslot);
+			btrfs_mark_buffer_dirty(parent);
+			if (btrfs_header_nritems(left) > orig_slot) {
+				path->nodes[level] = left;
 				path->slots[level + 1] -= 1;
 				path->slots[level] = orig_slot;
-				btrfs_block_release(root, mid_buf);
+				free_extent_buffer(mid);
 			} else {
 				orig_slot -=
-					btrfs_header_nritems(&left->header);
+					btrfs_header_nritems(left);
 				path->slots[level] = orig_slot;
-				btrfs_block_release(root, left_buf);
+				free_extent_buffer(left);
 			}
 			check_node(root, path, level);
 			return 0;
 		}
-		btrfs_block_release(root, left_buf);
+		free_extent_buffer(left);
 	}
-	right_buf = read_node_slot(root, parent_buf, pslot + 1);
+	right= read_node_slot(root, parent, pslot + 1);
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
-	if (right_buf) {
+	if (right) {
 		u32 right_nr;
-		right = btrfs_buffer_node(right_buf);
-		right_nr = btrfs_header_nritems(&right->header);
+		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
 		} else {
-			ret = btrfs_cow_block(trans, root, right_buf,
-					      parent_buf, pslot + 1,
-					      &right_buf);
+			ret = btrfs_cow_block(trans, root, right,
+					      parent, pslot + 1,
+					      &right);
 			if (ret)
 				wret = 1;
 			else {
-				right = btrfs_buffer_node(right_buf);
 				wret = balance_node_right(trans, root,
-							  right_buf, mid_buf);
+							  right, mid);
 			}
 		}
 		if (wret < 0)
 			ret = wret;
 		if (wret == 0) {
-			btrfs_memcpy(root, parent,
-				     &parent->ptrs[pslot + 1].key,
-				     &right->ptrs[0].key,
-				     sizeof(struct btrfs_disk_key));
-			btrfs_mark_buffer_dirty(parent_buf);
-			if (btrfs_header_nritems(&mid->header) <= orig_slot) {
-				path->nodes[level] = right_buf;
+			struct btrfs_disk_key disk_key;
+
+			btrfs_node_key(right, &disk_key, 0);
+			btrfs_set_node_key(parent, &disk_key, pslot + 1);
+			btrfs_mark_buffer_dirty(parent);
+
+			if (btrfs_header_nritems(mid) <= orig_slot) {
+				path->nodes[level] = right;
 				path->slots[level + 1] += 1;
 				path->slots[level] = orig_slot -
-					btrfs_header_nritems(&mid->header);
-				btrfs_block_release(root, mid_buf);
+					btrfs_header_nritems(mid);
+				free_extent_buffer(mid);
 			} else {
-				btrfs_block_release(root, right_buf);
+				free_extent_buffer(right);
 			}
 			check_node(root, path, level);
 			return 0;
 		}
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 	}
 	check_node(root, path, level);
 	return 1;
@@ -811,10 +888,9 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			     int level, int slot)
 {
-	struct btrfs_node *node;
+	struct extent_buffer *node;
 	int i;
 	u32 nritems;
-	u64 item_objectid;
 	u64 blocknr;
 	u64 search;
 	u64 cluster_start;
@@ -823,7 +899,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	int direction = path->reada;
 	struct radix_tree_root found;
 	unsigned long gang[8];
-	struct buffer_head *bh;
+	struct extent_buffer *eb;
 
 	if (level == 0)
 		return;
@@ -831,18 +907,17 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	if (!path->nodes[level])
 		return;
 
-	node = btrfs_buffer_node(path->nodes[level]);
+	node = path->nodes[level];
 	search = btrfs_node_blockptr(node, slot);
-	bh = btrfs_find_tree_block(root, search);
-	if (bh) {
-		brelse(bh);
+	eb = btrfs_find_tree_block(root, search);
+	if (eb) {
+		free_extent_buffer(eb);
 		return;
 	}
 
 	init_bit_radix(&found);
-	nritems = btrfs_header_nritems(&node->header);
+	nritems = btrfs_header_nritems(node);
 	for (i = slot; i < nritems; i++) {
-		item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
 		blocknr = btrfs_node_blockptr(node, i);
 		set_radix_bit(&found, blocknr);
 	}
@@ -886,8 +961,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow)
 {
-	struct buffer_head *b;
-	struct btrfs_node *c;
+	struct extent_buffer *b;
 	u64 blocknr;
 	int slot;
 	int ret;
@@ -901,10 +975,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 again:
 	b = root->node;
-	get_bh(b);
+	extent_buffer_get(b);
 	while (b) {
-		c = btrfs_buffer_node(b);
-		level = btrfs_header_level(&c->header);
+		level = btrfs_header_level(b);
 		if (cow) {
 			int wret;
 			wret = btrfs_cow_block(trans, root, b,
@@ -912,32 +985,30 @@ again:
 					       p->slots[level + 1],
 					       &b);
 			if (wret) {
-				btrfs_block_release(root, b);
+				free_extent_buffer(b);
 				return wret;
 			}
-			c = btrfs_buffer_node(b);
 		}
 		BUG_ON(!cow && ins_len);
-		if (level != btrfs_header_level(&c->header))
+		if (level != btrfs_header_level(b))
 			WARN_ON(1);
-		level = btrfs_header_level(&c->header);
+		level = btrfs_header_level(b);
 		p->nodes[level] = b;
 		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
-		ret = bin_search(c, key, &slot);
-		if (!btrfs_is_leaf(c)) {
+		ret = bin_search(b, key, level, &slot);
+		if (level != 0) {
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_header_nritems(&c->header) >=
+			if (ins_len > 0 && btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
 					return sret;
 				b = p->nodes[level];
-				c = btrfs_buffer_node(b);
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
 				int sret = balance_level(trans, root, p,
@@ -947,22 +1018,19 @@ again:
 				b = p->nodes[level];
 				if (!b)
 					goto again;
-				c = btrfs_buffer_node(b);
 				slot = p->slots[level];
-				BUG_ON(btrfs_header_nritems(&c->header) == 1);
+				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level)
 				break;
-			blocknr = btrfs_node_blockptr(c, slot);
+			blocknr = btrfs_node_blockptr(b, slot);
 			if (should_reada)
 				reada_for_search(root, p, level, slot);
-			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
-
+			b = read_tree_block(root, btrfs_node_blockptr(b, slot));
 		} else {
-			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_leaf_free_space(root, l) <
+			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
 			    sizeof(struct btrfs_item) + ins_len) {
 				int sret = split_leaf(trans, root, key,
 						      p, ins_len);
@@ -986,19 +1054,20 @@ again:
  * If this fails to write a tree block, it returns -1, but continues
  * fixing up the blocks in ram so the tree is consistent.
  */
-static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, struct btrfs_disk_key
-			  *key, int level)
+static int fixup_low_keys(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct btrfs_path *path,
+			  struct btrfs_disk_key *key, int level)
 {
 	int i;
 	int ret = 0;
+	struct extent_buffer *t;
+
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-		struct btrfs_node *t;
 		int tslot = path->slots[i];
 		if (!path->nodes[i])
 			break;
-		t = btrfs_buffer_node(path->nodes[i]);
-		btrfs_memcpy(root, t, &t->ptrs[tslot].key, key, sizeof(*key));
+		t = path->nodes[i];
+		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
@@ -1014,18 +1083,16 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans, struct btrfs_root
  * error, and > 0 if there was no room in the left hand block.
  */
 static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct buffer_head *dst_buf, struct
-			  buffer_head *src_buf)
+			  *root, struct extent_buffer *dst,
+			  struct extent_buffer *src)
 {
-	struct btrfs_node *src = btrfs_buffer_node(src_buf);
-	struct btrfs_node *dst = btrfs_buffer_node(dst_buf);
 	int push_items = 0;
 	int src_nritems;
 	int dst_nritems;
 	int ret = 0;
 
-	src_nritems = btrfs_header_nritems(&src->header);
-	dst_nritems = btrfs_header_nritems(&dst->header);
+	src_nritems = btrfs_header_nritems(src);
+	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
 
 	if (push_items <= 0) {
@@ -1035,17 +1102,21 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (src_nritems < push_items)
 		push_items = src_nritems;
 
-	btrfs_memcpy(root, dst, dst->ptrs + dst_nritems, src->ptrs,
-		     push_items * sizeof(struct btrfs_key_ptr));
+	copy_extent_buffer(dst, src,
+			   btrfs_node_key_ptr_offset(dst_nritems),
+			   btrfs_node_key_ptr_offset(0),
+		           push_items * sizeof(struct btrfs_key_ptr));
+
 	if (push_items < src_nritems) {
-		btrfs_memmove(root, src, src->ptrs, src->ptrs + push_items,
-			(src_nritems - push_items) *
-			sizeof(struct btrfs_key_ptr));
-	}
-	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
-	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
-	btrfs_mark_buffer_dirty(src_buf);
-	btrfs_mark_buffer_dirty(dst_buf);
+		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
+				      btrfs_node_key_ptr_offset(push_items),
+				      (src_nritems - push_items) *
+				      sizeof(struct btrfs_key_ptr));
+	}
+	btrfs_set_header_nritems(src, src_nritems - push_items);
+	btrfs_set_header_nritems(dst, dst_nritems + push_items);
+	btrfs_mark_buffer_dirty(src);
+	btrfs_mark_buffer_dirty(dst);
 	return ret;
 }
 
@@ -1058,24 +1129,22 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
  *
  * this will  only push up to 1/2 the contents of the left node over
  */
-static int balance_node_right(struct btrfs_trans_handle *trans, struct
-			      btrfs_root *root, struct buffer_head *dst_buf,
-			      struct buffer_head *src_buf)
+static int balance_node_right(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *dst,
+			      struct extent_buffer *src)
 {
-	struct btrfs_node *src = btrfs_buffer_node(src_buf);
-	struct btrfs_node *dst = btrfs_buffer_node(dst_buf);
 	int push_items = 0;
 	int max_push;
 	int src_nritems;
 	int dst_nritems;
 	int ret = 0;
 
-	src_nritems = btrfs_header_nritems(&src->header);
-	dst_nritems = btrfs_header_nritems(&dst->header);
+	src_nritems = btrfs_header_nritems(src);
+	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
@@ -1085,18 +1154,21 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
 	if (max_push < push_items)
 		push_items = max_push;
 
-	btrfs_memmove(root, dst, dst->ptrs + push_items, dst->ptrs,
-		      dst_nritems * sizeof(struct btrfs_key_ptr));
+	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
+				      btrfs_node_key_ptr_offset(0),
+				      (dst_nritems) *
+				      sizeof(struct btrfs_key_ptr));
 
-	btrfs_memcpy(root, dst, dst->ptrs,
-		     src->ptrs + src_nritems - push_items,
-		     push_items * sizeof(struct btrfs_key_ptr));
+	copy_extent_buffer(dst, src,
+			   btrfs_node_key_ptr_offset(0),
+			   btrfs_node_key_ptr_offset(src_nritems - push_items),
+		           push_items * sizeof(struct btrfs_key_ptr));
 
-	btrfs_set_header_nritems(&src->header, src_nritems - push_items);
-	btrfs_set_header_nritems(&dst->header, dst_nritems + push_items);
+	btrfs_set_header_nritems(src, src_nritems - push_items);
+	btrfs_set_header_nritems(dst, dst_nritems + push_items);
 
-	btrfs_mark_buffer_dirty(src_buf);
-	btrfs_mark_buffer_dirty(dst_buf);
+	btrfs_mark_buffer_dirty(src);
+	btrfs_mark_buffer_dirty(dst);
 	return ret;
 }
 
@@ -1107,45 +1179,46 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct
  *
  * returns zero on success or < 0 on failure.
  */
-static int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int level)
+static int insert_new_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_path *path, int level)
 {
-	struct buffer_head *t;
-	struct btrfs_node *lower;
-	struct btrfs_node *c;
-	struct btrfs_disk_key *lower_key;
+	struct extent_buffer *lower;
+	struct extent_buffer *c;
+	struct btrfs_disk_key lower_key;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	t = btrfs_alloc_free_block(trans, root, root->node->b_blocknr, 0);
-	if (IS_ERR(t))
-		return PTR_ERR(t);
-	c = btrfs_buffer_node(t);
-	memset(c, 0, root->blocksize);
-	btrfs_set_header_nritems(&c->header, 1);
-	btrfs_set_header_level(&c->header, level);
-	btrfs_set_header_blocknr(&c->header, bh_blocknr(t));
-	btrfs_set_header_generation(&c->header, trans->transid);
-	btrfs_set_header_owner(&c->header, root->root_key.objectid);
-	lower = btrfs_buffer_node(path->nodes[level-1]);
-	memcpy(c->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(c->header.fsid));
-	if (btrfs_is_leaf(lower))
-		lower_key = &((struct btrfs_leaf *)lower)->items[0].key;
+	c = btrfs_alloc_free_block(trans, root,
+				   extent_buffer_blocknr(root->node), 0);
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+	memset_extent_buffer(c, 0, 0, root->nodesize);
+	btrfs_set_header_nritems(c, 1);
+	btrfs_set_header_level(c, level);
+	btrfs_set_header_blocknr(c, extent_buffer_blocknr(c));
+	btrfs_set_header_generation(c, trans->transid);
+	btrfs_set_header_owner(c, root->root_key.objectid);
+	lower = path->nodes[level-1];
+
+	write_extent_buffer(c, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(c),
+			    BTRFS_FSID_SIZE);
+	if (level == 1)
+		btrfs_item_key(lower, &lower_key, 0);
 	else
-		lower_key = &lower->ptrs[0].key;
-	btrfs_memcpy(root, c, &c->ptrs[0].key, lower_key,
-		     sizeof(struct btrfs_disk_key));
-	btrfs_set_node_blockptr(c, 0, bh_blocknr(path->nodes[level - 1]));
+		btrfs_node_key(lower, &lower_key, 0);
+	btrfs_set_node_key(c, &lower_key, 0);
+	btrfs_set_node_blockptr(c, 0, extent_buffer_blocknr(lower));
 
-	btrfs_mark_buffer_dirty(t);
+	btrfs_mark_buffer_dirty(c);
 
 	/* the super has an extra ref to root->node */
-	btrfs_block_release(root, root->node);
-	root->node = t;
-	get_bh(t);
-	path->nodes[level] = t;
+	free_extent_buffer(root->node);
+	root->node = c;
+	extent_buffer_get(c);
+	path->nodes[level] = c;
 	path->slots[level] = 0;
 	return 0;
 }
@@ -1163,26 +1236,26 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, struct btrfs_disk_key
 		      *key, u64 blocknr, int slot, int level)
 {
-	struct btrfs_node *lower;
+	struct extent_buffer *lower;
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
-	lower = btrfs_buffer_node(path->nodes[level]);
-	nritems = btrfs_header_nritems(&lower->header);
+	lower = path->nodes[level];
+	nritems = btrfs_header_nritems(lower);
 	if (slot > nritems)
 		BUG();
 	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
 		BUG();
 	if (slot != nritems) {
-		btrfs_memmove(root, lower, lower->ptrs + slot + 1,
-			      lower->ptrs + slot,
+		memmove_extent_buffer(lower,
+			      btrfs_node_key_ptr_offset(slot + 1),
+			      btrfs_node_key_ptr_offset(slot),
 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
 	}
-	btrfs_memcpy(root, lower, &lower->ptrs[slot].key,
-		     key, sizeof(struct btrfs_disk_key));
+	btrfs_set_node_key(lower, key, slot);
 	btrfs_set_node_blockptr(lower, slot, blocknr);
-	btrfs_set_header_nritems(&lower->header, nritems + 1);
-	btrfs_mark_buffer_dirty(path->nodes[level]);
+	btrfs_set_header_nritems(lower, nritems + 1);
+	btrfs_mark_buffer_dirty(lower);
 	check_node(root, path, level);
 	return 0;
 }
@@ -1199,69 +1272,73 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level)
 {
-	struct buffer_head *t;
-	struct btrfs_node *c;
-	struct buffer_head *split_buffer;
-	struct btrfs_node *split;
+	struct extent_buffer *c;
+	struct extent_buffer *split;
+	struct btrfs_disk_key disk_key;
 	int mid;
 	int ret;
 	int wret;
 	u32 c_nritems;
 
-	t = path->nodes[level];
-	c = btrfs_buffer_node(t);
-	if (t == root->node) {
+	c = path->nodes[level];
+	if (c == root->node) {
 		/* trying to split the root, lets make a new one */
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
 	} else {
 		ret = push_nodes_for_insert(trans, root, path, level);
-		t = path->nodes[level];
-		c = btrfs_buffer_node(t);
-		if (!ret &&
-		    btrfs_header_nritems(&c->header) <
+		c = path->nodes[level];
+		if (!ret && btrfs_header_nritems(c) <
 		    BTRFS_NODEPTRS_PER_BLOCK(root) - 1)
 			return 0;
 		if (ret < 0)
 			return ret;
 	}
 
-	c_nritems = btrfs_header_nritems(&c->header);
-	split_buffer = btrfs_alloc_free_block(trans, root, t->b_blocknr, 0);
-	if (IS_ERR(split_buffer))
-		return PTR_ERR(split_buffer);
+	c_nritems = btrfs_header_nritems(c);
+	split = btrfs_alloc_free_block(trans, root,
+				       extent_buffer_blocknr(c), 0);
+	if (IS_ERR(split))
+		return PTR_ERR(split);
+
+	btrfs_set_header_flags(split, btrfs_header_flags(c));
+	btrfs_set_header_level(split, btrfs_header_level(c));
+	btrfs_set_header_blocknr(split, extent_buffer_blocknr(split));
+	btrfs_set_header_generation(split, trans->transid);
+	btrfs_set_header_owner(split, root->root_key.objectid);
+	write_extent_buffer(split, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(split),
+			    BTRFS_FSID_SIZE);
 
-	split = btrfs_buffer_node(split_buffer);
-	btrfs_set_header_flags(&split->header, btrfs_header_flags(&c->header));
-	btrfs_set_header_level(&split->header, btrfs_header_level(&c->header));
-	btrfs_set_header_blocknr(&split->header, bh_blocknr(split_buffer));
-	btrfs_set_header_generation(&split->header, trans->transid);
-	btrfs_set_header_owner(&split->header, root->root_key.objectid);
-	memcpy(split->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(split->header.fsid));
 	mid = (c_nritems + 1) / 2;
-	btrfs_memcpy(root, split, split->ptrs, c->ptrs + mid,
-		     (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
-	btrfs_set_header_nritems(&split->header, c_nritems - mid);
-	btrfs_set_header_nritems(&c->header, mid);
+
+	copy_extent_buffer(split, c,
+			   btrfs_node_key_ptr_offset(0),
+			   btrfs_node_key_ptr_offset(mid),
+			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
+	btrfs_set_header_nritems(split, c_nritems - mid);
+	btrfs_set_header_nritems(c, mid);
 	ret = 0;
 
-	btrfs_mark_buffer_dirty(t);
-	btrfs_mark_buffer_dirty(split_buffer);
-	wret = insert_ptr(trans, root, path, &split->ptrs[0].key,
-			  bh_blocknr(split_buffer), path->slots[level + 1] + 1,
+	btrfs_mark_buffer_dirty(c);
+	btrfs_mark_buffer_dirty(split);
+
+	btrfs_node_key(split, &disk_key, 0);
+	wret = insert_ptr(trans, root, path, &disk_key,
+			  extent_buffer_blocknr(split),
+			  path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
 		ret = wret;
 
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
-		btrfs_block_release(root, t);
-		path->nodes[level] = split_buffer;
+		free_extent_buffer(c);
+		path->nodes[level] = split;
 		path->slots[level + 1] += 1;
 	} else {
-		btrfs_block_release(root, split_buffer);
+		free_extent_buffer(split);
 	}
 	return ret;
 }
@@ -1271,16 +1348,16 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  * and nr indicate which items in the leaf to check.  This totals up the
  * space used both by the item structs and the item data
  */
-static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
+static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
 	int data_len;
-	int nritems = btrfs_header_nritems(&l->header);
+	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_end(l->items + start);
-	data_len = data_len - btrfs_item_offset(l->items + end);
+	data_len = btrfs_item_end_nr(l, start);
+	data_len = data_len - btrfs_item_offset_nr(l, end);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
@@ -1291,10 +1368,17 @@ static int leaf_space_used(struct btrfs_leaf *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf)
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf)
 {
-	int nritems = btrfs_header_nritems(&leaf->header);
-	return BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+	int nritems = btrfs_header_nritems(leaf);
+	int ret;
+	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
+	if (ret < 0) {
+		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
+		       ret, BTRFS_LEAF_DATA_SIZE(root),
+		       leaf_space_used(leaf, 0, nritems), nritems);
+	}
+	return ret;
 }
 
 /*
@@ -1307,12 +1391,10 @@ int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf)
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 			   *root, struct btrfs_path *path, int data_size)
 {
-	struct buffer_head *left_buf = path->nodes[0];
-	struct btrfs_leaf *left = btrfs_buffer_leaf(left_buf);
-	struct btrfs_leaf *right;
-	struct buffer_head *right_buf;
-	struct buffer_head *upper;
-	struct btrfs_node *upper_node;
+	struct extent_buffer *left = path->nodes[0];
+	struct extent_buffer *right;
+	struct extent_buffer *upper;
+	struct btrfs_disk_key disk_key;
 	int slot;
 	int i;
 	int free_space;
@@ -1321,6 +1403,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_item *item;
 	u32 left_nritems;
 	u32 right_nritems;
+	u32 data_end;
 	int ret;
 
 	slot = path->slots[1];
@@ -1328,102 +1411,109 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 	upper = path->nodes[1];
-	upper_node = btrfs_buffer_node(upper);
-	if (slot >= btrfs_header_nritems(&upper_node->header) - 1) {
+	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
-	}
-	right_buf = read_tree_block(root,
-		    btrfs_node_blockptr(btrfs_buffer_node(upper), slot + 1));
-	right = btrfs_buffer_leaf(right_buf);
+
+	right = read_tree_block(root, btrfs_node_blockptr(upper, slot + 1));
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 		return 1;
 	}
+
 	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, right_buf, upper,
-			      slot + 1, &right_buf);
+	ret = btrfs_cow_block(trans, root, right, upper,
+			      slot + 1, &right);
 	if (ret) {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 		return 1;
 	}
-	right = btrfs_buffer_leaf(right_buf);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 		return 1;
 	}
 
-	left_nritems = btrfs_header_nritems(&left->header);
+	left_nritems = btrfs_header_nritems(left);
 	if (left_nritems == 0) {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 		return 1;
 	}
+
 	for (i = left_nritems - 1; i >= 1; i--) {
-		item = left->items + i;
+		item = btrfs_item_nr(left, i);
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (btrfs_item_size(item) + sizeof(*item) + push_space >
+		if (btrfs_item_size(left, item) + sizeof(*item) + push_space >
 		    free_space)
 			break;
 		push_items++;
-		push_space += btrfs_item_size(item) + sizeof(*item);
+		push_space += btrfs_item_size(left, item) + sizeof(*item);
 	}
+
 	if (push_items == 0) {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 		return 1;
 	}
+
 	if (push_items == left_nritems)
 		WARN_ON(1);
-	right_nritems = btrfs_header_nritems(&right->header);
+
 	/* push left to right */
-	push_space = btrfs_item_end(left->items + left_nritems - push_items);
+	right_nritems = btrfs_header_nritems(right);
+	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
 	push_space -= leaf_data_end(root, left);
+
 	/* make room in the right data area */
-	btrfs_memmove(root, right, btrfs_leaf_data(right) +
-		      leaf_data_end(root, right) - push_space,
-		      btrfs_leaf_data(right) +
-		      leaf_data_end(root, right), BTRFS_LEAF_DATA_SIZE(root) -
-		      leaf_data_end(root, right));
+	data_end = leaf_data_end(root, right);
+	memmove_extent_buffer(right,
+			      btrfs_leaf_data(right) + data_end - push_space,
+			      btrfs_leaf_data(right) + data_end,
+			      BTRFS_LEAF_DATA_SIZE(root) - data_end);
+
 	/* copy from the left data area */
-	btrfs_memcpy(root, right, btrfs_leaf_data(right) +
+	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
 		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
 		     btrfs_leaf_data(left) + leaf_data_end(root, left),
 		     push_space);
-	btrfs_memmove(root, right, right->items + push_items, right->items,
-		right_nritems * sizeof(struct btrfs_item));
+
+	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
+			      btrfs_item_nr_offset(0),
+			      right_nritems * sizeof(struct btrfs_item));
+
 	/* copy the items from left to right */
-	btrfs_memcpy(root, right, right->items, left->items +
-		     left_nritems - push_items,
-		     push_items * sizeof(struct btrfs_item));
+	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
+		   btrfs_item_nr_offset(left_nritems - push_items),
+		   push_items * sizeof(struct btrfs_item));
 
 	/* update the item pointers */
 	right_nritems += push_items;
-	btrfs_set_header_nritems(&right->header, right_nritems);
+	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
-		btrfs_set_item_offset(right->items + i, push_space -
-				      btrfs_item_size(right->items + i));
-		push_space = btrfs_item_offset(right->items + i);
+		item = btrfs_item_nr(right, i);
+		btrfs_set_item_offset(right, item, push_space -
+				      btrfs_item_size(right, item));
+		push_space = btrfs_item_offset(right, item);
 	}
 	left_nritems -= push_items;
-	btrfs_set_header_nritems(&left->header, left_nritems);
+	btrfs_set_header_nritems(left, left_nritems);
 
-	btrfs_mark_buffer_dirty(left_buf);
-	btrfs_mark_buffer_dirty(right_buf);
+	btrfs_mark_buffer_dirty(left);
+	btrfs_mark_buffer_dirty(right);
 
-	btrfs_memcpy(root, upper_node, &upper_node->ptrs[slot + 1].key,
-		&right->items[0].key, sizeof(struct btrfs_disk_key));
+	btrfs_item_key(right, &disk_key, 0);
+	btrfs_set_node_key(upper, &disk_key, slot + 1);
 	btrfs_mark_buffer_dirty(upper);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
-		btrfs_block_release(root, path->nodes[0]);
-		path->nodes[0] = right_buf;
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
 		path->slots[1] += 1;
 	} else {
-		btrfs_block_release(root, right_buf);
+		free_extent_buffer(right);
 	}
 	if (path->nodes[1])
 		check_node(root, path, 1);
@@ -1436,10 +1526,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, int data_size)
 {
-	struct buffer_head *right_buf = path->nodes[0];
-	struct btrfs_leaf *right = btrfs_buffer_leaf(right_buf);
-	struct buffer_head *t;
-	struct btrfs_leaf *left;
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *right = path->nodes[0];
+	struct extent_buffer *left;
 	int slot;
 	int i;
 	int free_space;
@@ -1447,119 +1536,128 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	int push_items = 0;
 	struct btrfs_item *item;
 	u32 old_left_nritems;
+	u32 right_nritems;
 	int ret = 0;
 	int wret;
 
 	slot = path->slots[1];
-	if (slot == 0) {
+	if (slot == 0)
 		return 1;
-	}
-	if (!path->nodes[1]) {
+	if (!path->nodes[1])
 		return 1;
-	}
-	t = read_tree_block(root,
-	    btrfs_node_blockptr(btrfs_buffer_node(path->nodes[1]), slot - 1));
-	left = btrfs_buffer_leaf(t);
+
+	left = read_tree_block(root, btrfs_node_blockptr(path->nodes[1],
+							 slot - 1));
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		btrfs_block_release(root, t);
+		free_extent_buffer(left);
 		return 1;
 	}
 
 	/* cow and double check */
-	ret = btrfs_cow_block(trans, root, t, path->nodes[1], slot - 1, &t);
+	ret = btrfs_cow_block(trans, root, left,
+			      path->nodes[1], slot - 1, &left);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
-		btrfs_block_release(root, t);
+		free_extent_buffer(left);
 		return 1;
 	}
-	left = btrfs_buffer_leaf(t);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		btrfs_block_release(root, t);
+		free_extent_buffer(left);
 		return 1;
 	}
 
-	if (btrfs_header_nritems(&right->header) == 0) {
-		btrfs_block_release(root, t);
+	right_nritems = btrfs_header_nritems(right);
+	if (right_nritems == 0) {
+		free_extent_buffer(left);
 		return 1;
 	}
 
-	for (i = 0; i < btrfs_header_nritems(&right->header) - 1; i++) {
-		item = right->items + i;
+	for (i = 0; i < right_nritems - 1; i++) {
+		item = btrfs_item_nr(right, i);
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (btrfs_item_size(item) + sizeof(*item) + push_space >
+		if (btrfs_item_size(right, item) + sizeof(*item) + push_space >
 		    free_space)
 			break;
 		push_items++;
-		push_space += btrfs_item_size(item) + sizeof(*item);
+		push_space += btrfs_item_size(right, item) + sizeof(*item);
 	}
 	if (push_items == 0) {
-		btrfs_block_release(root, t);
+		free_extent_buffer(left);
 		return 1;
 	}
-	if (push_items == btrfs_header_nritems(&right->header))
+	if (push_items == btrfs_header_nritems(right))
 		WARN_ON(1);
+
 	/* push data from right to left */
-	btrfs_memcpy(root, left, left->items +
-		     btrfs_header_nritems(&left->header),
-		     right->items, push_items * sizeof(struct btrfs_item));
+	copy_extent_buffer(left, right,
+			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
+			   btrfs_item_nr_offset(0),
+			   push_items * sizeof(struct btrfs_item));
+
 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
-		     btrfs_item_offset(right->items + push_items -1);
-	btrfs_memcpy(root, left, btrfs_leaf_data(left) +
+		     btrfs_item_offset_nr(right, push_items -1);
+
+	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
 		     leaf_data_end(root, left) - push_space,
 		     btrfs_leaf_data(right) +
-		     btrfs_item_offset(right->items + push_items - 1),
+		     btrfs_item_offset_nr(right, push_items - 1),
 		     push_space);
-	old_left_nritems = btrfs_header_nritems(&left->header);
+	old_left_nritems = btrfs_header_nritems(left);
 	BUG_ON(old_left_nritems < 0);
 
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
-		u32 ioff = btrfs_item_offset(left->items + i);
-		btrfs_set_item_offset(left->items + i, ioff -
-				     (BTRFS_LEAF_DATA_SIZE(root) -
-				      btrfs_item_offset(left->items +
-						        old_left_nritems - 1)));
+		u32 ioff;
+		item = btrfs_item_nr(left, i);
+		ioff = btrfs_item_offset(left, item);
+		btrfs_set_item_offset(left, item,
+		      ioff - (BTRFS_LEAF_DATA_SIZE(root) -
+		      btrfs_item_offset_nr(left, old_left_nritems - 1)));
 	}
-	btrfs_set_header_nritems(&left->header, old_left_nritems + push_items);
+	btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
 	/* fixup right node */
-	push_space = btrfs_item_offset(right->items + push_items - 1) -
-		     leaf_data_end(root, right);
-	btrfs_memmove(root, right, btrfs_leaf_data(right) +
-		      BTRFS_LEAF_DATA_SIZE(root) - push_space,
-		      btrfs_leaf_data(right) +
-		      leaf_data_end(root, right), push_space);
-	btrfs_memmove(root, right, right->items, right->items + push_items,
-		(btrfs_header_nritems(&right->header) - push_items) *
-		sizeof(struct btrfs_item));
-	btrfs_set_header_nritems(&right->header,
-				 btrfs_header_nritems(&right->header) -
-				 push_items);
+	push_space = btrfs_item_offset_nr(right, push_items - 1) -
+					  leaf_data_end(root, right);
+	memmove_extent_buffer(right, btrfs_leaf_data(right) +
+			      BTRFS_LEAF_DATA_SIZE(root) - push_space,
+			      btrfs_leaf_data(right) +
+			      leaf_data_end(root, right), push_space);
+
+	memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+			      btrfs_item_nr_offset(push_items),
+			     (btrfs_header_nritems(right) - push_items) *
+			     sizeof(struct btrfs_item));
+
+	right_nritems = btrfs_header_nritems(right) - push_items;
+	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 
-	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
-		btrfs_set_item_offset(right->items + i, push_space -
-				      btrfs_item_size(right->items + i));
-		push_space = btrfs_item_offset(right->items + i);
+	for (i = 0; i < right_nritems; i++) {
+		item = btrfs_item_nr(right, i);
+		btrfs_set_item_offset(right, item, push_space -
+				      btrfs_item_size(right, item));
+		push_space = btrfs_item_offset(right, item);
 	}
 
-	btrfs_mark_buffer_dirty(t);
-	btrfs_mark_buffer_dirty(right_buf);
+	btrfs_mark_buffer_dirty(left);
+	btrfs_mark_buffer_dirty(right);
 
-	wret = fixup_low_keys(trans, root, path, &right->items[0].key, 1);
+	btrfs_item_key(right, &disk_key, 0);
+	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	if (wret)
 		ret = wret;
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		btrfs_block_release(root, path->nodes[0]);
-		path->nodes[0] = t;
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = left;
 		path->slots[1] -= 1;
 	} else {
-		btrfs_block_release(root, t);
+		free_extent_buffer(left);
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
@@ -1578,13 +1676,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *ins_key,
 		      struct btrfs_path *path, int data_size)
 {
-	struct buffer_head *l_buf;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
 	int slot;
-	struct btrfs_leaf *right;
-	struct buffer_head *right_buffer;
+	struct extent_buffer *right;
 	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
@@ -1603,8 +1699,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			return wret;
 	}
-	l_buf = path->nodes[0];
-	l = btrfs_buffer_leaf(l_buf);
+	l = path->nodes[0];
 
 	/* did the pushes work? */
 	if (btrfs_leaf_free_space(root, l) >=
@@ -1617,36 +1712,38 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			return ret;
 	}
 	slot = path->slots[0];
-	nritems = btrfs_header_nritems(&l->header);
+	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
 
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
-	if (IS_ERR(right_buffer))
-		return PTR_ERR(right_buffer);
-
-	right = btrfs_buffer_leaf(right_buffer);
-	memset(&right->header, 0, sizeof(right->header));
-	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
-	btrfs_set_header_generation(&right->header, trans->transid);
-	btrfs_set_header_owner(&right->header, root->root_key.objectid);
-	btrfs_set_header_level(&right->header, 0);
-	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(right->header.fsid));
+	right = btrfs_alloc_free_block(trans, root,
+					      extent_buffer_blocknr(l), 0);
+	if (IS_ERR(right))
+		return PTR_ERR(right);
+
+	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_blocknr(right, extent_buffer_blocknr(right));
+	btrfs_set_header_generation(right, trans->transid);
+	btrfs_set_header_owner(right, root->root_key.objectid);
+	btrfs_set_header_level(right, 0);
+	write_extent_buffer(right, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(right),
+			    BTRFS_FSID_SIZE);
+
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + space_needed >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot >= nritems) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(&right->header, 0);
+				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
-						  bh_blocknr(right_buffer),
+						  extent_buffer_blocknr(right),
 						  path->slots[1] + 1, 1);
 				if (wret)
 					ret = wret;
-				btrfs_block_release(root, path->nodes[0]);
-				path->nodes[0] = right_buffer;
+				free_extent_buffer(path->nodes[0]);
+				path->nodes[0] = right;
 				path->slots[0] = 0;
 				path->slots[1] += 1;
 				return ret;
@@ -1659,15 +1756,15 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot == 0) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(&right->header, 0);
+				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
-						  bh_blocknr(right_buffer),
+						  extent_buffer_blocknr(right),
 						  path->slots[1], 1);
 				if (wret)
 					ret = wret;
-				btrfs_block_release(root, path->nodes[0]);
-				path->nodes[0] = right_buffer;
+				free_extent_buffer(path->nodes[0]);
+				path->nodes[0] = right;
 				path->slots[0] = 0;
 				if (path->slots[1] == 0) {
 					wret = fixup_low_keys(trans, root,
@@ -1681,61 +1778,74 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 			double_split = 1;
 		}
 	}
-	btrfs_set_header_nritems(&right->header, nritems - mid);
-	data_copy_size = btrfs_item_end(l->items + mid) -
-			 leaf_data_end(root, l);
-	btrfs_memcpy(root, right, right->items, l->items + mid,
-		     (nritems - mid) * sizeof(struct btrfs_item));
-	btrfs_memcpy(root, right,
+	nritems = nritems - mid;
+	btrfs_set_header_nritems(right, nritems);
+	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+			   btrfs_item_nr_offset(mid),
+			   nritems * sizeof(struct btrfs_item));
+
+	copy_extent_buffer(right, l,
 		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
 		     data_copy_size, btrfs_leaf_data(l) +
 		     leaf_data_end(root, l), data_copy_size);
+
 	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
-		      btrfs_item_end(l->items + mid);
+		      btrfs_item_end_nr(l, mid);
 
-	for (i = 0; i < btrfs_header_nritems(&right->header); i++) {
-		u32 ioff = btrfs_item_offset(right->items + i);
-		btrfs_set_item_offset(right->items + i, ioff + rt_data_off);
+	for (i = 0; i < nritems; i++) {
+		struct btrfs_item *item = btrfs_item_nr(right, i);
+		u32 ioff = btrfs_item_offset(right, item);
+		btrfs_set_item_offset(right, item, ioff + rt_data_off);
 	}
 
-	btrfs_set_header_nritems(&l->header, mid);
+	btrfs_set_header_nritems(l, mid);
 	ret = 0;
-	wret = insert_ptr(trans, root, path, &right->items[0].key,
-			  bh_blocknr(right_buffer), path->slots[1] + 1, 1);
+	btrfs_item_key(right, &disk_key, 0);
+	wret = insert_ptr(trans, root, path, &disk_key,
+			  extent_buffer_blocknr(right), path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
-	btrfs_mark_buffer_dirty(right_buffer);
-	btrfs_mark_buffer_dirty(l_buf);
+
+	btrfs_mark_buffer_dirty(right);
+	btrfs_mark_buffer_dirty(l);
 	BUG_ON(path->slots[0] != slot);
+
 	if (mid <= slot) {
-		btrfs_block_release(root, path->nodes[0]);
-		path->nodes[0] = right_buffer;
+		free_extent_buffer(path->nodes[0]);
+		path->nodes[0] = right;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
 	} else
-		btrfs_block_release(root, right_buffer);
+		free_extent_buffer(right);
+
 	BUG_ON(path->slots[0] < 0);
 	check_node(root, path, 1);
+	check_leaf(root, path, 0);
 
 	if (!double_split)
 		return ret;
-	right_buffer = btrfs_alloc_free_block(trans, root, l_buf->b_blocknr, 0);
-	if (IS_ERR(right_buffer))
-		return PTR_ERR(right_buffer);
-
-	right = btrfs_buffer_leaf(right_buffer);
-	memset(&right->header, 0, sizeof(right->header));
-	btrfs_set_header_blocknr(&right->header, bh_blocknr(right_buffer));
-	btrfs_set_header_generation(&right->header, trans->transid);
-	btrfs_set_header_owner(&right->header, root->root_key.objectid);
-	btrfs_set_header_level(&right->header, 0);
-	memcpy(right->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(right->header.fsid));
+
+	right = btrfs_alloc_free_block(trans, root,
+				       extent_buffer_blocknr(l), 0);
+	if (IS_ERR(right))
+		return PTR_ERR(right);
+
+	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_blocknr(right, extent_buffer_blocknr(right));
+	btrfs_set_header_generation(right, trans->transid);
+	btrfs_set_header_owner(right, root->root_key.objectid);
+	btrfs_set_header_level(right, 0);
+	write_extent_buffer(right, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(right),
+			    BTRFS_FSID_SIZE);
+
 	btrfs_cpu_key_to_disk(&disk_key, ins_key);
-	btrfs_set_header_nritems(&right->header, 0);
+	btrfs_set_header_nritems(right, 0);
 	wret = insert_ptr(trans, root, path,
 			  &disk_key,
-			  bh_blocknr(right_buffer),
+			  extent_buffer_blocknr(right),
 			  path->slots[1], 1);
 	if (wret)
 		ret = wret;
@@ -1744,8 +1854,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret)
 			ret = wret;
 	}
-	btrfs_block_release(root, path->nodes[0]);
-	path->nodes[0] = right_buffer;
+	free_extent_buffer(path->nodes[0]);
+	path->nodes[0] = right;
 	path->slots[0] = 0;
 	check_node(root, path, 1);
 	check_leaf(root, path, 0);
@@ -1760,8 +1870,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int slot;
 	int slot_orig;
-	struct btrfs_leaf *leaf;
-	struct buffer_head *leaf_buf;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
 	u32 nritems;
 	unsigned int data_end;
 	unsigned int old_data_start;
@@ -1770,15 +1880,14 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	int i;
 
 	slot_orig = path->slots[0];
-	leaf_buf = path->nodes[0];
-	leaf = btrfs_buffer_leaf(leaf_buf);
+	leaf = path->nodes[0];
 
-	nritems = btrfs_header_nritems(&leaf->header);
+	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
 
 	slot = path->slots[0];
-	old_data_start = btrfs_item_offset(leaf->items + slot);
-	old_size = btrfs_item_size(leaf->items + slot);
+	old_data_start = btrfs_item_offset_nr(leaf, slot);
+	old_size = btrfs_item_size_nr(leaf, slot);
 	BUG_ON(old_size <= new_size);
 	size_diff = old_size - new_size;
 
@@ -1790,32 +1899,38 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	 */
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
-		u32 ioff = btrfs_item_offset(leaf->items + i);
-		btrfs_set_item_offset(leaf->items + i,
-				      ioff + size_diff);
+		u32 ioff;
+		item = btrfs_item_nr(leaf, i);
+		ioff = btrfs_item_offset(leaf, item);
+		btrfs_set_item_offset(leaf, item, ioff + size_diff);
 	}
 	/* shift the data */
-	btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end + size_diff, btrfs_leaf_data(leaf) +
 		      data_end, old_data_start + new_size - data_end);
-	btrfs_set_item_size(leaf->items + slot, new_size);
-	btrfs_mark_buffer_dirty(leaf_buf);
+
+	item = btrfs_item_nr(leaf, slot);
+	btrfs_set_item_size(leaf, item, new_size);
+	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
-	if (btrfs_leaf_free_space(root, leaf) < 0)
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
 		BUG();
+	}
 	check_leaf(root, path, 0);
 	return ret;
 }
 
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, u32 data_size)
+int btrfs_extend_item(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct btrfs_path *path,
+		      u32 data_size)
 {
 	int ret = 0;
 	int slot;
 	int slot_orig;
-	struct btrfs_leaf *leaf;
-	struct buffer_head *leaf_buf;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
 	u32 nritems;
 	unsigned int data_end;
 	unsigned int old_data;
@@ -1823,16 +1938,17 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	int i;
 
 	slot_orig = path->slots[0];
-	leaf_buf = path->nodes[0];
-	leaf = btrfs_buffer_leaf(leaf_buf);
+	leaf = path->nodes[0];
 
-	nritems = btrfs_header_nritems(&leaf->header);
+	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
 
-	if (btrfs_leaf_free_space(root, leaf) < data_size)
+	if (btrfs_leaf_free_space(root, leaf) < data_size) {
+		btrfs_print_leaf(root, leaf);
 		BUG();
+	}
 	slot = path->slots[0];
-	old_data = btrfs_item_end(leaf->items + slot);
+	old_data = btrfs_item_end_nr(leaf, slot);
 
 	BUG_ON(slot < 0);
 	BUG_ON(slot >= nritems);
@@ -1842,22 +1958,28 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	 */
 	/* first correct the data pointers */
 	for (i = slot; i < nritems; i++) {
-		u32 ioff = btrfs_item_offset(leaf->items + i);
-		btrfs_set_item_offset(leaf->items + i,
-				      ioff - data_size);
+		u32 ioff;
+		item = btrfs_item_nr(leaf, i);
+		ioff = btrfs_item_offset(leaf, item);
+		btrfs_set_item_offset(leaf, item, ioff - data_size);
 	}
+
 	/* shift the data */
-	btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end - data_size, btrfs_leaf_data(leaf) +
 		      data_end, old_data - data_end);
+
 	data_end = old_data;
-	old_size = btrfs_item_size(leaf->items + slot);
-	btrfs_set_item_size(leaf->items + slot, old_size + data_size);
-	btrfs_mark_buffer_dirty(leaf_buf);
+	old_size = btrfs_item_size_nr(leaf, slot);
+	item = btrfs_item_nr(leaf, slot);
+	btrfs_set_item_size(leaf, item, old_size + data_size);
+	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
-	if (btrfs_leaf_free_space(root, leaf) < 0)
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
 		BUG();
+	}
 	check_leaf(root, path, 0);
 	return ret;
 }
@@ -1866,15 +1988,16 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *root, struct btrfs_path *path, struct btrfs_key
-			    *cpu_key, u32 data_size)
+int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 data_size)
 {
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
 	int ret = 0;
 	int slot;
 	int slot_orig;
-	struct btrfs_leaf *leaf;
-	struct buffer_head *leaf_buf;
 	u32 nritems;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
@@ -1884,6 +2007,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
+
 	ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1);
 	if (ret == 0) {
 		return -EEXIST;
@@ -1892,57 +2016,68 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out;
 
 	slot_orig = path->slots[0];
-	leaf_buf = path->nodes[0];
-	leaf = btrfs_buffer_leaf(leaf_buf);
+	leaf = path->nodes[0];
 
-	nritems = btrfs_header_nritems(&leaf->header);
+	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) <
 	    sizeof(struct btrfs_item) + data_size) {
 		BUG();
 	}
+
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
+
 	if (slot != nritems) {
 		int i;
-		unsigned int old_data = btrfs_item_end(leaf->items + slot);
+		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
+		if (old_data < data_end) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d old_data %d data_end %d\n",
+			       slot, old_data, data_end);
+			BUG_ON(1);
+		}
 		/*
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
 		for (i = slot; i < nritems; i++) {
-			u32 ioff = btrfs_item_offset(leaf->items + i);
-			btrfs_set_item_offset(leaf->items + i,
-					      ioff - data_size);
+			u32 ioff;
+			item = btrfs_item_nr(leaf, i);
+			ioff = btrfs_item_offset(leaf, item);
+			btrfs_set_item_offset(leaf, item, ioff - data_size);
 		}
 
 		/* shift the items */
-		btrfs_memmove(root, leaf, leaf->items + slot + 1,
-			      leaf->items + slot,
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+			      btrfs_item_nr_offset(slot),
 			      (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
-		btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 			      data_end - data_size, btrfs_leaf_data(leaf) +
 			      data_end, old_data - data_end);
 		data_end = old_data;
 	}
+
 	/* setup the item for the new data */
-	btrfs_memcpy(root, leaf, &leaf->items[slot].key, &disk_key,
-		     sizeof(struct btrfs_disk_key));
-	btrfs_set_item_offset(leaf->items + slot, data_end - data_size);
-	btrfs_set_item_size(leaf->items + slot, data_size);
-	btrfs_set_header_nritems(&leaf->header, nritems + 1);
-	btrfs_mark_buffer_dirty(leaf_buf);
+	btrfs_set_item_key(leaf, &disk_key, slot);
+	item = btrfs_item_nr(leaf, slot);
+	btrfs_set_item_offset(leaf, item, data_end - data_size);
+	btrfs_set_item_size(leaf, item, data_size);
+	btrfs_set_header_nritems(leaf, nritems + 1);
+	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
 	if (slot == 0)
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
 
-	if (btrfs_leaf_free_space(root, leaf) < 0)
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
 		BUG();
+	}
 	check_leaf(root, path, 0);
 out:
 	return ret;
@@ -1958,17 +2093,17 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	int ret = 0;
 	struct btrfs_path *path;
-	u8 *ptr;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (!ret) {
-		ptr = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				     path->slots[0], u8);
-		btrfs_memcpy(root, path->nodes[0]->b_data,
-			     ptr, data, data_size);
-		btrfs_mark_buffer_dirty(path->nodes[0]);
+		leaf = path->nodes[0];
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		write_extent_buffer(leaf, data, ptr, data_size);
+		btrfs_mark_buffer_dirty(leaf);
 	}
 	btrfs_free_path(path);
 	return ret;
@@ -1984,30 +2119,30 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot)
 {
-	struct btrfs_node *node;
-	struct buffer_head *parent = path->nodes[level];
+	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
 	int ret = 0;
 	int wret;
 
-	node = btrfs_buffer_node(parent);
-	nritems = btrfs_header_nritems(&node->header);
+	nritems = btrfs_header_nritems(parent);
 	if (slot != nritems -1) {
-		btrfs_memmove(root, node, node->ptrs + slot,
-			      node->ptrs + slot + 1,
+		memmove_extent_buffer(parent,
+			      btrfs_node_key_ptr_offset(slot),
+			      btrfs_node_key_ptr_offset(slot + 1),
 			      sizeof(struct btrfs_key_ptr) *
 			      (nritems - slot - 1));
 	}
 	nritems--;
-	btrfs_set_header_nritems(&node->header, nritems);
+	btrfs_set_header_nritems(parent, nritems);
 	if (nritems == 0 && parent == root->node) {
-		struct btrfs_header *header = btrfs_buffer_header(root->node);
-		BUG_ON(btrfs_header_level(header) != 1);
+		BUG_ON(btrfs_header_level(root->node) != 1);
 		/* just turn the root into a leaf and break */
-		btrfs_set_header_level(header, 0);
+		btrfs_set_header_level(root->node, 0);
 	} else if (slot == 0) {
-		wret = fixup_low_keys(trans, root, path, &node->ptrs[0].key,
-				      level + 1);
+		struct btrfs_disk_key disk_key;
+
+		btrfs_node_key(parent, &disk_key, 0);
+		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
 		if (wret)
 			ret = wret;
 	}
@@ -2023,59 +2158,67 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path)
 {
 	int slot;
-	struct btrfs_leaf *leaf;
-	struct buffer_head *leaf_buf;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
 	int doff;
 	int dsize;
 	int ret = 0;
 	int wret;
 	u32 nritems;
 
-	leaf_buf = path->nodes[0];
-	leaf = btrfs_buffer_leaf(leaf_buf);
+	leaf = path->nodes[0];
 	slot = path->slots[0];
-	doff = btrfs_item_offset(leaf->items + slot);
-	dsize = btrfs_item_size(leaf->items + slot);
-	nritems = btrfs_header_nritems(&leaf->header);
+	doff = btrfs_item_offset_nr(leaf, slot);
+	dsize = btrfs_item_size_nr(leaf, slot);
+	nritems = btrfs_header_nritems(leaf);
 
 	if (slot != nritems - 1) {
 		int i;
 		int data_end = leaf_data_end(root, leaf);
-		btrfs_memmove(root, leaf, btrfs_leaf_data(leaf) +
+
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 			      data_end + dsize,
 			      btrfs_leaf_data(leaf) + data_end,
 			      doff - data_end);
+
 		for (i = slot + 1; i < nritems; i++) {
-			u32 ioff = btrfs_item_offset(leaf->items + i);
-			btrfs_set_item_offset(leaf->items + i, ioff + dsize);
+			u32 ioff;
+			item = btrfs_item_nr(leaf, i);
+			ioff = btrfs_item_offset(leaf, item);
+			btrfs_set_item_offset(leaf, item, ioff + dsize);
 		}
-		btrfs_memmove(root, leaf, leaf->items + slot,
-			      leaf->items + slot + 1,
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
+			      btrfs_item_nr_offset(slot + 1),
 			      sizeof(struct btrfs_item) *
 			      (nritems - slot - 1));
 	}
-	btrfs_set_header_nritems(&leaf->header, nritems - 1);
+	btrfs_set_header_nritems(leaf, nritems - 1);
 	nritems--;
+
 	/* delete the leaf if we've emptied it */
 	if (nritems == 0) {
-		if (leaf_buf == root->node) {
-			btrfs_set_header_level(&leaf->header, 0);
+		if (leaf == root->node) {
+			btrfs_set_header_level(leaf, 0);
 		} else {
-			clean_tree_block(trans, root, leaf_buf);
-			wait_on_buffer(leaf_buf);
+			clean_tree_block(trans, root, leaf);
+			wait_on_tree_block_writeback(root, leaf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
-						 bh_blocknr(leaf_buf), 1, 1);
+						 extent_buffer_blocknr(leaf),
+						 1, 1);
 			if (wret)
 				ret = wret;
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, nritems);
 		if (slot == 0) {
+			struct btrfs_disk_key disk_key;
+
+			btrfs_item_key(leaf, &disk_key, 0);
 			wret = fixup_low_keys(trans, root, path,
-					      &leaf->items[0].key, 1);
+					      &disk_key, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -2087,34 +2230,40 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			 * for possible call to del_ptr below
 			 */
 			slot = path->slots[1];
-			get_bh(leaf_buf);
+			extent_buffer_get(leaf);
+
 			wret = push_leaf_left(trans, root, path, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
-			if (path->nodes[0] == leaf_buf &&
-			    btrfs_header_nritems(&leaf->header)) {
+
+			if (path->nodes[0] == leaf &&
+			    btrfs_header_nritems(leaf)) {
 				wret = push_leaf_right(trans, root, path, 1);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
-			if (btrfs_header_nritems(&leaf->header) == 0) {
-				u64 blocknr = bh_blocknr(leaf_buf);
-				clean_tree_block(trans, root, leaf_buf);
-				wait_on_buffer(leaf_buf);
+
+			if (btrfs_header_nritems(leaf) == 0) {
+				u64 blocknr = extent_buffer_blocknr(leaf);
+
+				clean_tree_block(trans, root, leaf);
+				wait_on_tree_block_writeback(root, leaf);
+
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
 					ret = wret;
-				btrfs_block_release(root, leaf_buf);
+
+				free_extent_buffer(leaf);
 				wret = btrfs_free_extent(trans, root, blocknr,
 							 1, 1);
 				if (wret)
 					ret = wret;
 			} else {
-				btrfs_mark_buffer_dirty(leaf_buf);
-				btrfs_block_release(root, leaf_buf);
+				btrfs_mark_buffer_dirty(leaf);
+				free_extent_buffer(leaf);
 			}
 		} else {
-			btrfs_mark_buffer_dirty(leaf_buf);
+			btrfs_mark_buffer_dirty(leaf);
 		}
 	}
 	return ret;
@@ -2130,25 +2279,27 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int slot;
 	int level = 1;
 	u64 blocknr;
-	struct buffer_head *c;
-	struct btrfs_node *c_node;
-	struct buffer_head *next = NULL;
+	struct extent_buffer *c;
+	struct extent_buffer *next = NULL;
 
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
+
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
-		c_node = btrfs_buffer_node(c);
-		if (slot >= btrfs_header_nritems(&c_node->header)) {
+		if (slot >= btrfs_header_nritems(c)) {
 			level++;
 			continue;
 		}
-		blocknr = btrfs_node_blockptr(c_node, slot);
+
+		blocknr = btrfs_node_blockptr(c, slot);
 		if (next)
-			btrfs_block_release(root, next);
+			free_extent_buffer(next);
+
 		if (path->reada)
 			reada_for_search(root, path, level, slot);
+
 		next = read_tree_block(root, blocknr);
 		break;
 	}
@@ -2156,15 +2307,14 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	while(1) {
 		level--;
 		c = path->nodes[level];
-		btrfs_block_release(root, c);
+		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!level)
 			break;
 		if (path->reada)
 			reada_for_search(root, path, level, 0);
-		next = read_tree_block(root,
-		       btrfs_node_blockptr(btrfs_buffer_node(next), 0));
+		next = read_tree_block(root, btrfs_node_blockptr(next, 0));
 	}
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 800a3499cc3..c4b82980685 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -20,10 +20,10 @@
 #define __BTRFS__
 
 #include <linux/fs.h>
-#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
 #include "bit-radix.h"
+#include "extent_map.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -32,7 +32,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
-#define BTRFS_MAGIC "_BtRfS_M"
+#define BTRFS_MAGIC "_B2RfS_M"
 
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
@@ -78,41 +78,41 @@ extern struct kmem_cache *btrfs_path_cachep;
  */
 struct btrfs_disk_key {
 	__le64 objectid;
-	__le32 flags;
+	u8 type;
 	__le64 offset;
 } __attribute__ ((__packed__));
 
 struct btrfs_key {
 	u64 objectid;
-	u32 flags;
+	u8 type;
 	u64 offset;
 } __attribute__ ((__packed__));
 
+#define BTRFS_FSID_SIZE 16
 /*
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
 	u8 csum[BTRFS_CSUM_SIZE];
-	u8 fsid[16]; /* FS specific uuid */
+	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 	__le64 blocknr; /* which block this node is supposed to live in */
 	__le64 generation;
 	__le64 owner;
-	__le16 nritems;
+	__le32 nritems;
 	__le16 flags;
 	u8 level;
 } __attribute__ ((__packed__));
 
 #define BTRFS_MAX_LEVEL 8
-#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->blocksize - \
+#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
 			        sizeof(struct btrfs_header)) / \
 			       (sizeof(struct btrfs_disk_key) + sizeof(u64)))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->blocksize))
+#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
-struct buffer_head;
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -128,7 +128,9 @@ struct btrfs_super_block {
 	__le64 total_blocks;
 	__le64 blocks_used;
 	__le64 root_dir_objectid;
-	__le32 blocksize;
+	__le32 sectorsize;
+	__le32 nodesize;
+	__le32 leafsize;
 } __attribute__ ((__packed__));
 
 /*
@@ -138,7 +140,7 @@ struct btrfs_super_block {
 struct btrfs_item {
 	struct btrfs_disk_key key;
 	__le32 offset;
-	__le16 size;
+	__le32 size;
 } __attribute__ ((__packed__));
 
 /*
@@ -176,7 +178,7 @@ struct btrfs_node {
  * used while walking the tree.
  */
 struct btrfs_path {
-	struct buffer_head *nodes[BTRFS_MAX_LEVEL];
+	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
 	int reada;
 	int lowest_level;
@@ -292,6 +294,7 @@ struct btrfs_block_group_cache {
 };
 
 struct btrfs_fs_info {
+	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct radix_tree_root fs_roots_radix;
@@ -304,9 +307,8 @@ struct btrfs_fs_info {
 	u64 generation;
 	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
-	struct btrfs_super_block *disk_super;
 	struct btrfs_super_block super_copy;
-	struct buffer_head *sb_buffer;
+	struct extent_buffer *sb_buffer;
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct mutex trans_mutex;
@@ -325,8 +327,8 @@ struct btrfs_fs_info {
  * and for the extent tree extent_root root.
  */
 struct btrfs_root {
-	struct buffer_head *node;
-	struct buffer_head *commit_root;
+	struct extent_buffer *node;
+	struct extent_buffer *commit_root;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -336,7 +338,16 @@ struct btrfs_root {
 	struct rw_semaphore snap_sem;
 	u64 objectid;
 	u64 last_trans;
-	u32 blocksize;
+
+	/* data allocations are done in sectorsize units */
+	u32 sectorsize;
+
+	/* node allocations are done in nodesize units */
+	u32 nodesize;
+
+	/* leaf allocations are done in leafsize units */
+	u32 leafsize;
+
 	u32 type;
 	u64 highest_inode;
 	u64 last_inode_alloc;
@@ -347,12 +358,6 @@ struct btrfs_root {
 	char *name;
 };
 
-/* the lower bits in the key flags defines the item type */
-#define BTRFS_KEY_TYPE_MAX	256
-#define BTRFS_KEY_TYPE_SHIFT	24
-#define BTRFS_KEY_TYPE_MASK	(((u32)BTRFS_KEY_TYPE_MAX - 1) << \
-				  BTRFS_KEY_TYPE_SHIFT)
-
 /*
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
@@ -402,246 +407,253 @@ struct btrfs_root {
  */
 #define BTRFS_STRING_ITEM_KEY	253
 
+/* some macros to generate set/get funcs for the struct fields.  This
+ * assumes there is a lefoo_to_cpu for every type, so let's make a simple
+ * one for u8:
+ */
+#define le8_to_cpu(v) (v)
+#define cpu_to_le8(v) (v)
+#define __le8 u8
+
+#define read_eb_member(eb, ptr, type, member, result) (			\
+	read_extent_buffer(eb, (char *)(result),			\
+			   ((unsigned long)(ptr)) +			\
+			    offsetof(type, member),			\
+			   sizeof(((type *)0)->member)))
+
+#define write_eb_member(eb, ptr, type, member, result) (		\
+	write_extent_buffer(eb, (char *)(result),			\
+			   ((unsigned long)(ptr)) +			\
+			    offsetof(type, member),			\
+			   sizeof(((type *)0)->member)))
+
+#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
+static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
+				   type *s)				\
+{									\
+	__le##bits res;							\
+	read_eb_member(eb, s, type, member, &res);			\
+	return le##bits##_to_cpu(res);					\
+}									\
+static inline void btrfs_set_##name(struct extent_buffer *eb,		\
+				    type *s, u##bits val)		\
+{									\
+	val = cpu_to_le##bits(val);					\
+	write_eb_member(eb, s, type, member, &val);			\
+}
+
+#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
+static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
+{									\
+	__le##bits res;							\
+	read_eb_member(eb, NULL, type, member, &res);			\
+	return le##bits##_to_cpu(res);					\
+}									\
+static inline void btrfs_set_##name(struct extent_buffer *eb,		\
+				    u##bits val)			\
+{									\
+	val = cpu_to_le##bits(val);					\
+	write_eb_member(eb, NULL, type, member, &val);			\
+}
 
-static inline u64 btrfs_block_group_used(struct btrfs_block_group_item *bi)
-{
-	return le64_to_cpu(bi->used);
-}
-
-static inline void btrfs_set_block_group_used(struct
-						   btrfs_block_group_item *bi,
-						   u64 val)
-{
-	bi->used = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_inode_generation(struct btrfs_inode_item *i)
-{
-	return le64_to_cpu(i->generation);
-}
-
-static inline void btrfs_set_inode_generation(struct btrfs_inode_item *i,
-					      u64 val)
-{
-	i->generation = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_inode_size(struct btrfs_inode_item *i)
-{
-	return le64_to_cpu(i->size);
-}
-
-static inline void btrfs_set_inode_size(struct btrfs_inode_item *i, u64 val)
-{
-	i->size = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_inode_nblocks(struct btrfs_inode_item *i)
-{
-	return le64_to_cpu(i->nblocks);
-}
-
-static inline void btrfs_set_inode_nblocks(struct btrfs_inode_item *i, u64 val)
-{
-	i->nblocks = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_inode_block_group(struct btrfs_inode_item *i)
-{
-	return le64_to_cpu(i->block_group);
-}
-
-static inline void btrfs_set_inode_block_group(struct btrfs_inode_item *i,
-						u64 val)
-{
-	i->block_group = cpu_to_le64(val);
-}
-
-static inline u32 btrfs_inode_nlink(struct btrfs_inode_item *i)
-{
-	return le32_to_cpu(i->nlink);
-}
-
-static inline void btrfs_set_inode_nlink(struct btrfs_inode_item *i, u32 val)
-{
-	i->nlink = cpu_to_le32(val);
-}
-
-static inline u32 btrfs_inode_uid(struct btrfs_inode_item *i)
-{
-	return le32_to_cpu(i->uid);
-}
-
-static inline void btrfs_set_inode_uid(struct btrfs_inode_item *i, u32 val)
-{
-	i->uid = cpu_to_le32(val);
-}
-
-static inline u32 btrfs_inode_gid(struct btrfs_inode_item *i)
-{
-	return le32_to_cpu(i->gid);
-}
-
-static inline void btrfs_set_inode_gid(struct btrfs_inode_item *i, u32 val)
-{
-	i->gid = cpu_to_le32(val);
-}
-
-static inline u32 btrfs_inode_mode(struct btrfs_inode_item *i)
-{
-	return le32_to_cpu(i->mode);
-}
-
-static inline void btrfs_set_inode_mode(struct btrfs_inode_item *i, u32 val)
-{
-	i->mode = cpu_to_le32(val);
+#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
+static inline u##bits btrfs_##name(type *s)				\
+{									\
+	return le##bits##_to_cpu(s->member);				\
+}									\
+static inline void btrfs_set_##name(type *s, u##bits val)		\
+{									\
+	s->member = cpu_to_le##bits(val);				\
 }
 
-static inline u32 btrfs_inode_rdev(struct btrfs_inode_item *i)
-{
-	return le32_to_cpu(i->rdev);
-}
+/* struct btrfs_block_group_item */
+BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
+			 used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
+			 used, 64);
 
-static inline void btrfs_set_inode_rdev(struct btrfs_inode_item *i, u32 val)
-{
-	i->rdev = cpu_to_le32(val);
-}
+/* struct btrfs_inode_item */
+BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
+BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
+BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
+BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
+		   compat_flags, 16);
 
-static inline u16 btrfs_inode_flags(struct btrfs_inode_item *i)
+static inline struct btrfs_inode_timespec *
+btrfs_inode_atime(struct btrfs_inode_item *inode_item)
 {
-	return le16_to_cpu(i->flags);
+	unsigned long ptr = (unsigned long)inode_item;
+	ptr += offsetof(struct btrfs_inode_item, atime);
+	return (struct btrfs_inode_timespec *)ptr;
 }
 
-static inline void btrfs_set_inode_flags(struct btrfs_inode_item *i, u16 val)
+static inline struct btrfs_inode_timespec *
+btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
 {
-	i->flags = cpu_to_le16(val);
+	unsigned long ptr = (unsigned long)inode_item;
+	ptr += offsetof(struct btrfs_inode_item, mtime);
+	return (struct btrfs_inode_timespec *)ptr;
 }
 
-static inline u16 btrfs_inode_compat_flags(struct btrfs_inode_item *i)
+static inline struct btrfs_inode_timespec *
+btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
 {
-	return le16_to_cpu(i->compat_flags);
+	unsigned long ptr = (unsigned long)inode_item;
+	ptr += offsetof(struct btrfs_inode_item, ctime);
+	return (struct btrfs_inode_timespec *)ptr;
 }
 
-static inline void btrfs_set_inode_compat_flags(struct btrfs_inode_item *i,
-						u16 val)
+static inline struct btrfs_inode_timespec *
+btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 {
-	i->compat_flags = cpu_to_le16(val);
+	unsigned long ptr = (unsigned long)inode_item;
+	ptr += offsetof(struct btrfs_inode_item, otime);
+	return (struct btrfs_inode_timespec *)ptr;
 }
 
-static inline u64 btrfs_timespec_sec(struct btrfs_inode_timespec *ts)
-{
-	return le64_to_cpu(ts->sec);
-}
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
 
-static inline void btrfs_set_timespec_sec(struct btrfs_inode_timespec *ts,
-					  u64 val)
-{
-	ts->sec = cpu_to_le64(val);
-}
+/* struct btrfs_extent_item: owner was le64 in the accessors replaced here */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
+BTRFS_SETGET_FUNCS(extent_owner, struct btrfs_extent_item, owner, 64);
 
-static inline u32 btrfs_timespec_nsec(struct btrfs_inode_timespec *ts)
-{
-	return le32_to_cpu(ts->nsec);
-}
+BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
+			 refs, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_extent_owner, struct btrfs_extent_item,
+			 owner, 64);
 
-static inline void btrfs_set_timespec_nsec(struct btrfs_inode_timespec *ts,
-					  u32 val)
-{
-	ts->nsec = cpu_to_le32(val);
-}
+/* struct btrfs_node */
+BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
 
-static inline u32 btrfs_extent_refs(struct btrfs_extent_item *ei)
+static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
 {
-	return le32_to_cpu(ei->refs);
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr);
 }
 
-static inline void btrfs_set_extent_refs(struct btrfs_extent_item *ei, u32 val)
+static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
+					   int nr, u64 val)
 {
-	ei->refs = cpu_to_le32(val);
+	unsigned long ptr;
+	ptr = offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
+	btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
 }
 
-static inline u64 btrfs_extent_owner(struct btrfs_extent_item *ei)
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
 {
-	return le64_to_cpu(ei->owner);
+	return offsetof(struct btrfs_node, ptrs) +
+		sizeof(struct btrfs_key_ptr) * nr;
 }
 
-static inline void btrfs_set_extent_owner(struct btrfs_extent_item *ei, u64 val)
+static inline void btrfs_node_key(struct extent_buffer *eb,
+				  struct btrfs_disk_key *disk_key, int nr)
 {
-	ei->owner = cpu_to_le64(val);
+	unsigned long ptr;
+	ptr = btrfs_node_key_ptr_offset(nr);
+	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+		       struct btrfs_key_ptr, key, disk_key);
 }
-
-static inline u64 btrfs_node_blockptr(struct btrfs_node *n, int nr)
+static inline void btrfs_set_node_key(struct extent_buffer *eb,
+				      struct btrfs_disk_key *disk_key, int nr)
 {
-	return le64_to_cpu(n->ptrs[nr].blockptr);
+	unsigned long ptr;
+	ptr = btrfs_node_key_ptr_offset(nr);
+	write_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+		       struct btrfs_key_ptr, key, disk_key);
 }
 
+/* struct btrfs_item */
+BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
 
-static inline void btrfs_set_node_blockptr(struct btrfs_node *n, int nr,
-					   u64 val)
+static inline unsigned long btrfs_item_nr_offset(int nr)
 {
-	n->ptrs[nr].blockptr = cpu_to_le64(val);
+	return offsetof(struct btrfs_leaf, items) +
+		sizeof(struct btrfs_item) * nr;
 }
 
-static inline u32 btrfs_item_offset(struct btrfs_item *item)
+static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
+					       int nr)
 {
-	return le32_to_cpu(item->offset);
+	return (struct btrfs_item *)btrfs_item_nr_offset(nr);
 }
 
-static inline void btrfs_set_item_offset(struct btrfs_item *item, u32 val)
+static inline u32 btrfs_item_end(struct extent_buffer *eb,
+				 struct btrfs_item *item)
 {
-	item->offset = cpu_to_le32(val);
+	return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item);
 }
 
-static inline u32 btrfs_item_end(struct btrfs_item *item)
+static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
 {
-	return le32_to_cpu(item->offset) + le16_to_cpu(item->size);
+	return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
 }
 
-static inline u16 btrfs_item_size(struct btrfs_item *item)
+static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
 {
-	return le16_to_cpu(item->size);
+	return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
 }
 
-static inline void btrfs_set_item_size(struct btrfs_item *item, u16 val)
+static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
 {
-	item->size = cpu_to_le16(val);
+	return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
 }
 
-static inline u16 btrfs_dir_flags(struct btrfs_dir_item *d)
+static inline void btrfs_item_key(struct extent_buffer *eb,
+			   struct btrfs_disk_key *disk_key, int nr)
 {
-	return le16_to_cpu(d->flags);
+	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	read_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
-static inline void btrfs_set_dir_flags(struct btrfs_dir_item *d, u16 val)
+static inline void btrfs_set_item_key(struct extent_buffer *eb,
+			       struct btrfs_disk_key *disk_key, int nr)
 {
-	d->flags = cpu_to_le16(val);
+	struct btrfs_item *item = btrfs_item_nr(eb, nr);
+	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
-static inline u8 btrfs_dir_type(struct btrfs_dir_item *d)
-{
-	return d->type;
-}
+/* struct btrfs_dir_item */
+BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, flags, 16);
+BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
 
-static inline void btrfs_set_dir_type(struct btrfs_dir_item *d, u8 val)
+static inline void btrfs_dir_item_key(struct extent_buffer *eb,
+				      struct btrfs_dir_item *item,
+				      struct btrfs_disk_key *key)
 {
-	d->type = val;
+	read_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
-static inline u16 btrfs_dir_name_len(struct btrfs_dir_item *d)
+static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
+					  struct btrfs_dir_item *item,
+					  struct btrfs_disk_key *key)
 {
-	return le16_to_cpu(d->name_len);
+	write_eb_member(eb, item, struct btrfs_dir_item, location, key);
 }
 
-static inline void btrfs_set_dir_name_len(struct btrfs_dir_item *d, u16 val)
-{
-	d->name_len = cpu_to_le16(val);
-}
+/* struct btrfs_disk_key */
+BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
+			 objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8);
 
 static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu,
 					 struct btrfs_disk_key *disk)
 {
 	cpu->offset = le64_to_cpu(disk->offset);
-	cpu->flags = le32_to_cpu(disk->flags);
+	cpu->type = disk->type;
 	cpu->objectid = le64_to_cpu(disk->objectid);
 }
 
@@ -649,400 +661,167 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk,
 					 struct btrfs_key *cpu)
 {
 	disk->offset = cpu_to_le64(cpu->offset);
-	disk->flags = cpu_to_le32(cpu->flags);
+	disk->type = cpu->type;
 	disk->objectid = cpu_to_le64(cpu->objectid);
 }
 
-static inline u64 btrfs_disk_key_objectid(struct btrfs_disk_key *disk)
-{
-	return le64_to_cpu(disk->objectid);
-}
-
-static inline void btrfs_set_disk_key_objectid(struct btrfs_disk_key *disk,
-					       u64 val)
-{
-	disk->objectid = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_disk_key_offset(struct btrfs_disk_key *disk)
-{
-	return le64_to_cpu(disk->offset);
-}
-
-static inline void btrfs_set_disk_key_offset(struct btrfs_disk_key *disk,
-					     u64 val)
-{
-	disk->offset = cpu_to_le64(val);
-}
-
-static inline u32 btrfs_disk_key_flags(struct btrfs_disk_key *disk)
-{
-	return le32_to_cpu(disk->flags);
-}
-
-static inline void btrfs_set_disk_key_flags(struct btrfs_disk_key *disk,
-					    u32 val)
-{
-	disk->flags = cpu_to_le32(val);
-}
-
-static inline u32 btrfs_disk_key_type(struct btrfs_disk_key *key)
-{
-	return le32_to_cpu(key->flags) >> BTRFS_KEY_TYPE_SHIFT;
-}
-
-static inline void btrfs_set_disk_key_type(struct btrfs_disk_key *key,
-					       u32 val)
-{
-	u32 flags = btrfs_disk_key_flags(key);
-	BUG_ON(val >= BTRFS_KEY_TYPE_MAX);
-	val = val << BTRFS_KEY_TYPE_SHIFT;
-	flags = (flags & ~BTRFS_KEY_TYPE_MASK) | val;
-	btrfs_set_disk_key_flags(key, flags);
-}
-
-static inline u32 btrfs_key_type(struct btrfs_key *key)
-{
-	return key->flags >> BTRFS_KEY_TYPE_SHIFT;
-}
-
-static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val)
-{
-	BUG_ON(val >= BTRFS_KEY_TYPE_MAX);
-	val = val << BTRFS_KEY_TYPE_SHIFT;
-	key->flags = (key->flags & ~(BTRFS_KEY_TYPE_MASK)) | val;
-}
-
-static inline u64 btrfs_header_blocknr(struct btrfs_header *h)
-{
-	return le64_to_cpu(h->blocknr);
-}
-
-static inline void btrfs_set_header_blocknr(struct btrfs_header *h, u64 blocknr)
-{
-	h->blocknr = cpu_to_le64(blocknr);
-}
-
-static inline u64 btrfs_header_generation(struct btrfs_header *h)
+static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb,
+				  struct btrfs_key *key, int nr)
 {
-	return le64_to_cpu(h->generation);
+	struct btrfs_disk_key disk_key;
+	btrfs_node_key(eb, &disk_key, nr);
+	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline void btrfs_set_header_generation(struct btrfs_header *h,
-					       u64 val)
+static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb,
+				  struct btrfs_key *key, int nr)
 {
-	h->generation = cpu_to_le64(val);
+	struct btrfs_disk_key disk_key;
+	btrfs_item_key(eb, &disk_key, nr);
+	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline u64 btrfs_header_owner(struct btrfs_header *h)
+static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb,
+				      struct btrfs_dir_item *item,
+				      struct btrfs_key *key)
 {
-	return le64_to_cpu(h->owner);
+	struct btrfs_disk_key disk_key;
+	btrfs_dir_item_key(eb, item, &disk_key);
+	btrfs_disk_key_to_cpu(key, &disk_key);
 }
 
-static inline void btrfs_set_header_owner(struct btrfs_header *h,
-					       u64 val)
-{
-	h->owner = cpu_to_le64(val);
-}
-
-static inline u16 btrfs_header_nritems(struct btrfs_header *h)
-{
-	return le16_to_cpu(h->nritems);
-}
-
-static inline void btrfs_set_header_nritems(struct btrfs_header *h, u16 val)
-{
-	h->nritems = cpu_to_le16(val);
-}
-
-static inline u16 btrfs_header_flags(struct btrfs_header *h)
-{
-	return le16_to_cpu(h->flags);
-}
-
-static inline void btrfs_set_header_flags(struct btrfs_header *h, u16 val)
-{
-	h->flags = cpu_to_le16(val);
-}
-
-static inline int btrfs_header_level(struct btrfs_header *h)
-{
-	return h->level;
-}
-
-static inline void btrfs_set_header_level(struct btrfs_header *h, int level)
-{
-	BUG_ON(level > BTRFS_MAX_LEVEL);
-	h->level = level;
-}
-
-static inline int btrfs_is_leaf(struct btrfs_node *n)
-{
-	return (btrfs_header_level(&n->header) == 0);
-}
-
-static inline u64 btrfs_root_blocknr(struct btrfs_root_item *item)
-{
-	return le64_to_cpu(item->blocknr);
-}
-
-static inline void btrfs_set_root_blocknr(struct btrfs_root_item *item, u64 val)
-{
-	item->blocknr = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_root_dirid(struct btrfs_root_item *item)
-{
-	return le64_to_cpu(item->root_dirid);
-}
-
-static inline void btrfs_set_root_dirid(struct btrfs_root_item *item, u64 val)
-{
-	item->root_dirid = cpu_to_le64(val);
-}
-
-static inline u32 btrfs_root_refs(struct btrfs_root_item *item)
-{
-	return le32_to_cpu(item->refs);
-}
-
-static inline void btrfs_set_root_refs(struct btrfs_root_item *item, u32 val)
-{
-	item->refs = cpu_to_le32(val);
-}
-
-static inline u32 btrfs_root_flags(struct btrfs_root_item *item)
-{
-	return le32_to_cpu(item->flags);
-}
-
-static inline void btrfs_set_root_flags(struct btrfs_root_item *item, u32 val)
-{
-	item->flags = cpu_to_le32(val);
-}
-
-static inline void btrfs_set_root_blocks_used(struct btrfs_root_item *item,
-						   u64 val)
-{
-	item->blocks_used = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_root_blocks_used(struct btrfs_root_item *item)
-{
-	return le64_to_cpu(item->blocks_used);
-}
-
-static inline void btrfs_set_root_block_limit(struct btrfs_root_item *item,
-						u64 val)
-{
-	item->block_limit = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_root_block_limit(struct btrfs_root_item *item)
-{
-	return le64_to_cpu(item->block_limit);
-}
 
-static inline u64 btrfs_super_blocknr(struct btrfs_super_block *s)
+static inline u8 btrfs_key_type(struct btrfs_key *key)
 {
-	return le64_to_cpu(s->blocknr);
+	return key->type;
 }
 
-static inline void btrfs_set_super_blocknr(struct btrfs_super_block *s, u64 val)
+static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
 {
-	s->blocknr = cpu_to_le64(val);
+	key->type = val;
 }
 
-static inline u64 btrfs_super_generation(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->generation);
-}
+/* struct btrfs_header */
+BTRFS_SETGET_HEADER_FUNCS(header_blocknr, struct btrfs_header, blocknr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
+			  generation, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 16);
+BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
 
-static inline void btrfs_set_super_generation(struct btrfs_super_block *s,
-					      u64 val)
+static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
-	s->generation = cpu_to_le64(val);
+	unsigned long ptr = offsetof(struct btrfs_header, fsid);
+	return (u8 *)ptr; /* header offset of fsid, not an address */
 }
 
-static inline u64 btrfs_super_root(struct btrfs_super_block *s)
+static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
 {
-	return le64_to_cpu(s->root);
+	unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
+	return (u8 *)ptr; /* super block offset of fsid, not an address */
 }
 
-static inline void btrfs_set_super_root(struct btrfs_super_block *s, u64 val)
+static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
 {
-	s->root = cpu_to_le64(val);
+	unsigned long ptr = offsetof(struct btrfs_header, csum);
+	return (u8 *)ptr; /* header offset of csum, not an address */
 }
 
-static inline u64 btrfs_super_total_blocks(struct btrfs_super_block *s)
+static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
 {
-	return le64_to_cpu(s->total_blocks);
+	return NULL; /* placeholder: eb is ignored, result is always NULL */
 }
 
-static inline void btrfs_set_super_total_blocks(struct btrfs_super_block *s,
-						u64 val)
+static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
 {
-	s->total_blocks = cpu_to_le64(val);
+	return NULL; /* placeholder: eb is ignored, result is always NULL */
 }
 
-static inline u64 btrfs_super_blocks_used(struct btrfs_super_block *s)
+static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
 {
-	return le64_to_cpu(s->blocks_used);
+	return NULL; /* placeholder: eb is ignored, result is always NULL */
 }
 
-static inline void btrfs_set_super_blocks_used(struct btrfs_super_block *s,
-						u64 val)
+static inline int btrfs_is_leaf(struct extent_buffer *eb)
 {
-	s->blocks_used = cpu_to_le64(val);
+	return (btrfs_header_level(eb) == 0);
 }
 
-static inline u32 btrfs_super_blocksize(struct btrfs_super_block *s)
-{
-	return le32_to_cpu(s->blocksize);
-}
+/* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_FUNCS(disk_root_blocknr, struct btrfs_root_item, blocknr, 64);
 
-static inline void btrfs_set_super_blocksize(struct btrfs_super_block *s,
-						u32 val)
-{
-	s->blocksize = cpu_to_le32(val);
-}
+BTRFS_SETGET_STACK_FUNCS(root_blocknr, struct btrfs_root_item, blocknr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, blocks_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, block_limit, 64);
 
-static inline u64 btrfs_super_root_dir(struct btrfs_super_block *s)
-{
-	return le64_to_cpu(s->root_dir_objectid);
-}
+/* struct btrfs_super_block */
+BTRFS_SETGET_STACK_FUNCS(super_blocknr, struct btrfs_super_block, blocknr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_total_blocks, struct btrfs_super_block,
+		   total_blocks, 64);
+BTRFS_SETGET_STACK_FUNCS(super_blocks_used, struct btrfs_super_block,
+		   blocks_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
+			 sectorsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
+			 nodesize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
+			 leafsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
+			 root_dir_objectid, 64);
 
-static inline void btrfs_set_super_root_dir(struct btrfs_super_block *s, u64
-					    val)
+static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
-	s->root_dir_objectid = cpu_to_le64(val);
+	return offsetof(struct btrfs_leaf, items); /* byte offset; l unused */
 }
 
-static inline u8 *btrfs_leaf_data(struct btrfs_leaf *l)
-{
-	return (u8 *)l->items;
-}
+/* struct btrfs_file_extent_item */
+BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
 
-static inline int btrfs_file_extent_type(struct btrfs_file_extent_item *e)
-{
-	return e->type;
-}
-static inline void btrfs_set_file_extent_type(struct btrfs_file_extent_item *e,
-					      u8 val)
-{
-	e->type = val;
-}
-
-static inline char *btrfs_file_extent_inline_start(struct
+static inline unsigned long btrfs_file_extent_inline_start(struct
 						   btrfs_file_extent_item *e)
 {
-	return (char *)(&e->disk_blocknr);
+	unsigned long offset = (unsigned long)e;
+	offset += offsetof(struct btrfs_file_extent_item, disk_blocknr);
+	return offset;
 }
 
 static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 {
-	return (unsigned long)(&((struct
-		  btrfs_file_extent_item *)NULL)->disk_blocknr) + datasize;
-}
-
-static inline u32 btrfs_file_extent_inline_len(struct btrfs_item *e)
-{
-	struct btrfs_file_extent_item *fe = NULL;
-	return btrfs_item_size(e) - (unsigned long)(&fe->disk_blocknr);
-}
-
-static inline u64 btrfs_file_extent_disk_blocknr(struct btrfs_file_extent_item
-						 *e)
-{
-	return le64_to_cpu(e->disk_blocknr);
+	return offsetof(struct btrfs_file_extent_item, disk_blocknr) + datasize;
 }
 
-static inline void btrfs_set_file_extent_disk_blocknr(struct
-						      btrfs_file_extent_item
-						      *e, u64 val)
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_item *e)
 {
-	e->disk_blocknr = cpu_to_le64(val);
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_blocknr);
+	return btrfs_item_size(eb, e) - offset;
 }
 
-static inline u64 btrfs_file_extent_generation(struct btrfs_file_extent_item *e)
-{
-	return le64_to_cpu(e->generation);
-}
-
-static inline void btrfs_set_file_extent_generation(struct
-						    btrfs_file_extent_item *e,
-						    u64 val)
-{
-	e->generation = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_file_extent_disk_num_blocks(struct
-						    btrfs_file_extent_item *e)
-{
-	return le64_to_cpu(e->disk_num_blocks);
-}
-
-static inline void btrfs_set_file_extent_disk_num_blocks(struct
-							 btrfs_file_extent_item
-							 *e, u64 val)
-{
-	e->disk_num_blocks = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_file_extent_offset(struct btrfs_file_extent_item *e)
-{
-	return le64_to_cpu(e->offset);
-}
-
-static inline void btrfs_set_file_extent_offset(struct btrfs_file_extent_item
-						*e, u64 val)
-{
-	e->offset = cpu_to_le64(val);
-}
-
-static inline u64 btrfs_file_extent_num_blocks(struct btrfs_file_extent_item
-					       *e)
-{
-	return le64_to_cpu(e->num_blocks);
-}
-
-static inline void btrfs_set_file_extent_num_blocks(struct
-						    btrfs_file_extent_item *e,
-						    u64 val)
-{
-	e->num_blocks = cpu_to_le64(val);
-}
+BTRFS_SETGET_FUNCS(file_extent_disk_blocknr, struct btrfs_file_extent_item,
+		   disk_blocknr, 64);
+BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_blocks, struct btrfs_file_extent_item,
+		   disk_num_blocks, 64);
+BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
+		  offset, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_blocks, struct btrfs_file_extent_item,
+		   num_blocks, 64);
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
 
-static inline void btrfs_check_bounds(void *vptr, size_t len,
-				     void *vcontainer, size_t container_len)
-{
-	char *ptr = vptr;
-	char *container = vcontainer;
-	WARN_ON(ptr < container);
-	WARN_ON(ptr + len > container + container_len);
-}
-
-static inline void btrfs_memcpy(struct btrfs_root *root,
-				void *dst_block,
-				void *dst, const void *src, size_t nr)
-{
-	btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize);
-	memcpy(dst, src, nr);
-}
-
-static inline void btrfs_memmove(struct btrfs_root *root,
-				void *dst_block,
-				void *dst, void *src, size_t nr)
-{
-	btrfs_check_bounds(dst, nr, dst_block, root->fs_info->sb->s_blocksize);
-	memmove(dst, src, nr);
-}
-
 static inline int btrfs_set_root_name(struct btrfs_root *root,
 				      const char *name, int len)
 {
@@ -1063,7 +842,11 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
-	btrfs_item_offset((leaf)->items + (slot))))
+	btrfs_item_offset_nr(leaf, slot)))
+
+#define btrfs_item_ptr_offset(leaf, slot) \
+	((unsigned long)(btrfs_leaf_data(leaf) + \
+	btrfs_item_offset_nr(leaf, slot)))
 
 /* mount option defines and helpers */
 #define BTRFS_MOUNT_SUBVOL		0x000001
@@ -1084,7 +867,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 int data, int owner);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
-struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root, u64 hint,
 					    u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
@@ -1092,7 +875,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 num_blocks, u64 empty_size, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct buffer_head *buf);
+		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -1106,10 +889,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 /* ctree.c */
-int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct buffer_head *buf, struct buffer_head
-			   *parent, int parent_slot, struct buffer_head
-			   **cow_ret);
+int btrfs_cow_block(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct extent_buffer *buf,
+		    struct extent_buffer *parent, int parent_slot,
+		    struct extent_buffer **cow_ret);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1120,7 +903,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, struct buffer_head *parent,
+		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int cache_only, u64 *last_ret);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -1134,7 +917,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *root, struct btrfs_path *path, struct btrfs_key
 			    *cpu_key, u32 data_size);
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
+int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root);
 /* root-item.c */
@@ -1179,9 +962,9 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 
 /* inode-item.c */
-int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, u64 objectid, struct btrfs_inode_item
-		       *inode_item);
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       *root, struct btrfs_path *path,
 		       struct btrfs_key *location, int mod);
@@ -1224,8 +1007,6 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
-int btrfs_get_block(struct inode *inode, sector_t iblock,
-		    struct buffer_head *result, int create);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 end,
 				    int create);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 49db5fa7ced..6f19de41b87 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -33,7 +33,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	int ret;
 	char *ptr;
 	struct btrfs_item *item;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 
 	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
 	if (ret == -EEXIST) {
@@ -49,11 +49,11 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	if (ret < 0)
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	item = leaf->items + path->slots[0];
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
-	BUG_ON(data_size > btrfs_item_size(item));
-	ptr += btrfs_item_size(item) - data_size;
+	BUG_ON(data_size > btrfs_item_size(leaf, item));
+	ptr += btrfs_item_size(leaf, item) - data_size;
 	return (struct btrfs_dir_item *)ptr;
 }
 
@@ -65,12 +65,13 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret2 = 0;
 	struct btrfs_path *path;
 	struct btrfs_dir_item *dir_item;
-	char *name_ptr;
+	struct extent_buffer *leaf;
+	unsigned long name_ptr;
 	struct btrfs_key key;
+	struct btrfs_disk_key disk_key;
 	u32 data_size;
 
 	key.objectid = dir;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
@@ -85,14 +86,16 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out;
 	}
 
-	btrfs_cpu_key_to_disk(&dir_item->location, location);
-	btrfs_set_dir_type(dir_item, type);
-	btrfs_set_dir_flags(dir_item, 0);
-	btrfs_set_dir_name_len(dir_item, name_len);
-	name_ptr = (char *)(dir_item + 1);
+	leaf = path->nodes[0];
+	btrfs_cpu_key_to_disk(&disk_key, location);
+	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+	btrfs_set_dir_type(leaf, dir_item, type);
+	btrfs_set_dir_flags(leaf, dir_item, 0);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	name_ptr = (unsigned long)(dir_item + 1);
 
-	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
 
 second_insert:
 	/* FIXME, use some real flag for selecting the extra index */
@@ -110,13 +113,15 @@ second_insert:
 		ret2 = PTR_ERR(dir_item);
 		goto out;
 	}
-	btrfs_cpu_key_to_disk(&dir_item->location, location);
-	btrfs_set_dir_type(dir_item, type);
-	btrfs_set_dir_flags(dir_item, 0);
-	btrfs_set_dir_name_len(dir_item, name_len);
-	name_ptr = (char *)(dir_item + 1);
-	btrfs_memcpy(root, path->nodes[0]->b_data, name_ptr, name, name_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	leaf = path->nodes[0];
+	btrfs_cpu_key_to_disk(&disk_key, location);
+	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+	btrfs_set_dir_type(leaf, dir_item, type);
+	btrfs_set_dir_flags(leaf, dir_item, 0);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	name_ptr = (unsigned long)(dir_item + 1);
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
 	if (ret)
@@ -136,14 +141,15 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_disk_key *found_key;
-	struct btrfs_leaf *leaf;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
 
 	key.objectid = dir;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+
 	ret = btrfs_name_hash(name, name_len, &key.offset);
 	BUG_ON(ret);
+
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
@@ -152,12 +158,13 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 			return NULL;
 		path->slots[0]--;
 	}
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	found_key = &leaf->items[path->slots[0]].key;
 
-	if (btrfs_disk_key_objectid(found_key) != dir ||
-	    btrfs_disk_key_type(found_key) != BTRFS_DIR_ITEM_KEY ||
-	    btrfs_disk_key_offset(found_key) != key.offset)
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+	if (found_key.objectid != dir ||
+	    btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
+	    found_key.offset != key.offset)
 		return NULL;
 
 	return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -176,7 +183,6 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	int cow = mod != 0;
 
 	key.objectid = dir;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
 	key.offset = objectid;
 
@@ -193,21 +199,22 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      const char *name, int name_len)
 {
 	struct btrfs_dir_item *dir_item;
-	char *name_ptr;
+	unsigned long name_ptr;
 	u32 total_len;
 	u32 cur = 0;
 	u32 this_len;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
-	total_len = btrfs_item_size(leaf->items + path->slots[0]);
+	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
 	while(cur < total_len) {
-		this_len = sizeof(*dir_item) + btrfs_dir_name_len(dir_item);
-		name_ptr = (char *)(dir_item + 1);
+		this_len = sizeof(*dir_item) +
+			btrfs_dir_name_len(leaf, dir_item);
+		name_ptr = (unsigned long)(dir_item + 1);
 
-		if (btrfs_dir_name_len(dir_item) == name_len &&
-		    memcmp(name_ptr, name, name_len) == 0)
+		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
+		    memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
 			return dir_item;
 
 		cur += this_len;
@@ -223,20 +230,23 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_dir_item *di)
 {
 
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	u32 sub_item_len;
 	u32 item_len;
 	int ret = 0;
 
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	sub_item_len = sizeof(*di) + btrfs_dir_name_len(di);
-	item_len = btrfs_item_size(leaf->items + path->slots[0]);
-	if (sub_item_len == btrfs_item_size(leaf->items + path->slots[0])) {
+	leaf = path->nodes[0];
+	sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di);
+	item_len = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (sub_item_len == item_len) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
-		char *ptr = (char *)di;
-		char *start = btrfs_item_ptr(leaf, path->slots[0], char);
-		btrfs_memmove(root, leaf, ptr, ptr + sub_item_len,
+		/* close the gap: slide the data after 'di' down over it */
+		unsigned long ptr = (unsigned long)di;
+		unsigned long start;
+
+		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
 		ret = btrfs_truncate_item(trans, root, path,
 					  item_len - sub_item_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ef6d76f2ec2..0c1f90cbedb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,138 +23,132 @@
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
+#include <linux/buffer_head.h> // for block_sync_page
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 
-u64 bh_blocknr(struct buffer_head *bh)
+#if 0
+static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 {
-	return bh->b_blocknr;
-}
-
-static int check_tree_block(struct btrfs_root *root, struct buffer_head *buf)
-{
-	struct btrfs_node *node = btrfs_buffer_node(buf);
-	if (bh_blocknr(buf) != btrfs_header_blocknr(&node->header)) {
-		printk(KERN_CRIT "bh_blocknr(buf) is %llu, header is %llu\n",
-		       (unsigned long long)bh_blocknr(buf),
-		       (unsigned long long)btrfs_header_blocknr(&node->header));
+	if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
+		printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
+		       (unsigned long long)extent_buffer_blocknr(buf),
+		       (unsigned long long)btrfs_header_blocknr(buf));
 		return 1;
 	}
 	return 0;
 }
+#endif
 
-struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr)
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+					    u64 blocknr)
 {
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
-	int blockbits = root->fs_info->sb->s_blocksize_bits;
-	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
-	struct page *page;
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct buffer_head *ret = NULL;
-
-
-	page = find_lock_page(mapping, index);
-	if (!page)
-		return NULL;
-
-	if (!page_has_buffers(page))
-		goto out_unlock;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+				   blocknr * root->sectorsize,
+				   root->sectorsize, GFP_NOFS);
+}
 
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (buffer_mapped(bh) && bh_blocknr(bh) == blocknr) {
-			ret = bh;
-			get_bh(bh);
-			goto out_unlock;
-		}
-		bh = bh->b_this_page;
-	} while (bh != head);
-out_unlock:
-	unlock_page(page);
-	page_cache_release(page);
-	return ret;
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+						 u64 blocknr)
+{
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+				   blocknr * root->sectorsize,
+				   root->sectorsize, GFP_NOFS);
 }
 
-int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
-			     u64 logical)
+struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
+				    size_t page_offset, u64 start, u64 end,
+				    int create)
 {
-	if (logical == 0) {
-		bh->b_bdev = NULL;
-		bh->b_blocknr = 0;
-		set_buffer_mapped(bh);
-	} else {
-		map_bh(bh, root->fs_info->sb, logical);
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(em_tree, start, end);
+	if (em) {
+		goto out;
 	}
-	return 0;
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		em = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	em->start = 0;
+	em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1;
+	em->block_start = 0;
+	em->block_end = em->end;
+	em->bdev = inode->i_sb->s_bdev;
+	ret = add_extent_mapping(em_tree, em);
+	if (ret == -EEXIST) {
+		free_extent_map(em);
+		em = NULL;
+		goto again;
+	} else if (ret) {
+		em = ERR_PTR(ret);
+	}
+out:
+	return em;
 }
 
-struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
-						 u64 blocknr)
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
-	int blockbits = root->fs_info->sb->s_blocksize_bits;
-	unsigned long index = blocknr >> (PAGE_CACHE_SHIFT - blockbits);
-	struct page *page;
-	struct buffer_head *bh;
-	struct buffer_head *head;
-	struct buffer_head *ret = NULL;
-	int err;
-	u64 first_block = index << (PAGE_CACHE_SHIFT - blockbits);
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_write_full_page(tree, page, btree_get_extent, wbc);
+}
+int btree_readpage(struct file *file, struct page *page)
+{
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	return extent_read_full_page(tree, page, btree_get_extent);
+}
 
-	page = find_or_create_page(mapping, index, GFP_NOFS);
-	if (!page)
-		return NULL;
+static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
+{
+	struct extent_map_tree *tree;
+	int ret;
 
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, root->fs_info->sb->s_blocksize, 0);
-	head = page_buffers(page);
-	bh = head;
-	do {
-		if (!buffer_mapped(bh)) {
-			err = btrfs_map_bh_to_logical(root, bh, first_block);
-			BUG_ON(err);
-		}
-		if (bh_blocknr(bh) == blocknr) {
-			ret = bh;
-			get_bh(bh);
-			goto out_unlock;
-		}
-		bh = bh->b_this_page;
-		first_block++;
-	} while (bh != head);
-out_unlock:
-	unlock_page(page);
-	if (ret)
-		touch_buffer(ret);
-	page_cache_release(page);
+	BUG_ON(page->private != 1);
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(tree, page);
+	if (ret == 1) {
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
 	return ret;
 }
 
-static int btree_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *bh, int create)
+static void btree_invalidatepage(struct page *page, unsigned long offset)
 {
-	int err;
-	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
-	err = btrfs_map_bh_to_logical(root, bh, iblock);
-	return err;
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	extent_invalidatepage(tree, page, offset);
+	btree_releasepage(page, GFP_NOFS);
 }
 
 int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
 		    char *result)
 {
+	return 0; /* csum disabled: result is never written */
+#if 0
 	u32 crc;
 	crc = crc32c(0, data, len);
 	memcpy(result, &crc, BTRFS_CRC32_SIZE);
 	return 0;
+#endif
 }
 
-static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
+#if 0
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
+	return 0;
 	char result[BTRFS_CRC32_SIZE];
 	int ret;
 	struct btrfs_node *node;
@@ -176,7 +170,9 @@ static int csum_tree_block(struct btrfs_root *root, struct buffer_head *bh,
 	}
 	return 0;
 }
+#endif
 
+#if 0
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct buffer_head *bh;
@@ -195,87 +191,65 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 	} while (bh != head);
 	return block_write_full_page(page, btree_get_block, wbc);
 }
-
-static int btree_readpage(struct file * file, struct page * page)
-{
-	return block_read_full_page(page, btree_get_block);
-}
+#endif
 
 static struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
+	.releasepage	= btree_releasepage,
+	.invalidatepage = btree_invalidatepage,
 	.sync_page	= block_sync_page,
 };
 
 int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct buffer_head *bh = NULL;
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
 	int ret = 0;
 
-	bh = btrfs_find_create_tree_block(root, blocknr);
-	if (!bh)
+	buf = btrfs_find_create_tree_block(root, blocknr);
+	if (!buf)
 		return 0;
-	if (buffer_uptodate(bh)) {
-		ret = 1;
-		goto done;
-	}
-	if (test_set_buffer_locked(bh)) {
-		ret = 1;
-		goto done;
-	}
-	if (!buffer_uptodate(bh)) {
-		get_bh(bh);
-		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(READ, bh);
-	} else {
-		unlock_buffer(bh);
-		ret = 1;
-	}
-done:
-	brelse(bh);
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+				 buf, 0);
+	free_extent_buffer(buf);
 	return ret;
 }
 
-struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr)
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 {
-	struct buffer_head *bh = NULL;
-
-	bh = btrfs_find_create_tree_block(root, blocknr);
-	if (!bh)
-		return bh;
-	if (buffer_uptodate(bh))
-		goto uptodate;
-	lock_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		get_bh(bh);
-		bh->b_end_io = end_buffer_read_sync;
-		submit_bh(READ, bh);
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(bh))
-			goto fail;
-	} else {
-		unlock_buffer(bh);
-	}
-uptodate:
-	if (!buffer_checked(bh)) {
-		csum_tree_block(root, bh, 1);
-		set_buffer_checked(bh);
-	}
-	if (check_tree_block(root, bh))
-		goto fail;
-	return bh;
-fail:
-	brelse(bh);
-	return NULL;
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+
+	buf = btrfs_find_create_tree_block(root, blocknr);
+	if (!buf)
+		return NULL;
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+				 buf, 1);
+	return buf;
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf)
+		     struct extent_buffer *buf)
 {
-	WARN_ON(atomic_read(&buf->b_count) == 0);
-	lock_buffer(buf);
-	clear_buffer_dirty(buf);
-	unlock_buffer(buf);
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
+	return 0;
+}
+
+int wait_on_tree_block_writeback(struct btrfs_root *root,
+				 struct extent_buffer *buf)
+{
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree,
+					buf);
+	return 0;
+}
+
+int set_tree_block_dirty(struct btrfs_root *root, struct extent_buffer *buf)
+{
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
 	return 0;
 }
 
@@ -287,7 +261,9 @@ static int __setup_root(int blocksize,
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
-	root->blocksize = blocksize;
+	root->sectorsize = blocksize;
+	root->nodesize = blocksize;
+	root->leafsize = blocksize;
 	root->ref_cows = 0;
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -332,7 +308,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_path *path;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	u64 highest_inode;
 	int ret = 0;
 
@@ -361,11 +337,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 			ret = -ENOENT;
 		goto out;
 	}
-	l = btrfs_buffer_leaf(path->nodes[0]);
-	memcpy(&root->root_item,
-	       btrfs_item_ptr(l, path->slots[0], struct btrfs_root_item),
+	l = path->nodes[0];
+	read_extent_buffer(l, &root->root_item,
+	       btrfs_item_ptr_offset(l, path->slots[0]),
 	       sizeof(root->root_item));
-	memcpy(&root->root_key, location, sizeof(*location));
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
@@ -406,21 +381,21 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				(unsigned long)root->root_key.objectid,
 				root);
 	if (ret) {
-		brelse(root->node);
+		free_extent_buffer(root->node);
 		kfree(root);
 		return ERR_PTR(ret);
 	}
 
 	ret = btrfs_set_root_name(root, name, namelen);
 	if (ret) {
-		brelse(root->node);
+		free_extent_buffer(root->node);
 		kfree(root);
 		return ERR_PTR(ret);
 	}
 
 	ret = btrfs_sysfs_add_root(root);
 	if (ret) {
-		brelse(root->node);
+		free_extent_buffer(root->node);
 		kfree(root->name);
 		kfree(root);
 		return ERR_PTR(ret);
@@ -471,6 +446,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+			     fs_info->btree_inode->i_mapping,
+			     GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 
@@ -493,10 +471,14 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 
 	if (!fs_info->sb_buffer)
 		goto fail_iput;
-	disk_super = (struct btrfs_super_block *)fs_info->sb_buffer->b_data;
-	fs_info->disk_super = disk_super;
-	memcpy(&fs_info->super_copy, disk_super, sizeof(fs_info->super_copy));
 
+	read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0,
+			   sizeof(fs_info->super_copy));
+
+	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
+			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
+			   BTRFS_FSID_SIZE);
+	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
@@ -530,9 +512,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	return tree_root;
 
 fail_tree_root:
-	btrfs_block_release(tree_root, tree_root->node);
+	free_extent_buffer(tree_root->node);
 fail_sb_buffer:
-	btrfs_block_release(tree_root, fs_info->sb_buffer);
+	free_extent_buffer(fs_info->sb_buffer);
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
@@ -546,31 +528,13 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
 	int ret;
-	struct buffer_head *bh = root->fs_info->sb_buffer;
-
-	lock_buffer(bh);
-	WARN_ON(atomic_read(&bh->b_count) < 1);
-	clear_buffer_dirty(bh);
-	csum_tree_block(root, bh, 0);
-	bh->b_end_io = end_buffer_write_sync;
-	get_bh(bh);
-	if (root->fs_info->do_barriers)
-		ret = submit_bh(WRITE_BARRIER, bh);
-	else
-		ret = submit_bh(WRITE, bh);
-	if (ret == -EOPNOTSUPP) {
-		get_bh(bh);
-		lock_buffer(bh);
-		set_buffer_uptodate(bh);
-		root->fs_info->do_barriers = 0;
-		ret = submit_bh(WRITE, bh);
-	}
-	wait_on_buffer(bh);
-	if (!buffer_uptodate(bh)) {
-		WARN_ON(1);
-		return -EIO;
-	}
-	return 0;
+	struct extent_buffer *super = root->fs_info->sb_buffer;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super);
+	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
+				     super->start, super->len);
+	return ret;
 }
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
@@ -581,9 +545,9 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 	if (root->inode)
 		iput(root->inode);
 	if (root->node)
-		brelse(root->node);
+		free_extent_buffer(root->node);
 	if (root->commit_root)
-		brelse(root->commit_root);
+		free_extent_buffer(root->commit_root);
 	if (root->name)
 		kfree(root->name);
 	kfree(root);
@@ -629,12 +593,10 @@ int close_ctree(struct btrfs_root *root)
 	mutex_unlock(&fs_info->fs_mutex);
 
 	if (fs_info->extent_root->node)
-		btrfs_block_release(fs_info->extent_root,
-				    fs_info->extent_root->node);
+		free_extent_buffer(fs_info->extent_root->node);
 	if (fs_info->tree_root->node)
-		btrfs_block_release(fs_info->tree_root,
-				    fs_info->tree_root->node);
-	btrfs_block_release(root, fs_info->sb_buffer);
+		free_extent_buffer(fs_info->tree_root->node);
+	free_extent_buffer(fs_info->sb_buffer);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
 
@@ -645,25 +607,32 @@ int close_ctree(struct btrfs_root *root)
 	return 0;
 }
 
-void btrfs_mark_buffer_dirty(struct buffer_head *bh)
+int btrfs_buffer_uptodate(struct extent_buffer *buf)
+{
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
+}
+
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(bh->b_page->mapping->host)->root;
-	u64 transid = btrfs_header_generation(btrfs_buffer_header(bh));
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
+					  buf);
+}
 
-	WARN_ON(!atomic_read(&bh->b_count));
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	u64 transid = btrfs_header_generation(buf);
+	struct inode *btree_inode = root->fs_info->btree_inode;
 
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
-			(unsigned long long)bh->b_blocknr,
+			(unsigned long long)extent_buffer_blocknr(buf),
 			transid, root->fs_info->generation);
 		WARN_ON(1);
 	}
-	mark_buffer_dirty(bh);
-}
-
-void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf)
-{
-	brelse(buf);
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fc6b520c6e0..70d9413c599 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,68 +19,35 @@
 #ifndef __DISKIO__
 #define __DISKIO__
 
-#include <linux/buffer_head.h>
-
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 
-enum btrfs_bh_state_bits {
-	BH_Checked = BH_PrivateStart,
-	BH_Defrag,
-	BH_DefragDone,
-};
-BUFFER_FNS(Checked, checked);
-BUFFER_FNS(Defrag, defrag);
-BUFFER_FNS(DefragDone, defrag_done);
-
-static inline struct btrfs_node *btrfs_buffer_node(struct buffer_head *bh)
-{
-	return (struct btrfs_node *)bh->b_data;
-}
-
-static inline struct btrfs_leaf *btrfs_buffer_leaf(struct buffer_head *bh)
-{
-	return (struct btrfs_leaf *)bh->b_data;
-}
-
-static inline struct btrfs_header *btrfs_buffer_header(struct buffer_head *bh)
-{
-	return &((struct btrfs_node *)bh->b_data)->header;
-}
-
-struct buffer_head *read_tree_block(struct btrfs_root *root, u64 blocknr);
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
 int readahead_tree_block(struct btrfs_root *root, u64 blocknr);
-struct buffer_head *btrfs_find_create_tree_block(struct btrfs_root *root,
-						 u64 blocknr);
-int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf);
-int dirty_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct buffer_head *buf);
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+						   u64 blocknr);
 int clean_tree_block(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct buffer_head *buf);
-int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root);
+		     struct btrfs_root *root, struct extent_buffer *buf);
 struct btrfs_root *open_ctree(struct super_block *sb);
 int close_ctree(struct btrfs_root *root);
-void btrfs_block_release(struct btrfs_root *root, struct buffer_head *buf);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
-struct buffer_head *btrfs_find_tree_block(struct btrfs_root *root, u64 blocknr);
-int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
-		    char *result);
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+					    u64 blocknr);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 					       struct btrfs_key *location);
-u64 bh_blocknr(struct buffer_head *bh);
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
 			   u64 device_id,
 			   u64 block_start,
 			   u64 num_blocks);
-int btrfs_map_bh_to_logical(struct btrfs_root *root, struct buffer_head *bh,
-			     u64 logical);
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
-void btrfs_mark_buffer_dirty(struct buffer_head *bh);
+void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
+int wait_on_tree_block_writeback(struct btrfs_root *root,
+				 struct extent_buffer *buf);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f261a8326cd..089c41cbca7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,7 +33,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	struct radix_tree_root *extent_radix;
 	int slot;
 	u64 i;
@@ -56,7 +56,6 @@ static int cache_block_group(struct btrfs_root *root,
 	path->reada = 2;
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
-	key.flags = 0;
 	key.offset = 0;
 
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -69,9 +68,9 @@ static int cache_block_group(struct btrfs_root *root,
 		path->slots[0]--;
 
 	while(1) {
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		leaf = path->nodes[0];
 		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(&leaf->header)) {
+		if (slot >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto err;
@@ -82,7 +81,7 @@ static int cache_block_group(struct btrfs_root *root,
 			}
 		}
 
-		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid < block_group->key.objectid) {
 			if (key.objectid + key.offset > first_free)
 				first_free = key.objectid + key.offset;
@@ -116,8 +115,7 @@ next:
 		hole_size = block_group->key.objectid +
 			block_group->key.offset - last;
 		for (i = 0; i < hole_size; i++) {
-			set_radix_bit(extent_radix,
-					last + i);
+			set_radix_bit(extent_radix, last + i);
 		}
 	}
 	block_group->cached = 1;
@@ -366,7 +364,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	struct btrfs_extent_item *item;
 	u32 refs;
 
@@ -375,7 +373,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	key.objectid = blocknr;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
@@ -386,10 +383,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 	BUG_ON(ret != 0);
-	l = btrfs_buffer_leaf(path->nodes[0]);
+	l = path->nodes[0];
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	refs = btrfs_extent_refs(item);
-	btrfs_set_extent_refs(item, refs + 1);
+	refs = btrfs_extent_refs(l, item);
+	btrfs_set_extent_refs(l, item, refs + 1);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, path);
@@ -414,23 +411,25 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	struct btrfs_extent_item *item;
 
 	path = btrfs_alloc_path();
 	key.objectid = blocknr;
 	key.offset = num_blocks;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 0);
 	if (ret < 0)
 		goto out;
-	if (ret != 0)
+	if (ret != 0) {
+		btrfs_print_leaf(root, path->nodes[0]);
+		printk("failed to find block number %Lu\n", blocknr);
 		BUG();
-	l = btrfs_buffer_leaf(path->nodes[0]);
+	}
+	l = path->nodes[0];
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	*refs = btrfs_extent_refs(item);
+	*refs = btrfs_extent_refs(l, item);
 out:
 	btrfs_free_path(path);
 	return 0;
@@ -439,16 +438,16 @@ out:
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root)
 {
-	return btrfs_inc_extent_ref(trans, root, bh_blocknr(root->node), 1);
+	return btrfs_inc_extent_ref(trans, root,
+				    extent_buffer_blocknr(root->node), 1);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct buffer_head *buf)
+		  struct extent_buffer *buf)
 {
 	u64 blocknr;
-	struct btrfs_node *buf_node;
-	struct btrfs_leaf *buf_leaf;
-	struct btrfs_disk_key *key;
+	u32 nritems;
+	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int leaf;
@@ -458,31 +457,31 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 	if (!root->ref_cows)
 		return 0;
-	buf_node = btrfs_buffer_node(buf);
-	leaf = btrfs_is_leaf(buf_node);
-	buf_leaf = btrfs_buffer_leaf(buf);
-	for (i = 0; i < btrfs_header_nritems(&buf_node->header); i++) {
+
+	leaf = btrfs_is_leaf(buf);
+	nritems = btrfs_header_nritems(buf);
+	for (i = 0; i < nritems; i++) {
 		if (leaf) {
 			u64 disk_blocknr;
-			key = &buf_leaf->items[i].key;
-			if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
-			fi = btrfs_item_ptr(buf_leaf, i,
+			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(fi) ==
+			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+			disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi);
 			if (disk_blocknr == 0)
 				continue;
 			ret = btrfs_inc_extent_ref(trans, root, disk_blocknr,
-				    btrfs_file_extent_disk_num_blocks(fi));
+				    btrfs_file_extent_disk_num_blocks(buf, fi));
 			if (ret) {
 				faili = i;
 				goto fail;
 			}
 		} else {
-			blocknr = btrfs_node_blockptr(buf_node, i);
+			blocknr = btrfs_node_blockptr(buf, i);
 			ret = btrfs_inc_extent_ref(trans, root, blocknr, 1);
 			if (ret) {
 				faili = i;
@@ -496,22 +495,23 @@ fail:
 	for (i =0; i < faili; i++) {
 		if (leaf) {
 			u64 disk_blocknr;
-			key = &buf_leaf->items[i].key;
-			if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
-			fi = btrfs_item_ptr(buf_leaf, i,
+			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(fi) ==
+			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+			disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi);
 			if (disk_blocknr == 0)
 				continue;
 			err = btrfs_free_extent(trans, root, disk_blocknr,
-				    btrfs_file_extent_disk_num_blocks(fi), 0);
+				    btrfs_file_extent_disk_num_blocks(buf,
+								      fi), 0);
 			BUG_ON(err);
 		} else {
-			blocknr = btrfs_node_blockptr(buf_node, i);
+			blocknr = btrfs_node_blockptr(buf, i);
 			err = btrfs_free_extent(trans, root, blocknr, 1, 0);
 			BUG_ON(err);
 		}
@@ -527,16 +527,18 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	int ret;
 	int pending_ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_block_group_item *bi;
+	unsigned long bi;
+	struct extent_buffer *leaf;
 
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 	if (ret < 0)
 		goto fail;
 	BUG_ON(ret);
-	bi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			    struct btrfs_block_group_item);
-	memcpy(bi, &cache->item, sizeof(*bi));
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	leaf = path->nodes[0];
+	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
+	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(extent_root, path);
 fail:
 	finish_current_insert(trans, extent_root);
@@ -768,11 +770,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	unsigned long gang[8];
 	struct btrfs_fs_info *info = extent_root->fs_info;
 
-	btrfs_set_extent_refs(&extent_item, 1);
+	btrfs_set_stack_extent_refs(&extent_item, 1);
 	ins.offset = 1;
-	ins.flags = 0;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
-	btrfs_set_extent_owner(&extent_item, extent_root->root_key.objectid);
+	btrfs_set_stack_extent_owner(&extent_item,
+				     extent_root->root_key.objectid);
 
 	while(1) {
 		ret = find_first_radix_bit(&info->extent_ins_radix, gang, 0,
@@ -795,23 +797,20 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 {
 	int err;
-	struct btrfs_header *header;
-	struct buffer_head *bh;
+	struct extent_buffer *buf;
 
 	if (!pending) {
-		bh = btrfs_find_tree_block(root, blocknr);
-		if (bh) {
-			if (buffer_uptodate(bh)) {
+		buf = btrfs_find_tree_block(root, blocknr);
+		if (buf) {
+			if (btrfs_buffer_uptodate(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
-				header = btrfs_buffer_header(bh);
-				if (btrfs_header_generation(header) ==
-				    transid) {
-					btrfs_block_release(root, bh);
+				if (btrfs_header_generation(buf) == transid) {
+					free_extent_buffer(buf);
 					return 0;
 				}
 			}
-			btrfs_block_release(root, bh);
+			free_extent_buffer(buf);
 		}
 		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
 		if (!err) {
@@ -839,12 +838,12 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key key;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
+	struct extent_buffer *leaf;
 	int ret;
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
 	key.objectid = blocknr;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_blocks;
 
@@ -856,12 +855,16 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret < 0)
 		return ret;
 	BUG_ON(ret);
-	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_extent_item);
-	BUG_ON(ei->refs == 0);
-	refs = btrfs_extent_refs(ei) - 1;
-	btrfs_set_extent_refs(ei, refs);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	refs = btrfs_extent_refs(leaf, ei);
+	BUG_ON(refs == 0);
+	refs -= 1;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	btrfs_mark_buffer_dirty(leaf);
+
 	if (refs == 0) {
 		u64 super_blocks_used, root_blocks_used;
 
@@ -876,8 +879,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 					    super_blocks_used - num_blocks);
 
 		/* block accounting for root item */
-		root_blocks_used = btrfs_root_blocks_used(&root->root_item);
-		btrfs_set_root_blocks_used(&root->root_item,
+		root_blocks_used = btrfs_root_used(&root->root_item);
+		btrfs_set_root_used(&root->root_item,
 					   root_blocks_used - num_blocks);
 
 		ret = btrfs_del_item(trans, extent_root, path);
@@ -984,7 +987,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	u64 test_block;
 	u64 orig_search_start = search_start;
 	int start_found;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	int total_needed = num_blocks;
@@ -994,10 +997,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int wrapped = 0;
 
 	WARN_ON(num_blocks < 1);
-	ins->flags = 0;
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
-	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	level = btrfs_header_level(root->node);
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_blocks(&info->super_copy);
 	if (hint_block) {
@@ -1034,8 +1037,9 @@ check_failed:
 		path->slots[0]--;
 	}
 
-	l = btrfs_buffer_leaf(path->nodes[0]);
-	btrfs_disk_key_to_cpu(&key, &l->items[path->slots[0]].key);
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+
 	/*
 	 * a rare case, go back one key if we hit a block group item
 	 * instead of an extent item
@@ -1055,9 +1059,9 @@ check_failed:
 	}
 
 	while (1) {
-		l = btrfs_buffer_leaf(path->nodes[0]);
+		l = path->nodes[0];
 		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(&l->header)) {
+		if (slot >= btrfs_header_nritems(l)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -1075,7 +1079,7 @@ check_failed:
 			goto check_pending;
 		}
 
-		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		btrfs_item_key_to_cpu(l, &key, slot);
 		if (key.objectid >= search_start && key.objectid > last_block &&
 		    start_found) {
 			if (last_block < search_start)
@@ -1183,8 +1187,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
 
-	btrfs_set_extent_refs(&extent_item, 1);
-	btrfs_set_extent_owner(&extent_item, owner);
+	btrfs_set_stack_extent_refs(&extent_item, 1);
+	btrfs_set_stack_extent_owner(&extent_item, owner);
 
 	WARN_ON(num_blocks < 1);
 	ret = find_free_extent(trans, root, num_blocks, empty_size,
@@ -1201,8 +1205,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				    num_blocks);
 
 	/* block accounting for root item */
-	root_blocks_used = btrfs_root_blocks_used(&root->root_item);
-	btrfs_set_root_blocks_used(&root->root_item, root_blocks_used +
+	root_blocks_used = btrfs_root_used(&root->root_item);
+	btrfs_set_root_used(&root->root_item, root_blocks_used +
 				   num_blocks);
 
 	if (root == extent_root) {
@@ -1241,13 +1245,13 @@ update_block:
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
  */
-struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root, u64 hint,
-					   u64 empty_size)
+struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root, u64 hint,
+					     u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
-	struct buffer_head *buf;
+	struct extent_buffer *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
 				 1, empty_size, hint, (u64)-1, &ins, 0);
@@ -1260,53 +1264,57 @@ struct buffer_head *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		btrfs_free_extent(trans, root, ins.objectid, 1, 0);
 		return ERR_PTR(-ENOMEM);
 	}
-	WARN_ON(buffer_dirty(buf));
-	set_buffer_uptodate(buf);
+	btrfs_set_buffer_uptodate(buf);
+	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+			 buf->start + buf->len - 1, GFP_NOFS);
+	/*
 	set_buffer_checked(buf);
 	set_buffer_defrag(buf);
-	set_radix_bit(&trans->transaction->dirty_pages, buf->b_page->index);
+	*/
+	/* FIXME!!!!!!!!!!!!!!!!
+	set_radix_bit(&trans->transaction->dirty_pages, buf->pages[0]->index);
+	*/
 	trans->blocks_used++;
 	return buf;
 }
 
 static int drop_leaf_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root, struct buffer_head *cur)
+			 struct btrfs_root *root, struct extent_buffer *leaf)
 {
-	struct btrfs_disk_key *key;
-	struct btrfs_leaf *leaf;
+	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
 
-	BUG_ON(!btrfs_is_leaf(btrfs_buffer_node(cur)));
-	leaf = btrfs_buffer_leaf(cur);
-	nritems = btrfs_header_nritems(&leaf->header);
+	BUG_ON(!btrfs_is_leaf(leaf));
+	nritems = btrfs_header_nritems(leaf);
 	for (i = 0; i < nritems; i++) {
 		u64 disk_blocknr;
-		key = &leaf->items[i].key;
-		if (btrfs_disk_key_type(key) != BTRFS_EXTENT_DATA_KEY)
+
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-		if (btrfs_file_extent_type(fi) == BTRFS_FILE_EXTENT_INLINE)
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
 		/*
 		 * FIXME make sure to insert a trans record that
 		 * repeats the snapshot del on crash
 		 */
-		disk_blocknr = btrfs_file_extent_disk_blocknr(fi);
+		disk_blocknr = btrfs_file_extent_disk_blocknr(leaf, fi);
 		if (disk_blocknr == 0)
 			continue;
 		ret = btrfs_free_extent(trans, root, disk_blocknr,
-					btrfs_file_extent_disk_num_blocks(fi),
-					0);
+				btrfs_file_extent_disk_num_blocks(leaf, fi), 0);
 		BUG_ON(ret);
 	}
 	return 0;
 }
 
 static void reada_walk_down(struct btrfs_root *root,
-			    struct btrfs_node *node)
+			    struct extent_buffer *node)
 {
 	int i;
 	u32 nritems;
@@ -1314,7 +1322,7 @@ static void reada_walk_down(struct btrfs_root *root,
 	int ret;
 	u32 refs;
 
-	nritems = btrfs_header_nritems(&node->header);
+	nritems = btrfs_header_nritems(node);
 	for (i = 0; i < nritems; i++) {
 		blocknr = btrfs_node_blockptr(node, i);
 		ret = lookup_extent_ref(NULL, root, blocknr, 1, &refs);
@@ -1337,16 +1345,17 @@ static void reada_walk_down(struct btrfs_root *root,
 static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, int *level)
 {
-	struct buffer_head *next;
-	struct buffer_head *cur;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
 	u64 blocknr;
 	int ret;
 	u32 refs;
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = lookup_extent_ref(trans, root, bh_blocknr(path->nodes[*level]),
-			       1, &refs);
+	ret = lookup_extent_ref(trans, root,
+				extent_buffer_blocknr(path->nodes[*level]),
+				1, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
@@ -1360,21 +1369,20 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		cur = path->nodes[*level];
 
 		if (*level > 0 && path->slots[*level] == 0)
-			reada_walk_down(root, btrfs_buffer_node(cur));
+			reada_walk_down(root, cur);
 
-		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+		if (btrfs_header_level(cur) != *level)
 			WARN_ON(1);
 
 		if (path->slots[*level] >=
-		    btrfs_header_nritems(btrfs_buffer_header(cur)))
+		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
 			ret = drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
-		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
-					      path->slots[*level]);
+		blocknr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ret = lookup_extent_ref(trans, root, blocknr, 1, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
@@ -1384,8 +1392,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			continue;
 		}
 		next = btrfs_find_tree_block(root, blocknr);
-		if (!next || !buffer_uptodate(next)) {
-			brelse(next);
+		if (!next || !btrfs_buffer_uptodate(next)) {
+			free_extent_buffer(next);
 			mutex_unlock(&root->fs_info->fs_mutex);
 			next = read_tree_block(root, blocknr);
 			mutex_lock(&root->fs_info->fs_mutex);
@@ -1395,7 +1403,7 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			BUG_ON(ret);
 			if (refs != 1) {
 				path->slots[*level]++;
-				brelse(next);
+				free_extent_buffer(next);
 				ret = btrfs_free_extent(trans, root,
 							blocknr, 1, 1);
 				BUG_ON(ret);
@@ -1404,17 +1412,17 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
-			btrfs_block_release(root, path->nodes[*level-1]);
+			free_extent_buffer(path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
-		*level = btrfs_header_level(btrfs_buffer_header(next));
+		*level = btrfs_header_level(next);
 		path->slots[*level] = 0;
 	}
 out:
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = btrfs_free_extent(trans, root,
-				bh_blocknr(path->nodes[*level]), 1, 1);
-	btrfs_block_release(root, path->nodes[*level]);
+			extent_buffer_blocknr(path->nodes[*level]), 1, 1);
+	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
@@ -1436,24 +1444,24 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(
-		    btrfs_buffer_header(path->nodes[i])) - 1) {
-			struct btrfs_node *node;
-			node = btrfs_buffer_node(path->nodes[i]);
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+			struct extent_buffer *node;
+			struct btrfs_disk_key disk_key;
+			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
 			WARN_ON(*level == 0);
+			btrfs_node_key(node, &disk_key, path->slots[i]);
 			memcpy(&root_item->drop_progress,
-			       &node->ptrs[path->slots[i]].key,
-			       sizeof(root_item->drop_progress));
+			       &disk_key, sizeof(disk_key));
 			root_item->drop_level = i;
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
-						bh_blocknr(path->nodes[*level]),
-						1, 1);
+				    extent_buffer_blocknr(path->nodes[*level]),
+				    1, 1);
 			BUG_ON(ret);
-			btrfs_block_release(root, path->nodes[*level]);
+			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
 		}
@@ -1480,15 +1488,15 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	level = btrfs_header_level(root->node);
 	orig_level = level;
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		path->nodes[level] = root->node;
 		path->slots[level] = 0;
 	} else {
 		struct btrfs_key key;
-		struct btrfs_disk_key *found_key;
-		struct btrfs_node *node;
+		struct btrfs_disk_key found_key;
+		struct extent_buffer *node;
 
 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
 		level = root_item->drop_level;
@@ -1498,10 +1506,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 			goto out;
 		}
-		node = btrfs_buffer_node(path->nodes[level]);
-		found_key = &node->ptrs[path->slots[level]].key;
-		WARN_ON(memcmp(found_key, &root_item->drop_progress,
-			       sizeof(*found_key)));
+		node = path->nodes[level];
+		btrfs_node_key(node, &found_key, path->slots[level]);
+		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
+			       sizeof(found_key)));
 	}
 	while(1) {
 		wret = walk_down_tree(trans, root, path, &level);
@@ -1516,12 +1524,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			ret = wret;
 		ret = -EAGAIN;
-		get_bh(root->node);
+		extent_buffer_get(root->node);
 		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
-			btrfs_block_release(root, path->nodes[i]);
+			free_extent_buffer(path->nodes[i]);
 			path->nodes[i] = 0;
 		}
 	}
@@ -1581,13 +1589,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_path *path;
 	int ret;
 	int err = 0;
-	struct btrfs_block_group_item *bi;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct radix_tree_root *radix;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	u64 group_size_blocks;
 	u64 used;
 
@@ -1596,7 +1603,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = group_size_blocks;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	path = btrfs_alloc_path();
@@ -1610,18 +1616,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			err = ret;
 			break;
 		}
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[path->slots[0]].key);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		cache = kmalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
 			err = -1;
 			break;
 		}
 
-		bi = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_block_group_item);
-		if (bi->flags & BTRFS_BLOCK_GROUP_DATA) {
+		read_extent_buffer(leaf, &cache->item,
+				   btrfs_item_ptr_offset(leaf, path->slots[0]),
+				   sizeof(cache->item));
+		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
 			radix = &info->block_group_data_radix;
 			cache->data = 1;
 		} else {
@@ -1629,7 +1635,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			cache->data = 0;
 		}
 
-		memcpy(&cache->item, bi, sizeof(*bi));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
 		cache->first_free = cache->key.objectid;
@@ -1640,11 +1645,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
+
 		ret = radix_tree_insert(radix, found_key.objectid +
 					found_key.offset - 1,
 					(void *)cache);
 		BUG_ON(ret);
-		used = btrfs_block_group_used(bi);
+		used = btrfs_block_group_used(&cache->item);
 		if (used < div_factor(key.offset, 8)) {
 			radix_tree_tag_set(radix, found_key.objectid +
 					   found_key.offset - 1,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 0ab368e091f..55fefdfab84 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -26,16 +26,6 @@ struct tree_entry {
 	struct rb_node rb_node;
 };
 
-/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-
 void __init extent_map_init(void)
 {
 	extent_map_cache = btrfs_cache_create("extent_map",
@@ -223,7 +213,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (rb)
 			prev = rb_entry(rb, struct extent_map, rb_node);
 		if (prev && prev->end + 1 == em->start &&
-		    ((em->block_start == 0 && prev->block_start == 0) ||
+		    ((em->block_start == EXTENT_MAP_HOLE &&
+		      prev->block_start == EXTENT_MAP_HOLE) ||
 			     (em->block_start == prev->block_end + 1))) {
 			em->start = prev->start;
 			em->block_start = prev->block_start;
@@ -926,6 +917,40 @@ int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(set_range_writeback);
 
+/*
+ * Starting from the node tree_search() returns for 'start', walk the state
+ * tree forward and report the first extent state that has any of 'bits' set.
+ *
+ * On a match the state's [start, end] range is stored in *start_ret and
+ * *end_ret and 0 is returned.  If no state matches (or the search finds
+ * nothing), 1 is returned and the output parameters are left untouched.
+ */
+int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	int ret = 1;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->state & bits) {
+			*start_ret = state->start;
+			*end_ret = state->end;
+			ret = 0;
+			/*
+			 * stop at the first hit; without this the walk keeps
+			 * going and later states overwrite the result
+			 */
+			break;
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(find_first_extent_bit);
+
 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
 			     u64 start, u64 lock_start, u64 *end, u64 max_bytes)
 {
@@ -1450,7 +1475,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 		em = NULL;
 
 		/* we've found a hole, just zero and go on */
-		if (block_start == 0) {
+		if (block_start == EXTENT_MAP_HOLE) {
 			zero_user_page(page, page_offset, iosize, KM_USER0);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    GFP_NOFS);
@@ -1593,7 +1618,8 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
+		if (block_start == EXTENT_MAP_HOLE ||
+		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
 			cur = cur + iosize;
@@ -1630,7 +1656,6 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 		nr++;
 	}
 done:
-	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
 	unlock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_page(page);
 	return 0;
@@ -1827,8 +1852,623 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 
 	// XXX(hch): block 0 is valid in some cases, e.g. XFS RT device
 	if (em->block_start == EXTENT_MAP_INLINE ||
-	    em->block_start == 0)
-	    	return 0;
+	    em->block_start == EXTENT_MAP_HOLE)
+		return 0;
 
 	return (em->block_start + start - em->start) >> inode->i_blkbits;
 }
+
+/*
+ * Allocate an extent_buffer covering bytes [start, start + len - 1] of the
+ * tree's mapping, creating any page cache pages that are missing.
+ *
+ * The buffer is returned with one reference held and with a page cache
+ * reference on every page; the pages themselves are unlocked before
+ * returning.  EXTENT_UPTODATE is set on the buffer only when every page
+ * was already PageUptodate.
+ *
+ * Returns NULL if the buffer or any of its pages cannot be allocated.
+ */
+struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
+					  u64 start, unsigned long len,
+					  gfp_t mask)
+{
+	unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) -
+				  (start >> PAGE_CACHE_SHIFT) + 1;
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct extent_buffer *eb;
+	struct page *p;
+	struct address_space *mapping = tree->mapping;
+	/*
+	 * assume uptodate until a page proves otherwise; starting at 0
+	 * meant the flag below could never be set
+	 */
+	int uptodate = 1;
+
+	eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	eb->start = start;
+	eb->len = len;
+	atomic_set(&eb->refs, 1);
+
+	for (i = 0; i < num_pages; i++, index++) {
+		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+		if (!p)
+			goto fail;
+		eb->pages[i] = p;
+		if (!PageUptodate(p))
+			uptodate = 0;
+		unlock_page(p);
+	}
+	if (uptodate)
+		eb->flags |= EXTENT_UPTODATE;
+	return eb;
+fail:
+	/* kzalloc zeroed pages[], so only the pages grabbed so far are released */
+	free_extent_buffer(eb);
+	return NULL;
+}
+EXPORT_SYMBOL(alloc_extent_buffer);
+
+/*
+ * Build an extent_buffer for bytes [start, start + len - 1] out of pages
+ * that are already present in the tree's mapping.  Nothing is created:
+ * if any page is missing from the page cache the lookup fails.
+ *
+ * Returns the buffer with one reference held, or NULL on allocation
+ * failure or when a page is not cached.
+ */
+struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
+					 u64 start, unsigned long len,
+					  gfp_t mask)
+{
+	struct address_space *mapping = tree->mapping;
+	unsigned long first_index = start >> PAGE_CACHE_SHIFT;
+	unsigned long last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long num_pages = last_index - first_index + 1;
+	unsigned long nr;
+	struct extent_buffer *eb;
+
+	eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	eb->start = start;
+	eb->len = len;
+	atomic_set(&eb->refs, 1);
+
+	for (nr = 0; nr < num_pages; nr++) {
+		struct page *page = find_get_page(mapping, first_index + nr);
+
+		if (!page) {
+			/* drops the pages collected so far along with eb */
+			free_extent_buffer(eb);
+			return NULL;
+		}
+		eb->pages[nr] = page;
+	}
+	return eb;
+}
+EXPORT_SYMBOL(find_extent_buffer);
+
+/*
+ * Drop one reference on an extent_buffer.  When the last reference goes
+ * away, release the page cache reference on every page that was attached
+ * and free the buffer itself.  Passing NULL is a no-op.
+ */
+void free_extent_buffer(struct extent_buffer *eb)
+{
+	unsigned long first_index;
+	unsigned long last_index;
+	unsigned long num_pages;
+	unsigned long nr;
+
+	if (!eb)
+		return;
+
+	/* somebody else still holds a reference */
+	if (!atomic_dec_and_test(&eb->refs))
+		return;
+
+	first_index = eb->start >> PAGE_CACHE_SHIFT;
+	last_index = (eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT;
+	num_pages = last_index - first_index + 1;
+
+	for (nr = 0; nr < num_pages; nr++) {
+		struct page *page = eb->pages[nr];
+
+		/* slots can be empty when setup failed part way through */
+		if (page)
+			page_cache_release(page);
+	}
+	kfree(eb);
+}
+EXPORT_SYMBOL(free_extent_buffer);
+
+/*
+ * Clear the dirty state for an extent_buffer: first the EXTENT_DIRTY bits
+ * for its byte range in 'tree', then the dirty flag of each backing page.
+ *
+ * A page at either edge of the buffer may be shared with neighbouring
+ * data when the buffer is not page aligned; such a page is left dirty if
+ * any part of it is still marked EXTENT_DIRTY in the tree.
+ *
+ * Always returns 0.
+ */
+int clear_extent_buffer_dirty(struct extent_map_tree *tree,
+			      struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+	struct page *page;
+
+	u64 start = eb->start;
+	u64 end = start + eb->len - 1;
+
+	clear_extent_dirty(tree, start, end, GFP_NOFS);
+	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
+		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		lock_page(page);
+		/*
+		 * if we're on the last page or the first page and the
+		 * block isn't aligned on a page boundary, do extra checks
+		 * to make sure we don't clean page that is partially dirty
+		 *
+		 * the tail test uses the exclusive end (start + len): the
+		 * inclusive end is only page aligned when the buffer stops
+		 * one byte into a page, which is the wrong condition.
+		 */
+		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+		    ((i == num_pages - 1) &&
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+			/* widen before shifting so 32-bit page indexes don't overflow */
+			start = (u64)page->index << PAGE_CACHE_SHIFT;
+			end  = start + PAGE_CACHE_SIZE - 1;
+			if (test_range_bit(tree, start, end,
+					   EXTENT_DIRTY, 0)) {
+				unlock_page(page);
+				continue;
+			}
+		}
+		clear_page_dirty_for_io(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(clear_extent_buffer_dirty);
+
+/*
+ * Wait for writeback on the byte range covered by an extent_buffer.
+ * Thin wrapper that forwards the buffer's inclusive [first, last] byte
+ * range to wait_on_extent_writeback() and returns its result.
+ */
+int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
+				    struct extent_buffer *eb)
+{
+	u64 first_byte = eb->start;
+	u64 last_byte = first_byte + eb->len - 1;
+
+	return wait_on_extent_writeback(tree, first_byte, last_byte);
+}
+EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
+
+/*
+ * Mark the byte range covered by an extent_buffer dirty.  Thin wrapper
+ * that forwards the buffer's inclusive [first, last] byte range to
+ * set_range_dirty() and returns its result.
+ */
+int set_extent_buffer_dirty(struct extent_map_tree *tree,
+			     struct extent_buffer *eb)
+{
+	u64 first_byte = eb->start;
+	u64 last_byte = first_byte + eb->len - 1;
+
+	return set_range_dirty(tree, first_byte, last_byte);
+}
+EXPORT_SYMBOL(set_extent_buffer_dirty);
+
+/*
+ * Mark an extent_buffer's byte range uptodate in 'tree' and propagate
+ * that to its backing pages.
+ *
+ * Pages fully covered by the buffer are marked with SetPageUptodate().
+ * A first or last page that the buffer only partially covers is handed
+ * to check_page_uptodate() instead of being marked directly.
+ *
+ * Always returns 0.
+ */
+int set_extent_buffer_uptodate(struct extent_map_tree *tree,
+				struct extent_buffer *eb)
+{
+	unsigned long i;
+	struct page *page;
+	unsigned long num_pages;
+
+	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
+		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+
+	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+			    GFP_NOFS);
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		/*
+		 * the tail test uses the exclusive end (start + len); the
+		 * inclusive end is only page aligned when the buffer stops
+		 * one byte into a page, which would wrongly mark a
+		 * partially covered page uptodate.
+		 */
+		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+		    ((i == num_pages - 1) &&
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+			check_page_uptodate(tree, page);
+			continue;
+		}
+		SetPageUptodate(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(set_extent_buffer_uptodate);
+
+/*
+ * Report whether an extent_buffer is uptodate.  The EXTENT_UPTODATE flag
+ * cached on the buffer answers immediately; otherwise the result of
+ * test_range_bit() for EXTENT_UPTODATE over the buffer's byte range in
+ * 'tree' is returned.
+ */
+int extent_buffer_uptodate(struct extent_map_tree *tree,
+			     struct extent_buffer *eb)
+{
+	u64 last_byte = eb->start + eb->len - 1;
+
+	if (!(eb->flags & EXTENT_UPTODATE))
+		return test_range_bit(tree, eb->start, last_byte,
+				      EXTENT_UPTODATE, 1);
+	return 1;
+}
+EXPORT_SYMBOL(extent_buffer_uptodate);
+
+/*
+ * Start reads for every page of an extent_buffer that is not uptodate.
+ *
+ * Returns 0 right away when the buffer is flagged EXTENT_UPTODATE or the
+ * tree reports its whole range uptodate.
+ *
+ * wait == 0: pages that cannot be locked immediately are skipped and the
+ *            function returns once reads have been issued.
+ * wait != 0: each page is locked, read, and then waited on; -EIO is
+ *            returned if any page is still not uptodate afterwards.
+ *
+ * A readpage() error is returned as is.  EXTENT_UPTODATE is set on the
+ * buffer only when every page was read successfully.
+ */
+int read_extent_buffer_pages(struct extent_map_tree *tree,
+			     struct extent_buffer *eb, int wait)
+{
+	unsigned long i;
+	struct page *page;
+	int err;
+	int ret = 0;
+	unsigned long num_pages;
+
+	if (eb->flags & EXTENT_UPTODATE)
+		return 0;
+
+	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+			   EXTENT_UPTODATE, 1)) {
+		return 0;
+	}
+
+	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
+		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		if (PageUptodate(page)) {
+			continue;
+		}
+		if (!wait) {
+			if (TestSetPageLocked(page)) {
+				continue;
+			}
+		} else {
+			lock_page(page);
+		}
+		/* recheck under the page lock, someone may have read it */
+		if (!PageUptodate(page)) {
+			err = page->mapping->a_ops->readpage(NULL, page);
+			if (err) {
+				ret = err;
+			}
+		} else {
+			unlock_page(page);
+		}
+	}
+
+	if (ret || !wait) {
+		return ret;
+	}
+
+	for (i = 0; i < num_pages; i++) {
+		page = eb->pages[i];
+		wait_on_page_locked(page);
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+		}
+	}
+	/* never cache "uptodate" for a buffer whose read failed */
+	if (!ret)
+		eb->flags |= EXTENT_UPTODATE;
+	return ret;
+}
+EXPORT_SYMBOL(read_extent_buffer_pages);
+
+/*
+ * Copy 'len' bytes out of an extent_buffer into 'dstv'.
+ *
+ * 'start' is a byte offset relative to the beginning of the buffer, not a
+ * disk offset.  The copy is split at page boundaries; each page is
+ * expected to be uptodate (a warning is issued otherwise).
+ */
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+			unsigned long start,
+			unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *dst = (char *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	/* start and len are buffer relative, so bound them by eb->len */
+	WARN_ON(start + len > eb->len);
+
+	/*
+	 * the offset inside page i must account for the buffer's own
+	 * offset into its first page on every page, not only on page 0
+	 */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		/* fetched here so pages[] is never indexed past the last chunk */
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		// kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
+		memcpy(dst, kaddr + offset, cur);
+		// kunmap_atomic(kaddr, KM_USER0);
+
+		dst += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(read_extent_buffer);
+
+/*
+ * Give the caller direct access to the page of an extent_buffer that
+ * contains buffer offset 'start'.
+ *
+ * Outputs:
+ *   *token     - address of the page (passed back to unmap_extent_buffer)
+ *   *map       - address of the first buffer byte that lives in that page
+ *   *map_start - buffer-relative offset of the byte *map points at
+ *   *map_len   - number of bytes available from *map to the end of the page
+ *
+ * 'km' is unused while the kmap_atomic() call is commented out.
+ * Always returns 0.
+ */
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+		      char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km)
+{
+	size_t offset;
+	char *kaddr;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+
+	if (i == 0) {
+		offset = start_offset;
+		*map_start = 0;
+	} else {
+		offset = 0;
+		/*
+		 * page i begins start_offset bytes earlier in buffer terms
+		 * than i * PAGE_CACHE_SIZE; 'offset' is 0 here, so it must
+		 * not be used for this adjustment
+		 */
+		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
+	}
+
+	// kaddr = kmap_atomic(eb->pages[i], km);
+	kaddr = page_address(eb->pages[i]);
+	*token = kaddr;
+	*map = kaddr + offset;
+	*map_len = PAGE_CACHE_SIZE - offset;
+	return 0;
+}
+EXPORT_SYMBOL(map_extent_buffer);
+
+/*
+ * Counterpart of map_extent_buffer().  Currently does nothing: the
+ * kunmap_atomic() call is commented out, so 'token' and 'km' are unused.
+ */
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+	// kunmap_atomic(token, km);
+}
+EXPORT_SYMBOL(unmap_extent_buffer);
+
+/*
+ * Compare 'len' bytes at 'ptrv' with the contents of an extent_buffer
+ * beginning at buffer-relative offset 'start'.
+ *
+ * The comparison is done page by page and stops at the first chunk that
+ * differs.  Returns the memcmp() result of that chunk, or 0 when all
+ * bytes match.
+ */
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *ptr = (char *)ptrv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	/* start and len are buffer relative, so bound them by eb->len */
+	WARN_ON(start + len > eb->len);
+
+	/*
+	 * the offset inside page i must account for the buffer's own
+	 * offset into its first page on every page, not only on page 0
+	 */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		/* fetched here so pages[] is never indexed past the last chunk */
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+		// kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
+		ret = memcmp(ptr, kaddr + offset, cur);
+		// kunmap_atomic(kaddr, KM_USER0);
+		if (ret)
+			break;
+
+		ptr += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(memcmp_extent_buffer);
+
+/*
+ * Copy 'len' bytes from 'srcv' into an extent_buffer beginning at
+ * buffer-relative offset 'start'.  The copy is split at page boundaries;
+ * each page is expected to be uptodate (a warning is issued otherwise).
+ */
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+			 unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *src = (char *)srcv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	/* start and len are buffer relative, so bound them by eb->len */
+	WARN_ON(start + len > eb->len);
+
+	/*
+	 * the offset inside page i must account for the buffer's own
+	 * offset into its first page on every page, not only on page 0
+	 */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		/* fetched here so pages[] is never indexed past the last chunk */
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		// kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
+		memcpy(kaddr + offset, src, cur);
+		// kunmap_atomic(kaddr, KM_USER0);
+
+		src += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(write_extent_buffer);
+
+/*
+ * Fill 'len' bytes of an extent_buffer with the byte 'c', beginning at
+ * buffer-relative offset 'start'.  The fill is split at page boundaries;
+ * each page is expected to be uptodate (a warning is issued otherwise).
+ */
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(start > eb->len);
+	/* start and len are buffer relative, so bound them by eb->len */
+	WARN_ON(start + len > eb->len);
+
+	/*
+	 * the offset inside page i must account for the buffer's own
+	 * offset into its first page on every page, not only on page 0
+	 */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		/* fetched here so pages[] is never indexed past the last chunk */
+		page = eb->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		// kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = page_address(page);
+		memset(kaddr + offset, c, cur);
+		// kunmap_atomic(kaddr, KM_USER0);
+
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(memset_extent_buffer);
+
+/*
+ * Copy 'len' bytes from extent_buffer 'src' (at src_offset) into
+ * extent_buffer 'dst' (at dst_offset).  Both offsets are relative to the
+ * start of their buffer.
+ *
+ * The destination is walked page by page and each chunk is filled with
+ * read_extent_buffer() from the source.  A warning is issued if the two
+ * buffers differ in length.
+ */
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len)
+{
+	u64 dst_len = dst->len;
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+
+	WARN_ON(src->len != dst_len);
+
+	/*
+	 * the offset inside page i must account for the buffer's own
+	 * offset into its first page on every page, not only on page 0
+	 */
+	offset = (start_offset + dst_offset) &
+		((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = dst->pages[i];
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+		// kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = page_address(page);
+		read_extent_buffer(src, kaddr + offset, src_offset, cur);
+		// kunmap_atomic(kaddr, KM_USER1);
+
+		src_offset += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(copy_extent_buffer);
+
+/*
+ * Move 'len' bytes from src_page + src_off to dst_page + dst_off.
+ *
+ * Within a single page the ranges may overlap, so memmove() is used.
+ * Across two different pages the bytes are copied one at a time from the
+ * highest address down to the lowest.  Pages are addressed through
+ * page_address(); no kmap_atomic() mapping is taken.
+ */
+static void move_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = page_address(dst_page);
+	char *src_kaddr;
+	unsigned long left;
+
+	if (dst_page == src_page) {
+		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+		return;
+	}
+
+	src_kaddr = page_address(src_page);
+	/* back to front: byte (left - 1) is copied while 'left' counts down */
+	for (left = len; left > 0; left--)
+		dst_kaddr[dst_off + left - 1] = src_kaddr[src_off + left - 1];
+}
+
+/*
+ * Copy 'len' bytes from src_page + src_off to dst_page + dst_off.
+ *
+ * When source and destination are the same page the two ranges may
+ * overlap (memmove_extent_buffer() hands forward moves to
+ * memcpy_extent_buffer(), which ends up here), so memmove() is used for
+ * that case.  Distinct pages cannot overlap and use plain memcpy().
+ * Pages are addressed through page_address(); no kmap_atomic() mapping
+ * is taken.
+ */
+static void copy_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	//kmap_atomic(dst_page, KM_USER0);
+	char *dst_kaddr = page_address(dst_page);
+	char *src_kaddr;
+
+	if (dst_page != src_page) {
+		src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1);
+		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+	} else {
+		/* memcpy() on overlapping ranges is undefined */
+		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+	}
+	/*
+	kunmap_atomic(dst_kaddr, KM_USER0);
+	if (dst_page != src_page)
+		kunmap_atomic(src_kaddr, KM_USER1);
+	*/
+}
+
+/*
+ * Copy 'len' bytes inside a single extent_buffer from src_offset to
+ * dst_offset, walking forward from the lowest address.  Both offsets are
+ * relative to the start of the buffer.
+ *
+ * Each step copies as much as fits before either the source or the
+ * destination position reaches the end of its page.  A range that runs
+ * past the end of the buffer is a BUG.
+ */
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+
+	while(len > 0) {
+		/*
+		 * page index and in-page offset both derive from the same
+		 * sum, so the buffer's offset into its first page is
+		 * honoured on every page, not only on page 0
+		 */
+		dst_off_in_page = (start_offset + dst_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+
+		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+					       src_off_in_page));
+		cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE -
+					       dst_off_in_page));
+
+		copy_pages(dst->pages[dst_i], dst->pages[src_i],
+			   dst_off_in_page, src_off_in_page, cur);
+
+		src_offset += cur;
+		dst_offset += cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memcpy_extent_buffer);
+
+/*
+ * Move 'len' bytes inside a single extent_buffer from src_offset to
+ * dst_offset; the two ranges may overlap.  Both offsets are relative to
+ * the start of the buffer.
+ *
+ * When the destination lies below the source the work is handed to
+ * memcpy_extent_buffer(), which copies forward.  Otherwise the move runs
+ * backwards from the last byte, one chunk per step, where a chunk ends at
+ * the current end position and reaches back no further than the start of
+ * the source or destination page.  A range that runs past the end of the
+ * buffer is a BUG.
+ */
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	unsigned long dst_end = dst_offset + len - 1;
+	unsigned long src_end = src_offset + len - 1;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset < src_offset) {
+		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+		return;
+	}
+	while(len > 0) {
+		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+
+		/*
+		 * page index and in-page offset both derive from the same
+		 * sum, so the buffer's offset into its first page is
+		 * honoured on every page, not only on page 0
+		 */
+		dst_off_in_page = (start_offset + dst_end) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_end) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+
+		/* bytes available from the start of each page up to the end byte */
+		cur = min(len, src_off_in_page + 1);
+		cur = min(cur, dst_off_in_page + 1);
+		move_pages(dst->pages[dst_i], dst->pages[src_i],
+			   dst_off_in_page - cur + 1,
+			   src_off_in_page - cur + 1, cur);
+
+		/*
+		 * 'cur' bytes ending at *_end were moved, so the next chunk
+		 * ends exactly 'cur' bytes earlier; stepping back by
+		 * cur - 1 would move the boundary byte twice and leave the
+		 * lowest bytes of the range unmoved
+		 */
+		dst_end -= cur;
+		src_end -= cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memmove_extent_buffer);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d913ce01248..430b997a70f 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,9 +3,20 @@
 
 #include <linux/rbtree.h>
 
+#define EXTENT_MAP_HOLE (u64)-3
 #define EXTENT_MAP_INLINE (u64)-2
 #define EXTENT_MAP_DELALLOC (u64)-1
 
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+
 struct extent_map_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
@@ -51,10 +62,13 @@ struct extent_state {
 	struct list_head list;
 };
 
+/*
+ * Bytes needed for an extent_buffer that tracks 'nr' pages.  pages[] is a
+ * flexible array member and contributes nothing to sizeof(), so room for
+ * all 'nr' page pointers has to be added (not nr - 1).
+ */
+#define EXTENT_BUFFER_SIZE(nr) (sizeof(struct extent_buffer) + \
+			       (nr) * sizeof(struct page *))
 struct extent_buffer {
 	u64 start;
-	u64 end; /* inclusive */
-	char *addr;
+	unsigned long len;
+	atomic_t refs;
+	int flags;
 	struct page *pages[];
 };
 
@@ -87,8 +101,12 @@ int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits);
 int extent_invalidatepage(struct extent_map_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
@@ -106,4 +124,57 @@ int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
+					  u64 start, unsigned long len,
+					  gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
+					 u64 start, unsigned long len,
+					  gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_map_tree *tree,
+			     struct extent_buffer *eb, int wait);
+
+/*
+ * Take an additional reference on an extent_buffer.  Each reference is
+ * dropped again with free_extent_buffer().
+ */
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->refs);
+}
+
+/*
+ * Convert an extent_buffer's starting byte offset into a block number.
+ *
+ * NOTE(review): the divisor is a hard-coded 4096, so this presumably
+ * assumes a 4K filesystem block size -- confirm against callers before
+ * using it with any other block size.
+ */
+static inline u64 extent_buffer_blocknr(struct extent_buffer *eb)
+{
+	return eb->start / 4096;
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+			unsigned long start,
+			unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+			 unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
+				    struct extent_buffer *eb);
+int clear_extent_buffer_dirty(struct extent_map_tree *tree,
+			      struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_map_tree *tree,
+			     struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_map_tree *tree,
+			       struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_map_tree *tree,
+			   struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ea4dd4c5fce..226f6d028c3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -34,12 +34,12 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *item;
 	struct btrfs_key file_key;
 	struct btrfs_path *path;
+	struct extent_buffer *leaf;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	file_key.objectid = objectid;
 	file_key.offset = pos;
-	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -47,15 +47,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret);
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_blocknr(item, offset);
-	btrfs_set_file_extent_disk_num_blocks(item, disk_num_blocks);
-	btrfs_set_file_extent_offset(item, 0);
-	btrfs_set_file_extent_num_blocks(item, num_blocks);
-	btrfs_set_file_extent_generation(item, trans->transid);
-	btrfs_set_file_extent_type(item, BTRFS_FILE_EXTENT_REG);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_set_file_extent_disk_blocknr(leaf, item, offset);
+	btrfs_set_file_extent_disk_num_blocks(leaf, item, disk_num_blocks);
+	btrfs_set_file_extent_offset(leaf, item, 0);
+	btrfs_set_file_extent_num_blocks(leaf, item, num_blocks);
+	btrfs_set_file_extent_generation(leaf, item, trans->transid);
+	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -71,32 +72,30 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	struct btrfs_csum_item *item;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	u64 csum_offset = 0;
 	int csums_in_item;
 
 	file_key.objectid = objectid;
 	file_key.offset = offset;
-	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
+	leaf = path->nodes[0];
 	if (ret > 0) {
 		ret = 1;
 		if (path->slots[0] == 0)
 			goto fail;
 		path->slots[0]--;
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[path->slots[0]].key);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 		    found_key.objectid != objectid) {
 			goto fail;
 		}
 		csum_offset = (offset - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
-		csums_in_item = btrfs_item_size(leaf->items + path->slots[0]);
+		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= BTRFS_CRC32_SIZE;
 
 		if (csum_offset >= csums_in_item) {
@@ -127,7 +126,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 
 	file_key.objectid = objectid;
 	file_key.offset = offset;
-	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
 	return ret;
@@ -138,12 +136,14 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  u64 objectid, u64 offset,
 			  char *data, size_t len)
 {
+	return 0;
+#if 0
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	u64 csum_offset;
 
 	path = btrfs_alloc_path();
@@ -161,8 +161,8 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	if (ret == -EFBIG) {
 		u32 item_size;
 		/* we found one, but it isn't big enough yet */
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		item_size = btrfs_item_size(leaf->items + path->slots[0]);
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 		if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
 			/* already at max size, make a new one */
 			goto insert;
@@ -188,8 +188,8 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		goto insert;
 	}
 	path->slots[0]--;
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	btrfs_disk_key_to_cpu(&found_key, &leaf->items[path->slots[0]].key);
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 	csum_offset = (offset - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
@@ -197,10 +197,10 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 	    csum_offset >= MAX_CSUM_ITEMS(root)) {
 		goto insert;
 	}
-	if (csum_offset >= btrfs_item_size(leaf->items + path->slots[0]) /
+	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
 	    BTRFS_CRC32_SIZE) {
 		u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
-		diff = diff - btrfs_item_size(leaf->items + path->slots[0]);
+		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
 		if (diff != BTRFS_CRC32_SIZE)
 			goto insert;
 		ret = btrfs_extend_item(trans, root, path, diff);
@@ -220,21 +220,20 @@ insert:
 		goto fail;
 	}
 csum:
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
-			      struct btrfs_csum_item);
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	ret = 0;
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * BTRFS_CRC32_SIZE);
 found:
-	btrfs_check_bounds(&item->csum, BTRFS_CRC32_SIZE,
-			   path->nodes[0]->b_data,
-			   root->fs_info->sb->s_blocksize);
+	/* FIXME!!!!!!!!!!!! */
 	ret = btrfs_csum_data(root, data, len, &item->csum);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
+#endif
 }
 
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
@@ -242,21 +241,21 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			u64 isize)
 {
 	struct btrfs_key key;
-	struct btrfs_leaf *leaf = btrfs_buffer_leaf(path->nodes[0]);
+	struct extent_buffer *leaf = path->nodes[0];
 	int slot = path->slots[0];
 	int ret;
 	u32 new_item_size;
 	u64 new_item_span;
 	u64 blocks;
 
-	btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+	btrfs_item_key_to_cpu(leaf, &key, slot);
 	if (isize <= key.offset)
 		return 0;
 	new_item_span = isize - key.offset;
-	blocks = (new_item_span + root->blocksize - 1) >>
+	blocks = (new_item_span + root->sectorsize - 1) >>
 		root->fs_info->sb->s_blocksize_bits;
 	new_item_size = blocks * BTRFS_CRC32_SIZE;
-	if (new_item_size >= btrfs_item_size(leaf->items + slot))
+	if (new_item_size >= btrfs_item_size_nr(leaf, slot))
 		return 0;
 	ret = btrfs_truncate_item(trans, root, path, new_item_size);
 	BUG_ON(ret);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4cc459c943e..1734ca69555 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -88,7 +87,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
-	char *ptr, *kaddr;
+	struct extent_buffer *leaf;
+	char *kaddr;
+	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
 	u32 datasize;
 	int err = 0;
@@ -102,7 +103,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 
 	key.objectid = inode->i_ino;
 	key.offset = offset;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
 	BUG_ON(size >= PAGE_CACHE_SIZE);
 	datasize = btrfs_file_extent_calc_inline_size(size);
@@ -113,18 +113,17 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		err = ret;
 		goto fail;
 	}
-	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-	       path->slots[0], struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(ei, trans->transid);
-	btrfs_set_file_extent_type(ei,
-				   BTRFS_FILE_EXTENT_INLINE);
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 	ptr = btrfs_file_extent_inline_start(ei);
 
 	kaddr = kmap_atomic(page, KM_USER0);
-	btrfs_memcpy(root, path->nodes[0]->b_data,
-		     ptr, kaddr + page_offset, size);
+	write_extent_buffer(leaf, kaddr + page_offset, ptr, size);
 	kunmap_atomic(kaddr, KM_USER0);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_mark_buffer_dirty(leaf);
 fail:
 	btrfs_free_path(path);
 	return err;
@@ -156,8 +155,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	em->bdev = inode->i_sb->s_bdev;
 
-	start_pos = pos & ~((u64)root->blocksize - 1);
-	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+	start_pos = pos & ~((u64)root->sectorsize - 1);
+	num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >>
 			inode->i_blkbits;
 
 	down_read(&BTRFS_I(inode)->root->snap_sem);
@@ -184,7 +183,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	if (inode->i_size < start_pos) {
 		u64 last_pos_in_file;
 		u64 hole_size;
-		u64 mask = root->blocksize - 1;
+		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
 
@@ -227,8 +226,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		/* step one, delete the existing extents in this range */
 		/* FIXME blocksize != pagesize */
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
-			 (pos + write_bytes + root->blocksize -1) &
-			 ~((u64)root->blocksize - 1), &hint_block);
+			 (pos + write_bytes + root->sectorsize -1) &
+			 ~((u64)root->sectorsize - 1), &hint_block);
 		if (err)
 			goto failed;
 
@@ -288,7 +287,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	int slot;
 	struct btrfs_file_extent_item *extent;
 	u64 extent_end = 0;
@@ -327,10 +326,10 @@ next_slot:
 		found_extent = 0;
 		found_inline = 0;
 		extent = NULL;
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
+		leaf = path->nodes[0];
 		slot = path->slots[0];
 		ret = 0;
-		btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
+		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.offset >= end || key.objectid != inode->i_ino) {
 			goto out;
 		}
@@ -344,17 +343,18 @@ next_slot:
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
-			found_type = btrfs_file_extent_type(extent);
+			found_type = btrfs_file_extent_type(leaf, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				extent_end = key.offset +
-					(btrfs_file_extent_num_blocks(extent) <<
+				 (btrfs_file_extent_num_blocks(leaf, extent) <<
 					 inode->i_blkbits);
 				found_extent = 1;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				struct btrfs_item *item;
+				item = btrfs_item_nr(leaf, slot);
 				found_inline = 1;
 				extent_end = key.offset +
-				     btrfs_file_extent_inline_len(leaf->items +
-								  slot);
+				     btrfs_file_extent_inline_len(leaf, item);
 			}
 		} else {
 			extent_end = search_start;
@@ -365,8 +365,7 @@ next_slot:
 		    search_start >= extent_end) {
 			int nextret;
 			u32 nritems;
-			nritems = btrfs_header_nritems(
-					btrfs_buffer_header(path->nodes[0]));
+			nritems = btrfs_header_nritems(leaf);
 			if (slot >= nritems - 1) {
 				nextret = btrfs_next_leaf(root, path);
 				if (nextret)
@@ -380,7 +379,7 @@ next_slot:
 
 		/* FIXME, there's only one inline extent allowed right now */
 		if (found_inline) {
-			u64 mask = root->blocksize - 1;
+			u64 mask = root->sectorsize - 1;
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
@@ -388,10 +387,13 @@ next_slot:
 		if (end < extent_end && end >= key.offset) {
 			if (found_extent) {
 				u64 disk_blocknr =
-					btrfs_file_extent_disk_blocknr(extent);
+				    btrfs_file_extent_disk_blocknr(leaf,extent);
 				u64 disk_num_blocks =
-				      btrfs_file_extent_disk_num_blocks(extent);
-				memcpy(&old, extent, sizeof(old));
+				    btrfs_file_extent_disk_num_blocks(leaf,
+								      extent);
+				read_extent_buffer(leaf, &old,
+						   (unsigned long)extent,
+						   sizeof(old));
 				if (disk_blocknr != 0) {
 					ret = btrfs_inc_extent_ref(trans, root,
 					         disk_blocknr, disk_num_blocks);
@@ -406,20 +408,24 @@ next_slot:
 			u64 new_num;
 			u64 old_num;
 			keep = 1;
-			WARN_ON(start & (root->blocksize - 1));
+			WARN_ON(start & (root->sectorsize - 1));
 			if (found_extent) {
 				new_num = (start - key.offset) >>
 					inode->i_blkbits;
-				old_num = btrfs_file_extent_num_blocks(extent);
+				old_num = btrfs_file_extent_num_blocks(leaf,
+								       extent);
 				*hint_block =
-					btrfs_file_extent_disk_blocknr(extent);
-				if (btrfs_file_extent_disk_blocknr(extent)) {
+					btrfs_file_extent_disk_blocknr(leaf,
+								       extent);
+				if (btrfs_file_extent_disk_blocknr(leaf,
+								   extent)) {
 					inode->i_blocks -=
 						(old_num - new_num) << 3;
 				}
-				btrfs_set_file_extent_num_blocks(extent,
+				btrfs_set_file_extent_num_blocks(leaf,
+								 extent,
 								 new_num);
-				btrfs_mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				WARN_ON(1);
 			}
@@ -431,13 +437,17 @@ next_slot:
 			u64 extent_num_blocks = 0;
 			if (found_extent) {
 				disk_blocknr =
-				      btrfs_file_extent_disk_blocknr(extent);
+				      btrfs_file_extent_disk_blocknr(leaf,
+								     extent);
 				disk_num_blocks =
-				      btrfs_file_extent_disk_num_blocks(extent);
+				      btrfs_file_extent_disk_num_blocks(leaf,
+									extent);
 				extent_num_blocks =
-				      btrfs_file_extent_num_blocks(extent);
+				      btrfs_file_extent_num_blocks(leaf,
+								   extent);
 				*hint_block =
-					btrfs_file_extent_disk_blocknr(extent);
+					btrfs_file_extent_disk_blocknr(leaf,
+								       extent);
 			}
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
@@ -464,42 +474,37 @@ next_slot:
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
-			ins.flags = 0;
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
 
+			leaf = path->nodes[0];
 			if (ret) {
-				btrfs_print_leaf(root, btrfs_buffer_leaf(path->nodes[0]));
-				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.flags, ins.offset, start, end, key.offset, extent_end, keep);
+				btrfs_print_leaf(root, leaf);
+				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.type, ins.offset, start, end, key.offset, extent_end, keep);
 			}
 			BUG_ON(ret);
-			extent = btrfs_item_ptr(
-				    btrfs_buffer_leaf(path->nodes[0]),
-				    path->slots[0],
-				    struct btrfs_file_extent_item);
-			btrfs_set_file_extent_disk_blocknr(extent,
-				    btrfs_file_extent_disk_blocknr(&old));
-			btrfs_set_file_extent_disk_num_blocks(extent,
-				    btrfs_file_extent_disk_num_blocks(&old));
-
-			btrfs_set_file_extent_offset(extent,
-				    btrfs_file_extent_offset(&old) +
+			extent = btrfs_item_ptr(leaf, path->slots[0],
+						struct btrfs_file_extent_item);
+			write_extent_buffer(leaf, &old,
+					    (unsigned long)extent, sizeof(old));
+
+			btrfs_set_file_extent_offset(leaf, extent,
+				    le64_to_cpu(old.offset) +
 				    ((end - key.offset) >> inode->i_blkbits));
-			WARN_ON(btrfs_file_extent_num_blocks(&old) <
+			WARN_ON(le64_to_cpu(old.num_blocks) <
 				(extent_end - end) >> inode->i_blkbits);
-			btrfs_set_file_extent_num_blocks(extent,
+			btrfs_set_file_extent_num_blocks(leaf, extent,
 				    (extent_end - end) >> inode->i_blkbits);
 
-			btrfs_set_file_extent_type(extent,
+			btrfs_set_file_extent_type(leaf, extent,
 						   BTRFS_FILE_EXTENT_REG);
-			btrfs_set_file_extent_generation(extent,
-				    btrfs_file_extent_generation(&old));
 			btrfs_mark_buffer_dirty(path->nodes[0]);
-			if (btrfs_file_extent_disk_blocknr(&old) != 0) {
+			if (le64_to_cpu(old.disk_blocknr) != 0) {
 				inode->i_blocks +=
-				      btrfs_file_extent_num_blocks(extent) << 3;
+				      btrfs_file_extent_num_blocks(leaf,
+								   extent) << 3;
 			}
 			ret = 0;
 			goto out;
@@ -529,8 +534,8 @@ static int prepare_pages(struct btrfs_root *root,
 	u64 num_blocks;
 	u64 start_pos;
 
-	start_pos = pos & ~((u64)root->blocksize - 1);
-	num_blocks = (write_bytes + pos - start_pos + root->blocksize - 1) >>
+	start_pos = pos & ~((u64)root->sectorsize - 1);
+	num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >>
 			inode->i_blkbits;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index b7834617753..35d2608f891 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -20,24 +20,18 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int btrfs_insert_inode(struct btrfs_trans_handle *trans, struct btrfs_root
-		       *root, u64 objectid, struct btrfs_inode_item
-		       *inode_item)
+int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid)
 {
-	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
 	key.objectid = objectid;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	ret = btrfs_insert_item(trans, root, &key, inode_item,
-				sizeof(*inode_item));
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(struct btrfs_inode_item));
 	if (ret == 0 && objectid > root->highest_inode)
 		root->highest_inode = objectid;
 	return ret;
@@ -51,15 +45,15 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	int cow = mod != 0;
 	int ret;
 	int slot;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	struct btrfs_key found_key;
 
 	ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
 	if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
 	    location->offset == (u64)-1 && path->slots[0] != 0) {
 		slot = path->slots[0] - 1;
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key, &leaf->items[slot].key);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 		if (found_key.objectid == location->objectid &&
 		    btrfs_key_type(&found_key) == btrfs_key_type(location)) {
 			path->slots[0]--;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 40547086625..ab74977adf5 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -24,8 +24,9 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 {
 	struct btrfs_path *path;
 	int ret;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	struct btrfs_key search_key;
+	struct btrfs_key found_key;
 	int slot;
 
 	path = btrfs_alloc_path();
@@ -39,8 +40,9 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 	BUG_ON(ret == 0);
 	if (path->slots[0] > 0) {
 		slot = path->slots[0] - 1;
-		l = btrfs_buffer_leaf(path->nodes[0]);
-		*objectid = btrfs_disk_key_objectid(&l->items[slot].key);
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+		*objectid = found_key.objectid;
 	} else {
 		*objectid = BTRFS_FIRST_FREE_OBJECTID;
 	}
@@ -64,13 +66,12 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	int slot = 0;
 	u64 last_ino = 0;
 	int start_found;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	search_key.flags = 0;
 	search_start = root->last_inode_alloc;
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
@@ -86,9 +87,9 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		path->slots[0]--;
 
 	while (1) {
-		l = btrfs_buffer_leaf(path->nodes[0]);
+		l = path->nodes[0];
 		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(&l->header)) {
+		if (slot >= btrfs_header_nritems(l)) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret == 0)
 				continue;
@@ -103,7 +104,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 				last_ino : search_start;
 			goto found;
 		}
-		btrfs_disk_key_to_cpu(&key, &l->items[slot].key);
+		btrfs_item_key_to_cpu(l, &key, slot);
 		if (key.objectid >= search_start) {
 			if (start_found) {
 				if (last_ino < search_start)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b03d40a907c..fbe2836364e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -159,10 +159,8 @@ out:
 
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 {
-	char csum[BTRFS_CRC32_SIZE];
 	size_t offset = start - (page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	char *kaddr;
 	u64 private;
@@ -173,11 +171,15 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	if (ret) {
 		goto zeroit;
 	}
+	/*
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	char csum[BTRFS_CRC32_SIZE];
 	ret = btrfs_csum_data(root, kaddr + offset, end - start + 1, csum);
 	BUG_ON(ret);
 	if (memcmp(csum, &private, BTRFS_CRC32_SIZE)) {
 		goto zeroit;
 	}
+	*/
 	kunmap_atomic(kaddr, KM_IRQ0);
 	return 0;
 
@@ -192,7 +194,9 @@ zeroit:
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
+	struct extent_buffer *leaf;
 	struct btrfs_inode_item *inode_item;
+	struct btrfs_inode_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
 	u64 alloc_group_block;
@@ -205,29 +209,37 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
-	if (ret) {
+	if (ret)
 		goto make_bad;
-	}
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
-				  struct btrfs_inode_item);
 
-	inode->i_mode = btrfs_inode_mode(inode_item);
-	inode->i_nlink = btrfs_inode_nlink(inode_item);
-	inode->i_uid = btrfs_inode_uid(inode_item);
-	inode->i_gid = btrfs_inode_gid(inode_item);
-	inode->i_size = btrfs_inode_size(inode_item);
-	inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
-	inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
-	inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
-	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
-	inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
-	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
-	inode->i_blocks = btrfs_inode_nblocks(inode_item);
-	inode->i_generation = btrfs_inode_generation(inode_item);
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_inode_item);
+
+	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
+	inode->i_uid = btrfs_inode_uid(leaf, inode_item);
+	inode->i_gid = btrfs_inode_gid(leaf, inode_item);
+	inode->i_size = btrfs_inode_size(leaf, inode_item);
+
+	tspec = btrfs_inode_atime(inode_item);
+	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+	tspec = btrfs_inode_mtime(inode_item);
+	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+	tspec = btrfs_inode_ctime(inode_item);
+	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
+	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+
+	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
+	inode->i_generation = btrfs_inode_generation(leaf, inode_item);
 	inode->i_rdev = 0;
-	rdev = btrfs_inode_rdev(inode_item);
-	alloc_group_block = btrfs_inode_block_group(inode_item);
+	rdev = btrfs_inode_rdev(leaf, inode_item);
+
+	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
 
@@ -267,24 +279,35 @@ make_bad:
 	make_bad_inode(inode);
 }
 
-static void fill_inode_item(struct btrfs_inode_item *item,
+static void fill_inode_item(struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
-	btrfs_set_inode_uid(item, inode->i_uid);
-	btrfs_set_inode_gid(item, inode->i_gid);
-	btrfs_set_inode_size(item, inode->i_size);
-	btrfs_set_inode_mode(item, inode->i_mode);
-	btrfs_set_inode_nlink(item, inode->i_nlink);
-	btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
-	btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
-	btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
-	btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
-	btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
-	btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
-	btrfs_set_inode_nblocks(item, inode->i_blocks);
-	btrfs_set_inode_generation(item, inode->i_generation);
-	btrfs_set_inode_rdev(item, inode->i_rdev);
-	btrfs_set_inode_block_group(item,
+	btrfs_set_inode_uid(leaf, item, inode->i_uid);
+	btrfs_set_inode_gid(leaf, item, inode->i_gid);
+	btrfs_set_inode_size(leaf, item, inode->i_size);
+	btrfs_set_inode_mode(leaf, item, inode->i_mode);
+	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
+			       inode->i_atime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
+				inode->i_atime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
+			       inode->i_mtime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				inode->i_mtime.tv_nsec);
+
+	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
+			       inode->i_ctime.tv_sec);
+	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				inode->i_ctime.tv_nsec);
+
+	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
+	btrfs_set_inode_generation(leaf, item, inode->i_generation);
+	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_block_group(leaf, item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
@@ -294,6 +317,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_path *path;
+	struct extent_buffer *leaf;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -306,12 +330,12 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
-	inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-				  path->slots[0],
+	leaf = path->nodes[0];
+	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
 
-	fill_inode_item(inode_item, inode);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	fill_inode_item(leaf, inode_item, inode);
+	btrfs_mark_buffer_dirty(leaf);
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
@@ -330,8 +354,9 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	const char *name = dentry->d_name.name;
 	int name_len = dentry->d_name.len;
 	int ret = 0;
-	u64 objectid;
+	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
+	struct btrfs_key key;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -349,14 +374,15 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		ret = -ENOENT;
 		goto err;
 	}
-	objectid = btrfs_disk_key_objectid(&di->location);
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	if (ret)
 		goto err;
 	btrfs_release_path(root, path);
 
 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-					 objectid, name, name_len, -1);
+					 key.objectid, name, name_len, -1);
 	if (IS_ERR(di)) {
 		ret = PTR_ERR(di);
 		goto err;
@@ -391,12 +417,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
+
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+
 	return ret;
 }
 
@@ -411,7 +440,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
 	int found_type;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	char *goodnames = "..";
 	unsigned long nr;
 
@@ -419,10 +448,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, dir);
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
-	key.flags = (u32)-1;
+	key.type = (u8)-1;
 	while(1) {
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0) {
@@ -435,9 +465,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 			goto out;
 		}
 		path->slots[0]--;
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		btrfs_disk_key_to_cpu(&found_key,
-				      &leaf->items[path->slots[0]].key);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
 		if (found_key.objectid != inode->i_ino) {
 			err = -ENOENT;
@@ -513,9 +542,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_disk_key *found_key;
+	struct btrfs_key found_key;
 	u32 found_type;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
 	u64 extent_num_blocks = 0;
@@ -527,10 +556,12 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
+
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	key.objectid = inode->i_ino;
 	key.offset = (u64)-1;
-	key.flags = (u32)-1;
+	key.type = (u8)-1;
+
 	while(1) {
 		btrfs_init_path(path);
 		fi = NULL;
@@ -542,27 +573,28 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			BUG_ON(path->slots[0] == 0);
 			path->slots[0]--;
 		}
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		found_key = &leaf->items[path->slots[0]].key;
-		found_type = btrfs_disk_key_type(found_key);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		found_type = btrfs_key_type(&found_key);
 
-		if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
+		if (found_key.objectid != inode->i_ino)
 			break;
+
 		if (found_type != BTRFS_CSUM_ITEM_KEY &&
 		    found_type != BTRFS_DIR_ITEM_KEY &&
 		    found_type != BTRFS_DIR_INDEX_KEY &&
 		    found_type != BTRFS_EXTENT_DATA_KEY)
 			break;
 
-		item_end = btrfs_disk_key_offset(found_key);
+		item_end = found_key.offset;
 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
-			fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-					    path->slots[0],
+			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(fi) !=
+			if (btrfs_file_extent_type(leaf, fi) !=
 			    BTRFS_FILE_EXTENT_INLINE) {
-				item_end += btrfs_file_extent_num_blocks(fi) <<
-						inode->i_blkbits;
+				item_end +=
+				    btrfs_file_extent_num_blocks(leaf, fi) <<
+				    inode->i_blkbits;
 			}
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
@@ -583,7 +615,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			btrfs_set_key_type(&key, found_type);
 			continue;
 		}
-		if (btrfs_disk_key_offset(found_key) >= inode->i_size)
+		if (found_key.offset >= inode->i_size)
 			del_item = 1;
 		else
 			del_item = 0;
@@ -591,30 +623,31 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 
 		/* FIXME, shrink the extent if the ref count is only 1 */
 		if (found_type == BTRFS_EXTENT_DATA_KEY &&
-			   btrfs_file_extent_type(fi) !=
+			   btrfs_file_extent_type(leaf, fi) !=
 			   BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
-			extent_start = btrfs_file_extent_disk_blocknr(fi);
+			extent_start = btrfs_file_extent_disk_blocknr(leaf, fi);
 			if (!del_item) {
 				u64 orig_num_blocks =
-					btrfs_file_extent_num_blocks(fi);
+					btrfs_file_extent_num_blocks(leaf, fi);
 				extent_num_blocks = inode->i_size -
-					btrfs_disk_key_offset(found_key) +
-					root->blocksize - 1;
+					found_key.offset + root->sectorsize - 1;
 				extent_num_blocks >>= inode->i_blkbits;
-				btrfs_set_file_extent_num_blocks(fi,
+				btrfs_set_file_extent_num_blocks(leaf, fi,
 							 extent_num_blocks);
 				num_dec = (orig_num_blocks -
 					   extent_num_blocks) << 3;
 				if (extent_start != 0) {
 					inode->i_blocks -= num_dec;
 				}
-				btrfs_mark_buffer_dirty(path->nodes[0]);
+				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_blocks =
-					btrfs_file_extent_disk_num_blocks(fi);
+					btrfs_file_extent_disk_num_blocks(leaf,
+									  fi);
 				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_blocks(fi) << 3;
+				num_dec = btrfs_file_extent_num_blocks(leaf,
+								       fi) << 3;
 				if (extent_start != 0) {
 					found_extent = 1;
 					inode->i_blocks -= num_dec;
@@ -725,7 +758,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 
-		u64 mask = root->blocksize - 1;
+		u64 mask = root->sectorsize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
 		u64 block_end = attr->ia_size | mask;
 		u64 hole_size;
@@ -771,9 +804,11 @@ void btrfs_delete_inode(struct inode *inode)
 	if (is_bad_inode(inode)) {
 		goto no_delete;
 	}
+
 	inode->i_size = 0;
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	if (ret)
@@ -782,6 +817,7 @@ void btrfs_delete_inode(struct inode *inode)
 	if (ret)
 		goto no_delete_lock;
 	nr = trans->blocks_used;
+
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
@@ -819,7 +855,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 		ret = 0;
 		goto out;
 	}
-	btrfs_disk_key_to_cpu(location, &di->location);
+	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
@@ -856,7 +892,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 
 	ri = &(*sub_root)->root_item;
 	location->objectid = btrfs_root_dirid(ri);
-	location->flags = 0;
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 	location->offset = 0;
 
@@ -908,11 +943,14 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_inode_by_name(dir, dentry, &location);
 	mutex_unlock(&root->fs_info->fs_mutex);
+
 	if (ret < 0)
 		return ERR_PTR(ret);
+
 	inode = NULL;
 	if (location.objectid) {
 		ret = fixup_tree_root_location(root, &location, &sub_root,
@@ -952,10 +990,11 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	int ret;
 	u32 nritems;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	int slot;
 	int advance;
 	unsigned char d_type;
@@ -964,15 +1003,19 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	u32 di_total;
 	u32 di_len;
 	int key_type = BTRFS_DIR_INDEX_KEY;
+	char tmp_name[32];
+	char *name_ptr;
+	int name_len;
 
 	/* FIXME, use a real flag for deciding about the key type */
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
-	key.flags = 0;
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
+
 	path = btrfs_alloc_path();
 	path->reada = 2;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -980,16 +1023,16 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		goto err;
 	advance = 0;
 	while(1) {
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		nritems = btrfs_header_nritems(&leaf->header);
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
 		if (advance || slot >= nritems) {
 			if (slot >= nritems -1) {
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
-				leaf = btrfs_buffer_leaf(path->nodes[0]);
-				nritems = btrfs_header_nritems(&leaf->header);
+				leaf = path->nodes[0];
+				nritems = btrfs_header_nritems(leaf);
 				slot = path->slots[0];
 			} else {
 				slot++;
@@ -997,28 +1040,48 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			}
 		}
 		advance = 1;
-		item = leaf->items + slot;
-		if (btrfs_disk_key_objectid(&item->key) != key.objectid)
+		item = btrfs_item_nr(leaf, slot);
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid != key.objectid)
 			break;
-		if (btrfs_disk_key_type(&item->key) != key_type)
+		if (btrfs_key_type(&found_key) != key_type)
 			break;
-		if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
+		if (found_key.offset < filp->f_pos)
 			continue;
-		filp->f_pos = btrfs_disk_key_offset(&item->key);
+
+		filp->f_pos = found_key.offset;
 		advance = 1;
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		di_cur = 0;
-		di_total = btrfs_item_size(leaf->items + slot);
+		di_total = btrfs_item_size(leaf, item);
 		while(di_cur < di_total) {
-			d_type = btrfs_filetype_table[btrfs_dir_type(di)];
-			over = filldir(dirent, (const char *)(di + 1),
-				       btrfs_dir_name_len(di),
-				       btrfs_disk_key_offset(&item->key),
-				       btrfs_disk_key_objectid(&di->location),
+			struct btrfs_key location;
+
+			name_len = btrfs_dir_name_len(leaf, di);
+			if (name_len < 32) {
+				name_ptr = tmp_name;
+			} else {
+				name_ptr = kmalloc(name_len, GFP_NOFS);
+				BUG_ON(!name_ptr);
+			}
+			read_extent_buffer(leaf, name_ptr,
+					   (unsigned long)(di + 1), name_len);
+
+			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+			btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+			over = filldir(dirent, name_ptr, name_len,
+				       found_key.offset,
+				       location.objectid,
 				       d_type);
+
+			if (name_ptr != tmp_name)
+				kfree(name_ptr);
+
 			if (over)
 				goto nopos;
-			di_len = btrfs_dir_name_len(di) + sizeof(*di);
+			di_len = btrfs_dir_name_len(leaf, di) + sizeof(*di);
 			di_cur += di_len;
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
@@ -1075,11 +1138,15 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     int mode)
 {
 	struct inode *inode;
-	struct btrfs_inode_item inode_item;
+	struct btrfs_inode_item *inode_item;
 	struct btrfs_key *location;
+	struct btrfs_path *path;
 	int ret;
 	int owner;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
 	inode = new_inode(root->fs_info->sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -1095,24 +1162,32 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
 
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		goto fail;
+
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	fill_inode_item(&inode_item, inode);
+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				  struct btrfs_inode_item);
+	fill_inode_item(path->nodes[0], inode_item, inode);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_free_path(path);
+
 	location = &BTRFS_I(inode)->location;
 	location->objectid = objectid;
-	location->flags = 0;
 	location->offset = 0;
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
-	ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
-	if (ret)
-		return ERR_PTR(ret);
 	insert_inode_hash(inode);
 	return inode;
+fail:
+	btrfs_free_path(path);
+	return ERR_PTR(ret);
 }
 
 static inline u8 btrfs_inode_type(struct inode *inode)
@@ -1127,8 +1202,8 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
 	struct inode *parent_inode;
+
 	key.objectid = inode->i_ino;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
@@ -1285,14 +1360,18 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	inc_nlink(inode);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
 	err = btrfs_add_nondir(trans, dentry, inode);
+
 	if (err)
 		drop_inode = 1;
+
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, dir);
 	err = btrfs_update_inode(trans, root, inode);
+
 	if (err)
 		drop_inode = 1;
 
@@ -1321,13 +1400,13 @@ static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
 
 	key.objectid = objectid;
 	key.offset = 0;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 
 	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
 				    &key, BTRFS_FT_DIR);
 	if (ret)
 		goto error;
+
 	key.objectid = dirid;
 	ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
 				    &key, BTRFS_FT_DIR);
@@ -1350,6 +1429,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
+
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out_unlock;
@@ -1367,6 +1447,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = PTR_ERR(inode);
 		goto out_fail;
 	}
+
 	drop_on_err = 1;
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
@@ -1380,9 +1461,11 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		goto out_fail;
+
 	err = btrfs_add_link(trans, dentry, inode);
 	if (err)
 		goto out_fail;
+
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
 	dir->i_sb->s_dirt = 1;
@@ -1392,6 +1475,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
+
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_on_err)
@@ -1415,8 +1499,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *item;
-	struct btrfs_leaf *leaf;
-	struct btrfs_disk_key *found_key;
+	struct extent_buffer *leaf;
+	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct btrfs_trans_handle *trans = NULL;
@@ -1436,8 +1520,8 @@ again:
 			err = -ENOMEM;
 			goto out;
 		}
-		em->start = 0;
-		em->end = 0;
+		em->start = EXTENT_MAP_HOLE;
+		em->end = EXTENT_MAP_HOLE;
 	}
 	em->bdev = inode->i_sb->s_bdev;
 	ret = btrfs_lookup_file_extent(NULL, root, path,
@@ -1453,25 +1537,27 @@ again:
 		path->slots[0]--;
 	}
 
-	item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	leaf = btrfs_buffer_leaf(path->nodes[0]);
-	blocknr = btrfs_file_extent_disk_blocknr(item);
-	blocknr += btrfs_file_extent_offset(item);
+
+	blocknr = btrfs_file_extent_disk_blocknr(leaf, item);
+	blocknr += btrfs_file_extent_offset(leaf, item);
 
 	/* are we inside the extent that was found? */
-	found_key = &leaf->items[path->slots[0]].key;
-	found_type = btrfs_disk_key_type(found_key);
-	if (btrfs_disk_key_objectid(found_key) != objectid ||
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	found_type = btrfs_key_type(&found_key);
+	if (found_key.objectid != objectid ||
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
 		goto not_found;
 	}
 
-	found_type = btrfs_file_extent_type(item);
-	extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
+	found_type = btrfs_file_extent_type(leaf, item);
+	extent_start = found_key.offset;
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
-		       (btrfs_file_extent_num_blocks(item) << inode->i_blkbits);
+		       (btrfs_file_extent_num_blocks(leaf, item) <<
+			inode->i_blkbits);
 		err = 0;
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
@@ -1484,28 +1570,29 @@ again:
 			}
 			goto not_found_em;
 		}
-		if (btrfs_file_extent_disk_blocknr(item) == 0) {
+		if (btrfs_file_extent_disk_blocknr(leaf, item) == 0) {
 			em->start = extent_start;
 			em->end = extent_end - 1;
-			em->block_start = 0;
-			em->block_end = 0;
+			em->block_start = EXTENT_MAP_HOLE;
+			em->block_end = EXTENT_MAP_HOLE;
 			goto insert;
 		}
 		em->block_start = blocknr << inode->i_blkbits;
 		em->block_end = em->block_start +
-			(btrfs_file_extent_num_blocks(item) <<
+			(btrfs_file_extent_num_blocks(leaf, item) <<
 			 inode->i_blkbits) - 1;
 		em->start = extent_start;
 		em->end = extent_end - 1;
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		char *ptr;
+		unsigned long ptr;
 		char *map;
 		u32 size;
 
-		size = btrfs_file_extent_inline_len(leaf->items +
-						    path->slots[0]);
-		extent_end = extent_start | ((u64)root->blocksize - 1);
+		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
+						    path->slots[0]));
+
+		extent_end = extent_start | ((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
@@ -1517,18 +1604,21 @@ again:
 			}
 			goto not_found_em;
 		}
+
 		em->block_start = EXTENT_MAP_INLINE;
 		em->block_end = EXTENT_MAP_INLINE;
 		em->start = extent_start;
 		em->end = extent_end;
+
 		if (!page) {
 			goto insert;
 		}
+
 		ptr = btrfs_file_extent_inline_start(item);
 		map = kmap(page);
-		memcpy(map + page_offset, ptr, size);
+		read_extent_buffer(leaf, map + page_offset, ptr, size);
 		memset(map + page_offset + size, 0,
-		       root->blocksize - (page_offset + size));
+		       root->sectorsize - (page_offset + size));
 		flush_dcache_page(page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, extent_start,
@@ -1542,8 +1632,8 @@ not_found:
 	em->start = start;
 	em->end = end;
 not_found_em:
-	em->block_start = 0;
-	em->block_end = 0;
+	em->block_start = EXTENT_MAP_HOLE;
+	em->block_end = EXTENT_MAP_HOLE;
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || em->end < start) {
@@ -1712,6 +1802,7 @@ static void btrfs_truncate(struct inode *inode)
 	ret = btrfs_truncate_in_trans(trans, root, inode);
 	btrfs_update_inode(trans, root, inode);
 	nr = trans->blocks_used;
+
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -1731,8 +1822,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_key key;
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
-	struct buffer_head *subvol;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	struct btrfs_root *new_root;
 	struct inode *inode;
 	struct inode *dir;
@@ -1746,34 +1836,37 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	subvol = btrfs_alloc_free_block(trans, root, 0, 0);
-	if (IS_ERR(subvol))
-		return PTR_ERR(subvol);
-	leaf = btrfs_buffer_leaf(subvol);
-	btrfs_set_header_nritems(&leaf->header, 0);
-	btrfs_set_header_level(&leaf->header, 0);
-	btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
-	btrfs_set_header_generation(&leaf->header, trans->transid);
-	btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
-	memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
-	       sizeof(leaf->header.fsid));
-	btrfs_mark_buffer_dirty(subvol);
+	leaf = btrfs_alloc_free_block(trans, root, 0, 0);
+	if (IS_ERR(leaf))
+		return PTR_ERR(leaf);
+
+	btrfs_set_header_nritems(leaf, 0);
+	btrfs_set_header_level(leaf, 0);
+	btrfs_set_header_blocknr(leaf, extent_buffer_blocknr(leaf));
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_owner(leaf, root->root_key.objectid);
+	write_extent_buffer(leaf, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
 
 	inode_item = &root_item.inode;
 	memset(inode_item, 0, sizeof(*inode_item));
-	btrfs_set_inode_generation(inode_item, 1);
-	btrfs_set_inode_size(inode_item, 3);
-	btrfs_set_inode_nlink(inode_item, 1);
-	btrfs_set_inode_nblocks(inode_item, 1);
-	btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-	btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
+	btrfs_set_root_blocknr(&root_item, extent_buffer_blocknr(leaf));
 	btrfs_set_root_refs(&root_item, 1);
-	btrfs_set_root_blocks_used(&root_item, 0);
+	btrfs_set_root_used(&root_item, 0);
+
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
 	root_item.drop_level = 0;
-	brelse(subvol);
-	subvol = NULL;
+
+	free_extent_buffer(leaf);
+	leaf = NULL;
 
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
@@ -1784,7 +1877,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 
 	key.objectid = objectid;
 	key.offset = 1;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
@@ -1845,7 +1937,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
 	struct btrfs_root_item new_root_item;
-	struct buffer_head *tmp;
+	struct extent_buffer *tmp;
 	int ret;
 	int err;
 	u64 objectid;
@@ -1876,10 +1968,11 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 	key.objectid = objectid;
 	key.offset = 1;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-	btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
+	btrfs_set_root_blocknr(&new_root_item,
+			       extent_buffer_blocknr(root->node));
 
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
@@ -1904,8 +1997,10 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 fail:
 	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
+
 	if (err && !ret)
 		ret = err;
+
 	mutex_unlock(&root->fs_info->fs_mutex);
 	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root, nr);
@@ -1986,7 +2081,7 @@ static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
 
 	if (copy_from_user(&vol_args, arg, sizeof(vol_args)))
 		return -EFAULT;
-	
+
 	namelen = strlen(vol_args.name);
 	if (namelen > BTRFS_VOL_NAME_MAX)
 		return -EINVAL;
@@ -2164,8 +2259,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
 		return -ENOTEMPTY;
 	}
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, new_dir);
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -2177,9 +2274,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
+
 	if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
 		struct btrfs_key *location = &BTRFS_I(new_dir)->location;
-		u64 old_parent_oid;
+		struct btrfs_key old_parent_key;
 		di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino,
 					   "..", 2, -1);
 		if (IS_ERR(di)) {
@@ -2190,7 +2288,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 			ret = -ENOENT;
 			goto out_fail;
 		}
-		old_parent_oid = btrfs_disk_key_objectid(&di->location);
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &old_parent_key);
 		ret = btrfs_del_item(trans, root, path);
 		if (ret) {
 			goto out_fail;
@@ -2199,7 +2297,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 
 		di = btrfs_lookup_dir_index_item(trans, root, path,
 						 old_inode->i_ino,
-						 old_parent_oid,
+						 old_parent_key.objectid,
 						 "..", 2, -1);
 		if (IS_ERR(di)) {
 			ret = PTR_ERR(di);
@@ -2257,8 +2355,9 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	int name_len;
 	int datasize;
-	char *ptr;
+	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
+	struct extent_buffer *leaf;
 	unsigned long nr;
 
 	name_len = strlen(symname) + 1;
@@ -2302,7 +2401,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	BUG_ON(!path);
 	key.objectid = inode->i_ino;
 	key.offset = 0;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
 	datasize = btrfs_file_extent_calc_inline_size(name_len);
 	err = btrfs_insert_empty_item(trans, root, path, &key,
@@ -2311,16 +2409,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		drop_inode = 1;
 		goto out_unlock;
 	}
-	ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-	       path->slots[0], struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(ei, trans->transid);
-	btrfs_set_file_extent_type(ei,
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei,
 				   BTRFS_FILE_EXTENT_INLINE);
 	ptr = btrfs_file_extent_inline_start(ei);
-	btrfs_memcpy(root, path->nodes[0]->b_data,
-		     ptr, symname, name_len);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	write_extent_buffer(leaf, symname, ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
+
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_size = name_len - 1;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bafa78603bd..a825ce078a5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -20,10 +20,10 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
-void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
-	u32 nr = btrfs_header_nritems(&l->header);
+	u32 nr = btrfs_header_nritems(l);
 	struct btrfs_item *item;
 	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
@@ -31,119 +31,113 @@ void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l)
 	struct btrfs_inode_item *ii;
 	struct btrfs_block_group_item *bi;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
-		(unsigned long long)btrfs_header_blocknr(&l->header), nr,
+		(unsigned long long)btrfs_header_blocknr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
-		item = l->items + i;
-		type = btrfs_disk_key_type(&item->key);
+		item = btrfs_item_nr(l, i);
+		btrfs_item_key_to_cpu(l, &key, i);
+		type = btrfs_key_type(&key);
 		printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
 			i,
-			(unsigned long long)btrfs_disk_key_objectid(&item->key),
-			btrfs_disk_key_flags(&item->key),
-			(unsigned long long)btrfs_disk_key_offset(&item->key),
-			btrfs_item_offset(item),
-			btrfs_item_size(item));
+			(unsigned long long)key.objectid, type,
+			(unsigned long long)key.offset,
+			btrfs_item_offset(l, item), btrfs_item_size(l, item));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
 			printk("\t\tinode generation %llu size %llu mode %o\n",
-			       (unsigned long long)btrfs_inode_generation(ii),
-			       (unsigned long long)btrfs_inode_size(ii),
-			       btrfs_inode_mode(ii));
+		              (unsigned long long)btrfs_inode_generation(l, ii),
+			      (unsigned long long)btrfs_inode_size(l, ii),
+			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
+			btrfs_dir_item_key_to_cpu(l, di, &found_key);
 			printk("\t\tdir oid %llu flags %u type %u\n",
-				(unsigned long long)btrfs_disk_key_objectid(
-							    &di->location),
-				btrfs_dir_flags(di),
-				btrfs_dir_type(di));
-			printk("\t\tname %.*s\n",
-			       btrfs_dir_name_len(di),(char *)(di + 1));
+				(unsigned long long)found_key.objectid,
+				btrfs_dir_flags(l, di),
+				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
 			printk("\t\troot data blocknr %llu refs %u\n",
-				(unsigned long long)btrfs_root_blocknr(ri),
-				btrfs_root_refs(ri));
+				(unsigned long long)btrfs_disk_root_blocknr(l, ri),
+				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
 			printk("\t\textent data refs %u\n",
-				btrfs_extent_refs(ei));
+				btrfs_extent_refs(l, ei));
 			break;
 
 		case BTRFS_EXTENT_DATA_KEY:
 			fi = btrfs_item_ptr(l, i,
 					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(fi) ==
+			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
 				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l->items + i));
+			           btrfs_file_extent_inline_len(l, item));
 				break;
 			}
 			printk("\t\textent data disk block %llu nr %llu\n",
-			       (unsigned long long)btrfs_file_extent_disk_blocknr(fi),
-			       (unsigned long long)btrfs_file_extent_disk_num_blocks(fi));
+			       (unsigned long long)btrfs_file_extent_disk_blocknr(l, fi),
+			       (unsigned long long)btrfs_file_extent_disk_num_blocks(l, fi));
 			printk("\t\textent data offset %llu nr %llu\n",
-			  (unsigned long long)btrfs_file_extent_offset(fi),
-			  (unsigned long long)btrfs_file_extent_num_blocks(fi));
+			  (unsigned long long)btrfs_file_extent_offset(l, fi),
+			  (unsigned long long)btrfs_file_extent_num_blocks(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
 			printk("\t\tblock group used %llu\n",
-			       (unsigned long long)btrfs_block_group_used(bi));
-			break;
-		case BTRFS_STRING_ITEM_KEY:
-			printk("\t\titem data %.*s\n", btrfs_item_size(item),
-				btrfs_leaf_data(l) + btrfs_item_offset(item));
+			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
 			break;
 		};
 	}
 }
 
-void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t)
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 {
 	int i;
 	u32 nr;
-	struct btrfs_node *c;
+	struct btrfs_key key;
 
-	if (!t)
+	if (!c)
 		return;
-	c = btrfs_buffer_node(t);
-	nr = btrfs_header_nritems(&c->header);
+	nr = btrfs_header_nritems(c);
 	if (btrfs_is_leaf(c)) {
-		btrfs_print_leaf(root, (struct btrfs_leaf *)c);
+		btrfs_print_leaf(root, c);
 		return;
 	}
 	printk("node %llu level %d total ptrs %d free spc %u\n",
-	       (unsigned long long)btrfs_header_blocknr(&c->header),
-	       btrfs_header_level(&c->header), nr,
+	       (unsigned long long)btrfs_header_blocknr(c),
+	       btrfs_header_level(c), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
+		btrfs_node_key_to_cpu(c, &key, i);
 		printk("\tkey %d (%llu %u %llu) block %llu\n",
 		       i,
-		       (unsigned long long)c->ptrs[i].key.objectid,
-		       c->ptrs[i].key.flags,
-		       (unsigned long long)c->ptrs[i].key.offset,
+		       (unsigned long long)key.objectid,
+		       key.type,
+		       (unsigned long long)key.offset,
 		       (unsigned long long)btrfs_node_blockptr(c, i));
 	}
 	for (i = 0; i < nr; i++) {
-		struct buffer_head *next_buf = read_tree_block(root,
+		struct extent_buffer *next = read_tree_block(root,
 						btrfs_node_blockptr(c, i));
-		struct btrfs_node *next = btrfs_buffer_node(next_buf);
 		if (btrfs_is_leaf(next) &&
-		    btrfs_header_level(&c->header) != 1)
+		    btrfs_header_level(c) != 1)
 			BUG();
-		if (btrfs_header_level(&next->header) !=
-			btrfs_header_level(&c->header) - 1)
+		if (btrfs_header_level(next) !=
+			btrfs_header_level(c) - 1)
 			BUG();
-		btrfs_print_tree(root, next_buf);
-		btrfs_block_release(root, next_buf);
+		btrfs_print_tree(root, next);
+		free_extent_buffer(next);
 	}
 }
 
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 205cd03601a..da75efe534d 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -18,6 +18,6 @@
 
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
-void btrfs_print_leaf(struct btrfs_root *root, struct btrfs_leaf *l);
-void btrfs_print_tree(struct btrfs_root *root, struct buffer_head *t);
+void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
 #endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3b5926dfbeb..88bcdd33f56 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -26,12 +26,13 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 {
 	struct btrfs_path *path;
 	struct btrfs_key search_key;
-	struct btrfs_leaf *l;
+	struct btrfs_key found_key;
+	struct extent_buffer *l;
 	int ret;
 	int slot;
 
 	search_key.objectid = objectid;
-	search_key.flags = (u32)-1;
+	search_key.type = (u8)-1;
 	search_key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
@@ -39,17 +40,19 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto out;
+
 	BUG_ON(ret == 0);
-	l = btrfs_buffer_leaf(path->nodes[0]);
+	l = path->nodes[0];
 	BUG_ON(path->slots[0] == 0);
 	slot = path->slots[0] - 1;
-	if (btrfs_disk_key_objectid(&l->items[slot].key) != objectid) {
+	btrfs_item_key_to_cpu(l, &found_key, slot);
+	if (found_key.objectid != objectid) {
 		ret = 1;
 		goto out;
 	}
-	memcpy(item, btrfs_item_ptr(l, slot, struct btrfs_root_item),
-		sizeof(*item));
-	btrfs_disk_key_to_cpu(key, &l->items[slot].key);
+	read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+			   sizeof(*item));
+	memcpy(key, &found_key, sizeof(found_key));
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
@@ -62,10 +65,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *item)
 {
 	struct btrfs_path *path;
-	struct btrfs_leaf *l;
+	struct extent_buffer *l;
 	int ret;
 	int slot;
-	struct btrfs_root_item *update_item;
+	unsigned long ptr;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -73,10 +76,10 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	l = btrfs_buffer_leaf(path->nodes[0]);
+	l = path->nodes[0];
 	slot = path->slots[0];
-	update_item = btrfs_item_ptr(l, slot, struct btrfs_root_item);
-	btrfs_memcpy(root, l, update_item, item, sizeof(*item));
+	ptr = btrfs_item_ptr_offset(l, slot);
+	write_extent_buffer(l, item, ptr, sizeof(*item));
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
@@ -103,11 +106,10 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 	struct btrfs_path *path;
 	int ret;
 	u32 nritems;
-	struct btrfs_leaf *leaf;
+	struct extent_buffer *leaf;
 	int slot;
 
 	key.objectid = objectid;
-	key.flags = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	key.offset = 0;
 	path = btrfs_alloc_path();
@@ -117,19 +119,19 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 	if (ret < 0)
 		goto err;
 	while(1) {
-		leaf = btrfs_buffer_leaf(path->nodes[0]);
-		nritems = btrfs_header_nritems(&leaf->header);
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
 		if (slot >= nritems) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret)
 				break;
-			leaf = btrfs_buffer_leaf(path->nodes[0]);
-			nritems = btrfs_header_nritems(&leaf->header);
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
 			slot = path->slots[0];
 		}
-		item = leaf->items + slot;
-		btrfs_disk_key_to_cpu(&key, &item->key);
+		item = btrfs_item_nr(leaf, slot);
+		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
 			goto next;
 
@@ -140,7 +142,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			break;
 
 		ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
-		if (btrfs_root_refs(ri) != 0)
+		if (btrfs_disk_root_refs(leaf, ri) != 0)
 			goto next;
 
 		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key);
@@ -170,6 +172,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret;
 	u32 refs;
 	struct btrfs_root_item *ri;
+	struct extent_buffer *leaf;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -177,10 +180,10 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret != 0);
-	ri = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
-			    path->slots[0], struct btrfs_root_item);
+	leaf = path->nodes[0];
+	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
-	refs = btrfs_root_refs(ri);
+	refs = btrfs_disk_root_refs(leaf, ri);
 	BUG_ON(refs != 0);
 	ret = btrfs_del_item(trans, root, path);
 out:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 66a01cbbbea..39a1435c68f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -41,7 +41,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 
-#define BTRFS_SUPER_MAGIC 0x9123682E
+#define BTRFS_SUPER_MAGIC 0x9123683E
 
 static struct super_operations btrfs_super_ops;
 
@@ -115,13 +115,12 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 		return -EIO;
 	}
 	sb->s_fs_info = tree_root;
-	disk_super = tree_root->fs_info->disk_super;
+	disk_super = &tree_root->fs_info->super_copy;
 	inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
 				  tree_root);
 	bi = BTRFS_I(inode);
 	bi->location.objectid = inode->i_ino;
 	bi->location.offset = 0;
-	bi->location.flags = 0;
 	bi->root = tree_root;
 
 	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
@@ -281,6 +280,7 @@ error_s:
 error_bdev:
 	close_bdev_excl(bdev);
 error:
+printk("get_sb failed\n");
 	return error;
 }
 /* end copy & paste */
@@ -295,6 +295,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
 			btrfs_fill_super, mnt,
 			subvol_name ? subvol_name : "default");
+printk("btrfs_get_sb returns %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 2058783373e..9654e90eec8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -31,31 +31,31 @@
 static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_root_blocks_used(&root->root_item));
+		(unsigned long long)btrfs_root_used(&root->root_item));
 }
 
 static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_root_block_limit(&root->root_item));
+		(unsigned long long)btrfs_root_limit(&root->root_item));
 }
 
 static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_super_blocks_used(fs->disk_super));
+		(unsigned long long)btrfs_super_blocks_used(&fs->super_copy));
 }
 
 static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_super_total_blocks(fs->disk_super));
+		(unsigned long long)btrfs_super_total_blocks(&fs->super_copy));
 }
 
 static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_super_blocksize(fs->disk_super));
+		(unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
 }
 
 /* this is for root attrs (subvols/snapshots) */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 087074db0bd..750f35a37aa 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -19,6 +19,7 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
+#include <linux/pagemap.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -66,7 +67,9 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-		init_bit_radix(&cur_trans->dirty_pages);
+		extent_map_tree_init(&cur_trans->dirty_pages,
+				     root->fs_info->btree_inode->i_mapping,
+				     GFP_NOFS);
 	} else {
 		cur_trans->num_writers++;
 		cur_trans->num_joined++;
@@ -88,7 +91,7 @@ static int record_root_in_trans(struct btrfs_root *root)
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_DEFRAG_TAG);
 			root->commit_root = root->node;
-			get_bh(root->node);
+			extent_buffer_get(root->node);
 		} else {
 			WARN_ON(1);
 		}
@@ -144,29 +147,30 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
-	unsigned long gang[16];
 	int ret;
-	int i;
 	int err;
 	int werr = 0;
+	struct extent_map_tree *dirty_pages;
 	struct page *page;
-	struct radix_tree_root *dirty_pages;
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	u64 start;
+	u64 end;
+	unsigned long index;
 
 	if (!trans || !trans->transaction) {
 		return filemap_write_and_wait(btree_inode->i_mapping);
 	}
 	dirty_pages = &trans->transaction->dirty_pages;
 	while(1) {
-		ret = find_first_radix_bit(dirty_pages, gang,
-					   0, ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			/* FIXME EIO */
-			clear_radix_bit(dirty_pages, gang[i]);
-			page = find_lock_page(btree_inode->i_mapping,
-					      gang[i]);
+		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+		while(start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (index + 1) << PAGE_CACHE_SHIFT;
+			page = find_lock_page(btree_inode->i_mapping, index);
 			if (!page)
 				continue;
 			if (PageWriteback(page)) {
@@ -202,10 +206,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	btrfs_write_dirty_block_groups(trans, extent_root);
 	while(1) {
 		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-		if (old_extent_block == bh_blocknr(extent_root->node))
+		if (old_extent_block ==
+		    extent_buffer_blocknr(extent_root->node))
 			break;
 		btrfs_set_root_blocknr(&extent_root->root_item,
-				       bh_blocknr(extent_root->node));
+			       extent_buffer_blocknr(extent_root->node));
 		ret = btrfs_update_root(trans, tree_root,
 					&extent_root->root_key,
 					&extent_root->root_item);
@@ -279,9 +284,9 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
 			if (root->commit_root == root->node) {
-				WARN_ON(bh_blocknr(root->node) !=
+				WARN_ON(extent_buffer_blocknr(root->node) !=
 					btrfs_root_blocknr(&root->root_item));
-				brelse(root->commit_root);
+				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
 
 				/* make sure to update the root on disk
@@ -310,7 +315,7 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
-					       bh_blocknr(root->node));
+				       extent_buffer_blocknr(root->node));
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
@@ -389,10 +394,10 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
 			last = root->root_key.objectid + 1;
-			btrfs_defrag_root(root, 1);
+			// btrfs_defrag_root(root, 1);
 		}
 	}
-	btrfs_defrag_root(info->extent_root, 1);
+	// btrfs_defrag_root(info->extent_root, 1);
 	return err;
 }
 
@@ -414,7 +419,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
-		num_blocks = btrfs_root_blocks_used(&dirty->root->root_item);
+		num_blocks = btrfs_root_used(&dirty->root->root_item);
 		root = dirty->latest_root;
 
 		while(1) {
@@ -441,11 +446,11 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		}
 		BUG_ON(ret);
 
-		num_blocks -= btrfs_root_blocks_used(&dirty->root->root_item);
-		blocks_used = btrfs_root_blocks_used(&root->root_item);
+		num_blocks -= btrfs_root_used(&dirty->root->root_item);
+		blocks_used = btrfs_root_used(&root->root_item);
 		if (num_blocks) {
 			record_root_in_trans(root);
-			btrfs_set_root_blocks_used(&root->root_item,
+			btrfs_set_root_used(&root->root_item,
 						   blocks_used - num_blocks);
 		}
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
@@ -553,9 +558,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_set_super_generation(&root->fs_info->super_copy,
 				   cur_trans->transid);
 	btrfs_set_super_root(&root->fs_info->super_copy,
-			     bh_blocknr(root->fs_info->tree_root->node));
-	memcpy(root->fs_info->disk_super, &root->fs_info->super_copy,
-	       sizeof(root->fs_info->super_copy));
+		     extent_buffer_blocknr(root->fs_info->tree_root->node));
+
+	write_extent_buffer(root->fs_info->sb_buffer,
+			    &root->fs_info->super_copy, 0,
+			    sizeof(root->fs_info->super_copy));
 
 	btrfs_copy_pinned(root, &pinned_copy);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 4bc328cbb24..ae39fcfc169 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,7 +28,7 @@ struct btrfs_transaction {
 	int use_count;
 	int commit_done;
 	struct list_head list;
-	struct radix_tree_root dirty_pages;
+	struct extent_map_tree dirty_pages;
 	unsigned long start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
@@ -83,5 +83,6 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
-
+int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 420597127ed..daf019afa0a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,14 +23,14 @@
 #include "transaction.h"
 
 static void reada_defrag(struct btrfs_root *root,
-			 struct btrfs_node *node)
+			 struct extent_buffer *node)
 {
 	int i;
 	u32 nritems;
 	u64 blocknr;
 	int ret;
 
-	nritems = btrfs_header_nritems(&node->header);
+	nritems = btrfs_header_nritems(node);
 	for (i = 0; i < nritems; i++) {
 		blocknr = btrfs_node_blockptr(node, i);
 		ret = readahead_tree_block(root, blocknr);
@@ -44,8 +44,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 			    struct btrfs_path *path, int *level,
 			    int cache_only, u64 *last_ret)
 {
-	struct buffer_head *next;
-	struct buffer_head *cur;
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
 	u64 blocknr;
 	int ret = 0;
 	int is_extent = 0;
@@ -62,13 +62,13 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		cur = path->nodes[*level];
 
 		if (!cache_only && *level > 1 && path->slots[*level] == 0)
-			reada_defrag(root, btrfs_buffer_node(cur));
+			reada_defrag(root, cur);
 
-		if (btrfs_header_level(btrfs_buffer_header(cur)) != *level)
+		if (btrfs_header_level(cur) != *level)
 			WARN_ON(1);
 
 		if (path->slots[*level] >=
-		    btrfs_header_nritems(btrfs_buffer_header(cur)))
+		    btrfs_header_nritems(cur))
 			break;
 
 		if (*level == 1) {
@@ -80,14 +80,13 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 
 			break;
 		}
-		blocknr = btrfs_node_blockptr(btrfs_buffer_node(cur),
-					      path->slots[*level]);
+		blocknr = btrfs_node_blockptr(cur, path->slots[*level]);
 
 		if (cache_only) {
 			next = btrfs_find_tree_block(root, blocknr);
-			if (!next || !buffer_uptodate(next) ||
-			   buffer_locked(next) || !buffer_defrag(next)) {
-				brelse(next);
+			/* FIXME, test for defrag */
+			if (!next || !btrfs_buffer_uptodate(next)) {
+				free_extent_buffer(next);
 				path->slots[*level]++;
 				continue;
 			}
@@ -106,16 +105,18 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
-			btrfs_block_release(root, path->nodes[*level-1]);
+			free_extent_buffer(path->nodes[*level-1]);
 		path->nodes[*level-1] = next;
-		*level = btrfs_header_level(btrfs_buffer_header(next));
+		*level = btrfs_header_level(next);
 		path->slots[*level] = 0;
 	}
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+#if 0
 	clear_buffer_defrag(path->nodes[*level]);
 	clear_buffer_defrag_done(path->nodes[*level]);
-	btrfs_block_release(root, path->nodes[*level]);
+#endif
+	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	WARN_ON(ret);
@@ -129,24 +130,25 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int slot;
-	struct btrfs_node *node;
+	struct extent_buffer *node;
 
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(
-		    btrfs_buffer_header(path->nodes[i])) - 1) {
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			path->slots[i]++;
 			*level = i;
-			node = btrfs_buffer_node(path->nodes[i]);
+			node = path->nodes[i];
 			WARN_ON(i == 0);
-			btrfs_disk_key_to_cpu(&root->defrag_progress,
-					      &node->ptrs[path->slots[i]].key);
+			btrfs_node_key_to_cpu(node, &root->defrag_progress,
+					      path->slots[i]);
 			root->defrag_level = i;
 			return 0;
 		} else {
+			/*
 			clear_buffer_defrag(path->nodes[*level]);
 			clear_buffer_defrag_done(path->nodes[*level]);
-			btrfs_block_release(root, path->nodes[*level]);
+			*/
+			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
 		}
@@ -158,7 +160,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
 	struct btrfs_path *path = NULL;
-	struct buffer_head *tmp;
+	struct extent_buffer *tmp;
 	int ret = 0;
 	int wret;
 	int level;
@@ -172,17 +174,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	level = btrfs_header_level(btrfs_buffer_header(root->node));
+	level = btrfs_header_level(root->node);
 	orig_level = level;
 	if (level == 0) {
 		goto out;
 	}
 	if (root->defrag_progress.objectid == 0) {
-		get_bh(root->node);
+		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 		BUG_ON(ret);
 		ret = btrfs_realloc_node(trans, root, root->node, cache_only,
@@ -200,12 +203,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 		if (is_extent)
 			btrfs_extent_post_op(trans, root);
+
 		if (wret < 0) {
 			ret = wret;
 			goto out;
 		}
+
 		while(level > 0 && !path->nodes[level])
 			level--;
+
 		if (!path->nodes[level]) {
 			ret = 0;
 			goto out;
@@ -230,7 +236,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
-			btrfs_block_release(root, path->nodes[i]);
+			free_extent_buffer(path->nodes[i]);
 			path->nodes[i] = 0;
 		}
 	}
-- 
cgit v1.2.3


From 479965d66e320f1a095bb76027171daa675a9c72 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:14:27 -0400
Subject: Btrfs: Optimizations for the extent_buffer code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      | 51 ++++++++++++++++++++++++---------------------------
 fs/btrfs/ctree.h      | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/extent_map.c | 11 ++++++++---
 fs/btrfs/extent_map.h |  2 +-
 4 files changed, 76 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f60920e8a0e..9427b79c5d7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -464,22 +464,12 @@ static int check_block(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
 	struct extent_buffer *buf = path->nodes[level];
-	char fsid[BTRFS_FSID_SIZE];
 
-	read_extent_buffer(buf, fsid, (unsigned long)btrfs_header_fsid(buf),
-			   BTRFS_FSID_SIZE);
-
-	if (memcmp(fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
-		int i = 0;
+	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
+				 (unsigned long)btrfs_header_fsid(buf),
+				 BTRFS_FSID_SIZE)) {
 		printk("warning bad block %Lu\n", buf->start);
-		if (!btrfs_buffer_uptodate(buf)) {
-			WARN_ON(1);
-		}
-		for (i = 0; i < BTRFS_FSID_SIZE; i++) {
-			printk("%x:%x ", root->fs_info->fsid[i], fsid[i]);
-		}
-		printk("\n");
-		// BUG();
+		BUG();
 	}
 	if (level == 0)
 		return check_leaf(root, path, level);
@@ -504,13 +494,14 @@ static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
 	int high = max;
 	int mid;
 	int ret;
-	struct btrfs_disk_key *tmp;
+	struct btrfs_disk_key *tmp = NULL;
 	struct btrfs_disk_key unaligned;
 	unsigned long offset;
 	char *map_token = NULL;
 	char *kaddr = NULL;
 	unsigned long map_start = 0;
 	unsigned long map_len = 0;
+	int err;
 
 	while(low < high) {
 		mid = (low + high) / 2;
@@ -519,19 +510,24 @@ static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
 		if (!map_token || offset < map_start ||
 		    (offset + sizeof(struct btrfs_disk_key)) >
 		    map_start + map_len) {
-			if (map_token)
+			if (map_token) {
 				unmap_extent_buffer(eb, map_token, KM_USER0);
-			map_extent_buffer(eb, offset, &map_token, &kaddr,
-					  &map_start, &map_len, KM_USER0);
+				map_token = NULL;
+			}
+			err = map_extent_buffer(eb, offset,
+						sizeof(struct btrfs_disk_key),
+						&map_token, &kaddr,
+						&map_start, &map_len, KM_USER0);
+
+			if (!err) {
+				tmp = (struct btrfs_disk_key *)(kaddr + offset -
+							map_start);
+			} else {
+				read_extent_buffer(eb, &unaligned,
+						   offset, sizeof(unaligned));
+				tmp = &unaligned;
+			}
 
-		}
-		if (offset + sizeof(struct btrfs_disk_key) >
-		    map_start + map_len) {
-			unmap_extent_buffer(eb, map_token, KM_USER0);
-			read_extent_buffer(eb, &unaligned,
-					   offset, sizeof(unaligned));
-			map_token = NULL;
-			tmp = &unaligned;
 		} else {
 			tmp = (struct btrfs_disk_key *)(kaddr + offset -
 							map_start);
@@ -544,7 +540,8 @@ static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
 			high = mid;
 		else {
 			*slot = mid;
-			unmap_extent_buffer(eb, map_token, KM_USER0);
+			if (map_token)
+				unmap_extent_buffer(eb, map_token, KM_USER0);
 			return 0;
 		}
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4b82980685..30fbbd7221a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
+#include <asm/kmap_types.h>
 #include "bit-radix.h"
 #include "extent_map.h"
 
@@ -431,15 +432,52 @@ struct btrfs_root {
 static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
 				   type *s)				\
 {									\
-	__le##bits res;							\
-	read_eb_member(eb, s, type, member, &res);			\
-	return le##bits##_to_cpu(res);					\
+	int err;							\
+	char *map_token;						\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	unsigned long offset = (unsigned long)s +			\
+				offsetof(type, member);			\
+	err = map_extent_buffer(eb, offset,				\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER0);	\
+	if (!err) {							\
+		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
+					       map_start);		\
+		u##bits res = le##bits##_to_cpu(*tmp);			\
+		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		return res;						\
+	} else {							\
+		__le##bits res;						\
+		read_eb_member(eb, s, type, member, &res);		\
+		return le##bits##_to_cpu(res);				\
+	}								\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    type *s, u##bits val)		\
 {									\
-	val = cpu_to_le##bits(val);					\
-	write_eb_member(eb, s, type, member, &val);			\
+	int err;							\
+	char *map_token;						\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	unsigned long offset = (unsigned long)s +			\
+				offsetof(type, member);			\
+	err = map_extent_buffer(eb, offset,				\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER0);	\
+	if (!err) {							\
+		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
+					       map_start);		\
+		*tmp = cpu_to_le##bits(val);				\
+		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+	} else {							\
+		val = cpu_to_le##bits(val);				\
+		write_eb_member(eb, s, type, member, &val);		\
+	}								\
 }
 
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 55fefdfab84..8bef309e1b3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2140,14 +2140,20 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 EXPORT_SYMBOL(read_extent_buffer);
 
 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+		      unsigned long min_len,
 		      char **token, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len, int km)
 {
-	size_t offset;
+	size_t offset = start & (PAGE_CACHE_SIZE - 1);
 	char *kaddr;
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long end_i = (start_offset + start + min_len) >>
+				PAGE_CACHE_SHIFT;
+
+	if (i != end_i)
+		return -EINVAL;
 
 	WARN_ON(start > eb->len);
 
@@ -2155,8 +2161,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = start_offset;
 		*map_start = 0;
 	} else {
-		offset = 0;
-		*map_start = (i << PAGE_CACHE_SHIFT) - offset;
+		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
 	}
 
 	// kaddr = kmap_atomic(eb->pages[i], km);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 430b997a70f..895789039dd 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -173,7 +173,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree,
 int extent_buffer_uptodate(struct extent_map_tree *tree,
 			   struct extent_buffer *eb);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      char **token, char **map,
+		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len, int km);
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
-- 
cgit v1.2.3


From 6d36dcd48f1e4e7446d603a3df9638bd314a182d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:14:37 -0400
Subject: Btrfs: Avoid memcpy where possible in extent_buffers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |  45 +++++++++++++++--
 fs/btrfs/disk-io.c    |   6 +--
 fs/btrfs/extent_map.c | 132 ++++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/extent_map.h |   5 +-
 4 files changed, 147 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 30fbbd7221a..aed08618aca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -483,15 +483,50 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 {									\
-	__le##bits res;							\
-	read_eb_member(eb, NULL, type, member, &res);			\
-	return le##bits##_to_cpu(res);					\
+	int err;							\
+	char *map_token;						\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	unsigned long offset = offsetof(type, member);			\
+	err = map_extent_buffer(eb, offset,				\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER0);	\
+	if (!err) {							\
+		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
+					       map_start);		\
+		u##bits res = le##bits##_to_cpu(*tmp);			\
+		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		return res;						\
+	} else {							\
+		__le##bits res;						\
+		read_eb_member(eb, NULL, type, member, &res);		\
+		return le##bits##_to_cpu(res);				\
+	}								\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    u##bits val)			\
 {									\
-	val = cpu_to_le##bits(val);					\
-	write_eb_member(eb, NULL, type, member, &val);			\
+	int err;							\
+	char *map_token;						\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	unsigned long offset = offsetof(type, member);			\
+	err = map_extent_buffer(eb, offset,				\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER0);	\
+	if (!err) {							\
+		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
+					       map_start);		\
+		*tmp = cpu_to_le##bits(val);				\
+		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+	} else {							\
+		val = cpu_to_le##bits(val);				\
+		write_eb_member(eb, NULL, type, member, &val);		\
+	}								\
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0c1f90cbedb..8242933a1d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -609,20 +609,20 @@ int close_ctree(struct btrfs_root *root)
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	struct inode *btree_inode = buf->first_page->mapping->host;
 	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	struct inode *btree_inode = buf->first_page->mapping->host;
 	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
 					  buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8bef309e1b3..d2c733c68b4 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -18,6 +18,11 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 
 static struct kmem_cache *extent_map_cache;
 static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+static LIST_HEAD(extent_buffers);
+static spinlock_t extent_buffers_lock;
+static int nr_extent_buffers;
+#define MAX_EXTENT_BUFFER_CACHE 128
 
 struct tree_entry {
 	u64 start;
@@ -29,21 +34,33 @@ struct tree_entry {
 void __init extent_map_init(void)
 {
 	extent_map_cache = btrfs_cache_create("extent_map",
-					    sizeof(struct extent_map),
-					    SLAB_DESTROY_BY_RCU,
+					    sizeof(struct extent_map), 0,
 					    NULL);
 	extent_state_cache = btrfs_cache_create("extent_state",
-					    sizeof(struct extent_state),
-					    SLAB_DESTROY_BY_RCU,
+					    sizeof(struct extent_state), 0,
 					    NULL);
+	extent_buffer_cache = btrfs_cache_create("extent_buffers",
+					    sizeof(struct extent_buffer), 0,
+					    NULL);
+	spin_lock_init(&extent_buffers_lock);
 }
 
 void __exit extent_map_exit(void)
 {
+	struct extent_buffer *eb;
+
+	while (!list_empty(&extent_buffers)) {
+		eb = list_entry(extent_buffers.next,
+				struct extent_buffer, list);
+		list_del(&eb->list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
 	if (extent_state_cache)
 		kmem_cache_destroy(extent_state_cache);
+	if (extent_buffer_cache)
+		kmem_cache_destroy(extent_buffer_cache);
 }
 
 void extent_map_tree_init(struct extent_map_tree *tree,
@@ -1858,6 +1875,48 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	return (em->block_start + start - em->start) >> inode->i_blkbits;
 }
 
+static struct extent_buffer *__alloc_extent_buffer(gfp_t mask)
+{
+	struct extent_buffer *eb = NULL;
+	spin_lock(&extent_buffers_lock);
+	if (!list_empty(&extent_buffers)) {
+		eb = list_entry(extent_buffers.next, struct extent_buffer,
+				list);
+		list_del(&eb->list);
+		WARN_ON(nr_extent_buffers == 0);
+		nr_extent_buffers--;
+	}
+	spin_unlock(&extent_buffers_lock);
+	if (eb) {
+		memset(eb, 0, sizeof(*eb));
+		return eb;
+	}
+	return kmem_cache_zalloc(extent_buffer_cache, mask);
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+	if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) {
+		kmem_cache_free(extent_buffer_cache, eb);
+	} else {
+		spin_lock(&extent_buffers_lock);
+		list_add(&eb->list, &extent_buffers);
+		nr_extent_buffers++;
+		spin_unlock(&extent_buffers_lock);
+	}
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i)
+{
+	struct page *p;
+	if (i == 0)
+		return eb->first_page;
+	i += eb->start >> PAGE_CACHE_SHIFT;
+	p = find_get_page(eb->first_page->mapping, i);
+	page_cache_release(p);
+	return p;
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 					  u64 start, unsigned long len,
 					  gfp_t mask)
@@ -1871,7 +1930,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 0;
 
-	eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
+	eb = __alloc_extent_buffer(mask);
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
@@ -1881,9 +1940,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
-		if (!p)
+		if (!p) {
+			/* make sure the free only frees the pages we've
+			 * grabbed a reference on
+			 */
+			eb->len = i << PAGE_CACHE_SHIFT;
+			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
-		eb->pages[i] = p;
+		}
+		if (i == 0)
+			eb->first_page = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -1909,7 +1975,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
 
-	eb = kzalloc(EXTENT_BUFFER_SIZE(num_pages), mask);
+	eb = __alloc_extent_buffer(mask);
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
@@ -1919,9 +1985,16 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_get_page(mapping, index);
-		if (!p)
+		if (!p) {
+			/* make sure the free only frees the pages we've
+			 * grabbed a reference on
+			 */
+			eb->len = i << PAGE_CACHE_SHIFT;
+			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
-		eb->pages[i] = p;
+		}
+		if (i == 0)
+			eb->first_page = p;
 	}
 	return eb;
 fail:
@@ -1944,11 +2017,12 @@ void free_extent_buffer(struct extent_buffer *eb)
 	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
 		(eb->start >> PAGE_CACHE_SHIFT) + 1;
 
-	for (i = 0; i < num_pages; i++) {
-		if (eb->pages[i])
-			page_cache_release(eb->pages[i]);
+	if (eb->first_page)
+		page_cache_release(eb->first_page);
+	for (i = 1; i < num_pages; i++) {
+		page_cache_release(extent_buffer_page(eb, i));
 	}
-	kfree(eb);
+	__free_extent_buffer(eb);
 }
 EXPORT_SYMBOL(free_extent_buffer);
 
@@ -1968,7 +2042,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 		(eb->start >> PAGE_CACHE_SHIFT) + 1;
 
 	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
+		page = extent_buffer_page(eb, i);
 		lock_page(page);
 		/*
 		 * if we're on the last page or the first page and the
@@ -2021,7 +2095,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree,
 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			    GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
+		page = extent_buffer_page(eb, i);
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
 		    ((i == num_pages - 1) &&
 		     ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
@@ -2064,7 +2138,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
 		(eb->start >> PAGE_CACHE_SHIFT) + 1;
 	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
+		page = extent_buffer_page(eb, i);
 		if (PageUptodate(page)) {
 			continue;
 		}
@@ -2090,7 +2164,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 	}
 
 	for (i = 0; i < num_pages; i++) {
-		page = eb->pages[i];
+		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
 		if (!PageUptodate(page)) {
 			ret = -EIO;
@@ -2116,12 +2190,12 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	page = eb->pages[i];
 	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
 	if (i == 0)
 		offset += start_offset;
 
 	while(len > 0) {
+		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -2134,7 +2208,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		len -= cur;
 		offset = 0;
 		i++;
-		page = eb->pages[i];
 	}
 }
 EXPORT_SYMBOL(read_extent_buffer);
@@ -2165,7 +2238,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	}
 
 	// kaddr = kmap_atomic(eb->pages[i], km);
-	kaddr = page_address(eb->pages[i]);
+	kaddr = page_address(extent_buffer_page(eb, i));
 	*token = kaddr;
 	*map = kaddr + offset;
 	*map_len = PAGE_CACHE_SIZE - offset;
@@ -2195,12 +2268,12 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	page = eb->pages[i];
 	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
 	if (i == 0)
 		offset += start_offset;
 
 	while(len > 0) {
+		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -2216,7 +2289,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 		len -= cur;
 		offset = 0;
 		i++;
-		page = eb->pages[i];
 	}
 	return ret;
 }
@@ -2236,12 +2308,12 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	page = eb->pages[i];
 	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
 	if (i == 0)
 		offset += start_offset;
 
 	while(len > 0) {
+		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -2254,7 +2326,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		len -= cur;
 		offset = 0;
 		i++;
-		page = eb->pages[i];
 	}
 }
 EXPORT_SYMBOL(write_extent_buffer);
@@ -2272,12 +2343,12 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	page = eb->pages[i];
 	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
 	if (i == 0)
 		offset += start_offset;
 
 	while(len > 0) {
+		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -2289,7 +2360,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 		len -= cur;
 		offset = 0;
 		i++;
-		page = eb->pages[i];
 	}
 }
 EXPORT_SYMBOL(memset_extent_buffer);
@@ -2313,7 +2383,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 		offset += start_offset;
 
 	while(len > 0) {
-		page = dst->pages[i];
+		page = extent_buffer_page(dst, i);
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -2414,7 +2484,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE -
 					       dst_off_in_page));
 
-		copy_pages(dst->pages[dst_i], dst->pages[src_i],
+		copy_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page, src_off_in_page, cur);
 
 		src_offset += cur;
@@ -2467,7 +2538,8 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		cur = min(len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
 // printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur);
-		move_pages(dst->pages[dst_i], dst->pages[src_i],
+		move_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page - cur + 1,
 			   src_off_in_page - cur + 1, cur);
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 895789039dd..3b3abf34c40 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -62,14 +62,13 @@ struct extent_state {
 	struct list_head list;
 };
 
-#define EXTENT_BUFFER_SIZE(nr) (sizeof(struct extent_buffer) + \
-			       (nr - 1) * sizeof(struct page *))
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
 	atomic_t refs;
 	int flags;
-	struct page *pages[];
+	struct list_head list;
+	struct page *first_page;
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
-- 
cgit v1.2.3


From ae5252bd51a252b7b8b02289337c36774835101c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:14:41 -0400
Subject: Btrfs: Go back to kmaps instead of page_address in extent_buffers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 50 +++++++++++++++++++-------------------------------
 1 file changed, 19 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index d2c733c68b4..f150188f621 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1867,7 +1867,6 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	if (!em || IS_ERR(em))
 		return 0;
 
-	// XXX(hch): block 0 is valid in some cases, e.g. XFS RT device
 	if (em->block_start == EXTENT_MAP_INLINE ||
 	    em->block_start == EXTENT_MAP_HOLE)
 		return 0;
@@ -2199,10 +2198,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		// kaddr = kmap_atomic(page, KM_USER0);
-		kaddr = page_address(page);
+		kaddr = kmap_atomic(page, KM_USER0);
 		memcpy(dst, kaddr + offset, cur);
-		// kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER0);
 
 		dst += cur;
 		len -= cur;
@@ -2237,8 +2235,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
 	}
 
-	// kaddr = kmap_atomic(eb->pages[i], km);
-	kaddr = page_address(extent_buffer_page(eb, i));
+	kaddr = kmap_atomic(extent_buffer_page(eb, i), km);
 	*token = kaddr;
 	*map = kaddr + offset;
 	*map_len = PAGE_CACHE_SIZE - offset;
@@ -2248,7 +2245,7 @@ EXPORT_SYMBOL(map_extent_buffer);
 
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
 {
-	// kunmap_atomic(token, km);
+	kunmap_atomic(token, km);
 }
 EXPORT_SYMBOL(unmap_extent_buffer);
 
@@ -2278,10 +2275,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 
-		// kaddr = kmap_atomic(page, KM_USER0);
-		kaddr = page_address(page);
+		kaddr = kmap_atomic(page, KM_USER0);
 		ret = memcmp(ptr, kaddr + offset, cur);
-		// kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER0);
 		if (ret)
 			break;
 
@@ -2317,10 +2313,9 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		// kaddr = kmap_atomic(page, KM_USER0);
-		kaddr = page_address(page);
+		kaddr = kmap_atomic(page, KM_USER0);
 		memcpy(kaddr + offset, src, cur);
-		// kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER0);
 
 		src += cur;
 		len -= cur;
@@ -2352,10 +2347,9 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		// kaddr = kmap_atomic(page, KM_USER0);
-		kaddr = page_address(page);
+		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr + offset, c, cur);
-		// kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER0);
 
 		len -= cur;
 		offset = 0;
@@ -2388,10 +2382,9 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
 
-		// kaddr = kmap_atomic(page, KM_USER1);
-		kaddr = page_address(page);
+		kaddr = kmap_atomic(page, KM_USER1);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		// kunmap_atomic(kaddr, KM_USER1);
+		kunmap_atomic(kaddr, KM_USER1);
 
 		src_offset += cur;
 		len -= cur;
@@ -2405,43 +2398,38 @@ static void move_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
 		       unsigned long len)
 {
-	// char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
-	char *dst_kaddr = page_address(dst_page);
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
 	if (dst_page == src_page) {
 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
 	} else {
-		// char *src_kaddr = kmap_atomic(src_page, KM_USER1);
-		char *src_kaddr = page_address(src_page);
+		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
 		char *p = dst_kaddr + dst_off + len;
 		char *s = src_kaddr + src_off + len;
 
 		while (len--)
 			*--p = *--s;
 
-		// kunmap_atomic(src_kaddr, KM_USER1);
+		kunmap_atomic(src_kaddr, KM_USER1);
 	}
-	// kunmap_atomic(dst_kaddr, KM_USER0);
+	kunmap_atomic(dst_kaddr, KM_USER0);
 }
 
 static void copy_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
 		       unsigned long len)
 {
-	//kmap_atomic(dst_page, KM_USER0);
-	char *dst_kaddr = page_address(dst_page);
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
 	char *src_kaddr;
 
 	if (dst_page != src_page)
-		src_kaddr = page_address(src_page); // kmap_atomic(src_page, KM_USER1);
+		src_kaddr = kmap_atomic(src_page, KM_USER1);
 	else
 		src_kaddr = dst_kaddr;
 
 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-	/*
 	kunmap_atomic(dst_kaddr, KM_USER0);
 	if (dst_page != src_page)
 		kunmap_atomic(src_kaddr, KM_USER1);
-	*/
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -2537,7 +2525,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 		cur = min(len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
-// printk("move pages orig dst %lu src %lu len %lu, this %lu %lu %lu\n", dst_offset, src_offset, len, dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur);
+
 		move_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page - cur + 1,
-- 
cgit v1.2.3


From f510cfecfc98759d75283823cfccf0cc0d59a4c6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:14:48 -0400
Subject: Btrfs: Fix extent_buffer and extent_state leaks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  10 +++-
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/disk-io.c     |  17 +++++--
 fs/btrfs/extent-tree.c | 126 +++++++++++++++++++++++--------------------------
 fs/btrfs/extent_map.c  |  64 ++++++++++++++++++++-----
 fs/btrfs/extent_map.h  |   2 +
 fs/btrfs/transaction.c |   5 +-
 7 files changed, 139 insertions(+), 87 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9427b79c5d7..50ef351ef6b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -87,6 +87,7 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	cow->alloc_addr = (unsigned long)__builtin_return_address(0);
 	if (buf->len != root->sectorsize || cow->len != root->sectorsize)
 		WARN_ON(1);
 
@@ -132,6 +133,7 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
+	int ret;
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -148,8 +150,10 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	search_start = extent_buffer_blocknr(buf) & ~((u64)65535);
-	return __btrfs_cow_block(trans, root, buf, parent,
+	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
+	(*cow_ret)->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return ret;
 }
 
 static int close_blocks(u64 blocknr, u64 other)
@@ -1013,8 +1017,10 @@ again:
 				if (sret)
 					return sret;
 				b = p->nodes[level];
-				if (!b)
+				if (!b) {
+					btrfs_release_path(NULL, p);
 					goto again;
+				}
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aed08618aca..5262b28f468 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -303,8 +303,8 @@ struct btrfs_fs_info {
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root block_group_radix;
 	struct radix_tree_root block_group_data_radix;
-	struct radix_tree_root extent_map_radix;
 	struct radix_tree_root extent_ins_radix;
+	struct extent_map_tree free_space_cache;
 	u64 generation;
 	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8242933a1d8..09f4e694624 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,18 +46,25 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 blocknr)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	struct extent_buffer *eb;
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				   blocknr * root->sectorsize,
 				   root->sectorsize, GFP_NOFS);
+	if (eb)
+		eb->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return eb;
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	struct extent_buffer *eb;
+	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				   blocknr * root->sectorsize,
 				   root->sectorsize, GFP_NOFS);
+	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return eb;
 }
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -226,6 +233,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 				 buf, 1);
+	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return buf;
 }
 
@@ -426,7 +434,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
-	init_bit_radix(&fs_info->extent_map_radix);
 	init_bit_radix(&fs_info->extent_ins_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
@@ -449,6 +456,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
+	extent_map_tree_init(&fs_info->free_space_cache,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 
@@ -594,8 +603,10 @@ int close_ctree(struct btrfs_root *root)
 
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
+
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
+
 	free_extent_buffer(fs_info->sb_buffer);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 089c41cbca7..74cfbee2ff3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,21 +34,19 @@ static int cache_block_group(struct btrfs_root *root,
 	int ret;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct radix_tree_root *extent_radix;
+	struct extent_map_tree *free_space_cache;
 	int slot;
-	u64 i;
 	u64 last = 0;
 	u64 hole_size;
 	u64 first_free;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
-	extent_radix = &root->fs_info->extent_map_radix;
+	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (block_group->cached)
 		return 0;
-	if (block_group->data)
-		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -98,9 +96,11 @@ static int cache_block_group(struct btrfs_root *root,
 				last = first_free;
 				found = 1;
 			}
-			hole_size = key.objectid - last;
-			for (i = 0; i < hole_size; i++) {
-				set_radix_bit(extent_radix, last + i);
+			if (key.objectid > last) {
+				hole_size = key.objectid - last;
+				set_extent_dirty(free_space_cache, last,
+						 last + hole_size - 1,
+						 GFP_NOFS);
 			}
 			last = key.objectid + key.offset;
 		}
@@ -114,9 +114,8 @@ next:
 	    block_group->key.offset > last) {
 		hole_size = block_group->key.objectid +
 			block_group->key.offset - last;
-		for (i = 0; i < hole_size; i++) {
-			set_radix_bit(extent_radix, last + i);
-		}
+		set_extent_dirty(free_space_cache, last,
+				 last + hole_size - 1, GFP_NOFS);
 	}
 	block_group->cached = 1;
 err:
@@ -150,47 +149,33 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	return NULL;
 }
 
-static u64 leaf_range(struct btrfs_root *root)
-{
-	u64 size = BTRFS_LEAF_DATA_SIZE(root);
-	do_div(size, sizeof(struct btrfs_extent_item) +
-		sizeof(struct btrfs_item));
-	return size;
-}
-
 static u64 find_search_start(struct btrfs_root *root,
 			     struct btrfs_block_group_cache **cache_ret,
-			     u64 search_start, int num)
+			     u64 search_start, int num, int data)
 {
-	unsigned long gang[8];
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	u64 last = max(search_start, cache->key.objectid);
+	u64 start = 0;
+	u64 end = 0;
 
-	if (cache->data)
-		goto out;
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
 		goto out;
 	while(1) {
-		ret = find_first_radix_bit(&root->fs_info->extent_map_radix,
-					   gang, last, ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
+					    last, &start, &end, EXTENT_DIRTY);
+		if (ret)
 			goto out;
-		last = gang[ret-1] + 1;
-		if (num > 1) {
-			if (ret != ARRAY_SIZE(gang)) {
-				goto new_group;
-			}
-			if (gang[ret-1] - gang[0] > leaf_range(root)) {
-				continue;
-			}
-		}
-		if (gang[0] >= cache->key.objectid + cache->key.offset) {
+
+		start = max(last, start);
+		last = end + 1;
+		if (end + 1 - start < num)
+			continue;
+		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
-		}
-		return gang[0];
+		return start;
 	}
 out:
 	return max(cache->last_alloc, search_start);
@@ -202,7 +187,7 @@ new_group:
 		return max((*cache_ret)->last_alloc, search_start);
 	}
 	cache = btrfs_find_block_group(root, cache,
-				       last + cache->key.offset - 1, 0, 0);
+				       last + cache->key.offset - 1, data, 0);
 	*cache_ret = cache;
 	goto again;
 }
@@ -625,7 +610,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
-	u64 i;
 	int ret;
 
 	while(total) {
@@ -644,12 +628,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (alloc) {
 			if (blocknr > cache->last_alloc)
 				cache->last_alloc = blocknr;
-			if (!cache->data) {
-				for (i = 0; i < num; i++) {
-					clear_radix_bit(&info->extent_map_radix,
-						        blocknr + i);
-				}
-			}
 			if (cache->data != data &&
 			    old_val < (cache->key.offset >> 1)) {
 				cache->data = data;
@@ -677,11 +655,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			old_val -= num;
 			if (blocknr < cache->first_free)
 				cache->first_free = blocknr;
-			if (!cache->data && mark_free) {
-				for (i = 0; i < num; i++) {
-					set_radix_bit(&info->extent_map_radix,
-						      blocknr + i);
-				}
+			if (mark_free) {
+				set_extent_dirty(&info->free_space_cache,
+						 blocknr, blocknr + num - 1,
+						 GFP_NOFS);
 			}
 			if (old_val < (cache->key.offset >> 1) &&
 			    old_val + num >= (cache->key.offset >> 1)) {
@@ -732,7 +709,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 	int i;
 	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
-	struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
+	struct extent_map_tree *free_space_cache;
+
+	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
 		ret = find_first_radix_bit(unpin_radix, gang, 0,
@@ -751,8 +730,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 				block_group->pinned--;
 				if (gang[i] < block_group->last_alloc)
 					block_group->last_alloc = gang[i];
-				if (!block_group->data)
-					set_radix_bit(extent_radix, gang[i]);
+				if (!block_group->data) {
+					set_extent_dirty(free_space_cache,
+							 gang[i], gang[i],
+							 GFP_NOFS);
+				}
 			}
 		}
 	}
@@ -995,6 +977,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	u64 cached_search_start = 0;
 
 	WARN_ON(num_blocks < 1);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -1017,11 +1000,9 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	path = btrfs_alloc_path();
 
 check_failed:
-	if (!block_group->data)
-		search_start = find_search_start(root, &block_group,
-						 search_start, total_needed);
-	else if (!full_scan)
-		search_start = max(block_group->last_alloc, search_start);
+	search_start = find_search_start(root, &block_group,
+					 search_start, total_needed, data);
+	cached_search_start = search_start;
 
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -1097,6 +1078,7 @@ check_failed:
 
 		start_found = 1;
 		last_block = key.objectid + key.offset;
+
 		if (!full_scan && last_block >= block_group->key.objectid +
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
@@ -1138,6 +1120,9 @@ check_pending:
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
+	if (0 && ins->objectid != cached_search_start) {
+printk("\tcached was %Lu found %Lu\n", cached_search_start, ins->objectid);
+	}
 	return 0;
 
 new_group:
@@ -1209,6 +1194,10 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_blocks_used +
 				   num_blocks);
 
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+
 	if (root == extent_root) {
 		BUG_ON(num_blocks != 1);
 		set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid);
@@ -1227,6 +1216,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
+
 	if (ret) {
 		return ret;
 	}
@@ -1265,6 +1255,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_buffer_uptodate(buf);
+	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	/*
@@ -1492,6 +1483,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	orig_level = level;
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		path->nodes[level] = root->node;
+		extent_buffer_get(root->node);
 		path->slots[level] = 0;
 	} else {
 		struct btrfs_key key;
@@ -1524,7 +1516,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			ret = wret;
 		ret = -EAGAIN;
-		extent_buffer_get(root->node);
 		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
@@ -1562,8 +1553,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	int ret;
 	int ret2;
-	unsigned long gang[16];
-	int i;
+	u64 start;
+	u64 end;
 
 	ret = free_block_group_radix(&info->block_group_radix);
 	ret2 = free_block_group_radix(&info->block_group_data_radix);
@@ -1573,13 +1564,12 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		return ret2;
 
 	while(1) {
-		ret = find_first_radix_bit(&info->extent_map_radix,
-					   gang, 0, ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(&info->free_space_cache, 0,
+					    &start, &end, EXTENT_DIRTY);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			clear_radix_bit(&info->extent_map_radix, gang[i]);
-		}
+		clear_extent_dirty(&info->free_space_cache, start,
+				   end, GFP_NOFS);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f150188f621..5b7dbcaacd1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -19,8 +19,13 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 static struct kmem_cache *extent_map_cache;
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
+
 static LIST_HEAD(extent_buffers);
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
 static spinlock_t extent_buffers_lock;
+static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
 static int nr_extent_buffers;
 #define MAX_EXTENT_BUFFER_CACHE 128
 
@@ -48,6 +53,7 @@ void __init extent_map_init(void)
 void __exit extent_map_exit(void)
 {
 	struct extent_buffer *eb;
+	struct extent_state *state;
 
 	while (!list_empty(&extent_buffers)) {
 		eb = list_entry(extent_buffers.next,
@@ -55,6 +61,22 @@ void __exit extent_map_exit(void)
 		list_del(&eb->list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, list);
+		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
+		list_del(&state->list);
+		kmem_cache_free(extent_state_cache, state);
+
+	}
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next,
+				struct extent_buffer, leak_list);
+		printk("buffer leak start %Lu len %lu return %lX\n", eb->start, eb->len, eb->alloc_addr);
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
+
+
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
 	if (extent_state_cache)
@@ -101,12 +123,19 @@ EXPORT_SYMBOL(free_extent_map);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
+	unsigned long flags;
+
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state || IS_ERR(state))
 		return state;
 	state->state = 0;
 	state->in_tree = 0;
 	state->private = 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &states);
+	spin_unlock_irqrestore(&state_lock, flags);
+
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	return state;
@@ -115,10 +144,14 @@ EXPORT_SYMBOL(alloc_extent_state);
 
 void free_extent_state(struct extent_state *state)
 {
+	unsigned long flags;
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
 		WARN_ON(state->in_tree);
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -361,10 +394,6 @@ static int insert_state(struct extent_map_tree *tree,
 	state->state |= bits;
 	state->start = start;
 	state->end = end;
-	if ((end & 4095) == 0) {
-		printk("insert state %Lu %Lu strange end\n", start, end);
-		WARN_ON(1);
-	}
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -399,11 +428,7 @@ static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
 	prealloc->end = split - 1;
 	prealloc->state = orig->state;
 	orig->start = split;
-	if ((prealloc->end & 4095) == 0) {
-		printk("insert state %Lu %Lu strange end\n", prealloc->start,
-		       prealloc->end);
-		WARN_ON(1);
-	}
+
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -957,6 +982,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
 			*start_ret = state->start;
 			*end_ret = state->end;
 			ret = 0;
+			break;
 		}
 		node = rb_next(node);
 		if (!node)
@@ -1877,6 +1903,7 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 static struct extent_buffer *__alloc_extent_buffer(gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
+
 	spin_lock(&extent_buffers_lock);
 	if (!list_empty(&extent_buffers)) {
 		eb = list_entry(extent_buffers.next, struct extent_buffer,
@@ -1886,15 +1913,26 @@ static struct extent_buffer *__alloc_extent_buffer(gfp_t mask)
 		nr_extent_buffers--;
 	}
 	spin_unlock(&extent_buffers_lock);
+
 	if (eb) {
 		memset(eb, 0, sizeof(*eb));
-		return eb;
+	} else {
+		eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	}
-	return kmem_cache_zalloc(extent_buffer_cache, mask);
+	spin_lock(&extent_buffers_lock);
+	list_add(&eb->leak_list, &buffers);
+	spin_unlock(&extent_buffers_lock);
+
+	return eb;
 }
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+
+	spin_lock(&extent_buffers_lock);
+	list_del_init(&eb->leak_list);
+	spin_unlock(&extent_buffers_lock);
+
 	if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) {
 		kmem_cache_free(extent_buffer_cache, eb);
 	} else {
@@ -1933,6 +1971,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
+	eb->alloc_addr = __builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -1947,6 +1986,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
+		set_page_extent_mapped(p);
 		if (i == 0)
 			eb->first_page = p;
 		if (!PageUptodate(p))
@@ -1978,6 +2018,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
+	eb->alloc_addr = __builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -1992,6 +2033,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
+		set_page_extent_mapped(p);
 		if (i == 0)
 			eb->first_page = p;
 	}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 3b3abf34c40..d100f7c1476 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -68,7 +68,9 @@ struct extent_buffer {
 	atomic_t refs;
 	int flags;
 	struct list_head list;
+	struct list_head leak_list;
 	struct page *first_page;
+	unsigned long alloc_addr;
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 750f35a37aa..372b61f5733 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -352,7 +352,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		return 0;
 
 	trans = btrfs_start_transaction(root, 1);
-	while (1) {
+	while (0) {
 		root->defrag_running = 1;
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 		nr = trans->blocks_used;
@@ -394,7 +394,7 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
 			last = root->root_key.objectid + 1;
-			// btrfs_defrag_root(root, 1);
+			btrfs_defrag_root(root, 1);
 		}
 	}
 	// btrfs_defrag_root(info->extent_root, 1);
@@ -462,6 +462,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From 96b5179d0d9b6368c203856f2ad6e8e12a8b2a2c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:15:19 -0400
Subject: Btrfs: Stop using radix trees for the block group cache

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   6 +-
 fs/btrfs/disk-io.c     |   4 +-
 fs/btrfs/extent-tree.c | 362 ++++++++++++++++++++-----------------------------
 fs/btrfs/extent_map.c  |  18 ++-
 fs/btrfs/extent_map.h  |   4 +
 5 files changed, 171 insertions(+), 223 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5262b28f468..c6174b27fd1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -271,8 +271,6 @@ struct btrfs_csum_item {
 } __attribute__ ((__packed__));
 
 /* tag for the radix tree of block groups in ram */
-#define BTRFS_BLOCK_GROUP_DIRTY 0
-#define BTRFS_BLOCK_GROUP_AVAIL 1
 #define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
 
 
@@ -285,7 +283,6 @@ struct btrfs_block_group_item {
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	struct radix_tree_root *radix;
 	u64 first_free;
 	u64 last_alloc;
 	u64 pinned;
@@ -301,10 +298,9 @@ struct btrfs_fs_info {
 	struct radix_tree_root fs_roots_radix;
 	struct radix_tree_root pending_del_radix;
 	struct radix_tree_root pinned_radix;
-	struct radix_tree_root block_group_radix;
-	struct radix_tree_root block_group_data_radix;
 	struct radix_tree_root extent_ins_radix;
 	struct extent_map_tree free_space_cache;
+	struct extent_map_tree block_group_cache;
 	u64 generation;
 	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 09f4e694624..aac7c82b0dc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -436,8 +436,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	init_bit_radix(&fs_info->pending_del_radix);
 	init_bit_radix(&fs_info->extent_ins_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
-	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
-	INIT_RADIX_TREE(&fs_info->block_group_data_radix, GFP_KERNEL);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
@@ -458,6 +456,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 			     GFP_NOFS);
 	extent_map_tree_init(&fs_info->free_space_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_map_tree_init(&fs_info->block_group_cache,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 74cfbee2ff3..4bc639565d1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,10 @@
 #include "print-tree.h"
 #include "transaction.h"
 
+#define BLOCK_GROUP_DATA EXTENT_WRITEBACK
+#define BLOCK_GROUP_METADATA EXTENT_UPTODATE
+#define BLOCK_GROUP_DIRTY EXTENT_DIRTY
+
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -127,25 +131,31 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 blocknr)
 {
-	struct btrfs_block_group_cache *block_group;
+	struct extent_map_tree *block_group_cache;
+	struct btrfs_block_group_cache *block_group = NULL;
+	u64 ptr;
+	u64 start;
+	u64 end;
 	int ret;
 
-	ret = radix_tree_gang_lookup(&info->block_group_radix,
-				     (void **)&block_group,
-				     blocknr, 1);
+	block_group_cache = &info->block_group_cache;
+	ret = find_first_extent_bit(block_group_cache,
+				    blocknr, &start, &end,
+				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA);
 	if (ret) {
-		if (block_group->key.objectid <= blocknr && blocknr <=
-		    block_group->key.objectid + block_group->key.offset)
-			return block_group;
-	}
-	ret = radix_tree_gang_lookup(&info->block_group_data_radix,
-				     (void **)&block_group,
-				     blocknr, 1);
-	if (ret) {
-		if (block_group->key.objectid <= blocknr && blocknr <=
-		    block_group->key.objectid + block_group->key.offset)
-			return block_group;
+		return NULL;
 	}
+	ret = get_state_private(block_group_cache, start, &ptr);
+	if (ret)
+		return NULL;
+
+	block_group = (struct btrfs_block_group_cache *)ptr;
+
+
+	if (block_group->key.objectid <= blocknr && blocknr <=
+	    block_group->key.objectid + block_group->key.offset)
+		return block_group;
+
 	return NULL;
 }
 
@@ -173,7 +183,7 @@ again:
 		last = end + 1;
 		if (end + 1 - start < num)
 			continue;
-		if (start + num > cache->key.objectid + cache->key.offset)
+		if (start + num >= cache->key.objectid + cache->key.offset)
 			goto new_group;
 		return start;
 	}
@@ -189,6 +199,7 @@ new_group:
 	cache = btrfs_find_block_group(root, cache,
 				       last + cache->key.offset - 1, data, 0);
 	*cache_ret = cache;
+	last = min(cache->key.objectid, last);
 	goto again;
 }
 
@@ -204,30 +215,32 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 *hint, u64 search_start,
 						 int data, int owner)
 {
-	struct btrfs_block_group_cache *cache[8];
+	struct btrfs_block_group_cache *cache;
+	struct extent_map_tree *block_group_cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct radix_tree_root *radix;
-	struct radix_tree_root *swap_radix;
 	u64 used;
 	u64 last = 0;
 	u64 hint_last;
-	int i;
+	u64 start;
+	u64 end;
+	u64 free_check;
+	u64 ptr;
+	int bit;
 	int ret;
 	int full_search = 0;
 	int factor = 8;
 	int data_swap = 0;
 
+	block_group_cache = &info->block_group_cache;
+
 	if (!owner)
 		factor = 5;
 
-	if (data) {
-		radix = &info->block_group_data_radix;
-		swap_radix = &info->block_group_radix;
-	} else {
-		radix = &info->block_group_radix;
-		swap_radix = &info->block_group_data_radix;
-	}
+	if (data)
+		bit = BLOCK_GROUP_DATA;
+	else
+		bit = BLOCK_GROUP_METADATA;
 
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
@@ -246,12 +259,6 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		    div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
-		if (used >= div_factor(hint->key.offset, 8)) {
-			radix_tree_tag_clear(radix,
-					     hint->key.objectid +
-					     hint->key.offset - 1,
-					     BTRFS_BLOCK_GROUP_AVAIL);
-		}
 		last = hint->key.offset * 3;
 		if (hint->key.objectid >= last)
 			last = max(search_start + hint->key.offset - 1,
@@ -267,51 +274,29 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 
 		last = hint_last;
 	}
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
-						 last, ARRAY_SIZE(cache),
-						 BTRFS_BLOCK_GROUP_AVAIL);
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			last = cache[i]->key.objectid +
-				cache[i]->key.offset;
-			used = btrfs_block_group_used(&cache[i]->item);
-			if (used + cache[i]->pinned <
-			    div_factor(cache[i]->key.offset, factor)) {
-				found_group = cache[i];
-				goto found;
-			}
-			if (used >= div_factor(cache[i]->key.offset, 8)) {
-				radix_tree_tag_clear(radix,
-						     cache[i]->key.objectid +
-						     cache[i]->key.offset - 1,
-						     BTRFS_BLOCK_GROUP_AVAIL);
-			}
-		}
-		cond_resched();
-	}
-	last = hint_last;
 again:
 	while(1) {
-		ret = radix_tree_gang_lookup(radix, (void **)cache,
-					     last, ARRAY_SIZE(cache));
-		if (!ret)
+		ret = find_first_extent_bit(block_group_cache, last,
+					    &start, &end, bit);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			last = cache[i]->key.objectid +
-				cache[i]->key.offset;
-			used = btrfs_block_group_used(&cache[i]->item);
-			if (used + cache[i]->pinned < cache[i]->key.offset) {
-				found_group = cache[i];
-				goto found;
-			}
-			if (used >= cache[i]->key.offset) {
-				radix_tree_tag_clear(radix,
-						     cache[i]->key.objectid +
-						     cache[i]->key.offset - 1,
-						     BTRFS_BLOCK_GROUP_AVAIL);
-			}
+
+		ret = get_state_private(block_group_cache, start, &ptr);
+		if (ret)
+			break;
+
+		cache = (struct btrfs_block_group_cache *)ptr;
+		last = cache->key.objectid + cache->key.offset;
+		used = btrfs_block_group_used(&cache->item);
+
+		if (full_search)
+			free_check = cache->key.offset;
+		else
+			free_check = div_factor(cache->key.offset, factor);
+
+		if (used + cache->pinned < free_check) {
+			found_group = cache;
+			goto found;
 		}
 		cond_resched();
 	}
@@ -321,23 +306,11 @@ again:
 		goto again;
 	}
 	if (!data_swap) {
-		struct radix_tree_root *tmp = radix;
 		data_swap = 1;
-		radix = swap_radix;
-		swap_radix = tmp;
+		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
 		last = search_start;
 		goto again;
 	}
-	if (!found_group) {
-		ret = radix_tree_gang_lookup(radix,
-					     (void **)&found_group, 0, 1);
-		if (ret == 0) {
-			ret = radix_tree_gang_lookup(swap_radix,
-						     (void **)&found_group,
-						     0, 1);
-		}
-		BUG_ON(ret != 1);
-	}
 found:
 	return found_group;
 }
@@ -538,68 +511,55 @@ fail:
 
 }
 
-static int write_dirty_block_radix(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct radix_tree_root *radix)
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
 {
-	struct btrfs_block_group_cache *cache[8];
+	struct extent_map_tree *block_group_cache;
+	struct btrfs_block_group_cache *cache;
 	int ret;
 	int err = 0;
 	int werr = 0;
-	int i;
 	struct btrfs_path *path;
-	unsigned long off = 0;
+	u64 last = 0;
+	u64 start;
+	u64 end;
+	u64 ptr;
 
+	block_group_cache = &root->fs_info->block_group_cache;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	while(1) {
-		ret = radix_tree_gang_lookup_tag(radix, (void **)cache,
-						 off, ARRAY_SIZE(cache),
-						 BTRFS_BLOCK_GROUP_DIRTY);
-		if (!ret)
+		ret = find_first_extent_bit(block_group_cache, last,
+					    &start, &end, BLOCK_GROUP_DIRTY);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			err = write_one_cache_group(trans, root,
-						    path, cache[i]);
-			/*
-			 * if we fail to write the cache group, we want
-			 * to keep it marked dirty in hopes that a later
-			 * write will work
-			 */
-			if (err) {
-				werr = err;
-				off = cache[i]->key.objectid +
-					cache[i]->key.offset;
-				continue;
-			}
 
-			radix_tree_tag_clear(radix, cache[i]->key.objectid +
-					     cache[i]->key.offset - 1,
-					     BTRFS_BLOCK_GROUP_DIRTY);
+		last = end + 1;
+		ret = get_state_private(block_group_cache, start, &ptr);
+		if (ret)
+			break;
+
+		cache = (struct btrfs_block_group_cache *)ptr;
+		err = write_one_cache_group(trans, root,
+					    path, cache);
+		/*
+		 * if we fail to write the cache group, we want
+		 * to keep it marked dirty in hopes that a later
+		 * write will work
+		 */
+		if (err) {
+			werr = err;
+			continue;
 		}
+		clear_extent_bits(block_group_cache, start, end,
+				  BLOCK_GROUP_DIRTY, GFP_NOFS);
 	}
 	btrfs_free_path(path);
 	return werr;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root)
-{
-	int ret;
-	int ret2;
-	ret = write_dirty_block_radix(trans, root,
-				      &root->fs_info->block_group_radix);
-	ret2 = write_dirty_block_radix(trans, root,
-				      &root->fs_info->block_group_data_radix);
-	if (ret)
-		return ret;
-	if (ret2)
-		return ret2;
-	return 0;
-}
-
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 blocknr, u64 num, int alloc, int mark_free,
@@ -610,7 +570,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
-	int ret;
+	u64 start;
+	u64 end;
 
 	while(total) {
 		cache = btrfs_lookup_block_group(info, blocknr);
@@ -619,9 +580,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		}
 		block_in_group = blocknr - cache->key.objectid;
 		WARN_ON(block_in_group > cache->key.offset);
-		radix_tree_tag_set(cache->radix, cache->key.objectid +
-				   cache->key.offset - 1,
-				   BTRFS_BLOCK_GROUP_DIRTY);
+		start = cache->key.objectid;
+		end = start + cache->key.offset - 1;
+		set_extent_bits(&info->block_group_cache, start, end,
+				BLOCK_GROUP_DIRTY, GFP_NOFS);
 
 		old_val = btrfs_block_group_used(&cache->item);
 		num = min(total, cache->key.offset - block_in_group);
@@ -630,25 +592,27 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				cache->last_alloc = blocknr;
 			if (cache->data != data &&
 			    old_val < (cache->key.offset >> 1)) {
-				cache->data = data;
-				radix_tree_delete(cache->radix,
-						  cache->key.objectid +
-						  cache->key.offset - 1);
+				int bit_to_clear;
+				int bit_to_set;
 
+				cache->data = data;
 				if (data) {
-					cache->radix =
-						&info->block_group_data_radix;
+					bit_to_clear = BLOCK_GROUP_DATA;
+					bit_to_set = BLOCK_GROUP_METADATA;
 					cache->item.flags |=
 						BTRFS_BLOCK_GROUP_DATA;
 				} else {
-					cache->radix = &info->block_group_radix;
+					bit_to_clear = BLOCK_GROUP_METADATA;
+					bit_to_set = BLOCK_GROUP_DATA;
 					cache->item.flags &=
 						~BTRFS_BLOCK_GROUP_DATA;
 				}
-				ret = radix_tree_insert(cache->radix,
-							cache->key.objectid +
-							cache->key.offset - 1,
-							(void *)cache);
+				clear_extent_bits(&info->block_group_cache,
+						  start, end, bit_to_clear,
+						  GFP_NOFS);
+				set_extent_bits(&info->block_group_cache,
+						start, end, bit_to_set,
+						GFP_NOFS);
 			}
 			old_val += num;
 		} else {
@@ -660,13 +624,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						 blocknr, blocknr + num - 1,
 						 GFP_NOFS);
 			}
-			if (old_val < (cache->key.offset >> 1) &&
-			    old_val + num >= (cache->key.offset >> 1)) {
-				radix_tree_tag_set(cache->radix,
-						   cache->key.objectid +
-						   cache->key.offset - 1,
-						   BTRFS_BLOCK_GROUP_AVAIL);
-			}
 		}
 		btrfs_set_block_group_used(&cache->item, old_val);
 		total -= num;
@@ -730,11 +687,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 				block_group->pinned--;
 				if (gang[i] < block_group->last_alloc)
 					block_group->last_alloc = gang[i];
-				if (!block_group->data) {
-					set_extent_dirty(free_space_cache,
-							 gang[i], gang[i],
-							 GFP_NOFS);
-				}
+				set_extent_dirty(free_space_cache,
+						 gang[i], gang[i], GFP_NOFS);
 			}
 		}
 	}
@@ -1059,8 +1013,8 @@ check_failed:
 			ins->offset = search_end - ins->objectid;
 			goto check_pending;
 		}
-
 		btrfs_item_key_to_cpu(l, &key, slot);
+
 		if (key.objectid >= search_start && key.objectid > last_block &&
 		    start_found) {
 			if (last_block < search_start)
@@ -1072,9 +1026,14 @@ check_failed:
 				goto check_pending;
 			}
 		}
-
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
+			if (!start_found) {
+				last_block = key.objectid;
+				start_found = 1;
+			}
 			goto next;
+		}
+
 
 		start_found = 1;
 		last_block = key.objectid + key.offset;
@@ -1120,9 +1079,6 @@ check_pending:
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
-	if (0 && ins->objectid != cached_search_start) {
-printk("\tcached was %Lu found %Lu\n", cached_search_start, ins->objectid);
-	}
 	return 0;
 
 new_group:
@@ -1529,40 +1485,20 @@ out:
 	return ret;
 }
 
-static int free_block_group_radix(struct radix_tree_root *radix)
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
+	u64 start;
+	u64 end;
 	int ret;
-	struct btrfs_block_group_cache *cache[8];
-	int i;
 
 	while(1) {
-		ret = radix_tree_gang_lookup(radix, (void **)cache, 0,
-					     ARRAY_SIZE(cache));
-		if (!ret)
+		ret = find_first_extent_bit(&info->block_group_cache, 0,
+					    &start, &end, (unsigned int)-1);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			radix_tree_delete(radix, cache[i]->key.objectid +
-					  cache[i]->key.offset - 1);
-			kfree(cache[i]);
-		}
+		clear_extent_bits(&info->block_group_cache, start,
+				  end, (unsigned int)-1, GFP_NOFS);
 	}
-	return 0;
-}
-
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
-	int ret;
-	int ret2;
-	u64 start;
-	u64 end;
-
-	ret = free_block_group_radix(&info->block_group_radix);
-	ret2 = free_block_group_radix(&info->block_group_data_radix);
-	if (ret)
-		return ret;
-	if (ret2)
-		return ret2;
-
 	while(1) {
 		ret = find_first_extent_bit(&info->free_space_cache, 0,
 					    &start, &end, EXTENT_DIRTY);
@@ -1579,17 +1515,20 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_path *path;
 	int ret;
 	int err = 0;
+	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct radix_tree_root *radix;
+	struct extent_map_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	u64 group_size_blocks;
-	u64 used;
+
+	block_group_cache = &info->block_group_cache;
 
 	group_size_blocks = BTRFS_BLOCK_GROUP_SIZE >>
-		root->fs_info->sb->s_blocksize_bits;
+		info->sb->s_blocksize_bits;
+
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = group_size_blocks;
@@ -1617,35 +1556,30 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
-			radix = &info->block_group_data_radix;
-			cache->data = 1;
-		} else {
-			radix = &info->block_group_radix;
-			cache->data = 0;
-		}
-
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->last_alloc = cache->key.objectid;
 		cache->first_free = cache->key.objectid;
 		cache->pinned = 0;
 		cache->cached = 0;
 
-		cache->radix = radix;
-
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 
-		ret = radix_tree_insert(radix, found_key.objectid +
-					found_key.offset - 1,
-					(void *)cache);
-		BUG_ON(ret);
-		used = btrfs_block_group_used(&cache->item);
-		if (used < div_factor(key.offset, 8)) {
-			radix_tree_tag_set(radix, found_key.objectid +
-					   found_key.offset - 1,
-					   BTRFS_BLOCK_GROUP_AVAIL);
+		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
+			bit = BLOCK_GROUP_DATA;
+			cache->data = 1;
+		} else {
+			bit = BLOCK_GROUP_METADATA;
+			cache->data = 0;
 		}
+
+		/* use EXTENT_LOCKED to prevent merging */
+		set_extent_bits(block_group_cache, found_key.objectid,
+				found_key.objectid + found_key.offset - 1,
+				bit | EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(block_group_cache, found_key.objectid,
+				  (u64)cache);
+
 		if (key.objectid >=
 		    btrfs_super_total_blocks(&info->super_copy))
 			break;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5b7dbcaacd1..1b2f9e059de 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -574,7 +574,7 @@ out:
 	return set;
 
 search_again:
-	if (start >= end)
+	if (start > end)
 		goto out;
 	write_unlock_irqrestore(&tree->lock, flags);
 	if (mask & __GFP_WAIT)
@@ -819,6 +819,21 @@ int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, bits, 0, NULL,
+			      mask);
+}
+EXPORT_SYMBOL(set_extent_bits);
+
+int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_bits);
+
 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
@@ -1138,7 +1153,6 @@ int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
 out:
 	write_unlock_irq(&tree->lock);
 	return ret;
-
 }
 
 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d100f7c1476..5a63b417034 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -96,6 +96,10 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 void __init extent_map_init(void);
 void __exit extent_map_exit(void);
 int extent_clean_all_trees(struct extent_map_tree *tree);
+int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask);
+int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
 			gfp_t mask);
 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-- 
cgit v1.2.3


From 1a5bc167f6707542b79a55452075525620ed43f5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:15:26 -0400
Subject: Btrfs: Change the remaining radix trees used by extent-tree.c to
 extent_map trees

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  16 ++--
 fs/btrfs/disk-io.c     |   9 ++-
 fs/btrfs/extent-tree.c | 193 ++++++++++++++++++-------------------------------
 fs/btrfs/extent_map.c  |   5 +-
 fs/btrfs/extent_map.h  |   4 +-
 fs/btrfs/transaction.c |   5 +-
 6 files changed, 94 insertions(+), 138 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c6174b27fd1..256689551eb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -283,10 +283,6 @@ struct btrfs_block_group_item {
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	u64 first_free;
-	u64 last_alloc;
-	u64 pinned;
-	u64 last_prealloc;
 	int data;
 	int cached;
 };
@@ -296,11 +292,13 @@ struct btrfs_fs_info {
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct radix_tree_root fs_roots_radix;
-	struct radix_tree_root pending_del_radix;
-	struct radix_tree_root pinned_radix;
-	struct radix_tree_root extent_ins_radix;
+
 	struct extent_map_tree free_space_cache;
 	struct extent_map_tree block_group_cache;
+	struct extent_map_tree pinned_extents;
+	struct extent_map_tree pending_del;
+	struct extent_map_tree extent_ins;
+
 	u64 generation;
 	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
@@ -926,7 +924,7 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 /* extent-tree.c */
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
-int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 blocknr);
@@ -949,7 +947,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 blocknr, u64 num_blocks, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct radix_tree_root *unpin_radix);
+			       struct extent_map_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 blocknr, u64 num_blocks);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aac7c82b0dc..2b86a1d779b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -432,9 +432,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		err = -ENOMEM;
 		goto fail;
 	}
-	init_bit_radix(&fs_info->pinned_radix);
-	init_bit_radix(&fs_info->pending_del_radix);
-	init_bit_radix(&fs_info->extent_ins_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -458,6 +455,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	extent_map_tree_init(&fs_info->block_group_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_map_tree_init(&fs_info->pinned_extents,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_map_tree_init(&fs_info->pending_del,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	extent_map_tree_init(&fs_info->extent_ins,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4bc639565d1..477466d167a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -188,13 +188,13 @@ again:
 		return start;
 	}
 out:
-	return max(cache->last_alloc, search_start);
+	return search_start;
 
 new_group:
 	cache = btrfs_lookup_block_group(root->fs_info,
 					 last + cache->key.offset - 1);
 	if (!cache) {
-		return max((*cache_ret)->last_alloc, search_start);
+		return search_start;
 	}
 	cache = btrfs_find_block_group(root, cache,
 				       last + cache->key.offset - 1, data, 0);
@@ -247,16 +247,14 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		shint = btrfs_lookup_block_group(info, search_start);
 		if (shint && shint->data == data) {
 			used = btrfs_block_group_used(&shint->item);
-			if (used + shint->pinned <
-			    div_factor(shint->key.offset, factor)) {
+			if (used < div_factor(shint->key.offset, factor)) {
 				return shint;
 			}
 		}
 	}
 	if (hint && hint->data == data) {
 		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned <
-		    div_factor(hint->key.offset, factor)) {
+		if (used < div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
 		last = hint->key.offset * 3;
@@ -294,7 +292,7 @@ again:
 		else
 			free_check = div_factor(cache->key.offset, factor);
 
-		if (used + cache->pinned < free_check) {
+		if (used < free_check) {
 			found_group = cache;
 			goto found;
 		}
@@ -505,8 +503,6 @@ fail:
 		return ret;
 	if (pending_ret)
 		return pending_ret;
-	if (cache->data)
-		cache->last_alloc = cache->first_free;
 	return 0;
 
 }
@@ -588,8 +584,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		old_val = btrfs_block_group_used(&cache->item);
 		num = min(total, cache->key.offset - block_in_group);
 		if (alloc) {
-			if (blocknr > cache->last_alloc)
-				cache->last_alloc = blocknr;
 			if (cache->data != data &&
 			    old_val < (cache->key.offset >> 1)) {
 				int bit_to_clear;
@@ -617,8 +611,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			old_val += num;
 		} else {
 			old_val -= num;
-			if (blocknr < cache->first_free)
-				cache->first_free = blocknr;
 			if (mark_free) {
 				set_extent_dirty(&info->free_space_cache,
 						 blocknr, blocknr + num - 1,
@@ -632,65 +624,47 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_copy_pinned(struct btrfs_root *root, struct radix_tree_root *copy)
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
 {
-	unsigned long gang[8];
 	u64 last = 0;
-	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
+	u64 start;
+	u64 end;
+	struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
-	int i;
 
 	while(1) {
-		ret = find_first_radix_bit(pinned_radix, gang, last,
-					   ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(pinned_extents, last,
+					    &start, &end, EXTENT_DIRTY);
+		if (ret)
 			break;
-		for (i = 0 ; i < ret; i++) {
-			set_radix_bit(copy, gang[i]);
-			last = gang[i] + 1;
-		}
+		set_extent_dirty(copy, start, end, GFP_NOFS);
+		last = end + 1;
 	}
-	ret = find_first_radix_bit(&root->fs_info->extent_ins_radix, gang, 0,
-				   ARRAY_SIZE(gang));
-	WARN_ON(ret);
 	return 0;
 }
 
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct radix_tree_root *unpin_radix)
+			       struct extent_map_tree *unpin)
 {
-	unsigned long gang[8];
-	struct btrfs_block_group_cache *block_group;
-	u64 first = 0;
+	u64 start;
+	u64 end;
 	int ret;
-	int i;
-	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
+	struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
 	struct extent_map_tree *free_space_cache;
 
 	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
-		ret = find_first_radix_bit(unpin_radix, gang, 0,
-					   ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(unpin, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
 			break;
-		if (!first)
-			first = gang[0];
-		for (i = 0; i < ret; i++) {
-			clear_radix_bit(pinned_radix, gang[i]);
-			clear_radix_bit(unpin_radix, gang[i]);
-			block_group = btrfs_lookup_block_group(root->fs_info,
-							       gang[i]);
-			if (block_group) {
-				WARN_ON(block_group->pinned == 0);
-				block_group->pinned--;
-				if (gang[i] < block_group->last_alloc)
-					block_group->last_alloc = gang[i];
-				set_extent_dirty(free_space_cache,
-						 gang[i], gang[i], GFP_NOFS);
-			}
-		}
+
+		clear_extent_dirty(pinned_extents, start, end,
+				   GFP_NOFS);
+		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 	}
 	return 0;
 }
@@ -700,39 +674,36 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 {
 	struct btrfs_key ins;
 	struct btrfs_extent_item extent_item;
-	int i;
 	int ret;
-	int err;
-	unsigned long gang[8];
+	int err = 0;
+	u64 start;
+	u64 end;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-	ins.offset = 1;
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	btrfs_set_stack_extent_owner(&extent_item,
 				     extent_root->root_key.objectid);
 
 	while(1) {
-		ret = find_first_radix_bit(&info->extent_ins_radix, gang, 0,
-					   ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(&info->extent_ins, 0, &start,
+					    &end, EXTENT_LOCKED);
+		if (ret)
 			break;
 
-		for (i = 0; i < ret; i++) {
-			ins.objectid = gang[i];
-			err = btrfs_insert_item(trans, extent_root, &ins,
-						&extent_item,
-						sizeof(extent_item));
-			clear_radix_bit(&info->extent_ins_radix, gang[i]);
-			WARN_ON(err);
-		}
+		ins.objectid = start;
+		ins.offset = end + 1 - start;
+		err = btrfs_insert_item(trans, extent_root, &ins,
+					&extent_item, sizeof(extent_item));
+		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
+				  GFP_NOFS);
 	}
 	return 0;
 }
 
 static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 {
-	int err;
+	int err = 0;
 	struct extent_buffer *buf;
 
 	if (!pending) {
@@ -748,16 +719,11 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 			}
 			free_extent_buffer(buf);
 		}
-		err = set_radix_bit(&root->fs_info->pinned_radix, blocknr);
-		if (!err) {
-			struct btrfs_block_group_cache *cache;
-			cache = btrfs_lookup_block_group(root->fs_info,
-							 blocknr);
-			if (cache)
-				cache->pinned++;
-		}
+		set_extent_dirty(&root->fs_info->pinned_extents,
+				 blocknr, blocknr, GFP_NOFS);
 	} else {
-		err = set_radix_bit(&root->fs_info->pending_del_radix, blocknr);
+		set_extent_bits(&root->fs_info->pending_del,
+				  blocknr, blocknr, EXTENT_LOCKED, GFP_NOFS);
 	}
 	BUG_ON(err < 0);
 	return 0;
@@ -840,43 +806,28 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root)
 {
 	int ret;
-	int wret;
 	int err = 0;
-	unsigned long gang[4];
-	int i;
-	struct radix_tree_root *pending_radix;
-	struct radix_tree_root *pinned_radix;
-	struct btrfs_block_group_cache *cache;
+	u64 start;
+	u64 end;
+	struct extent_map_tree *pending_del;
+	struct extent_map_tree *pinned_extents;
 
-	pending_radix = &extent_root->fs_info->pending_del_radix;
-	pinned_radix = &extent_root->fs_info->pinned_radix;
+	pending_del = &extent_root->fs_info->pending_del;
+	pinned_extents = &extent_root->fs_info->pinned_extents;
 
 	while(1) {
-		ret = find_first_radix_bit(pending_radix, gang, 0,
-					   ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(pending_del, 0, &start, &end,
+					    EXTENT_LOCKED);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			wret = set_radix_bit(pinned_radix, gang[i]);
-			if (wret == 0) {
-				cache =
-				  btrfs_lookup_block_group(extent_root->fs_info,
-							   gang[i]);
-				if (cache)
-					cache->pinned++;
-			}
-			if (wret < 0) {
-				printk(KERN_CRIT "set_radix_bit, err %d\n",
-				       wret);
-				BUG_ON(wret < 0);
-			}
-			wret = clear_radix_bit(pending_radix, gang[i]);
-			BUG_ON(wret);
-			wret = __free_extent(trans, extent_root,
-					     gang[i], 1, 0, 0);
-			if (wret)
-				err = wret;
-		}
+
+		set_extent_dirty(pinned_extents, start, end, GFP_NOFS);
+		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
+				  GFP_NOFS);
+		ret = __free_extent(trans, extent_root,
+				     start, end + 1 - start, 0, 0);
+		if (ret)
+			err = ret;
 	}
 	return err;
 }
@@ -920,7 +871,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	u64 hole_size = 0;
 	int slot = 0;
 	u64 last_block = 0;
-	u64 test_block;
 	u64 orig_search_start = search_start;
 	int start_found;
 	struct extent_buffer *l;
@@ -1059,13 +1009,15 @@ check_pending:
 	if (ins->objectid + num_blocks >= search_end)
 		goto enospc;
 
-	for (test_block = ins->objectid;
-	     test_block < ins->objectid + num_blocks; test_block++) {
-		if (test_radix_bit(&info->pinned_radix, test_block) ||
-		    test_radix_bit(&info->extent_ins_radix, test_block)) {
-			search_start = test_block + 1;
-			goto new_group;
-		}
+	if (test_range_bit(&info->extent_ins, ins->objectid,
+			   ins->objectid + num_blocks -1, EXTENT_LOCKED, 0)) {
+		search_start = ins->objectid + num_blocks;
+		goto new_group;
+	}
+	if (test_range_bit(&info->pinned_extents, ins->objectid,
+			   ins->objectid + num_blocks -1, EXTENT_DIRTY, 0)) {
+		search_start = ins->objectid + num_blocks;
+		goto new_group;
 	}
 	if (exclude_nr > 0 && (ins->objectid + num_blocks > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
@@ -1156,7 +1108,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 	if (root == extent_root) {
 		BUG_ON(num_blocks != 1);
-		set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid);
+		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
+				ins->objectid + ins->offset - 1,
+				EXTENT_LOCKED, GFP_NOFS);
 		goto update_block;
 	}
 
@@ -1557,9 +1511,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
-		cache->last_alloc = cache->key.objectid;
-		cache->first_free = cache->key.objectid;
-		cache->pinned = 0;
 		cache->cached = 0;
 
 		key.objectid = found_key.objectid + found_key.offset;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 1b2f9e059de..e081558d52f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1188,8 +1188,8 @@ out:
  * has the bits set.  Otherwise, 1 is returned if any bit in the
  * range is found set.
  */
-static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
-			  int bits, int filled)
+int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		   int bits, int filled)
 {
 	struct extent_state *state = NULL;
 	struct rb_node *node;
@@ -1222,6 +1222,7 @@ static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
 	read_unlock_irq(&tree->lock);
 	return bitset;
 }
+EXPORT_SYMBOL(test_range_bit);
 
 /*
  * helper function to set a given page up to date if all the
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5a63b417034..75dc600dc0f 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -95,7 +95,9 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
 void __init extent_map_init(void);
 void __exit extent_map_exit(void);
-int extent_clean_all_trees(struct extent_map_tree *tree);
+
+int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
+		   int bits, int filled);
 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 372b61f5733..55289b71056 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -481,11 +481,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
-	struct radix_tree_root pinned_copy;
+	struct extent_map_tree pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
 
-	init_bit_radix(&pinned_copy);
+	extent_map_tree_init(&pinned_copy,
+			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&dirty_fs_roots);
 
 	mutex_lock(&root->fs_info->trans_mutex);
-- 
cgit v1.2.3


From db94535db75e67fab12ccbb7f5ee548e33fed891 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:15:53 -0400
Subject: Btrfs: Allow tree blocks larger than the page size

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 319 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/ctree.h       | 112 ++++++++++-------
 fs/btrfs/disk-io.c     |  92 ++++++++------
 fs/btrfs/disk-io.h     |   9 +-
 fs/btrfs/extent-tree.c | 285 ++++++++++++++++++++++---------------------
 fs/btrfs/extent_map.c  |  91 ++++++++++----
 fs/btrfs/extent_map.h  |  10 +-
 fs/btrfs/file-item.c   |  10 +-
 fs/btrfs/file.c        | 126 ++++++++++---------
 fs/btrfs/inode.c       |  90 +++++++-------
 fs/btrfs/print-tree.c  |  23 ++--
 fs/btrfs/super.c       |   6 +-
 fs/btrfs/sysfs.c       |   5 +-
 fs/btrfs/transaction.c |  39 +++---
 fs/btrfs/tree-defrag.c |  18 +--
 15 files changed, 738 insertions(+), 497 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 50ef351ef6b..34de83630ae 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -83,16 +83,15 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
-	cow = btrfs_alloc_free_block(trans, root, search_start, empty_size);
+	cow = btrfs_alloc_free_block(trans, root, buf->len,
+				     search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
 	cow->alloc_addr = (unsigned long)__builtin_return_address(0);
-	if (buf->len != root->sectorsize || cow->len != root->sectorsize)
-		WARN_ON(1);
 
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
-	btrfs_set_header_blocknr(cow, extent_buffer_blocknr(cow));
+	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
 	btrfs_set_header_owner(cow, root->root_key.objectid);
 
@@ -110,16 +109,16 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		root->node = cow;
 		extent_buffer_get(cow);
 		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root,
-					  extent_buffer_blocknr(buf), 1, 1);
+			btrfs_free_extent(trans, root, buf->start,
+					  buf->len, 1);
 		}
 		free_extent_buffer(buf);
 	} else {
 		btrfs_set_node_blockptr(parent, parent_slot,
-					extent_buffer_blocknr(cow));
+					cow->start);
 		btrfs_mark_buffer_dirty(parent);
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
-		btrfs_free_extent(trans, root, extent_buffer_blocknr(buf),1,1);
+		btrfs_free_extent(trans, root, buf->start, buf->len, 1);
 	}
 	free_extent_buffer(buf);
 	btrfs_mark_buffer_dirty(cow);
@@ -149,13 +148,14 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	search_start = extent_buffer_blocknr(buf) & ~((u64)65535);
+	search_start = buf->start & ~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
 	(*cow_ret)->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return ret;
 }
 
+#if 0
 static int close_blocks(u64 blocknr, u64 other)
 {
 	if (blocknr < other && other - blocknr < 8)
@@ -165,7 +165,6 @@ static int close_blocks(u64 blocknr, u64 other)
 	return 0;
 }
 
-#if 0
 static int should_defrag_leaf(struct extent_buffer *eb)
 {
 	return 0;
@@ -355,7 +354,7 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(memcmp(&parent_key, &node_key,
 			      sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_blocknr(node));
+		       btrfs_header_bytenr(node));
 	}
 	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
 	if (slot != 0) {
@@ -398,7 +397,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(memcmp(&parent_key, &leaf_key,
 		       sizeof(struct btrfs_disk_key)));
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-		       btrfs_header_blocknr(leaf));
+		       btrfs_header_bytenr(leaf));
 	}
 #if 0
 	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
@@ -467,14 +466,16 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 static int check_block(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
+#if 0
 	struct extent_buffer *buf = path->nodes[level];
 
 	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
 				 (unsigned long)btrfs_header_fsid(buf),
 				 BTRFS_FSID_SIZE)) {
 		printk("warning bad block %Lu\n", buf->start);
-		BUG();
+		return 1;
 	}
+#endif
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -585,7 +586,8 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		return NULL;
 	if (slot >= btrfs_header_nritems(parent))
 		return NULL;
-	return read_tree_block(root, btrfs_node_blockptr(parent, slot));
+	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
+		       btrfs_level_size(root, btrfs_header_level(parent) - 1));
 }
 
 static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -618,7 +620,6 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	 */
 	if (!parent) {
 		struct extent_buffer *child;
-		u64 blocknr = extent_buffer_blocknr(mid);
 
 		if (btrfs_header_nritems(mid) != 1)
 			return 0;
@@ -632,9 +633,10 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		wait_on_tree_block_writeback(root, mid);
 		/* once for the path */
 		free_extent_buffer(mid);
+		ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
-		return btrfs_free_extent(trans, root, blocknr, 1, 1);
+		return ret;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -680,7 +682,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			u64 blocknr = extent_buffer_blocknr(right);
+			u64 bytenr = right->start;
+			u32 blocksize = right->len;
+
 			clean_tree_block(trans, root, right);
 			wait_on_tree_block_writeback(root, right);
 			free_extent_buffer(right);
@@ -689,7 +693,8 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_extent(trans, root, blocknr, 1, 1);
+			wret = btrfs_free_extent(trans, root, bytenr,
+						 blocksize, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -719,7 +724,8 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	if (btrfs_header_nritems(mid) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 blocknr = extent_buffer_blocknr(mid);
+		u64 bytenr = mid->start;
+		u32 blocksize = mid->len;
 		clean_tree_block(trans, root, mid);
 		wait_on_tree_block_writeback(root, mid);
 		free_extent_buffer(mid);
@@ -727,7 +733,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_extent(trans, root, blocknr, 1, 1);
+		wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -830,7 +836,6 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				path->slots[level] = orig_slot;
 				free_extent_buffer(left);
 			}
-			check_node(root, path, level);
 			return 0;
 		}
 		free_extent_buffer(left);
@@ -874,12 +879,10 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			} else {
 				free_extent_buffer(right);
 			}
-			check_node(root, path, level);
 			return 0;
 		}
 		free_extent_buffer(right);
 	}
-	check_node(root, path, level);
 	return 1;
 }
 
@@ -889,19 +892,23 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			     int level, int slot)
 {
+	return;
+#if 0
 	struct extent_buffer *node;
 	int i;
 	u32 nritems;
-	u64 blocknr;
+	u64 bytenr;
 	u64 search;
 	u64 cluster_start;
 	int ret;
 	int nread = 0;
 	int direction = path->reada;
+	int level;
 	struct radix_tree_root found;
 	unsigned long gang[8];
 	struct extent_buffer *eb;
 
+
 	if (level == 0)
 		return;
 
@@ -918,8 +925,9 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 
 	init_bit_radix(&found);
 	nritems = btrfs_header_nritems(node);
+	level = btrfs_header_level(node) - 1;
 	for (i = slot; i < nritems; i++) {
-		blocknr = btrfs_node_blockptr(node, i);
+		bytenr = btrfs_node_blockptr(node, i);
 		set_radix_bit(&found, blocknr);
 	}
 	if (direction > 0) {
@@ -944,6 +952,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			}
 		}
 	}
+#endif
 }
 /*
  * look for key in the tree.  path is filled in with nodes along the way
@@ -963,7 +972,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	u64 blocknr;
+	u64 bytenr;
 	int slot;
 	int ret;
 	int level;
@@ -1027,10 +1036,11 @@ again:
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level)
 				break;
-			blocknr = btrfs_node_blockptr(b, slot);
+			bytenr = btrfs_node_blockptr(b, slot);
 			if (should_reada)
 				reada_for_search(root, p, level, slot);
-			b = read_tree_block(root, btrfs_node_blockptr(b, slot));
+			b = read_tree_block(root, bytenr,
+					    btrfs_level_size(root, level - 1));
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1193,14 +1203,14 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	c = btrfs_alloc_free_block(trans, root,
-				   extent_buffer_blocknr(root->node), 0);
+	c = btrfs_alloc_free_block(trans, root, root->nodesize,
+				   root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 	memset_extent_buffer(c, 0, 0, root->nodesize);
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
-	btrfs_set_header_blocknr(c, extent_buffer_blocknr(c));
+	btrfs_set_header_bytenr(c, c->start);
 	btrfs_set_header_generation(c, trans->transid);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 	lower = path->nodes[level-1];
@@ -1213,7 +1223,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 	btrfs_set_node_key(c, &lower_key, 0);
-	btrfs_set_node_blockptr(c, 0, extent_buffer_blocknr(lower));
+	btrfs_set_node_blockptr(c, 0, lower->start);
 
 	btrfs_mark_buffer_dirty(c);
 
@@ -1237,7 +1247,7 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
  */
 static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, struct btrfs_disk_key
-		      *key, u64 blocknr, int slot, int level)
+		      *key, u64 bytenr, int slot, int level)
 {
 	struct extent_buffer *lower;
 	int nritems;
@@ -1256,10 +1266,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
 	}
 	btrfs_set_node_key(lower, key, slot);
-	btrfs_set_node_blockptr(lower, slot, blocknr);
+	btrfs_set_node_blockptr(lower, slot, bytenr);
 	btrfs_set_header_nritems(lower, nritems + 1);
 	btrfs_mark_buffer_dirty(lower);
-	check_node(root, path, level);
 	return 0;
 }
 
@@ -1300,14 +1309,14 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	c_nritems = btrfs_header_nritems(c);
-	split = btrfs_alloc_free_block(trans, root,
-				       extent_buffer_blocknr(c), 0);
+	split = btrfs_alloc_free_block(trans, root, root->nodesize,
+				       c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
 	btrfs_set_header_flags(split, btrfs_header_flags(c));
 	btrfs_set_header_level(split, btrfs_header_level(c));
-	btrfs_set_header_blocknr(split, extent_buffer_blocknr(split));
+	btrfs_set_header_bytenr(split, split->start);
 	btrfs_set_header_generation(split, trans->transid);
 	btrfs_set_header_owner(split, root->root_key.objectid);
 	write_extent_buffer(split, root->fs_info->fsid,
@@ -1328,8 +1337,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(split);
 
 	btrfs_node_key(split, &disk_key, 0);
-	wret = insert_ptr(trans, root, path, &disk_key,
-			  extent_buffer_blocknr(split),
+	wret = insert_ptr(trans, root, path, &disk_key, split->start,
 			  path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
@@ -1407,6 +1415,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 left_nritems;
 	u32 right_nritems;
 	u32 data_end;
+	u32 this_item_size;
 	int ret;
 
 	slot = path->slots[1];
@@ -1417,7 +1426,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
-	right = read_tree_block(root, btrfs_node_blockptr(upper, slot + 1));
+	right = read_tree_block(root, btrfs_node_blockptr(upper, slot + 1),
+				root->leafsize);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		free_extent_buffer(right);
@@ -1445,13 +1455,27 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	for (i = left_nritems - 1; i >= 1; i--) {
 		item = btrfs_item_nr(left, i);
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (btrfs_item_size(left, item) + sizeof(*item) + push_space >
-		    free_space)
+
+		if (!left->map_token) {
+			map_extent_buffer(left, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&left->map_token, &left->kaddr,
+					&left->map_start, &left->map_len,
+					KM_USER1);
+		}
+
+		this_item_size = btrfs_item_size(left, item);
+		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
 		push_items++;
-		push_space += btrfs_item_size(left, item) + sizeof(*item);
+		push_space += this_item_size + sizeof(*item);
+	}
+	if (left->map_token) {
+		unmap_extent_buffer(left, left->map_token, KM_USER1);
+		left->map_token = NULL;
 	}
 
 	if (push_items == 0) {
@@ -1493,11 +1517,23 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	right_nritems += push_items;
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
+
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		btrfs_set_item_offset(right, item, push_space -
-				      btrfs_item_size(right, item));
-		push_space = btrfs_item_offset(right, item);
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+		push_space -= btrfs_item_size(right, item);
+		btrfs_set_item_offset(right, item, push_space);
+	}
+
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
 	}
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(left, left_nritems);
@@ -1518,8 +1554,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else {
 		free_extent_buffer(right);
 	}
-	if (path->nodes[1])
-		check_node(root, path, 1);
 	return 0;
 }
 /*
@@ -1542,6 +1576,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 right_nritems;
 	int ret = 0;
 	int wret;
+	u32 this_item_size;
+	u32 old_left_item_size;
 
 	slot = path->slots[1];
 	if (slot == 0)
@@ -1550,7 +1586,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 
 	left = read_tree_block(root, btrfs_node_blockptr(path->nodes[1],
-							 slot - 1));
+			       slot - 1), root->leafsize);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		free_extent_buffer(left);
@@ -1579,14 +1615,30 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	for (i = 0; i < right_nritems - 1; i++) {
 		item = btrfs_item_nr(right, i);
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
-		if (btrfs_item_size(right, item) + sizeof(*item) + push_space >
-		    free_space)
+
+		this_item_size = btrfs_item_size(right, item);
+		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
+
 		push_items++;
-		push_space += btrfs_item_size(right, item) + sizeof(*item);
+		push_space += this_item_size + sizeof(*item);
+	}
+
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
 	}
+
 	if (push_items == 0) {
 		free_extent_buffer(left);
 		return 1;
@@ -1611,15 +1663,28 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	old_left_nritems = btrfs_header_nritems(left);
 	BUG_ON(old_left_nritems < 0);
 
+	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		u32 ioff;
+
 		item = btrfs_item_nr(left, i);
+		if (!left->map_token) {
+			map_extent_buffer(left, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&left->map_token, &left->kaddr,
+					&left->map_start, &left->map_len,
+					KM_USER1);
+		}
+
 		ioff = btrfs_item_offset(left, item);
 		btrfs_set_item_offset(left, item,
-		      ioff - (BTRFS_LEAF_DATA_SIZE(root) -
-		      btrfs_item_offset_nr(left, old_left_nritems - 1)));
+		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
+	if (left->map_token) {
+		unmap_extent_buffer(left, left->map_token, KM_USER1);
+		left->map_token = NULL;
+	}
 
 	/* fixup right node */
 	push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -1640,9 +1705,21 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		btrfs_set_item_offset(right, item, push_space -
-				      btrfs_item_size(right, item));
-		push_space = btrfs_item_offset(right, item);
+
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+
+		push_space = push_space - btrfs_item_size(right, item);
+		btrfs_set_item_offset(right, item, push_space);
+	}
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
 	}
 
 	btrfs_mark_buffer_dirty(left);
@@ -1664,8 +1741,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
-	if (path->nodes[1])
-		check_node(root, path, 1);
 	return ret;
 }
 
@@ -1718,13 +1793,13 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
 
-	right = btrfs_alloc_free_block(trans, root,
-					      extent_buffer_blocknr(l), 0);
+	right = btrfs_alloc_free_block(trans, root, root->leafsize,
+				       l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
-	btrfs_set_header_blocknr(right, extent_buffer_blocknr(right));
+	btrfs_set_header_bytenr(right, right->start);
 	btrfs_set_header_generation(right, trans->transid);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
@@ -1740,8 +1815,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
-						  &disk_key,
-						  extent_buffer_blocknr(right),
+						  &disk_key, right->start,
 						  path->slots[1] + 1, 1);
 				if (wret)
 					ret = wret;
@@ -1762,7 +1836,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
 						  &disk_key,
-						  extent_buffer_blocknr(right),
+						  right->start,
 						  path->slots[1], 1);
 				if (wret)
 					ret = wret;
@@ -1799,15 +1873,30 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	for (i = 0; i < nritems; i++) {
 		struct btrfs_item *item = btrfs_item_nr(right, i);
-		u32 ioff = btrfs_item_offset(right, item);
+		u32 ioff;
+
+		if (!right->map_token) {
+			map_extent_buffer(right, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&right->map_token, &right->kaddr,
+					&right->map_start, &right->map_len,
+					KM_USER1);
+		}
+
+		ioff = btrfs_item_offset(right, item);
 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
 	}
 
+	if (right->map_token) {
+		unmap_extent_buffer(right, right->map_token, KM_USER1);
+		right->map_token = NULL;
+	}
+
 	btrfs_set_header_nritems(l, mid);
 	ret = 0;
 	btrfs_item_key(right, &disk_key, 0);
-	wret = insert_ptr(trans, root, path, &disk_key,
-			  extent_buffer_blocknr(right), path->slots[1] + 1, 1);
+	wret = insert_ptr(trans, root, path, &disk_key, right->start,
+			  path->slots[1] + 1, 1);
 	if (wret)
 		ret = wret;
 
@@ -1824,19 +1913,17 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		free_extent_buffer(right);
 
 	BUG_ON(path->slots[0] < 0);
-	check_node(root, path, 1);
-	check_leaf(root, path, 0);
 
 	if (!double_split)
 		return ret;
 
-	right = btrfs_alloc_free_block(trans, root,
-				       extent_buffer_blocknr(l), 0);
+	right = btrfs_alloc_free_block(trans, root, root->leafsize,
+				       l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
-	btrfs_set_header_blocknr(right, extent_buffer_blocknr(right));
+	btrfs_set_header_bytenr(right, right->start);
 	btrfs_set_header_generation(right, trans->transid);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
@@ -1847,8 +1934,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_cpu_key_to_disk(&disk_key, ins_key);
 	btrfs_set_header_nritems(right, 0);
 	wret = insert_ptr(trans, root, path,
-			  &disk_key,
-			  extent_buffer_blocknr(right),
+			  &disk_key, right->start,
 			  path->slots[1], 1);
 	if (wret)
 		ret = wret;
@@ -1860,8 +1946,6 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	free_extent_buffer(path->nodes[0]);
 	path->nodes[0] = right;
 	path->slots[0] = 0;
-	check_node(root, path, 1);
-	check_leaf(root, path, 0);
 	return ret;
 }
 
@@ -1904,9 +1988,24 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 		item = btrfs_item_nr(leaf, i);
+
+		if (!leaf->map_token) {
+			map_extent_buffer(leaf, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&leaf->map_token, &leaf->kaddr,
+					&leaf->map_start, &leaf->map_len,
+					KM_USER1);
+		}
+
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff + size_diff);
 	}
+
+	if (leaf->map_token) {
+		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+		leaf->map_token = NULL;
+	}
+
 	/* shift the data */
 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end + size_diff, btrfs_leaf_data(leaf) +
@@ -1921,7 +2020,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	check_leaf(root, path, 0);
 	return ret;
 }
 
@@ -1963,10 +2061,23 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 		item = btrfs_item_nr(leaf, i);
+
+		if (!leaf->map_token) {
+			map_extent_buffer(leaf, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&leaf->map_token, &leaf->kaddr,
+					&leaf->map_start, &leaf->map_len,
+					KM_USER1);
+		}
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff - data_size);
 	}
 
+	if (leaf->map_token) {
+		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+		leaf->map_token = NULL;
+	}
+
 	/* shift the data */
 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end - data_size, btrfs_leaf_data(leaf) +
@@ -1983,7 +2094,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	check_leaf(root, path, 0);
 	return ret;
 }
 
@@ -2046,12 +2156,26 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
+		WARN_ON(leaf->map_token);
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
+
 			item = btrfs_item_nr(leaf, i);
+			if (!leaf->map_token) {
+				map_extent_buffer(leaf, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&leaf->map_token, &leaf->kaddr,
+					&leaf->map_start, &leaf->map_len,
+					KM_USER1);
+			}
+
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff - data_size);
 		}
+		if (leaf->map_token) {
+			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+			leaf->map_token = NULL;
+		}
 
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
@@ -2081,7 +2205,6 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	check_leaf(root, path, 0);
 out:
 	return ret;
 }
@@ -2186,10 +2309,24 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 		for (i = slot + 1; i < nritems; i++) {
 			u32 ioff;
+
 			item = btrfs_item_nr(leaf, i);
+			if (!leaf->map_token) {
+				map_extent_buffer(leaf, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&leaf->map_token, &leaf->kaddr,
+					&leaf->map_start, &leaf->map_len,
+					KM_USER1);
+			}
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff + dsize);
 		}
+
+		if (leaf->map_token) {
+			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+			leaf->map_token = NULL;
+		}
+
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
 			      btrfs_item_nr_offset(slot + 1),
 			      sizeof(struct btrfs_item) *
@@ -2209,8 +2346,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
-						 extent_buffer_blocknr(leaf),
-						 1, 1);
+						 leaf->start, leaf->len, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -2247,7 +2383,8 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 
 			if (btrfs_header_nritems(leaf) == 0) {
-				u64 blocknr = extent_buffer_blocknr(leaf);
+				u64 bytenr = leaf->start;
+				u32 blocksize = leaf->len;
 
 				clean_tree_block(trans, root, leaf);
 				wait_on_tree_block_writeback(root, leaf);
@@ -2257,8 +2394,8 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					ret = wret;
 
 				free_extent_buffer(leaf);
-				wret = btrfs_free_extent(trans, root, blocknr,
-							 1, 1);
+				wret = btrfs_free_extent(trans, root, bytenr,
+							 blocksize, 1);
 				if (wret)
 					ret = wret;
 			} else {
@@ -2281,7 +2418,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
 	int level = 1;
-	u64 blocknr;
+	u64 bytenr;
 	struct extent_buffer *c;
 	struct extent_buffer *next = NULL;
 
@@ -2296,14 +2433,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			continue;
 		}
 
-		blocknr = btrfs_node_blockptr(c, slot);
+		bytenr = btrfs_node_blockptr(c, slot);
 		if (next)
 			free_extent_buffer(next);
 
 		if (path->reada)
 			reada_for_search(root, path, level, slot);
 
-		next = read_tree_block(root, blocknr);
+		next = read_tree_block(root, bytenr,
+				       btrfs_level_size(root, level -1));
 		break;
 	}
 	path->slots[level] = slot;
@@ -2317,7 +2455,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			break;
 		if (path->reada)
 			reada_for_search(root, path, level, 0);
-		next = read_tree_block(root, btrfs_node_blockptr(next, 0));
+		next = read_tree_block(root, btrfs_node_blockptr(next, 0),
+				       btrfs_level_size(root, level - 1));
 	}
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 256689551eb..4d05456ec32 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -96,7 +96,7 @@ struct btrfs_key {
 struct btrfs_header {
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
-	__le64 blocknr; /* which block this node is supposed to live in */
+	__le64 bytenr; /* which block this node is supposed to live in */
 	__le64 generation;
 	__le64 owner;
 	__le32 nritems;
@@ -122,16 +122,17 @@ struct btrfs_super_block {
 	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 3 fields must match struct btrfs_header */
 	u8 fsid[16];    /* FS specific uuid */
-	__le64 blocknr; /* this block number */
+	__le64 bytenr; /* byte offset of this block */
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
-	__le64 total_blocks;
-	__le64 blocks_used;
+	__le64 total_bytes;
+	__le64 bytes_used;
 	__le64 root_dir_objectid;
 	__le32 sectorsize;
 	__le32 nodesize;
 	__le32 leafsize;
+	u8 root_level;
 } __attribute__ ((__packed__));
 
 /*
@@ -231,13 +232,14 @@ struct btrfs_dir_item {
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
 	__le64 root_dirid;
-	__le64 blocknr;
-	__le64 block_limit;
-	__le64 blocks_used;
+	__le64 bytenr;
+	__le64 byte_limit;
+	__le64 bytes_used;
 	__le32 flags;
 	__le32 refs;
 	struct btrfs_disk_key drop_progress;
 	u8 drop_level;
+	u8 level;
 } __attribute__ ((__packed__));
 
 #define BTRFS_FILE_EXTENT_REG 0
@@ -250,8 +252,8 @@ struct btrfs_file_extent_item {
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
 	 */
-	__le64 disk_blocknr;
-	__le64 disk_num_blocks;
+	__le64 disk_bytenr;
+	__le64 disk_num_bytes;
 	/*
 	 * the logical offset in file blocks (no csums)
 	 * this extent record is for.  This allows a file extent to point
@@ -263,7 +265,7 @@ struct btrfs_file_extent_item {
 	/*
 	 * the logical number of file blocks (no csums included)
 	 */
-	__le64 num_blocks;
+	__le64 num_bytes;
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -429,6 +431,7 @@ static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
 	int err;							\
 	char *map_token;						\
 	char *kaddr;							\
+	int unmap_on_exit = (eb->map_token == NULL);			\
 	unsigned long map_start;					\
 	unsigned long map_len;						\
 	unsigned long offset = (unsigned long)s +			\
@@ -436,12 +439,13 @@ static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
 	err = map_extent_buffer(eb, offset,				\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER0);	\
+				&map_start, &map_len, KM_USER1);	\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
 		u##bits res = le##bits##_to_cpu(*tmp);			\
-		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 		return res;						\
 	} else {							\
 		__le##bits res;						\
@@ -457,17 +461,19 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 	char *kaddr;							\
 	unsigned long map_start;					\
 	unsigned long map_len;						\
+	int unmap_on_exit = (eb->map_token == NULL);			\
 	unsigned long offset = (unsigned long)s +			\
 				offsetof(type, member);			\
 	err = map_extent_buffer(eb, offset,				\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER0);	\
+				&map_start, &map_len, KM_USER1);	\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
 		*tmp = cpu_to_le##bits(val);				\
-		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 	} else {							\
 		val = cpu_to_le##bits(val);				\
 		write_eb_member(eb, s, type, member, &val);		\
@@ -483,15 +489,17 @@ static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 	unsigned long map_start;					\
 	unsigned long map_len;						\
 	unsigned long offset = offsetof(type, member);			\
+	int unmap_on_exit = (eb->map_token == NULL);			\
 	err = map_extent_buffer(eb, offset,				\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER0);	\
+				&map_start, &map_len, KM_USER1);	\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
 		u##bits res = le##bits##_to_cpu(*tmp);			\
-		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 		return res;						\
 	} else {							\
 		__le##bits res;						\
@@ -508,15 +516,17 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 	unsigned long map_start;					\
 	unsigned long map_len;						\
 	unsigned long offset = offsetof(type, member);			\
+	int unmap_on_exit = (eb->map_token == NULL);			\
 	err = map_extent_buffer(eb, offset,				\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER0);	\
+				&map_start, &map_len, KM_USER1);	\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
 		*tmp = cpu_to_le##bits(val);				\
-		unmap_extent_buffer(eb, map_token, KM_USER0);		\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 	} else {							\
 		val = cpu_to_le##bits(val);				\
 		write_eb_member(eb, NULL, type, member, &val);		\
@@ -769,7 +779,7 @@ static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
 }
 
 /* struct btrfs_header */
-BTRFS_SETGET_HEADER_FUNCS(header_blocknr, struct btrfs_header, blocknr, 64);
+BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
 			  generation, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
@@ -817,24 +827,28 @@ static inline int btrfs_is_leaf(struct extent_buffer *eb)
 
 /* struct btrfs_root_item */
 BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_FUNCS(disk_root_blocknr, struct btrfs_root_item, blocknr, 64);
+BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
 
-BTRFS_SETGET_STACK_FUNCS(root_blocknr, struct btrfs_root_item, blocknr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
 BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
 BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
-BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, blocks_used, 64);
-BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, block_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 
 /* struct btrfs_super_block */
-BTRFS_SETGET_STACK_FUNCS(super_blocknr, struct btrfs_super_block, blocknr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
-BTRFS_SETGET_STACK_FUNCS(super_total_blocks, struct btrfs_super_block,
-		   total_blocks, 64);
-BTRFS_SETGET_STACK_FUNCS(super_blocks_used, struct btrfs_super_block,
-		   blocks_used, 64);
+BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
+			 root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
+			 total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
+			 bytes_used, 64);
 BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
 			 sectorsize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
@@ -856,33 +870,33 @@ static inline unsigned long btrfs_file_extent_inline_start(struct
 						   btrfs_file_extent_item *e)
 {
 	unsigned long offset = (unsigned long)e;
-	offset += offsetof(struct btrfs_file_extent_item, disk_blocknr);
+	offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
 	return offset;
 }
 
 static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 {
-	return offsetof(struct btrfs_file_extent_item, disk_blocknr) + datasize;
+	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
 static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
 					       struct btrfs_item *e)
 {
 	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_blocknr);
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
 	return btrfs_item_size(eb, e) - offset;
 }
 
-BTRFS_SETGET_FUNCS(file_extent_disk_blocknr, struct btrfs_file_extent_item,
-		   disk_blocknr, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
+		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
 		   generation, 64);
-BTRFS_SETGET_FUNCS(file_extent_disk_num_blocks, struct btrfs_file_extent_item,
-		   disk_num_blocks, 64);
+BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item,
+		   disk_num_bytes, 64);
 BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		  offset, 64);
-BTRFS_SETGET_FUNCS(file_extent_num_blocks, struct btrfs_file_extent_item,
-		   num_blocks, 64);
+BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
+		   num_bytes, 64);
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -906,6 +920,12 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 	return 0;
 }
 
+/* size in bytes of a tree block at @level: leafsize for 0, else nodesize */
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
+	return level == 0 ? root->leafsize : root->nodesize;
+}
+
 /* helper function to cast into the data area of the leaf. */
 #define btrfs_item_ptr(leaf, slot, type) \
 	((type *)(btrfs_leaf_data(leaf) + \
@@ -927,7 +947,7 @@ int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
-							 u64 blocknr);
+							 u64 bytenr);
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
@@ -935,22 +955,22 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root, u64 hint,
-					    u64 empty_size);
+					    struct btrfs_root *root, u32 size,
+					    u64 hint, u64 empty_size);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 empty_size, u64 search_start,
+		       u64 num_bytes, u64 empty_size, u64 search_start,
 		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 blocknr, u64 num_blocks, int pin);
+		      *root, u64 bytenr, u64 num_bytes, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_map_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 blocknr, u64 num_blocks);
+				u64 bytenr, u64 num_bytes);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
@@ -1040,12 +1060,12 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos, u64 offset,
-			       u64 disk_num_blocks,
-			       u64 num_blocks);
+			       u64 disk_num_bytes,
+			       u64 num_bytes);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
-			     u64 blocknr, int mod);
+			     u64 bytenr, int mod);
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2b86a1d779b..fad9298c696 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "print-tree.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -43,26 +44,25 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 blocknr)
+					    u64 bytenr, u32 blocksize)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
 	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
-				   blocknr * root->sectorsize,
-				   root->sectorsize, GFP_NOFS);
+				bytenr, blocksize, GFP_NOFS);
 	if (eb)
 		eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return eb;
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-						 u64 blocknr)
+						 u64 bytenr, u32 blocksize)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
+
 	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
-				   blocknr * root->sectorsize,
-				   root->sectorsize, GFP_NOFS);
+				 bytenr, blocksize, GFP_NOFS);
 	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return eb;
 }
@@ -208,13 +208,13 @@ static struct address_space_operations btree_aops = {
 	.sync_page	= block_sync_page,
 };
 
-int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	int ret = 0;
 
-	buf = btrfs_find_create_tree_block(root, blocknr);
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
@@ -223,12 +223,13 @@ int readahead_tree_block(struct btrfs_root *root, u64 blocknr)
 	return ret;
 }
 
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr)
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+				      u32 blocksize)
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
-	buf = btrfs_find_create_tree_block(root, blocknr);
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
@@ -261,7 +262,7 @@ int set_tree_block_dirty(struct btrfs_root *root, struct extent_buffer *buf)
 	return 0;
 }
 
-static int __setup_root(int blocksize,
+static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
 			u64 objectid)
@@ -269,9 +270,9 @@ static int __setup_root(int blocksize,
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
-	root->sectorsize = blocksize;
-	root->nodesize = blocksize;
-	root->leafsize = blocksize;
+	root->sectorsize = sectorsize;
+	root->nodesize = nodesize;
+	root->leafsize = leafsize;
 	root->ref_cows = 0;
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -291,21 +292,23 @@ static int __setup_root(int blocksize,
 	return 0;
 }
 
-static int find_and_setup_root(int blocksize,
-			       struct btrfs_root *tree_root,
+static int find_and_setup_root(struct btrfs_root *tree_root,
 			       struct btrfs_fs_info *fs_info,
 			       u64 objectid,
 			       struct btrfs_root *root)
 {
 	int ret;
+	u32 blocksize;
 
-	__setup_root(blocksize, root, fs_info, objectid);
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
 
-	root->node = read_tree_block(root,
-				     btrfs_root_blocknr(&root->root_item));
+	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+				     blocksize);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -318,14 +321,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 	struct btrfs_path *path;
 	struct extent_buffer *l;
 	u64 highest_inode;
+	u32 blocksize;
 	int ret = 0;
 
 	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 	if (location->offset == (u64)-1) {
-		ret = find_and_setup_root(fs_info->sb->s_blocksize,
-					  fs_info->tree_root, fs_info,
+		ret = find_and_setup_root(tree_root, fs_info,
 					  location->objectid, root);
 		if (ret) {
 			kfree(root);
@@ -334,7 +337,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 		goto insert;
 	}
 
-	__setup_root(fs_info->sb->s_blocksize, root, fs_info,
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, root, fs_info,
 		     location->objectid);
 
 	path = btrfs_alloc_path();
@@ -357,8 +361,9 @@ out:
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-	root->node = read_tree_block(root,
-				     btrfs_root_blocknr(&root->root_item));
+	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+				     blocksize);
 	BUG_ON(!root->node);
 insert:
 	root->ref_cows = 1;
@@ -418,6 +423,10 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 
 struct btrfs_root *open_ctree(struct super_block *sb)
 {
+	u32 sectorsize;
+	u32 nodesize;
+	u32 leafsize;
+	u32 blocksize;
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -474,12 +483,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
-	__setup_root(sb->s_blocksize, tree_root,
+	__setup_root(512, 512, 512, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	fs_info->sb_buffer = read_tree_block(tree_root,
-					     BTRFS_SUPER_INFO_OFFSET /
-					     sb->s_blocksize);
+					     BTRFS_SUPER_INFO_OFFSET,
+					     512);
 
 	if (!fs_info->sb_buffer)
 		goto fail_iput;
@@ -494,9 +503,15 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
+	nodesize = btrfs_super_nodesize(disk_super);
+	leafsize = btrfs_super_leafsize(disk_super);
+	sectorsize = btrfs_super_sectorsize(disk_super);
+	tree_root->nodesize = nodesize;
+	tree_root->leafsize = leafsize;
+	tree_root->sectorsize = sectorsize;
+
 	i_size_write(fs_info->btree_inode,
-		     btrfs_super_total_blocks(disk_super) <<
-		     fs_info->btree_inode->i_blkbits);
+		     btrfs_super_total_bytes(disk_super));
 
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
@@ -504,13 +519,22 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		printk("btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
+	blocksize = btrfs_level_size(tree_root,
+				     btrfs_super_root_level(disk_super));
 	tree_root->node = read_tree_block(tree_root,
-					  btrfs_super_root(disk_super));
+					  btrfs_super_root(disk_super),
+					  blocksize);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
+#if 0
+	btrfs_print_leaf(tree_root, tree_root->node);
+	err = -EIO;
+	goto fail_tree_root;
+#endif
 	mutex_lock(&fs_info->fs_mutex);
-	ret = find_and_setup_root(sb->s_blocksize, tree_root, fs_info,
+
+	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret) {
 		mutex_unlock(&fs_info->fs_mutex);
@@ -611,11 +635,11 @@ int close_ctree(struct btrfs_root *root)
 		free_extent_buffer(fs_info->tree_root->node);
 
 	free_extent_buffer(fs_info->sb_buffer);
-	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
-	iput(fs_info->btree_inode);
 
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
+	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+	iput(fs_info->btree_inode);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
 	return 0;
@@ -642,7 +666,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
-			(unsigned long long)extent_buffer_blocknr(buf),
+			(unsigned long long)buf->start,
 			transid, root->fs_info->generation);
 		WARN_ON(1);
 	}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 70d9413c599..fd4db5f810c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,10 +21,11 @@
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 blocknr);
-int readahead_tree_block(struct btrfs_root *root, u64 blocknr);
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+				      u32 blocksize);
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-						   u64 blocknr);
+						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
 struct btrfs_root *open_ctree(struct super_block *sb);
@@ -32,7 +33,7 @@ int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 blocknr);
+					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 477466d167a..1be8f9f04a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -129,7 +129,7 @@ err:
 
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
-							 u64 blocknr)
+							 u64 bytenr)
 {
 	struct extent_map_tree *block_group_cache;
 	struct btrfs_block_group_cache *block_group = NULL;
@@ -140,7 +140,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
-				    blocknr, &start, &end,
+				    bytenr, &start, &end,
 				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA);
 	if (ret) {
 		return NULL;
@@ -152,7 +152,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	block_group = (struct btrfs_block_group_cache *)ptr;
 
 
-	if (block_group->key.objectid <= blocknr && blocknr <=
+	if (block_group->key.objectid <= bytenr && bytenr <=
 	    block_group->key.objectid + block_group->key.offset)
 		return block_group;
 
@@ -315,7 +315,7 @@ found:
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 blocknr, u64 num_blocks)
+				u64 bytenr, u64 num_bytes)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -324,13 +324,14 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 	u32 refs;
 
+	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	key.objectid = blocknr;
+	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_blocks;
+	key.offset = num_bytes;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
 	if (ret < 0)
@@ -361,8 +362,8 @@ int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 }
 
 static int lookup_extent_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 blocknr,
-			     u64 num_blocks, u32 *refs)
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u32 *refs)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -370,9 +371,10 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 	struct extent_buffer *l;
 	struct btrfs_extent_item *item;
 
+	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
-	key.objectid = blocknr;
-	key.offset = num_blocks;
+	key.objectid = bytenr;
+	key.offset = num_bytes;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 0);
@@ -380,7 +382,7 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("failed to find block number %Lu\n", blocknr);
+		printk("failed to find block number %Lu\n", bytenr);
 		BUG();
 	}
 	l = path->nodes[0];
@@ -394,19 +396,19 @@ out:
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root)
 {
-	return btrfs_inc_extent_ref(trans, root,
-				    extent_buffer_blocknr(root->node), 1);
+	return btrfs_inc_extent_ref(trans, root, root->node->start,
+				    root->node->len);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf)
 {
-	u64 blocknr;
+	u64 bytenr;
 	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
-	int leaf;
+	int level;
 	int ret;
 	int faili;
 	int err;
@@ -414,11 +416,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (!root->ref_cows)
 		return 0;
 
-	leaf = btrfs_is_leaf(buf);
+	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 	for (i = 0; i < nritems; i++) {
-		if (leaf) {
-			u64 disk_blocknr;
+		if (level == 0) {
+			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
@@ -427,18 +429,19 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi);
-			if (disk_blocknr == 0)
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
 				continue;
-			ret = btrfs_inc_extent_ref(trans, root, disk_blocknr,
-				    btrfs_file_extent_disk_num_blocks(buf, fi));
+			ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
+				    btrfs_file_extent_disk_num_bytes(buf, fi));
 			if (ret) {
 				faili = i;
 				goto fail;
 			}
 		} else {
-			blocknr = btrfs_node_blockptr(buf, i);
-			ret = btrfs_inc_extent_ref(trans, root, blocknr, 1);
+			bytenr = btrfs_node_blockptr(buf, i);
+			ret = btrfs_inc_extent_ref(trans, root, bytenr,
+					   btrfs_level_size(root, level - 1));
 			if (ret) {
 				faili = i;
 				goto fail;
@@ -449,8 +452,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 fail:
 	WARN_ON(1);
 	for (i =0; i < faili; i++) {
-		if (leaf) {
-			u64 disk_blocknr;
+		if (level == 0) {
+			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
@@ -459,16 +462,17 @@ fail:
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_blocknr = btrfs_file_extent_disk_blocknr(buf, fi);
-			if (disk_blocknr == 0)
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
 				continue;
-			err = btrfs_free_extent(trans, root, disk_blocknr,
-				    btrfs_file_extent_disk_num_blocks(buf,
+			err = btrfs_free_extent(trans, root, disk_bytenr,
+				    btrfs_file_extent_disk_num_bytes(buf,
 								      fi), 0);
 			BUG_ON(err);
 		} else {
-			blocknr = btrfs_node_blockptr(buf, i);
-			err = btrfs_free_extent(trans, root, blocknr, 1, 0);
+			bytenr = btrfs_node_blockptr(buf, i);
+			err = btrfs_free_extent(trans, root, bytenr,
+					btrfs_level_size(root, level - 1), 0);
 			BUG_ON(err);
 		}
 	}
@@ -558,31 +562,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 blocknr, u64 num, int alloc, int mark_free,
-			      int data)
+			      u64 bytenr, u64 num_bytes, int alloc,
+			      int mark_free, int data)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
-	u64 total = num;
+	u64 total = num_bytes;
 	u64 old_val;
-	u64 block_in_group;
+	u64 byte_in_group;
 	u64 start;
 	u64 end;
 
 	while(total) {
-		cache = btrfs_lookup_block_group(info, blocknr);
+		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
 			return -1;
 		}
-		block_in_group = blocknr - cache->key.objectid;
-		WARN_ON(block_in_group > cache->key.offset);
+		byte_in_group = bytenr - cache->key.objectid;
+		WARN_ON(byte_in_group > cache->key.offset);
 		start = cache->key.objectid;
 		end = start + cache->key.offset - 1;
 		set_extent_bits(&info->block_group_cache, start, end,
 				BLOCK_GROUP_DIRTY, GFP_NOFS);
 
 		old_val = btrfs_block_group_used(&cache->item);
-		num = min(total, cache->key.offset - block_in_group);
+		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
 			if (cache->data != data &&
 			    old_val < (cache->key.offset >> 1)) {
@@ -608,18 +612,18 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 						start, end, bit_to_set,
 						GFP_NOFS);
 			}
-			old_val += num;
+			old_val += num_bytes;
 		} else {
-			old_val -= num;
+			old_val -= num_bytes;
 			if (mark_free) {
 				set_extent_dirty(&info->free_space_cache,
-						 blocknr, blocknr + num - 1,
+						 bytenr, bytenr + num_bytes - 1,
 						 GFP_NOFS);
 			}
 		}
 		btrfs_set_block_group_used(&cache->item, old_val);
-		total -= num;
-		blocknr += num;
+		total -= num_bytes;
+		bytenr += num_bytes;
 	}
 	return 0;
 }
@@ -701,13 +705,14 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	return 0;
 }
 
-static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
+static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
+			  int pending)
 {
 	int err = 0;
 	struct extent_buffer *buf;
 
 	if (!pending) {
-		buf = btrfs_find_tree_block(root, blocknr);
+		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
 			if (btrfs_buffer_uptodate(buf)) {
 				u64 transid =
@@ -720,10 +725,11 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
 			free_extent_buffer(buf);
 		}
 		set_extent_dirty(&root->fs_info->pinned_extents,
-				 blocknr, blocknr, GFP_NOFS);
+				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
-				  blocknr, blocknr, EXTENT_LOCKED, GFP_NOFS);
+				bytenr, bytenr + num_bytes - 1,
+				EXTENT_LOCKED, GFP_NOFS);
 	}
 	BUG_ON(err < 0);
 	return 0;
@@ -733,7 +739,7 @@ static int pin_down_block(struct btrfs_root *root, u64 blocknr, int pending)
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 blocknr, u64 num_blocks, int pin,
+			 *root, u64 bytenr, u64 num_bytes, int pin,
 			 int mark_free)
 {
 	struct btrfs_path *path;
@@ -745,9 +751,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
-	key.objectid = blocknr;
+	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_blocks;
+	key.offset = num_bytes;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -768,28 +774,29 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (refs == 0) {
-		u64 super_blocks_used, root_blocks_used;
+		u64 super_used;
+		u64 root_used;
 
 		if (pin) {
-			ret = pin_down_block(root, blocknr, 0);
+			ret = pin_down_bytes(root, bytenr, num_bytes, 0);
 			BUG_ON(ret);
 		}
 
 		/* block accounting for super block */
-		super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
-		btrfs_set_super_blocks_used(&info->super_copy,
-					    super_blocks_used - num_blocks);
+		super_used = btrfs_super_bytes_used(&info->super_copy);
+		btrfs_set_super_bytes_used(&info->super_copy,
+					   super_used - num_bytes);
 
 		/* block accounting for root item */
-		root_blocks_used = btrfs_root_used(&root->root_item);
+		root_used = btrfs_root_used(&root->root_item);
 		btrfs_set_root_used(&root->root_item,
-					   root_blocks_used - num_blocks);
+					   root_used - num_bytes);
 
 		ret = btrfs_del_item(trans, extent_root, path);
 		if (ret) {
 			return ret;
 		}
-		ret = update_block_group(trans, root, blocknr, num_blocks, 0,
+		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free, 0);
 		BUG_ON(ret);
 	}
@@ -836,17 +843,18 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
  * remove an extent from the root, returns 0 on success
  */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 blocknr, u64 num_blocks, int pin)
+		      *root, u64 bytenr, u64 num_bytes, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
 	int ret;
 
+	WARN_ON(num_bytes < root->sectorsize);
 	if (root == extent_root) {
-		pin_down_block(root, blocknr, 1);
+		pin_down_bytes(root, bytenr, num_bytes, 1);
 		return 0;
 	}
-	ret = __free_extent(trans, root, blocknr, num_blocks, pin, pin == 0);
+	ret = __free_extent(trans, root, bytenr, num_bytes, pin, pin == 0);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
@@ -860,8 +868,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
  * Any available blocks before search_start are skipped.
  */
 static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_blocks, u64 empty_size,
-			    u64 search_start, u64 search_end, u64 hint_block,
+			    *orig_root, u64 num_bytes, u64 empty_size,
+			    u64 search_start, u64 search_end, u64 hint_byte,
 			    struct btrfs_key *ins, u64 exclude_start,
 			    u64 exclude_nr, int data)
 {
@@ -870,30 +878,29 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	u64 hole_size = 0;
 	int slot = 0;
-	u64 last_block = 0;
+	u64 last_byte = 0;
 	u64 orig_search_start = search_start;
 	int start_found;
 	struct extent_buffer *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
-	int total_needed = num_blocks;
+	u64 total_needed = num_bytes;
 	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
-	u64 cached_search_start = 0;
 
-	WARN_ON(num_blocks < 1);
+	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
 	level = btrfs_header_level(root->node);
 
 	if (search_end == (u64)-1)
-		search_end = btrfs_super_total_blocks(&info->super_copy);
-	if (hint_block) {
-		block_group = btrfs_lookup_block_group(info, hint_block);
+		search_end = btrfs_super_total_bytes(&info->super_copy);
+	if (hint_byte) {
+		block_group = btrfs_lookup_block_group(info, hint_byte);
 		block_group = btrfs_find_block_group(root, block_group,
-						     hint_block, data, 1);
+						     hint_byte, data, 1);
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group, 0,
@@ -906,7 +913,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 check_failed:
 	search_start = find_search_start(root, &block_group,
 					 search_start, total_needed, data);
-	cached_search_start = search_start;
 
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -958,27 +964,27 @@ check_failed:
 				start_found = 1;
 				goto check_pending;
 			}
-			ins->objectid = last_block > search_start ?
-					last_block : search_start;
+			ins->objectid = last_byte > search_start ?
+					last_byte : search_start;
 			ins->offset = search_end - ins->objectid;
 			goto check_pending;
 		}
 		btrfs_item_key_to_cpu(l, &key, slot);
 
-		if (key.objectid >= search_start && key.objectid > last_block &&
+		if (key.objectid >= search_start && key.objectid > last_byte &&
 		    start_found) {
-			if (last_block < search_start)
-				last_block = search_start;
-			hole_size = key.objectid - last_block;
-			if (hole_size >= num_blocks) {
-				ins->objectid = last_block;
+			if (last_byte < search_start)
+				last_byte = search_start;
+			hole_size = key.objectid - last_byte;
+			if (hole_size >= num_bytes) {
+				ins->objectid = last_byte;
 				ins->offset = hole_size;
 				goto check_pending;
 			}
 		}
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
 			if (!start_found) {
-				last_block = key.objectid;
+				last_byte = key.objectid;
 				start_found = 1;
 			}
 			goto next;
@@ -986,9 +992,9 @@ check_failed:
 
 
 		start_found = 1;
-		last_block = key.objectid + key.offset;
+		last_byte = key.objectid + key.offset;
 
-		if (!full_scan && last_block >= block_group->key.objectid +
+		if (!full_scan && last_byte >= block_group->key.objectid +
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
 			search_start = block_group->key.objectid +
@@ -1006,20 +1012,20 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(ins->objectid < search_start);
 
-	if (ins->objectid + num_blocks >= search_end)
+	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
 
 	if (test_range_bit(&info->extent_ins, ins->objectid,
-			   ins->objectid + num_blocks -1, EXTENT_LOCKED, 0)) {
-		search_start = ins->objectid + num_blocks;
+			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
+		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
 	if (test_range_bit(&info->pinned_extents, ins->objectid,
-			   ins->objectid + num_blocks -1, EXTENT_DIRTY, 0)) {
-		search_start = ins->objectid + num_blocks;
+			   ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
+		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
-	if (exclude_nr > 0 && (ins->objectid + num_blocks > exclude_start &&
+	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
@@ -1029,12 +1035,12 @@ check_pending:
 		if (block_group)
 			trans->block_group = block_group;
 	}
-	ins->offset = num_blocks;
+	ins->offset = num_bytes;
 	btrfs_free_path(path);
 	return 0;
 
 new_group:
-	if (search_start + num_blocks >= search_end) {
+	if (search_start + num_bytes >= search_end) {
 enospc:
 		search_start = orig_search_start;
 		if (full_scan) {
@@ -1069,12 +1075,12 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner,
-		       u64 num_blocks, u64 empty_size, u64 hint_block,
+		       u64 num_bytes, u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
 	int pending_ret;
-	u64 super_blocks_used, root_blocks_used;
+	u64 super_used, root_used;
 	u64 search_start = 0;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
@@ -1083,9 +1089,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	btrfs_set_stack_extent_owner(&extent_item, owner);
 
-	WARN_ON(num_blocks < 1);
-	ret = find_free_extent(trans, root, num_blocks, empty_size,
-			       search_start, search_end, hint_block, ins,
+	WARN_ON(num_bytes < root->sectorsize);
+	ret = find_free_extent(trans, root, num_bytes, empty_size,
+			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
 	BUG_ON(ret);
@@ -1093,21 +1099,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		return ret;
 
 	/* block accounting for super block */
-	super_blocks_used = btrfs_super_blocks_used(&info->super_copy);
-	btrfs_set_super_blocks_used(&info->super_copy, super_blocks_used +
-				    num_blocks);
+	super_used = btrfs_super_bytes_used(&info->super_copy);
+	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
 
 	/* block accounting for root item */
-	root_blocks_used = btrfs_root_used(&root->root_item);
-	btrfs_set_root_used(&root->root_item, root_blocks_used +
-				   num_blocks);
+	root_used = btrfs_root_used(&root->root_item);
+	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
 	clear_extent_dirty(&root->fs_info->free_space_cache,
 			   ins->objectid, ins->objectid + ins->offset - 1,
 			   GFP_NOFS);
 
 	if (root == extent_root) {
-		BUG_ON(num_blocks != 1);
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
@@ -1146,7 +1149,8 @@ update_block:
  * returns the tree buffer or NULL.
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root, u64 hint,
+					     struct btrfs_root *root,
+					     u32 blocksize, u64 hint,
 					     u64 empty_size)
 {
 	struct btrfs_key ins;
@@ -1154,14 +1158,15 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *buf;
 
 	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 1, empty_size, hint, (u64)-1, &ins, 0);
+				 blocksize, empty_size, hint,
+				 (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
 	}
-	buf = btrfs_find_create_tree_block(root, ins.objectid);
+	buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
 	if (!buf) {
-		btrfs_free_extent(trans, root, ins.objectid, 1, 0);
+		btrfs_free_extent(trans, root, ins.objectid, blocksize, 0);
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_buffer_uptodate(buf);
@@ -1191,7 +1196,7 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	for (i = 0; i < nritems; i++) {
-		u64 disk_blocknr;
+		u64 disk_bytenr;
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1204,11 +1209,11 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 		 * FIXME make sure to insert a trans record that
 		 * repeats the snapshot del on crash
 		 */
-		disk_blocknr = btrfs_file_extent_disk_blocknr(leaf, fi);
-		if (disk_blocknr == 0)
+		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		if (disk_bytenr == 0)
 			continue;
-		ret = btrfs_free_extent(trans, root, disk_blocknr,
-				btrfs_file_extent_disk_num_blocks(leaf, fi), 0);
+		ret = btrfs_free_extent(trans, root, disk_bytenr,
+				btrfs_file_extent_disk_num_bytes(leaf, fi), 0);
 		BUG_ON(ret);
 	}
 	return 0;
@@ -1219,19 +1224,23 @@ static void reada_walk_down(struct btrfs_root *root,
 {
 	int i;
 	u32 nritems;
-	u64 blocknr;
+	u64 bytenr;
 	int ret;
 	u32 refs;
+	int level;
+	u32 blocksize;
 
 	nritems = btrfs_header_nritems(node);
+	level = btrfs_header_level(node);
 	for (i = 0; i < nritems; i++) {
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = lookup_extent_ref(NULL, root, blocknr, 1, &refs);
+		bytenr = btrfs_node_blockptr(node, i);
+		blocksize = btrfs_level_size(root, level - 1);
+		ret = lookup_extent_ref(NULL, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1)
 			continue;
 		mutex_unlock(&root->fs_info->fs_mutex);
-		ret = readahead_tree_block(root, blocknr);
+		ret = readahead_tree_block(root, bytenr, blocksize);
 		cond_resched();
 		mutex_lock(&root->fs_info->fs_mutex);
 		if (ret)
@@ -1248,15 +1257,16 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	u64 blocknr;
+	u64 bytenr;
+	u32 blocksize;
 	int ret;
 	u32 refs;
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = lookup_extent_ref(trans, root,
-				extent_buffer_blocknr(path->nodes[*level]),
-				1, &refs);
+				path->nodes[*level]->start,
+				path->nodes[*level]->len, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
@@ -1283,30 +1293,33 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			BUG_ON(ret);
 			break;
 		}
-		blocknr = btrfs_node_blockptr(cur, path->slots[*level]);
-		ret = lookup_extent_ref(trans, root, blocknr, 1, &refs);
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		blocksize = btrfs_level_size(root, *level - 1);
+		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
 			path->slots[*level]++;
-			ret = btrfs_free_extent(trans, root, blocknr, 1, 1);
+			ret = btrfs_free_extent(trans, root, bytenr,
+						blocksize, 1);
 			BUG_ON(ret);
 			continue;
 		}
-		next = btrfs_find_tree_block(root, blocknr);
+		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			mutex_unlock(&root->fs_info->fs_mutex);
-			next = read_tree_block(root, blocknr);
+			next = read_tree_block(root, bytenr, blocksize);
 			mutex_lock(&root->fs_info->fs_mutex);
 
 			/* we dropped the lock, check one more time */
-			ret = lookup_extent_ref(trans, root, blocknr, 1, &refs);
+			ret = lookup_extent_ref(trans, root, bytenr,
+						blocksize, &refs);
 			BUG_ON(ret);
 			if (refs != 1) {
 				path->slots[*level]++;
 				free_extent_buffer(next);
 				ret = btrfs_free_extent(trans, root,
-							blocknr, 1, 1);
+							bytenr, blocksize, 1);
 				BUG_ON(ret);
 				continue;
 			}
@@ -1321,8 +1334,8 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 out:
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = btrfs_free_extent(trans, root,
-			extent_buffer_blocknr(path->nodes[*level]), 1, 1);
+	ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
+				path->nodes[*level]->len, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -1359,8 +1372,8 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
-				    extent_buffer_blocknr(path->nodes[*level]),
-				    1, 1);
+						path->nodes[*level]->start,
+						path->nodes[*level]->len, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -1476,16 +1489,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
-	u64 group_size_blocks;
 
 	block_group_cache = &info->block_group_cache;
 
-	group_size_blocks = BTRFS_BLOCK_GROUP_SIZE >>
-		info->sb->s_blocksize_bits;
-
 	root = info->extent_root;
 	key.objectid = 0;
-	key.offset = group_size_blocks;
+	key.offset = BTRFS_BLOCK_GROUP_SIZE;
 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	path = btrfs_alloc_path();
@@ -1532,7 +1541,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				  (u64)cache);
 
 		if (key.objectid >=
-		    btrfs_super_total_blocks(&info->super_copy))
+		    btrfs_super_total_bytes(&info->super_copy))
 			break;
 	}
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index e081558d52f..f658703c42e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1963,18 +1963,27 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i)
 	struct page *p;
 	if (i == 0)
 		return eb->first_page;
+
 	i += eb->start >> PAGE_CACHE_SHIFT;
+	if (eb->last_page && eb->last_page->index == i)
+		return eb->last_page;
+
 	p = find_get_page(eb->first_page->mapping, i);
 	page_cache_release(p);
+	eb->last_page = p;
 	return p;
 }
 
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 					  u64 start, unsigned long len,
 					  gfp_t mask)
 {
-	unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) -
-				  (start >> PAGE_CACHE_SHIFT) + 1;
+	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
@@ -1986,7 +1995,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
-	eb->alloc_addr = __builtin_return_address(0);
+	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -1994,6 +2003,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
 		if (!p) {
+			WARN_ON(1);
 			/* make sure the free only frees the pages we've
 			 * grabbed a reference on
 			 */
@@ -2021,8 +2031,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 					 u64 start, unsigned long len,
 					  gfp_t mask)
 {
-	unsigned long num_pages = ((start + len - 1) >> PAGE_CACHE_SHIFT) -
-				  (start >> PAGE_CACHE_SHIFT) + 1;
+	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
@@ -2033,7 +2042,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
-	eb->alloc_addr = __builtin_return_address(0);
+	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -2070,8 +2079,7 @@ void free_extent_buffer(struct extent_buffer *eb)
 	if (!atomic_dec_and_test(&eb->refs))
 		return;
 
-	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
-		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+	num_pages = num_extent_pages(eb->start, eb->len);
 
 	if (eb->first_page)
 		page_cache_release(eb->first_page);
@@ -2094,8 +2102,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 	u64 end = start + eb->len - 1;
 
 	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
-	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
-		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+	num_pages = num_extent_pages(eb->start, eb->len);
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
@@ -2145,8 +2152,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree,
 	struct page *page;
 	unsigned long num_pages;
 
-	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
-		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+	num_pages = num_extent_pages(eb->start, eb->len);
 
 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			    GFP_NOFS);
@@ -2191,8 +2197,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 		return 0;
 	}
 
-	num_pages = ((eb->start + eb->len - 1) >> PAGE_CACHE_SHIFT) -
-		(eb->start >> PAGE_CACHE_SHIFT) + 1;
+	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (PageUptodate(page)) {
@@ -2267,14 +2272,14 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 }
 EXPORT_SYMBOL(read_extent_buffer);
 
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-		      unsigned long min_len,
-		      char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km)
+static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+			       unsigned long min_len, char **token, char **map,
+			       unsigned long *map_start,
+			       unsigned long *map_len, int km)
 {
 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
 	char *kaddr;
+	struct page *p;
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
 	unsigned long end_i = (start_offset + start + min_len) >>
@@ -2283,21 +2288,59 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	if (i != end_i)
 		return -EINVAL;
 
-	WARN_ON(start > eb->len);
+	if (start >= eb->len) {
+		printk("bad start in map eb start %Lu len %lu caller start %lu min %lu\n", eb->start, eb->len, start, min_len);
+		WARN_ON(1);
+	}
 
 	if (i == 0) {
 		offset = start_offset;
 		*map_start = 0;
 	} else {
+		offset = 0;
 		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
 	}
 
-	kaddr = kmap_atomic(extent_buffer_page(eb, i), km);
+	p = extent_buffer_page(eb, i);
+	WARN_ON(!PageUptodate(p));
+	kaddr = kmap_atomic(p, km);
 	*token = kaddr;
 	*map = kaddr + offset;
 	*map_len = PAGE_CACHE_SIZE - offset;
 	return 0;
 }
+
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+		      unsigned long min_len,
+		      char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km)
+{
+	int err;
+	int save = 0;
+	if (eb->map_token) {
+		if (start >= eb->map_start &&
+		    start + min_len <= eb->map_start + eb->map_len) {
+			*token = eb->map_token;
+			*map = eb->kaddr;
+			*map_start = eb->map_start;
+			*map_len = eb->map_len;
+			return 0;
+		}
+		unmap_extent_buffer(eb, eb->map_token, km);
+		eb->map_token = NULL;
+		save = 1;
+	}
+	err = __map_extent_buffer(eb, start, min_len, token, map,
+				   map_start, map_len, km);
+	if (!err && save) {
+		eb->map_token = *token;
+		eb->kaddr = *map;
+		eb->map_start = *map_start;
+		eb->map_len = *map_len;
+	}
+	return err;
+}
 EXPORT_SYMBOL(map_extent_buffer);
 
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
@@ -2574,7 +2617,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 			((unsigned long)PAGE_CACHE_SIZE - 1);
 		src_off_in_page = src_end &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
-
 		if (src_i == 0)
 			src_off_in_page += start_offset;
 		if (dst_i == 0)
@@ -2582,14 +2624,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 		cur = min(len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
-
 		move_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
 			   dst_off_in_page - cur + 1,
 			   src_off_in_page - cur + 1, cur);
 
-		dst_end -= cur - 1;
-		src_end -= cur - 1;
+		dst_end -= cur;
+		src_end -= cur;
 		len -= cur;
 	}
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 75dc600dc0f..4ef8bdd6834 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -70,7 +70,12 @@ struct extent_buffer {
 	struct list_head list;
 	struct list_head leak_list;
 	struct page *first_page;
+	struct page *last_page;
 	unsigned long alloc_addr;
+	char *map_token;
+	char *kaddr;
+	unsigned long map_start;
+	unsigned long map_len;
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
@@ -147,11 +152,6 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
 	atomic_inc(&eb->refs);
 }
 
-static inline u64 extent_buffer_blocknr(struct extent_buffer *eb)
-{
-	return eb->start / 4096;
-}
-
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
 			  unsigned long len);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 226f6d028c3..9a65e97a4e2 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -27,8 +27,8 @@
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos,
-			       u64 offset, u64 disk_num_blocks,
-			       u64 num_blocks)
+			       u64 offset, u64 disk_num_bytes,
+			       u64 num_bytes)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -50,10 +50,10 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_blocknr(leaf, item, offset);
-	btrfs_set_file_extent_disk_num_blocks(leaf, item, disk_num_blocks);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
 	btrfs_set_file_extent_offset(leaf, item, 0);
-	btrfs_set_file_extent_num_blocks(leaf, item, num_blocks);
+	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1734ca69555..844d8807e44 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -120,9 +120,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 	ptr = btrfs_file_extent_inline_start(ei);
 
-	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr = kmap_atomic(page, KM_USER1);
 	write_extent_buffer(leaf, kaddr + page_offset, ptr, size);
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(kaddr, KM_USER1);
 	btrfs_mark_buffer_dirty(leaf);
 fail:
 	btrfs_free_path(path);
@@ -142,11 +142,12 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	u64 hint_block;
-	u64 num_blocks;
+	u64 hint_byte;
+	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
+	u32 inline_size;
 	loff_t isize = i_size_read(inode);
 
 	em = alloc_extent_map(GFP_NOFS);
@@ -156,11 +157,12 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	em->bdev = inode->i_sb->s_bdev;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
-	num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >>
-			inode->i_blkbits;
+	num_bytes = (write_bytes + pos - start_pos +
+		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	down_read(&BTRFS_I(inode)->root->snap_sem);
-	end_of_last_block = start_pos + (num_blocks << inode->i_blkbits) - 1;
+	end_of_last_block = start_pos + num_bytes - 1;
+
 	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -169,8 +171,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		goto out_unlock;
 	}
 	btrfs_set_trans_block_group(trans, inode);
-	inode->i_blocks += num_blocks << 3;
-	hint_block = 0;
+	inode->i_blocks += num_bytes >> 9;
+	hint_byte = 0;
 
 	if ((end_of_last_block & 4095) == 0) {
 		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
@@ -191,11 +193,10 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
-						 &hint_block);
+						 &hint_byte);
 			if (err)
 				goto failed;
 
-			hole_size >>= inode->i_blkbits;
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
@@ -209,8 +210,10 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 * either allocate an extent for the new bytes or setup the key
 	 * to show we are doing inline data in the extent
 	 */
+	inline_size = end_pos - start_pos;
 	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
-	    pos + write_bytes - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
+	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    inline_size >= PAGE_CACHE_SIZE) {
 		u64 last_end;
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
@@ -224,10 +227,9 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	} else {
 		struct page *p = pages[0];
 		/* step one, delete the existing extents in this range */
-		/* FIXME blocksize != pagesize */
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
 			 (pos + write_bytes + root->sectorsize -1) &
-			 ~((u64)root->sectorsize - 1), &hint_block);
+			 ~((u64)root->sectorsize - 1), &hint_byte);
 		if (err)
 			goto failed;
 
@@ -283,7 +285,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
  */
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_block)
+		       u64 start, u64 end, u64 *hint_byte)
 {
 	int ret;
 	struct btrfs_key key;
@@ -346,8 +348,7 @@ next_slot:
 			found_type = btrfs_file_extent_type(leaf, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				extent_end = key.offset +
-				 (btrfs_file_extent_num_blocks(leaf, extent) <<
-					 inode->i_blkbits);
+				     btrfs_file_extent_num_bytes(leaf, extent);
 				found_extent = 1;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 				struct btrfs_item *item;
@@ -386,17 +387,17 @@ next_slot:
 
 		if (end < extent_end && end >= key.offset) {
 			if (found_extent) {
-				u64 disk_blocknr =
-				    btrfs_file_extent_disk_blocknr(leaf,extent);
-				u64 disk_num_blocks =
-				    btrfs_file_extent_disk_num_blocks(leaf,
+				u64 disk_bytenr =
+				    btrfs_file_extent_disk_bytenr(leaf, extent);
+				u64 disk_num_bytes =
+				    btrfs_file_extent_disk_num_bytes(leaf,
 								      extent);
 				read_extent_buffer(leaf, &old,
 						   (unsigned long)extent,
 						   sizeof(old));
-				if (disk_blocknr != 0) {
+				if (disk_bytenr != 0) {
 					ret = btrfs_inc_extent_ref(trans, root,
-					         disk_blocknr, disk_num_blocks);
+					         disk_bytenr, disk_num_bytes);
 					BUG_ON(ret);
 				}
 			}
@@ -410,21 +411,19 @@ next_slot:
 			keep = 1;
 			WARN_ON(start & (root->sectorsize - 1));
 			if (found_extent) {
-				new_num = (start - key.offset) >>
-					inode->i_blkbits;
-				old_num = btrfs_file_extent_num_blocks(leaf,
-								       extent);
-				*hint_block =
-					btrfs_file_extent_disk_blocknr(leaf,
-								       extent);
-				if (btrfs_file_extent_disk_blocknr(leaf,
-								   extent)) {
+				new_num = start - key.offset;
+				old_num = btrfs_file_extent_num_bytes(leaf,
+								      extent);
+				*hint_byte =
+					btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
+				if (btrfs_file_extent_disk_bytenr(leaf,
+								  extent)) {
 					inode->i_blocks -=
-						(old_num - new_num) << 3;
+						(old_num - new_num) >> 9;
 				}
-				btrfs_set_file_extent_num_blocks(leaf,
-								 extent,
-								 new_num);
+				btrfs_set_file_extent_num_bytes(leaf, extent,
+								new_num);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				WARN_ON(1);
@@ -432,33 +431,32 @@ next_slot:
 		}
 		/* delete the entire extent */
 		if (!keep) {
-			u64 disk_blocknr = 0;
-			u64 disk_num_blocks = 0;
-			u64 extent_num_blocks = 0;
+			u64 disk_bytenr = 0;
+			u64 disk_num_bytes = 0;
+			u64 extent_num_bytes = 0;
 			if (found_extent) {
-				disk_blocknr =
-				      btrfs_file_extent_disk_blocknr(leaf,
+				disk_bytenr =
+				      btrfs_file_extent_disk_bytenr(leaf,
 								     extent);
-				disk_num_blocks =
-				      btrfs_file_extent_disk_num_blocks(leaf,
-									extent);
-				extent_num_blocks =
-				      btrfs_file_extent_num_blocks(leaf,
-								   extent);
-				*hint_block =
-					btrfs_file_extent_disk_blocknr(leaf,
+				disk_num_bytes =
+				      btrfs_file_extent_disk_num_bytes(leaf,
 								       extent);
+				extent_num_bytes =
+				      btrfs_file_extent_num_bytes(leaf, extent);
+				*hint_byte =
+					btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
 			}
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
 			BUG_ON(ret);
 			btrfs_release_path(root, path);
 			extent = NULL;
-			if (found_extent && disk_blocknr != 0) {
-				inode->i_blocks -= extent_num_blocks << 3;
+			if (found_extent && disk_bytenr != 0) {
+				inode->i_blocks -= extent_num_bytes >> 9;
 				ret = btrfs_free_extent(trans, root,
-							disk_blocknr,
-							disk_num_blocks, 0);
+							disk_bytenr,
+							disk_num_bytes, 0);
 			}
 
 			BUG_ON(ret);
@@ -491,20 +489,19 @@ next_slot:
 					    (unsigned long)extent, sizeof(old));
 
 			btrfs_set_file_extent_offset(leaf, extent,
-				    le64_to_cpu(old.offset) +
-				    ((end - key.offset) >> inode->i_blkbits));
-			WARN_ON(le64_to_cpu(old.num_blocks) <
-				(extent_end - end) >> inode->i_blkbits);
-			btrfs_set_file_extent_num_blocks(leaf, extent,
-				    (extent_end - end) >> inode->i_blkbits);
-
+				    le64_to_cpu(old.offset) + end - key.offset);
+			WARN_ON(le64_to_cpu(old.num_bytes) <
+				(extent_end - end));
+			btrfs_set_file_extent_num_bytes(leaf, extent,
+							extent_end - end);
 			btrfs_set_file_extent_type(leaf, extent,
 						   BTRFS_FILE_EXTENT_REG);
+
 			btrfs_mark_buffer_dirty(path->nodes[0]);
-			if (le64_to_cpu(old.disk_blocknr) != 0) {
+			if (le64_to_cpu(old.disk_bytenr) != 0) {
 				inode->i_blocks +=
-				      btrfs_file_extent_num_blocks(leaf,
-								   extent) << 3;
+				      btrfs_file_extent_num_bytes(leaf,
+								  extent) >> 9;
 			}
 			ret = 0;
 			goto out;
@@ -531,12 +528,9 @@ static int prepare_pages(struct btrfs_root *root,
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	int err = 0;
-	u64 num_blocks;
 	u64 start_pos;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
-	num_blocks = (write_bytes + pos - start_pos + root->sectorsize - 1) >>
-			inode->i_blkbits;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fbe2836364e..d6b3a55ed8e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -77,19 +77,19 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 	u64 alloc_hint = 0;
-	u64 num_blocks;
+	u64 num_bytes;
 	int ret;
-	u64 blocksize = 1 << inode->i_blkbits;
+	u64 blocksize = root->sectorsize;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	BUG_ON(!trans);
-	num_blocks = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_blocks, &alloc_hint);
-	num_blocks = num_blocks >> inode->i_blkbits;
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_blocks, 0,
+				 start, start + num_bytes, &alloc_hint);
+
+	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_bytes, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
 	if (ret) {
 		WARN_ON(1);
@@ -186,7 +186,8 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 zeroit:
 	printk("btrfs csum failed ino %lu off %llu\n",
 	       page->mapping->host->i_ino, (unsigned long long)start);
-	memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page);
+	memset(kaddr + offset, 1, end - start + 1);
+	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
 	return 0;
 }
@@ -547,7 +548,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
-	u64 extent_num_blocks = 0;
+	u64 extent_num_bytes = 0;
 	u64 item_end = 0;
 	int found_extent;
 	int del_item;
@@ -593,8 +594,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			if (btrfs_file_extent_type(leaf, fi) !=
 			    BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
-				    btrfs_file_extent_num_blocks(leaf, fi) <<
-				    inode->i_blkbits;
+				    btrfs_file_extent_num_bytes(leaf, fi);
 			}
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
@@ -626,28 +626,27 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 			   btrfs_file_extent_type(leaf, fi) !=
 			   BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
-			extent_start = btrfs_file_extent_disk_blocknr(leaf, fi);
+			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
-				u64 orig_num_blocks =
-					btrfs_file_extent_num_blocks(leaf, fi);
-				extent_num_blocks = inode->i_size -
+				u64 orig_num_bytes =
+					btrfs_file_extent_num_bytes(leaf, fi);
+				extent_num_bytes = inode->i_size -
 					found_key.offset + root->sectorsize - 1;
-				extent_num_blocks >>= inode->i_blkbits;
-				btrfs_set_file_extent_num_blocks(leaf, fi,
-							 extent_num_blocks);
-				num_dec = (orig_num_blocks -
-					   extent_num_blocks) << 3;
+				btrfs_set_file_extent_num_bytes(leaf, fi,
+							 extent_num_bytes);
+				num_dec = (orig_num_bytes -
+					   extent_num_bytes) >> 9;
 				if (extent_start != 0) {
 					inode->i_blocks -= num_dec;
 				}
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
-				extent_num_blocks =
-					btrfs_file_extent_disk_num_blocks(leaf,
-									  fi);
+				extent_num_bytes =
+					btrfs_file_extent_disk_num_bytes(leaf,
+									 fi);
 				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_blocks(leaf,
-								       fi) << 3;
+				num_dec = btrfs_file_extent_num_bytes(leaf,
+								       fi) >> 9;
 				if (extent_start != 0) {
 					found_extent = 1;
 					inode->i_blocks -= num_dec;
@@ -664,7 +663,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		btrfs_release_path(root, path);
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_blocks, 0);
+						extent_num_bytes, 0);
 			BUG_ON(ret);
 		}
 	}
@@ -709,7 +708,8 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
-	unsigned blocksize = 1 << inode->i_blkbits;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
@@ -719,7 +719,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
+	down_read(&root->snap_sem);
 	ret = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -778,8 +778,6 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		err = btrfs_drop_extents(trans, root, inode,
 					 pos, pos + hole_size, &alloc_hint);
 
-		hole_size >>= inode->i_blkbits;
-
 		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       pos, 0, 0, hole_size);
 		btrfs_end_transaction(trans, root);
@@ -1490,7 +1488,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 {
 	int ret;
 	int err = 0;
-	u64 blocknr;
+	u64 bytenr;
 	u64 extent_start = 0;
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
@@ -1540,10 +1538,6 @@ again:
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-
-	blocknr = btrfs_file_extent_disk_blocknr(leaf, item);
-	blocknr += btrfs_file_extent_offset(leaf, item);
-
 	/* are we inside the extent that was found? */
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 	found_type = btrfs_key_type(&found_key);
@@ -1556,8 +1550,7 @@ again:
 	extent_start = found_key.offset;
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
-		       (btrfs_file_extent_num_blocks(leaf, item) <<
-			inode->i_blkbits);
+		       btrfs_file_extent_num_bytes(leaf, item);
 		err = 0;
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
@@ -1570,17 +1563,18 @@ again:
 			}
 			goto not_found_em;
 		}
-		if (btrfs_file_extent_disk_blocknr(leaf, item) == 0) {
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+		if (bytenr == 0) {
 			em->start = extent_start;
 			em->end = extent_end - 1;
 			em->block_start = EXTENT_MAP_HOLE;
 			em->block_end = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		em->block_start = blocknr << inode->i_blkbits;
+		bytenr += btrfs_file_extent_offset(leaf, item);
+		em->block_start = bytenr;
 		em->block_end = em->block_start +
-			(btrfs_file_extent_num_blocks(leaf, item) <<
-			 inode->i_blkbits) - 1;
+			btrfs_file_extent_num_bytes(leaf, item) - 1;
 		em->start = extent_start;
 		em->end = extent_end - 1;
 		goto insert;
@@ -1592,7 +1586,8 @@ again:
 		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
 						    path->slots[0]));
 
-		extent_end = extent_start | ((u64)root->sectorsize - 1);
+		extent_end = (extent_start + size) |
+			((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
@@ -1617,8 +1612,10 @@ again:
 		ptr = btrfs_file_extent_inline_start(item);
 		map = kmap(page);
 		read_extent_buffer(leaf, map + page_offset, ptr, size);
+		/*
 		memset(map + page_offset + size, 0,
 		       root->sectorsize - (page_offset + size));
+		       */
 		flush_dcache_page(page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, extent_start,
@@ -1836,13 +1833,13 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	leaf = btrfs_alloc_free_block(trans, root, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 0);
 	if (IS_ERR(leaf))
 		return PTR_ERR(leaf);
 
 	btrfs_set_header_nritems(leaf, 0);
 	btrfs_set_header_level(leaf, 0);
-	btrfs_set_header_blocknr(leaf, extent_buffer_blocknr(leaf));
+	btrfs_set_header_bytenr(leaf, leaf->start);
 	btrfs_set_header_generation(leaf, trans->transid);
 	btrfs_set_header_owner(leaf, root->root_key.objectid);
 	write_extent_buffer(leaf, root->fs_info->fsid,
@@ -1858,7 +1855,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	inode_item->nblocks = cpu_to_le64(1);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-	btrfs_set_root_blocknr(&root_item, extent_buffer_blocknr(leaf));
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 1);
 	btrfs_set_root_used(&root_item, 0);
 
@@ -1971,8 +1969,8 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-	btrfs_set_root_blocknr(&new_root_item,
-			       extent_buffer_blocknr(root->node));
+	btrfs_set_root_bytenr(&new_root_item, root->node->start);
+	btrfs_set_root_level(&new_root_item, btrfs_header_level(root->node));
 
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index a825ce078a5..9f8696c8a8e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -36,7 +36,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
-		(unsigned long long)btrfs_header_blocknr(l), nr,
+		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
@@ -65,8 +65,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printk("\t\troot data blocknr %llu refs %u\n",
-				(unsigned long long)btrfs_disk_root_blocknr(l, ri),
+			printk("\t\troot data bytenr %llu refs %u\n",
+				(unsigned long long)btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
@@ -84,12 +84,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			           btrfs_file_extent_inline_len(l, item));
 				break;
 			}
-			printk("\t\textent data disk block %llu nr %llu\n",
-			       (unsigned long long)btrfs_file_extent_disk_blocknr(l, fi),
-			       (unsigned long long)btrfs_file_extent_disk_num_blocks(l, fi));
+			printk("\t\textent data disk bytenr %llu nr %llu\n",
+			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
+			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
 			printk("\t\textent data offset %llu nr %llu\n",
 			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_blocks(l, fi));
+			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
@@ -106,16 +106,18 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 	int i;
 	u32 nr;
 	struct btrfs_key key;
+	int level;
 
 	if (!c)
 		return;
 	nr = btrfs_header_nritems(c);
-	if (btrfs_is_leaf(c)) {
+	level = btrfs_header_level(c);
+	if (level == 0) {
 		btrfs_print_leaf(root, c);
 		return;
 	}
 	printk("node %llu level %d total ptrs %d free spc %u\n",
-	       (unsigned long long)btrfs_header_blocknr(c),
+	       (unsigned long long)btrfs_header_bytenr(c),
 	       btrfs_header_level(c), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
@@ -129,7 +131,8 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 	}
 	for (i = 0; i < nr; i++) {
 		struct extent_buffer *next = read_tree_block(root,
-						btrfs_node_blockptr(c, i));
+					btrfs_node_blockptr(c, i),
+					btrfs_level_size(root, level - 1));
 		if (btrfs_is_leaf(next) &&
 		    btrfs_header_level(c) != 1)
 			BUG();
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 39a1435c68f..5c4370f3a5b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -303,10 +303,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	int bits = dentry->d_sb->s_blocksize_bits;
 
 	buf->f_namelen = BTRFS_NAME_LEN;
-	buf->f_blocks = btrfs_super_total_blocks(disk_super);
-	buf->f_bfree = buf->f_blocks - btrfs_super_blocks_used(disk_super);
+	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
+	buf->f_bfree = buf->f_blocks -
+		(btrfs_super_bytes_used(disk_super) >> bits);
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9654e90eec8..0bd1fd3d29d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -42,14 +42,15 @@ static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
 
 static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
 {
+
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_super_blocks_used(&fs->super_copy));
+		(unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
 }
 
 static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
-		(unsigned long long)btrfs_super_total_blocks(&fs->super_copy));
+		(unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
 }
 
 static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 55289b71056..60f61345a8d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -205,12 +205,13 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 	btrfs_write_dirty_block_groups(trans, extent_root);
 	while(1) {
-		old_extent_block = btrfs_root_blocknr(&extent_root->root_item);
-		if (old_extent_block ==
-		    extent_buffer_blocknr(extent_root->node))
+		old_extent_block = btrfs_root_bytenr(&extent_root->root_item);
+		if (old_extent_block == extent_root->node->start)
 			break;
-		btrfs_set_root_blocknr(&extent_root->root_item,
-			       extent_buffer_blocknr(extent_root->node));
+		btrfs_set_root_bytenr(&extent_root->root_item,
+				      extent_root->node->start);
+		btrfs_set_root_level(&extent_root->root_item,
+				     btrfs_header_level(extent_root->node));
 		ret = btrfs_update_root(trans, tree_root,
 					&extent_root->root_key,
 					&extent_root->root_item);
@@ -284,8 +285,8 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
 			if (root->commit_root == root->node) {
-				WARN_ON(extent_buffer_blocknr(root->node) !=
-					btrfs_root_blocknr(&root->root_item));
+				WARN_ON(root->node->start !=
+					btrfs_root_bytenr(&root->root_item));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
 
@@ -314,8 +315,10 @@ static int add_dirty_roots(struct btrfs_trans_handle *trans,
 			root->commit_root = NULL;
 
 			root->root_key.offset = root->fs_info->generation;
-			btrfs_set_root_blocknr(&root->root_item,
-				       extent_buffer_blocknr(root->node));
+			btrfs_set_root_bytenr(&root->root_item,
+					      root->node->start);
+			btrfs_set_root_level(&root->root_item,
+					     btrfs_header_level(root->node));
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
@@ -407,8 +410,8 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
-	u64 num_blocks;
-	u64 blocks_used;
+	u64 num_bytes;
+	u64 bytes_used;
 	int ret = 0;
 	int err;
 
@@ -419,7 +422,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
-		num_blocks = btrfs_root_used(&dirty->root->root_item);
+		num_bytes = btrfs_root_used(&dirty->root->root_item);
 		root = dirty->latest_root;
 
 		while(1) {
@@ -446,12 +449,12 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		}
 		BUG_ON(ret);
 
-		num_blocks -= btrfs_root_used(&dirty->root->root_item);
-		blocks_used = btrfs_root_used(&root->root_item);
-		if (num_blocks) {
+		num_bytes -= btrfs_root_used(&dirty->root->root_item);
+		bytes_used = btrfs_root_used(&root->root_item);
+		if (num_bytes) {
 			record_root_in_trans(root);
 			btrfs_set_root_used(&root->root_item,
-						   blocks_used - num_blocks);
+					    bytes_used - num_bytes);
 		}
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
@@ -560,7 +563,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_set_super_generation(&root->fs_info->super_copy,
 				   cur_trans->transid);
 	btrfs_set_super_root(&root->fs_info->super_copy,
-		     extent_buffer_blocknr(root->fs_info->tree_root->node));
+			     root->fs_info->tree_root->node->start);
+	btrfs_set_super_root_level(&root->fs_info->super_copy,
+			   btrfs_header_level(root->fs_info->tree_root->node));
 
 	write_extent_buffer(root->fs_info->sb_buffer,
 			    &root->fs_info->super_copy, 0,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index daf019afa0a..3feac2f2862 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -27,13 +27,15 @@ static void reada_defrag(struct btrfs_root *root,
 {
 	int i;
 	u32 nritems;
-	u64 blocknr;
+	u64 bytenr;
+	u32 blocksize;
 	int ret;
 
+	blocksize = btrfs_level_size(root, btrfs_header_level(node) - 1);
 	nritems = btrfs_header_nritems(node);
 	for (i = 0; i < nritems; i++) {
-		blocknr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, blocknr);
+		bytenr = btrfs_node_blockptr(node, i);
+		ret = readahead_tree_block(root, bytenr, blocksize);
 		if (ret)
 			break;
 	}
@@ -46,7 +48,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 {
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	u64 blocknr;
+	u64 bytenr;
 	int ret = 0;
 	int is_extent = 0;
 
@@ -80,10 +82,11 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 
 			break;
 		}
-		blocknr = btrfs_node_blockptr(cur, path->slots[*level]);
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 
 		if (cache_only) {
-			next = btrfs_find_tree_block(root, blocknr);
+			next = btrfs_find_tree_block(root, bytenr,
+					   btrfs_level_size(root, *level - 1));
 			/* FIXME, test for defrag */
 			if (!next || !btrfs_buffer_uptodate(next)) {
 				free_extent_buffer(next);
@@ -91,7 +94,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 				continue;
 			}
 		} else {
-			next = read_tree_block(root, blocknr);
+			next = read_tree_block(root, bytenr,
+				       btrfs_level_size(root, *level - 1));
 		}
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
-- 
cgit v1.2.3


From 14048ed0c415b8729b194e92c16d31c61628d216 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:16:28 -0400
Subject: Btrfs: Cache extent buffer mappings

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      | 40 ++++++++++++++++++++++++++++++++++++----
 fs/btrfs/extent_map.c | 15 +++++++++++++--
 2 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4d05456ec32..54c754dd9a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -436,10 +436,18 @@ static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
 	unsigned long map_len;						\
 	unsigned long offset = (unsigned long)s +			\
 				offsetof(type, member);			\
-	err = map_extent_buffer(eb, offset,				\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+	    kaddr = eb->kaddr;						\
+	    map_start = eb->map_start;					\
+	    err = 0;							\
+	} else {							\
+		err = map_extent_buffer(eb, offset,			\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
+	}								\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
@@ -464,10 +472,18 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 	int unmap_on_exit = (eb->map_token == NULL);			\
 	unsigned long offset = (unsigned long)s +			\
 				offsetof(type, member);			\
-	err = map_extent_buffer(eb, offset,				\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+	    kaddr = eb->kaddr;						\
+	    map_start = eb->map_start;					\
+	    err = 0;							\
+	} else {							\
+		err = map_extent_buffer(eb, offset,			\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
+	}								\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
@@ -490,10 +506,18 @@ static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 	unsigned long map_len;						\
 	unsigned long offset = offsetof(type, member);			\
 	int unmap_on_exit = (eb->map_token == NULL);			\
-	err = map_extent_buffer(eb, offset,				\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+	    kaddr = eb->kaddr;						\
+	    map_start = eb->map_start;					\
+	    err = 0;							\
+	} else {							\
+		err = map_extent_buffer(eb, offset,			\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
+	}								\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
@@ -517,10 +541,18 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 	unsigned long map_len;						\
 	unsigned long offset = offsetof(type, member);			\
 	int unmap_on_exit = (eb->map_token == NULL);			\
-	err = map_extent_buffer(eb, offset,				\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+	    kaddr = eb->kaddr;						\
+	    map_start = eb->map_start;					\
+	    err = 0;							\
+	} else {							\
+		err = map_extent_buffer(eb, offset,			\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
+	}								\
 	if (!err) {							\
 		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
 					       map_start);		\
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f658703c42e..ea6ee68ef53 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2037,6 +2037,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	struct extent_buffer *eb;
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
+	int uptodate = 1;
 
 	eb = __alloc_extent_buffer(mask);
 	if (!eb || IS_ERR(eb))
@@ -2048,7 +2049,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	atomic_set(&eb->refs, 1);
 
 	for (i = 0; i < num_pages; i++, index++) {
-		p = find_get_page(mapping, index);
+		p = find_lock_page(mapping, index);
 		if (!p) {
 			/* make sure the free only frees the pages we've
 			 * grabbed a reference on
@@ -2060,7 +2061,12 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 		set_page_extent_mapped(p);
 		if (i == 0)
 			eb->first_page = p;
+		if (!PageUptodate(p))
+			uptodate = 0;
+		unlock_page(p);
 	}
+	if (uptodate)
+		eb->flags |= EXTENT_UPTODATE;
 	return eb;
 fail:
 	free_extent_buffer(eb);
@@ -2192,7 +2198,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
 
-	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
 			   EXTENT_UPTODATE, 1)) {
 		return 0;
 	}
@@ -2247,6 +2253,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	char *dst = (char *)dstv;
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
@@ -2257,6 +2264,10 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
+		if (!PageUptodate(page)) {
+			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
+			WARN_ON(1);
+		}
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
-- 
cgit v1.2.3


From 09e71a326341f40111400c88aaf0498ef622824b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:17:04 -0400
Subject: Btrfs: Use an array of pages in the extent buffers to reduce the cost
 of find_get_page

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c    |  6 +++---
 fs/btrfs/extent_map.c | 22 ++++++++--------------
 fs/btrfs/extent_map.h |  4 ++--
 3 files changed, 13 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fad9298c696..0636f79672e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -647,20 +647,20 @@ int close_ctree(struct btrfs_root *root)
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->first_page->mapping->host;
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
 	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->first_page->mapping->host;
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
 	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
 					  buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ea6ee68ef53..7ef3397a266 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1961,16 +1961,12 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i)
 {
 	struct page *p;
-	if (i == 0)
-		return eb->first_page;
 
+	if (i < EXTENT_INLINE_PAGES)
+		return eb->pages[i];
 	i += eb->start >> PAGE_CACHE_SHIFT;
-	if (eb->last_page && eb->last_page->index == i)
-		return eb->last_page;
-
-	p = find_get_page(eb->first_page->mapping, i);
+	p = find_get_page(eb->pages[0]->mapping, i);
 	page_cache_release(p);
-	eb->last_page = p;
 	return p;
 }
 
@@ -2012,8 +2008,8 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i == 0)
-			eb->first_page = p;
+		if (i < EXTENT_INLINE_PAGES)
+			eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2059,8 +2055,8 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i == 0)
-			eb->first_page = p;
+		if (i < EXTENT_INLINE_PAGES)
+			eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2087,9 +2083,7 @@ void free_extent_buffer(struct extent_buffer *eb)
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 
-	if (eb->first_page)
-		page_cache_release(eb->first_page);
-	for (i = 1; i < num_pages; i++) {
+	for (i = 0; i < num_pages; i++) {
 		page_cache_release(extent_buffer_page(eb, i));
 	}
 	__free_extent_buffer(eb);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 4ef8bdd6834..d74a2b3e3b5 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -62,6 +62,7 @@ struct extent_state {
 	struct list_head list;
 };
 
+#define EXTENT_INLINE_PAGES 32
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
@@ -69,13 +70,12 @@ struct extent_buffer {
 	int flags;
 	struct list_head list;
 	struct list_head leak_list;
-	struct page *first_page;
-	struct page *last_page;
 	unsigned long alloc_addr;
 	char *map_token;
 	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
+	struct page *pages[EXTENT_INLINE_PAGES];
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
-- 
cgit v1.2.3


From 6b80053d02be41886344b5007d04e345311ec0b5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:17:34 -0400
Subject: Btrfs: Add back the online defragging code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 177 ++++++++++++++++++++++++-------------------------
 fs/btrfs/disk-io.c     |  60 +++++++++++++++++
 fs/btrfs/disk-io.h     |   7 ++
 fs/btrfs/extent-tree.c |   8 +--
 fs/btrfs/extent_map.h  |   2 +
 fs/btrfs/transaction.c |   4 +-
 fs/btrfs/tree-defrag.c |  14 ++--
 7 files changed, 166 insertions(+), 106 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 34de83630ae..08ddf1873ab 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -155,55 +155,49 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-#if 0
-static int close_blocks(u64 blocknr, u64 other)
+static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 {
-	if (blocknr < other && other - blocknr < 8)
+	if (blocknr < other && other - (blocknr + blocksize) < 32768)
 		return 1;
-	if (blocknr > other && blocknr - other < 8)
+	if (blocknr > other && blocknr - (other + blocksize) < 32768)
 		return 1;
 	return 0;
 }
 
-static int should_defrag_leaf(struct extent_buffer *eb)
+static int should_defrag_leaf(struct extent_buffer *leaf)
 {
-	return 0;
-	struct btrfs_leaf *leaf = btrfs_buffer_leaf(eb);
-	struct btrfs_disk_key *key;
+	struct btrfs_key key;
 	u32 nritems;
 
-	if (buffer_defrag(bh))
+	if (btrfs_buffer_defrag(leaf))
 		return 1;
 
-	nritems = btrfs_header_nritems(&leaf->header);
+	nritems = btrfs_header_nritems(leaf);
 	if (nritems == 0)
 		return 0;
 
-	key = &leaf->items[0].key;
-	if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+	btrfs_item_key_to_cpu(leaf, &key, 0);
+	if (key.type == BTRFS_DIR_ITEM_KEY)
 		return 1;
 
-	key = &leaf->items[nritems-1].key;
-	if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+
+	btrfs_item_key_to_cpu(leaf, &key, nritems - 1);
+	if (key.type == BTRFS_DIR_ITEM_KEY)
 		return 1;
 	if (nritems > 4) {
-		key = &leaf->items[nritems/2].key;
-		if (btrfs_disk_key_type(key) == BTRFS_DIR_ITEM_KEY)
+		btrfs_item_key_to_cpu(leaf, &key, nritems / 2);
+		if (key.type == BTRFS_DIR_ITEM_KEY)
 			return 1;
 	}
 	return 0;
 }
-#endif
 
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int cache_only, u64 *last_ret)
 {
-	return 0;
-#if 0
-	struct btrfs_node *parent_node;
-	struct extent_buffer *cur_eb;
-	struct extent_buffer *tmp_eb;
+	struct extent_buffer *cur;
+	struct extent_buffer *tmp;
 	u64 blocknr;
 	u64 search_start = *last_ret;
 	u64 last_block = 0;
@@ -214,6 +208,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int i;
 	int err = 0;
 	int parent_level;
+	int uptodate;
+	u32 blocksize;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
@@ -225,12 +221,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (buffer_defrag_done(parent))
+	if (btrfs_buffer_defrag_done(parent))
 		return 0;
 
-	parent_node = btrfs_buffer_node(parent);
-	parent_nritems = btrfs_header_nritems(&parent_node->header);
-	parent_level = btrfs_header_level(&parent_node->header);
+	parent_nritems = btrfs_header_nritems(parent);
+	parent_level = btrfs_header_level(parent);
+	blocksize = btrfs_level_size(root, parent_level - 1);
 
 	start_slot = 0;
 	end_slot = parent_nritems;
@@ -240,56 +236,60 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
-		blocknr = btrfs_node_blockptr(parent_node, i);
+		blocknr = btrfs_node_blockptr(parent, i);
 		if (last_block == 0)
 			last_block = blocknr;
 		if (i > 0) {
-			other = btrfs_node_blockptr(parent_node, i - 1);
-			close = close_blocks(blocknr, other);
+			other = btrfs_node_blockptr(parent, i - 1);
+			close = close_blocks(blocknr, other, blocksize);
 		}
 		if (close && i < end_slot - 1) {
-			other = btrfs_node_blockptr(parent_node, i + 1);
-			close = close_blocks(blocknr, other);
+			other = btrfs_node_blockptr(parent, i + 1);
+			close = close_blocks(blocknr, other, blocksize);
 		}
 		if (close) {
 			last_block = blocknr;
 			continue;
 		}
 
-		cur_bh = btrfs_find_tree_block(root, blocknr);
-		if (!cur_bh || !buffer_uptodate(cur_bh) ||
-		    buffer_locked(cur_bh) ||
-		    (parent_level != 1 && !buffer_defrag(cur_bh)) ||
-		    (parent_level == 1 && !should_defrag_leaf(cur_bh))) {
+		cur = btrfs_find_tree_block(root, blocknr, blocksize);
+		if (cur)
+			uptodate = btrfs_buffer_uptodate(cur);
+		else
+			uptodate = 0;
+		if (!cur || !uptodate ||
+		    (parent_level != 1 && !btrfs_buffer_defrag(cur)) ||
+		    (parent_level == 1 && !should_defrag_leaf(cur))) {
 			if (cache_only) {
-				brelse(cur_bh);
+				free_extent_buffer(cur);
 				continue;
 			}
-			if (!cur_bh || !buffer_uptodate(cur_bh) ||
-			    buffer_locked(cur_bh)) {
-				brelse(cur_bh);
-				cur_bh = read_tree_block(root, blocknr);
+			if (!cur) {
+				cur = read_tree_block(root, blocknr,
+							 blocksize);
+			} else if (!uptodate) {
+				btrfs_read_buffer(cur);
 			}
 		}
 		if (search_start == 0)
-			search_start = last_block & ~((u64)65535);
+			search_start = last_block;
 
-		err = __btrfs_cow_block(trans, root, cur_bh, parent, i,
-					&tmp_bh, search_start,
-					min(8, end_slot - i));
+		err = __btrfs_cow_block(trans, root, cur, parent, i,
+					&tmp, search_start,
+					min(16 * blocksize,
+					    (end_slot - i) * blocksize));
 		if (err) {
-			brelse(cur_bh);
+			free_extent_buffer(cur);
 			break;
 		}
-		search_start = bh_blocknr(tmp_bh);
+		search_start = tmp->start;
 		*last_ret = search_start;
 		if (parent_level == 1)
-			clear_buffer_defrag(tmp_bh);
-		set_buffer_defrag_done(tmp_bh);
-		brelse(tmp_bh);
+			btrfs_clear_buffer_defrag(tmp);
+		btrfs_set_buffer_defrag_done(tmp);
+		free_extent_buffer(tmp);
 	}
 	return err;
-#endif
 }
 
 /*
@@ -892,22 +892,17 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			     int level, int slot)
 {
-	return;
-#if 0
 	struct extent_buffer *node;
-	int i;
 	u32 nritems;
-	u64 bytenr;
 	u64 search;
-	u64 cluster_start;
-	int ret;
-	int nread = 0;
+	u64 lowest_read;
+	u64 highest_read;
+	u64 nread = 0;
 	int direction = path->reada;
-	int level;
-	struct radix_tree_root found;
-	unsigned long gang[8];
 	struct extent_buffer *eb;
-
+	u32 nr;
+	u32 blocksize;
+	u32 nscan = 0;
 
 	if (level == 0)
 		return;
@@ -917,42 +912,46 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 
 	node = path->nodes[level];
 	search = btrfs_node_blockptr(node, slot);
-	eb = btrfs_find_tree_block(root, search);
+	blocksize = btrfs_level_size(root, level - 1);
+	eb = btrfs_find_tree_block(root, search, blocksize);
 	if (eb) {
 		free_extent_buffer(eb);
 		return;
 	}
 
-	init_bit_radix(&found);
+	highest_read = search;
+	lowest_read = search;
+
 	nritems = btrfs_header_nritems(node);
-	level = btrfs_header_level(node) - 1;
-	for (i = slot; i < nritems; i++) {
-		bytenr = btrfs_node_blockptr(node, i);
-		set_radix_bit(&found, blocknr);
-	}
-	if (direction > 0) {
-		cluster_start = search - 4;
-		if (cluster_start > search)
-			cluster_start = 0;
-	} else
-		cluster_start = search + 4;
+	nr = slot;
 	while(1) {
-		ret = find_first_radix_bit(&found, gang, 0, ARRAY_SIZE(gang));
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++) {
-			blocknr = gang[i];
-			clear_radix_bit(&found, blocknr);
-			if (path->reada == 1 && nread > 16)
-				continue;
-			if (close_blocks(cluster_start, blocknr)) {
-				readahead_tree_block(root, blocknr);
-				nread++;
-				cluster_start = blocknr;
-			}
+		if (direction < 0) {
+			if (nr == 0)
+				break;
+			nr--;
+		} else if (direction > 0) {
+			nr++;
+			if (nr >= nritems)
+				break;
 		}
+		search = btrfs_node_blockptr(node, nr);
+		if ((search >= lowest_read && search <= highest_read) ||
+		    (search < lowest_read && lowest_read - search <= 32768) ||
+		    (search > highest_read && search - highest_read <= 32768)) {
+			readahead_tree_block(root, search, blocksize);
+			nread += blocksize;
+		}
+		nscan++;
+		if (path->reada < 2 && (nread > (256 * 1024) || nscan > 32))
+			break;
+		if(nread > (1024 * 1024) || nscan > 128)
+			break;
+
+		if (search < lowest_read)
+			lowest_read = search;
+		if (search > highest_read)
+			highest_read = search;
 	}
-#endif
 }
 /*
  * look for key in the tree.  path is filled in with nodes along the way
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0636f79672e..8e606e6658a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -678,3 +678,63 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	balance_dirty_pages_ratelimited_nr(
 			root->fs_info->btree_inode->i_mapping, nr);
 }
+
+void btrfs_set_buffer_defrag(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
+}
+
+void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
+			GFP_NOFS);
+}
+
+int btrfs_buffer_defrag(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
+}
+
+int btrfs_buffer_defrag_done(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+		     buf->start, buf->start + buf->len - 1,
+		     EXTENT_DEFRAG_DONE, 0);
+}
+
+int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+		     buf->start, buf->start + buf->len - 1,
+		     EXTENT_DEFRAG_DONE, GFP_NOFS);
+}
+
+int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+		     buf->start, buf->start + buf->len - 1,
+		     EXTENT_DEFRAG, GFP_NOFS);
+}
+
+int btrfs_read_buffer(struct extent_buffer *buf)
+{
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+					buf, 1);
+}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index fd4db5f810c..190b07bc642 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -51,4 +51,11 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf);
+void btrfs_set_buffer_defrag(struct extent_buffer *buf);
+void btrfs_set_buffer_defrag_done(struct extent_buffer *buf);
+int btrfs_buffer_defrag(struct extent_buffer *buf);
+int btrfs_buffer_defrag_done(struct extent_buffer *buf);
+int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
+int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1be8f9f04a1..0b0c947194a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1173,13 +1173,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
-	/*
-	set_buffer_checked(buf);
-	set_buffer_defrag(buf);
-	*/
-	/* FIXME!!!!!!!!!!!!!!!!
-	set_radix_bit(&trans->transaction->dirty_pages, buf->pages[0]->index);
-	*/
+	btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index d74a2b3e3b5..8409b5cbeda 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -14,6 +14,8 @@
 #define EXTENT_LOCKED (1 << 3)
 #define EXTENT_NEW (1 << 4)
 #define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 60f61345a8d..87456ab7427 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -355,7 +355,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		return 0;
 
 	trans = btrfs_start_transaction(root, 1);
-	while (0) {
+	while (1) {
 		root->defrag_running = 1;
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 		nr = trans->blocks_used;
@@ -400,7 +400,7 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 			btrfs_defrag_root(root, 1);
 		}
 	}
-	// btrfs_defrag_root(info->extent_root, 1);
+	btrfs_defrag_root(info->extent_root, 1);
 	return err;
 }
 
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3feac2f2862..d23216aa4ab 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -116,10 +116,10 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	}
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-#if 0
-	clear_buffer_defrag(path->nodes[*level]);
-	clear_buffer_defrag_done(path->nodes[*level]);
-#endif
+
+	btrfs_clear_buffer_defrag(path->nodes[*level]);
+	btrfs_clear_buffer_defrag_done(path->nodes[*level]);
+
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -148,10 +148,8 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			root->defrag_level = i;
 			return 0;
 		} else {
-			/*
-			clear_buffer_defrag(path->nodes[*level]);
-			clear_buffer_defrag_done(path->nodes[*level]);
-			*/
+			btrfs_clear_buffer_defrag(path->nodes[*level]);
+			btrfs_clear_buffer_defrag_done(path->nodes[*level]);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-- 
cgit v1.2.3


From e19caa5f0e34b571ed0c2617554af5c43cb124d1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:17:44 -0400
Subject: Btrfs: Fix allocation routines to avoid intermixing data and metadata
 allocations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 38 ++++++++++++++++++++++----------------
 fs/btrfs/extent_map.c  |  6 +++---
 2 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0b0c947194a..72e6b274a45 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -165,7 +165,7 @@ static u64 find_search_start(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
-	u64 last = max(search_start, cache->key.objectid);
+	u64 last;
 	u64 start = 0;
 	u64 end = 0;
 
@@ -173,11 +173,14 @@ again:
 	ret = cache_block_group(root, cache);
 	if (ret)
 		goto out;
+	last = max(search_start, cache->key.objectid);
+
 	while(1) {
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
 					    last, &start, &end, EXTENT_DIRTY);
-		if (ret)
-			goto out;
+		if (ret) {
+			goto new_group;
+		}
 
 		start = max(last, start);
 		last = end + 1;
@@ -191,15 +194,13 @@ out:
 	return search_start;
 
 new_group:
-	cache = btrfs_lookup_block_group(root->fs_info,
-					 last + cache->key.offset - 1);
+	last = cache->key.objectid + cache->key.offset;
+	cache = btrfs_lookup_block_group(root->fs_info, last);
 	if (!cache) {
 		return search_start;
 	}
-	cache = btrfs_find_block_group(root, cache,
-				       last + cache->key.offset - 1, data, 0);
+	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	*cache_ret = cache;
-	last = min(cache->key.objectid, last);
 	goto again;
 }
 
@@ -257,12 +258,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		if (used < div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
-		last = hint->key.offset * 3;
-		if (hint->key.objectid >= last)
-			last = max(search_start + hint->key.offset - 1,
-				   hint->key.objectid - last);
-		else
-			last = hint->key.objectid + hint->key.offset;
+		last = hint->key.objectid + hint->key.offset;
 		hint_last = last;
 	} else {
 		if (hint)
@@ -913,7 +909,6 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 check_failed:
 	search_start = find_search_start(root, &block_group,
 					 search_start, total_needed, data);
-
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
@@ -958,6 +953,9 @@ check_failed:
 				continue;
 			if (ret < 0)
 				goto error;
+
+			search_start = max(search_start,
+					   block_group->key.objectid);
 			if (!start_found) {
 				ins->objectid = search_start;
 				ins->offset = search_end - search_start;
@@ -967,6 +965,7 @@ check_failed:
 			ins->objectid = last_byte > search_start ?
 					last_byte : search_start;
 			ins->offset = search_end - ins->objectid;
+			BUG_ON(ins->objectid >= search_end);
 			goto check_pending;
 		}
 		btrfs_item_key_to_cpu(l, &key, slot);
@@ -998,7 +997,7 @@ check_failed:
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
 			search_start = block_group->key.objectid +
-				block_group->key.offset * 2;
+				block_group->key.offset;
 			goto new_group;
 		}
 next:
@@ -1015,6 +1014,12 @@ check_pending:
 	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
 
+	if (!full_scan && ins->objectid + num_bytes >= block_group->
+	    key.objectid + block_group->key.offset) {
+		search_start = block_group->key.objectid +
+			block_group->key.offset;
+		goto new_group;
+	}
 	if (test_range_bit(&info->extent_ins, ins->objectid,
 			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
 		search_start = ins->objectid + num_bytes;
@@ -1114,6 +1119,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
+		WARN_ON(data == 1);
 		goto update_block;
 	}
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7ef3397a266..e241699024d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -981,7 +981,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
 	struct extent_state *state;
 	int ret = 1;
 
-	write_lock_irq(&tree->lock);
+	read_lock_irq(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -993,7 +993,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
 
 	while(1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->state & bits) {
+		if (state->end >= start && (state->state & bits)) {
 			*start_ret = state->start;
 			*end_ret = state->end;
 			ret = 0;
@@ -1004,7 +1004,7 @@ int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
 			break;
 	}
 out:
-	write_unlock_irq(&tree->lock);
+	read_unlock_irq(&tree->lock);
 	return ret;
 }
 EXPORT_SYMBOL(find_first_extent_bit);
-- 
cgit v1.2.3


From 4dc119046d0d8501afa4346472917fb05586ad9c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:18:14 -0400
Subject: Btrfs: Add an extent buffer LRU to reduce radix tree hits

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   3 -
 fs/btrfs/disk-io.c     |  25 +++----
 fs/btrfs/extent-tree.c |   1 -
 fs/btrfs/extent_map.c  | 183 ++++++++++++++++++++++++++++---------------------
 fs/btrfs/extent_map.h  |  16 +++--
 fs/btrfs/transaction.c |   5 +-
 6 files changed, 128 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 08ddf1873ab..bef61ee883e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -88,8 +88,6 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
-	cow->alloc_addr = (unsigned long)__builtin_return_address(0);
-
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
@@ -151,7 +149,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	search_start = buf->start & ~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
-	(*cow_ret)->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e606e6658a..fd7e6c182b9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,8 +50,6 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 	struct extent_buffer *eb;
 	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				bytenr, blocksize, GFP_NOFS);
-	if (eb)
-		eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return eb;
 }
 
@@ -63,7 +61,6 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				 bytenr, blocksize, GFP_NOFS);
-	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return eb;
 }
 
@@ -234,7 +231,6 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 				 buf, 1);
-	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return buf;
 }
 
@@ -638,6 +634,7 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
+	extent_map_tree_cleanup(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
 	kfree(fs_info->extent_root);
@@ -647,20 +644,20 @@ int close_ctree(struct btrfs_root *root)
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	struct inode *btree_inode = buf->last_page->mapping->host;
 	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->pages[0]->mapping->host;
+	struct inode *btree_inode = buf->last_page->mapping->host;
 	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
 					  buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
@@ -681,7 +678,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
@@ -689,7 +686,7 @@ void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 
 void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
@@ -698,7 +695,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
@@ -706,7 +703,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf)
 
 int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -715,7 +712,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -724,7 +721,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -733,7 +730,7 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 
 int btrfs_read_buffer(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 					buf, 1);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72e6b274a45..525fa845d61 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1176,7 +1176,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_buffer_uptodate(buf);
-	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	btrfs_set_buffer_defrag(buf);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index e241699024d..85b28a6a4e0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
+#include <linux/swap.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -20,14 +21,11 @@ static struct kmem_cache *extent_map_cache;
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 
-static LIST_HEAD(extent_buffers);
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
-static spinlock_t extent_buffers_lock;
 static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
-static int nr_extent_buffers;
-#define MAX_EXTENT_BUFFER_CACHE 128
+#define BUFFER_LRU_MAX 64
 
 struct tree_entry {
 	u64 start;
@@ -47,20 +45,12 @@ void __init extent_map_init(void)
 	extent_buffer_cache = btrfs_cache_create("extent_buffers",
 					    sizeof(struct extent_buffer), 0,
 					    NULL);
-	spin_lock_init(&extent_buffers_lock);
 }
 
 void __exit extent_map_exit(void)
 {
-	struct extent_buffer *eb;
 	struct extent_state *state;
 
-	while (!list_empty(&extent_buffers)) {
-		eb = list_entry(extent_buffers.next,
-				struct extent_buffer, list);
-		list_del(&eb->list);
-		kmem_cache_free(extent_buffer_cache, eb);
-	}
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, list);
 		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
@@ -68,14 +58,6 @@ void __exit extent_map_exit(void)
 		kmem_cache_free(extent_state_cache, state);
 
 	}
-	while (!list_empty(&buffers)) {
-		eb = list_entry(buffers.next,
-				struct extent_buffer, leak_list);
-		printk("buffer leak start %Lu len %lu return %lX\n", eb->start, eb->len, eb->alloc_addr);
-		list_del(&eb->leak_list);
-		kmem_cache_free(extent_buffer_cache, eb);
-	}
-
 
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
@@ -92,10 +74,25 @@ void extent_map_tree_init(struct extent_map_tree *tree,
 	tree->state.rb_node = NULL;
 	tree->ops = NULL;
 	rwlock_init(&tree->lock);
+	spin_lock_init(&tree->lru_lock);
 	tree->mapping = mapping;
+	INIT_LIST_HEAD(&tree->buffer_lru);
+	tree->lru_size = 0;
 }
 EXPORT_SYMBOL(extent_map_tree_init);
 
+void extent_map_tree_cleanup(struct extent_map_tree *tree)
+{
+	struct extent_buffer *eb;
+	while(!list_empty(&tree->buffer_lru)) {
+		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
+				lru);
+		list_del(&eb->lru);
+		free_extent_buffer(eb);
+	}
+}
+EXPORT_SYMBOL(extent_map_tree_cleanup);
+
 struct extent_map *alloc_extent_map(gfp_t mask)
 {
 	struct extent_map *em;
@@ -1915,66 +1912,99 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	return (em->block_start + start - em->start) >> inode->i_blkbits;
 }
 
-static struct extent_buffer *__alloc_extent_buffer(gfp_t mask)
+static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
 {
-	struct extent_buffer *eb = NULL;
-
-	spin_lock(&extent_buffers_lock);
-	if (!list_empty(&extent_buffers)) {
-		eb = list_entry(extent_buffers.next, struct extent_buffer,
-				list);
-		list_del(&eb->list);
-		WARN_ON(nr_extent_buffers == 0);
-		nr_extent_buffers--;
-	}
-	spin_unlock(&extent_buffers_lock);
+	if (list_empty(&eb->lru)) {
+		extent_buffer_get(eb);
+		list_add(&eb->lru, &tree->buffer_lru);
+		tree->lru_size++;
+		if (tree->lru_size >= BUFFER_LRU_MAX) {
+			struct extent_buffer *rm;
+			rm = list_entry(tree->buffer_lru.prev,
+					struct extent_buffer, lru);
+			tree->lru_size--;
+			list_del(&rm->lru);
+			free_extent_buffer(rm);
+		}
+	} else
+		list_move(&eb->lru, &tree->buffer_lru);
+	return 0;
+}
+static struct extent_buffer *find_lru(struct extent_map_tree *tree,
+				      u64 start, unsigned long len)
+{
+	struct list_head *lru = &tree->buffer_lru;
+	struct list_head *cur = lru->next;
+	struct extent_buffer *eb;
 
-	if (eb) {
-		memset(eb, 0, sizeof(*eb));
-	} else {
-		eb = kmem_cache_zalloc(extent_buffer_cache, mask);
-	}
-	spin_lock(&extent_buffers_lock);
-	list_add(&eb->leak_list, &buffers);
-	spin_unlock(&extent_buffers_lock);
+	if (list_empty(lru))
+		return NULL;
 
-	return eb;
+	do {
+		eb = list_entry(cur, struct extent_buffer, lru);
+		if (eb->start == start && eb->len == len) {
+			extent_buffer_get(eb);
+			return eb;
+		}
+		cur = cur->next;
+	} while (cur != lru);
+	return NULL;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
+static inline unsigned long num_extent_pages(u64 start, u64 len)
 {
-
-	spin_lock(&extent_buffers_lock);
-	list_del_init(&eb->leak_list);
-	spin_unlock(&extent_buffers_lock);
-
-	if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) {
-		kmem_cache_free(extent_buffer_cache, eb);
-	} else {
-		spin_lock(&extent_buffers_lock);
-		list_add(&eb->list, &extent_buffers);
-		nr_extent_buffers++;
-		spin_unlock(&extent_buffers_lock);
-	}
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb, int i)
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
 {
 	struct page *p;
 
-	if (i < EXTENT_INLINE_PAGES)
-		return eb->pages[i];
+	if (i == 0)
+		return eb->last_page;
 	i += eb->start >> PAGE_CACHE_SHIFT;
-	p = find_get_page(eb->pages[0]->mapping, i);
+	p = find_get_page(eb->last_page->mapping, i);
 	page_cache_release(p);
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
+						   u64 start,
+						   unsigned long len,
+						   gfp_t mask)
 {
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
+	struct extent_buffer *eb = NULL;
+
+	spin_lock(&tree->lru_lock);
+	eb = find_lru(tree, start, len);
+	if (eb)
+		goto lru_add;
+	spin_unlock(&tree->lru_lock);
+
+	if (eb) {
+		memset(eb, 0, sizeof(*eb));
+	} else {
+		eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	}
+	INIT_LIST_HEAD(&eb->lru);
+	eb->start = start;
+	eb->len = len;
+	atomic_set(&eb->refs, 1);
+
+	spin_lock(&tree->lru_lock);
+lru_add:
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
+	return eb;
+}
+
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+	kmem_cache_free(extent_buffer_cache, eb);
 }
+
 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 					  u64 start, unsigned long len,
 					  gfp_t mask)
@@ -1987,14 +2017,12 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 0;
 
-	eb = __alloc_extent_buffer(mask);
+	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
-	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
-	eb->start = start;
-	eb->len = len;
-	atomic_set(&eb->refs, 1);
+	if (eb->flags & EXTENT_BUFFER_FILLED)
+		return eb;
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
@@ -2008,14 +2036,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i < EXTENT_INLINE_PAGES)
-			eb->pages[i] = p;
+		if (i == 0)
+			eb->last_page = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
 	}
 	if (uptodate)
 		eb->flags |= EXTENT_UPTODATE;
+	eb->flags |= EXTENT_BUFFER_FILLED;
 	return eb;
 fail:
 	free_extent_buffer(eb);
@@ -2035,14 +2064,12 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 1;
 
-	eb = __alloc_extent_buffer(mask);
+	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
-	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
-	eb->start = start;
-	eb->len = len;
-	atomic_set(&eb->refs, 1);
+	if (eb->flags & EXTENT_BUFFER_FILLED)
+		return eb;
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_lock_page(mapping, index);
@@ -2055,14 +2082,15 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i < EXTENT_INLINE_PAGES)
-			eb->pages[i] = p;
+		if (i == 0)
+			eb->last_page = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
 	}
 	if (uptodate)
 		eb->flags |= EXTENT_UPTODATE;
+	eb->flags |= EXTENT_BUFFER_FILLED;
 	return eb;
 fail:
 	free_extent_buffer(eb);
@@ -2231,7 +2259,8 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 			ret = -EIO;
 		}
 	}
-	eb->flags |= EXTENT_UPTODATE;
+	if (!ret)
+		eb->flags |= EXTENT_UPTODATE;
 	return ret;
 }
 EXPORT_SYMBOL(read_extent_buffer_pages);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8409b5cbeda..52a8b9394fc 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -16,6 +16,7 @@
 #define EXTENT_DELALLOC (1 << 5)
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 
@@ -33,6 +34,9 @@ struct extent_map_tree {
 	struct address_space *mapping;
 	rwlock_t lock;
 	struct extent_map_ops *ops;
+	spinlock_t lru_lock;
+	struct list_head buffer_lru;
+	int lru_size;
 };
 
 /* note, this must start with the same fields as fs/extent_map.c:tree_entry */
@@ -64,20 +68,17 @@ struct extent_state {
 	struct list_head list;
 };
 
-#define EXTENT_INLINE_PAGES 32
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
-	atomic_t refs;
-	int flags;
-	struct list_head list;
-	struct list_head leak_list;
-	unsigned long alloc_addr;
 	char *map_token;
 	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
-	struct page *pages[EXTENT_INLINE_PAGES];
+	struct page *last_page;
+	struct list_head lru;
+	atomic_t refs;
+	int flags;
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
@@ -88,6 +89,7 @@ typedef struct extent_map *(get_extent_t)(struct inode *inode,
 
 void extent_map_tree_init(struct extent_map_tree *tree,
 			  struct address_space *mapping, gfp_t mask);
+void extent_map_tree_cleanup(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 end);
 int add_extent_mapping(struct extent_map_tree *tree,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87456ab7427..67e4aca36a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -443,8 +443,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 			BUG_ON(ret);
 			mutex_unlock(&tree_root->fs_info->fs_mutex);
 			btrfs_btree_balance_dirty(tree_root, nr);
-			schedule();
-
+			cond_resched();
 			mutex_lock(&tree_root->fs_info->fs_mutex);
 		}
 		BUG_ON(ret);
@@ -471,7 +470,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 
 		btrfs_btree_balance_dirty(tree_root, nr);
-		schedule();
+		cond_resched();
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 3326d1b07c0cb6a2ff5b835b7a2cffa54124d074 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:18:25 -0400
Subject: Btrfs: Allow tails larger than one page

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c |  24 +++++++--
 fs/btrfs/ctree.h |   2 +-
 fs/btrfs/file.c  | 152 ++++++++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/inode.c |  32 +++++++-----
 fs/btrfs/super.c |   2 -
 5 files changed, 156 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bef61ee883e..f691542dfa1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1766,8 +1766,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* first try to make some room by pushing left and right */
 	wret = push_leaf_left(trans, root, path, data_size);
-	if (wret < 0)
+	if (wret < 0) {
 		return wret;
+	}
 	if (wret) {
 		wret = push_leaf_right(trans, root, path, data_size);
 		if (wret < 0)
@@ -1777,8 +1778,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* did the pushes work? */
 	if (btrfs_leaf_free_space(root, l) >=
-	    sizeof(struct btrfs_item) + data_size)
+	    sizeof(struct btrfs_item) + data_size) {
 		return 0;
+	}
 
 	if (!path->nodes[1]) {
 		ret = insert_new_root(trans, root, path, 1);
@@ -1822,7 +1824,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				return ret;
 			}
 			mid = slot;
-			double_split = 1;
+			if (mid != nritems &&
+			    leaf_space_used(l, mid, nritems - mid) +
+			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+				double_split = 1;
+			}
 		}
 	} else {
 		if (leaf_space_used(l, 0, mid + 1) + space_needed >
@@ -1910,8 +1916,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	BUG_ON(path->slots[0] < 0);
 
-	if (!double_split)
+	if (!double_split) {
 		return ret;
+	}
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 				       l->start, 0);
@@ -2048,7 +2055,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	old_data = btrfs_item_end_nr(leaf, slot);
 
 	BUG_ON(slot < 0);
-	BUG_ON(slot >= nritems);
+	if (slot >= nritems) {
+		btrfs_print_leaf(root, leaf);
+		printk("slot %d too large, nritems %d\n", slot, nritems);
+		BUG_ON(1);
+	}
 
 	/*
 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
@@ -2132,6 +2143,9 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 
 	if (btrfs_leaf_free_space(root, leaf) <
 	    sizeof(struct btrfs_item) + data_size) {
+		btrfs_print_leaf(root, leaf);
+		printk("not enough freespace need %u have %d\n",
+		       data_size, btrfs_leaf_free_space(root, leaf));
 		BUG();
 	}
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54c754dd9a1..18994c53106 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1137,7 +1137,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_block);
+		       u64 start, u64 end, u64 inline_end, u64 *hint_block);
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 844d8807e44..1af2b6534da 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -82,8 +82,9 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 
 static int insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
-				u64 offset, ssize_t size,
-				struct page *page, size_t page_offset)
+				u64 offset, size_t size,
+				struct page **pages, size_t page_offset,
+				int num_pages)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -91,9 +92,12 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	char *kaddr;
 	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
+	struct page *page;
 	u32 datasize;
 	int err = 0;
 	int ret;
+	int i;
+	ssize_t cur_size;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -104,25 +108,97 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	key.objectid = inode->i_ino;
 	key.offset = offset;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	BUG_ON(size >= PAGE_CACHE_SIZE);
-	datasize = btrfs_file_extent_calc_inline_size(size);
+	datasize = btrfs_file_extent_calc_inline_size(offset + size);
 
-	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      datasize);
-	if (ret) {
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0) {
 		err = ret;
 		goto fail;
 	}
-	leaf = path->nodes[0];
-	ei = btrfs_item_ptr(leaf, path->slots[0],
-			    struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-	ptr = btrfs_file_extent_inline_start(ei);
-
-	kaddr = kmap_atomic(page, KM_USER1);
-	write_extent_buffer(leaf, kaddr + page_offset, ptr, size);
-	kunmap_atomic(kaddr, KM_USER1);
+	if (ret == 1) {
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, ei) !=
+		    BTRFS_FILE_EXTENT_INLINE) {
+			goto insert;
+		}
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		ret = 0;
+	}
+	if (ret == 0) {
+		u32 found_size;
+		u64 found_start;
+
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, ei) !=
+		    BTRFS_FILE_EXTENT_INLINE) {
+			err = ret;
+			btrfs_print_leaf(root, leaf);
+			printk("found wasn't inline offset %Lu inode %lu\n",
+			       offset, inode->i_ino);
+			goto fail;
+		}
+		found_start = key.offset;
+		found_size = btrfs_file_extent_inline_len(leaf,
+					  btrfs_item_nr(leaf, path->slots[0]));
+
+		if (found_size < offset + size) {
+			btrfs_release_path(root, path);
+			ret = btrfs_search_slot(trans, root, &key, path,
+						offset + size - found_size -
+						found_start, 1);
+			BUG_ON(ret != 0);
+			ret = btrfs_extend_item(trans, root, path,
+						offset + size - found_size -
+						found_start);
+			if (ret) {
+				err = ret;
+				goto fail;
+			}
+			leaf = path->nodes[0];
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+		}
+	} else {
+insert:
+		btrfs_release_path(root, path);
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      datasize);
+		if (ret) {
+			err = ret;
+			printk("got bad ret %d\n", ret);
+			goto fail;
+		}
+		leaf = path->nodes[0];
+		ei = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	}
+	ptr = btrfs_file_extent_inline_start(ei) + offset;
+
+	cur_size = size;
+	i = 0;
+	while (size > 0) {
+		page = pages[i];
+		kaddr = kmap_atomic(page, KM_USER0);
+		cur_size = min(PAGE_CACHE_SIZE - page_offset, size);
+		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_offset = 0;
+		ptr += cur_size;
+		size -= cur_size;
+		if (i >= num_pages) {
+			printk("i %d num_pages %d\n", i, num_pages);
+		}
+		i++;
+	}
 	btrfs_mark_buffer_dirty(leaf);
 fail:
 	btrfs_free_path(path);
@@ -193,6 +269,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
+						 last_pos_in_file,
 						 &hint_byte);
 			if (err)
 				goto failed;
@@ -210,11 +287,12 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 * either allocate an extent for the new bytes or setup the key
 	 * to show we are doing inline data in the extent
 	 */
-	inline_size = end_pos - start_pos;
-	if (isize >= PAGE_CACHE_SIZE || pos + write_bytes < inode->i_size ||
-	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size >= PAGE_CACHE_SIZE) {
+	inline_size = end_pos;
+	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    inline_size > 16384 ||
+	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
+
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
@@ -225,22 +303,18 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
 	} else {
-		struct page *p = pages[0];
+		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
+		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
-			 (pos + write_bytes + root->sectorsize -1) &
-			 ~((u64)root->sectorsize - 1), &hint_byte);
+					 aligned_end, end_pos, &hint_byte);
 		if (err)
 			goto failed;
-
 		err = insert_inline_extent(trans, root, inode, start_pos,
-					   end_pos - start_pos, p, 0);
+					   end_pos - start_pos, pages, 0,
+					   num_pages);
 		BUG_ON(err);
-		em->start = start_pos;
-		em->end = end_pos - 1;
-		em->block_start = EXTENT_MAP_INLINE;
-		em->block_end = EXTENT_MAP_INLINE;
-		add_extent_mapping(em_tree, em);
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
@@ -285,7 +359,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
  */
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte)
+		       u64 start, u64 end, u64 inline_end, u64 *hint_byte)
 {
 	int ret;
 	struct btrfs_key key;
@@ -401,8 +475,8 @@ next_slot:
 					BUG_ON(ret);
 				}
 			}
-			WARN_ON(found_inline);
-			bookend = 1;
+			if (!found_inline)
+				bookend = 1;
 		}
 		/* truncate existing extent */
 		if (start > key.offset) {
@@ -425,8 +499,14 @@ next_slot:
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
 				btrfs_mark_buffer_dirty(leaf);
-			} else {
-				WARN_ON(1);
+			} else if (end > extent_end &&
+				   key.offset < inline_end &&
+				   inline_end < extent_end) {
+				u32 new_size;
+				new_size = btrfs_file_extent_calc_inline_size(
+						   inline_end - key.offset);
+				btrfs_truncate_item(trans, root, path,
+						    new_size);
 			}
 		}
 		/* delete the entire extent */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d6b3a55ed8e..84f496c838c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -87,7 +87,7 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	BUG_ON(!trans);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_bytes, &alloc_hint);
+				 start, start + num_bytes, start, &alloc_hint);
 
 	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_bytes, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
@@ -776,7 +776,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_drop_extents(trans, root, inode,
-					 pos, pos + hole_size, &alloc_hint);
+					 pos, pos + hole_size, pos,
+					 &alloc_hint);
 
 		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       pos, 0, 0, hole_size);
@@ -1581,7 +1582,9 @@ again:
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
 		char *map;
-		u32 size;
+		size_t size;
+		size_t extent_offset;
+		size_t copy_size;
 
 		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
 						    path->slots[0]));
@@ -1600,26 +1603,31 @@ again:
 			goto not_found_em;
 		}
 
+		extent_offset = (page->index << PAGE_CACHE_SHIFT) -
+			extent_start;
+		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+		map = kmap(page);
+		copy_size = min(PAGE_CACHE_SIZE - page_offset,
+				size - extent_offset);
+
 		em->block_start = EXTENT_MAP_INLINE;
 		em->block_end = EXTENT_MAP_INLINE;
-		em->start = extent_start;
-		em->end = extent_end;
+		em->start = extent_start + extent_offset;
+		em->end = (em->start + copy_size -1) |
+			((u64)root->sectorsize -1);
 
 		if (!page) {
 			goto insert;
 		}
 
-		ptr = btrfs_file_extent_inline_start(item);
-		map = kmap(page);
-		read_extent_buffer(leaf, map + page_offset, ptr, size);
+		read_extent_buffer(leaf, map + page_offset, ptr, copy_size);
 		/*
-		memset(map + page_offset + size, 0,
-		       root->sectorsize - (page_offset + size));
+		memset(map + page_offset + copy_size, 0,
+		       PAGE_CACHE_SIZE - copy_size - page_offset);
 		       */
 		flush_dcache_page(page);
 		kunmap(page);
-		set_extent_uptodate(em_tree, extent_start,
-				    extent_end, GFP_NOFS);
+		set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS);
 		goto insert;
 	} else {
 		printk("unkknown found_type %d\n", found_type);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5c4370f3a5b..f94aa1f97a0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -280,7 +280,6 @@ error_s:
 error_bdev:
 	close_bdev_excl(bdev);
 error:
-printk("get_sb failed\n");
 	return error;
 }
 /* end copy & paste */
@@ -295,7 +294,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
 			btrfs_fill_super, mnt,
 			subvol_name ? subvol_name : "default");
-printk("btrfs_get_sb returns %d\n", ret);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 810191ff3087e8143b41a944fcf4fd8c693f00e3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:18:55 -0400
Subject: Btrfs: extent_map optimizations to cut down on CPU usage

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      |  1 +
 fs/btrfs/ctree.h      | 74 ++++++++++-----------------------------------------
 fs/btrfs/disk-io.c    | 20 +++++++-------
 fs/btrfs/extent_map.c | 33 ++++++++++-------------
 fs/btrfs/extent_map.h |  2 +-
 fs/btrfs/file.c       |  1 -
 6 files changed, 40 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f691542dfa1..54a5d006c56 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -463,6 +463,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 static int check_block(struct btrfs_root *root, struct btrfs_path *path,
 			int level)
 {
+	return 0;
 #if 0
 	struct extent_buffer *buf = path->nodes[level];
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 18994c53106..a942a242722 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,6 +19,8 @@
 #ifndef __BTRFS__
 #define __BTRFS__
 
+#include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
@@ -499,70 +501,22 @@ static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 {									\
-	int err;							\
-	char *map_token;						\
-	char *kaddr;							\
-	unsigned long map_start;					\
-	unsigned long map_len;						\
+	char *kaddr = kmap_atomic(eb->first_page, KM_USER0);		\
 	unsigned long offset = offsetof(type, member);			\
-	int unmap_on_exit = (eb->map_token == NULL);			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-	    kaddr = eb->kaddr;						\
-	    map_start = eb->map_start;					\
-	    err = 0;							\
-	} else {							\
-		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-	}								\
-	if (!err) {							\
-		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
-					       map_start);		\
-		u##bits res = le##bits##_to_cpu(*tmp);			\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-		return res;						\
-	} else {							\
-		__le##bits res;						\
-		read_eb_member(eb, NULL, type, member, &res);		\
-		return le##bits##_to_cpu(res);				\
-	}								\
+	u##bits res;							\
+	__le##bits *tmp = (__le##bits *)(kaddr + offset);		\
+	res = le##bits##_to_cpu(*tmp);					\
+	kunmap_atomic(kaddr, KM_USER0);					\
+	return res;							\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    u##bits val)			\
 {									\
-	int err;							\
-	char *map_token;						\
-	char *kaddr;							\
-	unsigned long map_start;					\
-	unsigned long map_len;						\
+	char *kaddr = kmap_atomic(eb->first_page, KM_USER0);		\
 	unsigned long offset = offsetof(type, member);			\
-	int unmap_on_exit = (eb->map_token == NULL);			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-	    kaddr = eb->kaddr;						\
-	    map_start = eb->map_start;					\
-	    err = 0;							\
-	} else {							\
-		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-	}								\
-	if (!err) {							\
-		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
-					       map_start);		\
-		*tmp = cpu_to_le##bits(val);				\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-	} else {							\
-		val = cpu_to_le##bits(val);				\
-		write_eb_member(eb, NULL, type, member, &val);		\
-	}								\
+	__le##bits *tmp = (__le##bits *)(kaddr + offset);		\
+	*tmp = cpu_to_le##bits(val);					\
+	kunmap_atomic(kaddr, KM_USER0);					\
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
@@ -659,13 +613,13 @@ static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
 	btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
 }
 
-static unsigned long btrfs_node_key_ptr_offset(int nr)
+static inline unsigned long btrfs_node_key_ptr_offset(int nr)
 {
 	return offsetof(struct btrfs_node, ptrs) +
 		sizeof(struct btrfs_key_ptr) * nr;
 }
 
-static void btrfs_node_key(struct extent_buffer *eb,
+static inline void btrfs_node_key(struct extent_buffer *eb,
 			   struct btrfs_disk_key *disk_key, int nr)
 {
 	unsigned long ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fd7e6c182b9..16f0260fca6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -644,20 +644,20 @@ int close_ctree(struct btrfs_root *root)
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->last_page->mapping->host;
+	struct inode *btree_inode = buf->first_page->mapping->host;
 	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->last_page->mapping->host;
+	struct inode *btree_inode = buf->first_page->mapping->host;
 	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
 					  buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
@@ -678,7 +678,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
@@ -686,7 +686,7 @@ void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 
 void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
@@ -695,7 +695,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
@@ -703,7 +703,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf)
 
 int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -712,7 +712,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -721,7 +721,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 
 int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
 		     buf->start, buf->start + buf->len - 1,
@@ -730,7 +730,7 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 
 int btrfs_read_buffer(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->last_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 					buf, 1);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 85b28a6a4e0..f8aaba8a30a 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1963,9 +1963,9 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	struct page *p;
 
 	if (i == 0)
-		return eb->last_page;
+		return eb->first_page;
 	i += eb->start >> PAGE_CACHE_SHIFT;
-	p = find_get_page(eb->last_page->mapping, i);
+	p = find_get_page(eb->first_page->mapping, i);
 	page_cache_release(p);
 	return p;
 }
@@ -2037,7 +2037,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		}
 		set_page_extent_mapped(p);
 		if (i == 0)
-			eb->last_page = p;
+			eb->first_page = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2083,7 +2083,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 		}
 		set_page_extent_mapped(p);
 		if (i == 0)
-			eb->last_page = p;
+			eb->first_page = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2169,7 +2169,15 @@ EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
 int set_extent_buffer_dirty(struct extent_map_tree *tree,
 			     struct extent_buffer *eb)
 {
-	return set_range_dirty(tree, eb->start, eb->start + eb->len - 1);
+	unsigned long i;
+	unsigned long num_pages;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+	}
+	return set_extent_dirty(tree, eb->start,
+				eb->start + eb->len - 1, GFP_NOFS);
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
@@ -2317,16 +2325,11 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
 	unsigned long end_i = (start_offset + start + min_len) >>
-				PAGE_CACHE_SHIFT;
+		PAGE_CACHE_SHIFT;
 
 	if (i != end_i)
 		return -EINVAL;
 
-	if (start >= eb->len) {
-		printk("bad start in map eb start %Lu len %lu caller start %lu min %lu\n", eb->start, eb->len, start, min_len);
-		WARN_ON(1);
-	}
-
 	if (i == 0) {
 		offset = start_offset;
 		*map_start = 0;
@@ -2353,14 +2356,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	int err;
 	int save = 0;
 	if (eb->map_token) {
-		if (start >= eb->map_start &&
-		    start + min_len <= eb->map_start + eb->map_len) {
-			*token = eb->map_token;
-			*map = eb->kaddr;
-			*map_start = eb->map_start;
-			*map_len = eb->map_len;
-			return 0;
-		}
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 52a8b9394fc..f1dc28d260e 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -75,7 +75,7 @@ struct extent_buffer {
 	char *kaddr;
 	unsigned long map_start;
 	unsigned long map_len;
-	struct page *last_page;
+	struct page *first_page;
 	struct list_head lru;
 	atomic_t refs;
 	int flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1af2b6534da..fe28404ae7f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -289,7 +289,6 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > 16384 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 
-- 
cgit v1.2.3


From 0f82731fc56448c2733f58e1f5db6c2cbfc90652 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:18:56 -0400
Subject: Breakout BTRFS_SETGET_FUNCS into a separate C file, the inlines were
 too big.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |  2 +-
 fs/btrfs/ctree.c        |  1 -
 fs/btrfs/ctree.h        | 74 ++-----------------------------------
 fs/btrfs/extent-tree.c  |  2 +-
 fs/btrfs/struct-funcs.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/tree-defrag.c  |  2 +-
 6 files changed, 104 insertions(+), 74 deletions(-)
 create mode 100644 fs/btrfs/struct-funcs.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a3b51085d7f..551743be5f0 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o
+	   extent_map.o sysfs.o struct-funcs.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54a5d006c56..0c6ed17ac1b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/highmem.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a942a242722..d1c6f023a30 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -426,77 +426,11 @@ struct btrfs_root {
 			    offsetof(type, member),			\
 			   sizeof(((type *)0)->member)))
 
+#ifndef BTRFS_SETGET_FUNCS
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
-static inline u##bits btrfs_##name(struct extent_buffer *eb,		\
-				   type *s)				\
-{									\
-	int err;							\
-	char *map_token;						\
-	char *kaddr;							\
-	int unmap_on_exit = (eb->map_token == NULL);			\
-	unsigned long map_start;					\
-	unsigned long map_len;						\
-	unsigned long offset = (unsigned long)s +			\
-				offsetof(type, member);			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-	    kaddr = eb->kaddr;						\
-	    map_start = eb->map_start;					\
-	    err = 0;							\
-	} else {							\
-		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-	}								\
-	if (!err) {							\
-		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
-					       map_start);		\
-		u##bits res = le##bits##_to_cpu(*tmp);			\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-		return res;						\
-	} else {							\
-		__le##bits res;						\
-		read_eb_member(eb, s, type, member, &res);		\
-		return le##bits##_to_cpu(res);				\
-	}								\
-}									\
-static inline void btrfs_set_##name(struct extent_buffer *eb,		\
-				    type *s, u##bits val)		\
-{									\
-	int err;							\
-	char *map_token;						\
-	char *kaddr;							\
-	unsigned long map_start;					\
-	unsigned long map_len;						\
-	int unmap_on_exit = (eb->map_token == NULL);			\
-	unsigned long offset = (unsigned long)s +			\
-				offsetof(type, member);			\
-	if (eb->map_token && offset >= eb->map_start &&			\
-	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
-	    eb->map_len) {						\
-	    kaddr = eb->kaddr;						\
-	    map_start = eb->map_start;					\
-	    err = 0;							\
-	} else {							\
-		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
-				&map_token, &kaddr,			\
-				&map_start, &map_len, KM_USER1);	\
-	}								\
-	if (!err) {							\
-		__le##bits *tmp = (__le##bits *)(kaddr + offset -	\
-					       map_start);		\
-		*tmp = cpu_to_le##bits(val);				\
-		if (unmap_on_exit)					\
-			unmap_extent_buffer(eb, map_token, KM_USER1);	\
-	} else {							\
-		val = cpu_to_le##bits(val);				\
-		write_eb_member(eb, s, type, member, &val);		\
-	}								\
-}
+u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
+#endif
 
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 525fa845d61..e4e68ea9648 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1443,7 +1443,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = 0;
+			path->nodes[i] = NULL;
 		}
 	}
 out:
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
new file mode 100644
index 00000000000..cf68fcf9b55
--- /dev/null
+++ b/fs/btrfs/struct-funcs.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/highmem.h>
+#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
+u##bits btrfs_##name(struct extent_buffer *eb,				\
+				   type *s)				\
+{									\
+	unsigned long offset = (unsigned long)s +			\
+				offsetof(type, member);			\
+	__le##bits *tmp;						\
+	/* ugly, but we want the fast path here */			\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+		tmp = (__le##bits *)(eb->kaddr + offset -		\
+				     eb->map_start);			\
+		return le##bits##_to_cpu(*tmp);				\
+	}								\
+	{								\
+		int err;						\
+		char *map_token;					\
+		char *kaddr;						\
+		int unmap_on_exit = (eb->map_token == NULL);		\
+		unsigned long map_start;				\
+		unsigned long map_len;					\
+		__le##bits res;						\
+		err = map_extent_buffer(eb, offset,			\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER1);	\
+		if (err) {						\
+			read_eb_member(eb, s, type, member, &res);	\
+			return le##bits##_to_cpu(res);			\
+		}							\
+		tmp = (__le##bits *)(kaddr + offset - map_start);	\
+		res = le##bits##_to_cpu(*tmp);				\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
+		return res;						\
+	}								\
+}									\
+void btrfs_set_##name(struct extent_buffer *eb,				\
+				    type *s, u##bits val)		\
+{									\
+	unsigned long offset = (unsigned long)s +			\
+				offsetof(type, member);			\
+	__le##bits *tmp;						\
+	/* ugly, but we want the fast path here */			\
+	if (eb->map_token && offset >= eb->map_start &&			\
+	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
+	    eb->map_len) {						\
+		tmp = (__le##bits *)(eb->kaddr + offset -		\
+				     eb->map_start);			\
+		*tmp = cpu_to_le##bits(val);				\
+		return;							\
+	}								\
+	{								\
+		int err;						\
+		char *map_token;					\
+		char *kaddr;						\
+		int unmap_on_exit = (eb->map_token == NULL);		\
+		unsigned long map_start;				\
+		unsigned long map_len;					\
+		err = map_extent_buffer(eb, offset,			\
+			        sizeof(((type *)0)->member),		\
+				&map_token, &kaddr,			\
+				&map_start, &map_len, KM_USER1);	\
+		if (err) {						\
+			val = cpu_to_le##bits(val);			\
+			write_eb_member(eb, s, type, member, &val);	\
+			return;						\
+		}							\
+		tmp = (__le##bits *)(kaddr + offset - map_start);	\
+		*tmp = cpu_to_le##bits(val);				\
+		if (unmap_on_exit)					\
+			unmap_extent_buffer(eb, map_token, KM_USER1);	\
+	}								\
+}
+
+#include "ctree.h"
+
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index d23216aa4ab..0d1a1be0995 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -239,7 +239,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = 0;
+			path->nodes[i] = NULL;
 		}
 	}
 out:
-- 
cgit v1.2.3


From 19c00ddcc31ad4bdfb86b57085e06d6135b9b1d7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:19:22 -0400
Subject: Btrfs: Add back metadata checksumming

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/disk-io.c     | 215 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent-tree.c |   4 +
 fs/btrfs/extent_map.c  |  83 +++++++++++++++----
 fs/btrfs/extent_map.h  |  18 ++++-
 fs/btrfs/inode.c       |   4 -
 6 files changed, 245 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d1c6f023a30..1af0a966f85 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -310,9 +310,11 @@ struct btrfs_fs_info {
 	struct extent_buffer *sb_buffer;
 	struct super_block *sb;
 	struct inode *btree_inode;
+	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
 	struct list_head trans_list;
+	struct list_head hashers;
 	struct list_head dead_roots;
 	struct delayed_work trans_work;
 	struct kobject super_kobj;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 16f0260fca6..1176e5420c5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -60,7 +60,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 	struct extent_buffer *eb;
 
 	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
-				 bytenr, blocksize, GFP_NOFS);
+				 bytenr, blocksize, NULL, GFP_NOFS);
 	return eb;
 }
 
@@ -99,10 +99,102 @@ out:
 	return em;
 }
 
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+{
+	return crc32c(seed, data, len);
+}
+
+void btrfs_csum_final(u32 crc, char *result)
+{
+	*(__le32 *)result = ~cpu_to_le32(crc);
+}
+
+static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+			   int verify)
+{
+	char result[BTRFS_CRC32_SIZE];
+	unsigned long len;
+	unsigned long cur_len;
+	unsigned long offset = BTRFS_CSUM_SIZE;
+	char *map_token = NULL;
+	char *kaddr;
+	unsigned long map_start;
+	unsigned long map_len;
+	int err;
+	u32 crc = ~(u32)0;
+
+	len = buf->len - offset;
+	while(len > 0) {
+		err = map_private_extent_buffer(buf, offset, 32,
+					&map_token, &kaddr,
+					&map_start, &map_len, KM_USER0);
+		if (err) {
+			printk("failed to map extent buffer! %lu\n",
+			       offset);
+			return 1;
+		}
+		cur_len = min(len, map_len - (offset - map_start));
+		crc = btrfs_csum_data(root, kaddr + offset - map_start,
+				      crc, cur_len);
+		len -= cur_len;
+		offset += cur_len;
+		unmap_extent_buffer(buf, map_token, KM_USER0);
+	}
+	btrfs_csum_final(crc, result);
+
+	if (verify) {
+		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
+			printk("btrfs: %s checksum verify failed on %llu\n",
+			       root->fs_info->sb->s_id,
+			       buf->start);
+			return 1;
+		}
+	} else {
+		write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
+	}
+	return 0;
+}
+
+
+int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+{
+	struct extent_map_tree *tree;
+	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 found_start;
+	int found_level;
+	unsigned long len;
+	struct extent_buffer *eb;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+	len = page->private >> 2;
+	if (len == 0) {
+		WARN_ON(1);
+	}
+	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1);
+	found_start = btrfs_header_bytenr(eb);
+	if (found_start != start) {
+		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
+		       start, found_start, len);
+	}
+	found_level = btrfs_header_level(eb);
+	csum_tree_block(root, eb, 0);
+	free_extent_buffer(eb);
+out:
+	return 0;
+}
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_map_tree *tree;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+
+	csum_dirty_buffer(root, page);
 	return extent_write_full_page(tree, page, btree_get_extent, wbc);
 }
 int btree_readpage(struct file *file, struct page *page)
@@ -117,7 +209,6 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 	struct extent_map_tree *tree;
 	int ret;
 
-	BUG_ON(page->private != 1);
 	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(tree, page);
 	if (ret == 1) {
@@ -136,46 +227,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	btree_releasepage(page, GFP_NOFS);
 }
 
-int btrfs_csum_data(struct btrfs_root * root, char *data, size_t len,
-		    char *result)
-{
-	return 0;
-#if 0
-	u32 crc;
-	crc = crc32c(0, data, len);
-	memcpy(result, &crc, BTRFS_CRC32_SIZE);
-	return 0;
-#endif
-}
-
-#if 0
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
-			   int verify)
-{
-	return 0;
-	char result[BTRFS_CRC32_SIZE];
-	int ret;
-	struct btrfs_node *node;
-
-	ret = btrfs_csum_data(root, bh->b_data + BTRFS_CSUM_SIZE,
-			      bh->b_size - BTRFS_CSUM_SIZE, result);
-	if (ret)
-		return ret;
-	if (verify) {
-		if (memcmp(bh->b_data, result, BTRFS_CRC32_SIZE)) {
-			printk("btrfs: %s checksum verify failed on %llu\n",
-			       root->fs_info->sb->s_id,
-			       (unsigned long long)bh_blocknr(bh));
-			return 1;
-		}
-	} else {
-		node = btrfs_buffer_node(bh);
-		memcpy(node->header.csum, result, BTRFS_CRC32_SIZE);
-	}
-	return 0;
-}
-#endif
-
 #if 0
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
@@ -215,7 +266,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
-				 buf, 0);
+				 buf, 0, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
@@ -225,12 +276,29 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_map_tree *extent_tree;
+	int ret;
+
+	extent_tree = &BTRFS_I(btree_inode)->extent_tree;
 
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
-				 buf, 1);
+				 buf, 0, 1);
+	if (buf->flags & EXTENT_CSUM) {
+		return buf;
+	}
+	if (test_range_bit(extent_tree, buf->start, buf->start + buf->len - 1,
+			   EXTENT_CSUM, 1)) {
+		buf->flags |= EXTENT_CSUM;
+		return buf;
+	}
+	ret = csum_tree_block(root, buf, 1);
+	set_extent_bits(extent_tree, buf->start,
+			buf->start + buf->len - 1,
+			EXTENT_CSUM, GFP_NOFS);
+	buf->flags |= EXTENT_CSUM;
 	return buf;
 }
 
@@ -251,13 +319,6 @@ int wait_on_tree_block_writeback(struct btrfs_root *root,
 	return 0;
 }
 
-int set_tree_block_dirty(struct btrfs_root *root, struct extent_buffer *buf)
-{
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
-	return 0;
-}
-
 static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
@@ -416,7 +477,24 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 
 	return root;
 }
-
+#if 0
+static int add_hasher(struct btrfs_fs_info *info, char *type) {
+	struct btrfs_hasher *hasher;
+
+	hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
+	if (!hasher)
+		return -ENOMEM;
+	hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
+	if (!hasher->hash_tfm) {
+		kfree(hasher);
+		return -EINVAL;
+	}
+	spin_lock(&info->hash_lock);
+	list_add(&hasher->list, &info->hashers);
+	spin_unlock(&info->hash_lock);
+	return 0;
+}
+#endif
 struct btrfs_root *open_ctree(struct super_block *sb)
 {
 	u32 sectorsize;
@@ -440,6 +518,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
+	INIT_LIST_HEAD(&fs_info->hashers);
+	spin_lock_init(&fs_info->hash_lock);
+
 	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
 	init_completion(&fs_info->kobj_unregister);
 	sb_set_blocksize(sb, 4096);
@@ -479,6 +560,14 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
+#if 0
+	ret = add_hasher(fs_info, "crc32c");
+	if (ret) {
+		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
+		err = -ENOMEM;
+		goto fail_iput;
+	}
+#endif
 	__setup_root(512, 512, 512, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -509,25 +598,21 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	i_size_write(fs_info->btree_inode,
 		     btrfs_super_total_bytes(disk_super));
 
-
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
 		printk("btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
+
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
+
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
-#if 0
-	btrfs_print_leaf(tree_root, tree_root->node);
-	err = -EIO;
-	goto fail_tree_root;
-#endif
 	mutex_lock(&fs_info->fs_mutex);
 
 	ret = find_and_setup_root(tree_root, fs_info,
@@ -634,9 +719,19 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
-	extent_map_tree_cleanup(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+	extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
+#if 0
+	while(!list_empty(&fs_info->hashers)) {
+		struct btrfs_hasher *hasher;
+		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
+				    hashers);
+		list_del(&hasher->hashers);
+		crypto_free_hash(&fs_info->hash_tfm);
+		kfree(hasher);
+	}
+#endif
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
 	return 0;
@@ -733,5 +828,5 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
-					buf, 1);
+					buf, 0, 1);
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e4e68ea9648..6b2f2b414fa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1178,6 +1178,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	btrfs_set_buffer_uptodate(buf);
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
+	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree,
+			buf->start, buf->start + buf->len - 1,
+			EXTENT_CSUM, GFP_NOFS);
+	buf->flags |= EXTENT_CSUM;
 	btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f8aaba8a30a..2a8bc4bd43a 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -81,7 +81,7 @@ void extent_map_tree_init(struct extent_map_tree *tree,
 }
 EXPORT_SYMBOL(extent_map_tree_init);
 
-void extent_map_tree_cleanup(struct extent_map_tree *tree)
+void extent_map_tree_empty_lru(struct extent_map_tree *tree)
 {
 	struct extent_buffer *eb;
 	while(!list_empty(&tree->buffer_lru)) {
@@ -91,7 +91,7 @@ void extent_map_tree_cleanup(struct extent_map_tree *tree)
 		free_extent_buffer(eb);
 	}
 }
-EXPORT_SYMBOL(extent_map_tree_cleanup);
+EXPORT_SYMBOL(extent_map_tree_empty_lru);
 
 struct extent_map *alloc_extent_map(gfp_t mask)
 {
@@ -1464,7 +1464,7 @@ void set_page_extent_mapped(struct page *page)
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
 		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		set_page_private(page, 1);
+		set_page_private(page, EXTENT_PAGE_PRIVATE);
 		page_cache_get(page);
 	}
 }
@@ -1979,8 +1979,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
 
 	spin_lock(&tree->lru_lock);
 	eb = find_lru(tree, start, len);
-	if (eb)
+	if (eb) {
 		goto lru_add;
+	}
 	spin_unlock(&tree->lru_lock);
 
 	if (eb) {
@@ -2007,6 +2008,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 
 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 					  u64 start, unsigned long len,
+					  struct page *page0,
 					  gfp_t mask)
 {
 	unsigned long num_pages = num_extent_pages(start, len);
@@ -2024,7 +2026,18 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	if (eb->flags & EXTENT_BUFFER_FILLED)
 		return eb;
 
-	for (i = 0; i < num_pages; i++, index++) {
+	if (page0) {
+		eb->first_page = page0;
+		i = 1;
+		index++;
+		page_cache_get(page0);
+		set_page_extent_mapped(page0);
+		set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
+				 len << 2);
+	} else {
+		i = 0;
+	}
+	for (; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
 		if (!p) {
 			WARN_ON(1);
@@ -2036,8 +2049,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i == 0)
+		if (i == 0) {
 			eb->first_page = p;
+			set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
+					 len << 2);
+		} else {
+			set_page_private(p, EXTENT_PAGE_PRIVATE);
+		}
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2057,8 +2075,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 					  gfp_t mask)
 {
 	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
@@ -2082,8 +2099,15 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
-		if (i == 0)
+
+		if (i == 0) {
 			eb->first_page = p;
+			set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
+					 len << 2);
+		} else {
+			set_page_private(p, EXTENT_PAGE_PRIVATE);
+		}
+
 		if (!PageUptodate(p))
 			uptodate = 0;
 		unlock_page(p);
@@ -2174,7 +2198,21 @@ int set_extent_buffer_dirty(struct extent_map_tree *tree,
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
+		struct page *page = extent_buffer_page(eb, i);
+		/* writepage may need to do something special for the
+		 * first page, we have to make sure page->private is
+		 * properly set.  releasepage may drop page->private
+		 * on us if the page isn't already dirty.
+		 */
+		if (i == 0) {
+			lock_page(page);
+			set_page_private(page,
+					 EXTENT_PAGE_PRIVATE_FIRST_PAGE |
+					 eb->len << 2);
+		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+		if (i == 0)
+			unlock_page(page);
 	}
 	return set_extent_dirty(tree, eb->start,
 				eb->start + eb->len - 1, GFP_NOFS);
@@ -2217,9 +2255,12 @@ int extent_buffer_uptodate(struct extent_map_tree *tree,
 EXPORT_SYMBOL(extent_buffer_uptodate);
 
 int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb, int wait)
+			     struct extent_buffer *eb,
+			     u64 start,
+			     int wait)
 {
 	unsigned long i;
+	unsigned long start_i;
 	struct page *page;
 	int err;
 	int ret = 0;
@@ -2232,9 +2273,16 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 			   EXTENT_UPTODATE, 1)) {
 		return 0;
 	}
+	if (start) {
+		WARN_ON(start < eb->start);
+		start_i = (start >> PAGE_CACHE_SHIFT) -
+			(eb->start >> PAGE_CACHE_SHIFT);
+	} else {
+		start_i = 0;
+	}
 
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
+	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (PageUptodate(page)) {
 			continue;
@@ -2260,7 +2308,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 		return ret;
 	}
 
-	for (i = 0; i < num_pages; i++) {
+	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
 		if (!PageUptodate(page)) {
@@ -2314,7 +2362,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 }
 EXPORT_SYMBOL(read_extent_buffer);
 
-static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 			       unsigned long min_len, char **token, char **map,
 			       unsigned long *map_start,
 			       unsigned long *map_len, int km)
@@ -2337,6 +2385,10 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = 0;
 		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
 	}
+	if (start + min_len >= eb->len) {
+printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
+		WARN_ON(1);
+	}
 
 	p = extent_buffer_page(eb, i);
 	WARN_ON(!PageUptodate(p));
@@ -2346,6 +2398,7 @@ static int __map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	*map_len = PAGE_CACHE_SIZE - offset;
 	return 0;
 }
+EXPORT_SYMBOL(map_private_extent_buffer);
 
 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		      unsigned long min_len,
@@ -2360,8 +2413,8 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		eb->map_token = NULL;
 		save = 1;
 	}
-	err = __map_extent_buffer(eb, start, min_len, token, map,
-				   map_start, map_len, km);
+	err = map_private_extent_buffer(eb, start, min_len, token, map,
+				       map_start, map_len, km);
 	if (!err && save) {
 		eb->map_token = *token;
 		eb->kaddr = *map;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index f1dc28d260e..39d78d31b6a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -17,8 +17,17 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_CSUM (1 << 9)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/*
+ * page->private values.  Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+
 
 struct extent_map_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
@@ -89,7 +98,7 @@ typedef struct extent_map *(get_extent_t)(struct inode *inode,
 
 void extent_map_tree_init(struct extent_map_tree *tree,
 			  struct address_space *mapping, gfp_t mask);
-void extent_map_tree_cleanup(struct extent_map_tree *tree);
+void extent_map_tree_empty_lru(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 end);
 int add_extent_mapping(struct extent_map_tree *tree,
@@ -143,13 +152,14 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 					  u64 start, unsigned long len,
+					  struct page *page0,
 					  gfp_t mask);
 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 					 u64 start, unsigned long len,
 					  gfp_t mask);
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb, int wait);
+			     struct extent_buffer *eb, u64 start, int wait);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -187,5 +197,9 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **token, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 84f496c838c..80f84b48a70 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1713,10 +1713,6 @@ static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 	struct extent_map_tree *tree;
 	int ret;
 
-	if (page->private != 1) {
-		WARN_ON(1);
-		return try_to_free_buffers(page);
-	}
 	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(tree, page);
 	if (ret == 1) {
-- 
cgit v1.2.3


From 0f1ebbd159baade0417f8f62f0cd6810cc950832 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:19:46 -0400
Subject: Btrfs: Large block related defrag optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  4 +---
 fs/btrfs/tree-defrag.c | 23 ++++++++++++++++++++++-
 2 files changed, 23 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0c6ed17ac1b..e1557e573d7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -217,11 +217,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (btrfs_buffer_defrag_done(parent))
-		return 0;
+	parent_level = btrfs_header_level(parent);
 
 	parent_nritems = btrfs_header_nritems(parent);
-	parent_level = btrfs_header_level(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
 
 	start_slot = 0;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 0d1a1be0995..d31c21ac6f2 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -118,7 +118,6 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
 	btrfs_clear_buffer_defrag(path->nodes[*level]);
-	btrfs_clear_buffer_defrag_done(path->nodes[*level]);
 
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
@@ -148,6 +147,26 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			root->defrag_level = i;
 			return 0;
 		} else {
+			if (*level > 1 && path->nodes[*level] != root->node) {
+				struct extent_buffer *next;
+				u64 last;
+				int ret;
+				ret = btrfs_cow_block(trans, root,
+						      path->nodes[*level],
+						      path->nodes[*level + 1],
+						      path->slots[*level + 1],
+						      &next);
+				BUG_ON(ret);
+				path->nodes[*level] = next;
+				last = next->start;
+				ret = btrfs_realloc_node(trans, root, next,
+							 cache_only, &last);
+				BUG_ON(ret);
+
+				if (root == root->fs_info->extent_root)
+					btrfs_extent_post_op(trans, root);
+			}
+
 			btrfs_clear_buffer_defrag(path->nodes[*level]);
 			btrfs_clear_buffer_defrag_done(path->nodes[*level]);
 			free_extent_buffer(path->nodes[*level]);
@@ -183,9 +202,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 	level = btrfs_header_level(root->node);
 	orig_level = level;
+
 	if (level == 0) {
 		goto out;
 	}
+
 	if (root->defrag_progress.objectid == 0) {
 		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-- 
cgit v1.2.3


From 304fced668079ce222200d5742657d34cc44af09 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:21:17 -0400
Subject: Btrfs: balance_dirty_pages_ratelimited is causing problems, use nr ==
 1 always

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1176e5420c5..3878772fcff 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -768,7 +768,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
 	balance_dirty_pages_ratelimited_nr(
-			root->fs_info->btree_inode->i_mapping, nr);
+			root->fs_info->btree_inode->i_mapping, 1);
 }
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-- 
cgit v1.2.3


From cf786e79e3f96f4bc037c69e1a7afb5fb85c6111 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:22:11 -0400
Subject: Btrfs: Defrag: only walk into nodes with the defrag bit set

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 1 -
 fs/btrfs/tree-defrag.c | 8 ++++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e1557e573d7..984f4745440 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		*last_ret = search_start;
 		if (parent_level == 1)
 			btrfs_clear_buffer_defrag(tmp);
-		btrfs_set_buffer_defrag_done(tmp);
 		free_extent_buffer(tmp);
 	}
 	return err;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index d31c21ac6f2..f86eccf511b 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -87,8 +87,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		if (cache_only) {
 			next = btrfs_find_tree_block(root, bytenr,
 					   btrfs_level_size(root, *level - 1));
-			/* FIXME, test for defrag */
-			if (!next || !btrfs_buffer_uptodate(next)) {
+			if (!next || !btrfs_buffer_uptodate(next) ||
+			    !btrfs_buffer_defrag(next)) {
 				free_extent_buffer(next);
 				path->slots[*level]++;
 				continue;
@@ -147,7 +147,8 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			root->defrag_level = i;
 			return 0;
 		} else {
-			if (*level > 1 && path->nodes[*level] != root->node) {
+			if (*level > 1 && path->nodes[*level] != root->node &&
+			    btrfs_buffer_defrag(path->nodes[*level])) {
 				struct extent_buffer *next;
 				u64 last;
 				int ret;
@@ -168,7 +169,6 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			}
 
 			btrfs_clear_buffer_defrag(path->nodes[*level]);
-			btrfs_clear_buffer_defrag_done(path->nodes[*level]);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
-- 
cgit v1.2.3


From ff79f8190b6e955ff7a71faf804a3017d526e657 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:22:25 -0400
Subject: Btrfs: Add back file data checksumming

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c    |  1 +
 fs/btrfs/disk-io.h    |  2 ++
 fs/btrfs/extent_map.c |  3 +++
 fs/btrfs/file-item.c  | 17 +++++++++--------
 fs/btrfs/file.c       |  1 +
 fs/btrfs/inode.c      | 21 ++++++++++-----------
 6 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3878772fcff..5accced2858 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -594,6 +594,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	tree_root->nodesize = nodesize;
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
+	sb_set_blocksize(sb, sectorsize);
 
 	i_size_write(fs_info->btree_inode,
 		     btrfs_super_total_bytes(disk_super));
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 190b07bc642..33d2ccfd74d 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -58,4 +58,6 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
 int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf);
+u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+void btrfs_csum_final(u32 crc, char *result);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2a8bc4bd43a..e87e476dca9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2031,6 +2031,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		i = 1;
 		index++;
 		page_cache_get(page0);
+		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
 		set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
 				 len << 2);
@@ -2049,6 +2050,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
+		mark_page_accessed(p);
 		if (i == 0) {
 			eb->first_page = p;
 			set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
@@ -2099,6 +2101,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 			goto fail;
 		}
 		set_page_extent_mapped(p);
+		mark_page_accessed(p);
 
 		if (i == 0) {
 			eb->first_page = p;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9a65e97a4e2..10a4c740860 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -136,27 +136,27 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  u64 objectid, u64 offset,
 			  char *data, size_t len)
 {
-	return 0;
-#if 0
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
-	struct extent_buffer *leaf;
+	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
+	u32 csum_result = ~(u32)0;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
 	file_key.objectid = objectid;
 	file_key.offset = offset;
-	file_key.flags = 0;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
 
 	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
-	if (!IS_ERR(item))
+	if (!IS_ERR(item)) {
+		leaf = path->nodes[0];
 		goto found;
+	}
 	ret = PTR_ERR(item);
 	if (ret == -EFBIG) {
 		u32 item_size;
@@ -226,14 +226,15 @@ csum:
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * BTRFS_CRC32_SIZE);
 found:
-	/* FIXME!!!!!!!!!!!! */
-	ret = btrfs_csum_data(root, data, len, &item->csum);
+	csum_result = btrfs_csum_data(root, data, csum_result, len);
+	btrfs_csum_final(csum_result, (char *)&csum_result);
+	write_extent_buffer(leaf, &csum_result, (unsigned long)item,
+			    BTRFS_CRC32_SIZE);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 fail:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
-#endif
 }
 
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fe28404ae7f..1af2b6534da 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -289,6 +289,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    inline_size > 16384 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 80f84b48a70..ac69f6926af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,7 +135,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
-	u64 private;
+	u32 csum;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
@@ -145,11 +145,12 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		/* a csum that isn't present is a preallocated region. */
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
-		private = 0;
+		csum = 0;
 		goto out;
 	}
-	memcpy((char *)&private, &item->csum, BTRFS_CRC32_SIZE);
-	set_state_private(em_tree, start, private);
+	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
+			   BTRFS_CRC32_SIZE);
+	set_state_private(em_tree, start, csum);
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -165,21 +166,19 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	char *kaddr;
 	u64 private;
 	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u32 csum = ~(u32)0;
 
 	ret = get_state_private(em_tree, start, &private);
 	kaddr = kmap_atomic(page, KM_IRQ0);
 	if (ret) {
 		goto zeroit;
 	}
-	/*
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	char csum[BTRFS_CRC32_SIZE];
-	ret = btrfs_csum_data(root, kaddr + offset, end - start + 1, csum);
-	BUG_ON(ret);
-	if (memcmp(csum, &private, BTRFS_CRC32_SIZE)) {
+	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
+	btrfs_csum_final(csum, (char *)&csum);
+	if (csum != private) {
 		goto zeroit;
 	}
-	*/
 	kunmap_atomic(kaddr, KM_IRQ0);
 	return 0;
 
-- 
cgit v1.2.3


From a6b6e75e096f436f0cc56edf5bca96301e194491 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Oct 2007 16:22:39 -0400
Subject: Btrfs: Defrag only leaves, and only when the parent node has a single
 objectid

This allows us to defrag huge directories, but skip the expensive defrag
case in more common usage, where it does not help as much.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 22 ++++++++++++++++------
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/transaction.c |  2 --
 fs/btrfs/tree-defrag.c | 36 +++++-------------------------------
 4 files changed, 23 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 984f4745440..74fec6b83a8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/sched.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -190,7 +191,8 @@ static int should_defrag_leaf(struct extent_buffer *leaf)
 
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int cache_only, u64 *last_ret)
+		       int start_slot, int cache_only, u64 *last_ret,
+		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
 	struct extent_buffer *tmp;
@@ -199,7 +201,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	u64 last_block = 0;
 	u64 other;
 	u32 parent_nritems;
-	int start_slot;
 	int end_slot;
 	int i;
 	int err = 0;
@@ -221,15 +222,24 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
-
-	start_slot = 0;
 	end_slot = parent_nritems;
 
 	if (parent_nritems == 1)
 		return 0;
 
+	if (root != root->fs_info->extent_root) {
+		struct btrfs_key first_key;
+		struct btrfs_key last_key;
+
+		btrfs_node_key_to_cpu(parent, &first_key, 0);
+		btrfs_node_key_to_cpu(parent, &last_key, parent_nritems - 1);
+		if (first_key.objectid != last_key.objectid)
+			return 0;
+	}
+
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
+
 		blocknr = btrfs_node_blockptr(parent, i);
 		if (last_block == 0)
 			last_block = blocknr;
@@ -898,7 +908,7 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	u32 blocksize;
 	u32 nscan = 0;
 
-	if (level == 0)
+	if (level != 1)
 		return;
 
 	if (!path->nodes[level])
@@ -2370,7 +2380,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+		if (0 && used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1af0a966f85..37bccb1a9a7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -913,7 +913,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int cache_only, u64 *last_ret);
+		       int start_slot, int cache_only, u64 *last_ret,
+		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 67e4aca36a6..bdfe05cf26c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -353,7 +353,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 
 	if (root->defrag_running)
 		return 0;
-
 	trans = btrfs_start_transaction(root, 1);
 	while (1) {
 		root->defrag_running = 1;
@@ -361,7 +360,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&info->fs_mutex);
-
 		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f86eccf511b..b02355a7b14 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -76,7 +76,9 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		if (*level == 1) {
 			ret = btrfs_realloc_node(trans, root,
 						 path->nodes[*level],
-						 cache_only, last_ret);
+						 path->slots[*level],
+						 cache_only, last_ret,
+						 &root->defrag_progress);
 			if (is_extent)
 				btrfs_extent_post_op(trans, root);
 
@@ -100,10 +102,6 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
 		BUG_ON(ret);
-		ret = btrfs_realloc_node(trans, root, next, cache_only,
-					 last_ret);
-		BUG_ON(ret);
-
 		if (is_extent)
 			btrfs_extent_post_op(trans, root);
 
@@ -122,8 +120,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
-	WARN_ON(ret);
-	return 0;
+	WARN_ON(ret && ret != -EAGAIN);
+	return ret;
 }
 
 static int defrag_walk_up(struct btrfs_trans_handle *trans,
@@ -147,27 +145,6 @@ static int defrag_walk_up(struct btrfs_trans_handle *trans,
 			root->defrag_level = i;
 			return 0;
 		} else {
-			if (*level > 1 && path->nodes[*level] != root->node &&
-			    btrfs_buffer_defrag(path->nodes[*level])) {
-				struct extent_buffer *next;
-				u64 last;
-				int ret;
-				ret = btrfs_cow_block(trans, root,
-						      path->nodes[*level],
-						      path->nodes[*level + 1],
-						      path->slots[*level + 1],
-						      &next);
-				BUG_ON(ret);
-				path->nodes[*level] = next;
-				last = next->start;
-				ret = btrfs_realloc_node(trans, root, next,
-							 cache_only, &last);
-				BUG_ON(ret);
-
-				if (root == root->fs_info->extent_root)
-					btrfs_extent_post_op(trans, root);
-			}
-
 			btrfs_clear_buffer_defrag(path->nodes[*level]);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -211,9 +188,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 		BUG_ON(ret);
-		ret = btrfs_realloc_node(trans, root, root->node, cache_only,
-					 &last_ret);
-		BUG_ON(ret);
 		path->nodes[level] = root->node;
 		path->slots[level] = 0;
 		if (is_extent)
-- 
cgit v1.2.3


From 7936ca3883e5fef8ce5cc367a4356ad5fed67180 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 09:22:41 -0400
Subject: Btrfs: Default to 8k max packed tails

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 fs/btrfs/file.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 74fec6b83a8..54967099a8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2380,7 +2380,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (0 && used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1af2b6534da..4aacf99bd97 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -289,7 +289,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > 16384 ||
+	    inline_size > 8192 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 
-- 
cgit v1.2.3


From 6d7231f7d33fc14f2d41abc3b9cb28dcb208735d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 09:22:49 -0400
Subject: Btrfs: Fix typo: owner is a 64 bit field

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 37bccb1a9a7..1d2b5415007 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -522,12 +522,12 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
 
 /* struct btrfs_extent_item */
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-BTRFS_SETGET_FUNCS(extent_owner, struct btrfs_extent_item, owner, 32);
+BTRFS_SETGET_FUNCS(extent_owner, struct btrfs_extent_item, owner, 64);
 
 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
 			 refs, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_extent_owner, struct btrfs_extent_item,
-			 owner, 32);
+			 owner, 64);
 
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
-- 
cgit v1.2.3


From 44b36eb20bb60d7da1375c1fb45784740a4dc358 Mon Sep 17 00:00:00 2001
From: Yan Zheng <yanzheng@21cn.com>
Date: Fri, 19 Oct 2007 09:22:56 -0400
Subject: Btrfs: Properly setup root key while reading the root

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5accced2858..5a7e60d7f44 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -410,6 +410,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 	read_extent_buffer(l, &root->root_item,
 	       btrfs_item_ptr_offset(l, path->slots[0]),
 	       sizeof(root->root_item));
+	memcpy(&root->root_key, location, sizeof(*location));
 	ret = 0;
 out:
 	btrfs_release_path(root, path);
-- 
cgit v1.2.3


From ae2f5411c4ce7180cca8418853db50c8e52d40db Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 19 Oct 2007 09:22:59 -0400
Subject: btrfs: 32-bit type problems

An assorted set of casts to get rid of the warnings on 32-bit archs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 2 +-
 fs/btrfs/extent-tree.c | 8 ++++----
 fs/btrfs/extent_map.c  | 6 +++---
 fs/btrfs/file.c        | 2 +-
 fs/btrfs/inode.c       | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 54967099a8e..17a322663f3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1389,7 +1389,7 @@ int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf)
 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
 	if (ret < 0) {
 		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
-		       ret, BTRFS_LEAF_DATA_SIZE(root),
+		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
 		       leaf_space_used(leaf, 0, nritems), nritems);
 	}
 	return ret;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6b2f2b414fa..7de7707d939 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -149,7 +149,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	if (ret)
 		return NULL;
 
-	block_group = (struct btrfs_block_group_cache *)ptr;
+	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 
 
 	if (block_group->key.objectid <= bytenr && bytenr <=
@@ -279,7 +279,7 @@ again:
 		if (ret)
 			break;
 
-		cache = (struct btrfs_block_group_cache *)ptr;
+		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
@@ -537,7 +537,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		if (ret)
 			break;
 
-		cache = (struct btrfs_block_group_cache *)ptr;
+		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		err = write_one_cache_group(trans, root,
 					    path, cache);
 		/*
@@ -1541,7 +1541,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				found_key.objectid + found_key.offset - 1,
 				bit | EXTENT_LOCKED, GFP_NOFS);
 		set_state_private(block_group_cache, found_key.objectid,
-				  (u64)cache);
+				  (unsigned long)cache);
 
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index e87e476dca9..89d0ebd0ea8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2654,8 +2654,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
 					       src_off_in_page));
-		cur = min(cur, (unsigned long)(PAGE_CACHE_SIZE -
-					       dst_off_in_page));
+		cur = min_t(unsigned long, cur,
+			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
 
 		copy_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
@@ -2707,7 +2707,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		if (dst_i == 0)
 			dst_off_in_page += start_offset;
 
-		cur = min(len, src_off_in_page + 1);
+		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
 		move_pages(extent_buffer_page(dst, dst_i),
 			   extent_buffer_page(dst, src_i),
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4aacf99bd97..96df1b10cb6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -188,7 +188,7 @@ insert:
 	while (size > 0) {
 		page = pages[i];
 		kaddr = kmap_atomic(page, KM_USER0);
-		cur_size = min(PAGE_CACHE_SIZE - page_offset, size);
+		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
 		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
 		kunmap_atomic(kaddr, KM_USER0);
 		page_offset = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ac69f6926af..0e1ca423e22 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1606,7 +1606,7 @@ again:
 			extent_start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		map = kmap(page);
-		copy_size = min(PAGE_CACHE_SIZE - page_offset,
+		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
 				size - extent_offset);
 
 		em->block_start = EXTENT_MAP_INLINE;
-- 
cgit v1.2.3


From 0a2118dfd4c328e8823cdcfebf839dff73ac21e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 19 Oct 2007 09:23:05 -0400
Subject: Btrfs: Fix bi_end_io() functions on > 2.6.23 kernels

On kernels newer than 2.6.23, bi_end_io() returns void and is never called
for partial completions, so the return type and the bio->bi_size check are
now only compiled in for 2.6.23 and older kernels.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 89d0ebd0ea8..c0f834563ce 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -9,6 +9,7 @@
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
+#include <linux/version.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -1274,8 +1275,12 @@ static int check_page_writeback(struct extent_map_tree *tree,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_writepage(struct bio *bio, int err)
+#else
 static int end_bio_extent_writepage(struct bio *bio,
 				   unsigned int bytes_done, int err)
+#endif
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1284,8 +1289,10 @@ static int end_bio_extent_writepage(struct bio *bio,
 	u64 end;
 	int whole_page;
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
 		return 1;
+#endif
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -1316,7 +1323,9 @@ static int end_bio_extent_writepage(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
+#endif
 }
 
 /*
@@ -1330,8 +1339,12 @@ static int end_bio_extent_writepage(struct bio *bio,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_readpage(struct bio *bio, int err)
+#else
 static int end_bio_extent_readpage(struct bio *bio,
 				   unsigned int bytes_done, int err)
+#endif
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1341,8 +1354,10 @@ static int end_bio_extent_readpage(struct bio *bio,
 	int whole_page;
 	int ret;
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
 		return 1;
+#endif
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -1382,7 +1397,9 @@ static int end_bio_extent_readpage(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
+#endif
 }
 
 /*
@@ -1390,8 +1407,12 @@ static int end_bio_extent_readpage(struct bio *bio,
  * the structs in the extent tree when done, and set the uptodate bits
  * as appropriate.
  */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+#else
 static int end_bio_extent_preparewrite(struct bio *bio,
 				       unsigned int bytes_done, int err)
+#endif
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1399,8 +1420,10 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	u64 start;
 	u64 end;
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
 		return 1;
+#endif
 
 	do {
 		struct page *page = bvec->bv_page;
@@ -1422,7 +1445,9 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
+#endif
 }
 
 static int submit_extent_page(int rw, struct extent_map_tree *tree,
-- 
cgit v1.2.3


From bbf0d0062da2b3e1e239e381f8ac20971ef1b2ed Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 19 Oct 2007 09:23:07 -0400
Subject: Btrfs: KM_IRQ0 usage in end_io handling

End-io handling is typically called with interrupts disabled, but can
also be called with them enabled. So save and disable interrupts before
using KM_IRQ0 to be completely safe.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0e1ca423e22..02d802cdd59 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -168,8 +168,10 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
+	unsigned long flags;
 
 	ret = get_state_private(em_tree, start, &private);
+	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
 	if (ret) {
 		goto zeroit;
@@ -180,6 +182,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 		goto zeroit;
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
+	local_irq_restore(flags);
 	return 0;
 
 zeroit:
@@ -188,6 +191,7 @@ zeroit:
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
+	local_irq_restore(flags);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b4a2a0edfda5a721bb4c5fa0068d2ad7ea67f98b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 19 Oct 2007 09:23:08 -0400
Subject: Btrfs: sysfs compile fixup

Just use kobject_set_name(), that works in all kernels (I think...).
Kernels newer than 2.6.23 currently fail with:

/home/axboe/git/btrfs/btrfs-unstable/sysfs.c:188: error: unknown field
'name' specified in initializer

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/sysfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 0bd1fd3d29d..1671c731800 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -184,9 +184,7 @@ static struct kobj_type btrfs_super_ktype = {
 	.release	= btrfs_super_release,
 };
 
-static struct kset btrfs_kset = {
-	.kobj	= {.name = "btrfs"},
-};
+static struct kset btrfs_kset;
 
 int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 {
@@ -249,6 +247,7 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
 int btrfs_init_sysfs()
 {
 	kobj_set_kset_s(&btrfs_kset, fs_subsys);
+	kobject_set_name(&btrfs_kset.kobj, "btrfs");
 	return kset_register(&btrfs_kset);
 }
 
-- 
cgit v1.2.3


From 59d169e2b3d69ba0beca0fa0936789080c8d61a9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 09:23:09 -0400
Subject: Btrfs: Fix read/write_extent_buffer to use KM_USER1 instead of
 KM_USER0

This avoids recursive use of KM_USER0 during btrfs_file_write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index c0f834563ce..3c81f5eab15 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2378,9 +2378,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER1);
 		memcpy(dst, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER1);
 
 		dst += cur;
 		len -= cur;
@@ -2523,9 +2523,9 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page, KM_USER1);
 		memcpy(kaddr + offset, src, cur);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr, KM_USER1);
 
 		src += cur;
 		len -= cur;
-- 
cgit v1.2.3


From 3685f791659c9f21b763ee1702ac8ca58bc20f81 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 09:23:27 -0400
Subject: Btrfs: CPU usage optimizations in push and the extent_map code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      | 44 ++++++++++++++++++++++++--------------------
 fs/btrfs/extent_map.c | 45 +++++++++++++++------------------------------
 2 files changed, 39 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 17a322663f3..e8466940fa3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1588,6 +1588,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path->nodes[1])
 		return 1;
 
+	right_nritems = btrfs_header_nritems(right);
+	if (right_nritems == 0) {
+		return 1;
+	}
+
 	left = read_tree_block(root, btrfs_node_blockptr(path->nodes[1],
 			       slot - 1), root->leafsize);
 	free_space = btrfs_leaf_free_space(root, left);
@@ -1604,18 +1609,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		free_extent_buffer(left);
 		return 1;
 	}
+
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		free_extent_buffer(left);
 		return 1;
 	}
 
-	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0) {
-		free_extent_buffer(left);
-		return 1;
-	}
-
 	for (i = 0; i < right_nritems - 1; i++) {
 		item = btrfs_item_nr(right, i);
 		if (!right->map_token) {
@@ -1772,21 +1772,25 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_disk_key disk_key;
 
 	/* first try to make some room by pushing left and right */
-	wret = push_leaf_left(trans, root, path, data_size);
-	if (wret < 0) {
-		return wret;
-	}
-	if (wret) {
+	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size);
-		if (wret < 0)
+		if (wret < 0) {
 			return wret;
-	}
-	l = path->nodes[0];
+		}
+		if (wret) {
+			wret = push_leaf_left(trans, root, path, data_size);
+			if (wret < 0)
+				return wret;
+		}
+		l = path->nodes[0];
 
-	/* did the pushes work? */
-	if (btrfs_leaf_free_space(root, l) >=
-	    sizeof(struct btrfs_item) + data_size) {
-		return 0;
+		/* did the pushes work? */
+		if (btrfs_leaf_free_space(root, l) >=
+		    sizeof(struct btrfs_item) + data_size) {
+			return 0;
+		}
+	} else {
+		l = path->nodes[0];
 	}
 
 	if (!path->nodes[1]) {
@@ -2388,13 +2392,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			extent_buffer_get(leaf);
 
-			wret = push_leaf_left(trans, root, path, 1);
+			wret = push_leaf_right(trans, root, path, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_right(trans, root, path, 1);
+				wret = push_leaf_left(trans, root, path, 1);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 3c81f5eab15..caaf0bf0e05 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1986,12 +1986,15 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
+	struct address_space *mapping;
 
 	if (i == 0)
 		return eb->first_page;
 	i += eb->start >> PAGE_CACHE_SHIFT;
-	p = find_get_page(eb->first_page->mapping, i);
-	page_cache_release(p);
+	mapping = eb->first_page->mapping;
+	read_lock_irq(&mapping->tree_lock);
+	p = radix_tree_lookup(&mapping->page_tree, i);
+	read_unlock_irq(&mapping->tree_lock);
 	return p;
 }
 
@@ -2365,9 +2368,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
-	if (i == 0)
-		offset += start_offset;
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -2475,9 +2476,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
-	if (i == 0)
-		offset += start_offset;
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -2514,9 +2513,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
-	if (i == 0)
-		offset += start_offset;
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -2548,9 +2545,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = start & ((unsigned long)PAGE_CACHE_SIZE - 1);
-	if (i == 0)
-		offset += start_offset;
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -2582,9 +2577,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 	WARN_ON(src->len != dst_len);
 
-	offset = dst_offset & ((unsigned long)PAGE_CACHE_SIZE - 1);
-	if (i == 0)
-		offset += start_offset;
+	offset = (start_offset + dst_offset) &
+		((unsigned long)PAGE_CACHE_SIZE - 1);
 
 	while(len > 0) {
 		page = extent_buffer_page(dst, i);
@@ -2664,19 +2658,14 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	}
 
 	while(len > 0) {
-		dst_off_in_page = dst_offset &
+		dst_off_in_page = (start_offset + dst_offset) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = src_offset &
+		src_off_in_page = (start_offset + src_offset) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
 
 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
 
-		if (src_i == 0)
-			src_off_in_page += start_offset;
-		if (dst_i == 0)
-			dst_off_in_page += start_offset;
-
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
 					       src_off_in_page));
 		cur = min_t(unsigned long, cur,
@@ -2723,14 +2712,10 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
-		dst_off_in_page = dst_end &
+		dst_off_in_page = (start_offset + dst_end) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = src_end &
+		src_off_in_page = (start_offset + src_end) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
-		if (src_i == 0)
-			src_off_in_page += start_offset;
-		if (dst_i == 0)
-			dst_off_in_page += start_offset;
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
-- 
cgit v1.2.3


From ff190c0c004d8e51195c7bcf5a8490aeefccbce0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 10:39:41 -0400
Subject: Btrfs: Avoid recursive KM_USER1 mappings in copy_extent_buffer

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index caaf0bf0e05..a61379230dd 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2586,9 +2586,9 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 
 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
 
-		kaddr = kmap_atomic(page, KM_USER1);
+		kaddr = kmap_atomic(page, KM_USER0);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		kunmap_atomic(kaddr, KM_USER1);
+		kunmap_atomic(kaddr, KM_USER0);
 
 		src_offset += cur;
 		len -= cur;
-- 
cgit v1.2.3


From 5ee78ac70f6c01f2a5acf662c2af975665710edc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Oct 2007 14:01:21 -0400
Subject: Btrfs: Fix split_leaf to avoid incorrect double splits

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e8466940fa3..53e40b5c084 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1865,7 +1865,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 				return ret;
 			}
 			mid = slot;
-			double_split = 1;
+			if (mid != nritems &&
+			    leaf_space_used(l, mid, nritems - mid) +
+			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+				double_split = 1;
+			}
 		}
 	}
 	nritems = nritems - mid;
-- 
cgit v1.2.3


From f578d4bd7e141dd03ca7e8695c1cc118c326e69e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 25 Oct 2007 15:42:56 -0400
Subject: Btrfs: Optimize csum insertion to create larger items when possible

This reduces the number of calls to btrfs_extend_item and greatly lowers
the cpu usage while writing large files.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  1 +
 fs/btrfs/file-item.c | 37 ++++++++++++++++++++++++++++++++++++-
 fs/btrfs/inode.c     |  2 +-
 3 files changed, 38 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1d2b5415007..70e143b1357 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -991,6 +991,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
+			  struct inode *inode,
 			  u64 objectid, u64 offset,
 			  char *data, size_t len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 10a4c740860..482a2b61532 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -133,17 +133,22 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 
 int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
+			  struct inode *inode,
 			  u64 objectid, u64 offset,
 			  char *data, size_t len)
 {
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
+	u64 next_offset = (u64)-1;
+	int found_next = 0;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
 	u32 csum_result = ~(u32)0;
+	u32 nritems;
+	u32 ins_size;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -168,7 +173,27 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 			goto insert;
 		}
 	} else {
+		int slot = path->slots[0] + 1;
 		/* we didn't find a csum item, insert one */
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		if (path->slots[0] >= nritems - 1) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 1) {
+				found_next = 1;
+			} else if (ret == 0) {
+				slot = 0;
+			} else {
+				goto insert;
+			}
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+		if (found_key.objectid != objectid ||
+		    found_key.type != BTRFS_CSUM_ITEM_KEY) {
+			found_next = 1;
+			goto insert;
+		}
+		next_offset = found_key.offset;
+		found_next = 1;
 		goto insert;
 	}
 
@@ -211,8 +236,18 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 insert:
 	btrfs_release_path(root, path);
 	csum_offset = 0;
+	if (found_next) {
+		u64 tmp = min((u64)i_size_read(inode), next_offset);
+		tmp -= offset + root->sectorsize - 1;
+		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		tmp = max((u64)1, tmp);
+		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
+		ins_size = BTRFS_CRC32_SIZE * tmp;
+	} else {
+		ins_size = BTRFS_CRC32_SIZE;
+	}
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
-				      BTRFS_CRC32_SIZE);
+				      ins_size);
 	if (ret < 0)
 		goto fail;
 	if (ret != 0) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 02d802cdd59..7efc0470abf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -118,7 +118,7 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	kaddr = kmap(page);
-	btrfs_csum_file_block(trans, root, inode->i_ino,
+	btrfs_csum_file_block(trans, root, inode, inode->i_ino,
 			      start, kaddr + offset, end - start + 1);
 	kunmap(page);
 	ret = btrfs_end_transaction(trans, root);
-- 
cgit v1.2.3


From 65555a06b4d1ae116ce223dc4b82d6068b36df96 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 25 Oct 2007 15:42:57 -0400
Subject: Btrfs: Off by one fixes in extent_map.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 10 +++++-----
 fs/btrfs/inode.c      |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a61379230dd..c44989a1e52 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2045,7 +2045,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	struct extent_buffer *eb;
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
-	int uptodate = 0;
+	int uptodate = 1;
 
 	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb || IS_ERR(eb))
@@ -2197,7 +2197,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 		 */
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
 		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
 			start = page->index << PAGE_CACHE_SHIFT;
 			end  = start + PAGE_CACHE_SIZE - 1;
 			if (test_range_bit(tree, start, end,
@@ -2265,7 +2265,7 @@ int set_extent_buffer_uptodate(struct extent_map_tree *tree,
 		page = extent_buffer_page(eb, i);
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
 		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len - 1) & (PAGE_CACHE_SIZE - 1)))) {
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
 			check_page_uptodate(tree, page);
 			continue;
 		}
@@ -2401,7 +2401,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	struct page *p;
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long end_i = (start_offset + start + min_len) >>
+	unsigned long end_i = (start_offset + start + min_len - 1) >>
 		PAGE_CACHE_SHIFT;
 
 	if (i != end_i)
@@ -2414,7 +2414,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = 0;
 		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
 	}
-	if (start + min_len >= eb->len) {
+	if (start + min_len > eb->len) {
 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
 		WARN_ON(1);
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7efc0470abf..c0743307232 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -616,6 +616,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				break;
 			}
 			btrfs_set_key_type(&key, found_type);
+			btrfs_release_path(root, path);
 			continue;
 		}
 		if (found_key.offset >= inode->i_size)
-- 
cgit v1.2.3


From cc0c55384796b422133ff1f21646835b31590f88 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 25 Oct 2007 15:42:57 -0400
Subject: Btrfs: Fix split_leaf to detect when it is extending an item

When making room for a new item, it is ok to create an empty leaf, but
when making room to extend an item, split_leaf needs to make sure it
keeps the item we're extending in the path and make sure we don't end up
with an empty leaf.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 78 ++++++++++++++++++++------------------------------------
 1 file changed, 27 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 53e40b5c084..3eb5a9f30d1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -26,7 +26,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *ins_key,
-		      struct btrfs_path *path, int data_size);
+		      struct btrfs_path *path, int data_size, int extend);
 static int push_node_left(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct extent_buffer *dst,
 			  struct extent_buffer *src);
@@ -1049,7 +1049,7 @@ again:
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
 			    sizeof(struct btrfs_item) + ins_len) {
 				int sret = split_leaf(trans, root, key,
-						      p, ins_len);
+						      p, ins_len, ret == 0);
 				BUG_ON(sret > 0);
 				if (sret)
 					return sret;
@@ -1755,7 +1755,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
  */
 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *ins_key,
-		      struct btrfs_path *path, int data_size)
+		      struct btrfs_path *path, int data_size, int extend)
 {
 	struct extent_buffer *l;
 	u32 nritems;
@@ -1768,9 +1768,13 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	int i;
 	int ret = 0;
 	int wret;
-	int double_split = 0;
+	int double_split;
+	int num_doubles = 0;
 	struct btrfs_disk_key disk_key;
 
+	if (extend)
+		space_needed = data_size;
+
 	/* first try to make some room by pushing left and right */
 	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size);
@@ -1785,12 +1789,8 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		l = path->nodes[0];
 
 		/* did the pushes work? */
-		if (btrfs_leaf_free_space(root, l) >=
-		    sizeof(struct btrfs_item) + data_size) {
+		if (btrfs_leaf_free_space(root, l) >= space_needed)
 			return 0;
-		}
-	} else {
-		l = path->nodes[0];
 	}
 
 	if (!path->nodes[1]) {
@@ -1798,6 +1798,9 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (ret)
 			return ret;
 	}
+again:
+	double_split = 0;
+	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
@@ -1815,7 +1818,6 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	write_extent_buffer(right, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(right),
 			    BTRFS_FSID_SIZE);
-
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -1844,7 +1846,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	} else {
 		if (leaf_space_used(l, 0, mid + 1) + space_needed >
 			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (slot == 0) {
+			if (!extend && slot == 0) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -1863,12 +1865,15 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 						ret = wret;
 				}
 				return ret;
-			}
-			mid = slot;
-			if (mid != nritems &&
-			    leaf_space_used(l, mid, nritems - mid) +
-			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
-				double_split = 1;
+			} else if (extend && slot == 0) {
+				mid = 1;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+					double_split = 1;
+				}
 			}
 		}
 	}
@@ -1931,39 +1936,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	BUG_ON(path->slots[0] < 0);
 
-	if (!double_split) {
-		return ret;
-	}
-
-	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-				       l->start, 0);
-	if (IS_ERR(right))
-		return PTR_ERR(right);
-
-	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
-	btrfs_set_header_bytenr(right, right->start);
-	btrfs_set_header_generation(right, trans->transid);
-	btrfs_set_header_owner(right, root->root_key.objectid);
-	btrfs_set_header_level(right, 0);
-	write_extent_buffer(right, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(right),
-			    BTRFS_FSID_SIZE);
-
-	btrfs_cpu_key_to_disk(&disk_key, ins_key);
-	btrfs_set_header_nritems(right, 0);
-	wret = insert_ptr(trans, root, path,
-			  &disk_key, right->start,
-			  path->slots[1], 1);
-	if (wret)
-		ret = wret;
-	if (path->slots[1] == 0) {
-		wret = fixup_low_keys(trans, root, path, &disk_key, 1);
-		if (wret)
-			ret = wret;
+	if (double_split) {
+		BUG_ON(num_doubles != 0);
+		num_doubles++;
+		goto again;
 	}
-	free_extent_buffer(path->nodes[0]);
-	path->nodes[0] = right;
-	path->slots[0] = 0;
 	return ret;
 }
 
@@ -1992,8 +1969,7 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 
 	slot = path->slots[0];
 	old_data_start = btrfs_item_offset_nr(leaf, slot);
-	old_size = btrfs_item_size_nr(leaf, slot);
-	BUG_ON(old_size <= new_size);
+	old_size = btrfs_item_size_nr(leaf, slot); BUG_ON(old_size <= new_size);
 	size_diff = old_size - new_size;
 
 	BUG_ON(slot < 0);
-- 
cgit v1.2.3


From 18f16f7ba62a01c29e09b40ac6ad6d92a8955859 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 25 Oct 2007 15:42:57 -0400
Subject: Btrfs: Fix for insert_inline_extent to handle offset != 0

This modifies the inline extent size calculation so that
insert_inline_extent can handle the case where the parameter 'offset'
is not zero; it also adds a little code to zero the uninitialized area
in the inline extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 96df1b10cb6..9260d3478aa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -108,7 +108,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	key.objectid = inode->i_ino;
 	key.offset = offset;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	datasize = btrfs_file_extent_calc_inline_size(offset + size);
 
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
@@ -130,7 +129,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 	}
 	if (ret == 0) {
 		u32 found_size;
-		u64 found_start;
+		u64 found_end;
 
 		leaf = path->nodes[0];
 		ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -144,19 +143,17 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 			       offset, inode->i_ino);
 			goto fail;
 		}
-		found_start = key.offset;
 		found_size = btrfs_file_extent_inline_len(leaf,
 					  btrfs_item_nr(leaf, path->slots[0]));
+		found_end = key.offset + found_size;
 
-		if (found_size < offset + size) {
+		if (found_end < offset + size) {
 			btrfs_release_path(root, path);
 			ret = btrfs_search_slot(trans, root, &key, path,
-						offset + size - found_size -
-						found_start, 1);
+						offset + size - found_end, 1);
 			BUG_ON(ret != 0);
 			ret = btrfs_extend_item(trans, root, path,
-						offset + size - found_size -
-						found_start);
+						offset + size - found_end);
 			if (ret) {
 				err = ret;
 				goto fail;
@@ -165,9 +162,15 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 		}
+		if (found_end < offset) {
+			ptr = btrfs_file_extent_inline_start(ei) + found_size;
+			memset_extent_buffer(leaf, 0, ptr, offset - found_end);
+		}
 	} else {
 insert:
 		btrfs_release_path(root, path);
+		datasize = offset + size - key.offset;
+		datasize = btrfs_file_extent_calc_inline_size(datasize);
 		ret = btrfs_insert_empty_item(trans, root, path, &key,
 					      datasize);
 		if (ret) {
@@ -181,7 +184,7 @@ insert:
 		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
 	}
-	ptr = btrfs_file_extent_inline_start(ei) + offset;
+	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
 
 	cur_size = size;
 	i = 0;
-- 
cgit v1.2.3


From 5708b9591617486bf1aa5b1a97f2c0549ec87933 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 25 Oct 2007 15:43:18 -0400
Subject: Btrfs: Tune the automatic defrag code

1) Forced defrag wasn't working properly (btrfsctl -d) because some
cache only checks were incorrect.

2) Defrag only the leaves unless in forced defrag mode.

3) Don't use complex logic to figure out if a leaf needs defrag

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 69 +++++++++++++++++++-------------------------------
 fs/btrfs/tree-defrag.c |  1 -
 2 files changed, 26 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3eb5a9f30d1..516b90224a1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,34 +161,6 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 	return 0;
 }
 
-static int should_defrag_leaf(struct extent_buffer *leaf)
-{
-	struct btrfs_key key;
-	u32 nritems;
-
-	if (btrfs_buffer_defrag(leaf))
-		return 1;
-
-	nritems = btrfs_header_nritems(leaf);
-	if (nritems == 0)
-		return 0;
-
-	btrfs_item_key_to_cpu(leaf, &key, 0);
-	if (key.type == BTRFS_DIR_ITEM_KEY)
-		return 1;
-
-
-	btrfs_item_key_to_cpu(leaf, &key, nritems - 1);
-	if (key.type == BTRFS_DIR_ITEM_KEY)
-		return 1;
-	if (nritems > 4) {
-		btrfs_item_key_to_cpu(leaf, &key, nritems / 2);
-		if (key.type == BTRFS_DIR_ITEM_KEY)
-			return 1;
-	}
-	return 0;
-}
-
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int start_slot, int cache_only, u64 *last_ret,
@@ -208,6 +180,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int uptodate;
 	u32 blocksize;
 
+	parent_level = btrfs_header_level(parent);
+	if (cache_only && parent_level != 1)
+		return 0;
+
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -218,7 +194,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	parent_level = btrfs_header_level(parent);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -227,27 +202,26 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (parent_nritems == 1)
 		return 0;
 
-	if (root != root->fs_info->extent_root) {
-		struct btrfs_key first_key;
-		struct btrfs_key last_key;
-
-		btrfs_node_key_to_cpu(parent, &first_key, 0);
-		btrfs_node_key_to_cpu(parent, &last_key, parent_nritems - 1);
-		if (first_key.objectid != last_key.objectid)
-			return 0;
-	}
-
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
+		if (!parent->map_token) {
+			map_extent_buffer(parent,
+					btrfs_node_key_ptr_offset(i),
+					sizeof(struct btrfs_key_ptr),
+					&parent->map_token, &parent->kaddr,
+					&parent->map_start, &parent->map_len,
+					KM_USER1);
+		}
 		blocknr = btrfs_node_blockptr(parent, i);
 		if (last_block == 0)
 			last_block = blocknr;
+
 		if (i > 0) {
 			other = btrfs_node_blockptr(parent, i - 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
-		if (close && i < end_slot - 1) {
+		if (close && i < end_slot - 2) {
 			other = btrfs_node_blockptr(parent, i + 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
@@ -255,15 +229,18 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			last_block = blocknr;
 			continue;
 		}
+		if (parent->map_token) {
+			unmap_extent_buffer(parent, parent->map_token,
+					    KM_USER1);
+			parent->map_token = NULL;
+		}
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
 			uptodate = btrfs_buffer_uptodate(cur);
 		else
 			uptodate = 0;
-		if (!cur || !uptodate ||
-		    (parent_level != 1 && !btrfs_buffer_defrag(cur)) ||
-		    (parent_level == 1 && !should_defrag_leaf(cur))) {
+		if (!cur || !uptodate) {
 			if (cache_only) {
 				free_extent_buffer(cur);
 				continue;
@@ -287,11 +264,17 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			break;
 		}
 		search_start = tmp->start;
+		last_block = tmp->start;
 		*last_ret = search_start;
 		if (parent_level == 1)
 			btrfs_clear_buffer_defrag(tmp);
 		free_extent_buffer(tmp);
 	}
+	if (parent->map_token) {
+		unmap_extent_buffer(parent, parent->map_token,
+				    KM_USER1);
+		parent->map_token = NULL;
+	}
 	return err;
 }
 
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b02355a7b14..65ef1235156 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -183,7 +183,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (level == 0) {
 		goto out;
 	}
-
 	if (root->defrag_progress.objectid == 0) {
 		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-- 
cgit v1.2.3


From 0d9f7f3e27a92b07212090a1772fdbb3a92fef81 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 25 Oct 2007 15:48:28 -0400
Subject: btrfs_inode_by_name return random value.

When inode is found, the return value is from the uninitialized
variable 'ret'.

--

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0743307232..b74a831fab1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -847,15 +847,16 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	int ret;
+	int ret = 0;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
+	if (IS_ERR(di))
+		ret = PTR_ERR(di);
 	if (!di || IS_ERR(di)) {
 		location->objectid = 0;
-		ret = 0;
 		goto out;
 	}
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
-- 
cgit v1.2.3


From 134d45120164141c0847daf5222aaec77e04724a Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 25 Oct 2007 15:49:25 -0400
Subject: Fix ENOTEMPTY check in btrfs_rmdir

The ENOTEMPTY check in btrfs_rmdir isn't reliable. It's possible that
the backward search finds . or .. at first, then some other directory
entry. In that case, btrfs_rmdir deletes . or .. improperly. The
patch also fixes a fs_mutex unlock issue in btrfs_rmdir.

--

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b74a831fab1..7e4b2c1b4a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -448,6 +448,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	char *goodnames = "..";
 	unsigned long nr;
 
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+		return -ENOTEMPTY;
+
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -501,9 +504,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 out:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	if (ret && !err)
 		err = ret;
-- 
cgit v1.2.3


From 44ec0b7179b9bce7df1080370f51314a24a00216 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Oct 2007 10:55:05 -0400
Subject: Btrfs: Compile fixes for 2.6.24-rc1

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e4b2c1b4a5..c88f13a8855 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2176,8 +2176,12 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void init_once(struct kmem_cache * cachep, void *foo)
+#else
 static void init_once(void * foo, struct kmem_cache * cachep,
 		      unsigned long flags)
+#endif
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
 
@@ -2200,8 +2204,13 @@ void btrfs_destroy_cachep(void)
 
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 				       unsigned long extra_flags,
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+				       void (*ctor)(struct kmem_cache *, void *)
+#else
 				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long))
+						    unsigned long)
+#endif
+				     )
 {
 	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
 				 SLAB_MEM_SPREAD | extra_flags), ctor
-- 
cgit v1.2.3


From c67cda1758130f3e437744c4c2ec4a7cc25564e3 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 29 Oct 2007 11:41:05 -0400
Subject: Btrfs: Fix extent_map leak in extent_bmap

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index c44989a1e52..e4ee980cc13 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1924,6 +1924,7 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	struct inode *inode = mapping->host;
 	u64 start = iblock << inode->i_blkbits;
 	u64 end = start + (1 << inode->i_blkbits) - 1;
+	sector_t sector = 0;
 	struct extent_map *em;
 
 	em = get_extent(inode, NULL, 0, start, end, 0);
@@ -1932,9 +1933,12 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 
 	if (em->block_start == EXTENT_MAP_INLINE ||
 	    em->block_start == EXTENT_MAP_HOLE)
-		return 0;
+		goto out;
 
-	return (em->block_start + start - em->start) >> inode->i_blkbits;
+	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+	free_extent_map(em);
+	return sector;
 }
 
 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
-- 
cgit v1.2.3


From 689f9346612d96ab93890160cf2fe0df5217ab73 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 29 Oct 2007 11:41:07 -0400
Subject: Fix inline extent handling in btrfs_get_extent

1.  Reorder kmap and the test for 'page != NULL'
2.  Zero-fill the rest of a block when the inline extent isn't big enough.
3.  Do not insert extent_map into the map tree when page == NULL.
(If the extent_map is inserted into the map tree, subsequent read requests
will find it in the map tree directly, and the corresponding inline
extent data is not copied into the page by the get_extent function.
extent_read_full_page can't handle that case.)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c88f13a8855..0ba1cdbdf11 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1596,8 +1596,7 @@ again:
 
 		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
 						    path->slots[0]));
-
-		extent_end = (extent_start + size) |
+		extent_end = (extent_start + size - 1) |
 			((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
@@ -1610,29 +1609,32 @@ again:
 			}
 			goto not_found_em;
 		}
+		em->block_start = EXTENT_MAP_INLINE;
+		em->block_end = EXTENT_MAP_INLINE;
+
+		if (!page) {
+			em->start = extent_start;
+			em->end = extent_start + size - 1;
+			goto out;
+		}
 
 		extent_offset = (page->index << PAGE_CACHE_SHIFT) -
-			extent_start;
-		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
-		map = kmap(page);
+			extent_start + page_offset;
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
 				size - extent_offset);
-
-		em->block_start = EXTENT_MAP_INLINE;
-		em->block_end = EXTENT_MAP_INLINE;
 		em->start = extent_start + extent_offset;
 		em->end = (em->start + copy_size -1) |
 			((u64)root->sectorsize -1);
-
-		if (!page) {
-			goto insert;
+		map = kmap(page);
+		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+		read_extent_buffer(leaf, map + page_offset, ptr, copy_size);
+		
+		if (em->start + copy_size <= em->end) {
+			size = min_t(u64, em->end + 1 - em->start,
+				PAGE_CACHE_SIZE - page_offset) - copy_size;
+			memset(map + page_offset + copy_size, 0, size);
 		}
 
-		read_extent_buffer(leaf, map + page_offset, ptr, copy_size);
-		/*
-		memset(map + page_offset + copy_size, 0,
-		       PAGE_CACHE_SIZE - copy_size - page_offset);
-		       */
 		flush_dcache_page(page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS);
-- 
cgit v1.2.3


From b56baf5bedccd3258643b09289f17ceab3ddea52 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 29 Oct 2007 12:01:05 -0400
Subject: Minor fix for btrfs_csum_file_block.

Execution should go to label 'insert' when 'btrfs_next_leaf' returns a
non-zero value; otherwise the parameter 'slot' for
'btrfs_item_key_to_cpu' may be out of bounds. The original code jumps
to label 'insert' only when 'btrfs_next_leaf' returns a negative
value.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 482a2b61532..7eb9a5412e2 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -178,13 +178,11 @@ int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		if (path->slots[0] >= nritems - 1) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret == 1) {
+			if (ret == 1)
 				found_next = 1;
-			} else if (ret == 0) {
-				slot = 0;
-			} else {
+			if (ret != 0)
 				goto insert;
-			}
+			slot = 0;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
 		if (found_key.objectid != objectid ||
@@ -238,7 +236,7 @@ insert:
 	csum_offset = 0;
 	if (found_next) {
 		u64 tmp = min((u64)i_size_read(inode), next_offset);
-		tmp -= offset + root->sectorsize - 1;
+		tmp -= offset & ~((u64)root->sectorsize -1);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
-- 
cgit v1.2.3


From 2ff3e9b61d02b03e3157f7d43ba20ee1452814de Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Oct 2007 14:36:41 -0400
Subject: Add O_SYNC support to btrfs_file_write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9260d3478aa..843e920388e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -635,8 +635,9 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
 	loff_t pos;
-	size_t num_written = 0;
-	int err = 0;
+	loff_t start_pos;
+	ssize_t num_written = 0;
+	ssize_t err = 0;
 	int ret = 0;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -652,7 +653,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	pinned[1] = NULL;
 	if (file->f_flags & O_DIRECT)
 		return -EINVAL;
+
 	pos = *ppos;
+	start_pos = pos;
+
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
@@ -743,6 +747,13 @@ out:
 	if (pinned[1])
 		page_cache_release(pinned[1]);
 	*ppos = pos;
+
+	if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+		err = sync_page_range(inode, inode->i_mapping,
+				      start_pos, num_written);
+		if (err < 0)
+			num_written = err;
+	}
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
 }
-- 
cgit v1.2.3


From 35ebb934bd7fcc7ca991b155b7980c3c4ff9f1a5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 30 Oct 2007 16:56:53 -0400
Subject: Btrfs: Fix PAGE_CACHE_SHIFT shifts on 32 bit machines

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent_map.c  | 27 +++++++++++++++------------
 fs/btrfs/file.c        |  3 ++-
 fs/btrfs/inode.c       | 14 +++++++-------
 fs/btrfs/transaction.c |  2 +-
 5 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5a7e60d7f44..8935eec31ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -159,7 +159,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_map_tree *tree;
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
 	int found_level;
 	unsigned long len;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index e4ee980cc13..238cb1d81d5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1229,7 +1229,7 @@ EXPORT_SYMBOL(test_range_bit);
 static int check_page_uptodate(struct extent_map_tree *tree,
 			       struct page *page)
 {
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
 		SetPageUptodate(page);
@@ -1243,7 +1243,7 @@ static int check_page_uptodate(struct extent_map_tree *tree,
 static int check_page_locked(struct extent_map_tree *tree,
 			     struct page *page)
 {
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
 		unlock_page(page);
@@ -1257,7 +1257,7 @@ static int check_page_locked(struct extent_map_tree *tree,
 static int check_page_writeback(struct extent_map_tree *tree,
 			     struct page *page)
 {
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
 		end_page_writeback(page);
@@ -1296,7 +1296,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 
 	do {
 		struct page *page = bvec->bv_page;
-		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			 bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
 
 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -1361,7 +1362,8 @@ static int end_bio_extent_readpage(struct bio *bio,
 
 	do {
 		struct page *page = bvec->bv_page;
-		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
 
 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
@@ -1427,7 +1429,8 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 
 	do {
 		struct page *page = bvec->bv_page;
-		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
 
 		if (--bvec >= bio->bi_io_vec)
@@ -1503,7 +1506,7 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent)
 {
 	struct inode *inode = page->mapping->host;
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
@@ -1608,7 +1611,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
@@ -1750,7 +1753,7 @@ EXPORT_SYMBOL(extent_write_full_page);
 int extent_invalidatepage(struct extent_map_tree *tree,
 			  struct page *page, unsigned long offset)
 {
-	u64 start = (page->index << PAGE_CACHE_SHIFT);
+	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
@@ -1792,7 +1795,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
 			 struct inode *inode, struct page *page,
 			 unsigned from, unsigned to, get_extent_t *get_extent)
 {
-	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 	u64 block_start;
 	u64 orig_block_start;
@@ -1890,7 +1893,7 @@ EXPORT_SYMBOL(extent_prepare_write);
 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
 {
 	struct extent_map *em;
-	u64 start = page->index << PAGE_CACHE_SHIFT;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	u64 orig_start = start;
 	int ret = 1;
@@ -2202,7 +2205,7 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
 		    ((i == num_pages - 1) &&
 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = page->index << PAGE_CACHE_SHIFT;
+			start = (u64)page->index << PAGE_CACHE_SHIFT;
 			end  = start + PAGE_CACHE_SIZE - 1;
 			if (test_range_bit(tree, start, end,
 					   EXTENT_DIRTY, 0)) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 843e920388e..abdd9caad94 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -301,7 +301,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			SetPageUptodate(p);
 			set_page_dirty(p);
 		}
-		last_end = pages[num_pages -1]->index << PAGE_CACHE_SHIFT;
+		last_end = (u64)(pages[num_pages -1]->index) <<
+				PAGE_CACHE_SHIFT;
 		last_end += PAGE_CACHE_SIZE - 1;
 		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0ba1cdbdf11..5d10b64e42d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -111,7 +111,7 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	char *kaddr;
 	int ret;
-	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	size_t offset = start - page_start;
 
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -160,7 +160,7 @@ out:
 
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 {
-	size_t offset = start - (page->index << PAGE_CACHE_SHIFT);
+	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	char *kaddr;
@@ -688,7 +688,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 	char *kaddr;
 	int ret = 0;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	u64 page_start = page->index << PAGE_CACHE_SHIFT;
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	set_page_extent_mapped(page);
@@ -739,7 +739,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 			goto out;
 		}
 	}
-	page_start = page->index << PAGE_CACHE_SHIFT;
+	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 
 	ret = btrfs_cow_one_page(inode, page, offset);
 
@@ -1618,7 +1618,7 @@ again:
 			goto out;
 		}
 
-		extent_offset = (page->index << PAGE_CACHE_SHIFT) -
+		extent_offset = ((u64)page->index << PAGE_CACHE_SHIFT) -
 			extent_start + page_offset;
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
 				size - extent_offset);
@@ -1769,7 +1769,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
-	page_start = page->index << PAGE_CACHE_SHIFT;
+	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start > size)) {
@@ -2065,7 +2065,7 @@ int btrfs_defrag_file(struct file *file) {
 				goto out_unlock;
 			}
 		}
-		page_start = page->index << PAGE_CACHE_SHIFT;
+		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bdfe05cf26c..4af1c0dab7f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -169,7 +169,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
 		while(start <= end) {
 			index = start >> PAGE_CACHE_SHIFT;
-			start = (index + 1) << PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 			page = find_lock_page(btree_inode->i_mapping, index);
 			if (!page)
 				continue;
-- 
cgit v1.2.3


From 179e29e488cc74f1e9bd67bc45f70b832740e9ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Nov 2007 11:28:41 -0400
Subject: Btrfs: Fix a number of inline extent problems that Yan Zheng
 reported.

The fixes do a number of things:

1) Most btrfs_drop_extents callers will try to leave the inline extents in
place.  It can truncate bytes off the beginning of the inline extent if
required.

2) writepage can now update the inline extent, allowing mmap writes to
go directly into the inline extent.

3) btrfs_truncate_in_trans truncates inline extents

4) extent_map.c fixed to not merge inline extent mappings and hole
mappings together

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c      | 52 ++++++++++++++++++++++++++++++++++-----
 fs/btrfs/ctree.h      |  2 +-
 fs/btrfs/dir-item.c   |  2 +-
 fs/btrfs/extent_map.c | 11 ++++++---
 fs/btrfs/extent_map.h |  1 -
 fs/btrfs/file-item.c  |  2 +-
 fs/btrfs/file.c       | 36 ++++++++++++++++++++++-----
 fs/btrfs/inode.c      | 67 ++++++++++++++++++++++++++++++++++++---------------
 8 files changed, 135 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 516b90224a1..eef9c92f86d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1930,7 +1930,7 @@ again:
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_path *path,
-			u32 new_size)
+			u32 new_size, int from_end)
 {
 	int ret = 0;
 	int slot;
@@ -1946,13 +1946,17 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 
 	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	old_size = btrfs_item_size_nr(leaf, slot);
+	if (old_size == new_size)
+		return 0;
 
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
 
-	slot = path->slots[0];
 	old_data_start = btrfs_item_offset_nr(leaf, slot);
-	old_size = btrfs_item_size_nr(leaf, slot); BUG_ON(old_size <= new_size);
+
 	size_diff = old_size - new_size;
 
 	BUG_ON(slot < 0);
@@ -1984,9 +1988,45 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	}
 
 	/* shift the data */
-	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-		      data_end + size_diff, btrfs_leaf_data(leaf) +
-		      data_end, old_data_start + new_size - data_end);
+	if (from_end) {
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end + size_diff, btrfs_leaf_data(leaf) +
+			      data_end, old_data_start + new_size - data_end);
+	} else {
+		struct btrfs_disk_key disk_key;
+		u64 offset;
+
+		btrfs_item_key(leaf, &disk_key, slot);
+
+		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
+			unsigned long ptr;
+			struct btrfs_file_extent_item *fi;
+
+			fi = btrfs_item_ptr(leaf, slot,
+					    struct btrfs_file_extent_item);
+			fi = (struct btrfs_file_extent_item *)(
+			     (unsigned long)fi - size_diff);
+
+			if (btrfs_file_extent_type(leaf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE) {
+				ptr = btrfs_item_ptr_offset(leaf, slot);
+				memmove_extent_buffer(leaf, ptr,
+				        (unsigned long)fi,
+				        offsetof(struct btrfs_file_extent_item,
+						 disk_bytenr));
+			}
+		}
+
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end + size_diff, btrfs_leaf_data(leaf) +
+			      data_end, old_data_start - data_end);
+
+		offset = btrfs_disk_key_offset(&disk_key);
+		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
+		btrfs_set_item_key(leaf, &disk_key, slot);
+		if (slot == 0)
+			fixup_low_keys(trans, root, path, &disk_key, 1);
+	}
 
 	item = btrfs_item_nr(leaf, slot);
 	btrfs_set_item_size(leaf, item, new_size);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70e143b1357..d82afb618bf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -907,7 +907,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_path *path,
-			u32 new_size);
+			u32 new_size, int from_end);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 6f19de41b87..514a1dc337a 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -249,7 +249,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
 		ret = btrfs_truncate_item(trans, root, path,
-					  item_len - sub_item_len);
+					  item_len - sub_item_len, 1);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 238cb1d81d5..44be9cfd30e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -263,7 +263,12 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (prev && prev->end + 1 == em->start &&
 		    ((em->block_start == EXTENT_MAP_HOLE &&
 		      prev->block_start == EXTENT_MAP_HOLE) ||
-			     (em->block_start == prev->block_end + 1))) {
+		     (em->block_start == EXTENT_MAP_INLINE &&
+		      prev->block_start == EXTENT_MAP_INLINE) ||
+		     (em->block_start == EXTENT_MAP_DELALLOC &&
+		      prev->block_start == EXTENT_MAP_DELALLOC) ||
+		     (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
+		      em->block_start == prev->block_end + 1))) {
 			em->start = prev->start;
 			em->block_start = prev->block_start;
 			rb_erase(&prev->rb_node, &tree->map);
@@ -1618,13 +1623,13 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 	u64 extent_offset;
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
+	u64 iosize;
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
 	int ret;
 	int nr = 0;
 	size_t page_offset = 0;
-	size_t iosize;
 	size_t blocksize;
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1684,7 +1689,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
 			break;
 		}
-		em = get_extent(inode, page, page_offset, cur, end, 0);
+		em = get_extent(inode, page, page_offset, cur, end, 1);
 		if (IS_ERR(em) || !em) {
 			SetPageError(page);
 			break;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 39d78d31b6a..b6f6519f9ba 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -24,7 +24,6 @@
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
  */
-
 #define EXTENT_PAGE_PRIVATE 1
 #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7eb9a5412e2..614176e5285 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -291,7 +291,7 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	new_item_size = blocks * BTRFS_CRC32_SIZE;
 	if (new_item_size >= btrfs_item_size_nr(leaf, slot))
 		return 0;
-	ret = btrfs_truncate_item(trans, root, path, new_item_size);
+	ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
 	BUG_ON(ret);
 	return ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index abdd9caad94..4e52f7ec1cb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -115,8 +115,20 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		goto fail;
 	}
 	if (ret == 1) {
+		struct btrfs_key found_key;
+
+		if (path->slots[0] == 0)
+			goto insert;
+
 		path->slots[0]--;
 		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.objectid != inode->i_ino)
+			goto insert;
+
+		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto insert;
 		ei = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 
@@ -152,6 +164,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 			ret = btrfs_search_slot(trans, root, &key, path,
 						offset + size - found_end, 1);
 			BUG_ON(ret != 0);
+
 			ret = btrfs_extend_item(trans, root, path,
 						offset + size - found_end);
 			if (ret) {
@@ -292,7 +305,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > 8192 ||
+	    inline_size > 32768 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 
@@ -312,7 +325,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
-					 aligned_end, end_pos, &hint_byte);
+					 aligned_end, aligned_end, &hint_byte);
 		if (err)
 			goto failed;
 		err = insert_inline_extent(trans, root, inode, start_pos,
@@ -456,13 +469,15 @@ next_slot:
 			goto next_slot;
 		}
 
-		/* FIXME, there's only one inline extent allowed right now */
 		if (found_inline) {
 			u64 mask = root->sectorsize - 1;
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
 
+		if (end <= extent_end && start >= key.offset && found_inline) {
+			*hint_byte = EXTENT_MAP_INLINE;
+		}
 		if (end < extent_end && end >= key.offset) {
 			if (found_extent) {
 				u64 disk_bytenr =
@@ -479,8 +494,10 @@ next_slot:
 					BUG_ON(ret);
 				}
 			}
-			if (!found_inline)
-				bookend = 1;
+			bookend = 1;
+			if (found_inline && start <= key.offset &&
+			    inline_end < extent_end)
+				keep = 1;
 		}
 		/* truncate existing extent */
 		if (start > key.offset) {
@@ -510,7 +527,7 @@ next_slot:
 				new_size = btrfs_file_extent_calc_inline_size(
 						   inline_end - key.offset);
 				btrfs_truncate_item(trans, root, path,
-						    new_size);
+						    new_size, 1);
 			}
 		}
 		/* delete the entire extent */
@@ -551,6 +568,13 @@ next_slot:
 			if (!bookend)
 				continue;
 		}
+		if (bookend && found_inline && start <= key.offset &&
+		    inline_end < extent_end) {
+			u32 new_size;
+			new_size = btrfs_file_extent_calc_inline_size(
+						   extent_end - inline_end);
+			btrfs_truncate_item(trans, root, path, new_size, 0);
+		}
 		/* create bookend, splitting the extent in two */
 		if (bookend && found_extent) {
 			struct btrfs_key ins;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5d10b64e42d..0c65141b999 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,6 +89,9 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	ret = btrfs_drop_extents(trans, root, inode,
 				 start, start + num_bytes, start, &alloc_hint);
 
+	if (alloc_hint == EXTENT_MAP_INLINE)
+		goto out;
+
 	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_bytes, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
 	if (ret) {
@@ -558,6 +561,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	u64 item_end = 0;
 	int found_extent;
 	int del_item;
+	int extent_type = -1;
 
 	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
 	path = btrfs_alloc_path();
@@ -597,10 +601,15 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(leaf, fi) !=
-			    BTRFS_FILE_EXTENT_INLINE) {
+			extent_type = btrfs_file_extent_type(leaf, fi);
+			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
+			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+				struct btrfs_item *item = btrfs_item_nr(leaf,
+							        path->slots[0]);
+				item_end += btrfs_file_extent_inline_len(leaf,
+									 item);
 			}
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
@@ -608,7 +617,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 						  inode->i_size);
 			BUG_ON(ret);
 		}
-		if (item_end < inode->i_size) {
+		if (item_end <= inode->i_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
@@ -629,9 +638,10 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		found_extent = 0;
 
 		/* FIXME, shrink the extent if the ref count is only 1 */
-		if (found_type == BTRFS_EXTENT_DATA_KEY &&
-			   btrfs_file_extent_type(leaf, fi) !=
-			   BTRFS_FILE_EXTENT_INLINE) {
+		if (found_type != BTRFS_EXTENT_DATA_KEY)
+			goto delete;
+
+		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
 			if (!del_item) {
@@ -659,7 +669,15 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 					inode->i_blocks -= num_dec;
 				}
 			}
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE &&
+			   !del_item) {
+			u32 newsize = inode->i_size - found_key.offset;
+			newsize = btrfs_file_extent_calc_inline_size(newsize);
+			ret = btrfs_truncate_item(trans, root, path,
+						  newsize, 1);
+			BUG_ON(ret);
 		}
+delete:
 		if (del_item) {
 			ret = btrfs_del_item(trans, root, path);
 			if (ret)
@@ -769,7 +787,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		u64 pos = (inode->i_size + mask) & ~mask;
 		u64 block_end = attr->ia_size | mask;
 		u64 hole_size;
-		u64 alloc_hint;
+		u64 alloc_hint = 0;
 
 		if (attr->ia_size <= pos)
 			goto out;
@@ -786,8 +804,11 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 					 pos, pos + hole_size, pos,
 					 &alloc_hint);
 
-		err = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       pos, 0, 0, hole_size);
+		if (alloc_hint != EXTENT_MAP_INLINE) {
+			err = btrfs_insert_file_extent(trans, root,
+						       inode->i_ino,
+						       pos, 0, 0, hole_size);
+		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
 		unlock_extent(em_tree, pos, block_end, GFP_NOFS);
@@ -1531,8 +1552,8 @@ again:
 		em->end = EXTENT_MAP_HOLE;
 	}
 	em->bdev = inode->i_sb->s_bdev;
-	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       objectid, start, 0);
+	ret = btrfs_lookup_file_extent(trans, root, path,
+				       objectid, start, trans != NULL);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -1627,15 +1648,23 @@ again:
 			((u64)root->sectorsize -1);
 		map = kmap(page);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
-		read_extent_buffer(leaf, map + page_offset, ptr, copy_size);
-		
-		if (em->start + copy_size <= em->end) {
-			size = min_t(u64, em->end + 1 - em->start,
-				PAGE_CACHE_SIZE - page_offset) - copy_size;
-			memset(map + page_offset + copy_size, 0, size);
+		if (create == 0 && !PageUptodate(page)) {
+			read_extent_buffer(leaf, map + page_offset, ptr,
+					   copy_size);
+			flush_dcache_page(page);
+		} else if (create && PageUptodate(page)) {
+			if (!trans) {
+				kunmap(page);
+				free_extent_map(em);
+				em = NULL;
+				btrfs_release_path(root, path);
+				trans = btrfs_start_transaction(root, 1);
+				goto again;
+			}
+			write_extent_buffer(leaf, map + page_offset, ptr,
+					    copy_size);
+			btrfs_mark_buffer_dirty(leaf);
 		}
-
-		flush_dcache_page(page);
 		kunmap(page);
 		set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS);
 		goto insert;
-- 
cgit v1.2.3


From b97f9203b4d672c06eca50a1b1b99e4f69e1daf8 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 1 Nov 2007 11:28:41 -0400
Subject: Btrfs: Fix typo and memory leak in extent-tree.c

This patch fixes a typo in update_block_group and a memory leak in
btrfs_free_block_groups.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7de7707d939..e7192ea8f96 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -588,16 +588,15 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			    old_val < (cache->key.offset >> 1)) {
 				int bit_to_clear;
 				int bit_to_set;
-
 				cache->data = data;
 				if (data) {
-					bit_to_clear = BLOCK_GROUP_DATA;
-					bit_to_set = BLOCK_GROUP_METADATA;
+					bit_to_clear = BLOCK_GROUP_METADATA;
+					bit_to_set = BLOCK_GROUP_DATA;
 					cache->item.flags |=
 						BTRFS_BLOCK_GROUP_DATA;
 				} else {
-					bit_to_clear = BLOCK_GROUP_METADATA;
-					bit_to_set = BLOCK_GROUP_DATA;
+					bit_to_clear = BLOCK_GROUP_DATA;
+					bit_to_set = BLOCK_GROUP_METADATA;
 					cache->item.flags &=
 						~BTRFS_BLOCK_GROUP_DATA;
 				}
@@ -1459,13 +1458,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	u64 start;
 	u64 end;
+	u64 ptr;
 	int ret;
-
 	while(1) {
 		ret = find_first_extent_bit(&info->block_group_cache, 0,
 					    &start, &end, (unsigned int)-1);
 		if (ret)
 			break;
+		ret = get_state_private(&info->block_group_cache, start, &ptr);
+		if (!ret)
+			kfree((void *)(unsigned long)ptr);
 		clear_extent_bits(&info->block_group_cache, start,
 				  end, (unsigned int)-1, GFP_NOFS);
 	}
-- 
cgit v1.2.3


From 944746ec7575258475958d858cdf3a0e4c8f7b9e Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 1 Nov 2007 11:28:42 -0400
Subject: Btrfs: small fixes for find_lock_delalloc_range.

There is a 'finish_wait', but no 'prepare_to_wait', so I think that
the 'prepare_to_wait' is missing. The second change follows from the
name of the variable: total_bytes should accumulate across states.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 44be9cfd30e..ff8881fb56d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1044,6 +1044,8 @@ search_again:
 			if (state->state & EXTENT_LOCKED) {
 				DEFINE_WAIT(wait);
 				atomic_inc(&state->refs);
+				prepare_to_wait(&state->wq, &wait,
+						TASK_UNINTERRUPTIBLE);
 				write_unlock_irq(&tree->lock);
 				schedule();
 				write_lock_irq(&tree->lock);
@@ -1059,7 +1061,7 @@ search_again:
 		node = rb_next(node);
 		if (!node)
 			break;
-		total_bytes = state->end - state->start + 1;
+		total_bytes += state->end - state->start + 1;
 		if (total_bytes >= max_bytes)
 			break;
 	}
-- 
cgit v1.2.3


From b293f02e1423f2099744f3ade23ddd83b65321fc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Nov 2007 19:45:34 -0400
Subject: Btrfs: Add writepages support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 135 ++++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/extent_map.h |   4 ++
 fs/btrfs/inode.c      |  10 ++++
 3 files changed, 124 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ff8881fb56d..b0677c84bb7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -10,6 +10,7 @@
 #include <linux/blkdev.h>
 #include <linux/swap.h>
 #include <linux/version.h>
+#include <linux/writeback.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -35,6 +36,12 @@ struct tree_entry {
 	struct rb_node rb_node;
 };
 
+struct extent_page_data {
+	struct bio *bio;
+	struct extent_map_tree *tree;
+	get_extent_t *get_extent;
+};
+
 void __init extent_map_init(void)
 {
 	extent_map_cache = btrfs_cache_create("extent_map",
@@ -1460,40 +1467,76 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 #endif
 }
 
-static int submit_extent_page(int rw, struct extent_map_tree *tree,
-			      struct page *page, sector_t sector,
-			      size_t size, unsigned long offset,
-			      struct block_device *bdev,
-			      bio_end_io_t end_io_func)
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+		 gfp_t gfp_flags)
 {
 	struct bio *bio;
-	int ret = 0;
 
-	bio = bio_alloc(GFP_NOIO, 1);
+	bio = bio_alloc(gfp_flags, nr_vecs);
 
-	bio->bi_sector = sector;
-	bio->bi_bdev = bdev;
-	bio->bi_io_vec[0].bv_page = page;
-	bio->bi_io_vec[0].bv_len = size;
-	bio->bi_io_vec[0].bv_offset = offset;
-
-	bio->bi_vcnt = 1;
-	bio->bi_idx = 0;
-	bio->bi_size = size;
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
 
-	bio->bi_end_io = end_io_func;
-	bio->bi_private = tree;
+	if (bio) {
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_sector;
+	}
+	return bio;
+}
 
+static int submit_one_bio(int rw, struct bio *bio)
+{
+	int ret = 0;
 	bio_get(bio);
 	submit_bio(rw, bio);
-
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
-
 	bio_put(bio);
 	return ret;
 }
 
+static int submit_extent_page(int rw, struct extent_map_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      struct bio **bio_ret,
+			      int max_pages,
+			      bio_end_io_t end_io_func)
+{
+	int ret = 0;
+	struct bio *bio;
+	int nr;
+
+	if (bio_ret && *bio_ret) {
+		bio = *bio_ret;
+		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		    bio_add_page(bio, page, size, offset) < size) {
+			ret = submit_one_bio(rw, bio);
+			bio = NULL;
+		} else {
+			return 0;
+		}
+	}
+	nr = min(max_pages, bio_get_nr_vecs(bdev));
+	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio) {
+		printk("failed to allocate bio nr %d\n", nr);
+	}
+	bio_add_page(bio, page, size, offset);
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+	if (bio_ret) {
+		*bio_ret = bio;
+	} else {
+		ret = submit_one_bio(rw, bio);
+	}
+
+	return ret;
+}
+
 void set_page_extent_mapped(struct page *page)
 {
 	if (!PagePrivate(page)) {
@@ -1590,7 +1633,8 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 		if (!ret) {
 			ret = submit_extent_page(READ, tree, page,
 						 sector, iosize, page_offset,
-						 bdev, end_bio_extent_readpage);
+						 bdev, NULL, 1,
+						 end_bio_extent_readpage);
 		}
 		if (ret)
 			SetPageError(page);
@@ -1613,11 +1657,12 @@ EXPORT_SYMBOL(extent_read_full_page);
  * are found, they are marked writeback.  Then the lock bits are removed
  * and the end_io handler clears the writeback ranges
  */
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
-			  struct writeback_control *wbc)
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+			      void *data)
 {
 	struct inode *inode = page->mapping->host;
+	struct extent_page_data *epd = data;
+	struct extent_map_tree *tree = epd->tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
@@ -1691,7 +1736,7 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
 			break;
 		}
-		em = get_extent(inode, page, page_offset, cur, end, 1);
+		em = epd->get_extent(inode, page, page_offset, cur, end, 1);
 		if (IS_ERR(em) || !em) {
 			SetPageError(page);
 			break;
@@ -1734,9 +1779,12 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 		if (ret)
 			SetPageError(page);
 		else {
+			unsigned long nr = end_index + 1;
 			set_range_writeback(tree, cur, cur + iosize - 1);
+
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, page_offset, bdev,
+						 &epd->bio, nr,
 						 end_bio_extent_writepage);
 			if (ret)
 				SetPageError(page);
@@ -1750,8 +1798,44 @@ done:
 	unlock_page(page);
 	return 0;
 }
+
+int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	int ret;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+
+	ret = __extent_writepage(page, wbc, &epd);
+	if (epd.bio)
+		submit_one_bio(WRITE, epd.bio);
+	return ret;
+}
 EXPORT_SYMBOL(extent_write_full_page);
 
+int extent_writepages(struct extent_map_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc)
+{
+	int ret;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+
+	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+	if (epd.bio)
+		submit_one_bio(WRITE, epd.bio);
+	return ret;
+}
+EXPORT_SYMBOL(extent_writepages);
+
 /*
  * basic invalidatepage code, this waits on any locked or writeback
  * ranges corresponding to the page, and then deletes any extent state
@@ -1869,6 +1953,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
+					 NULL, 1,
 					 end_bio_extent_preparewrite);
 			iocount++;
 			block_start = block_start + iosize;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b6f6519f9ba..0b0dcf19fe4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -136,6 +136,10 @@ int extent_invalidatepage(struct extent_map_tree *tree,
 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc);
+int extent_writepages(struct extent_map_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc);
 int extent_prepare_write(struct extent_map_tree *tree,
 			 struct inode *inode, struct page *page,
 			 unsigned from, unsigned to, get_extent_t *get_extent);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c65141b999..25f32d7c7ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1747,6 +1747,15 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
+
+static int btrfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(mapping->host)->extent_tree;
+	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
+}
+
 static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
 	struct extent_map_tree *tree;
@@ -2526,6 +2535,7 @@ static struct extent_map_ops btrfs_extent_map_ops = {
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
+	.writepages	= btrfs_writepages,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
 	.commit_write	= btrfs_commit_write,
-- 
cgit v1.2.3


From c549228ff63498025bf0b065c5a5909a06f21647 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 6 Nov 2007 10:25:25 -0500
Subject: Btrfs: Properly update free space cache in __free_extent

When pin_down_bytes decides not to pin a block because it was from the
current transaction, make sure the in-memory cache of free extents is updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e7192ea8f96..9eb2ee02f64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -714,7 +714,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 				    root->fs_info->running_transaction->transid;
 				if (btrfs_header_generation(buf) == transid) {
 					free_extent_buffer(buf);
-					return 0;
+					return 1;
 				}
 			}
 			free_extent_buffer(buf);
@@ -774,7 +774,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 		if (pin) {
 			ret = pin_down_bytes(root, bytenr, num_bytes, 0);
-			BUG_ON(ret);
+			if (ret > 0)
+				mark_free = 1;
+			BUG_ON(ret < 0);
 		}
 
 		/* block accounting for super block */
-- 
cgit v1.2.3


From 081e95736d9118a96e9f6dcc5cec02dc75c9e1cb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Nov 2007 10:26:24 -0500
Subject: Btrfs: Make defrag check nodes against the progress key to prevent
 repeating work

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 56 ++++++++++++++++++++++++++++----------------------
 fs/btrfs/tree-defrag.c |  3 ++-
 2 files changed, 34 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eef9c92f86d..ea9b4669934 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -161,6 +161,31 @@ static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 	return 0;
 }
 
+/*
+ * compare two keys in a memcmp fashion
+ */
+static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
+{
+	struct btrfs_key k1;
+
+	btrfs_disk_key_to_cpu(&k1, disk);
+
+	if (k1.objectid > k2->objectid)
+		return 1;
+	if (k1.objectid < k2->objectid)
+		return -1;
+	if (k1.type > k2->type)
+		return 1;
+	if (k1.type < k2->type)
+		return -1;
+	if (k1.offset > k2->offset)
+		return 1;
+	if (k1.offset < k2->offset)
+		return -1;
+	return 0;
+}
+
+
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int start_slot, int cache_only, u64 *last_ret,
@@ -179,6 +204,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int parent_level;
 	int uptodate;
 	u32 blocksize;
+	int progress_passed = 0;
+	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
 	if (cache_only && parent_level != 1)
@@ -213,6 +240,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 					&parent->map_start, &parent->map_len,
 					KM_USER1);
 		}
+		btrfs_node_key(parent, &disk_key, i);
+		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
+			continue;
+
+		progress_passed = 1;
 		blocknr = btrfs_node_blockptr(parent, i);
 		if (last_block == 0)
 			last_block = blocknr;
@@ -292,30 +324,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
-/*
- * compare two keys in a memcmp fashion
- */
-static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
-{
-	struct btrfs_key k1;
-
-	btrfs_disk_key_to_cpu(&k1, disk);
-
-	if (k1.objectid > k2->objectid)
-		return 1;
-	if (k1.objectid < k2->objectid)
-		return -1;
-	if (k1.type > k2->type)
-		return 1;
-	if (k1.type < k2->type)
-		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
-	return 0;
-}
-
 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 65ef1235156..6ef1ba5f9c2 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -227,7 +227,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			break;
 		if (wret < 0)
 			ret = wret;
-		ret = -EAGAIN;
+		else
+			ret = -EAGAIN;
 		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
-- 
cgit v1.2.3


From dcfec0dcb1b1a037fb26177789e8f108bc429cb3 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 6 Nov 2007 10:26:26 -0500
Subject: Btrfs: Fix u32 overflow in dirty_and_release_pages.

When calculating the size of the inline extent, inode->i_size should also
be taken into consideration, otherwise sys_write may silently drop some
data. You can reproduce this bug with:

#dd if=/dev/zero bs=4k count=1 of=test_file
#dd if=/dev/zero bs=2k count=1 of=test_file conv=notrunc

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4e52f7ec1cb..bb98f52f4ea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -239,9 +239,8 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 start_pos;
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
-	u32 inline_size;
+	u64 inline_size;
 	loff_t isize = i_size_read(inode);
-
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
@@ -328,9 +327,11 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 					 aligned_end, aligned_end, &hint_byte);
 		if (err)
 			goto failed;
+		if (isize > inline_size)
+			inline_size = min_t(u64, isize, aligned_end);
+		inline_size -= start_pos;
 		err = insert_inline_extent(trans, root, inode, start_pos,
-					   end_pos - start_pos, pages, 0,
-					   num_pages);
+					   inline_size, pages, 0, num_pages);
 		BUG_ON(err);
 	}
 	if (end_pos > isize) {
-- 
cgit v1.2.3


From a273208edd55463b3bcd8b77a6fe8ba54afc6940 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 6 Nov 2007 10:26:28 -0500
Subject: Fix EXTENT_MAP_INLINE off by one in btrfs_drop_extents

Don't set hint_byte to EXTENT_MAP_INLINE when 'end == extent_end' or
'start == key.offset'. The inline extent will be truncated in these
cases.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bb98f52f4ea..5ceaed25dce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -475,8 +475,7 @@ next_slot:
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
-
-		if (end <= extent_end && start >= key.offset && found_inline) {
+		if (end < extent_end && start > key.offset && found_inline) {
 			*hint_byte = EXTENT_MAP_INLINE;
 		}
 		if (end < extent_end && end >= key.offset) {
-- 
cgit v1.2.3


From f84a8b362d9785ca1fa0598d8a90f35184bd8750 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Nov 2007 10:26:29 -0500
Subject: Btrfs: Optimize allocations as we need to mix data and metadata into
 one group

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c | 42 ++++++++++++++++++++++++++++++++++++------
 fs/btrfs/tree-defrag.c |  6 +++++-
 3 files changed, 43 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d82afb618bf..f7907b02fa7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -279,6 +279,8 @@ struct btrfs_csum_item {
 
 
 #define BTRFS_BLOCK_GROUP_DATA 1
+#define BTRFS_BLOCK_GROUP_MIXED 2
+
 struct btrfs_block_group_item {
 	__le64 used;
 	u8 flags;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9eb2ee02f64..d54ab8ef06d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -168,11 +168,13 @@ static u64 find_search_start(struct btrfs_root *root,
 	u64 last;
 	u64 start = 0;
 	u64 end = 0;
+	int wrapped = 0;
 
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
 		goto out;
+
 	last = max(search_start, cache->key.objectid);
 
 	while(1) {
@@ -195,8 +197,15 @@ out:
 
 new_group:
 	last = cache->key.objectid + cache->key.offset;
+wrapped:
 	cache = btrfs_lookup_block_group(root->fs_info, last);
 	if (!cache) {
+		if (!wrapped) {
+			wrapped = 1;
+			last = search_start;
+			data = BTRFS_BLOCK_GROUP_MIXED;
+			goto wrapped;
+		}
 		return search_start;
 	}
 	cache = btrfs_find_block_group(root, cache, last, data, 0);
@@ -236,9 +245,11 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	block_group_cache = &info->block_group_cache;
 
 	if (!owner)
-		factor = 5;
+		factor = 8;
 
-	if (data)
+	if (data == BTRFS_BLOCK_GROUP_MIXED)
+		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
+	else if (data)
 		bit = BLOCK_GROUP_DATA;
 	else
 		bit = BLOCK_GROUP_METADATA;
@@ -246,14 +257,16 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint && shint->data == data) {
+		if (shint && (shint->data == data ||
+			      shint->data == BTRFS_BLOCK_GROUP_MIXED)) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used < div_factor(shint->key.offset, factor)) {
 				return shint;
 			}
 		}
 	}
-	if (hint && hint->data == data) {
+	if (hint && (hint->data == data ||
+		     hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used < div_factor(hint->key.offset, factor)) {
 			return hint;
@@ -592,11 +605,15 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				if (data) {
 					bit_to_clear = BLOCK_GROUP_METADATA;
 					bit_to_set = BLOCK_GROUP_DATA;
+					cache->item.flags &=
+						~BTRFS_BLOCK_GROUP_MIXED;
 					cache->item.flags |=
 						BTRFS_BLOCK_GROUP_DATA;
 				} else {
 					bit_to_clear = BLOCK_GROUP_DATA;
 					bit_to_set = BLOCK_GROUP_METADATA;
+					cache->item.flags &=
+						~BTRFS_BLOCK_GROUP_MIXED;
 					cache->item.flags &=
 						~BTRFS_BLOCK_GROUP_DATA;
 				}
@@ -606,6 +623,14 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				set_extent_bits(&info->block_group_cache,
 						start, end, bit_to_set,
 						GFP_NOFS);
+			} else if (cache->data != data &&
+				   cache->data != BTRFS_BLOCK_GROUP_MIXED) {
+				cache->data = BTRFS_BLOCK_GROUP_MIXED;
+				set_extent_bits(&info->block_group_cache,
+						start, end,
+						BLOCK_GROUP_DATA |
+						BLOCK_GROUP_METADATA,
+						GFP_NOFS);
 			}
 			old_val += num_bytes;
 		} else {
@@ -886,6 +911,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	u64 cached_start;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -910,6 +936,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 check_failed:
 	search_start = find_search_start(root, &block_group,
 					 search_start, total_needed, data);
+	cached_start = search_start;
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
@@ -1532,9 +1559,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
+		if (cache->item.flags & BTRFS_BLOCK_GROUP_MIXED) {
+			bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
+			cache->data = BTRFS_BLOCK_GROUP_MIXED;
+		} else if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
 			bit = BLOCK_GROUP_DATA;
-			cache->data = 1;
+			cache->data = BTRFS_BLOCK_GROUP_DATA;
 		} else {
 			bit = BLOCK_GROUP_METADATA;
 			cache->data = 0;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 6ef1ba5f9c2..3994795edfe 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -58,6 +58,10 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	if (root->fs_info->extent_root == root)
 		is_extent = 1;
 
+	if (*level == 1 && cache_only && path->nodes[1] &&
+	    !btrfs_buffer_defrag(path->nodes[1])) {
+		goto out;
+	}
 	while(*level > 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
@@ -116,7 +120,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
 	btrfs_clear_buffer_defrag(path->nodes[*level]);
-
+out:
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
-- 
cgit v1.2.3


From e644d021e328d3902559e5db687383f2da85993c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Nov 2007 15:09:29 -0500
Subject: Fix recursive KM_USER1 usage in btrfs_realloc_node

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        | 11 +++--------
 fs/btrfs/struct-funcs.c | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f7907b02fa7..030d21d7f98 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -557,14 +557,9 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr)
 		sizeof(struct btrfs_key_ptr) * nr;
 }
 
-static inline void btrfs_node_key(struct extent_buffer *eb,
-			   struct btrfs_disk_key *disk_key, int nr)
-{
-	unsigned long ptr;
-	ptr = btrfs_node_key_ptr_offset(nr);
-	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
-		       struct btrfs_key_ptr, key, disk_key);
-}
+void btrfs_node_key(struct extent_buffer *eb,
+		    struct btrfs_disk_key *disk_key, int nr);
+
 static inline void btrfs_set_node_key(struct extent_buffer *eb,
 				      struct btrfs_disk_key *disk_key, int nr)
 {
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index cf68fcf9b55..c5715a60554 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -95,3 +95,19 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 
 #include "ctree.h"
 
+void btrfs_node_key(struct extent_buffer *eb,
+		    struct btrfs_disk_key *disk_key, int nr)
+{
+	unsigned long ptr = btrfs_node_key_ptr_offset(nr);
+	if (eb->map_token && ptr >= eb->map_start &&
+	    ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
+		memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
+			sizeof(*disk_key));
+		return;
+	} else if (eb->map_token) {
+		unmap_extent_buffer(eb, eb->map_token, KM_USER1);
+		eb->map_token = NULL;
+	}
+	read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
+		       struct btrfs_key_ptr, key, disk_key);
+}
-- 
cgit v1.2.3


From 34a3821873aeabff2607c8093bce82cd1fbcfd60 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Nov 2007 13:31:03 -0500
Subject: Btrfs: Change push_leaf_{left,right} to empty the src leaf during
 item deletion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 74 ++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 50 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ea9b4669934..1b47fe71e0b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1394,19 +1394,21 @@ int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf)
  * room, 0 if everything worked out and < 0 if there were major errors.
  */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size)
+			   *root, struct btrfs_path *path, int data_size,
+			   int empty)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *right;
 	struct extent_buffer *upper;
 	struct btrfs_disk_key disk_key;
 	int slot;
-	int i;
+	u32 i;
 	int free_space;
 	int push_space = 0;
 	int push_items = 0;
 	struct btrfs_item *item;
 	u32 left_nritems;
+	u32 nr;
 	u32 right_nritems;
 	u32 data_end;
 	u32 this_item_size;
@@ -1447,7 +1449,13 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
-	for (i = left_nritems - 1; i >= 1; i--) {
+	if (empty)
+		nr = 0;
+	else
+		nr = 1;
+
+	i = left_nritems - 1;
+	while (i >= nr) {
 		item = btrfs_item_nr(left, i);
 
 		if (path->slots[0] == i)
@@ -1466,6 +1474,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		push_items++;
 		push_space += this_item_size + sizeof(*item);
+		if (i == 0)
+			break;
+		i--;
 	}
 	if (left->map_token) {
 		unmap_extent_buffer(left, left->map_token, KM_USER1);
@@ -1477,11 +1488,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
-	if (push_items == left_nritems)
+	if (!empty && push_items == left_nritems)
 		WARN_ON(1);
 
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
+
 	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
 	push_space -= leaf_data_end(root, left);
 
@@ -1511,7 +1523,6 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	right_nritems += push_items;
 	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
-
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
 		if (!right->map_token) {
@@ -1532,7 +1543,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(left, left_nritems);
 
-	btrfs_mark_buffer_dirty(left);
+	if (left_nritems)
+		btrfs_mark_buffer_dirty(left);
 	btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
@@ -1555,7 +1567,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
  */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size)
+			  *root, struct btrfs_path *path, int data_size,
+			  int empty)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
@@ -1568,6 +1581,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_item *item;
 	u32 old_left_nritems;
 	u32 right_nritems;
+	u32 nr;
 	int ret = 0;
 	int wret;
 	u32 this_item_size;
@@ -1607,7 +1621,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
-	for (i = 0; i < right_nritems - 1; i++) {
+	if (empty)
+		nr = right_nritems;
+	else
+		nr = right_nritems - 1;
+
+	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
 		if (!right->map_token) {
 			map_extent_buffer(right, (unsigned long)item,
@@ -1637,7 +1656,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		free_extent_buffer(left);
 		return 1;
 	}
-	if (push_items == btrfs_header_nritems(right))
+	if (!empty && push_items == btrfs_header_nritems(right))
 		WARN_ON(1);
 
 	/* push data from right to left */
@@ -1681,20 +1700,26 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	/* fixup right node */
-	push_space = btrfs_item_offset_nr(right, push_items - 1) -
-					  leaf_data_end(root, right);
-	memmove_extent_buffer(right, btrfs_leaf_data(right) +
-			      BTRFS_LEAF_DATA_SIZE(root) - push_space,
-			      btrfs_leaf_data(right) +
-			      leaf_data_end(root, right), push_space);
-
-	memmove_extent_buffer(right, btrfs_item_nr_offset(0),
+	if (push_items > right_nritems) {
+		printk("push items %d nr %u\n", push_items, right_nritems);
+		WARN_ON(1);
+	}
+
+	if (push_items < right_nritems) {
+		push_space = btrfs_item_offset_nr(right, push_items - 1) -
+						  leaf_data_end(root, right);
+		memmove_extent_buffer(right, btrfs_leaf_data(right) +
+				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
+				      btrfs_leaf_data(right) +
+				      leaf_data_end(root, right), push_space);
+
+		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
 			      btrfs_item_nr_offset(push_items),
 			     (btrfs_header_nritems(right) - push_items) *
 			     sizeof(struct btrfs_item));
 
-	right_nritems = btrfs_header_nritems(right) - push_items;
-	btrfs_set_header_nritems(right, right_nritems);
+	}
+	btrfs_set_header_nritems(right, right_nritems - push_items);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 
 	for (i = 0; i < right_nritems; i++) {
@@ -1717,7 +1742,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	btrfs_mark_buffer_dirty(left);
-	btrfs_mark_buffer_dirty(right);
+	if (right_nritems)
+		btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -1768,12 +1794,12 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* first try to make some room by pushing left and right */
 	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
-		wret = push_leaf_right(trans, root, path, data_size);
+		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0) {
 			return wret;
 		}
 		if (wret) {
-			wret = push_leaf_left(trans, root, path, data_size);
+			wret = push_leaf_left(trans, root, path, data_size, 0);
 			if (wret < 0)
 				return wret;
 		}
@@ -2403,13 +2429,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			extent_buffer_get(leaf);
 
-			wret = push_leaf_right(trans, root, path, 1);
+			wret = push_leaf_right(trans, root, path, 1, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_left(trans, root, path, 1);
+				wret = push_leaf_left(trans, root, path, 1, 1);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
-- 
cgit v1.2.3


From 008630c17cc5654e141c7fd24a3e737de643fec1 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 7 Nov 2007 13:31:09 -0500
Subject: Properly delete csum item in btrfs_truncate_in_trans.

When 'item_end' is equal to 'inode->i_size', 'found_type' is updated
and the current item is skipped. This behavior is correct for an extent
item, but incorrect for a csum item. For example, suppose there is a csum
item with 'offset == 0'. When the inode is deleted, 'inode->i_size' is set
to 0, so the csum item isn't deleted.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 25f32d7c7ee..90cb71d6829 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -611,13 +611,14 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				item_end += btrfs_file_extent_inline_len(leaf,
 									 item);
 			}
+			item_end--;
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
 			ret = btrfs_csum_truncate(trans, root, path,
 						  inode->i_size);
 			BUG_ON(ret);
 		}
-		if (item_end <= inode->i_size) {
+		if (item_end < inode->i_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
-- 
cgit v1.2.3


From 0da5468f4724a59e745d938395beffe830c424ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Nov 2007 21:08:01 -0500
Subject: Btrfs: Use writepages for the metadata too

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8935eec31ee..6c8533fba7c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,8 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 }
 #endif
 
+static struct extent_map_ops btree_extent_map_ops;
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -188,15 +190,29 @@ out:
 	return 0;
 }
 
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
+static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
 {
-	struct extent_map_tree *tree;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 
 	csum_dirty_buffer(root, page);
+	return 0;
+}
+
+static int btree_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 	return extent_write_full_page(tree, page, btree_get_extent, wbc);
 }
+
+static int btree_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(mapping->host)->extent_tree;
+	return extent_writepages(tree, mapping, btree_get_extent, wbc);
+}
+
 int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_map_tree *tree;
@@ -251,6 +267,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 static struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
 	.writepage	= btree_writepage,
+	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
 	.sync_page	= block_sync_page,
@@ -538,6 +555,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
+	BTRFS_I(fs_info->btree_inode)->extent_tree.ops = &btree_extent_map_ops;
+
 	extent_map_tree_init(&fs_info->free_space_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	extent_map_tree_init(&fs_info->block_group_cache,
@@ -832,3 +851,7 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 					buf, 0, 1);
 }
+
+static struct extent_map_ops btree_extent_map_ops = {
+	.writepage_io_hook = btree_writepage_io_hook,
+};
-- 
cgit v1.2.3


From 09be207d1ba224531a61de9afdc07a125e45318c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Nov 2007 21:08:16 -0500
Subject: Btrfs: Fix failure cleanups when allocating extent buffers fail

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 56 +++++++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0677c84bb7..c976615dcda 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2106,25 +2106,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
 
 	spin_lock(&tree->lru_lock);
 	eb = find_lru(tree, start, len);
-	if (eb) {
-		goto lru_add;
-	}
 	spin_unlock(&tree->lru_lock);
-
 	if (eb) {
-		memset(eb, 0, sizeof(*eb));
-	} else {
-		eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+		return eb;
 	}
+
+	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	INIT_LIST_HEAD(&eb->lru);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
 
-	spin_lock(&tree->lru_lock);
-lru_add:
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
 	return eb;
 }
 
@@ -2151,7 +2143,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		return NULL;
 
 	if (eb->flags & EXTENT_BUFFER_FILLED)
-		return eb;
+		goto lru_add;
 
 	if (page0) {
 		eb->first_page = page0;
@@ -2169,11 +2161,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
 		if (!p) {
 			WARN_ON(1);
-			/* make sure the free only frees the pages we've
-			 * grabbed a reference on
-			 */
-			eb->len = i << PAGE_CACHE_SHIFT;
-			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
 		set_page_extent_mapped(p);
@@ -2192,9 +2179,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 	if (uptodate)
 		eb->flags |= EXTENT_UPTODATE;
 	eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
 	return eb;
+
 fail:
-	free_extent_buffer(eb);
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+	for (index = 0; index < i; index++) {
+		page_cache_release(extent_buffer_page(eb, index));
+	}
+	__free_extent_buffer(eb);
 	return NULL;
 }
 EXPORT_SYMBOL(alloc_extent_buffer);
@@ -2204,7 +2202,8 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 					  gfp_t mask)
 {
 	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
@@ -2215,16 +2214,11 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 		return NULL;
 
 	if (eb->flags & EXTENT_BUFFER_FILLED)
-		return eb;
+		goto lru_add;
 
 	for (i = 0; i < num_pages; i++, index++) {
 		p = find_lock_page(mapping, index);
 		if (!p) {
-			/* make sure the free only frees the pages we've
-			 * grabbed a reference on
-			 */
-			eb->len = i << PAGE_CACHE_SHIFT;
-			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
 		set_page_extent_mapped(p);
@@ -2245,9 +2239,19 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 	if (uptodate)
 		eb->flags |= EXTENT_UPTODATE;
 	eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
 	return eb;
 fail:
-	free_extent_buffer(eb);
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+	for (index = 0; index < i; index++) {
+		page_cache_release(extent_buffer_page(eb, index));
+	}
+	__free_extent_buffer(eb);
 	return NULL;
 }
 EXPORT_SYMBOL(find_extent_buffer);
-- 
cgit v1.2.3


From 257d0ce36f215937602eb5c1b328b69afaf3bcd8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Nov 2007 21:08:16 -0500
Subject: Btrfs: Allow large data extents in a single file to span into
 metadata block groups

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 36 ++++++++++++++++++++++++++++++------
 fs/btrfs/file.c        |  6 ++++++
 2 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d54ab8ef06d..7405bd5301a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -168,6 +168,7 @@ static u64 find_search_start(struct btrfs_root *root,
 	u64 last;
 	u64 start = 0;
 	u64 end = 0;
+	u64 cache_miss = 0;
 	int wrapped = 0;
 
 again:
@@ -181,14 +182,20 @@ again:
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
 					    last, &start, &end, EXTENT_DIRTY);
 		if (ret) {
+			if (!cache_miss)
+				cache_miss = last;
 			goto new_group;
 		}
 
 		start = max(last, start);
 		last = end + 1;
-		if (end + 1 - start < num)
+		if (last - start < num) {
+			if (last == cache->key.objectid + cache->key.offset)
+				cache_miss = start;
 			continue;
-		if (start + num >= cache->key.objectid + cache->key.offset)
+		}
+		if (data != BTRFS_BLOCK_GROUP_MIXED &&
+		    start + num >= cache->key.objectid + cache->key.offset)
 			goto new_group;
 		return start;
 	}
@@ -208,13 +215,22 @@ wrapped:
 		}
 		return search_start;
 	}
+	if (cache_miss && !cache->cached) {
+		cache_block_group(root, cache);
+		last = cache_miss;
+
+		cache = btrfs_lookup_block_group(root->fs_info, last);
+	}
 	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	*cache_ret = cache;
+	cache_miss = 0;
 	goto again;
 }
 
 static u64 div_factor(u64 num, int factor)
 {
+	if (factor == 10)
+		return num;
 	num *= factor;
 	do_div(num, 10);
 	return num;
@@ -247,9 +263,10 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (!owner)
 		factor = 8;
 
-	if (data == BTRFS_BLOCK_GROUP_MIXED)
+	if (data == BTRFS_BLOCK_GROUP_MIXED) {
 		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-	else if (data)
+		factor = 10;
+	} else if (data)
 		bit = BLOCK_GROUP_DATA;
 	else
 		bit = BLOCK_GROUP_METADATA;
@@ -918,6 +935,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	level = btrfs_header_level(root->node);
 
+	if (num_bytes >= 96 * 1024 * 1024 && hint_byte) {
+		data = BTRFS_BLOCK_GROUP_MIXED;
+	}
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 	if (hint_byte) {
@@ -937,6 +958,7 @@ check_failed:
 	search_start = find_search_start(root, &block_group,
 					 search_start, total_needed, data);
 	cached_start = search_start;
+
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
@@ -1021,7 +1043,8 @@ check_failed:
 		start_found = 1;
 		last_byte = key.objectid + key.offset;
 
-		if (!full_scan && last_byte >= block_group->key.objectid +
+		if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
+		    last_byte >= block_group->key.objectid +
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
 			search_start = block_group->key.objectid +
@@ -1042,7 +1065,8 @@ check_pending:
 	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
 
-	if (!full_scan && ins->objectid + num_bytes >= block_group->
+	if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
+	    ins->objectid + num_bytes >= block_group->
 	    key.objectid + block_group->key.offset) {
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5ceaed25dce..9dd2c5c23b5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -439,6 +439,12 @@ next_slot:
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				extent_end =
+				     btrfs_file_extent_disk_bytenr(leaf,
+								   extent);
+				if (extent_end)
+					*hint_byte = extent_end;
+
 				extent_end = key.offset +
 				     btrfs_file_extent_num_bytes(leaf, extent);
 				found_extent = 1;
-- 
cgit v1.2.3


From 856bf3e592f917e7d663cb7fa93e83fe795e8f4e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 Nov 2007 10:59:05 -0500
Subject: Btrfs: Avoid extent_buffer lru corruption

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index c976615dcda..754bc42c162 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2047,7 +2047,7 @@ static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
 			rm = list_entry(tree->buffer_lru.prev,
 					struct extent_buffer, lru);
 			tree->lru_size--;
-			list_del(&rm->lru);
+			list_del_init(&rm->lru);
 			free_extent_buffer(rm);
 		}
 	} else
@@ -2187,6 +2187,9 @@ lru_add:
 	return eb;
 
 fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
 	if (!atomic_dec_and_test(&eb->refs))
 		return NULL;
 	for (index = 0; index < i; index++) {
@@ -2246,6 +2249,9 @@ lru_add:
 	spin_unlock(&tree->lru_lock);
 	return eb;
 fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
 	if (!atomic_dec_and_test(&eb->refs))
 		return NULL;
 	for (index = 0; index < i; index++) {
-- 
cgit v1.2.3


From 3ab2fb5a8cb003897016b6eb38ddad916226c1b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 Nov 2007 10:59:22 -0500
Subject: Btrfs: Add readpages support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 70 +++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/extent_map.h |  4 +++
 fs/btrfs/inode.c      | 12 ++++++++-
 3 files changed, 78 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 754bc42c162..0077c6c1d9f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -11,6 +11,7 @@
 #include <linux/swap.h>
 #include <linux/version.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -1503,7 +1504,7 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree,
 			      size_t size, unsigned long offset,
 			      struct block_device *bdev,
 			      struct bio **bio_ret,
-			      int max_pages,
+			      unsigned long max_pages,
 			      bio_end_io_t end_io_func)
 {
 	int ret = 0;
@@ -1520,7 +1521,7 @@ static int submit_extent_page(int rw, struct extent_map_tree *tree,
 			return 0;
 		}
 	}
-	nr = min(max_pages, bio_get_nr_vecs(bdev));
+	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
@@ -1552,8 +1553,10 @@ void set_page_extent_mapped(struct page *page)
  * into the tree that are removed when the IO is done (by the end_io
  * handlers)
  */
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent)
+static int __extent_read_full_page(struct extent_map_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1631,10 +1634,12 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 							  cur + iosize - 1);
 		}
 		if (!ret) {
+			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+			nr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-						 sector, iosize, page_offset,
-						 bdev, NULL, 1,
-						 end_bio_extent_readpage);
+					 sector, iosize, page_offset,
+					 bdev, bio, nr,
+					 end_bio_extent_readpage);
 		}
 		if (ret)
 			SetPageError(page);
@@ -1649,6 +1654,18 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 	}
 	return 0;
 }
+
+int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
+			    get_extent_t *get_extent)
+{
+	struct bio *bio = NULL;
+	int ret;
+
+	ret = __extent_read_full_page(tree, page, get_extent, &bio);
+	if (bio)
+		submit_one_bio(READ, bio);
+	return ret;
+}
 EXPORT_SYMBOL(extent_read_full_page);
 
 /*
@@ -1836,6 +1853,45 @@ int extent_writepages(struct extent_map_tree *tree,
 }
 EXPORT_SYMBOL(extent_writepages);
 
+int extent_readpages(struct extent_map_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent)
+{
+	struct bio *bio = NULL;
+	unsigned page_idx;
+	struct pagevec pvec;
+
+	pagevec_init(&pvec, 0);
+	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
+		struct page *page = list_entry(pages->prev, struct page, lru);
+
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+		/*
+		 * what we want to do here is call add_to_page_cache_lru,
+		 * but that isn't exported, so we reproduce it here
+		 */
+		if (!add_to_page_cache(page, mapping,
+					page->index, GFP_KERNEL)) {
+
+			/* open coding of lru_cache_add, also not exported */
+			page_cache_get(page);
+			if (!pagevec_add(&pvec, page))
+				__pagevec_lru_add(&pvec);
+			__extent_read_full_page(tree, page, get_extent, &bio);
+		}
+		page_cache_release(page);
+	}
+	if (pagevec_count(&pvec))
+		__pagevec_lru_add(&pvec);
+	BUG_ON(!list_empty(pages));
+	if (bio)
+		submit_one_bio(READ, bio);
+	return 0;
+}
+EXPORT_SYMBOL(extent_readpages);
+
 /*
  * basic invalidatepage code, this waits on any locked or writeback
  * ranges corresponding to the page, and then deletes any extent state
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 0b0dcf19fe4..fbd23a72f53 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -140,6 +140,10 @@ int extent_writepages(struct extent_map_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
 		      struct writeback_control *wbc);
+int extent_readpages(struct extent_map_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent);
 int extent_prepare_write(struct extent_map_tree *tree,
 			 struct inode *inode, struct page *page,
 			 unsigned from, unsigned to, get_extent_t *get_extent);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 90cb71d6829..d6d32465bc9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1748,7 +1748,6 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-
 static int btrfs_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
@@ -1757,6 +1756,16 @@ static int btrfs_writepages(struct address_space *mapping,
 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 }
 
+static int
+btrfs_readpages(struct file *file, struct address_space *mapping,
+		struct list_head *pages, unsigned nr_pages)
+{
+	struct extent_map_tree *tree;
+	tree = &BTRFS_I(mapping->host)->extent_tree;
+	return extent_readpages(tree, mapping, pages, nr_pages,
+				btrfs_get_extent);
+}
+
 static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
 	struct extent_map_tree *tree;
@@ -2537,6 +2546,7 @@ static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
+	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
 	.prepare_write	= btrfs_prepare_write,
 	.commit_write	= btrfs_commit_write,
-- 
cgit v1.2.3


From 0591fb56fb732c7bf500d1446d0bf927046d7002 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 11 Nov 2007 08:22:00 -0500
Subject: Btrfs: Fix extent bit range testing

It could return the bit as set when there was actually a hole at the
very end of the range.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 0077c6c1d9f..b3ff63c4bf2 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -96,7 +96,7 @@ void extent_map_tree_empty_lru(struct extent_map_tree *tree)
 	while(!list_empty(&tree->buffer_lru)) {
 		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
 				lru);
-		list_del(&eb->lru);
+		list_del_init(&eb->lru);
 		free_extent_buffer(eb);
 	}
 }
@@ -1212,13 +1212,15 @@ int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
 	node = tree_search(&tree->state, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->start > end)
-			break;
 
 		if (filled && state->start > start) {
 			bitset = 0;
 			break;
 		}
+
+		if (state->start > end)
+			break;
+
 		if (state->state & bits) {
 			bitset = 1;
 			if (!filled)
@@ -2208,6 +2210,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		page_cache_get(page0);
 		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
+		WARN_ON(!PageUptodate(page0));
 		set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
 				 len << 2);
 	} else {
@@ -2248,9 +2251,11 @@ fail:
 	spin_unlock(&tree->lru_lock);
 	if (!atomic_dec_and_test(&eb->refs))
 		return NULL;
-	for (index = 0; index < i; index++) {
+	for (index = 1; index < i; index++) {
 		page_cache_release(extent_buffer_page(eb, index));
 	}
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
 	__free_extent_buffer(eb);
 	return NULL;
 }
@@ -2310,9 +2315,11 @@ fail:
 	spin_unlock(&tree->lru_lock);
 	if (!atomic_dec_and_test(&eb->refs))
 		return NULL;
-	for (index = 0; index < i; index++) {
+	for (index = 1; index < i; index++) {
 		page_cache_release(extent_buffer_page(eb, index));
 	}
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
 	__free_extent_buffer(eb);
 	return NULL;
 }
@@ -2329,11 +2336,13 @@ void free_extent_buffer(struct extent_buffer *eb)
 	if (!atomic_dec_and_test(&eb->refs))
 		return;
 
+	WARN_ON(!list_empty(&eb->lru));
 	num_pages = num_extent_pages(eb->start, eb->len);
 
-	for (i = 0; i < num_pages; i++) {
+	for (i = 1; i < num_pages; i++) {
 		page_cache_release(extent_buffer_page(eb, i));
 	}
+	page_cache_release(extent_buffer_page(eb, 0));
 	__free_extent_buffer(eb);
 }
 EXPORT_SYMBOL(free_extent_buffer);
@@ -2469,6 +2478,7 @@ int read_extent_buffer_pages(struct extent_map_tree *tree,
 			   EXTENT_UPTODATE, 1)) {
 		return 0;
 	}
+
 	if (start) {
 		WARN_ON(start < eb->start);
 		start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -2577,7 +2587,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		*map_start = 0;
 	} else {
 		offset = 0;
-		*map_start = (i << PAGE_CACHE_SHIFT) - start_offset;
+		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
 	}
 	if (start + min_len > eb->len) {
 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
-- 
cgit v1.2.3


From 5103e947b9b7ac18ddb21a04ee3486e94c6504d7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 16 Nov 2007 11:45:54 -0500
Subject: xattr support for btrfs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile     |   2 +-
 fs/btrfs/acl.c        |  66 +++++++
 fs/btrfs/ctree.h      |  22 ++-
 fs/btrfs/dir-item.c   | 100 +++++++++-
 fs/btrfs/inode.c      |  15 +-
 fs/btrfs/print-tree.c |   3 +-
 fs/btrfs/super.c      |   2 +
 fs/btrfs/xattr.c      | 497 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/xattr.h      |  58 ++++++
 9 files changed, 751 insertions(+), 14 deletions(-)
 create mode 100644 fs/btrfs/acl.c
 create mode 100644 fs/btrfs/xattr.c
 create mode 100644 fs/btrfs/xattr.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 551743be5f0..d5804c5ca07 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
new file mode 100644
index 00000000000..aee9f0657c3
--- /dev/null
+++ b/fs/btrfs/acl.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include "ctree.h"
+#include "xattr.h"
+
+static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
+				      void *value, size_t size)
+{
+	int idx = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;	/* handler-map slot */
+	return btrfs_xattr_get(inode, idx, name, value, size);
+}
+
+static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
+				      const void *value, size_t size, int flags)
+{
+	int idx = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;	/* handler-map slot */
+	return btrfs_xattr_set(inode, idx, name, value, size, flags);
+}
+
+static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
+				       void *value, size_t size)
+{
+	int idx = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;	/* handler-map slot */
+	return btrfs_xattr_get(inode, idx, name, value, size);
+}
+
+static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
+				       const void *value, size_t size, int flags)
+{
+	int idx = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;	/* handler-map slot */
+	return btrfs_xattr_set(inode, idx, name, value, size, flags);
+}
+
+struct xattr_handler btrfs_xattr_acl_default_handler = {
+	.prefix = POSIX_ACL_XATTR_DEFAULT,
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_acl_default_get,	/* both forward with the */
+	.set	= btrfs_xattr_acl_default_set,	/* POSIX_ACL_DEFAULT index */
+};
+
+struct xattr_handler btrfs_xattr_acl_access_handler = {
+	.prefix = POSIX_ACL_XATTR_ACCESS,
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_acl_access_get,	/* both forward with the */
+	.set	= btrfs_xattr_acl_access_set,	/* POSIX_ACL_ACCESS index */
+};
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 030d21d7f98..27cadae1af6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -63,7 +63,8 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_FT_FIFO		5
 #define BTRFS_FT_SOCK		6
 #define BTRFS_FT_SYMLINK	7
-#define BTRFS_FT_MAX		8
+#define BTRFS_FT_XATTR		8
+#define BTRFS_FT_MAX		9
 
 /*
  * the key defines the order in the tree, and so it also defines (optimal)
@@ -226,7 +227,7 @@ struct btrfs_inode_item {
 
 struct btrfs_dir_item {
 	struct btrfs_disk_key location;
-	__le16 flags;
+	__le16 data_len;
 	__le16 name_len;
 	u8 type;
 } __attribute__ ((__packed__));
@@ -367,7 +368,7 @@ struct btrfs_root {
  * the FS
  */
 #define BTRFS_INODE_ITEM_KEY		1
-
+#define BTRFS_XATTR_ITEM_KEY		2
 /* reserve 2-15 close to the inode for later flexibility */
 
 /*
@@ -621,7 +622,7 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb,
 }
 
 /* struct btrfs_dir_item */
-BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, flags, 16);
+BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
 
@@ -962,6 +963,15 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      struct btrfs_dir_item *di);
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, const char *name,
+			    u16 name_len, const void *data, u16 data_len,
+			    u64 dir);
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 dir,
+					  const char *name, u16 name_len,
+					  int mod);
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
@@ -1039,4 +1049,8 @@ int btrfs_sysfs_add_root(struct btrfs_root *root);
 void btrfs_sysfs_del_root(struct btrfs_root *root);
 void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 
+/* xattr.c */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct inode *inode);
 #endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 514a1dc337a..ddbe12ae0d6 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -43,8 +43,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 			return ERR_PTR(-EEXIST);
 		ret = btrfs_extend_item(trans, root, path, data_size);
 		WARN_ON(ret > 0);
-		if (ret)
-			return ERR_PTR(ret);
 	}
 	if (ret < 0)
 		return ERR_PTR(ret);
@@ -57,6 +55,57 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	return (struct btrfs_dir_item *)ptr;
 }
 
+/* insert xattr @name = @data as an XATTR_ITEM of inode @dir; 0 or -errno */
+int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, const char *name,
+			    u16 name_len, const void *data, u16 data_len,
+			    u64 dir)
+{
+	int ret = 0;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
+	unsigned long name_ptr, data_ptr;
+	struct btrfs_key key, location;
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *leaf;
+
+	key.objectid = dir;
+	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+	ret = btrfs_name_hash(name, name_len, &key.offset);
+	BUG_ON(ret);
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_item = insert_with_overflow(trans, root, path, &key,
+					sizeof(*dir_item) + name_len + data_len,
+					name, name_len);
+	/*
+	 * FIXME: xattrs too large for a leaf are not handled yet; pass the
+	 * failure up to the caller instead of BUG()ing on it.
+	 */
+	if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		goto out;
+	}
+	/* zeroed location: the value is stored inline, nothing to point at */
+	memset(&location, 0, sizeof(location));
+	leaf = path->nodes[0];
+	btrfs_cpu_key_to_disk(&disk_key, &location);
+	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
+	btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_data_len(leaf, dir_item, data_len);
+	name_ptr = (unsigned long)(dir_item + 1);
+	data_ptr = name_ptr + name_len;
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	write_extent_buffer(leaf, data, data_ptr, data_len);
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type)
@@ -90,7 +139,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_cpu_key_to_disk(&disk_key, location);
 	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
 	btrfs_set_dir_type(leaf, dir_item, type);
-	btrfs_set_dir_flags(leaf, dir_item, 0);
+	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
 	name_ptr = (unsigned long)(dir_item + 1);
 
@@ -117,7 +166,7 @@ second_insert:
 	btrfs_cpu_key_to_disk(&disk_key, location);
 	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
 	btrfs_set_dir_type(leaf, dir_item, type);
-	btrfs_set_dir_flags(leaf, dir_item, 0);
+	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
 	name_ptr = (unsigned long)(dir_item + 1);
 	write_extent_buffer(leaf, name, name_ptr, name_len);
@@ -194,6 +243,43 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 dir,
+					  const char *name, u16 name_len,
+					  int mod)
+{
+	struct btrfs_key want, have;
+	struct extent_buffer *eb;
+	int err;
+
+	/* xattr items are keyed (dir, BTRFS_XATTR_ITEM_KEY, hash of name) */
+	want.objectid = dir;
+	btrfs_set_key_type(&want, BTRFS_XATTR_ITEM_KEY);
+	err = btrfs_name_hash(name, name_len, &want.offset);
+	BUG_ON(err);
+
+	err = btrfs_search_slot(trans, root, &want, path,
+				mod < 0 ? -1 : 0, mod != 0);
+	if (err < 0)
+		return ERR_PTR(err);
+	if (err > 0 && path->slots[0] == 0)
+		return NULL;
+	if (err > 0)
+		path->slots[0]--;	/* no exact hit: step back */
+
+	eb = path->nodes[0];
+	btrfs_item_key_to_cpu(eb, &have, path->slots[0]);
+	if (have.objectid != dir ||
+	    btrfs_key_type(&have) != BTRFS_XATTR_ITEM_KEY)
+		return NULL;
+	if (have.offset != want.offset)
+		return NULL;
+
+	/* the hash matched; let the name comparison pick the entry */
+	return btrfs_match_dir_item_name(root, path, name, name_len);
+}
+
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
@@ -210,7 +296,8 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
 	while(cur < total_len) {
 		this_len = sizeof(*dir_item) +
-			btrfs_dir_name_len(leaf, dir_item);
+			btrfs_dir_name_len(leaf, dir_item) +
+			btrfs_dir_data_len(leaf, dir_item);
 		name_ptr = (unsigned long)(dir_item + 1);
 
 		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
@@ -236,7 +323,8 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	int ret = 0;
 
 	leaf = path->nodes[0];
-	sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di);
+	sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) +
+		btrfs_dir_data_len(leaf, di);
 	item_len = btrfs_item_size_nr(leaf, path->slots[0]);
 	if (sub_item_len == item_len) {
 		ret = btrfs_del_item(trans, root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d6d32465bc9..37f8e0321af 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
+#include <linux/xattr.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -839,6 +840,9 @@ void btrfs_delete_inode(struct inode *inode)
 
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode);
+	if (ret)
+		goto no_delete_lock;
+	ret = btrfs_delete_xattrs(trans, root, inode);
 	if (ret)
 		goto no_delete_lock;
 	ret = btrfs_free_inode(trans, root, inode);
@@ -1110,7 +1114,8 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 			if (over)
 				goto nopos;
-			di_len = btrfs_dir_name_len(leaf, di) + sizeof(*di);
+			di_len = btrfs_dir_name_len(leaf, di) +
+				btrfs_dir_data_len(leaf, di) +sizeof(*di);
 			di_cur += di_len;
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
@@ -2519,6 +2524,10 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.symlink	= btrfs_symlink,
 	.setattr	= btrfs_setattr,
 	.mknod		= btrfs_mknod,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= generic_removexattr,
 };
 
 static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -2567,6 +2576,10 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.truncate	= btrfs_truncate,
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr      = btrfs_listxattr,
+	.removexattr	= generic_removexattr,
 };
 
 static struct inode_operations btrfs_special_inode_operations = {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9f8696c8a8e..030324febf6 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -58,9 +58,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
-			printk("\t\tdir oid %llu flags %u type %u\n",
+			printk("\t\tdir oid %llu type %u\n",
 				(unsigned long long)found_key.objectid,
-				btrfs_dir_flags(l, di),
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f94aa1f97a0..c46bc391179 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "xattr.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -106,6 +107,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
 	sb->s_op = &btrfs_super_ops;
+	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 
 	tree_root = open_ctree(sb);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
new file mode 100644
index 00000000000..f4ac5e0bbad
--- /dev/null
+++ b/fs/btrfs/xattr.c
@@ -0,0 +1,497 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/rwsem.h>
+#include <linux/xattr.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "xattr.h"
+#include "disk-io.h"
+
+static struct xattr_handler *btrfs_xattr_handler_map[] = {
+	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
+	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
+	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
+};	/* BTRFS_XATTR_INDEX_* -> handler; read by btrfs_xattr_handler() */
+
+struct xattr_handler *btrfs_xattr_handlers[] = {
+	&btrfs_xattr_user_handler,
+	&btrfs_xattr_acl_access_handler,
+	&btrfs_xattr_acl_default_handler,
+	&btrfs_xattr_trusted_handler,
+	&btrfs_xattr_security_handler,
+	&btrfs_xattr_system_handler,
+	NULL,	/* terminator: find_btrfs_xattr_handler() stops here */
+};
+
+/*
+ * @param l - leaf holding the name; @param name_ptr - offset of the name in l
+ * @param name_len - length of the name in bytes
+ * @return - first handler whose prefix starts the name, NULL if none does
+ *
+ * used by listxattr, where the type of the xattr is not known in advance
+ */
+static struct xattr_handler *find_btrfs_xattr_handler(struct extent_buffer *l,
+						      unsigned long name_ptr,
+						      u16 name_len)
+{
+	struct xattr_handler *handler = NULL;
+	int i = 0;
+
+	for (handler = btrfs_xattr_handlers[i]; handler != NULL; i++,
+	     handler = btrfs_xattr_handlers[i]) {
+		u16 prefix_len = strlen(handler->prefix);
+
+		if (name_len < prefix_len)
+			continue;	/* name too short to carry this prefix */
+
+		if (memcmp_extent_buffer(l, handler->prefix, name_ptr,
+					 prefix_len) == 0)
+			break;
+	}
+
+	return handler;
+}
+
+/*
+ * @param name_index - a BTRFS_XATTR_INDEX_* slot in btrfs_xattr_handler_map
+ * @return the handler stored in that slot, NULL when the index is out of range
+ *
+ * for callers that already know which kind of xattr they are handling
+ */
+static struct xattr_handler *btrfs_xattr_handler(int name_index)
+{
+	/* reject anything outside the table before indexing into it */
+	if (name_index < 0)
+		return NULL;
+	if (name_index >= ARRAY_SIZE(btrfs_xattr_handler_map))
+		return NULL;
+
+	return btrfs_xattr_handler_map[name_index];
+}
+
+static inline char *get_name(const char *name, int name_index)
+{
+	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
+	size_t prefix_len, suffix_len;
+	char *full;
+
+	/* unknown index: there is no prefix to prepend */
+	if (!handler)
+		return NULL;
+
+	prefix_len = strlen(handler->prefix);
+	suffix_len = strlen(name);
+	/* callers kfree() the "<prefix><name>" string built here */
+	full = kmalloc(prefix_len + suffix_len + 1, GFP_KERNEL);
+	if (full) {
+		memcpy(full, handler->prefix, prefix_len);
+		memcpy(full + prefix_len, name, suffix_len);
+		full[prefix_len + suffix_len] = '\0';
+	}
+	return full;
+}
+
+size_t btrfs_xattr_generic_list(struct inode *inode, char *list,
+				size_t list_size, const char *name,
+				size_t name_len)
+{
+	/* no buffer, or no room for the name plus its NUL: report -ERANGE */
+	if (!list || (name_len+1) > list_size)
+		return -ERANGE;
+
+	memcpy(list, name, name_len);
+	list[name_len] = '\0';
+	return name_len+1;
+}
+
+/*
+ * Read xattr "<handler prefix><attr_name>" of @inode into @buffer.
+ * Returns the value length (also when @size is 0, which copies nothing),
+ * -ERANGE if @buffer is too small, -ENODATA if the lookup finds nothing.
+ */
+ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
+			const char *attr_name, void *buffer, size_t size)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret = 0;
+	unsigned long data_ptr;
+	u16 data_len;
+	char *name;
+
+	if (!btrfs_xattr_handler(name_index))
+		return -EOPNOTSUPP;
+
+	if (*attr_name == '\0')
+		return -EINVAL;
+
+	name = get_name(attr_name, name_index);
+	if (!name)
+		return -ENOMEM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kfree(name);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+				strlen(name), 0);
+	if (!di || IS_ERR(di)) {
+		ret = -ENODATA;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	data_len = btrfs_dir_data_len(leaf, di);
+	ret = data_len;
+	if (!size)
+		goto out;
+
+	if (data_len > size) {
+		ret = -ERANGE;
+		goto out;
+	}
+	/* the value follows the name in the item: copy data_len bytes of it */
+	data_ptr = (unsigned long)((char *)(di + 1) +
+				   btrfs_dir_name_len(leaf, di));
+	read_extent_buffer(leaf, buffer, data_ptr, data_len);
+
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	kfree(name);
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_xattr_set(struct inode *inode, int name_index,
+		    const char *attr_name, const void *value, size_t size,
+		    int flags)
+{
+	struct btrfs_dir_item *di;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
+	char *name;
+	int ret = 0, mod = 0;
+
+	if (!handler)
+		return -EOPNOTSUPP;
+
+	/* reject an empty attribute name */
+	if (*attr_name == '\0')
+		return -EINVAL;
+
+	name = get_name(attr_name, name_index);
+	if (!name)
+		return -ENOMEM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		kfree(name);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+
+	/* first let's see if we already have this xattr */
+	di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+				strlen(name), -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+
+	/* ok we already have this xattr, let's remove it (re-inserted below) */
+	if (di) {
+		/* XATTR_CREATE requires that the xattr not exist yet */
+		if (flags & XATTR_CREATE) {
+			ret = -EEXIST;
+			goto out;
+		}
+
+		ret = btrfs_delete_one_dir_name(trans, root, path, di);
+		if (ret)
+			goto out;
+		btrfs_release_path(root, path);
+
+		/* if we don't have a value then we are removing the xattr */
+		if (!value) {
+			mod = 1;
+			goto out;
+		}
+	} else if (flags & XATTR_REPLACE) {
+		/* we couldn't find the attr to replace, so error out */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	/* insert the new xattr; NOTE(review): size narrows to u16 here */
+	ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+				      value, size, inode->i_ino);
+	if (ret)
+		goto out;
+	mod = 1;
+
+out:
+	if (mod) {	/* something changed: bump ctime, write the inode */
+		inode->i_ctime = CURRENT_TIME;
+		ret = btrfs_update_inode(trans, root, inode);
+	}
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	kfree(name);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * Copy the NUL-terminated names of all xattr items of @dentry's inode into
+ * @buffer.  Returns the bytes needed for the names (nothing is copied when
+ * @size is 0), or -ENOMEM / -ERANGE / a negative tree-search error.
+ */
+ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct btrfs_key key, found_key;
+	struct inode *inode = dentry->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct xattr_handler *handler;
+	int ret = 0, slot, advance;
+	size_t total_size = 0, size_left = size;
+	ssize_t written;
+	unsigned long name_ptr;
+	char *name;
+	u32 nritems;
+	u16 name_len;
+
+	/* offset 0: start at the inode's first xattr item and walk forward */
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+	key.offset = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 2;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+
+	/* search for our xattrs */
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto err;
+	ret = 0;
+	advance = 0;
+	while (1) {
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		slot = path->slots[0];
+
+		/* this is where we start walking through the path */
+		if (advance || slot >= nritems) {
+			/* past the leaf's last slot: move to the next leaf */
+			if (slot >= nritems-1) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+				leaf = path->nodes[0];
+				nritems = btrfs_header_nritems(leaf);
+				slot = path->slots[0];
+			} else {
+				/* next slot on this leaf */
+				slot++;
+				path->slots[0]++;
+			}
+		}
+		advance = 1;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* check to make sure this item is what we want */
+		if (found_key.objectid != key.objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+			break;
+
+		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+		name_len = btrfs_dir_name_len(leaf, di);
+		total_size += name_len + 1;
+
+		/* we are just looking for how big our buffer needs to be */
+		if (!size)
+			continue;
+
+		/* find our handler for this xattr */
+		name_ptr = (unsigned long)(di + 1);
+		handler = find_btrfs_xattr_handler(leaf, name_ptr, name_len);
+		if (!handler) {
+			printk(KERN_ERR "btrfs: unsupported xattr found\n");
+			continue;
+		}
+
+		name = kmalloc(name_len, GFP_KERNEL);
+		if (!name) {
+			ret = -ENOMEM;
+			break;
+		}
+		read_extent_buffer(leaf, name, name_ptr, name_len);
+
+		/* ->list() returns size_t; an error only shows once signed */
+		written = handler->list(inode, buffer, size_left, name,
+					name_len);
+		kfree(name);
+
+		if (written < 0) {
+			ret = -ERANGE;
+			break;
+		}
+
+		size_left -= written;
+		buffer += written;
+	}
+	/* keep a negative error; any other exit reports the size */
+	if (ret >= 0)
+		ret = total_size;
+
+err:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * delete every xattr item of the inode, walking back from the largest key
+ * offset.  fs_mutex should be held when we come into here
+ */
+int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct inode *inode)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key, found_key;
+	struct btrfs_item *item;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	while(1) {
+		/* search past the last xattr, then step back onto it */
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto out;
+		BUG_ON(ret == 0);	/* an item at offset -1 is unexpected */
+
+		if (path->slots[0] == 0)
+			break;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		item = btrfs_item_nr(leaf, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.objectid != key.objectid)
+			break;	/* previous item belongs to another object */
+		if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
+			break;	/* same object, but not an xattr item */
+
+		ret = btrfs_del_item(trans, root, path);
+		BUG_ON(ret);
+		btrfs_release_path(root, path);
+	}
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * Handler functions: btrfs_xattr_<name>_{get,set} pass a fixed index along
+ */
+#define BTRFS_XATTR_SETGET_FUNCS(name, index)				\
+static int btrfs_xattr_##name##_get(struct inode *inode,		\
+				    const char *name, void *value,	\
+				    size_t size)			\
+{									\
+	return btrfs_xattr_get(inode, index, name, value, size);	\
+}									\
+static int btrfs_xattr_##name##_set(struct inode *inode,		\
+				    const char *name, const void *value,\
+				    size_t size, int flags)		\
+{									\
+	return btrfs_xattr_set(inode, index, name, value, size, flags);	\
+}									\
+
+BTRFS_XATTR_SETGET_FUNCS(security, BTRFS_XATTR_INDEX_SECURITY);
+BTRFS_XATTR_SETGET_FUNCS(system, BTRFS_XATTR_INDEX_SYSTEM);
+BTRFS_XATTR_SETGET_FUNCS(user, BTRFS_XATTR_INDEX_USER);
+BTRFS_XATTR_SETGET_FUNCS(trusted, BTRFS_XATTR_INDEX_TRUSTED);
+
+struct xattr_handler btrfs_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_security_get,	/* both generated by */
+	.set	= btrfs_xattr_security_set,	/* BTRFS_XATTR_SETGET_FUNCS */
+};
+
+struct xattr_handler btrfs_xattr_system_handler = {
+	.prefix = XATTR_SYSTEM_PREFIX,
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_system_get,	/* both generated by */
+	.set	= btrfs_xattr_system_set,	/* BTRFS_XATTR_SETGET_FUNCS */
+};
+
+struct xattr_handler btrfs_xattr_user_handler = {
+	.prefix	= XATTR_USER_PREFIX,
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_user_get,		/* both generated by */
+	.set	= btrfs_xattr_user_set,		/* BTRFS_XATTR_SETGET_FUNCS */
+};
+
+struct xattr_handler btrfs_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,	/* own namespace, not the user one */
+	.list	= btrfs_xattr_generic_list,	/* copies the name as-is */
+	.get	= btrfs_xattr_trusted_get,	/* both generated by */
+	.set	= btrfs_xattr_trusted_set,	/* BTRFS_XATTR_SETGET_FUNCS */
+};
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
new file mode 100644
index 00000000000..b2e47e3f244
--- /dev/null
+++ b/fs/btrfs/xattr.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __XATTR__
+#define __XATTR__
+
+#include <linux/xattr.h>
+#include "ctree.h"
+
+/* Name indexes: positions in btrfs_xattr_handler_map (xattr.c) */
+enum {
+	BTRFS_XATTR_INDEX_USER,
+	BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
+	BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
+	BTRFS_XATTR_INDEX_TRUSTED,
+	BTRFS_XATTR_INDEX_SECURITY,
+	BTRFS_XATTR_INDEX_SYSTEM,
+	BTRFS_XATTR_INDEX_END,	/* has no entry in the handler map */
+};
+
+extern struct xattr_handler btrfs_xattr_user_handler;
+extern struct xattr_handler btrfs_xattr_trusted_handler;
+extern struct xattr_handler btrfs_xattr_acl_access_handler;
+extern struct xattr_handler btrfs_xattr_acl_default_handler;
+extern struct xattr_handler btrfs_xattr_security_handler;
+extern struct xattr_handler btrfs_xattr_system_handler;
+
+extern struct xattr_handler *btrfs_xattr_handlers[];
+
+ssize_t btrfs_xattr_get(struct inode *inode, int name_index, const char *name,
+			void *buffer, size_t size);
+int btrfs_xattr_set(struct inode *inode, int name_index, const char *name,
+			const void *value, size_t size, int flags);
+
+/*
+ * the only reason this is public is for acl.c.  There may be a point where
+ * acl.c doesn't need it, and if thats the case we need to remove it and make
+ * it static in xattr.c
+ */
+size_t btrfs_xattr_generic_list(struct inode *inode, char *list,
+				size_t list_size, const char *name,
+				size_t name_len);
+#endif /* __XATTR__ */
-- 
cgit v1.2.3


From 324ae4df00fdc1a6a179bf584d8addf027bb75fb Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 16 Nov 2007 14:57:08 -0500
Subject: Btrfs: Add block group pinned accounting back

This patch adds a helper function, 'update_pinned_extents', to
extent-tree.c. It is used like 'update_block_group': the last
parameter selects whether the byte range is being pinned (non-zero)
or unpinned (zero).

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  5 +++--
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 54 +++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 44 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 27cadae1af6..56b977ffe91 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -292,8 +292,8 @@ struct btrfs_block_group_cache {
 	struct btrfs_block_group_item item;
 	int data;
 	int cached;
+	u64 pinned;
 };
-
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
@@ -324,8 +324,9 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
-};
 
+	u64 total_pinned;
+};
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c8533fba7c..3e16cca72b4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -569,6 +569,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
+	fs_info->total_pinned = 0;
 
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7405bd5301a..4ef6dc3d7d3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -277,7 +277,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		if (shint && (shint->data == data ||
 			      shint->data == BTRFS_BLOCK_GROUP_MIXED)) {
 			used = btrfs_block_group_used(&shint->item);
-			if (used < div_factor(shint->key.offset, factor)) {
+			if (used + shint->pinned <
+			    div_factor(shint->key.offset, factor)) {
 				return shint;
 			}
 		}
@@ -285,7 +286,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (hint && (hint->data == data ||
 		     hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
 		used = btrfs_block_group_used(&hint->item);
-		if (used < div_factor(hint->key.offset, factor)) {
+		if (used + hint->pinned <
+		    div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
 		last = hint->key.objectid + hint->key.offset;
@@ -317,8 +319,7 @@ again:
 			free_check = cache->key.offset;
 		else
 			free_check = div_factor(cache->key.offset, factor);
-
-		if (used < free_check) {
+		if (used + cache->pinned < free_check) {
 			found_group = cache;
 			goto found;
 		}
@@ -664,6 +665,37 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 }
+static int update_pinned_extents(struct btrfs_root *root,
+				u64 bytenr, u64 num, int pin)
+{
+	u64 len;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	if (pin)	/* record (or drop) the range in the pinned_extents tree */
+		set_extent_dirty(&fs_info->pinned_extents,
+				bytenr, bytenr + num - 1, GFP_NOFS);
+	else
+		clear_extent_dirty(&fs_info->pinned_extents,
+				bytenr, bytenr + num - 1, GFP_NOFS);
+	while (num > 0) {	/* one block group per pass */
+		cache = btrfs_lookup_block_group(fs_info, bytenr);
+		if (WARN_ON(!cache))
+			break;	/* no group to charge; don't dereference NULL */
+		len = min(num, cache->key.offset -
+			  (bytenr - cache->key.objectid));
+		if (pin) {
+			cache->pinned += len;
+			fs_info->total_pinned += len;
+		} else {
+			cache->pinned -= len;
+			fs_info->total_pinned -= len;
+		}
+		bytenr += len;
+		num -= len;
+	}
+	return 0;
+}
 
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
 {
@@ -691,9 +723,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
-	struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
 	struct extent_map_tree *free_space_cache;
-
 	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
@@ -701,9 +731,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-
-		clear_extent_dirty(pinned_extents, start, end,
-				   GFP_NOFS);
+		update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 	}
@@ -761,8 +789,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
-		set_extent_dirty(&root->fs_info->pinned_extents,
-				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+		update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
@@ -866,8 +893,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 					    EXTENT_LOCKED);
 		if (ret)
 			break;
-
-		set_extent_dirty(pinned_extents, start, end, GFP_NOFS);
+		update_pinned_extents(extent_root, start, end + 1 - start, 1);
 		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
 		ret = __free_extent(trans, extent_root,
@@ -1579,7 +1605,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				   sizeof(cache->item));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->cached = 0;
-
+		cache->pinned = 0;
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 
-- 
cgit v1.2.3


From 5e5745dcaf73c5860a16c09572d59383ecee6d28 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 16 Nov 2007 14:57:09 -0500
Subject: Btrfs: Add full_scan parameter to find_search_start

This patch adds a new parameter 'full_scan' to 'find_search_start',
so that 'find_search_start' knows whether 'find_free_extent' is in its
full scan phase. I feel that 'find_search_start' should skip calling
'btrfs_find_block_group' when 'find_free_extent' is in the full scan
phase. In my test on a 2GB volume, an Oops occurs when space usage is
about 76%. After applying the patch, the Oops occurs when space usage
is near 100%.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ef6dc3d7d3..e5340677d6c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -158,10 +158,10 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 	return NULL;
 }
-
 static u64 find_search_start(struct btrfs_root *root,
 			     struct btrfs_block_group_cache **cache_ret,
-			     u64 search_start, int num, int data)
+			     u64 search_start, int num,
+			     int data, int full_scan)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
@@ -218,10 +218,10 @@ wrapped:
 	if (cache_miss && !cache->cached) {
 		cache_block_group(root, cache);
 		last = cache_miss;
-
 		cache = btrfs_lookup_block_group(root->fs_info, last);
 	}
-	cache = btrfs_find_block_group(root, cache, last, data, 0);
+	if (!full_scan)
+		cache = btrfs_find_block_group(root, cache, last, data, 0);
 	*cache_ret = cache;
 	cache_miss = 0;
 	goto again;
@@ -979,12 +979,10 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	total_needed += empty_size;
 	path = btrfs_alloc_path();
-
 check_failed:
-	search_start = find_search_start(root, &block_group,
-					 search_start, total_needed, data);
+	search_start = find_search_start(root, &block_group, search_start,
+					 total_needed, data, full_scan);
 	cached_start = search_start;
-
 	btrfs_init_path(path);
 	ins->objectid = search_start;
 	ins->offset = 0;
-- 
cgit v1.2.3


From 5cf664263b05beb080bf95037e4c3bd50c80b096 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 16 Nov 2007 14:57:09 -0500
Subject: Btrfs: Off by one fixes for extent-tree.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e5340677d6c..6a7a9ba75fd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -150,12 +150,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 		return NULL;
 
 	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
-
-
-	if (block_group->key.objectid <= bytenr && bytenr <=
+	if (block_group->key.objectid <= bytenr && bytenr <
 	    block_group->key.objectid + block_group->key.offset)
 		return block_group;
-
 	return NULL;
 }
 static u64 find_search_start(struct btrfs_root *root,
@@ -195,7 +192,7 @@ again:
 			continue;
 		}
 		if (data != BTRFS_BLOCK_GROUP_MIXED &&
-		    start + num >= cache->key.objectid + cache->key.offset)
+		    start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		return start;
 	}
@@ -1088,9 +1085,8 @@ check_pending:
 
 	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
-
 	if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
-	    ins->objectid + num_bytes >= block_group->
+	    ins->objectid + num_bytes > block_group->
 	    key.objectid + block_group->key.offset) {
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
-- 
cgit v1.2.3


From 1caf9342fc71d020e788a59598a44d4ea0136b36 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Mon, 19 Nov 2007 10:18:17 -0500
Subject: Btrfs: Make ACLs return EOPNOTSUPP for now

There was a slight problem with ACLs returning EINVAL when you tried to set an
ACL.  This isn't correct; we should be returning EOPNOTSUPP, so I did a very
ugly thing and just commented out every handler body and made them return
EOPNOTSUPP.  This is only temporary; I'm going back to implement ACLs, but Chris
wants to push out a release, so this will suffice for now.

Also Yan suggested setting reada to -1 in the delete case to enable backwards
readahead, and in the listxattr case I moved path->reada = 2; to after the if
(!path) check so we can avoid a possible null dereference.  Thank you,

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 18 ++++++++++++++++++
 fs/btrfs/xattr.c |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index aee9f0657c3..889ba89d3f9 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -23,32 +23,50 @@
 #include "ctree.h"
 #include "xattr.h"
 
+/*
+ * FIXME: At this point this is all place holder stuff, we just return
+ * -EOPNOTSUPP so cp won't complain when it tries to copy over a file with an
+ *  acl on it.
+ */
+
 static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
 				      void *value, size_t size)
 {
+	/*
 	return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name,
 			       value, size);
+	*/
+	return -EOPNOTSUPP;
 }
 
 static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
 				      const void *value, size_t size, int flags)
 {
+	/*
 	return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name,
 			       value, size, flags);
+	*/
+	return -EOPNOTSUPP;
 }
 
 static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 				       void *value, size_t size)
 {
+	/*
 	return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
 			       name, value, size);
+	*/
+	return -EOPNOTSUPP;
 }
 
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
 				       const void *value, size_t size, int flags)
 {
+	/*
 	return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
 			       name, value, size, flags);
+	*/
+	return -EOPNOTSUPP;
 }
 
 struct xattr_handler btrfs_xattr_acl_default_handler = {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f4ac5e0bbad..984616cca25 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -302,9 +302,9 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	key.offset = 0;
 
 	path = btrfs_alloc_path();
-	path->reada = 2;
 	if (!path)
 		return -ENOMEM;
+	path->reada = 2;
 
 	mutex_lock(&root->fs_info->fs_mutex);
 
@@ -410,7 +410,7 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
+	path->reada = -1;
 	key.objectid = inode->i_ino;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
 	key.offset = (u64)-1;
-- 
cgit v1.2.3


From 3acd7ee876c508973eb4722de5b00b3dff4c7372 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Mon, 19 Nov 2007 10:18:19 -0500
Subject: Btrfs: xattr fixes

This fixes two typos that Yan pointed out to me.  Thanks much,

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/xattr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 984616cca25..33caaf5b4b0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,7 +180,7 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 	data_ptr = (unsigned long)((char *)(di + 1) +
 				   btrfs_dir_name_len(leaf, di));
 	read_extent_buffer(leaf, buffer, data_ptr,
-			   btrfs_dir_name_len(leaf, di));
+			   btrfs_dir_data_len(leaf, di));
 	ret = btrfs_dir_data_len(leaf, di);
 
 out:
@@ -490,7 +490,7 @@ struct xattr_handler btrfs_xattr_user_handler = {
 };
 
 struct xattr_handler btrfs_xattr_trusted_handler = {
-	.prefix = XATTR_USER_PREFIX,
+	.prefix = XATTR_TRUSTED_PREFIX,
 	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_trusted_get,
 	.set	= btrfs_xattr_trusted_set,
-- 
cgit v1.2.3


From 2f4cbe6442d3be7b5e4cf1607a5ab33995fe2d25 Mon Sep 17 00:00:00 2001
From: Wyatt Banks <wyatt@banksresearch.com>
Date: Mon, 19 Nov 2007 10:22:33 -0500
Subject: Btrfs: Return value checking in module init

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 16 ++++++++++++++--
 fs/btrfs/extent_map.h |  2 +-
 fs/btrfs/super.c      | 21 ++++++++++++++++++---
 3 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b3ff63c4bf2..f91f28efdb5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -42,18 +42,30 @@ struct extent_page_data {
 	struct extent_map_tree *tree;
 	get_extent_t *get_extent;
 };
-
-void __init extent_map_init(void)
+int __init extent_map_init(void)
 {
 	extent_map_cache = btrfs_cache_create("extent_map",
 					    sizeof(struct extent_map), 0,
 					    NULL);
+	if (!extent_map_cache)
+		return -ENOMEM;
 	extent_state_cache = btrfs_cache_create("extent_state",
 					    sizeof(struct extent_state), 0,
 					    NULL);
+	if (!extent_state_cache)
+		goto free_map_cache;
 	extent_buffer_cache = btrfs_cache_create("extent_buffers",
 					    sizeof(struct extent_buffer), 0,
 					    NULL);
+	if (!extent_buffer_cache)
+		goto free_state_cache;
+	return 0;
+
+free_state_cache:
+	kmem_cache_destroy(extent_state_cache);
+free_map_cache:
+	kmem_cache_destroy(extent_map_cache);
+	return -ENOMEM;
 }
 
 void __exit extent_map_exit(void)
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fbd23a72f53..24ddc8c9346 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -110,7 +110,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
-void __init extent_map_init(void);
+int __init extent_map_init(void);
 void __exit extent_map_exit(void);
 
 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c46bc391179..ffa02872c4d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -347,9 +347,24 @@ static int __init init_btrfs_fs(void)
 	btrfs_init_transaction_sys();
 	err = btrfs_init_cachep();
 	if (err)
-		return err;
-	extent_map_init();
-	return register_filesystem(&btrfs_fs_type);
+		goto free_transaction_sys;
+	err = extent_map_init();
+	if (err)
+		goto free_cachep;
+
+	err = register_filesystem(&btrfs_fs_type);
+	if (err)
+		goto free_extent_map;
+	return 0;
+
+free_extent_map:
+	extent_map_exit();
+free_cachep:
+	btrfs_destroy_cachep();
+free_transaction_sys:
+	btrfs_exit_transaction_sys();
+	btrfs_exit_sysfs();
+	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
-- 
cgit v1.2.3


From 3e9fd94ff0028a044d55690eb0a801fd1472e3c6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Nov 2007 10:47:25 -0500
Subject: Btrfs: Avoid fragmentation from parallel delalloc filling

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 83 ++++++++++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f91f28efdb5..7fd4eb7a8f0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1033,11 +1033,11 @@ out:
 EXPORT_SYMBOL(find_first_extent_bit);
 
 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
-			     u64 start, u64 lock_start, u64 *end, u64 max_bytes)
+			     u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
-	u64 cur_start = start;
+	u64 cur_start = *start;
 	u64 found = 0;
 	u64 total_bytes = 0;
 
@@ -1054,27 +1054,43 @@ search_again:
 
 	while(1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->start != cur_start) {
+		if (found && state->start != cur_start) {
 			goto out;
 		}
 		if (!(state->state & EXTENT_DELALLOC)) {
 			goto out;
 		}
-		if (state->start >= lock_start) {
-			if (state->state & EXTENT_LOCKED) {
-				DEFINE_WAIT(wait);
-				atomic_inc(&state->refs);
-				prepare_to_wait(&state->wq, &wait,
-						TASK_UNINTERRUPTIBLE);
-				write_unlock_irq(&tree->lock);
-				schedule();
-				write_lock_irq(&tree->lock);
-				finish_wait(&state->wq, &wait);
-				free_extent_state(state);
-				goto search_again;
+		if (!found) {
+			struct extent_state *prev_state;
+			struct rb_node *prev_node = node;
+			while(1) {
+				prev_node = rb_prev(prev_node);
+				if (!prev_node)
+					break;
+				prev_state = rb_entry(prev_node,
+						      struct extent_state,
+						      rb_node);
+				if (!(prev_state->state & EXTENT_DELALLOC))
+					break;
+				state = prev_state;
+				node = prev_node;
 			}
-			state->state |= EXTENT_LOCKED;
 		}
+		if (state->state & EXTENT_LOCKED) {
+			DEFINE_WAIT(wait);
+			atomic_inc(&state->refs);
+			prepare_to_wait(&state->wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			write_unlock_irq(&tree->lock);
+			schedule();
+			write_lock_irq(&tree->lock);
+			finish_wait(&state->wq, &wait);
+			free_extent_state(state);
+			goto search_again;
+		}
+		state->state |= EXTENT_LOCKED;
+		if (!found)
+			*start = state->start;
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1695,6 +1711,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	struct extent_page_data *epd = data;
 	struct extent_map_tree *tree = epd->tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 delalloc_start;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
@@ -1729,25 +1746,23 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	set_page_extent_mapped(page);
 
-	lock_extent(tree, start, page_end, GFP_NOFS);
-	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
-					       &delalloc_end,
-					       128 * 1024 * 1024);
-	if (nr_delalloc) {
-		tree->ops->fill_delalloc(inode, start, delalloc_end);
-		if (delalloc_end >= page_end + 1) {
-			clear_extent_bit(tree, page_end + 1, delalloc_end,
-					 EXTENT_LOCKED | EXTENT_DELALLOC,
-					 1, 0, GFP_NOFS);
-		}
-		clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
-				 0, 0, GFP_NOFS);
-		if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-			printk("found delalloc bits after clear extent_bit\n");
-		}
-	} else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after find_delalloc_range returns 0\n");
+	delalloc_start = start;
+	delalloc_end = 0;
+	while(delalloc_end < page_end) {
+		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+						       &delalloc_end,
+						       128 * 1024 * 1024);
+		if (nr_delalloc <= 0)
+			break;
+		tree->ops->fill_delalloc(inode, delalloc_start,
+					 delalloc_end);
+		clear_extent_bit(tree, delalloc_start,
+				 delalloc_end,
+				 EXTENT_LOCKED | EXTENT_DELALLOC,
+				 1, 0, GFP_NOFS);
+		delalloc_start = delalloc_end + 1;
 	}
+	lock_extent(tree, start, page_end, GFP_NOFS);
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-- 
cgit v1.2.3


From 7073c8e852946274e4d50fdf072438612f9dc845 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 20 Nov 2007 13:44:45 -0500
Subject: Btrfs: Make sure page mapping dirty tag is properly cleared

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 7fd4eb7a8f0..a4e9096754f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1825,12 +1825,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (ret)
 			SetPageError(page);
 		else {
-			unsigned long nr = end_index + 1;
+			unsigned long max_nr = end_index + 1;
 			set_range_writeback(tree, cur, cur + iosize - 1);
+			if (!PageWriteback(page)) {
+				printk("warning page %lu not writeback, "
+				       "cur %llu end %llu\n", page->index,
+				       (unsigned long long)cur,
+				       (unsigned long long)end);
+			}
 
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, page_offset, bdev,
-						 &epd->bio, nr,
+						 &epd->bio, max_nr,
 						 end_bio_extent_writepage);
 			if (ret)
 				SetPageError(page);
@@ -1840,6 +1846,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		nr++;
 	}
 done:
+	if (nr == 0) {
+		/* make sure the mapping tag for page dirty gets cleared */
+		set_page_writeback(page);
+		end_page_writeback(page);
+	}
 	unlock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_page(page);
 	return 0;
@@ -2408,6 +2419,13 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 			}
 		}
 		clear_page_dirty_for_io(page);
+		write_lock_irq(&page->mapping->tree_lock);
+		if (!PageDirty(page)) {
+			radix_tree_tag_clear(&page->mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&page->mapping->tree_lock);
 		unlock_page(page);
 	}
 	return 0;
-- 
cgit v1.2.3


From 0e4de58432f552c0ecead965b0fad8a9fb1d31ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Nov 2007 10:55:49 -0500
Subject: Btrfs: Add check for null block group to find_search_start

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6a7a9ba75fd..39be6baccc3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -204,6 +204,7 @@ new_group:
 wrapped:
 	cache = btrfs_lookup_block_group(root->fs_info, last);
 	if (!cache) {
+no_cache:
 		if (!wrapped) {
 			wrapped = 1;
 			last = search_start;
@@ -219,6 +220,8 @@ wrapped:
 	}
 	if (!full_scan)
 		cache = btrfs_find_block_group(root, cache, last, data, 0);
+	if (!cache)
+		goto no_cache;
 	*cache_ret = cache;
 	cache_miss = 0;
 	goto again;
-- 
cgit v1.2.3


From eef1c494a2d4212e13c67d05e9cc3cd1e6dfed5d Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 26 Nov 2007 10:58:13 -0500
Subject: Btrfs: Properly update right_nritems in push_leaf_left

The code that fixes up the right leaf and the code that dirties the
extent buffer both use the variable 'right_nritems', and both expect
'right_nritems' to be the number of items in the right leaf after the push.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1b47fe71e0b..3b16051b121 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1717,11 +1717,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			      btrfs_item_nr_offset(push_items),
 			     (btrfs_header_nritems(right) - push_items) *
 			     sizeof(struct btrfs_item));
-
 	}
-	btrfs_set_header_nritems(right, right_nritems - push_items);
+	right_nritems -= push_items;
+	btrfs_set_header_nritems(right, right_nritems);
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
-
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
 
-- 
cgit v1.2.3


From 015a739c7c238768fbfa4eea8ea2ebc1a35e7bb1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Nov 2007 16:15:16 -0800
Subject: Btrfs: Handle writeback under high memory pressure better

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/extent_map.c  | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 39be6baccc3..3c00f967ecc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -961,7 +961,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	level = btrfs_header_level(root->node);
 
-	if (num_bytes >= 96 * 1024 * 1024 && hint_byte) {
+	if (num_bytes >= 32 * 1024 * 1024 && hint_byte) {
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a4e9096754f..55f272c335c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1861,13 +1861,25 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	int ret;
+	struct address_space *mapping = page->mapping;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
 		.get_extent = get_extent,
 	};
+	struct writeback_control wbc_writepages = {
+		.bdi		= wbc->bdi,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 64,
+		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
+		.range_end	= (loff_t)-1,
+	};
+
 
 	ret = __extent_writepage(page, wbc, &epd);
+
+	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
 	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio);
 	return ret;
-- 
cgit v1.2.3


From 793955bca66c99defdffc857ae6eb7e8431d6bbe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Nov 2007 16:34:41 -0800
Subject: Btrfs: Limit btree writeback to prevent seeks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c    |  9 +++++++++
 fs/btrfs/extent_map.c | 39 +++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_map.h |  3 +++
 3 files changed, 51 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3e16cca72b4..28e9ef8a325 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -210,6 +210,15 @@ static int btree_writepages(struct address_space *mapping,
 {
 	struct extent_map_tree *tree;
 	tree = &BTRFS_I(mapping->host)->extent_tree;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		u64 num_dirty;
+		u64 start = 0;
+		unsigned long thresh = 96 * 1024 * 1024;
+		num_dirty = count_range_bits(tree, &start, thresh, EXTENT_DIRTY);
+		if (num_dirty < thresh) {
+			return 0;
+		}
+	}
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 55f272c335c..b6a4974ecc2 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1106,6 +1106,45 @@ out:
 	return found;
 }
 
+u64 count_range_bits(struct extent_map_tree *tree,
+		     u64 *start, u64 max_bytes, unsigned long bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 total_bytes = 0;
+	int found = 0;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if ((state->state & bits)) {
+			total_bytes += state->end - state->start + 1;
+			if (total_bytes >= max_bytes)
+				break;
+			if (!found) {
+				*start = state->start;
+				found = 1;
+			}
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return total_bytes;
+}
+
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 24ddc8c9346..13c562f7cb6 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -113,6 +113,9 @@ int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 int __init extent_map_init(void);
 void __exit extent_map_exit(void);
 
+u64 count_range_bits(struct extent_map_tree *tree,
+		     u64 *start, u64 max_bytes, unsigned long bits);
+
 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
 		   int bits, int filled);
 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-- 
cgit v1.2.3


From 8790bad7fb5ac4929dcd52c5dd8d6b81f8c05be8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 26 Nov 2007 19:47:05 -0500
Subject: Btrfs: Only limit btree writeback for pdflush

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 28e9ef8a325..8a74f9027d6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -210,7 +210,7 @@ static int btree_writepages(struct address_space *mapping,
 {
 	struct extent_map_tree *tree;
 	tree = &BTRFS_I(mapping->host)->extent_tree;
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->sync_mode == WB_SYNC_NONE && current_is_pdflush()) {
 		u64 num_dirty;
 		u64 start = 0;
 		unsigned long thresh = 96 * 1024 * 1024;
-- 
cgit v1.2.3


From 448d640b668dae3928591e83f2bf4ca9f4c06e52 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Nov 2007 07:52:01 -0800
Subject: Btrfs: Fine tune the btree writeback exclusion some more

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++++
 fs/btrfs/file.c    | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8a74f9027d6..de0552532d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -214,6 +214,10 @@ static int btree_writepages(struct address_space *mapping,
 		u64 num_dirty;
 		u64 start = 0;
 		unsigned long thresh = 96 * 1024 * 1024;
+
+		if (wbc->for_kupdate)
+			return 0;
+
 		num_dirty = count_range_bits(tree, &start, thresh, EXTENT_DIRTY);
 		if (num_dirty < thresh) {
 			return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9dd2c5c23b5..ba624ae16e6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -767,7 +767,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		num_written += write_bytes;
 
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
-		btrfs_btree_balance_dirty(root, 1);
+		if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+			btrfs_btree_balance_dirty(root, 1);
 		cond_resched();
 	}
 	mutex_unlock(&inode->i_mutex);
-- 
cgit v1.2.3


From ca6646264b7dab662d84435441164bb2a8e8885a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 Nov 2007 11:16:35 -0500
Subject: Btrfs: Add efficient dirty accounting to the extent_map tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c    |  7 ++++++-
 fs/btrfs/extent_map.c | 32 +++++++++++++++++++++++++++++---
 fs/btrfs/extent_map.h |  1 +
 3 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index de0552532d2..eef4ab56b9c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -210,7 +210,7 @@ static int btree_writepages(struct address_space *mapping,
 {
 	struct extent_map_tree *tree;
 	tree = &BTRFS_I(mapping->host)->extent_tree;
-	if (wbc->sync_mode == WB_SYNC_NONE && current_is_pdflush()) {
+	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
 		unsigned long thresh = 96 * 1024 * 1024;
@@ -218,6 +218,11 @@ static int btree_writepages(struct address_space *mapping,
 		if (wbc->for_kupdate)
 			return 0;
 
+		if (current_is_pdflush()) {
+			thresh = 96 * 1024 * 1024;
+		} else {
+			thresh = 8 * 1024 * 1024;
+		}
 		num_dirty = count_range_bits(tree, &start, thresh, EXTENT_DIRTY);
 		if (num_dirty < thresh) {
 			return 0;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b6a4974ecc2..06e437723dc 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -42,6 +42,7 @@ struct extent_page_data {
 	struct extent_map_tree *tree;
 	get_extent_t *get_extent;
 };
+
 int __init extent_map_init(void)
 {
 	extent_map_cache = btrfs_cache_create("extent_map",
@@ -94,6 +95,7 @@ void extent_map_tree_init(struct extent_map_tree *tree,
 	tree->map.rb_node = NULL;
 	tree->state.rb_node = NULL;
 	tree->ops = NULL;
+	tree->dirty_bytes = 0;
 	rwlock_init(&tree->lock);
 	spin_lock_init(&tree->lru_lock);
 	tree->mapping = mapping;
@@ -414,6 +416,8 @@ static int insert_state(struct extent_map_tree *tree,
 		printk("end < start %Lu %Lu\n", end, start);
 		WARN_ON(1);
 	}
+	if (bits & EXTENT_DIRTY)
+		tree->dirty_bytes += end - start + 1;
 	state->state |= bits;
 	state->start = start;
 	state->end = end;
@@ -476,6 +480,12 @@ static int clear_state_bit(struct extent_map_tree *tree,
 			    int delete)
 {
 	int ret = state->state & bits;
+
+	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		WARN_ON(range > tree->dirty_bytes);
+		tree->dirty_bytes -= range;
+	}
 	state->state &= ~bits;
 	if (wake)
 		wake_up(&state->wq);
@@ -668,6 +678,17 @@ out:
 }
 EXPORT_SYMBOL(wait_extent_bit);
 
+static void set_state_bits(struct extent_map_tree *tree,
+			   struct extent_state *state,
+			   int bits)
+{
+	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		tree->dirty_bytes += range;
+	}
+	state->state |= bits;
+}
+
 /*
  * set some bits on a range in the tree.  This may require allocations
  * or sleeping, so the gfp mask is used to indicate what is allowed.
@@ -727,7 +748,7 @@ again:
 			err = -EEXIST;
 			goto out;
 		}
-		state->state |= bits;
+		set_state_bits(tree, state, bits);
 		start = state->end + 1;
 		merge_state(tree, state);
 		goto search_again;
@@ -762,7 +783,7 @@ again:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			state->state |= bits;
+			set_state_bits(tree, state, bits);
 			start = state->end + 1;
 			merge_state(tree, state);
 		} else {
@@ -808,7 +829,7 @@ again:
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
-		prealloc->state |= bits;
+		set_state_bits(tree, prealloc, bits);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
 		goto out;
@@ -1116,6 +1137,11 @@ u64 count_range_bits(struct extent_map_tree *tree,
 	int found = 0;
 
 	write_lock_irq(&tree->lock);
+	if (bits == EXTENT_DIRTY) {
+		*start = 0;
+		total_bytes = tree->dirty_bytes;
+		goto out;
+	}
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 13c562f7cb6..2ffc0c76338 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -40,6 +40,7 @@ struct extent_map_tree {
 	struct rb_root map;
 	struct rb_root state;
 	struct address_space *mapping;
+	u64 dirty_bytes;
 	rwlock_t lock;
 	struct extent_map_ops *ops;
 	spinlock_t lru_lock;
-- 
cgit v1.2.3


From 00f5c795fca47d038fedd3f0c9311da3be710c9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 30 Nov 2007 10:09:33 -0500
Subject: btrfs_drop_extents: make sure the item is getting smaller before
 truncate

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c |  8 ++++++++
 fs/btrfs/file.c        | 30 +++++++++++++++---------------
 3 files changed, 24 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 56b977ffe91..b29b911dd82 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1037,7 +1037,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_end, u64 *hint_block);
+		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3c00f967ecc..55abdf997ca 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -45,6 +45,9 @@ static int cache_block_group(struct btrfs_root *root,
 	u64 first_free;
 	int found = 0;
 
+	if (!block_group)
+		return 0;
+
 	root = root->fs_info->extent_root;
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -168,6 +171,11 @@ static u64 find_search_start(struct btrfs_root *root,
 	u64 cache_miss = 0;
 	int wrapped = 0;
 
+	if (!cache) {
+		cache = btrfs_lookup_block_group(root->fs_info, search_start);
+		if (!cache)
+			return search_start;
+	}
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ba624ae16e6..b0d63778726 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -377,23 +377,23 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
  */
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
-		       u64 start, u64 end, u64 inline_end, u64 *hint_byte)
+		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
-	int ret;
-	struct btrfs_key key;
+	u64 extent_end = 0;
+	u64 search_start = start;
 	struct extent_buffer *leaf;
-	int slot;
 	struct btrfs_file_extent_item *extent;
-	u64 extent_end = 0;
-	int keep;
-	struct btrfs_file_extent_item old;
 	struct btrfs_path *path;
-	u64 search_start = start;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item old;
+	int keep;
+	int slot;
 	int bookend;
 	int found_type;
 	int found_extent;
 	int found_inline;
 	int recow;
+	int ret;
 
 	btrfs_drop_extent_cache(inode, start, end - 1);
 
@@ -502,7 +502,7 @@ next_slot:
 			}
 			bookend = 1;
 			if (found_inline && start <= key.offset &&
-			    inline_end < extent_end)
+			    inline_limit < extent_end)
 				keep = 1;
 		}
 		/* truncate existing extent */
@@ -526,12 +526,12 @@ next_slot:
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
 				btrfs_mark_buffer_dirty(leaf);
-			} else if (end > extent_end &&
-				   key.offset < inline_end &&
-				   inline_end < extent_end) {
+			} else if (key.offset < inline_limit &&
+				   (end > extent_end) &&
+				   (inline_limit < extent_end)) {
 				u32 new_size;
 				new_size = btrfs_file_extent_calc_inline_size(
-						   inline_end - key.offset);
+						   inline_limit - key.offset);
 				btrfs_truncate_item(trans, root, path,
 						    new_size, 1);
 			}
@@ -575,10 +575,10 @@ next_slot:
 				continue;
 		}
 		if (bookend && found_inline && start <= key.offset &&
-		    inline_end < extent_end) {
+		    inline_limit < extent_end && key.offset <= inline_limit) {
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
-						   extent_end - inline_end);
+						   extent_end - inline_limit);
 			btrfs_truncate_item(trans, root, path, new_size, 0);
 		}
 		/* create bookend, splitting the extent in two */
-- 
cgit v1.2.3


From 87ee04eb0f2f0c63314cef4a76bd1adac748425e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 30 Nov 2007 11:30:34 -0500
Subject: Btrfs: Add simple stripe size parameter

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  5 +++++
 fs/btrfs/disk-io.c     | 15 ++++++++++-----
 fs/btrfs/extent-tree.c | 36 ++++++++++++++++++++++++++++--------
 3 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b29b911dd82..9bc1b0a8615 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -135,6 +135,7 @@ struct btrfs_super_block {
 	__le32 sectorsize;
 	__le32 nodesize;
 	__le32 leafsize;
+	__le32 stripesize;
 	u8 root_level;
 } __attribute__ ((__packed__));
 
@@ -353,6 +354,8 @@ struct btrfs_root {
 	/* leaf allocations are done in leafsize units */
 	u32 leafsize;
 
+	u32 stripesize;
+
 	u32 type;
 	u64 highest_inode;
 	u64 last_inode_alloc;
@@ -776,6 +779,8 @@ BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
 			 nodesize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
 			 leafsize, 32);
+BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
+			 stripesize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
 			 root_dir_objectid, 64);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eef4ab56b9c..60a30da6af0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -355,7 +355,7 @@ int wait_on_tree_block_writeback(struct btrfs_root *root,
 }
 
 static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
-			struct btrfs_root *root,
+			u32 stripesize, struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
 			u64 objectid)
 {
@@ -365,6 +365,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
+	root->stripesize = stripesize;
 	root->ref_cows = 0;
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -393,7 +394,8 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	u32 blocksize;
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, root, fs_info, objectid);
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, objectid);
 	ret = btrfs_find_last_root(tree_root, objectid,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
@@ -430,8 +432,8 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 	}
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, root, fs_info,
-		     location->objectid);
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, location->objectid);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -537,6 +539,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	u32 nodesize;
 	u32 leafsize;
 	u32 blocksize;
+	u32 stripesize;
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -607,7 +610,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		goto fail_iput;
 	}
 #endif
-	__setup_root(512, 512, 512, tree_root,
+	__setup_root(512, 512, 512, 512, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	fs_info->sb_buffer = read_tree_block(tree_root,
@@ -630,9 +633,11 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
+	stripesize = btrfs_super_stripesize(disk_super);
 	tree_root->nodesize = nodesize;
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
+	tree_root->stripesize = stripesize;
 	sb_set_blocksize(sb, sectorsize);
 
 	i_size_write(fs_info->btree_inode,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 55abdf997ca..91397e98939 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -932,6 +932,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret ? ret : pending_ret;
 }
 
+/* round val up to the next stripesize multiple (presumes a power of two) */
+static u64 stripe_align(struct btrfs_root *root, u64 val)
+{
+	u64 mask = (u64)root->stripesize - 1;
+	return (val + mask) & ~mask;
+}
+
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
@@ -948,8 +955,9 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	int ret;
 	u64 hole_size = 0;
+	u64 aligned;
+	int ret;
 	int slot = 0;
 	u64 last_byte = 0;
 	u64 orig_search_start = search_start;
@@ -990,6 +998,7 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 check_failed:
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data, full_scan);
+	search_start = stripe_align(root, search_start);
 	cached_start = search_start;
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -1039,13 +1048,23 @@ check_failed:
 			search_start = max(search_start,
 					   block_group->key.objectid);
 			if (!start_found) {
-				ins->objectid = search_start;
-				ins->offset = search_end - search_start;
+				aligned = stripe_align(root, search_start);
+				ins->objectid = aligned;
+				if (aligned >= search_end) {
+					ret = -ENOSPC;
+					goto error;
+				}
+				ins->offset = search_end - aligned;
 				start_found = 1;
 				goto check_pending;
 			}
-			ins->objectid = last_byte > search_start ?
-					last_byte : search_start;
+			ins->objectid = stripe_align(root,
+						     last_byte > search_start ?
+						     last_byte : search_start);
+			if (search_end <= ins->objectid) {
+				ret = -ENOSPC;
+				goto error;
+			}
 			ins->offset = search_end - ins->objectid;
 			BUG_ON(ins->objectid >= search_end);
 			goto check_pending;
@@ -1056,9 +1075,10 @@ check_failed:
 		    start_found) {
 			if (last_byte < search_start)
 				last_byte = search_start;
-			hole_size = key.objectid - last_byte;
-			if (hole_size >= num_bytes) {
-				ins->objectid = last_byte;
+			aligned = stripe_align(root, last_byte);
+			hole_size = key.objectid - aligned;
+			if (key.objectid > aligned && hole_size >= num_bytes) {
+				ins->objectid = aligned;
 				ins->offset = hole_size;
 				goto check_pending;
 			}
-- 
cgit v1.2.3


From 1a2b2ac78a4151a49267a1e3f48a131f5810814e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 4 Dec 2007 13:18:24 -0500
Subject: Btrfs: Fix extent allocation for btree blocks as the disk fills

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 91397e98939..8ab4954f6ad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -172,9 +172,7 @@ static u64 find_search_start(struct btrfs_root *root,
 	int wrapped = 0;
 
 	if (!cache) {
-		cache = btrfs_lookup_block_group(root->fs_info, search_start);
-		if (!cache)
-			return search_start;
+		goto out;
 	}
 again:
 	ret = cache_block_group(root, cache);
@@ -205,6 +203,13 @@ again:
 		return start;
 	}
 out:
+	cache = btrfs_lookup_block_group(root->fs_info, search_start);
+	if (!cache) {
+		printk("Unable to find block group for %Lu\n",
+		       search_start);
+		WARN_ON(1);
+		return search_start;
+	}
 	return search_start;
 
 new_group:
@@ -219,15 +224,14 @@ no_cache:
 			data = BTRFS_BLOCK_GROUP_MIXED;
 			goto wrapped;
 		}
-		return search_start;
+		goto out;
 	}
 	if (cache_miss && !cache->cached) {
 		cache_block_group(root, cache);
 		last = cache_miss;
 		cache = btrfs_lookup_block_group(root->fs_info, last);
 	}
-	if (!full_scan)
-		cache = btrfs_find_block_group(root, cache, last, data, 0);
+	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
@@ -985,12 +989,14 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 	if (hint_byte) {
 		block_group = btrfs_lookup_block_group(info, hint_byte);
+		if (!block_group)
+			hint_byte = search_start;
 		block_group = btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
 	} else {
 		block_group = btrfs_find_block_group(root,
-						     trans->block_group, 0,
-						     data, 1);
+						     trans->block_group,
+						     search_start, data, 1);
 	}
 
 	total_needed += empty_size;
@@ -1159,14 +1165,14 @@ enospc:
 			if (!full_scan)
 				total_needed -= empty_size;
 			full_scan = 1;
+			data = BTRFS_BLOCK_GROUP_MIXED;
 		} else
 			wrapped = 1;
 	}
 	block_group = btrfs_lookup_block_group(info, search_start);
 	cond_resched();
-	if (!full_scan)
-		block_group = btrfs_find_block_group(root, block_group,
-						     search_start, data, 0);
+	block_group = btrfs_find_block_group(root, block_group,
+					     search_start, data, 0);
 	goto check_failed;
 
 error:
-- 
cgit v1.2.3


From 9691975dd663b255fa52404dc7de4b18892bf244 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 4 Dec 2007 13:20:20 -0500
Subject: Btrfs: Fix buffer get/release issue in create_snapshot
 btrfs_cow_block expects a reference to be held on the buffer being cow'd.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 37f8e0321af..d0af0807d9f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2034,11 +2034,11 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	key.objectid = objectid;
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-
+	extent_buffer_get(root->node);
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+	free_extent_buffer(tmp);
 	btrfs_set_root_bytenr(&new_root_item, root->node->start);
 	btrfs_set_root_level(&new_root_item, btrfs_header_level(root->node));
-
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
 	if (ret)
-- 
cgit v1.2.3


From 17636e03f42a1a42fed3834859de4702bd655fd1 Mon Sep 17 00:00:00 2001
From: Christian Hesse <list@eworm.de>
Date: Tue, 11 Dec 2007 09:25:06 -0500
Subject: Btrfs: section mismatch warnings

--Boundary-00=_CcOWHFYK4T+JwSj
Content-Type: text/plain;
  charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

Hello everybody,

compiling btrfs into the kernel results in section mismatch warnings. __exit
functions are called where they are not allowed to. The attached patch fixes
this for me. Not sure if it is correct though.

Signed-off-by: Christian Hesse <mail@earthworm.de>
--
Regards,
Chris

--Boundary-00=_CcOWHFYK4T+JwSj
Content-Type: text/x-diff; charset="iso-8859-1";
	name="btrfs-section_mismatches.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="btrfs-section_mismatches.patch"

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c  | 2 +-
 fs/btrfs/extent_map.h  | 2 +-
 fs/btrfs/transaction.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 06e437723dc..8326a18413f 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -69,7 +69,7 @@ free_map_cache:
 	return -ENOMEM;
 }
 
-void __exit extent_map_exit(void)
+void extent_map_exit(void)
 {
 	struct extent_state *state;
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 2ffc0c76338..6e572d3e892 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -112,7 +112,7 @@ void free_extent_map(struct extent_map *em);
 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
 int __init extent_map_init(void);
-void __exit extent_map_exit(void);
+void extent_map_exit(void);
 
 u64 count_range_bits(struct extent_map_tree *tree,
 		     u64 *start, u64 max_bytes, unsigned long bits);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4af1c0dab7f..0229e69bd2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -666,7 +666,7 @@ void __init btrfs_init_transaction_sys(void)
 	trans_wq = create_workqueue("btrfs");
 }
 
-void __exit btrfs_exit_transaction_sys(void)
+void btrfs_exit_transaction_sys(void)
 {
 	destroy_workqueue(trans_wq);
 }
-- 
cgit v1.2.3


From 74493f7a59bfd4d1c7029c74ab2cd0e400612c6b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Dec 2007 09:25:06 -0500
Subject: Btrfs: Implement generation numbers in block pointers

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 16 ++++++++++++++
 fs/btrfs/ctree.h       | 47 ++++++++++++++++++++++++++++++++++-----
 fs/btrfs/extent-tree.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 115 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3b16051b121..5697705f753 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -114,6 +114,9 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else {
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
+		WARN_ON(trans->transid == 0);
+		btrfs_set_node_ptr_generation(parent, parent_slot,
+					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len, 1);
@@ -967,6 +970,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 {
 	struct extent_buffer *b;
 	u64 bytenr;
+	u64 ptr_gen;
 	int slot;
 	int ret;
 	int level;
@@ -1031,10 +1035,18 @@ again:
 			if (level == lowest_level)
 				break;
 			bytenr = btrfs_node_blockptr(b, slot);
+			ptr_gen = btrfs_node_ptr_generation(b, slot);
 			if (should_reada)
 				reada_for_search(root, p, level, slot);
 			b = read_tree_block(root, bytenr,
 					    btrfs_level_size(root, level - 1));
+			if (ptr_gen != btrfs_header_generation(b)) {
+				printk("block %llu bad gen wanted %llu "
+				       "found %llu\n",
+			        (unsigned long long)b->start,
+				(unsigned long long)ptr_gen,
+			        (unsigned long long)btrfs_header_generation(b));
+			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1218,6 +1230,8 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 		btrfs_node_key(lower, &lower_key, 0);
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
+	WARN_ON(btrfs_header_generation(lower) == 0);
+	btrfs_set_node_ptr_generation(c, 0, btrfs_header_generation(lower));
 
 	btrfs_mark_buffer_dirty(c);
 
@@ -1261,6 +1275,8 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	btrfs_set_node_key(lower, key, slot);
 	btrfs_set_node_blockptr(lower, slot, bytenr);
+	WARN_ON(trans->transid == 0);
+	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
 	btrfs_set_header_nritems(lower, nritems + 1);
 	btrfs_mark_buffer_dirty(lower);
 	return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9bc1b0a8615..fd58dd846e6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -110,7 +110,7 @@ struct btrfs_header {
 #define BTRFS_MAX_LEVEL 8
 #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
 			        sizeof(struct btrfs_header)) / \
-			       (sizeof(struct btrfs_disk_key) + sizeof(u64)))
+			        sizeof(struct btrfs_key_ptr))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
@@ -168,6 +168,7 @@ struct btrfs_leaf {
 struct btrfs_key_ptr {
 	struct btrfs_disk_key key;
 	__le64 blockptr;
+	__le64 generation;
 } __attribute__ ((__packed__));
 
 struct btrfs_node {
@@ -196,7 +197,13 @@ struct btrfs_path {
  */
 struct btrfs_extent_item {
 	__le32 refs;
-	__le64 owner;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_ref {	/* item body for BTRFS_EXTENT_REF_KEY items */
+	__le64 root;		/* insert_extent_ref() stores root_objectid */
+	__le64 generation;	/* insert_extent_ref() stores root_generation */
+	__le64 objectid;	/* insert_extent_ref() stores owner */
+	__le64 offset;		/* insert_extent_ref() stores owner_offset */
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_timespec {
@@ -402,12 +409,13 @@ struct btrfs_root {
  * are used, and how many references there are to each block
  */
 #define BTRFS_EXTENT_ITEM_KEY	33
+#define BTRFS_EXTENT_REF_KEY	34
 
 /*
  * block groups give us hints into the extent allocation trees.  Which
  * blocks are free etc etc
  */
-#define BTRFS_BLOCK_GROUP_ITEM_KEY 34
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 50
 
 /*
  * string items are for debugging.  They just store a short string of
@@ -529,15 +537,25 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
 
 /* struct btrfs_extent_item */
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-BTRFS_SETGET_FUNCS(extent_owner, struct btrfs_extent_item, owner, 64);
+
+/* struct btrfs_extent_ref */
+BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
+
+BTRFS_SETGET_STACK_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(ref_generation, struct btrfs_extent_ref,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
 
 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
 			 refs, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_extent_owner, struct btrfs_extent_item,
-			 owner, 64);
 
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
+BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
 
 static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
 {
@@ -556,6 +574,23 @@ static inline void btrfs_set_node_blockptr(struct extent_buffer *eb,
 	btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val);
 }
 
+/* fetch the generation of the nr'th struct btrfs_key_ptr in node eb */
+static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr)
+{
+	unsigned long off = offsetof(struct btrfs_node, ptrs);
+	off += sizeof(struct btrfs_key_ptr) * nr;
+	return btrfs_key_generation(eb, (struct btrfs_key_ptr *)off);
+}
+
+/* store val as the generation of the nr'th struct btrfs_key_ptr in node eb */
+static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb,
+						 int nr, u64 val)
+{
+	unsigned long off = offsetof(struct btrfs_node, ptrs);
+	off += sizeof(struct btrfs_key_ptr) * nr;
+	btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)off, val);
+}
+
 static inline unsigned long btrfs_node_key_ptr_offset(int nr)
 {
 	return offsetof(struct btrfs_node, ptrs) +
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8ab4954f6ad..0f1ebdd4e92 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -352,9 +353,63 @@ found:
 	return found_group;
 }
 
+/*
+ * Pack two crc32c values, each seeded with all ones, into one u64: the high
+ * 32 bits cover root_objectid then root_generation, the low 32 bits cover
+ * owner then owner_offset.  Every input is hashed in little-endian form.
+ */
+static u64 hash_extent_ref(u64 root_objectid, u64 root_generation,
+			   u64 owner, u64 owner_offset)
+{
+	__le64 le_root = cpu_to_le64(root_objectid);
+	__le64 le_gen = cpu_to_le64(root_generation);
+	__le64 le_owner = cpu_to_le64(owner);
+	__le64 le_offset = cpu_to_le64(owner_offset);
+	u32 hi = crc32c(~(u32)0, &le_root, sizeof(le_root));
+	u32 lo = crc32c(~(u32)0, &le_owner, sizeof(le_owner));
+
+	hi = crc32c(hi, &le_gen, sizeof(le_gen));
+	lo = crc32c(lo, &le_offset, sizeof(le_offset));
+
+	return ((u64)hi << 32) | (u64)lo;
+}
+
+/*
+ * Insert an empty BTRFS_EXTENT_REF_KEY item for bytenr, sized for a struct
+ * btrfs_extent_ref, with key.offset set from btrfs_name_hash() over the
+ * stack ref.  Returns what btrfs_insert_empty_item() returned, -EEXIST
+ * included; the ref is not copied into the item here.
+ */
+int insert_extent_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path,
+				u64 bytenr,
+				u64 root_objectid, u64 root_generation,
+				u64 owner, u64 owner_offset)
+{
+	u64 hash;
+	struct btrfs_key key;
+	struct btrfs_extent_ref ref;
+
+	btrfs_set_stack_ref_root(&ref, root_objectid);
+	btrfs_set_stack_ref_generation(&ref, root_generation);
+	btrfs_set_stack_ref_objectid(&ref, owner);
+	btrfs_set_stack_ref_offset(&ref, owner_offset);
+
+	btrfs_name_hash(&ref, sizeof(ref), &hash);
+	key.offset = hash;
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_REF_KEY;
+
+	/* -EEXIST used to spin in an empty loop forever; report it instead */
+	return btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
+}
+
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes)
+				u64 bytenr, u64 num_bytes,
+				u64 root_objectid, u64 root_generation,
+				u64 owner, u64 owner_offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -386,9 +441,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, path);
-	btrfs_free_path(path);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
+
+	btrfs_free_path(path);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7bb86316c3961d1bc401ef184fd996f999556c7f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Dec 2007 09:25:06 -0500
Subject: Btrfs: Add back pointers from extents to the btree or file
 referencing them

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 196 +++++++++++++++++++++++---
 fs/btrfs/ctree.h       |  41 ++++--
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 373 +++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/file.c        |  20 ++-
 fs/btrfs/inode.c       |  38 +++--
 fs/btrfs/print-tree.c  |  10 ++
 fs/btrfs/tree-defrag.c |   2 +
 8 files changed, 577 insertions(+), 105 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5697705f753..fd8233e05cf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -77,13 +77,37 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct extent_buffer **cow_ret,
 			     u64 search_start, u64 empty_size)
 {
+	u64 root_gen;
 	struct extent_buffer *cow;
+	u32 nritems;
 	int ret = 0;
 	int different_trans = 0;
+	int level;
+	struct btrfs_key first_key;
+
+	if (root->ref_cows) {
+		root_gen = trans->transid;
+	} else {
+		root_gen = 0;
+	}
 
+	WARN_ON(root->ref_cows && trans->transid !=
+		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len,
+	level = btrfs_header_level(buf);
+	nritems = btrfs_header_nritems(buf);
+	if (nritems) {
+		if (level == 0)
+			btrfs_item_key_to_cpu(buf, &first_key, 0);
+		else
+			btrfs_node_key_to_cpu(buf, &first_key, 0);
+	} else {
+		first_key.objectid = 0;
+	}
+	cow = __btrfs_alloc_free_block(trans, root, buf->len,
+				     root->root_key.objectid,
+				     root_gen, first_key.objectid, level,
 				     search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
@@ -104,14 +128,17 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	if (buf == root->node) {
+		root_gen = btrfs_header_generation(buf);
 		root->node = cow;
 		extent_buffer_get(cow);
 		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->start,
-					  buf->len, 1);
+					  buf->len, root->root_key.objectid,
+					  root_gen, 0, 0, 1);
 		}
 		free_extent_buffer(buf);
 	} else {
+		root_gen = btrfs_header_generation(parent);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		WARN_ON(trans->transid == 0);
@@ -119,7 +146,9 @@ static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
-		btrfs_free_extent(trans, root, buf->start, buf->len, 1);
+		btrfs_free_extent(trans, root, buf->start, buf->len,
+				  btrfs_header_owner(parent), root_gen,
+				  0, 0, 1);
 	}
 	free_extent_buffer(buf);
 	btrfs_mark_buffer_dirty(cow);
@@ -606,6 +635,8 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 0;
 
 	mid = path->nodes[level];
+	WARN_ON(btrfs_header_generation(mid) != trans->transid);
+
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
@@ -631,7 +662,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		wait_on_tree_block_writeback(root, mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_extent(trans, root, mid->start, mid->len, 1);
+		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
+					root->root_key.objectid,
+					btrfs_header_generation(mid), 0, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -681,6 +714,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
 			u64 bytenr = right->start;
+			u64 generation = btrfs_header_generation(parent);
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
@@ -692,7 +726,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize, 1);
+						 blocksize,
+						 btrfs_header_owner(parent),
+						 generation, 0, 0, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -722,6 +758,7 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 	if (btrfs_header_nritems(mid) == 0) {
 		/* we've managed to empty the middle node, drop it */
+		u64 root_gen = btrfs_header_generation(parent);
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
 		clean_tree_block(trans, root, mid);
@@ -731,7 +768,9 @@ static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
 		wret = del_ptr(trans, root, path, level + 1, pslot);
 		if (wret)
 			ret = wret;
-		wret = btrfs_free_extent(trans, root, bytenr, blocksize, 1);
+		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					 btrfs_header_owner(parent),
+					 root_gen, 0, 0, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -788,6 +827,7 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		return 1;
 
 	mid = path->nodes[level];
+	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
 	if (level < BTRFS_MAX_LEVEL - 1)
@@ -1113,6 +1153,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
+	WARN_ON(btrfs_header_generation(src) != trans->transid);
+	WARN_ON(btrfs_header_generation(dst) != trans->transid);
 
 	if (push_items <= 0) {
 		return 1;
@@ -1159,6 +1201,9 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	int dst_nritems;
 	int ret = 0;
 
+	WARN_ON(btrfs_header_generation(src) != trans->transid);
+	WARN_ON(btrfs_header_generation(dst) != trans->transid);
+
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
@@ -1202,6 +1247,8 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
+	u64 root_gen;
+	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
 	struct btrfs_disk_key lower_key;
@@ -1209,7 +1256,20 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	c = btrfs_alloc_free_block(trans, root, root->nodesize,
+	if (root->ref_cows)
+		root_gen = trans->transid;
+	else
+		root_gen = 0;
+
+	lower = path->nodes[level-1];
+	if (level == 1)
+		btrfs_item_key(lower, &lower_key, 0);
+	else
+		btrfs_node_key(lower, &lower_key, 0);
+
+	c = __btrfs_alloc_free_block(trans, root, root->nodesize,
+				   root->root_key.objectid,
+				   root_gen, lower_key.objectid, level,
 				   root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
@@ -1219,19 +1279,16 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_bytenr(c, c->start);
 	btrfs_set_header_generation(c, trans->transid);
 	btrfs_set_header_owner(c, root->root_key.objectid);
-	lower = path->nodes[level-1];
 
 	write_extent_buffer(c, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(c),
 			    BTRFS_FSID_SIZE);
-	if (level == 1)
-		btrfs_item_key(lower, &lower_key, 0);
-	else
-		btrfs_node_key(lower, &lower_key, 0);
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
-	WARN_ON(btrfs_header_generation(lower) == 0);
-	btrfs_set_node_ptr_generation(c, 0, btrfs_header_generation(lower));
+	lower_gen = btrfs_header_generation(lower);
+	WARN_ON(lower_gen == 0);
+
+	btrfs_set_node_ptr_generation(c, 0, lower_gen);
 
 	btrfs_mark_buffer_dirty(c);
 
@@ -1241,6 +1298,25 @@ static int insert_new_root(struct btrfs_trans_handle *trans,
 	extent_buffer_get(c);
 	path->nodes[level] = c;
 	path->slots[level] = 0;
+
+	/*
+	 * btrfs_insert_extent_backref() inserts into the extent root through
+	 * the path it is given and releases that path before returning, so
+	 * hand it the scratch path and leave the caller's path untouched.
+	 */
+	if (root->ref_cows && lower_gen != trans->transid) {
+		struct btrfs_path *back_path = btrfs_alloc_path();
+		int ret;
+
+		BUG_ON(!back_path);
+		ret = btrfs_insert_extent_backref(trans,
+						  root->fs_info->extent_root,
+						  back_path, lower->start,
+						  root->root_key.objectid,
+						  trans->transid, 0, 0);
+		BUG_ON(ret);
+		btrfs_free_path(back_path);
+	}
 	return 0;
 }
 
@@ -1294,6 +1363,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level)
 {
+	u64 root_gen;
 	struct extent_buffer *c;
 	struct extent_buffer *split;
 	struct btrfs_disk_key disk_key;
@@ -1303,6 +1373,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	u32 c_nritems;
 
 	c = path->nodes[level];
+	WARN_ON(btrfs_header_generation(c) != trans->transid);
 	if (c == root->node) {
 		/* trying to split the root, lets make a new one */
 		ret = insert_new_root(trans, root, path, level + 1);
@@ -1319,8 +1390,17 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	c_nritems = btrfs_header_nritems(c);
-	split = btrfs_alloc_free_block(trans, root, root->nodesize,
-				       c->start, 0);
+	if (root->ref_cows)
+		root_gen = trans->transid;
+	else
+		root_gen = 0;
+
+	btrfs_node_key(c, &disk_key, 0);
+	split = __btrfs_alloc_free_block(trans, root, root->nodesize,
+					 root->root_key.objectid,
+					 root_gen,
+					 btrfs_disk_key_objectid(&disk_key),
+					 level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -1789,6 +1869,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *ins_key,
 		      struct btrfs_path *path, int data_size, int extend)
 {
+	u64 root_gen;
 	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
@@ -1807,6 +1888,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (extend)
 		space_needed = data_size;
 
+	if (root->ref_cows)
+		root_gen = trans->transid;
+	else
+		root_gen = 0;
+
 	/* first try to make some room by pushing left and right */
 	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -1837,8 +1923,12 @@ again:
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
 
-	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-				       l->start, 0);
+	btrfs_item_key(l, &disk_key, 0);
+
+	right = __btrfs_alloc_free_block(trans, root, root->leafsize,
+					 root->root_key.objectid,
+					 root_gen, disk_key.objectid, 0,
+					 l->start, 0);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
@@ -2413,13 +2503,16 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
+			u64 root_gen = btrfs_header_generation(path->nodes[1]);
 			clean_tree_block(trans, root, leaf);
 			wait_on_tree_block_writeback(root, leaf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
-						 leaf->start, leaf->len, 1);
+					 leaf->start, leaf->len,
+					 btrfs_header_owner(path->nodes[1]),
+					 root_gen, 0, 0, 1);
 			if (wret)
 				ret = wret;
 		}
@@ -2456,9 +2549,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 
 			if (btrfs_header_nritems(leaf) == 0) {
+				u64 root_gen;
 				u64 bytenr = leaf->start;
 				u32 blocksize = leaf->len;
 
+				root_gen = btrfs_header_generation(
+							   path->nodes[1]);
+
 				clean_tree_block(trans, root, leaf);
 				wait_on_tree_block_writeback(root, leaf);
 
@@ -2468,7 +2565,9 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 				free_extent_buffer(leaf);
 				wret = btrfs_free_extent(trans, root, bytenr,
-							 blocksize, 1);
+					     blocksize,
+					     btrfs_header_owner(path->nodes[1]),
+					     root_gen, 0, 0, 1);
 				if (wret)
 					ret = wret;
 			} else {
@@ -2482,6 +2581,72 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * walk up the tree as far as required to find the previous leaf, then walk
+ * back down along the last pointer of every node so the path ends up on the
+ * last item of the leaf just before the one it started in.
+ * returns 0 if it found something or 1 if there are no lesser leaves.
+ * the results of read_tree_block() are not checked, so nothing negative is
+ * ever returned from here.
+ */
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+{
+	int slot;
+	int level = 1;
+	u64 bytenr;
+	struct extent_buffer *c;
+	struct extent_buffer *next = NULL;
+
+	while(level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			return 1;
+
+		slot = path->slots[level];
+		c = path->nodes[level];
+		if (slot == 0) {
+			level++;
+			if (level == BTRFS_MAX_LEVEL)
+				return 1;
+			continue;
+		}
+		slot--;
+
+		bytenr = btrfs_node_blockptr(c, slot);
+		if (next)
+			free_extent_buffer(next);
+
+		if (path->reada < 0)
+			reada_for_search(root, path, level, slot);
+
+		next = read_tree_block(root, bytenr,
+				       btrfs_level_size(root, level - 1));
+		break;
+	}
+	path->slots[level] = slot;
+	while(1) {
+		level--;
+		c = path->nodes[level];
+		free_extent_buffer(c);
+		/*
+		 * everything in this subtree sorts before the leaf we came
+		 * from, so the closest lesser item is behind the last slot,
+		 * not the first one
+		 */
+		slot = btrfs_header_nritems(next);
+		if (slot != 0)
+			slot--;
+		path->nodes[level] = next;
+		path->slots[level] = slot;
+		if (!level)
+			break;
+		if (path->reada)
+			reada_for_search(root, path, level, slot);
+		next = read_tree_block(root, btrfs_node_blockptr(next, slot),
+				       btrfs_level_size(root, level - 1));
+	}
+	return 0;
+}
+
 /*
  * walk up the tree as far as required to find the next leaf.
  * returns 0 if it found something or 1 if there are no greater leaves.
@@ -2503,6 +2657,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
+			if (level == BTRFS_MAX_LEVEL)
+				return 1;
 			continue;
 		}
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fd58dd846e6..cb1b156d954 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -544,11 +544,12 @@ BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
 BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
 
-BTRFS_SETGET_STACK_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
-BTRFS_SETGET_STACK_FUNCS(ref_generation, struct btrfs_extent_ref,
+BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
 			 generation, 64);
-BTRFS_SETGET_STACK_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
+			 objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref, offset, 64);
 
 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
 			 refs, 32);
@@ -914,24 +915,45 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 *hint, u64 search_start,
 						 int data, int owner);
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root);
+		       struct btrfs_root *root, u64 owner_objectid);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root, u32 size,
+					    u64 root_objectid,
 					    u64 hint, u64 empty_size);
+struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     u32 blocksize,
+					     u64 root_objectid,
+					     u64 ref_generation,
+					     u64 first_objectid,
+					     int level,
+					     u64 hint,
+					     u64 empty_size);
+int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, u64 bytenr,
+				 u64 root_objectid, u64 ref_generation,
+				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner,
-		       u64 num_bytes, u64 empty_size, u64 search_start,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 owner_offset,
+		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, int data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes, int pin);
+		      *root, u64 bytenr, u64 num_bytes,
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, u64 owner_offset, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_map_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes);
+				u64 bytenr, u64 num_bytes,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
@@ -966,6 +988,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			    *root, struct btrfs_path *path, struct btrfs_key
 			    *cpu_key, u32 data_size);
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60a30da6af0..0ac21e3aac8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -210,7 +210,7 @@ static int btree_writepages(struct address_space *mapping,
 {
 	struct extent_map_tree *tree;
 	tree = &BTRFS_I(mapping->host)->extent_tree;
-	if (wbc->sync_mode == WB_SYNC_NONE) {
+	if (0 && wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
 		unsigned long thresh = 96 * 1024 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0f1ebdd4e92..32991f73e9d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/crc32c.h>
 #include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -89,7 +90,8 @@ static int cache_block_group(struct btrfs_root *root,
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid < block_group->key.objectid) {
-			if (key.objectid + key.offset > first_free)
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_REF_KEY &&
+			    key.objectid + key.offset > first_free)
 				first_free = key.objectid + key.offset;
 			goto next;
 		}
@@ -353,7 +355,7 @@ found:
 	return found_group;
 }
 
-static u64 hash_extent_ref(u64 root_objectid, u64 root_generation,
+static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 			   u64 owner, u64 owner_offset)
 {
 	u32 high_crc = ~(u32)0;
@@ -362,53 +364,149 @@ static u64 hash_extent_ref(u64 root_objectid, u64 root_generation,
 
 	lenum = cpu_to_le64(root_objectid);
 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
-	lenum = cpu_to_le64(root_generation);
-	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(ref_generation);
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
+#if 0
 	lenum = cpu_to_le64(owner);
 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
-
 	lenum = cpu_to_le64(owner_offset);
 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
-
+#endif
 	return ((u64)high_crc << 32) | (u64)low_crc;
 }
 
-int insert_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path,
-				u64 bytenr,
-				u64 root_objectid, u64 root_generation,
-				u64 owner, u64 owner_offset)
+/* returns 1 when the ref item at disk_ref in leaf matches cpu_ref; only the
+ * first 2 * sizeof(u64) bytes are compared if cpu_ref->objectid is zero */
+static int match_extent_ref(struct extent_buffer *leaf,
+			    struct btrfs_extent_ref *disk_ref,
+			    struct btrfs_extent_ref *cpu_ref)
+{
+	unsigned long ref_start = (unsigned long)disk_ref;
+	int cmp_len = 2 * sizeof(u64);
+
+	if (cpu_ref->objectid)
+		cmp_len = sizeof(*cpu_ref);
+
+	return memcmp_extent_buffer(leaf, cpu_ref, ref_start,
+				    cmp_len) == 0;
+}
+
+/*
+ * find the backref item for bytenr that match_extent_ref() accepts for the
+ * given root, generation and owner.  returns 0 with path left on the item,
+ * < 0 if the search fails and some other non-zero value if nothing matches.
+ * del is set by callers that delete the item; it is fed to the tree search.
+ */
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, u64 bytenr,
+				 u64 root_objectid, u64 ref_generation,
+				 u64 owner, u64 owner_offset, int del)
 {
 	u64 hash;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	struct btrfs_extent_ref ref;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *disk_ref;
+	int ret;
+
+	btrfs_set_stack_ref_root(&ref, root_objectid);
+	btrfs_set_stack_ref_generation(&ref, ref_generation);
+	btrfs_set_stack_ref_objectid(&ref, owner);
+	btrfs_set_stack_ref_offset(&ref, owner_offset);
+	hash = hash_extent_ref(root_objectid, ref_generation, owner,
+			       owner_offset);
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = hash;
+
+	for (;;) {
+		ret = btrfs_search_slot(trans, root, &key, path,
+					del ? -1 : 0, del);
+		if (ret < 0)
+			break;
+		leaf = path->nodes[0];
+		if (ret > 0) {
+			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+				if (btrfs_next_leaf(root, path))
+					break;
+				leaf = path->nodes[0];
+			}
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+			if (found_key.objectid != bytenr ||
+			    found_key.type != BTRFS_EXTENT_REF_KEY)
+				break;
+			key.offset = found_key.offset;
+			if (del) {
+				btrfs_release_path(root, path);
+				continue;
+			}
+		}
+		disk_ref = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_extent_ref);
+		if (match_extent_ref(leaf, disk_ref, &ref)) {
+			ret = 0;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		key.offset = found_key.offset + 1;
+		btrfs_release_path(root, path);
+	}
+	return ret;
+}
+
+/* insert a backref item for bytenr, bumping the key offset past slots that
+ * hold a different ref; an identical ref yields -EEXIST.  releases path. */
+int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, u64 bytenr,
+				 u64 root_objectid, u64 ref_generation,
+				 u64 owner, u64 owner_offset)
+{
+	u64 hash;
+	struct btrfs_key key;
+	struct btrfs_extent_ref ref, *item_ref;
 	int ret;
 
 	btrfs_set_stack_ref_root(&ref, root_objectid);
-	btrfs_set_stack_ref_generation(&ref, root_generation);
+	btrfs_set_stack_ref_generation(&ref, ref_generation);
 	btrfs_set_stack_ref_objectid(&ref, owner);
 	btrfs_set_stack_ref_offset(&ref, owner_offset);
 
-	ret = btrfs_name_hash(&ref, sizeof(ref), &hash);
+	hash = hash_extent_ref(root_objectid, ref_generation, owner,
+			       owner_offset);
 	key.offset = hash;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_REF_KEY;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
 	while (ret == -EEXIST) {
-
+		item_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					  struct btrfs_extent_ref);
+		if (match_extent_ref(path->nodes[0], item_ref, &ref))
+			break;
+		key.offset++;
+		btrfs_release_path(root, path);
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      sizeof(ref));
 	}
-
+	if (ret == 0) {
+		item_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					  struct btrfs_extent_ref);
+		write_extent_buffer(path->nodes[0], &ref,
+				    (unsigned long)item_ref, sizeof(ref));
+		btrfs_mark_buffer_dirty(path->nodes[0]);
+	}
+	btrfs_release_path(root, path);
+	return ret;
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 root_generation,
+				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset)
 {
 	struct btrfs_path *path;
@@ -441,6 +539,11 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
 	btrfs_release_path(root->fs_info->extent_root, path);
+
+	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
+					  path, bytenr, root_objectid,
+					  ref_generation, owner, owner_offset);
+	BUG_ON(ret);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
 
@@ -489,10 +592,16 @@ out:
 }
 
+/*
+ * add a reference on the current root node of 'root' on behalf of
+ * owner_objectid, stamped with the running transaction's id.  The owner
+ * and offset of the backref are left at 0.
+ */
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root)
+		       struct btrfs_root *root, u64 owner_objectid)
 {
 	return btrfs_inc_extent_ref(trans, root, root->node->start,
-				    root->node->len);
+				    root->node->len, owner_objectid,
+				    trans->transid, 0, 0);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -506,7 +628,6 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret;
 	int faili;
-	int err;
 
 	if (!root->ref_cows)
 		return 0;
@@ -528,7 +649,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (disk_bytenr == 0)
 				continue;
 			ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf, fi));
+				    btrfs_file_extent_disk_num_bytes(buf, fi),
+				    root->root_key.objectid, trans->transid,
+				    key.objectid, key.offset);
 			if (ret) {
 				faili = i;
 				goto fail;
@@ -536,7 +659,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			ret = btrfs_inc_extent_ref(trans, root, bytenr,
-					   btrfs_level_size(root, level - 1));
+					   btrfs_level_size(root, level - 1),
+					   root->root_key.objectid,
+					   trans->transid, 0, 0);
 			if (ret) {
 				faili = i;
 				goto fail;
@@ -546,6 +671,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 fail:
 	WARN_ON(1);
+#if 0
 	for (i =0; i < faili; i++) {
 		if (level == 0) {
 			u64 disk_bytenr;
@@ -571,6 +697,7 @@ fail:
 			BUG_ON(err);
 		}
 	}
+#endif
 	return ret;
 }
 
@@ -809,18 +936,18 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root)
 {
+	u64 start;
+	u64 end;
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_path *path;
 	struct btrfs_key ins;
 	struct btrfs_extent_item extent_item;
 	int ret;
 	int err = 0;
-	u64 start;
-	u64 end;
-	struct btrfs_fs_info *info = extent_root->fs_info;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
-	btrfs_set_stack_extent_owner(&extent_item,
-				     extent_root->root_key.objectid);
+	path = btrfs_alloc_path();
 
 	while(1) {
 		ret = find_first_extent_bit(&info->extent_ins, 0, &start,
@@ -834,7 +961,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 					&extent_item, sizeof(extent_item));
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
+		err = btrfs_insert_extent_backref(trans, extent_root, path,
+					  start, extent_root->root_key.objectid,
+					  0, 0, 0);
+		BUG_ON(err);
 	}
+	btrfs_free_path(path);
 	return 0;
 }
 
@@ -871,7 +1003,9 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
  * remove an extent from the root, returns 0 on success
  */
 static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 bytenr, u64 num_bytes, int pin,
+			 *root, u64 bytenr, u64 num_bytes,
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner_objectid, u64 owner_offset, int pin,
 			 int mark_free)
 {
 	struct btrfs_path *path;
@@ -891,6 +1025,24 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
+	if (ref_generation && owner_objectid == 0 && root_objectid == 3) {
+//printk("drop backref root %Lu gen %Lu byte %Lu\n", root_objectid, ref_generation, bytenr );
+	}
+	ret = lookup_extent_backref(trans, extent_root, path,
+				    bytenr, root_objectid,
+				    ref_generation,
+				    owner_objectid, owner_offset, 1);
+	if (ret == 0) {
+		ret = btrfs_del_item(trans, extent_root, path);
+	} else {
+		btrfs_print_leaf(extent_root, path->nodes[0]);
+		WARN_ON(1);
+		printk("Unable to find ref byte nr %Lu root %Lu "
+		       " gen %Lu owner %Lu offset %Lu\n", bytenr,
+		       root_objectid, ref_generation, owner_objectid,
+		       owner_offset);
+	}
+	btrfs_release_path(extent_root, path);
 	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
 	if (ret < 0)
 		return ret;
@@ -965,7 +1117,9 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
 		ret = __free_extent(trans, extent_root,
-				     start, end + 1 - start, 0, 0);
+				     start, end + 1 - start,
+				     extent_root->root_key.objectid,
+				     0, 0, 0, 0, 0);
 		if (ret)
 			err = ret;
 	}
@@ -976,18 +1130,25 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
  * remove an extent from the root, returns 0 on success
  */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes, int pin)
+		      *root, u64 bytenr, u64 num_bytes,
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, u64 owner_offset, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
 	int ret;
 
 	WARN_ON(num_bytes < root->sectorsize);
+	if (!root->ref_cows)
+		ref_generation = 0;
+
 	if (root == extent_root) {
 		pin_down_bytes(root, bytenr, num_bytes, 1);
 		return 0;
 	}
-	ret = __free_extent(trans, root, bytenr, num_bytes, pin, pin == 0);
+	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
+			    ref_generation, owner_objectid, owner_offset,
+			    pin, pin == 0);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
@@ -1080,23 +1241,26 @@ check_failed:
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 	/*
-	 * a rare case, go back one key if we hit a block group item
-	 * instead of an extent item
+	 * walk backwards to find the first extent item key
 	 */
-	if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY &&
-	    key.objectid + key.offset >= search_start) {
-		ins->objectid = key.objectid;
-		ins->offset = key.offset - 1;
-		btrfs_release_path(root, path);
-		ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
-		if (ret < 0)
-			goto error;
-
-		if (path->slots[0] > 0) {
+	while(btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
+		if (path->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0) {
+				ret = btrfs_search_slot(trans, root, ins,
+							path, 0, 0);
+				if (ret < 0)
+					goto error;
+				if (path->slots[0] > 0)
+					path->slots[0]--;
+				break;
+			}
+		} else {
 			path->slots[0]--;
 		}
+		l = path->nodes[0];
+		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 	}
-
 	while (1) {
 		l = path->nodes[0];
 		slot = path->slots[0];
@@ -1146,7 +1310,8 @@ check_failed:
 			}
 		}
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
-			if (!start_found) {
+			if (!start_found && btrfs_key_type(&key) ==
+			    BTRFS_BLOCK_GROUP_ITEM_KEY) {
 				last_byte = key.objectid;
 				start_found = 1;
 			}
@@ -1244,8 +1409,10 @@ error:
  * returns 0 if everything worked, non-zero otherwise.
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner,
-		       u64 num_bytes, u64 empty_size, u64 hint_byte,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 owner_offset,
+		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, int data)
 {
 	int ret;
@@ -1255,9 +1422,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
+	struct btrfs_path *path;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-	btrfs_set_stack_extent_owner(&extent_item, owner);
 
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -1296,8 +1463,16 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 	trans->alloc_exclude_start = 0;
 	trans->alloc_exclude_nr = 0;
+	BUG_ON(ret);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_insert_extent_backref(trans, extent_root, path,
+					  ins->objectid, root_objectid,
+					  ref_generation, owner, owner_offset);
 
 	BUG_ON(ret);
+	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 
@@ -1321,15 +1496,43 @@ update_block:
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
-					     u32 blocksize, u64 hint,
+					     u32 blocksize,
+					     u64 root_objectid, u64 hint,
+					     u64 empty_size)
+{
+	/*
+	 * the generation recorded in the backref is the running
+	 * transaction for trees with ref_cows set and 0 for every
+	 * other tree
+	 */
+	u64 ref_generation = root->ref_cows ? trans->transid : 0;
+
+	/* callers of this wrapper supply no first objectid and no level */
+	return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
+					ref_generation, 0, 0, hint, empty_size);
+}
+
+/*
+ * helper function to allocate a block for a given tree
+ * returns the tree buffer or an ERR_PTR() value on failure.
+ */
+struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     u32 blocksize,
+					     u64 root_objectid,
+					     u64 ref_generation,
+					     u64 first_objectid,
+					     int level,
+					     u64 hint,
 					     u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, root->root_key.objectid,
-				 blocksize, empty_size, hint,
+	ret = btrfs_alloc_extent(trans, root, blocksize,
+				 root_objectid, ref_generation,
+				 first_objectid, level, empty_size, hint,
 				 (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
@@ -1337,7 +1540,13 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	}
 	buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
 	if (!buf) {
-		btrfs_free_extent(trans, root, ins.objectid, blocksize, 0);
+		/*
+		 * drop the extent with the same root and generation that
+		 * btrfs_alloc_extent() recorded in its backref above
+		 */
+		btrfs_free_extent(trans, root, ins.objectid, blocksize,
+				  root_objectid, ref_generation,
+				  0, 0, 0);
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_buffer_uptodate(buf);
@@ -1355,6 +1560,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root, struct extent_buffer *leaf)
 {
+	u64 leaf_owner;
+	u64 leaf_generation;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
@@ -1363,6 +1570,9 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
+	leaf_owner = btrfs_header_owner(leaf);
+	leaf_generation = btrfs_header_generation(leaf);
+
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 
@@ -1381,7 +1591,9 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 		ret = btrfs_free_extent(trans, root, disk_bytenr,
-				btrfs_file_extent_disk_num_bytes(leaf, fi), 0);
+				btrfs_file_extent_disk_num_bytes(leaf, fi),
+				leaf_owner, leaf_generation,
+				key.objectid, key.offset, 0);
 		BUG_ON(ret);
 	}
 	return 0;
@@ -1423,9 +1635,12 @@ static void reada_walk_down(struct btrfs_root *root,
 static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, struct btrfs_path *path, int *level)
 {
+	u64 root_owner;
+	u64 root_gen;
+	u64 bytenr;
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
-	u64 bytenr;
+	struct extent_buffer *parent;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -1466,9 +1681,13 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
+			parent = path->nodes[*level];
+			root_owner = btrfs_header_owner(parent);
+			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 			ret = btrfs_free_extent(trans, root, bytenr,
-						blocksize, 1);
+						blocksize, root_owner,
+						root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			continue;
 		}
@@ -1484,10 +1703,16 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 						blocksize, &refs);
 			BUG_ON(ret);
 			if (refs != 1) {
+				parent = path->nodes[*level];
+				root_owner = btrfs_header_owner(parent);
+				root_gen = btrfs_header_generation(parent);
+
 				path->slots[*level]++;
 				free_extent_buffer(next);
-				ret = btrfs_free_extent(trans, root,
-							bytenr, blocksize, 1);
+				ret = btrfs_free_extent(trans, root, bytenr,
+							blocksize,
+							root_owner,
+							root_gen, 0, 0, 1);
 				BUG_ON(ret);
 				continue;
 			}
@@ -1502,8 +1727,19 @@ static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 out:
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	if (path->nodes[*level] == root->node) {
+		root_owner = root->root_key.objectid;
+		parent = path->nodes[*level];
+	} else {
+		parent = path->nodes[*level + 1];
+		root_owner = btrfs_header_owner(parent);
+	}
+
+	root_gen = btrfs_header_generation(parent);
 	ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len, 1);
+				path->nodes[*level]->len,
+				root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -1519,10 +1755,12 @@ out:
 static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root, struct btrfs_path *path, int *level)
 {
+	u64 root_owner;
+	u64 root_gen;
+	struct btrfs_root_item *root_item = &root->root_item;
 	int i;
 	int slot;
 	int ret;
-	struct btrfs_root_item *root_item = &root->root_item;
 
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
@@ -1539,9 +1777,20 @@ static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
 			root_item->drop_level = i;
 			return 0;
 		} else {
+			if (path->nodes[*level] == root->node) {
+				root_owner = root->root_key.objectid;
+				root_gen =
+				   btrfs_header_generation(path->nodes[*level]);
+			} else {
+				struct extent_buffer *node;
+				node = path->nodes[*level + 1];
+				root_owner = btrfs_header_owner(node);
+				root_gen = btrfs_header_generation(node);
+			}
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
-						path->nodes[*level]->len, 1);
+						path->nodes[*level]->len,
+						root_owner, root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0d63778726..5b1f90f06e0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -496,7 +496,10 @@ next_slot:
 						   sizeof(old));
 				if (disk_bytenr != 0) {
 					ret = btrfs_inc_extent_ref(trans, root,
-					         disk_bytenr, disk_num_bytes);
+					         disk_bytenr, disk_num_bytes,
+						 root->root_key.objectid,
+						 trans->transid,
+						 key.objectid, end);
 					BUG_ON(ret);
 				}
 			}
@@ -541,6 +544,14 @@ next_slot:
 			u64 disk_bytenr = 0;
 			u64 disk_num_bytes = 0;
 			u64 extent_num_bytes = 0;
+			u64 root_gen;
+
+			if (leaf != root->node) {
+				root_gen =
+					btrfs_header_generation(path->nodes[1]);
+			} else {
+				root_gen = btrfs_header_generation(leaf);
+			}
 			if (found_extent) {
 				disk_bytenr =
 				      btrfs_file_extent_disk_bytenr(leaf,
@@ -562,8 +573,11 @@ next_slot:
 			if (found_extent && disk_bytenr != 0) {
 				inode->i_blocks -= extent_num_bytes >> 9;
 				ret = btrfs_free_extent(trans, root,
-							disk_bytenr,
-							disk_num_bytes, 0);
+						disk_bytenr,
+						disk_num_bytes,
+						root->root_key.objectid,
+						root_gen, inode->i_ino,
+						key.offset, 0);
 			}
 
 			BUG_ON(ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d0af0807d9f..bb70db0c9df 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -93,7 +93,9 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
-	ret = btrfs_alloc_extent(trans, root, inode->i_ino, num_bytes, 0,
+	ret = btrfs_alloc_extent(trans, root, num_bytes,
+				 root->root_key.objectid, trans->transid,
+				 inode->i_ino, start, 0,
 				 alloc_hint, (u64)-1, &ins, 1);
 	if (ret) {
 		WARN_ON(1);
@@ -560,6 +562,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	u64 extent_start = 0;
 	u64 extent_num_bytes = 0;
 	u64 item_end = 0;
+	u64 root_gen = 0;
 	int found_extent;
 	int del_item;
 	int extent_type = -1;
@@ -670,6 +673,15 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 					found_extent = 1;
 					inode->i_blocks -= num_dec;
 				}
+				if (leaf == root->node) {
+					root_gen =
+						btrfs_header_generation(leaf);
+				} else {
+					struct extent_buffer *parent;
+					parent = path->nodes[1];
+					root_gen =
+						btrfs_header_generation(parent);
+				}
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE &&
 			   !del_item) {
@@ -690,7 +702,10 @@ delete:
 		btrfs_release_path(root, path);
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes, 0);
+						extent_num_bytes,
+						root->root_key.objectid,
+						root_gen, inode->i_ino,
+						found_key.offset, 0);
 			BUG_ON(ret);
 		}
 	}
@@ -1900,7 +1915,14 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 0);
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		goto fail;
+
+	leaf = __btrfs_alloc_free_block(trans, root, root->leafsize,
+					objectid, trans->transid, 0, 0,
+					0, 0);
 	if (IS_ERR(leaf))
 		return PTR_ERR(leaf);
 
@@ -1908,7 +1930,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	btrfs_set_header_level(leaf, 0);
 	btrfs_set_header_bytenr(leaf, leaf->start);
 	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_owner(leaf, root->root_key.objectid);
+	btrfs_set_header_owner(leaf, objectid);
+
 	write_extent_buffer(leaf, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
@@ -1933,11 +1956,6 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	free_extent_buffer(leaf);
 	leaf = NULL;
 
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
-
 	btrfs_set_root_dirid(&root_item, new_dirid);
 
 	key.objectid = objectid;
@@ -2056,7 +2074,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail;
 
-	ret = btrfs_inc_root_ref(trans, root);
+	ret = btrfs_inc_root_ref(trans, root, objectid);
 	if (ret)
 		goto fail;
 fail:
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 030324febf6..da0b4dcf361 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -33,6 +33,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	struct btrfs_extent_ref *ref;
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
@@ -73,6 +74,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			printk("\t\textent data refs %u\n",
 				btrfs_extent_refs(l, ei));
 			break;
+		case BTRFS_EXTENT_REF_KEY:
+			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
+			printk("\t\textent back ref root %llu gen %llu "
+			       "owner %llu offset %llu\n",
+			       (unsigned long long)btrfs_ref_root(l, ref),
+			       (unsigned long long)btrfs_ref_generation(l, ref),
+			       (unsigned long long)btrfs_ref_objectid(l, ref),
+			       (unsigned long long)btrfs_ref_offset(l, ref));
+			break;
 
 		case BTRFS_EXTENT_DATA_KEY:
 			fi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3994795edfe..5c58630dce0 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -78,6 +78,8 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 			break;
 
 		if (*level == 1) {
+			WARN_ON(btrfs_header_generation(path->nodes[*level]) !=
+							trans->transid);
 			ret = btrfs_realloc_node(trans, root,
 						 path->nodes[*level],
 						 path->slots[*level],
-- 
cgit v1.2.3


From d8d5f3e16d1ae4fe9b93312e083f2c04a95520f0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Dec 2007 12:42:00 -0500
Subject: Btrfs: Add lowest key information to back refs for extent tree blocks
 as well.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/file.c        |  11 ++----
 fs/btrfs/inode.c       |  14 ++-----
 4 files changed, 110 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0ac21e3aac8..60a30da6af0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -210,7 +210,7 @@ static int btree_writepages(struct address_space *mapping,
 {
 	struct extent_map_tree *tree;
 	tree = &BTRFS_I(mapping->host)->extent_tree;
-	if (0 && wbc->sync_mode == WB_SYNC_NONE) {
+	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
 		unsigned long thresh = 96 * 1024 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32991f73e9d..187be401247 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -457,6 +457,94 @@ out:
 	return ret;
 }
 
+/*
+ * Back reference rules.  Back refs have three main goals:
+ *
+ * 1) Differentiate between all holders of references to an extent so that
+ *    when a reference is dropped we can make sure it was a valid reference
+ *    before freeing the extent.
+ *
+ * 2) Provide enough information to quickly find the holders of an extent
+ *    if we notice a given block is corrupted or bad.
+ *
+ * 3) Make it easy to migrate blocks for FS shrinking or storage pool
+ *    maintenance.  This is actually the same as #2, but with a slightly
+ *    different use case.
+ *
+ * File extents can be referenced by:
+ *
+ * - multiple snapshots, subvolumes, or different generations in one subvol
+ * - different files inside a single subvolume (in theory, not implemented yet)
+ * - different offsets inside a file (bookend extents in file.c)
+ *
+ * The extent ref structure has fields for:
+ *
+ * - Objectid of the subvolume root
+ * - Generation number of the tree holding the reference
+ * - objectid of the file holding the reference
+ * - offset in the file corresponding to the key holding the reference
+ *
+ * When a file extent is allocated the fields are filled in:
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *
+ * When a leaf is cow'd new references are added for every file extent found
+ * in the leaf.  It looks the same as the create case, but trans->transid
+ * will be different when the block is cow'd.
+ *
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *
+ * When a file extent is removed either during snapshot deletion or file
+ * truncation, the corresponding back reference is found
+ * by searching for:
+ *
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
+ *      inode objectid, offset in file)
+ *
+ * Btree extents can be referenced by:
+ *
+ * - Different subvolumes
+ * - Different generations of the same subvolume
+ *
+ * Storing sufficient information for a full reverse mapping of a btree
+ * block would require storing the lowest key of the block in the backref,
+ * and it would require updating that lowest key either before write out or
+ * every time it changed.  Instead, the objectid of the lowest key is stored
+ * along with the level of the tree block.  This provides a hint
+ * about where in the btree the block can be found.  Searches through the
+ * btree only need to look for a pointer to that block, so they stop one
+ * level higher than the level recorded in the backref.
+ *
+ * Some btrees do not do reference counting on their extents.  These
+ * include the extent tree and the tree of tree roots.  Backrefs for these
+ * trees always have a generation of zero.
+ *
+ * When a tree block is created, back references are inserted:
+ *
+ * (root->root_key.objectid, trans->transid or zero, lowest_key_objectid, level)
+ *
+ * When a tree block is cow'd in a reference counted root,
+ * new back references are added for all the blocks it points to.
+ * These are of the form (trans->transid will have increased since creation):
+ *
+ * (root->root_key.objectid, trans->transid, lowest_key_objectid, level)
+ *
+ * Because the lowest_key_objectid and the level are just hints
+ * they are not used when backrefs are deleted.  When a backref is deleted:
+ *
+ * if backref was for a tree root:
+ *     root_objectid = root->root_key.objectid
+ * else
+ *     root_objectid = btrfs_header_owner(parent)
+ *
+ * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
+ *
+ * Back Reference Key hashing:
+ *
+ * Back references have four fields, each 64 bits long.  Unfortunately,
+ * they are hashed into a single 64 bit number and placed into the key offset.
+ * The key objectid corresponds to the first byte in the extent, and the
+ * key type is set to BTRFS_EXTENT_REF_KEY.
+ */
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path, u64 bytenr,
@@ -939,10 +1027,13 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 	u64 start;
 	u64 end;
 	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct extent_buffer *eb;
 	struct btrfs_path *path;
 	struct btrfs_key ins;
+	struct btrfs_disk_key first;
 	struct btrfs_extent_item extent_item;
 	int ret;
+	int level;
 	int err = 0;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
@@ -961,10 +1052,19 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 					&extent_item, sizeof(extent_item));
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
+		eb = read_tree_block(extent_root, ins.objectid, ins.offset);
+		level = btrfs_header_level(eb);
+		if (level == 0) {
+			btrfs_item_key(eb, &first, 0);
+		} else {
+			btrfs_node_key(eb, &first, 0);
+		}
 		err = btrfs_insert_extent_backref(trans, extent_root, path,
 					  start, extent_root->root_key.objectid,
-					  0, 0, 0);
+					  0, btrfs_disk_key_objectid(&first),
+					  level);
 		BUG_ON(err);
+		free_extent_buffer(eb);
 	}
 	btrfs_free_path(path);
 	return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5b1f90f06e0..1cc4d285951 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -545,13 +545,10 @@ next_slot:
 			u64 disk_num_bytes = 0;
 			u64 extent_num_bytes = 0;
 			u64 root_gen;
+			u64 root_owner;
 
-			if (leaf != root->node) {
-				root_gen =
-					btrfs_header_generation(path->nodes[1]);
-			} else {
-				root_gen = btrfs_header_generation(leaf);
-			}
+			root_gen = btrfs_header_generation(leaf);
+			root_owner = btrfs_header_owner(leaf);
 			if (found_extent) {
 				disk_bytenr =
 				      btrfs_file_extent_disk_bytenr(leaf,
@@ -575,7 +572,7 @@ next_slot:
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr,
 						disk_num_bytes,
-						root->root_key.objectid,
+						root_owner,
 						root_gen, inode->i_ino,
 						key.offset, 0);
 			}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb70db0c9df..03fea037667 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -563,6 +563,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 item_end = 0;
 	u64 root_gen = 0;
+	u64 root_owner = 0;
 	int found_extent;
 	int del_item;
 	int extent_type = -1;
@@ -673,15 +674,8 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 					found_extent = 1;
 					inode->i_blocks -= num_dec;
 				}
-				if (leaf == root->node) {
-					root_gen =
-						btrfs_header_generation(leaf);
-				} else {
-					struct extent_buffer *parent;
-					parent = path->nodes[1];
-					root_gen =
-						btrfs_header_generation(parent);
-				}
+				root_gen = btrfs_header_generation(leaf);
+				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE &&
 			   !del_item) {
@@ -703,7 +697,7 @@ delete:
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
-						root->root_key.objectid,
+						root_owner,
 						root_gen, inode->i_ino,
 						found_key.offset, 0);
 			BUG_ON(ret);
-- 
cgit v1.2.3


From 3954401fa6013bb2f2c8758b903e9bffcf25b64b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 12 Dec 2007 14:38:19 -0500
Subject: Btrfs: Add back pointers from the inode to the directory that
 references it

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |  21 +++-
 fs/btrfs/inode-item.c | 136 ++++++++++++++++++++++++++
 fs/btrfs/inode.c      | 258 +++++++++++++++++++++++---------------------------
 3 files changed, 272 insertions(+), 143 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cb1b156d954..035fa155034 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -53,7 +53,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_CSUM_SIZE 32
 /* four bytes for CRC32 */
 #define BTRFS_CRC32_SIZE 4
-#define BTRFS_EMPTY_DIR_SIZE 6
+#define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
 #define BTRFS_FT_REG_FILE	1
@@ -206,6 +206,11 @@ struct btrfs_extent_ref {
 	__le64 offset;
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_ref {
+	__le16 name_len;
+	/* name goes here */
+} __attribute__ ((__packed__));
+
 struct btrfs_inode_timespec {
 	__le64 sec;
 	__le32 nsec;
@@ -379,7 +384,8 @@ struct btrfs_root {
  * the FS
  */
 #define BTRFS_INODE_ITEM_KEY		1
-#define BTRFS_XATTR_ITEM_KEY		2
+#define BTRFS_INODE_REF_KEY		2
+#define BTRFS_XATTR_ITEM_KEY		8
 /* reserve 2-15 close to the inode for later flexibility */
 
 /*
@@ -486,6 +492,9 @@ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 
+/* struct btrfs_inode_ref */
+BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
@@ -1043,6 +1052,14 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 
 /* inode-item.c */
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid);
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 35d2608f891..cba30b6cc6f 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -20,6 +20,142 @@
 #include "disk-io.h"
 #include "transaction.h"
 
+int find_name_in_backref(struct btrfs_path *path, const char * name,
+			 int name_len, struct btrfs_inode_ref **ref_ret)
+{ /* Look through the item at path->slots[0] of leaf path->nodes[0] for an entry whose name matches name/name_len; returns 1 and stores the entry in *ref_ret, or 0 if none matches. */
+	struct extent_buffer *leaf;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0; /* bytes of the item consumed so far */
+	int len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); /* start of the item's data; only handed to the leaf accessors below, never dereferenced here */
+	while (cur_offset < item_size) { /* entries are packed back to back: a struct btrfs_inode_ref followed by len name bytes */
+		ref = (struct btrfs_inode_ref *)(ptr + cur_offset);
+		len = btrfs_inode_ref_name_len(leaf, ref);
+		name_ptr = (unsigned long)(ref + 1); /* name bytes start right after the fixed-size header */
+		cur_offset += len + sizeof(*ref); /* advance first so the continue below lands on the next entry */
+		if (len != name_len) /* length mismatch: skip the byte compare */
+			continue;
+		if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) {
+			*ref_ret = ref;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid)
+{ /* Remove the entry named name/name_len from the BTRFS_INODE_REF_KEY item keyed (inode_objectid, ref_objectid). Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if the item or the name is missing, or the error from btrfs_search_slot/btrfs_del_item. */
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+	u32 sub_item_len;
+	int ret;
+	int del_len = name_len + sizeof(*ref); /* size of one entry: header plus name */
+
+	key.objectid = inode_objectid;
+	key.offset = ref_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1); /* NOTE(review): ins_len -1 / cow 1 presumably request a search-for-deletion with COW -- confirm against btrfs_search_slot */
+	if (ret > 0) { /* a positive return is treated as "no item with this exact key" */
+		ret = -ENOENT;
+		goto out;
+	} else if (ret < 0) {
+		goto out;
+	}
+	if (!find_name_in_backref(path, name, name_len, &ref)) {
+		ret = -ENOENT;
+		goto out;
+	}
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (del_len == item_size) { /* this entry is the only one in the item: drop the whole item */
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+	ptr = (unsigned long)ref;
+	sub_item_len = name_len + sizeof(*ref); /* same value as del_len */
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
+			      item_size - (ptr + sub_item_len - item_start)); /* slide the entries that follow the removed one down over it */
+	ret = btrfs_truncate_item(trans, root, path,
+				  item_size - sub_item_len, 1); /* shrink the item by the removed entry's size */
+	BUG_ON(ret);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   const char *name, int name_len,
+			   u64 inode_objectid, u64 ref_objectid)
+{ /* Record name/name_len in the BTRFS_INODE_REF_KEY item keyed (inode_objectid, ref_objectid), creating the item or appending to an existing one. Returns 0 on success, -ENOMEM if no path could be allocated, -EEXIST if the name is already recorded, or another negative error from btrfs_insert_empty_item. */
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	int ret;
+	int ins_len = name_len + sizeof(*ref); /* size of one entry: header plus name */
+
+	key.objectid = inode_objectid;
+	key.offset = ref_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) { /* an item with this key already exists; NOTE(review): relies on path pointing at that item after the failed insert -- confirm */
+		u32 old_size;
+
+		if (find_name_in_backref(path, name, name_len, &ref))
+			goto out; /* name already present: return with ret still -EEXIST */
+
+		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); /* item size before growing = offset of the new entry */
+		ret = btrfs_extend_item(trans, root, path, ins_len);
+		BUG_ON(ret);
+		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				     struct btrfs_inode_ref);
+		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); /* new entry goes at the old end of the item */
+		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		ptr = (unsigned long)(ref + 1);
+		ret = 0;
+	} else if (ret < 0) {
+		goto out;
+	} else { /* fresh item: the single entry sits at the start of the item data */
+		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				     struct btrfs_inode_ref);
+		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		ptr = (unsigned long)(ref + 1);
+	}
+	write_extent_buffer(path->nodes[0], name, ptr, name_len); /* copy the name bytes in after the header */
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03fea037667..cefe740b6c7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -404,6 +404,17 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
+	if (!S_ISLNK(dentry->d_inode->i_mode)) {
+		ret = btrfs_del_inode_ref(trans, root, name, name_len,
+					  dentry->d_inode->i_ino,
+					  dentry->d_parent->d_inode->i_ino);
+		if (ret) {
+			printk("failed to delete reference to %.*s, "
+			       "inode %lu parent %lu\n", name_len, name,
+			       dentry->d_inode->i_ino,
+			       dentry->d_parent->d_inode->i_ino);
+		}
+	}
 err:
 	btrfs_free_path(path);
 	if (!ret) {
@@ -445,75 +456,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	int err;
 	int ret;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_key key;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_key found_key;
-	int found_type;
-	struct extent_buffer *leaf;
-	char *goodnames = "..";
 	unsigned long nr;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-
 	btrfs_set_trans_block_group(trans, dir);
-	key.objectid = inode->i_ino;
-	key.offset = (u64)-1;
-	key.type = (u8)-1;
-	while(1) {
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0) {
-			err = ret;
-			goto out;
-		}
-		BUG_ON(ret == 0);
-		if (path->slots[0] == 0) {
-			err = -ENOENT;
-			goto out;
-		}
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		found_type = btrfs_key_type(&found_key);
-		if (found_key.objectid != inode->i_ino) {
-			err = -ENOENT;
-			goto out;
-		}
-		if ((found_type != BTRFS_DIR_ITEM_KEY &&
-		     found_type != BTRFS_DIR_INDEX_KEY) ||
-	            (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
-	            !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
-			err = -ENOTEMPTY;
-			goto out;
-		}
-		ret = btrfs_del_item(trans, root, path);
-		BUG_ON(ret);
-
-		if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
-			break;
-		btrfs_release_path(root, path);
-	}
-	ret = 0;
-	btrfs_release_path(root, path);
 
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
 	if (!err) {
 		inode->i_size = 0;
 	}
-out:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
+
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -887,21 +850,59 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret = 0;
 
+	if (namelen == 1 && strcmp(name, ".") == 0) {
+		location->objectid = dir->i_ino;
+		location->type = BTRFS_INODE_ITEM_KEY;
+		location->offset = 0;
+		return 0;
+	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+
+	if (namelen == 1 && strcmp(name, "..") == 0) {
+		struct btrfs_key key;
+		struct extent_buffer *leaf;
+		u32 nritems;
+		int slot;
+
+		key.objectid = dir->i_ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+		key.offset = 0;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		BUG_ON(ret == 0);
+		ret = 0;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (slot >= nritems)
+			goto out_err;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != dir->i_ino ||
+		    key.type != BTRFS_INODE_REF_KEY) {
+			goto out_err;
+		}
+		location->objectid = key.offset;
+		location->type = BTRFS_INODE_ITEM_KEY;
+		location->offset = 0;
+		goto out;
+	}
+
 	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (IS_ERR(di))
 		ret = PTR_ERR(di);
 	if (!di || IS_ERR(di)) {
-		location->objectid = 0;
-		goto out;
+		goto out_err;
 	}
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
+out_err:
+	location->objectid = 0;
+	goto out;
 }
 
 /*
@@ -1053,13 +1054,50 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	if (root->fs_info->tree_root == root)
 		key_type = BTRFS_DIR_ITEM_KEY;
 
+	/* special case for "." */
+	if (filp->f_pos == 0) {
+		over = filldir(dirent, ".", 1,
+			       1, inode->i_ino,
+			       DT_DIR);
+		if (over)
+			return 0;
+		filp->f_pos = 1;
+	}
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
+	path = btrfs_alloc_path();
+	path->reada = 2;
+
+	/* special case for .., just use the back ref */
+	if (filp->f_pos == 1) {
+		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+		key.offset = 0;
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		BUG_ON(ret == 0);
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (slot >= nritems) {
+			btrfs_release_path(root, path);
+			goto read_dir_items;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		btrfs_release_path(root, path);
+		if (found_key.objectid != key.objectid ||
+		    found_key.type != BTRFS_INODE_REF_KEY)
+			goto read_dir_items;
+		over = filldir(dirent, "..", 2,
+			       2, found_key.offset, DT_DIR);
+		if (over)
+			goto nopos;
+		filp->f_pos = 2;
+	}
+
+read_dir_items:
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 
-	path = btrfs_alloc_path();
-	path->reada = 2;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -1255,6 +1293,13 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode));
 	if (ret == 0) {
+		if (!S_ISLNK(inode->i_mode)) {
+			ret = btrfs_insert_inode_ref(trans, root,
+					     dentry->d_name.name,
+					     dentry->d_name.len,
+					     inode->i_ino,
+					     dentry->d_parent->d_inode->i_ino);
+		}
 		parent_inode = dentry->d_parent->d_inode;
 		parent_inode->i_size += dentry->d_name.len * 2;
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -1430,35 +1475,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	return err;
 }
 
-static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 objectid, u64 dirid)
-{
-	int ret;
-	char buf[2];
-	struct btrfs_key key;
-
-	buf[0] = '.';
-	buf[1] = '.';
-
-	key.objectid = objectid;
-	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-
-	ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
-				    &key, BTRFS_FT_DIR);
-	if (ret)
-		goto error;
-
-	key.objectid = dirid;
-	ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
-				    &key, BTRFS_FT_DIR);
-	if (ret)
-		goto error;
-error:
-	return ret;
-}
-
 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	struct inode *inode;
@@ -1496,11 +1512,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode->i_fop = &btrfs_dir_file_operations;
 	btrfs_set_trans_block_group(trans, inode);
 
-	err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
-	if (err)
-		goto out_fail;
-
-	inode->i_size = 6;
+	inode->i_size = 0;
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		goto out_fail;
@@ -1971,6 +1983,12 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail;
 
+	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
+			     name, namelen, objectid,
+			     root->fs_info->sb->s_root->d_inode->i_ino);
+	if (ret)
+		goto fail;
+
 	ret = btrfs_commit_transaction(trans, root);
 	if (ret)
 		goto fail_commit;
@@ -1989,12 +2007,10 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	inode->i_fop = &btrfs_dir_file_operations;
 	new_root->inode = inode;
 
-	ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
-	if (ret)
-		goto fail;
-
+	ret = btrfs_insert_inode_ref(trans, new_root, "..", 2, new_dirid,
+				     new_dirid);
 	inode->i_nlink = 1;
-	inode->i_size = 6;
+	inode->i_size = 0;
 	ret = btrfs_update_inode(trans, new_root, inode);
 	if (ret)
 		goto fail;
@@ -2068,6 +2084,13 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail;
 
+	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
+			     name, namelen, objectid,
+			     root->fs_info->sb->s_root->d_inode->i_ino);
+
+	if (ret)
+		goto fail;
+
 	ret = btrfs_inc_root_ref(trans, root, objectid);
 	if (ret)
 		goto fail;
@@ -2338,7 +2361,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
 	struct btrfs_path *path;
-	struct btrfs_dir_item *di;
 	int ret;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -2361,52 +2383,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
 
-	if (S_ISDIR(old_inode->i_mode) && old_dir != new_dir) {
-		struct btrfs_key *location = &BTRFS_I(new_dir)->location;
-		struct btrfs_key old_parent_key;
-		di = btrfs_lookup_dir_item(trans, root, path, old_inode->i_ino,
-					   "..", 2, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out_fail;
-		}
-		if (!di) {
-			ret = -ENOENT;
-			goto out_fail;
-		}
-		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &old_parent_key);
-		ret = btrfs_del_item(trans, root, path);
-		if (ret) {
-			goto out_fail;
-		}
-		btrfs_release_path(root, path);
-
-		di = btrfs_lookup_dir_index_item(trans, root, path,
-						 old_inode->i_ino,
-						 old_parent_key.objectid,
-						 "..", 2, -1);
-		if (IS_ERR(di)) {
-			ret = PTR_ERR(di);
-			goto out_fail;
-		}
-		if (!di) {
-			ret = -ENOENT;
-			goto out_fail;
-		}
-		ret = btrfs_del_item(trans, root, path);
-		if (ret) {
-			goto out_fail;
-		}
-		btrfs_release_path(root, path);
-
-		ret = btrfs_insert_dir_item(trans, root, "..", 2,
-					    old_inode->i_ino, location,
-					    BTRFS_FT_DIR);
-		if (ret)
-			goto out_fail;
-	}
-
-
 	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
 	if (ret)
 		goto out_fail;
-- 
cgit v1.2.3


From 70b043f0c7d7c78d8a0a01e65067c7c8fe338485 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Dec 2007 09:02:46 -0500
Subject: Btrfs: Extra NULL block group checks in find_free_extent

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 187be401247..7c953b2ecf1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1319,6 +1319,12 @@ static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	total_needed += empty_size;
 	path = btrfs_alloc_path();
 check_failed:
+	if (!block_group) {
+		block_group = btrfs_lookup_block_group(info, search_start);
+		if (!block_group)
+			block_group = btrfs_lookup_block_group(info,
+						       orig_search_start);
+	}
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data, full_scan);
 	search_start = stripe_align(root, search_start);
-- 
cgit v1.2.3


From 76fea00a0522185658b733d441d6ec049344456b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Dec 2007 09:06:01 -0500
Subject: Btrfs: Add backrefs for symbolic link inodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cefe740b6c7..06d92be1680 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -404,16 +404,14 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
-	if (!S_ISLNK(dentry->d_inode->i_mode)) {
-		ret = btrfs_del_inode_ref(trans, root, name, name_len,
-					  dentry->d_inode->i_ino,
-					  dentry->d_parent->d_inode->i_ino);
-		if (ret) {
-			printk("failed to delete reference to %.*s, "
-			       "inode %lu parent %lu\n", name_len, name,
-			       dentry->d_inode->i_ino,
-			       dentry->d_parent->d_inode->i_ino);
-		}
+	ret = btrfs_del_inode_ref(trans, root, name, name_len,
+				  dentry->d_inode->i_ino,
+				  dentry->d_parent->d_inode->i_ino);
+	if (ret) {
+		printk("failed to delete reference to %.*s, "
+		       "inode %lu parent %lu\n", name_len, name,
+		       dentry->d_inode->i_ino,
+		       dentry->d_parent->d_inode->i_ino);
 	}
 err:
 	btrfs_free_path(path);
@@ -1293,13 +1291,11 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode));
 	if (ret == 0) {
-		if (!S_ISLNK(inode->i_mode)) {
-			ret = btrfs_insert_inode_ref(trans, root,
-					     dentry->d_name.name,
-					     dentry->d_name.len,
-					     inode->i_ino,
-					     dentry->d_parent->d_inode->i_ino);
-		}
+		ret = btrfs_insert_inode_ref(trans, root,
+				     dentry->d_name.name,
+				     dentry->d_name.len,
+				     inode->i_ino,
+				     dentry->d_parent->d_inode->i_ino);
 		parent_inode = dentry->d_parent->d_inode;
 		parent_inode->i_size += dentry->d_name.len * 2;
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-- 
cgit v1.2.3


From 7a7205367dfd9adf129f0ebc7d040a557fe0d1ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Dec 2007 09:06:59 -0500
Subject: Btrfs: Fix typo in .. check (thanks Yan)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06d92be1680..41b0c7de7a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -857,7 +857,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	if (namelen == 1 && strcmp(name, "..") == 0) {
+	if (namelen == 2 && strcmp(name, "..") == 0) {
 		struct btrfs_key key;
 		struct extent_buffer *leaf;
 		u32 nritems;
-- 
cgit v1.2.3


From 6caab489c5866788ae64a72986520b1f0e40c706 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Dec 2007 09:48:07 -0500
Subject: Fix btrfs_inc_ref to add backref hints

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7c953b2ecf1..057cdd49c09 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -702,7 +702,7 @@ int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 	}
 	return btrfs_inc_extent_ref(trans, root, root->node->start,
 				    root->node->len, owner_objectid,
-				    generation, 0, 0);
+				    generation, key_objectid, level);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -746,10 +746,12 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
+			btrfs_node_key_to_cpu(buf, &key, i);
 			ret = btrfs_inc_extent_ref(trans, root, bytenr,
 					   btrfs_level_size(root, level - 1),
 					   root->root_key.objectid,
-					   trans->transid, 0, 0);
+					   trans->transid, key.objectid,
+					   level - 1);
 			if (ret) {
 				faili = i;
 				goto fail;
-- 
cgit v1.2.3


From f6dbff55d77dee363c22873481db54d3bada3ea6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Dec 2007 11:13:32 -0500
Subject: Btrfs: Reorder extent back refs to differentiate btree blocks from
 file data

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  4 ++--
 fs/btrfs/extent-tree.c | 16 ++++++++--------
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 035fa155034..052555ced44 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,11 +37,12 @@ extern struct kmem_cache *btrfs_path_cachep;
 
 #define BTRFS_MAGIC "_B2RfS_M"
 
+#define BTRFS_MAX_LEVEL 8
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
 #define BTRFS_FS_TREE_OBJECTID 3ULL
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
-#define BTRFS_FIRST_FREE_OBJECTID 5ULL
+#define BTRFS_FIRST_FREE_OBJECTID 256ULL
 
 /*
  * we can actually store much bigger names, but lets not confuse the rest
@@ -107,7 +108,6 @@ struct btrfs_header {
 	u8 level;
 } __attribute__ ((__packed__));
 
-#define BTRFS_MAX_LEVEL 8
 #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
 			        sizeof(struct btrfs_header)) / \
 			        sizeof(struct btrfs_key_ptr))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 057cdd49c09..00414836e96 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -520,13 +520,13 @@ out:
  *
  * When a tree block is created, back references are inserted:
  *
- * (root->root_key.objectid, trans->transid or zero, lowest_key_objectid, level)
+ * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
  *
  * When a tree block is cow'd in a reference counted root,
  * new back references are added for all the blocks it points to.
  * These are of the form (trans->transid will have increased since creation):
  *
- * (root->root_key.objectid, trans->transid, lowest_key_objectid, level)
+ * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
  *
  * Because the lowest_key_objectid and the level are just hints
  * they are not used when backrefs are deleted.  When a backref is deleted:
@@ -702,7 +702,7 @@ int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 	}
 	return btrfs_inc_extent_ref(trans, root, root->node->start,
 				    root->node->len, owner_objectid,
-				    generation, key_objectid, level);
+				    generation, level, key_objectid);
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -750,8 +750,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			ret = btrfs_inc_extent_ref(trans, root, bytenr,
 					   btrfs_level_size(root, level - 1),
 					   root->root_key.objectid,
-					   trans->transid, key.objectid,
-					   level - 1);
+					   trans->transid,
+					   level - 1, key.objectid);
 			if (ret) {
 				faili = i;
 				goto fail;
@@ -1063,8 +1063,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 		}
 		err = btrfs_insert_extent_backref(trans, extent_root, path,
 					  start, extent_root->root_key.objectid,
-					  0, btrfs_disk_key_objectid(&first),
-					  level);
+					  0, level,
+					  btrfs_disk_key_objectid(&first));
 		BUG_ON(err);
 		free_extent_buffer(eb);
 	}
@@ -1640,7 +1640,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_alloc_extent(trans, root, blocksize,
 				 root_objectid, ref_generation,
-				 first_objectid, level, empty_size, hint,
+				 level, first_objectid, empty_size, hint,
 				 (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
-- 
cgit v1.2.3


From 6e3b96668eb34da6060a5189e6a73e9d62a3b178 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 14 Dec 2007 11:14:42 -0500
Subject: Off by one fix for btrfs_drop_extents

One of my old patches introduced a new bug in
btrfs_drop_extents (changeset 275). Inline extents are not truncated
properly when "extent_end == end", which can trigger the BUG_ON at
file.c:600.  I hope I don't introduce a new bug this time.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1cc4d285951..a280b956222 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -481,8 +481,9 @@ next_slot:
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
-		if (end < extent_end && start > key.offset && found_inline) {
+		if (end <= extent_end && start >= key.offset && found_inline) {
 			*hint_byte = EXTENT_MAP_INLINE;
+			continue;
 		}
 		if (end < extent_end && end >= key.offset) {
 			if (found_extent) {
-- 
cgit v1.2.3


From e9906a98497c26a3c1a7532b51b81cc1fbdf9e1d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 14 Dec 2007 12:56:58 -0500
Subject: Fixes for loopback files in btrfs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  |  1 +
 fs/btrfs/inode.c | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a280b956222..62fcd79d8ab 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -861,6 +861,7 @@ struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
+	.splice_read	= generic_file_splice_read,
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41b0c7de7a5..e535c50e669 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1893,8 +1893,19 @@ static void btrfs_truncate(struct inode *inode)
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to)
 {
-	return extent_commit_write(&BTRFS_I(page->mapping->host)->extent_tree,
-				   page->mapping->host, page, from, to);
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+	struct inode *inode = page->mapping->host;
+
+	btrfs_cow_one_page(inode, page, PAGE_CACHE_SIZE);
+
+	set_page_extent_mapped(page);
+	set_page_dirty(page);
+
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	return 0;
 }
 
 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
-- 
cgit v1.2.3


From b6cda9bcb4df7544c67fc3548a53bc1607d59f46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 14 Dec 2007 15:30:32 -0500
Subject: Btrfs: Add mount -o nodatasum to turn off file data checksumming

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 14 ++++++++------
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/inode.c   | 11 ++++++++++-
 fs/btrfs/super.c   | 17 +++++++++++++++--
 4 files changed, 34 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 052555ced44..5e255cabfd1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -321,6 +321,7 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	unsigned long mount_opt;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
@@ -429,6 +430,13 @@ struct btrfs_root {
  */
 #define BTRFS_STRING_ITEM_KEY	253
 
+#define BTRFS_MOUNT_NODATASUM		0x1
+
+#define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
+#define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
+					 BTRFS_MOUNT_##opt)
+
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -906,12 +914,6 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
 	((unsigned long)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset_nr(leaf, slot)))
 
-/* mount option defines and helpers */
-#define BTRFS_MOUNT_SUBVOL		0x000001
-#define btrfs_clear_opt(o, opt)		o &= ~BTRFS_MOUNT_##opt
-#define btrfs_set_opt(o, opt)		o |= BTRFS_MOUNT_##opt
-#define btrfs_test_opt(sb, opt)		(BTRFS_SB(sb)->s_mount_opt & \
-					 BTRFS_MOUNT_##opt)
 /* extent-tree.c */
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 60a30da6af0..4338b00c2a2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -568,6 +568,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
+	fs_info->mount_opt = 0;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e535c50e669..11885cb114e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,10 +116,13 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	char *kaddr;
-	int ret;
+	int ret = 0;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	size_t offset = start - page_start;
 
+	if (btrfs_test_opt(root, NODATASUM))
+		return 0;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
@@ -143,6 +146,9 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_path *path = NULL;
 	u32 csum;
 
+	if (btrfs_test_opt(root, NODATASUM))
+		return 0;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
@@ -176,6 +182,9 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	u32 csum = ~(u32)0;
 	unsigned long flags;
 
+	if (btrfs_test_opt(root, NODATASUM))
+		return 0;
+
 	ret = get_state_private(em_tree, start, &private);
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ffa02872c4d..ad4f280ec77 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,11 +61,12 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_err,
+	Opt_subvol, Opt_nodatasum, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_subvol, "subvol=%s"},
+	{Opt_nodatasum, "nodatasum"},
 	{Opt_err, NULL}
 };
 
@@ -74,7 +75,12 @@ static int parse_options (char * options,
 			  char **subvol_name)
 {
 	char * p;
+	struct btrfs_fs_info *info = NULL;
 	substring_t args[MAX_OPT_ARGS];
+
+	if (root)
+		info = root->fs_info;
+
 	if (!options)
 		return 1;
 
@@ -86,7 +92,12 @@ static int parse_options (char * options,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_subvol:
-			*subvol_name = match_strdup(&args[0]);
+			if (subvol_name)
+				*subvol_name = match_strdup(&args[0]);
+			break;
+		case Opt_nodatasum:
+			if (root)
+				btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		default:
 			return 0;
@@ -143,6 +154,8 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 		goto fail_close;
 	}
 
+	parse_options((char *)data, tree_root, NULL);
+
 	/* this does the super kobj at the same time */
 	err = btrfs_sysfs_add_super(tree_root->fs_info);
 	if (err)
-- 
cgit v1.2.3


From be20aa9dbadc8c06283784ee12bbc0d97dea3418 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Dec 2007 20:14:01 -0500
Subject: Btrfs: Add mount option to turn off data cow

A number of workloads do not require copy on write data or checksumming.
Use mount -o nodatasum to disable checksums, and -o nodatacow to disable
both copy on write and checksumming.

In nodatacow mode, copy on write is still performed when a given extent
is under snapshot.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  53 ++++++++++++++++++++-
 fs/btrfs/ctree.h       |   8 ++++
 fs/btrfs/extent-tree.c | 101 ++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/inode.c       | 122 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/super.c       |  34 +++++++++++---
 5 files changed, 293 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fd8233e05cf..585f279d111 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -70,7 +70,58 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
-static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+	struct extent_buffer *cow;
+	u32 nritems;
+	int ret = 0;
+	int level;
+	struct btrfs_key first_key;
+	struct btrfs_root new_root;
+
+	memcpy(&new_root, root, sizeof(new_root));
+	new_root.root_key.objectid = new_root_objectid;
+
+	WARN_ON(root->ref_cows && trans->transid !=
+		root->fs_info->running_transaction->transid);
+	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+	level = btrfs_header_level(buf);
+	nritems = btrfs_header_nritems(buf);
+	if (nritems) {
+		if (level == 0)
+			btrfs_item_key_to_cpu(buf, &first_key, 0);
+		else
+			btrfs_node_key_to_cpu(buf, &first_key, 0);
+	} else {
+		first_key.objectid = 0;
+	}
+	cow = __btrfs_alloc_free_block(trans, &new_root, buf->len,
+				       new_root_objectid,
+				       trans->transid, first_key.objectid,
+				       level, buf->start, 0);
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
+
+	copy_extent_buffer(cow, buf, 0, 0, cow->len);
+	btrfs_set_header_bytenr(cow, cow->start);
+	btrfs_set_header_generation(cow, trans->transid);
+	btrfs_set_header_owner(cow, new_root_objectid);
+
+	WARN_ON(btrfs_header_generation(buf) > trans->transid);
+	ret = btrfs_inc_ref(trans, &new_root, buf);
+	if (ret)
+		return ret;
+
+	btrfs_mark_buffer_dirty(cow);
+	*cow_ret = cow;
+	return 0;
+}
+
+int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e255cabfd1..b51b021fff8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -431,6 +431,7 @@ struct btrfs_root {
 #define BTRFS_STRING_ITEM_KEY	253
 
 #define BTRFS_MOUNT_NODATASUM		0x1
+#define BTRFS_MOUNT_NODATACOW		0x2
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -915,6 +916,9 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
 	btrfs_item_offset_nr(leaf, slot)))
 
 /* extent-tree.c */
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+				  struct btrfs_path *count_path,
+				  u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
@@ -974,6 +978,10 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 00414836e96..1412d556313 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -679,6 +679,104 @@ out:
 	return 0;
 }
 
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+				  struct btrfs_path *count_path,
+				  u64 first_extent)
+{
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_path *path;
+	u64 bytenr;
+	u64 found_objectid;
+	u64 root_objectid = 0;
+	u32 total_count = 0;
+	u32 cur_count;
+	u32 refs;
+	u32 nritems;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *l;
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_ref *ref_item;
+	int level = -1;
+
+	path = btrfs_alloc_path();
+again:
+	if (level == -1)
+		bytenr = first_extent;
+	else
+		bytenr = count_path->nodes[level]->start;
+
+	cur_count = 0;
+	key.objectid = bytenr;
+	key.offset = 0;
+
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
+
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+
+	if (found_key.objectid != bytenr ||
+	    found_key.type != BTRFS_EXTENT_ITEM_KEY) {
+		goto out;
+	}
+
+	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(l, item);
+	while (1) {
+		nritems = btrfs_header_nritems(l);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret == 0)
+				continue;
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+		if (found_key.objectid != bytenr)
+			break;
+		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+			path->slots[0]++;
+			continue;
+		}
+
+		cur_count++;
+		ref_item = btrfs_item_ptr(l, path->slots[0],
+					  struct btrfs_extent_ref);
+		found_objectid = btrfs_ref_root(l, ref_item);
+
+		if (found_objectid != root_objectid)
+			total_count++;
+
+		if (total_count > 1)
+			goto out;
+
+		if (root_objectid == 0)
+			root_objectid = found_objectid;
+
+		path->slots[0]++;
+	}
+	if (cur_count == 0) {
+		total_count = 0;
+		goto out;
+	}
+	if (total_count > 1)
+		goto out;
+	if (level >= 0 && root->node == count_path->nodes[level])
+		goto out;
+	level++;
+	btrfs_release_path(root, path);
+	goto again;
+
+out:
+	btrfs_free_path(path);
+	return total_count;
+
+}
+
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner_objectid)
 {
@@ -1127,9 +1225,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	if (ref_generation && owner_objectid == 0 && root_objectid == 3) {
-//printk("drop backref root %Lu gen %Lu byte %Lu\n", root_objectid, ref_generation, bytenr );
-	}
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, root_objectid,
 				    ref_generation,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 11885cb114e..91f3fc43e2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -72,21 +72,22 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_key ins;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
-	int ret;
 	u64 blocksize = root->sectorsize;
+	struct btrfs_key ins;
+	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
 	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
+
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = max(blocksize,  num_bytes);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 start, start + num_bytes, start, &alloc_hint);
 
@@ -106,6 +107,101 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 				       ins.offset);
 out:
 	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+{
+	u64 extent_start;
+	u64 extent_end;
+	u64 bytenr;
+	u64 cow_end;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	int found_type;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *item;
+	int ret;
+	int err;
+	struct btrfs_key found_key;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+again:
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       inode->i_ino, start, 0);
+	if (ret < 0) {
+		btrfs_free_path(path);
+		return ret;
+	}
+
+	cow_end = end;
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_file_extent_item);
+
+	/* are we inside the extent that was found? */
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	found_type = btrfs_key_type(&found_key);
+	if (found_key.objectid != inode->i_ino ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		goto not_found;
+	}
+
+	found_type = btrfs_file_extent_type(leaf, item);
+	extent_start = found_key.offset;
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_end = extent_start +
+		       btrfs_file_extent_num_bytes(leaf, item);
+		err = 0;
+
+		if (start < extent_start || start >= extent_end)
+			goto not_found;
+
+		cow_end = min(end, extent_end - 1);
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+		if (bytenr == 0)
+			goto not_found;
+
+		bytenr += btrfs_file_extent_offset(leaf, item);
+		if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1) {
+			goto not_found;
+		}
+
+		start = extent_end;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		goto not_found;
+	}
+loop:
+	if (start > end) {
+		btrfs_free_path(path);
+		return 0;
+	}
+	btrfs_release_path(root, path);
+	goto again;
+
+not_found:
+	cow_file_range(inode, start, cow_end);
+	start = cow_end + 1;
+	goto loop;
+}
+
+static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (btrfs_test_opt(root, NODATACOW))
+		ret = run_delalloc_nocow(inode, start, end);
+	else
+		ret = cow_file_range(inode, start, end);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
@@ -1907,9 +2003,6 @@ int btrfs_commit_write(struct file *file, struct page *page,
 
 	btrfs_cow_one_page(inode, page, PAGE_CACHE_SIZE);
 
-	set_page_extent_mapped(page);
-	set_page_dirty(page);
-
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
 		mark_inode_dirty(inode);
@@ -2078,13 +2171,18 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	key.objectid = objectid;
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
 	extent_buffer_get(root->node);
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	free_extent_buffer(tmp);
-	btrfs_set_root_bytenr(&new_root_item, root->node->start);
-	btrfs_set_root_level(&new_root_item, btrfs_header_level(root->node));
+
+	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+
+	btrfs_set_root_bytenr(&new_root_item, tmp->start);
+	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
+	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
 
@@ -2106,10 +2204,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 	if (ret)
 		goto fail;
-
-	ret = btrfs_inc_root_ref(trans, root, objectid);
-	if (ret)
-		goto fail;
 fail:
 	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ad4f280ec77..2116728d1f9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,12 +61,13 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_err,
+	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
+	{Opt_nodatacow, "nodatacow"},
 	{Opt_err, NULL}
 };
 
@@ -78,12 +79,20 @@ static int parse_options (char * options,
 	struct btrfs_fs_info *info = NULL;
 	substring_t args[MAX_OPT_ARGS];
 
-	if (root)
-		info = root->fs_info;
-
 	if (!options)
 		return 1;
 
+	/*
+	 * strsep changes the string, duplicate it because parse_options
+	 * gets called twice
+	 */
+	options = kstrdup(options, GFP_NOFS);
+	if (!options)
+		return -ENOMEM;
+
+	if (root)
+		info = root->fs_info;
+
 	while ((p = strsep (&options, ",")) != NULL) {
 		int token;
 		if (!*p)
@@ -92,17 +101,28 @@ static int parse_options (char * options,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_subvol:
-			if (subvol_name)
+			if (subvol_name) {
 				*subvol_name = match_strdup(&args[0]);
+			}
 			break;
 		case Opt_nodatasum:
-			if (root)
+			if (info) {
+				printk("btrfs: setting nodatacsum\n");
 				btrfs_set_opt(info->mount_opt, NODATASUM);
+			}
+			break;
+		case Opt_nodatacow:
+			if (info) {
+				printk("btrfs: setting nodatacow\n");
+				btrfs_set_opt(info->mount_opt, NODATACOW);
+				btrfs_set_opt(info->mount_opt, NODATASUM);
+			}
 			break;
 		default:
-			return 0;
+			break;
 		}
 	}
+	kfree(options);
 	return 1;
 }
 
-- 
cgit v1.2.3


From d10c5f31c79c6a703974ae29d0a05781b828fe30 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Dec 2007 20:14:04 -0500
Subject: Btrfs: Fix extent_map and extent_state leaks by flushing lrus on FS
 unmount

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4338b00c2a2..27856417899 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -765,8 +765,18 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
+
+	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+
+	extent_map_tree_empty_lru(&fs_info->free_space_cache);
+	extent_map_tree_empty_lru(&fs_info->block_group_cache);
+	extent_map_tree_empty_lru(&fs_info->pinned_extents);
+	extent_map_tree_empty_lru(&fs_info->pending_del);
+	extent_map_tree_empty_lru(&fs_info->extent_ins);
 	extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+
 	iput(fs_info->btree_inode);
 #if 0
 	while(!list_empty(&fs_info->hashers)) {
-- 
cgit v1.2.3


From c59f8951d48c5eb000926935f3ab063d8181d1ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Dec 2007 20:14:04 -0500
Subject: Btrfs: Add mount option to enforce a max extent size

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c |  1 +
 fs/btrfs/inode.c   | 28 ++++++++++++++++++----------
 fs/btrfs/super.c   | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 65 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b51b021fff8..32b24460ec8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -322,6 +322,7 @@ struct btrfs_fs_info {
 	u64 generation;
 	u64 last_trans_committed;
 	unsigned long mount_opt;
+	u64 max_extent;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 27856417899..7213012c27d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -569,6 +569,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
 	fs_info->mount_opt = 0;
+	fs_info->max_extent = (u64)-1;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 91f3fc43e2a..686dd03f34f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -78,6 +78,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
+	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
 	struct btrfs_key ins;
 	int ret;
@@ -94,17 +95,24 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
-	ret = btrfs_alloc_extent(trans, root, num_bytes,
-				 root->root_key.objectid, trans->transid,
-				 inode->i_ino, start, 0,
-				 alloc_hint, (u64)-1, &ins, 1);
-	if (ret) {
-		WARN_ON(1);
-		goto out;
+	while(num_bytes > 0) {
+		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
+					 root->root_key.objectid,
+					 trans->transid,
+					 inode->i_ino, start, 0,
+					 alloc_hint, (u64)-1, &ins, 1);
+		if (ret) {
+			WARN_ON(1);
+			goto out;
+		}
+		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+					       start, ins.objectid, ins.offset,
+					       ins.offset);
+		num_bytes -= cur_alloc_size;
+		alloc_hint = ins.objectid + ins.offset;
+		start += cur_alloc_size;
 	}
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       start, ins.objectid, ins.offset,
-				       ins.offset);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2116728d1f9..b7323914bf5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -34,6 +34,7 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/parser.h>
+#include <linux/ctype.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -61,16 +62,42 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_err,
+	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
+	{Opt_max_extent, "max_extent=%s"},
 	{Opt_err, NULL}
 };
 
+/* Convert "<decimal>[k|m|g]" (suffix is case-insensitive) to a byte count. */
+static unsigned long parse_size(char *str)
+{
+	char *suffix;
+	unsigned long bytes = simple_strtoul(str, &suffix, 10);
+	int shift = 0;
+
+	/* a missing or non-alphabetic suffix leaves the number unscaled */
+	if (!isalpha(*suffix))
+		return bytes;
+
+	switch (tolower(*suffix)) {
+	case 'g':
+		shift += 10;
+		/* fall through */
+	case 'm':
+		shift += 10;
+		/* fall through */
+	case 'k':
+		shift += 10;
+		break;
+	}
+	return bytes << shift;
+}
+
 static int parse_options (char * options,
 			  struct btrfs_root *root,
 			  char **subvol_name)
@@ -118,6 +145,21 @@ static int parse_options (char * options,
 				btrfs_set_opt(info->mount_opt, NODATASUM);
 			}
 			break;
+		case Opt_max_extent:
+			if (info) {
+				char *num = match_strdup(&args[0]);
+				if (num) {
+					info->max_extent = parse_size(num);
+					kfree(num);
+
+					info->max_extent = max_t(u64,
+							 info->max_extent,
+							 root->sectorsize);
+					printk("btrfs: max_extent at %Lu\n",
+					       info->max_extent);
+				}
+			}
+			break;
 		default:
 			break;
 		}
@@ -329,6 +371,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
 			btrfs_fill_super, mnt,
 			subvol_name ? subvol_name : "default");
+	if (subvol_name)
+		kfree(subvol_name);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6da6abae027e2dbc59bca5f4168b0760f25068c7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Dec 2007 16:15:09 -0500
Subject: Btrfs: Back port to 2.6.18-el kernels

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  13 ++++
 fs/btrfs/disk-io.c     |   5 +-
 fs/btrfs/extent_map.c  | 168 ++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/file.c        |  15 +++--
 fs/btrfs/inode.c       |  21 +++++--
 fs/btrfs/super.c       |   1 +
 fs/btrfs/sysfs.c       |  23 ++++++-
 fs/btrfs/transaction.c |   8 +++
 fs/btrfs/transaction.h |   5 ++
 9 files changed, 239 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 32b24460ec8..2559f5d5d00 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,6 +19,7 @@
 #ifndef __BTRFS__
 #define __BTRFS__
 
+#include <linux/version.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
@@ -334,7 +335,11 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	struct work_struct trans_work;
+#else
 	struct delayed_work trans_work;
+#endif
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
@@ -916,6 +921,14 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
 	((unsigned long)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset_nr(leaf, slot)))
 
+static inline struct dentry *fdentry(struct file *file) {
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+	return file->f_path.dentry;
+#else	/* 2.6.18 and older: use the f_dentry member instead */
+	return file->f_dentry;
+#endif
+}
+
 /* extent-tree.c */
 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 				  struct btrfs_path *count_path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7213012c27d..ebb2db624fd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -593,8 +593,11 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 	fs_info->total_pinned = 0;
-
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
+#else
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
+#endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8326a18413f..0d1e59a86e4 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1585,8 +1585,18 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 
 static int submit_one_bio(int rw, struct bio *bio)
 {
+	u64 maxsector;
 	int ret = 0;
+
 	bio_get(bio);
+	/* sanity check: warn when the bio starts past bd_inode's size */
+	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+	if (maxsector < bio->bi_sector) {
+		printk("sector too large max %Lu got %llu\n",
+		       (unsigned long long)maxsector,
+		       (unsigned long long)bio->bi_sector);
+		WARN_ON(1);
+	}
 	submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
@@ -1678,8 +1688,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree,
 
 	while (cur <= end) {
 		if (cur >= last_byte) {
+			char *userpage;
 			iosize = PAGE_CACHE_SIZE - page_offset;
-			zero_user_page(page, page_offset, iosize, KM_USER0);
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    GFP_NOFS);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -1707,7 +1721,12 @@ static int __extent_read_full_page(struct extent_map_tree *tree,
 
 		/* we've found a hole, just zero and go on */
 		if (block_start == EXTENT_MAP_HOLE) {
-			zero_user_page(page, page_offset, iosize, KM_USER0);
+			char *userpage;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    GFP_NOFS);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -1804,9 +1823,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	}
 
 	if (page->index == end_index) {
+		char *userpage;
+
 		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-		zero_user_page(page, offset,
-			       PAGE_CACHE_SIZE - offset, KM_USER0);
+
+		userpage = kmap_atomic(page, KM_USER0);
+		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap_atomic(userpage, KM_USER0);
 	}
 
 	set_page_extent_mapped(page);
@@ -1921,6 +1945,129 @@ done:
 	return 0;
 }
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+
+/* Taken directly from 2.6.23 for 2.6.18 back port */
+typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+                                void *data);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space
+ * and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ */
+static int write_cache_pages(struct address_space *mapping,
+		      struct writeback_control *wbc, writepage_t writepage,
+		      void *data)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int range_whole = 0;
+	/* non-blocking writeback gives up at once on a congested bdi */
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		scanned = 1;
+	}
+retry:
+	while (!done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+			/* page lies past the requested end: stop after this batch */
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+			/* data-integrity writeback waits for in-flight I/O */
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+			/* skip pages still under I/O or no longer dirty */
+			if (PageWriteback(page) ||
+			    !clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			ret = (*writepage)(page, wbc, data);
+			/* ACTIVATE is not an error: unlock the page and go on */
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+				unlock_page(page);
+				ret = 0;
+			}
+			if (ret || (--(wbc->nr_to_write) <= 0))
+				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+#endif
+
 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -1945,18 +2092,20 @@ int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
 	ret = __extent_writepage(page, wbc, &epd);
 
 	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
-	if (epd.bio)
+	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_full_page);
 
+
 int extent_writepages(struct extent_map_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
 		      struct writeback_control *wbc)
 {
-	int ret;
+	int ret = 0;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -1964,8 +2113,9 @@ int extent_writepages(struct extent_map_tree *tree,
 	};
 
 	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
-	if (epd.bio)
+	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_writepages);
@@ -2106,7 +2256,9 @@ int extent_prepare_write(struct extent_map_tree *tree,
 			flush_dcache_page(page);
 			kunmap_atomic(kaddr, KM_USER0);
 		}
-		if (!isnew && !PageUptodate(page) &&
+		if ((em->block_start != EXTENT_MAP_HOLE &&
+		     em->block_start != EXTENT_MAP_INLINE) && 
+		    !isnew && !PageUptodate(page) &&
 		    (block_off_end > to || block_off_start < from) &&
 		    !test_range_bit(tree, block_start, cur_end,
 				    EXTENT_UPTODATE, 1)) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 62fcd79d8ab..461b09663fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -231,7 +231,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 {
 	int err = 0;
 	int i;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(file)->d_inode;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 hint_byte;
@@ -652,7 +652,7 @@ static int prepare_pages(struct btrfs_root *root,
 {
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
 	u64 start_pos;
 
@@ -666,7 +666,11 @@ static int prepare_pages(struct btrfs_root *root,
 			err = -ENOMEM;
 			BUG_ON(1);
 		}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(pages[i]);
+#else
 		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+#endif
 		wait_on_page_writeback(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
@@ -682,7 +686,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	int ret = 0;
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
 	int nrptrs;
@@ -707,7 +711,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;
 	if (count == 0)
 		goto out;
-	err = remove_suid(file->f_path.dentry);
+	err = remove_suid(fdentry(file));
 	if (err)
 		goto out;
 	file_update_time(file);
@@ -862,6 +866,9 @@ struct file_operations btrfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	.sendfile	= generic_file_sendfile,
+#endif
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 686dd03f34f..6a7d9160df2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -532,7 +532,11 @@ err:
 		dir->i_size -= name_len * 2;
 		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 		btrfs_update_inode(trans, root, dir);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		dentry->d_inode->i_nlink--;
+#else
 		drop_nlink(dentry->d_inode);
+#endif
 		ret = btrfs_update_inode(trans, root, dentry->d_inode);
 		dir->i_sb->s_dirt = 1;
 	}
@@ -1139,7 +1143,7 @@ static unsigned char btrfs_filetype_table[] = {
 
 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
-	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(filp)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -1554,7 +1558,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		return -ENOENT;
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	inode->i_nlink++;
+#else
 	inc_nlink(inode);
+#endif
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 
@@ -1825,6 +1833,9 @@ insert:
 	if (ret == -EEXIST) {
 		free_extent_map(em);
 		em = NULL;
+		if (0 && failed_insert == 1) {
+			btrfs_drop_extent_cache(inode, start, end);
+		}
 		failed_insert++;
 		if (failed_insert > 5) {
 			printk("failing to insert %Lu %Lu\n", start, end);
@@ -1942,7 +1953,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
  */
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	unsigned long end;
 	loff_t size;
 	int ret = -EINVAL;
@@ -2243,7 +2254,7 @@ static unsigned long force_ra(struct address_space *mapping,
 }
 
 int btrfs_defrag_file(struct file *file) {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(file)->d_inode;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct page *page;
 	unsigned long last_index;
@@ -2329,7 +2340,7 @@ static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
 
 static int btrfs_ioctl_defrag(struct file *file)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
+	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	switch (inode->i_mode & S_IFMT) {
@@ -2350,7 +2361,7 @@ static int btrfs_ioctl_defrag(struct file *file)
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
-	struct btrfs_root *root = BTRFS_I(file->f_path.dentry->d_inode)->root;
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 
 	switch (cmd) {
 	case BTRFS_IOC_SNAP_CREATE:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b7323914bf5..7cab680418a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -35,6 +35,7 @@
 #include <linux/compat.h>
 #include <linux/parser.h>
 #include <linux/ctype.h>
+#include <linux/namei.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1671c731800..cd673ca9bb9 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -189,12 +189,29 @@ static struct kset btrfs_kset;
 int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 {
 	int error;
+	char *name;
+	char c;
+	int len = strlen(fs->sb->s_id) + 1;
+	int i;
+
+	name = kmalloc(len, GFP_NOFS);
+	if (!name) {
+		error = -ENOMEM;
+		goto fail;
+	}
+	/* copy s_id with '/' and '\\' turned into '!'; len includes the NUL */
+	for (i = 0; i < len; i++) {
+		c = fs->sb->s_id[i];
+		if (c == '/' || c == '\\')
+			c = '!';
+		name[i] = c;
+	}
+	name[len - 1] = '\0';
 
 	fs->super_kobj.kset = &btrfs_kset;
 	fs->super_kobj.ktype = &btrfs_super_ktype;
 
-	error = kobject_set_name(&fs->super_kobj, "%s",
-				 fs->sb->s_id);
+	error = kobject_set_name(&fs->super_kobj, "%s", name);
 	if (error)
 		goto fail;
 
@@ -202,9 +219,11 @@ int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 	if (error)
 		goto fail;
 
+	kfree(name);
 	return 0;
 
 fail:
+	kfree(name);
 	printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
 	return error;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0229e69bd2f..1ad611b9f61 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -614,12 +614,20 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 	}
 	return 0;
 }
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+void btrfs_transaction_cleaner(void *p)
+#else
 void btrfs_transaction_cleaner(struct work_struct *work)
+#endif
 {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	struct btrfs_fs_info *fs_info = p;
+#else
 	struct btrfs_fs_info *fs_info = container_of(work,
 						     struct btrfs_fs_info,
 						     trans_work.work);
 
+#endif
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_transaction *cur;
 	struct btrfs_trans_handle *trans;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ae39fcfc169..eef840bca91 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -73,7 +73,12 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+void btrfs_transaction_cleaner(void *p);
+#else
 void btrfs_transaction_cleaner(struct work_struct *work);
+#endif
+
 void btrfs_transaction_flush_work(struct btrfs_root *root);
 void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
 void btrfs_init_transaction_sys(void);
-- 
cgit v1.2.3


From 4aec2b52327b3a3a20faa3f1af3102c0d97c7c92 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Dec 2007 16:25:45 -0500
Subject: kmalloc a few large stack objects in the btrfs_ioctl path

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 20 +++++++++++++------
 fs/btrfs/inode.c | 60 +++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 56 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 585f279d111..7f764455f26 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -80,10 +80,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int level;
 	struct btrfs_key first_key;
-	struct btrfs_root new_root;
+	struct btrfs_root *new_root;
 
-	memcpy(&new_root, root, sizeof(new_root));
-	new_root.root_key.objectid = new_root_objectid;
+	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
+	if (!new_root)
+		return -ENOMEM;
+
+	memcpy(new_root, root, sizeof(*new_root));
+	new_root->root_key.objectid = new_root_objectid;
 
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
@@ -99,12 +103,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = __btrfs_alloc_free_block(trans, &new_root, buf->len,
+	cow = __btrfs_alloc_free_block(trans, new_root, buf->len,
 				       new_root_objectid,
 				       trans->transid, first_key.objectid,
 				       level, buf->start, 0);
-	if (IS_ERR(cow))
+	if (IS_ERR(cow)) {
+		kfree(new_root);
 		return PTR_ERR(cow);
+	}
 
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
@@ -112,7 +118,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(cow, new_root_objectid);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, &new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf);
+	kfree(new_root);
+
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6a7d9160df2..1e725a48467 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2302,40 +2302,64 @@ out_unlock:
 
 static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
 {
-	struct btrfs_ioctl_vol_args vol_args;
+	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
-	int namelen;
 	u64 root_dirid;
+	int namelen;
+	int ret;
 
-	if (copy_from_user(&vol_args, arg, sizeof(vol_args)))
-		return -EFAULT;
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
-	namelen = strlen(vol_args.name);
-	if (namelen > BTRFS_VOL_NAME_MAX)
-		return -EINVAL;
-	if (strchr(vol_args.name, '/'))
-		return -EINVAL;
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	namelen = strlen(vol_args->name);
+	if (namelen > BTRFS_VOL_NAME_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (strchr(vol_args->name, '/')) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
 	mutex_lock(&root->fs_info->fs_mutex);
 	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
 			    path, root_dirid,
-			    vol_args.name, namelen, 0);
+			    vol_args->name, namelen, 0);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_free_path(path);
-	if (di && !IS_ERR(di))
-		return -EEXIST;
-	if (IS_ERR(di))
-		return PTR_ERR(di);
+
+	if (di && !IS_ERR(di)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
 
 	if (root == root->fs_info->tree_root)
-		return create_subvol(root, vol_args.name, namelen);
-	return create_snapshot(root, vol_args.name, namelen);
+		ret = create_subvol(root, vol_args->name, namelen);
+	else
+		ret = create_snapshot(root, vol_args->name, namelen);
+out:
+	kfree(vol_args);
+	return ret;
 }
 
 static int btrfs_ioctl_defrag(struct file *file)
-- 
cgit v1.2.3


From 190662b2128dd648749e197f5563e9f6bbb5e05c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Dec 2007 16:25:45 -0500
Subject: Btrfs: Fix delayed allocation to avoid missing delalloc extents

find_lock_delalloc_range could exit out too early

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 9 +++++++--
 fs/btrfs/inode.c      | 1 +
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 0d1e59a86e4..a0dff34dd43 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1070,6 +1070,7 @@ u64 find_lock_delalloc_range(struct extent_map_tree *tree,
 search_again:
 	node = tree_search(&tree->state, cur_start);
 	if (!node || IS_ERR(node)) {
+		*end = (u64)-1;
 		goto out;
 	}
 
@@ -1079,6 +1080,8 @@ search_again:
 			goto out;
 		}
 		if (!(state->state & EXTENT_DELALLOC)) {
+			if (!found)
+				*end = state->end;
 			goto out;
 		}
 		if (!found) {
@@ -1841,8 +1844,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
-		if (nr_delalloc <= 0)
-			break;
+		if (nr_delalloc == 0) {
+			delalloc_start = delalloc_end + 1;
+			continue;
+		}
 		tree->ops->fill_delalloc(inode, delalloc_start,
 					 delalloc_end);
 		clear_extent_bit(tree, delalloc_start,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1e725a48467..55b2e142602 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -798,6 +798,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	WARN_ON(!PageLocked(page));
 	set_page_extent_mapped(page);
 
 	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
-- 
cgit v1.2.3


From 879c1cfc3104b632dea85d07490f1b55e54275a9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Dec 2007 09:11:48 -0500
Subject: Btrfs: Fix nodatacow extent lookup

Yan Zheng noticed the offset into the extent was incorrectly being added to the
extent start before trying to find it in the extent allocation tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 55b2e142602..1c61c85dc3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -177,7 +177,6 @@ again:
 		if (bytenr == 0)
 			goto not_found;
 
-		bytenr += btrfs_file_extent_offset(leaf, item);
 		if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1) {
 			goto not_found;
 		}
-- 
cgit v1.2.3


From 01f466580502c57001bf80fff709479fdb9e87a5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 21 Dec 2007 16:24:26 -0500
Subject: Btrfs: Less aggressive readahead on deletes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7f764455f26..2b3ffa707ea 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -987,9 +987,10 @@ static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
  * readahead one full node of leaves
  */
 static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-			     int level, int slot)
+			     int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
+	struct btrfs_disk_key disk_key;
 	u32 nritems;
 	u64 search;
 	u64 lowest_read;
@@ -1031,6 +1032,11 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			if (nr >= nritems)
 				break;
 		}
+		if (path->reada < 0 && objectid) {
+			btrfs_node_key(node, &disk_key, nr);
+			if (btrfs_disk_key_objectid(&disk_key) != objectid)
+				break;
+		}
 		search = btrfs_node_blockptr(node, nr);
 		if ((search >= lowest_read && search <= highest_read) ||
 		    (search < lowest_read && lowest_read - search <= 32768) ||
@@ -1136,7 +1142,8 @@ again:
 			bytenr = btrfs_node_blockptr(b, slot);
 			ptr_gen = btrfs_node_ptr_generation(b, slot);
 			if (should_reada)
-				reada_for_search(root, p, level, slot);
+				reada_for_search(root, p, level, slot,
+						 key->objectid);
 			b = read_tree_block(root, bytenr,
 					    btrfs_level_size(root, level - 1));
 			if (ptr_gen != btrfs_header_generation(b)) {
@@ -2671,9 +2678,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (next)
 			free_extent_buffer(next);
 
-		if (path->reada < 0)
-			reada_for_search(root, path, level, slot);
-
 		next = read_tree_block(root, bytenr,
 				       btrfs_level_size(root, level - 1));
 		break;
@@ -2687,8 +2691,6 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		path->slots[level] = 0;
 		if (!level)
 			break;
-		if (path->reada)
-			reada_for_search(root, path, level, 0);
 		next = read_tree_block(root, btrfs_node_blockptr(next, 0),
 				       btrfs_level_size(root, level - 1));
 	}
@@ -2726,7 +2728,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 
 		if (path->reada)
-			reada_for_search(root, path, level, slot);
+			reada_for_search(root, path, level, slot, 0);
 
 		next = read_tree_block(root, bytenr,
 				       btrfs_level_size(root, level -1));
@@ -2742,7 +2744,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (!level)
 			break;
 		if (path->reada)
-			reada_for_search(root, path, level, 0);
+			reada_for_search(root, path, level, 0, 0);
 		next = read_tree_block(root, btrfs_node_blockptr(next, 0),
 				       btrfs_level_size(root, level - 1));
 	}
-- 
cgit v1.2.3


From 1832a6d5ee3b1af61001cadba9e10da9e91af4a4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 21 Dec 2007 16:27:21 -0500
Subject: Btrfs: Implement basic support for -ENOSPC

This is intended to prevent accidentally filling the drive.  A determined
user can still make things oops.

It includes some accounting of the current bytes under delayed allocation,
but this will change as things get optimized.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |   4 ++
 fs/btrfs/disk-io.c    |   5 +-
 fs/btrfs/extent_map.c |  20 +++++--
 fs/btrfs/extent_map.h |   3 +-
 fs/btrfs/file.c       |  28 +++++++--
 fs/btrfs/inode.c      | 161 +++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 193 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2559f5d5d00..10129cc6656 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -346,6 +346,8 @@ struct btrfs_fs_info {
 	int closing;
 
 	u64 total_pinned;
+	spinlock_t delalloc_lock;
+	u64 delalloc_bytes;
 };
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
@@ -1115,6 +1117,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+			   int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ebb2db624fd..eebb4fb65c6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -223,7 +223,8 @@ static int btree_writepages(struct address_space *mapping,
 		} else {
 			thresh = 8 * 1024 * 1024;
 		}
-		num_dirty = count_range_bits(tree, &start, thresh, EXTENT_DIRTY);
+		num_dirty = count_range_bits(tree, &start, (u64)-1,
+					     thresh, EXTENT_DIRTY);
 		if (num_dirty < thresh) {
 			return 0;
 		}
@@ -559,6 +560,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	spin_lock_init(&fs_info->hash_lock);
+	spin_lock_init(&fs_info->delalloc_lock);
 
 	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
 	init_completion(&fs_info->kobj_unregister);
@@ -570,6 +572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->sb = sb;
 	fs_info->mount_opt = 0;
 	fs_info->max_extent = (u64)-1;
+	fs_info->delalloc_bytes = 0;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a0dff34dd43..2b92f107027 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1131,7 +1131,8 @@ out:
 }
 
 u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 max_bytes, unsigned long bits)
+		     u64 *start, u64 search_end, u64 max_bytes,
+		     unsigned long bits)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1139,9 +1140,14 @@ u64 count_range_bits(struct extent_map_tree *tree,
 	u64 total_bytes = 0;
 	int found = 0;
 
+	if (search_end <= cur_start) {
+		printk("search_end %Lu start %Lu\n", search_end, cur_start);
+		WARN_ON(1);
+		return 0;
+	}
+
 	write_lock_irq(&tree->lock);
-	if (bits == EXTENT_DIRTY) {
-		*start = 0;
+	if (cur_start == 0 && bits == EXTENT_DIRTY) {
 		total_bytes = tree->dirty_bytes;
 		goto out;
 	}
@@ -1156,8 +1162,11 @@ u64 count_range_bits(struct extent_map_tree *tree,
 
 	while(1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if ((state->state & bits)) {
-			total_bytes += state->end - state->start + 1;
+		if (state->start > search_end)
+			break;
+		if (state->end >= cur_start && (state->state & bits)) {
+			total_bytes += min(search_end, state->end) + 1 -
+				       max(cur_start, state->start);
 			if (total_bytes >= max_bytes)
 				break;
 			if (!found) {
@@ -1173,7 +1182,6 @@ out:
 	write_unlock_irq(&tree->lock);
 	return total_bytes;
 }
-
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 6e572d3e892..ea60f5447b5 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -115,7 +115,8 @@ int __init extent_map_init(void);
 void extent_map_exit(void);
 
 u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 max_bytes, unsigned long bits);
+		     u64 *start, u64 search_end,
+		     u64 max_bytes, unsigned long bits);
 
 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
 		   int bits, int filled);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 461b09663fe..71dc2d33b6c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -307,6 +307,7 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	    inline_size > 32768 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
+		u64 existing_delalloc = 0;
 
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
@@ -316,8 +317,19 @@ static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		last_end = (u64)(pages[num_pages -1]->index) <<
 				PAGE_CACHE_SHIFT;
 		last_end += PAGE_CACHE_SIZE - 1;
+		if (start_pos < isize) {
+			u64 delalloc_start = start_pos;
+			existing_delalloc = count_range_bits(em_tree,
+					     &delalloc_start,
+					     end_of_last_block, (u64)-1,
+					     EXTENT_DELALLOC);
+		}
 		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
+					  start_pos) - existing_delalloc;
+		spin_unlock(&root->fs_info->delalloc_lock);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -708,12 +720,12 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
-		goto out;
+		goto out_nolock;
 	if (count == 0)
-		goto out;
+		goto out_nolock;
 	err = remove_suid(fdentry(file));
 	if (err)
-		goto out;
+		goto out_nolock;
 	file_update_time(file);
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -758,6 +770,13 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(pages));
+
+		mutex_lock(&root->fs_info->fs_mutex);
+		ret = btrfs_check_free_space(root, write_bytes, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		if (ret)
+			goto out;
+
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
@@ -787,8 +806,9 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			btrfs_btree_balance_dirty(root, 1);
 		cond_resched();
 	}
-	mutex_unlock(&inode->i_mutex);
 out:
+	mutex_unlock(&inode->i_mutex);
+out_nolock:
 	kfree(pages);
 	if (pinned[0])
 		page_cache_release(pinned[0]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c61c85dc3d..a9f5d6d417f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -72,6 +72,26 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
+int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
+			   int for_del)
+{
+	u64 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	u64 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
+	u64 thresh;
+	int ret = 0;
+
+	if (for_del)
+		thresh = (total * 90) / 100;
+	else
+		thresh = (total * 85) / 100;
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
+		ret = -ENOSPC;
+	spin_unlock(&root->fs_info->delalloc_lock);
+	return ret;
+}
+
 static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -124,6 +144,7 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 	u64 extent_end;
 	u64 bytenr;
 	u64 cow_end;
+	u64 loops = 0;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	int found_type;
@@ -169,6 +190,9 @@ again:
 		       btrfs_file_extent_num_bytes(leaf, item);
 		err = 0;
 
+		if (loops && start != extent_start)
+			goto not_found;
+
 		if (start < extent_start || start >= extent_end)
 			goto not_found;
 
@@ -191,6 +215,7 @@ loop:
 		return 0;
 	}
 	btrfs_release_path(root, path);
+	loops++;
 	goto again;
 
 not_found:
@@ -202,6 +227,7 @@ not_found:
 static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 num_bytes;
 	int ret;
 
 	mutex_lock(&root->fs_info->fs_mutex);
@@ -209,6 +235,17 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 		ret = run_delalloc_nocow(inode, start, end);
 	else
 		ret = cow_file_range(inode, start, end);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	num_bytes = end + 1 - start;
+	if (root->fs_info->delalloc_bytes < num_bytes) {
+		printk("delalloc accounting error total %llu sub %llu\n",
+		       root->fs_info->delalloc_bytes, num_bytes);
+	} else {
+		root->fs_info->delalloc_bytes -= num_bytes;
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
@@ -547,10 +584,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
 	int ret;
-	unsigned long nr;
+	unsigned long nr = 0;
 
 	root = BTRFS_I(dir)->root;
 	mutex_lock(&root->fs_info->fs_mutex);
+
+	ret = btrfs_check_free_space(root, 1, 1);
+	if (ret)
+		goto fail;
+
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
@@ -558,25 +600,29 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction(trans, root);
+fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
-
 	return ret;
 }
 
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	int err;
+	int err = 0;
 	int ret;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
+	unsigned long nr = 0;
 
 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 1);
+	if (ret)
+		goto fail;
+
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -588,6 +634,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
+fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 
@@ -792,17 +839,29 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 			      size_t zero_start)
 {
 	char *kaddr;
-	int ret = 0;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 existing_delalloc;
+	u64 delalloc_start;
+	int ret = 0;
 
 	WARN_ON(!PageLocked(page));
 	set_page_extent_mapped(page);
 
 	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	delalloc_start = page_start;
+	existing_delalloc = count_range_bits(&BTRFS_I(inode)->extent_tree,
+					     &delalloc_start, page_end,
+					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 	set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start,
 			    page_end, GFP_NOFS);
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE - existing_delalloc;
+	spin_unlock(&root->fs_info->delalloc_lock);
+
 	if (zero_start != PAGE_CACHE_SIZE) {
 		kaddr = kmap(page);
 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
@@ -881,6 +940,12 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_size <= pos)
 			goto out;
 
+		mutex_lock(&root->fs_info->fs_mutex);
+		err = btrfs_check_free_space(root, 1, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		if (err)
+			goto fail;
+
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
 		lock_extent(em_tree, pos, block_end, GFP_NOFS);
@@ -906,7 +971,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 out:
 	err = inode_setattr(inode, attr);
-
+fail:
 	return err;
 }
 void btrfs_delete_inode(struct inode *inode)
@@ -1440,16 +1505,20 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	unsigned long nr;
+	unsigned long nr = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, 1, 0);
+	if (err)
+		goto fail;
+
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -1480,6 +1549,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
+fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (drop_inode) {
@@ -1495,13 +1565,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	int err;
 	int drop_inode = 0;
-	unsigned long nr;
+	unsigned long nr = 0;
 	u64 objectid;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, 1, 0);
+	if (err)
+		goto fail;
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -1535,6 +1608,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
+fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (drop_inode) {
@@ -1551,7 +1625,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
-	unsigned long nr;
+	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
 
@@ -1564,6 +1638,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	inc_nlink(inode);
 #endif
 	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, 1, 0);
+	if (err)
+		goto fail;
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
@@ -1582,6 +1659,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
+fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (drop_inode) {
@@ -1603,6 +1681,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	unsigned long nr = 1;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, 1, 0);
+	if (err)
+		goto out_unlock;
+
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -1869,6 +1951,15 @@ static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 static int btrfs_prepare_write(struct file *file, struct page *page,
 			       unsigned from, unsigned to)
 {
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	int err;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (err)
+		return -ENOSPC;
+
 	return extent_prepare_write(&BTRFS_I(page->mapping->host)->extent_tree,
 				    page->mapping->host, page, from, to,
 				    btrfs_get_extent);
@@ -1880,6 +1971,7 @@ int btrfs_readpage(struct file *file, struct page *page)
 	tree = &BTRFS_I(page->mapping->host)->extent_tree;
 	return extent_read_full_page(tree, page, btrfs_get_extent);
 }
+
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_map_tree *tree;
@@ -1954,11 +2046,20 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	unsigned long end;
 	loff_t size;
-	int ret = -EINVAL;
+	int ret;
 	u64 page_start;
 
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+
 	down_read(&BTRFS_I(inode)->root->snap_sem);
 	lock_page(page);
 	wait_on_page_writeback(page);
@@ -1982,6 +2083,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 out_unlock:
 	up_read(&BTRFS_I(inode)->root->snap_sem);
 	unlock_page(page);
+out:
 	return ret;
 }
 
@@ -2046,6 +2148,10 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 	unsigned long nr = 1;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 0);
+	if (ret)
+		goto fail_commit;
+
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
@@ -2162,7 +2268,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	int ret;
 	int err;
 	u64 objectid;
-	unsigned long nr;
+	unsigned long nr = 0;
 
 	if (!root->ref_cows)
 		return -EINVAL;
@@ -2172,6 +2278,10 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 0);
+	if (ret)
+		goto fail_unlock;
+
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
@@ -2229,7 +2339,7 @@ fail:
 
 	if (err && !ret)
 		ret = err;
-
+fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root, nr);
@@ -2255,6 +2365,7 @@ static unsigned long force_ra(struct address_space *mapping,
 
 int btrfs_defrag_file(struct file *file) {
 	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct page *page;
 	unsigned long last_index;
@@ -2262,6 +2373,13 @@ int btrfs_defrag_file(struct file *file) {
 	u64 page_start;
 	u64 page_end;
 	unsigned long i;
+	int ret;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (ret)
+		return -ENOSPC;
 
 	mutex_lock(&inode->i_mutex);
 	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -2522,6 +2640,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 0);
+	if (ret)
+		goto out_unlock;
+
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, new_dir);
@@ -2553,6 +2675,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 out_fail:
 	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
+out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
@@ -2564,7 +2687,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
@@ -2573,12 +2696,17 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
-	unsigned long nr;
+	unsigned long nr = 0;
 
 	name_len = strlen(symname) + 1;
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
+
 	mutex_lock(&root->fs_info->fs_mutex);
+	err = btrfs_check_free_space(root, 1, 0);
+	if (err)
+		goto out_fail;
+
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2645,6 +2773,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
+out_fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
-- 
cgit v1.2.3


From 5d4fb734b4cb89a3902d6de309af0f6f3e63fe73 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 21 Dec 2007 16:27:23 -0500
Subject: Btrfs: Fix an off by one in the extent_map prepare write code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 4 ++--
 fs/btrfs/inode.c      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2b92f107027..f53d8c97621 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -2270,7 +2270,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
 			kunmap_atomic(kaddr, KM_USER0);
 		}
 		if ((em->block_start != EXTENT_MAP_HOLE &&
-		     em->block_start != EXTENT_MAP_INLINE) && 
+		     em->block_start != EXTENT_MAP_INLINE) &&
 		    !isnew && !PageUptodate(page) &&
 		    (block_off_end > to || block_off_start < from) &&
 		    !test_range_bit(tree, block_start, cur_end,
@@ -2279,7 +2279,7 @@ int extent_prepare_write(struct extent_map_tree *tree,
 			u64 extent_offset = block_start - em->start;
 			size_t iosize;
 			sector = (em->block_start + extent_offset) >> 9;
-			iosize = (cur_end - block_start + blocksize - 1) &
+			iosize = (cur_end - block_start + blocksize) &
 				~((u64)blocksize - 1);
 			/*
 			 * we've already got the extent locked, but we
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a9f5d6d417f..5f5b7b89b14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1956,7 +1956,7 @@ static int btrfs_prepare_write(struct file *file, struct page *page,
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	if (err)
 		return -ENOSPC;
 
-- 
cgit v1.2.3


From edbd8d4efe4ddaf29a175ae504e2c9a05a96ebee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 21 Dec 2007 16:27:24 -0500
Subject: Btrfs: Support for online FS resize (grow and shrink)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  10 ++
 fs/btrfs/disk-io.c     |  32 +++-
 fs/btrfs/disk-io.h     |   2 +
 fs/btrfs/extent-tree.c | 480 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode.c       | 105 ++++++++++-
 fs/btrfs/ioctl.h       |   2 +
 fs/btrfs/root-tree.c   |   5 +
 fs/btrfs/super.c       |   7 +-
 8 files changed, 629 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 10129cc6656..b83a1bec346 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -508,6 +508,8 @@ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item,
+		   flags, 8);
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
@@ -960,6 +962,9 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
+int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 new_size);
+int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path, u64 bytenr,
@@ -1117,6 +1122,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+unsigned long btrfs_force_ra(struct address_space *mapping,
+			      struct file_ra_state *ra, struct file *file,
+			      pgoff_t offset, pgoff_t last_index);
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
@@ -1162,4 +1170,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode);
+/* super.c */
+u64 btrfs_parse_size(char *str);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eebb4fb65c6..de026d9d9b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -471,13 +471,17 @@ insert:
 	return root;
 }
 
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
-				      struct btrfs_key *location,
-				      const char *name, int namelen)
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+					      struct btrfs_key *location)
 {
 	struct btrfs_root *root;
 	int ret;
 
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
 	if (root)
@@ -494,6 +498,23 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	ret = btrfs_find_dead_roots(fs_info->tree_root,
+				    root->root_key.objectid, root);
+	BUG_ON(ret);
+
+	return root;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
+				      struct btrfs_key *location,
+				      const char *name, int namelen)
+{
+	struct btrfs_root *root;
+	int ret;
+
+	root = btrfs_read_fs_root_no_name(fs_info, location);
+	if (!root)
+		return NULL;
 
 	ret = btrfs_set_root_name(root, name, namelen);
 	if (ret) {
@@ -509,11 +530,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-
-	ret = btrfs_find_dead_roots(fs_info->tree_root,
-				    root->root_key.objectid, root);
-	BUG_ON(ret);
-
 	return root;
 }
 #if 0
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 33d2ccfd74d..8c3cfd02901 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -39,6 +39,8 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      const char *name, int namelen);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
 					       struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+					      struct btrfs_key *location);
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
 			   u64 device_id,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1412d556313..de0fb0743cf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/crc32c.h>
+#include <linux/pagemap.h>
 #include "hash.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1622,6 +1623,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	int pending_ret;
 	u64 super_used, root_used;
 	u64 search_start = 0;
+	u64 new_hint;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct btrfs_extent_item extent_item;
@@ -1629,6 +1631,10 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 
+	new_hint = max(hint_byte, 16ULL * 1024 * 1024 * 1024);
+	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
+		hint_byte = new_hint;
+
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte, ins,
@@ -2100,6 +2106,480 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	return 0;
 }
 
+static int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+{
+	u64 page_start;
+	u64 page_end;
+	u64 delalloc_start;
+	u64 existing_delalloc;
+	unsigned long last_index;
+	unsigned long first_index;
+	unsigned long i;
+	struct page *page;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct file_ra_state ra;
+
+	mutex_lock(&inode->i_mutex);
+	first_index = start >> PAGE_CACHE_SHIFT;
+	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+	memset(&ra, 0, sizeof(ra));
+	file_ra_state_init(&ra, inode->i_mapping);
+	btrfs_force_ra(inode->i_mapping, &ra, NULL, first_index, last_index);
+
+	for (i = first_index; i <= last_index; i++) {
+		page = grab_cache_page(inode->i_mapping, i);
+		if (!page)
+			goto out_unlock;
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto out_unlock;
+			}
+		}
+		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+
+		delalloc_start = page_start;
+		existing_delalloc =
+			count_range_bits(&BTRFS_I(inode)->extent_tree,
+					 &delalloc_start, page_end,
+					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
+
+		set_extent_delalloc(em_tree, page_start,
+				    page_end, GFP_NOFS);
+
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE -
+						 existing_delalloc;
+		spin_unlock(&root->fs_info->delalloc_lock);
+
+		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+static int relocate_one_reference(struct btrfs_root *extent_root,
+				  struct btrfs_path *path,
+				  struct btrfs_key *extent_key,
+				  u64 ref_root, u64 ref_gen, u64 ref_objectid,
+				  u64 ref_offset)
+{
+	struct inode *inode;
+	struct btrfs_root *found_root;
+	struct btrfs_key root_location;
+	int ret;
+
+	root_location.objectid = ref_root;
+	if (ref_gen == 0)
+		root_location.offset = 0;
+	else
+		root_location.offset = (u64)-1;
+	root_location.type = BTRFS_ROOT_ITEM_KEY;
+
+	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
+						&root_location);
+	BUG_ON(!found_root);
+
+	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		mutex_unlock(&extent_root->fs_info->fs_mutex);
+		inode = btrfs_iget_locked(extent_root->fs_info->sb,
+					  ref_objectid, found_root);
+		if (inode->i_state & I_NEW) {
+			/* the inode and parent dir are two different roots */
+			BTRFS_I(inode)->root = found_root;
+			BTRFS_I(inode)->location.objectid = ref_objectid;
+			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+			BTRFS_I(inode)->location.offset = 0;
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+
+		}
+		/* this can happen if the reference is not against
+		 * the latest version of the tree root
+		 */
+		if (is_bad_inode(inode)) {
+			mutex_lock(&extent_root->fs_info->fs_mutex);
+			goto out;
+		}
+		relocate_inode_pages(inode, ref_offset, extent_key->offset);
+		/* FIXME, data=ordered will help get rid of this */
+		filemap_fdatawrite(inode->i_mapping);
+		iput(inode);
+		mutex_lock(&extent_root->fs_info->fs_mutex);
+	} else {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_key found_key;
+		struct extent_buffer *eb;
+		int level;
+		int i;
+
+		trans = btrfs_start_transaction(found_root, 1);
+		eb = read_tree_block(found_root, extent_key->objectid,
+				     extent_key->offset);
+		level = btrfs_header_level(eb);
+
+		if (level == 0)
+			btrfs_item_key_to_cpu(eb, &found_key, 0);
+		else
+			btrfs_node_key_to_cpu(eb, &found_key, 0);
+
+		free_extent_buffer(eb);
+
+		path->lowest_level = level;
+		path->reada = 0;
+		ret = btrfs_search_slot(trans, found_root, &found_key, path,
+					0, 1);
+		path->lowest_level = 0;
+		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+			if (!path->nodes[i])
+				break;
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+		btrfs_release_path(found_root, path);
+		btrfs_end_transaction(trans, found_root);
+	}
+
+out:
+	return 0;
+}
+
+static int relocate_one_extent(struct btrfs_root *extent_root,
+			       struct btrfs_path *path,
+			       struct btrfs_key *extent_key)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_extent_ref *ref;
+	struct extent_buffer *leaf;
+	u64 ref_root;
+	u64 ref_gen;
+	u64 ref_objectid;
+	u64 ref_offset;
+	u32 nritems;
+	u32 item_size;
+	int ret = 0;
+
+	key.objectid = extent_key->objectid;
+	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = 0;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+
+		BUG_ON(ret == 0);
+
+		if (ret < 0)
+			goto out;
+
+		ret = 0;
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (path->slots[0] == nritems)
+			goto out;
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != extent_key->objectid)
+			break;
+
+		if (found_key.type != BTRFS_EXTENT_REF_KEY)
+			break;
+
+		key.offset = found_key.offset + 1;
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		ref_root = btrfs_ref_root(leaf, ref);
+		ref_gen = btrfs_ref_generation(leaf, ref);
+		ref_objectid = btrfs_ref_objectid(leaf, ref);
+		ref_offset = btrfs_ref_offset(leaf, ref);
+		btrfs_release_path(extent_root, path);
+
+		ret = relocate_one_reference(extent_root, path,
+					     extent_key, ref_root, ref_gen,
+					     ref_objectid, ref_offset);
+		if (ret)
+			goto out;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(extent_root, path);
+	return ret;
+}
+
+static int find_overlapping_extent(struct btrfs_root *root,
+				   struct btrfs_path *path, u64 new_size)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	while(1) {
+		if (path->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret == 1) {
+				return 1;
+			}
+			if (ret < 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.type == BTRFS_EXTENT_ITEM_KEY) {
+			if (found_key.objectid + found_key.offset > new_size)
+				return 0;
+			else
+				return 1;
+		}
+	}
+	return 1;
+}
+
+int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
+	struct btrfs_path *path;
+	u64 cur_byte;
+	u64 total_found;
+	u64 ptr;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct extent_map_tree *block_group_cache;
+	struct btrfs_key key;
+	struct btrfs_key found_key = { 0, 0, 0 };
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int ret;
+	int slot;
+
+	btrfs_set_super_total_bytes(&info->super_copy, new_size);
+	block_group_cache = &info->block_group_cache;
+	path = btrfs_alloc_path();
+	root = root->fs_info->extent_root;
+
+again:
+	total_found = 0;
+	key.objectid = new_size;
+	cur_byte = key.objectid;
+	key.offset = 0;
+	key.type = 0;
+	while(1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+next:
+		leaf = path->nodes[0];
+		if (key.objectid == new_size - 1) {
+			ret = find_overlapping_extent(root, path, new_size);
+			if (ret != 0) {
+				btrfs_release_path(root, path);
+				ret = btrfs_search_slot(NULL, root, &key,
+							path, 0, 0);
+				if (ret < 0)
+					goto out;
+			}
+		}
+		nritems = btrfs_header_nritems(leaf);
+		ret = 0;
+		slot = path->slots[0];
+		if (slot < nritems)
+			btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (slot == nritems ||
+		    btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY) {
+			path->slots[0]++;
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					goto out;
+				if (ret == 1) {
+					ret = 0;
+					break;
+				}
+			}
+			goto next;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.objectid + found_key.offset <= cur_byte)
+			continue;
+		total_found++;
+		cur_byte = found_key.objectid + found_key.offset;
+		key.objectid = cur_byte;
+		btrfs_release_path(root, path);
+		ret = relocate_one_extent(root, path, &found_key);
+	}
+
+	btrfs_release_path(root, path);
+
+	if (total_found > 0) {
+		trans = btrfs_start_transaction(tree_root, 1);
+		btrfs_commit_transaction(trans, tree_root);
+
+		mutex_unlock(&root->fs_info->fs_mutex);
+		btrfs_clean_old_snapshots(tree_root);
+		mutex_lock(&root->fs_info->fs_mutex);
+
+		trans = btrfs_start_transaction(tree_root, 1);
+		btrfs_commit_transaction(trans, tree_root);
+		goto again;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = new_size;
+	key.offset = 0;
+	key.type = 0;
+	while(1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto out;
+bg_next:
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		ret = 0;
+		slot = path->slots[0];
+		if (slot < nritems)
+			btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (slot == nritems ||
+		    btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			if (slot < nritems) {
+				printk("shrinker found key %Lu %u %Lu\n",
+				       found_key.objectid, found_key.type,
+				       found_key.offset);
+				path->slots[0]++;
+			}
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					break;
+				if (ret == 1) {
+					ret = 0;
+					break;
+				}
+			}
+			goto bg_next;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		ret = get_state_private(&info->block_group_cache,
+					found_key.objectid, &ptr);
+		if (!ret)
+			kfree((void *)(unsigned long)ptr);
+
+		clear_extent_bits(&info->block_group_cache, found_key.objectid,
+				  found_key.objectid + found_key.offset - 1,
+				  (unsigned int)-1, GFP_NOFS);
+
+		key.objectid = found_key.objectid + 1;
+		btrfs_del_item(trans, root, path);
+		btrfs_release_path(root, path);
+	}
+	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
+			   GFP_NOFS);
+	btrfs_commit_transaction(trans, root);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 new_size)
+{
+	struct btrfs_path *path;
+	u64 nr = 0;
+	u64 cur_byte;
+	u64 old_size;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_block_group_item *item;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct extent_map_tree *block_group_cache;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	int ret;
+	int bit;
+
+	old_size = btrfs_super_total_bytes(&info->super_copy);
+	block_group_cache = &info->block_group_cache;
+
+	root = info->extent_root;
+
+	cache = btrfs_lookup_block_group(root->fs_info, old_size - 1);
+
+	cur_byte = cache->key.objectid + cache->key.offset;
+	if (cur_byte >= new_size)
+		goto set_size;
+
+	key.offset = BTRFS_BLOCK_GROUP_SIZE;
+	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while(cur_byte < new_size) {
+		key.objectid = cur_byte;
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+				        sizeof(struct btrfs_block_group_item));
+		BUG_ON(ret);
+		leaf = path->nodes[0];
+		item = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_block_group_item);
+
+		btrfs_set_disk_block_group_used(leaf, item, 0);
+		if (nr % 3) {
+			btrfs_set_disk_block_group_flags(leaf, item,
+						 BTRFS_BLOCK_GROUP_DATA);
+		} else {
+			btrfs_set_disk_block_group_flags(leaf, item, 0);
+		}
+		nr++;
+
+		cache = kmalloc(sizeof(*cache), GFP_NOFS);
+		BUG_ON(!cache);
+
+		read_extent_buffer(leaf, &cache->item, (unsigned long)item,
+				   sizeof(cache->item));
+
+		memcpy(&cache->key, &key, sizeof(key));
+		cache->cached = 0;
+		cache->pinned = 0;
+		cur_byte = key.objectid + key.offset;
+		btrfs_release_path(root, path);
+
+		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
+			bit = BLOCK_GROUP_DATA;
+			cache->data = BTRFS_BLOCK_GROUP_DATA;
+		} else {
+			bit = BLOCK_GROUP_METADATA;
+			cache->data = 0;
+		}
+
+		/* use EXTENT_LOCKED to prevent merging */
+		set_extent_bits(block_group_cache, key.objectid,
+				key.objectid + key.offset - 1,
+				bit | EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(block_group_cache, key.objectid,
+				  (unsigned long)cache);
+	}
+	btrfs_free_path(path);
+set_size:
+	btrfs_set_super_total_bytes(&info->super_copy, new_size);
+	return 0;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f5b7b89b14..f6a20112d9e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2346,7 +2346,7 @@ fail_unlock:
 	return ret;
 }
 
-static unsigned long force_ra(struct address_space *mapping,
+unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index)
 {
@@ -2372,6 +2372,8 @@ int btrfs_defrag_file(struct file *file) {
 	unsigned long ra_index = 0;
 	u64 page_start;
 	u64 page_end;
+	u64 delalloc_start;
+	u64 existing_delalloc;
 	unsigned long i;
 	int ret;
 
@@ -2385,8 +2387,9 @@ int btrfs_defrag_file(struct file *file) {
 	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
 	for (i = 0; i <= last_index; i++) {
 		if (i == ra_index) {
-			ra_index = force_ra(inode->i_mapping, &file->f_ra,
-					    file, ra_index, last_index);
+			ra_index = btrfs_force_ra(inode->i_mapping,
+						  &file->f_ra,
+						  file, ra_index, last_index);
 		}
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
@@ -2404,8 +2407,19 @@ int btrfs_defrag_file(struct file *file) {
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		delalloc_start = page_start;
+		existing_delalloc =
+			count_range_bits(&BTRFS_I(inode)->extent_tree,
+					 &delalloc_start, page_end,
+					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 		set_extent_delalloc(em_tree, page_start,
 				    page_end, GFP_NOFS);
+
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE -
+						 existing_delalloc;
+		spin_unlock(&root->fs_info->delalloc_lock);
+
 		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
@@ -2418,6 +2432,89 @@ out_unlock:
 	return 0;
 }
 
+static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+{
+	u64 new_size;
+	u64 old_size;
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_trans_handle *trans;
+	char *sizestr;
+	int ret = 0;
+	int namelen;
+	int mod = 0;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	namelen = strlen(vol_args->name);
+	if (namelen > BTRFS_VOL_NAME_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sizestr = vol_args->name;
+	if (!strcmp(sizestr, "max"))
+		new_size = root->fs_info->sb->s_bdev->bd_inode->i_size;
+	else {
+		if (sizestr[0] == '-') {
+			mod = -1;
+			sizestr++;
+		} else if (sizestr[0] == '+') {
+			mod = 1;
+			sizestr++;
+		}
+		new_size = btrfs_parse_size(sizestr);
+		if (new_size == 0) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	old_size = btrfs_super_total_bytes(&root->fs_info->super_copy);
+
+	if (mod < 0) {
+		if (new_size > old_size) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		new_size = old_size - new_size;
+	} else if (mod > 0) {
+		new_size = old_size + new_size;
+	}
+
+	if (new_size < 256 * 1024 * 1024) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (new_size > root->fs_info->sb->s_bdev->bd_inode->i_size) {
+		ret = -EFBIG;
+		goto out_unlock;
+	}
+	new_size = (new_size / root->sectorsize) * root->sectorsize;
+
+printk("new size is %Lu\n", new_size);
+	if (new_size > old_size) {
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_grow_extent_tree(trans, root, new_size);
+		btrfs_commit_transaction(trans, root);
+	} else {
+		ret = btrfs_shrink_extent_tree(root, new_size);
+	}
+
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
 static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
@@ -2510,6 +2607,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_snap_create(root, (void __user *)arg);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
+	case BTRFS_IOC_RESIZE:
+		return btrfs_ioctl_resize(root, (void __user *)arg);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8bc47dec286..8c6290665d4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,4 +30,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 88bcdd33f56..e2b9b86af69 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -179,6 +179,11 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
+	if (ret) {
+btrfs_print_leaf(root, path->nodes[0]);
+printk("failed to del %Lu %u %Lu\n", key->objectid, key->type, key->offset);
+
+	}
 	BUG_ON(ret != 0);
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7cab680418a..3bbd7f66569 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -74,9 +74,9 @@ static match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
-static unsigned long parse_size(char *str)
+u64 btrfs_parse_size(char *str)
 {
-	unsigned long res;
+	u64 res;
 	int mult = 1;
 	char *end;
 	char last;
@@ -150,7 +150,8 @@ static int parse_options (char * options,
 			if (info) {
 				char *num = match_strdup(&args[0]);
 				if (num) {
-					info->max_extent = parse_size(num);
+					info->max_extent =
+						btrfs_parse_size(num);
 					kfree(num);
 
 					info->max_extent = max_t(u64,
-- 
cgit v1.2.3


From e52ec0eb62e76ea584c8438ef92933e9b815db80 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 21 Dec 2007 16:36:24 -0500
Subject: Btrfs: Fix NULL block groups on reading the inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 fs/btrfs/inode.c       | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de0fb0743cf..a421069e998 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1630,11 +1630,11 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-
+#if 0
 	new_hint = max(hint_byte, 16ULL * 1024 * 1024 * 1024);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
 		hint_byte = new_hint;
-
+#endif
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte, ins,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f6a20112d9e..3c7bf3bddca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -402,6 +402,10 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
 
+	if (!BTRFS_I(inode)->block_group) {
+		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
+						         NULL, 0, 0, 0);
+	}
 	btrfs_free_path(path);
 	inode_item = NULL;
 
-- 
cgit v1.2.3


From 8f662a76c6af8eb367fa519e9bb9766040d9cea8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Jan 2008 10:01:11 -0500
Subject: Btrfs: Add readahead to the online shrinker, and a mount -o
 alloc_start= for testing

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  9 ++++++---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c |  9 +++++----
 fs/btrfs/inode.c       |  2 +-
 fs/btrfs/super.c       | 16 +++++++++++++++-
 5 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2b3ffa707ea..8fa92a2d981 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2654,9 +2654,9 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
+	u64 bytenr;
 	int slot;
 	int level = 1;
-	u64 bytenr;
 	struct extent_buffer *c;
 	struct extent_buffer *next = NULL;
 
@@ -2687,11 +2687,14 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		level--;
 		c = path->nodes[level];
 		free_extent_buffer(c);
+		slot = btrfs_header_nritems(next);
+		if (slot != 0)
+			slot--;
 		path->nodes[level] = next;
-		path->slots[level] = 0;
+		path->slots[level] = slot;
 		if (!level)
 			break;
-		next = read_tree_block(root, btrfs_node_blockptr(next, 0),
+		next = read_tree_block(root, btrfs_node_blockptr(next, slot),
 				       btrfs_level_size(root, level - 1));
 	}
 	return 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b83a1bec346..d6e5c19969b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -324,6 +324,7 @@ struct btrfs_fs_info {
 	u64 last_trans_committed;
 	unsigned long mount_opt;
 	u64 max_extent;
+	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a421069e998..d6243c2afc7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1630,11 +1630,11 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-#if 0
-	new_hint = max(hint_byte, 16ULL * 1024 * 1024 * 1024);
+
+	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
 		hint_byte = new_hint;
-#endif
+
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte, ins,
@@ -2239,7 +2239,7 @@ static int relocate_one_reference(struct btrfs_root *extent_root,
 		free_extent_buffer(eb);
 
 		path->lowest_level = level;
-		path->reada = 0;
+		path->reada = 2;
 		ret = btrfs_search_slot(trans, found_root, &found_key, path,
 					0, 1);
 		path->lowest_level = 0;
@@ -2372,6 +2372,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	block_group_cache = &info->block_group_cache;
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
+	path->reada = 2;
 
 again:
 	total_found = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3c7bf3bddca..d6de4696188 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2058,7 +2058,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3bbd7f66569..f1b56eb77b1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -63,7 +63,8 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_err,
+	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
+	Opt_alloc_start, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -71,6 +72,7 @@ static match_table_t tokens = {
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_max_extent, "max_extent=%s"},
+	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_err, NULL}
 };
 
@@ -162,6 +164,18 @@ static int parse_options (char * options,
 				}
 			}
 			break;
+		case Opt_alloc_start:
+			if (info) {
+				char *num = match_strdup(&args[0]);
+				if (num) {
+					info->alloc_start =
+						btrfs_parse_size(num);
+					kfree(num);
+					printk("btrfs: allocations start at "
+					       "%Lu\n", info->alloc_start);
+				}
+			}
+			break;
 		default:
 			break;
 		}
-- 
cgit v1.2.3


From 56b453c92fdf51fd3283a2dc2dfbedf36f516031 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 09:08:27 -0500
Subject: Btrfs: Explicitly send a root objectid to count_snapshots_in_path

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 16 ++++++----------
 fs/btrfs/inode.c       | 39 +++++----------------------------------
 2 files changed, 11 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d6243c2afc7..91c2b27d968 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -688,7 +688,7 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 	struct btrfs_path *path;
 	u64 bytenr;
 	u64 found_objectid;
-	u64 root_objectid = 0;
+	u64 root_objectid = root->root_key.objectid;
 	u32 total_count = 0;
 	u32 cur_count;
 	u32 refs;
@@ -749,15 +749,11 @@ again:
 					  struct btrfs_extent_ref);
 		found_objectid = btrfs_ref_root(l, ref_item);
 
-		if (found_objectid != root_objectid)
-			total_count++;
-
-		if (total_count > 1)
-			goto out;
-
-		if (root_objectid == 0)
-			root_objectid = found_objectid;
-
+		if (found_objectid != root_objectid) {
+			total_count = 2;
+			break;
+		}
+		total_count = 1;
 		path->slots[0]++;
 	}
 	if (cur_count == 0) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d6de4696188..5003a86510a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1770,6 +1770,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 again:
 	em = lookup_extent_mapping(em_tree, start, end);
 	if (em) {
+		if (em->start > start) {
+			printk("get_extent start %Lu em start %Lu\n",
+			       start, em->start);
+			WARN_ON(1);
+		}
 		goto out;
 	}
 	if (!em) {
@@ -1952,23 +1957,6 @@ static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 	return extent_bmap(mapping, iblock, btrfs_get_extent);
 }
 
-static int btrfs_prepare_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	int err;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	err = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	if (err)
-		return -ENOSPC;
-
-	return extent_prepare_write(&BTRFS_I(page->mapping->host)->extent_tree,
-				    page->mapping->host, page, from, to,
-				    btrfs_get_extent);
-}
-
 int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_map_tree *tree;
@@ -2120,21 +2108,6 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_btree_balance_dirty(root, nr);
 }
 
-int btrfs_commit_write(struct file *file, struct page *page,
-		       unsigned from, unsigned to)
-{
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-	struct inode *inode = page->mapping->host;
-
-	btrfs_cow_one_page(inode, page, PAGE_CACHE_SIZE);
-
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
-}
-
 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
 {
 	struct btrfs_trans_handle *trans;
@@ -2930,8 +2903,6 @@ static struct address_space_operations btrfs_aops = {
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
-	.prepare_write	= btrfs_prepare_write,
-	.commit_write	= btrfs_commit_write,
 	.bmap		= btrfs_bmap,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-- 
cgit v1.2.3


From 4313b3994d719fcdeb7e661473019ca3d62e829b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 09:08:48 -0500
Subject: Btrfs: Reduce stack usage in the resizer, fix 32 bit compiles

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  5 ++++
 fs/btrfs/extent-tree.c | 74 ++++++++++++++++++++++++++------------------------
 fs/btrfs/inode.c       |  6 ++--
 fs/btrfs/transaction.c | 20 ++++++++++----
 5 files changed, 63 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d6e5c19969b..9873975ce0e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -386,6 +386,7 @@ struct btrfs_root {
 	int defrag_running;
 	int defrag_level;
 	char *name;
+	int in_sysfs;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index de026d9d9b2..67d9fd72886 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -374,6 +374,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->highest_inode = 0;
 	root->last_inode_alloc = 0;
 	root->name = NULL;
+	root->in_sysfs = 0;
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -516,6 +517,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	if (!root)
 		return NULL;
 
+	if (root->in_sysfs)
+		return root;
+
 	ret = btrfs_set_root_name(root, name, namelen);
 	if (ret) {
 		free_extent_buffer(root->node);
@@ -530,6 +534,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	root->in_sysfs = 1;
 	return root;
 }
 #if 0
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 91c2b27d968..6137f06091e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -751,7 +751,7 @@ again:
 
 		if (found_objectid != root_objectid) {
 			total_count = 2;
-			break;
+			goto out;
 		}
 		total_count = 1;
 		path->slots[0]++;
@@ -760,8 +760,6 @@ again:
 		total_count = 0;
 		goto out;
 	}
-	if (total_count > 1)
-		goto out;
 	if (level >= 0 && root->node == count_path->nodes[level])
 		goto out;
 	level++;
@@ -2109,22 +2107,23 @@ static int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
 	u64 delalloc_start;
 	u64 existing_delalloc;
 	unsigned long last_index;
-	unsigned long first_index;
 	unsigned long i;
 	struct page *page;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct file_ra_state ra;
+	struct file_ra_state *ra;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
 	mutex_lock(&inode->i_mutex);
-	first_index = start >> PAGE_CACHE_SHIFT;
+	i = start >> PAGE_CACHE_SHIFT;
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-	memset(&ra, 0, sizeof(ra));
-	file_ra_state_init(&ra, inode->i_mapping);
-	btrfs_force_ra(inode->i_mapping, &ra, NULL, first_index, last_index);
+	file_ra_state_init(ra, inode->i_mapping);
+	btrfs_force_ra(inode->i_mapping, ra, NULL, i, last_index);
+	kfree(ra);
 
-	for (i = first_index; i <= last_index; i++) {
+	for (; i <= last_index; i++) {
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
 			goto out_unlock;
@@ -2167,27 +2166,43 @@ out_unlock:
 	return 0;
 }
 
+/*
+ * note, this releases the path
+ */
 static int relocate_one_reference(struct btrfs_root *extent_root,
 				  struct btrfs_path *path,
-				  struct btrfs_key *extent_key,
-				  u64 ref_root, u64 ref_gen, u64 ref_objectid,
-				  u64 ref_offset)
+				  struct btrfs_key *extent_key)
 {
 	struct inode *inode;
 	struct btrfs_root *found_root;
-	struct btrfs_key root_location;
+	struct btrfs_key *root_location;
+	struct btrfs_extent_ref *ref;
+	u64 ref_root;
+	u64 ref_gen;
+	u64 ref_objectid;
+	u64 ref_offset;
 	int ret;
 
-	root_location.objectid = ref_root;
+	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			     struct btrfs_extent_ref);
+	ref_root = btrfs_ref_root(path->nodes[0], ref);
+	ref_gen = btrfs_ref_generation(path->nodes[0], ref);
+	ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
+	ref_offset = btrfs_ref_offset(path->nodes[0], ref);
+	btrfs_release_path(extent_root, path);
+
+	root_location = kmalloc(sizeof(*root_location), GFP_NOFS);
+	root_location->objectid = ref_root;
 	if (ref_gen == 0)
-		root_location.offset = 0;
+		root_location->offset = 0;
 	else
-		root_location.offset = (u64)-1;
-	root_location.type = BTRFS_ROOT_ITEM_KEY;
+		root_location->offset = (u64)-1;
+	root_location->type = BTRFS_ROOT_ITEM_KEY;
 
 	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
-						&root_location);
+						root_location);
 	BUG_ON(!found_root);
+	kfree(root_location);
 
 	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 		mutex_unlock(&extent_root->fs_info->fs_mutex);
@@ -2259,12 +2274,7 @@ static int relocate_one_extent(struct btrfs_root *extent_root,
 {
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_extent_ref *ref;
 	struct extent_buffer *leaf;
-	u64 ref_root;
-	u64 ref_gen;
-	u64 ref_objectid;
-	u64 ref_offset;
 	u32 nritems;
 	u32 item_size;
 	int ret = 0;
@@ -2297,17 +2307,7 @@ static int relocate_one_extent(struct btrfs_root *extent_root,
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 
-		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		ref_root = btrfs_ref_root(leaf, ref);
-		ref_gen = btrfs_ref_generation(leaf, ref);
-		ref_objectid = btrfs_ref_objectid(leaf, ref);
-		ref_offset = btrfs_ref_offset(leaf, ref);
-		btrfs_release_path(extent_root, path);
-
-		ret = relocate_one_reference(extent_root, path,
-					     extent_key, ref_root, ref_gen,
-					     ref_objectid, ref_offset);
+		ret = relocate_one_reference(extent_root, path, extent_key);
 		if (ret)
 			goto out;
 	}
@@ -2354,7 +2354,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	struct btrfs_path *path;
 	u64 cur_byte;
 	u64 total_found;
-	u64 ptr;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct extent_map_tree *block_group_cache;
 	struct btrfs_key key;
@@ -2377,6 +2376,7 @@ again:
 	key.offset = 0;
 	key.type = 0;
 	while(1) {
+
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -2441,6 +2441,8 @@ next:
 	key.offset = 0;
 	key.type = 0;
 	while(1) {
+		u64 ptr;
+
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5003a86510a..b62f35e862b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2108,7 +2108,8 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_btree_balance_dirty(root, nr);
 }
 
-static int create_subvol(struct btrfs_root *root, char *name, int namelen)
+static int noinline create_subvol(struct btrfs_root *root, char *name,
+				  int namelen)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
@@ -2492,7 +2493,8 @@ out:
 	return ret;
 }
 
-static int btrfs_ioctl_snap_create(struct btrfs_root *root, void __user *arg)
+static int noinline btrfs_ioctl_snap_create(struct btrfs_root *root,
+					    void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_dir_item *di;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1ad611b9f61..02721eea9a7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -481,12 +481,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
-	struct extent_map_tree pinned_copy;
+	struct extent_map_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
 
-	extent_map_tree_init(&pinned_copy,
-			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&dirty_fs_roots);
 
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -507,6 +505,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
+
+	pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
+	if (!pinned_copy)
+		return -ENOMEM;
+
+	extent_map_tree_init(pinned_copy,
+			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
+
 	trans->transaction->in_commit = 1;
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -568,16 +574,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			    &root->fs_info->super_copy, 0,
 			    sizeof(root->fs_info->super_copy));
 
-	btrfs_copy_pinned(root, &pinned_copy);
+	btrfs_copy_pinned(root, pinned_copy);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
+
 	mutex_lock(&root->fs_info->fs_mutex);
-	btrfs_finish_extent_commit(trans, root, &pinned_copy);
+	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
+
+	kfree(pinned_copy);
+
 	cur_trans->commit_done = 1;
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
-- 
cgit v1.2.3


From f9ef6604ace23a6fcd698e08b58a883d6009157b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 09:22:38 -0500
Subject: Btrfs: 32 bit compile fixes for the resizer and enospc checks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  4 +++-
 fs/btrfs/inode.c       | 10 +++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6137f06091e..54f2b1d8bdf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2501,6 +2501,7 @@ int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 	u64 nr = 0;
 	u64 cur_byte;
 	u64 old_size;
+	unsigned long rem;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_block_group_item *item;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -2538,7 +2539,8 @@ int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 				      struct btrfs_block_group_item);
 
 		btrfs_set_disk_block_group_used(leaf, item, 0);
-		if (nr % 3) {
+		div_long_long_rem(nr, 3, &rem);
+		if (rem) {
 			btrfs_set_disk_block_group_flags(leaf, item,
 						 BTRFS_BLOCK_GROUP_DATA);
 		} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b62f35e862b..8a2001bbf10 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -81,9 +81,11 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	int ret = 0;
 
 	if (for_del)
-		thresh = (total * 90) / 100;
+		thresh = total * 90;
 	else
-		thresh = (total * 85) / 100;
+		thresh = total * 85;
+
+	do_div(thresh, 100);
 
 	spin_lock(&root->fs_info->delalloc_lock);
 	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
@@ -2475,7 +2477,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		ret = -EFBIG;
 		goto out_unlock;
 	}
-	new_size = (new_size / root->sectorsize) * root->sectorsize;
+
+	do_div(new_size, root->sectorsize);
+	new_size *= root->sectorsize;
 
 printk("new size is %Lu\n", new_size);
 	if (new_size > old_size) {
-- 
cgit v1.2.3


From 98ed51747b63435b9987ef12692a75c223818bbe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 10:01:48 -0500
Subject: Btrfs: Force inlining off in a few places to save stack usage

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 21 ++++++++--------
 fs/btrfs/extent-tree.c | 66 ++++++++++++++++++++++++++++----------------------
 fs/btrfs/file.c        | 16 +++++-------
 3 files changed, 54 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8fa92a2d981..35c57074a37 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -548,8 +548,8 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
-static int check_block(struct btrfs_root *root, struct btrfs_path *path,
-			int level)
+static int noinline check_block(struct btrfs_root *root,
+				struct btrfs_path *path, int level)
 {
 	return 0;
 #if 0
@@ -676,8 +676,9 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		       btrfs_level_size(root, btrfs_header_level(parent) - 1));
 }
 
-static int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, struct btrfs_path *path, int level)
+static int balance_level(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 struct btrfs_path *path, int level)
 {
 	struct extent_buffer *right = NULL;
 	struct extent_buffer *mid;
@@ -868,9 +869,9 @@ enospc:
 }
 
 /* returns zero if the push worked, non-zero otherwise */
-static int push_nodes_for_insert(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path, int level)
+static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, int level)
 {
 	struct extent_buffer *right = NULL;
 	struct extent_buffer *mid;
@@ -1207,8 +1208,8 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
  * error, and > 0 if there was no room in the left hand block.
  */
-static int push_node_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct extent_buffer *dst,
+static int push_node_left(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *dst,
 			  struct extent_buffer *src)
 {
 	int push_items = 0;
@@ -1309,7 +1310,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  *
  * returns zero on success or < 0 on failure.
  */
-static int insert_new_root(struct btrfs_trans_handle *trans,
+static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 54f2b1d8bdf..4957cface9a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -162,10 +162,11 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 		return block_group;
 	return NULL;
 }
-static u64 find_search_start(struct btrfs_root *root,
-			     struct btrfs_block_group_cache **cache_ret,
-			     u64 search_start, int num,
-			     int data, int full_scan)
+
+static u64 noinline find_search_start(struct btrfs_root *root,
+			      struct btrfs_block_group_cache **cache_ret,
+			      u64 search_start, int num,
+			      int data, int full_scan)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
@@ -393,11 +394,12 @@ static int match_extent_ref(struct extent_buffer *leaf,
 	return ret == 0;
 }
 
-static int lookup_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path, u64 bytenr,
-				 u64 root_objectid, u64 ref_generation,
-				 u64 owner, u64 owner_offset, int del)
+static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 bytenr,
+					  u64 root_objectid,
+					  u64 ref_generation, u64 owner,
+					  u64 owner_offset, int del)
 {
 	u64 hash;
 	struct btrfs_key key;
@@ -1116,8 +1118,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int finish_current_insert(struct btrfs_trans_handle *trans, struct
-				 btrfs_root *extent_root)
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *extent_root)
 {
 	u64 start;
 	u64 end;
@@ -1360,11 +1362,13 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *orig_root, u64 num_bytes, u64 empty_size,
-			    u64 search_start, u64 search_end, u64 hint_byte,
-			    struct btrfs_key *ins, u64 exclude_start,
-			    u64 exclude_nr, int data)
+static int noinline find_free_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *orig_root,
+				     u64 num_bytes, u64 empty_size,
+				     u64 search_start, u64 search_end,
+				     u64 hint_byte, struct btrfs_key *ins,
+				     u64 exclude_start, u64 exclude_nr,
+				     int data)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1760,8 +1764,9 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
-static int drop_leaf_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root, struct extent_buffer *leaf)
+static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -1802,8 +1807,8 @@ static int drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static void reada_walk_down(struct btrfs_root *root,
-			    struct extent_buffer *node)
+static void noinline reada_walk_down(struct btrfs_root *root,
+				     struct extent_buffer *node)
 {
 	int i;
 	u32 nritems;
@@ -1835,8 +1840,9 @@ static void reada_walk_down(struct btrfs_root *root,
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
-static int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int *level)
+static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level)
 {
 	u64 root_owner;
 	u64 root_gen;
@@ -1955,8 +1961,9 @@ out:
  * to find the first node higher up where we haven't yet gone through
  * all the slots
  */
-static int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct btrfs_path *path, int *level)
+static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level)
 {
 	u64 root_owner;
 	u64 root_gen;
@@ -2100,7 +2107,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	return 0;
 }
 
-static int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+static int noinline relocate_inode_pages(struct inode *inode, u64 start,
+					 u64 len)
 {
 	u64 page_start;
 	u64 page_end;
@@ -2169,7 +2177,7 @@ out_unlock:
 /*
  * note, this releases the path
  */
-static int relocate_one_reference(struct btrfs_root *extent_root,
+static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 				  struct btrfs_path *path,
 				  struct btrfs_key *extent_key)
 {
@@ -2268,9 +2276,9 @@ out:
 	return 0;
 }
 
-static int relocate_one_extent(struct btrfs_root *extent_root,
-			       struct btrfs_path *path,
-			       struct btrfs_key *extent_key)
+static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+					struct btrfs_path *path,
+					struct btrfs_key *extent_key)
 {
 	struct btrfs_key key;
 	struct btrfs_key found_key;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 71dc2d33b6c..c9ebd4fe8f7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -80,7 +80,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-static int insert_inline_extent(struct btrfs_trans_handle *trans,
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 offset, size_t size,
 				struct page **pages, size_t page_offset,
@@ -221,7 +221,7 @@ fail:
 	return err;
 }
 
-static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
+static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
 				   struct page **pages,
@@ -653,14 +653,10 @@ out:
 /*
  * this gets pages into the page cache and locks them down
  */
-static int prepare_pages(struct btrfs_root *root,
-			 struct file *file,
-			 struct page **pages,
-			 size_t num_pages,
-			 loff_t pos,
-			 unsigned long first_index,
-			 unsigned long last_index,
-			 size_t write_bytes)
+static int prepare_pages(struct btrfs_root *root, struct file *file,
+			 struct page **pages, size_t num_pages,
+			 loff_t pos, unsigned long first_index,
+			 unsigned long last_index, size_t write_bytes)
 {
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
-- 
cgit v1.2.3


From bd09835d9aad9c7b664cddc8435cc37b86077971 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 13:23:19 -0500
Subject: count_snapshots: Properly update the leaf pointer after
 btrfs_next_leaf

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 fs/btrfs/inode.c       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4957cface9a..b801eb4b3ee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -693,7 +693,6 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 	u64 root_objectid = root->root_key.objectid;
 	u32 total_count = 0;
 	u32 cur_count;
-	u32 refs;
 	u32 nritems;
 	int ret;
 	struct btrfs_key key;
@@ -729,8 +728,8 @@ again:
 	}
 
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	refs = btrfs_extent_refs(l, item);
 	while (1) {
+		l = path->nodes[0];
 		nritems = btrfs_header_nritems(l);
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(extent_root, path);
@@ -741,6 +740,7 @@ again:
 		btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
 		if (found_key.objectid != bytenr)
 			break;
+
 		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
 			path->slots[0]++;
 			continue;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8a2001bbf10..6e2d03626fb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -208,7 +208,7 @@ again:
 		}
 
 		start = extent_end;
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	} else {
 		goto not_found;
 	}
 loop:
-- 
cgit v1.2.3


From 5b92ee7204a5fb6542b204831202adbc1a7a851a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 13:46:11 -0500
Subject: Btrfs: Fix lock ordering of the snapshot semaphore against the page
 lock

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c9ebd4fe8f7..e862292bdfc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -251,7 +251,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	num_bytes = (write_bytes + pos - start_pos +
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
@@ -356,7 +355,6 @@ out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
 	free_extent_map(em);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 	return err;
 }
 
@@ -726,6 +724,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
+	down_read(&BTRFS_I(inode)->root->snap_sem);
+
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -804,6 +804,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	up_read(&BTRFS_I(inode)->root->snap_sem);
+
 out_nolock:
 	kfree(pages);
 	if (pinned[0])
-- 
cgit v1.2.3


From d548ee5182bda42c87bdd4e28b539d5e436e206a Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 3 Jan 2008 13:56:30 -0500
Subject: Btrfs: Add a helper that finds previous extent item

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 80 +++++++++++++++++++++++---------------------------
 1 file changed, 36 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b801eb4b3ee..cc89df4493a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,28 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
+static int find_previous_extent(struct btrfs_root *root,
+				struct btrfs_path *path)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	while(1) {
+		if (path->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
+			return 0;
+	}
+	return 1;
+}
 
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
@@ -65,16 +87,19 @@ static int cache_block_group(struct btrfs_root *root,
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
-
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
 	if (ret < 0)
 		return ret;
-
-	if (ret && path->slots[0] > 0)
-		path->slots[0]--;
-
+	ret = find_previous_extent(root, path);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid + key.offset > first_free)
+			first_free = key.objectid + key.offset;
+	}
 	while(1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -88,15 +113,10 @@ static int cache_block_group(struct btrfs_root *root,
 				break;
 			}
 		}
-
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid < block_group->key.objectid) {
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_REF_KEY &&
-			    key.objectid + key.offset > first_free)
-				first_free = key.objectid + key.offset;
 			goto next;
 		}
-
 		if (key.objectid >= block_group->key.objectid +
 		    block_group->key.offset) {
 			break;
@@ -162,11 +182,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 		return block_group;
 	return NULL;
 }
-
 static u64 noinline find_search_start(struct btrfs_root *root,
 			      struct btrfs_block_group_cache **cache_ret,
-			      u64 search_start, int num,
-			      int data, int full_scan)
+			      u64 search_start, int num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
@@ -771,9 +789,7 @@ again:
 out:
 	btrfs_free_path(path);
 	return total_count;
-
 }
-
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner_objectid)
 {
@@ -1422,7 +1438,7 @@ check_failed:
 						       orig_search_start);
 	}
 	search_start = find_search_start(root, &block_group, search_start,
-					 total_needed, data, full_scan);
+					 total_needed, data);
 	search_start = stripe_align(root, search_start);
 	cached_start = search_start;
 	btrfs_init_path(path);
@@ -1434,35 +1450,11 @@ check_failed:
 	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
 	if (ret < 0)
 		goto error;
-
-	if (path->slots[0] > 0) {
-		path->slots[0]--;
-	}
-
+	ret = find_previous_extent(root, path);
+	if (ret < 0)
+		goto error;
 	l = path->nodes[0];
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-
-	/*
-	 * walk backwards to find the first extent item key
-	 */
-	while(btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
-		if (path->slots[0] == 0) {
-			ret = btrfs_prev_leaf(root, path);
-			if (ret != 0) {
-				ret = btrfs_search_slot(trans, root, ins,
-							path, 0, 0);
-				if (ret < 0)
-					goto error;
-				if (path->slots[0] > 0)
-					path->slots[0]--;
-				break;
-			}
-		} else {
-			path->slots[0]--;
-		}
-		l = path->nodes[0];
-		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	}
 	while (1) {
 		l = path->nodes[0];
 		slot = path->slots[0];
-- 
cgit v1.2.3


From 73e48b277a3fcd647f1936c71aea2f8d450ecc74 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 3 Jan 2008 14:14:39 -0500
Subject: Btrfs: Properly handle overlapping extent in shrink_extent_tree

The patch fixes the overlapping extent issue in shrink_extent_tree.
It checks whether there is an overlapping extent by using
find_previous_extent. If there is an overlapping extent, it sets up
key.objectid and cur_byte properly.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 140 ++++++++++++++++++++-----------------------------
 1 file changed, 56 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cc89df4493a..426a0222e12 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2317,36 +2317,6 @@ out:
 	return ret;
 }
 
-static int find_overlapping_extent(struct btrfs_root *root,
-				   struct btrfs_path *path, u64 new_size)
-{
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	int ret;
-
-	while(1) {
-		if (path->slots[0] == 0) {
-			ret = btrfs_prev_leaf(root, path);
-			if (ret == 1) {
-				return 1;
-			}
-			if (ret < 0)
-				return ret;
-		} else {
-			path->slots[0]--;
-		}
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == BTRFS_EXTENT_ITEM_KEY) {
-			if (found_key.objectid + found_key.offset > new_size)
-				return 0;
-			else
-				return 1;
-		}
-	}
-	return 1;
-}
-
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 {
 	struct btrfs_trans_handle *trans;
@@ -2357,11 +2327,10 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	struct btrfs_fs_info *info = root->fs_info;
 	struct extent_map_tree *block_group_cache;
 	struct btrfs_key key;
-	struct btrfs_key found_key = { 0, 0, 0 };
+	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
-	int slot;
 
 	btrfs_set_super_total_bytes(&info->super_copy, new_size);
 	block_group_cache = &info->block_group_cache;
@@ -2372,48 +2341,54 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 again:
 	total_found = 0;
 	key.objectid = new_size;
-	cur_byte = key.objectid;
 	key.offset = 0;
 	key.type = 0;
-	while(1) {
+	cur_byte = key.objectid;
 
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+
+	ret = find_previous_extent(root, path);
+	if (ret < 0)
+		goto out;
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid + found_key.offset > new_size) {
+			cur_byte = found_key.objectid;
+			key.objectid = cur_byte;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
-next:
+
 		leaf = path->nodes[0];
-		if (key.objectid == new_size - 1) {
-			ret = find_overlapping_extent(root, path, new_size);
-			if (ret != 0) {
-				btrfs_release_path(root, path);
-				ret = btrfs_search_slot(NULL, root, &key,
-							path, 0, 0);
-				if (ret < 0)
-					goto out;
+		nritems = btrfs_header_nritems(leaf);
+next:
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				ret = 0;
+				break;
 			}
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
 		}
-		nritems = btrfs_header_nritems(leaf);
-		ret = 0;
-		slot = path->slots[0];
-		if (slot < nritems)
-			btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (slot == nritems ||
-		    btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY) {
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
+		    found_key.objectid + found_key.offset <= cur_byte) {
 			path->slots[0]++;
-			if (path->slots[0] >= nritems) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret < 0)
-					goto out;
-				if (ret == 1) {
-					ret = 0;
-					break;
-				}
-			}
 			goto next;
 		}
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (found_key.objectid + found_key.offset <= cur_byte)
-			continue;
+
 		total_found++;
 		cur_byte = found_key.objectid + found_key.offset;
 		key.objectid = cur_byte;
@@ -2446,33 +2421,30 @@ next:
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0)
 			goto out;
-bg_next:
+
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		ret = 0;
-		slot = path->slots[0];
-		if (slot < nritems)
-			btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (slot == nritems ||
-		    btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			if (slot < nritems) {
-				printk("shrinker found key %Lu %u %Lu\n",
-				       found_key.objectid, found_key.type,
-				       found_key.offset);
-				path->slots[0]++;
-			}
-			if (path->slots[0] >= nritems) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret < 0)
-					break;
-				if (ret == 1) {
-					ret = 0;
-					break;
-				}
+bg_next:
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				break;
+			if (ret == 1) {
+				ret = 0;
+				break;
 			}
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			printk("shrinker found key %Lu %u %Lu\n",
+				found_key.objectid, found_key.type,
+				found_key.offset);
+			path->slots[0]++;
 			goto bg_next;
 		}
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 		ret = get_state_private(&info->block_group_cache,
 					found_key.objectid, &ptr);
 		if (!ret)
-- 
cgit v1.2.3


From d666746207a01546e55bdaa4b721d1890faaf6e8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 14:51:00 -0500
Subject: Btrfs: Change st_blocksize to 4k

Some programs (python) do read-modify-write cycles at the granularity
returned by stat.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c     | 2 +-
 fs/btrfs/root-tree.c | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6e2d03626fb..6d0cd9a7a61 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2701,7 +2701,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
 {
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
-	stat->blksize = 256 * 1024;
+	stat->blksize = PAGE_CACHE_SIZE;
 	return 0;
 }
 
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e2b9b86af69..08f19ec8809 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -75,7 +75,14 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
 	if (ret < 0)
 		goto out;
-	BUG_ON(ret != 0);
+
+	if (ret != 0) {
+		btrfs_print_leaf(root, path->nodes[0]);
+		printk("unable to update root key %Lu %u %Lu\n",
+		       key->objectid, key->type, key->offset);
+		BUG_ON(1);
+	}
+
 	l = path->nodes[0];
 	slot = path->slots[0];
 	ptr = btrfs_item_ptr_offset(l, slot);
-- 
cgit v1.2.3


From 2552488345ea0933711010ee2c844792130a4252 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Jan 2008 15:44:57 -0500
Subject: Btrfs: Lower the max inline size to 8k

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e862292bdfc..94c93373cb7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -303,7 +303,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > 32768 ||
+	    inline_size > 8192 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 		u64 existing_delalloc = 0;
-- 
cgit v1.2.3


From 1372f8e609d260c2352bf74e125614eef7aee8ef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 4 Jan 2008 09:34:54 -0500
Subject: Properly call btrfs_search_slot while shrinking

The shrinking code used btrfs_next_leaf to find the next item, but
this does not cow the blocks it touches.  This fix calls search_slot after
finding the next item to do appropriate cow and balancing.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 426a0222e12..c390fd847a2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2434,7 +2434,15 @@ bg_next:
 				break;
 			}
 			leaf = path->nodes[0];
-			nritems = btrfs_header_nritems(leaf);
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+			/*
+			 * btrfs_next_leaf doesn't cow buffers, we have to
+			 * do the search again
+			 */
+			memcpy(&key, &found_key, sizeof(key));
+			btrfs_release_path(root, path);
+			continue;
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-- 
cgit v1.2.3


From 725c8463ea9bcfc2b56cff09b93a90e5bcca59c4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 4 Jan 2008 16:47:16 -0500
Subject: Btrfs: resizer: don't hold the fs_mutex for long periods of time

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c390fd847a2..c906bb19b21 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2331,6 +2331,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
+	int progress = 0;
 
 	btrfs_set_super_total_bytes(&info->super_copy, new_size);
 	block_group_cache = &info->block_group_cache;
@@ -2383,6 +2384,19 @@ next:
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (progress && need_resched()) {
+			memcpy(&key, &found_key, sizeof(key));
+			mutex_unlock(&root->fs_info->fs_mutex);
+			cond_resched();
+			mutex_lock(&root->fs_info->fs_mutex);
+			btrfs_release_path(root, path);
+			btrfs_search_slot(NULL, root, &key, path, 0, 0);
+			progress = 0;
+			goto next;
+		}
+		progress = 1;
+
 		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
 		    found_key.objectid + found_key.offset <= cur_byte) {
 			path->slots[0]++;
@@ -2442,7 +2456,7 @@ bg_next:
 			 */
 			memcpy(&key, &found_key, sizeof(key));
 			btrfs_release_path(root, path);
-			continue;
+			goto resched_check;
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -2465,6 +2479,12 @@ bg_next:
 		key.objectid = found_key.objectid + 1;
 		btrfs_del_item(trans, root, path);
 		btrfs_release_path(root, path);
+resched_check:
+		if (need_resched()) {
+			mutex_unlock(&root->fs_info->fs_mutex);
+			cond_resched();
+			mutex_lock(&root->fs_info->fs_mutex);
+		}
 	}
 	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
 			   GFP_NOFS);
-- 
cgit v1.2.3


From e4204dedbbaa3a614605cb83cc0ac5161af6b4e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:27 -0500
Subject: Btrfs: Change tree block csum tagging to avoid false error messages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 67d9fd72886..a6170ff19e7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -145,10 +145,25 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
-		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
-			printk("btrfs: %s checksum verify failed on %llu\n",
+		int from_this_trans = 0;
+
+		if (root->fs_info->running_transaction &&
+		    btrfs_header_generation(buf) ==
+		    root->fs_info->running_transaction->transid)
+			from_this_trans = 1;
+
+		/* FIXME, this is not good */
+		if (from_this_trans == 0 &&
+		    memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
+			u32 val;
+			u32 found = 0;
+			memcpy(&found, result, BTRFS_CRC32_SIZE);
+
+			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
+			printk("btrfs: %s checksum verify failed on %llu "
+			       "wanted %X found %X from_this_trans %d\n",
 			       root->fs_info->sb->s_id,
-			       buf->start);
+			       buf->start, val, found, from_this_trans);
 			return 1;
 		}
 	} else {
@@ -313,6 +328,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_map_tree *extent_tree;
+	u64 end;
 	int ret;
 
 	extent_tree = &BTRFS_I(btree_inode)->extent_tree;
@@ -322,19 +338,29 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 				 buf, 0, 1);
-	if (buf->flags & EXTENT_CSUM) {
+
+	if (buf->flags & EXTENT_CSUM)
 		return buf;
-	}
-	if (test_range_bit(extent_tree, buf->start, buf->start + buf->len - 1,
-			   EXTENT_CSUM, 1)) {
+
+	end = buf->start + PAGE_CACHE_SIZE - 1;
+	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
 		return buf;
 	}
+
+	lock_extent(extent_tree, buf->start, end, GFP_NOFS);
+
+	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
+		buf->flags |= EXTENT_CSUM;
+		goto out_unlock;
+	}
+
 	ret = csum_tree_block(root, buf, 1);
-	set_extent_bits(extent_tree, buf->start,
-			buf->start + buf->len - 1,
-			EXTENT_CSUM, GFP_NOFS);
+	set_extent_bits(extent_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
+
+out_unlock:
+	unlock_extent(extent_tree, buf->start, end, GFP_NOFS);
 	return buf;
 }
 
-- 
cgit v1.2.3


From dc17ff8f11d129db9e83ab7244769e4eae05e14d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:30 -0500
Subject: Btrfs: Add data=ordered support

This forces file data extents down to disk along with the metadata that
references them.  The current implementation is fairly simple, and just
writes out all of the dirty pages in an inode before the commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/btrfs_inode.h  |   1 +
 fs/btrfs/ctree.c        |   6 +-
 fs/btrfs/ctree.h        |   7 +-
 fs/btrfs/disk-io.c      |  16 +++-
 fs/btrfs/disk-io.h      |   2 +
 fs/btrfs/extent-tree.c  |   4 +-
 fs/btrfs/file.c         |   5 +-
 fs/btrfs/inode.c        |  46 ++++++----
 fs/btrfs/ordered-data.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |  39 +++++++++
 fs/btrfs/transaction.c  |  58 +++++++++++++
 fs/btrfs/transaction.h  |   8 +-
 13 files changed, 387 insertions(+), 28 deletions(-)
 create mode 100644 fs/btrfs/ordered-data.c
 create mode 100644 fs/btrfs/ordered-data.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5804c5ca07..ab9a9f8edbf 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d1d5af471c3..f27e633f174 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ struct btrfs_inode {
 	struct extent_map_tree extent_tree;
 	struct inode vfs_inode;
 
+	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c57074a37..43d23148a4f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -221,7 +221,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
+	u64 header_trans;
 	int ret;
+
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -232,7 +234,9 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (btrfs_header_generation(buf) == trans->transid) {
+
+	header_trans = btrfs_header_generation(buf);
+	if (header_trans == trans->transid) {
 		*cow_ret = buf;
 		return 0;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9873975ce0e..b55dba58dfa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,8 +16,8 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __BTRFS__
-#define __BTRFS__
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
 
 #include <linux/version.h>
 #include <linux/mm.h>
@@ -363,7 +363,6 @@ struct btrfs_root {
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
-	struct rw_semaphore snap_sem;
 	u64 objectid;
 	u64 last_trans;
 
@@ -1142,6 +1141,8 @@ void btrfs_destroy_cachep(void);
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6170ff19e7..34cf1f1f47b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -406,7 +406,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
 	init_completion(&root->kobj_unregister);
-	init_rwsem(&root->snap_sem);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
@@ -498,6 +497,21 @@ insert:
 	return root;
 }
 
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_root *root;
+
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_objectid);
+	return root;
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c3cfd02901..dae9fba8efc 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -34,6 +34,8 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c906bb19b21..68137cd8506 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1195,7 +1195,9 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			if (btrfs_buffer_uptodate(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
-				if (btrfs_header_generation(buf) == transid) {
+				u64 header_transid =
+					btrfs_header_generation(buf);
+				if (header_transid == transid) {
 					free_extent_buffer(buf);
 					return 1;
 				}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 94c93373cb7..0a5f4defe59 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,6 +34,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 
@@ -329,6 +330,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
 					  start_pos) - existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
+		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -724,8 +726,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
-
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -804,7 +804,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d0cd9a7a61..6d6e1ac0a9a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,6 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+	btrfs_add_ordered_inode(inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -367,8 +368,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
-
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret)
 		goto make_bad;
@@ -898,7 +899,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
 
-	down_read(&root->snap_sem);
 	ret = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -917,7 +917,6 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	unlock_page(page);
 	page_cache_release(page);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 out:
 	return ret;
 }
@@ -1146,6 +1145,19 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid)
+{
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid);
+
+	if (!args.root)
+		return NULL;
+
+	return ilookup5(s, objectid, btrfs_find_actor, (void *)&args);
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
@@ -1336,7 +1348,6 @@ read_dir_items:
 
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
 			over = filldir(dirent, name_ptr, name_len,
 				       found_key.offset,
 				       location.objectid,
@@ -2054,7 +2065,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 
 	ret = -EINVAL;
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
@@ -2075,7 +2085,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 	unlock_page(page);
 out:
 	return ret;
@@ -2118,7 +2127,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
-	struct btrfs_root *new_root;
+	struct btrfs_root *new_root = root;
 	struct inode *inode;
 	struct inode *dir;
 	int ret;
@@ -2230,7 +2239,7 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 		goto fail;
 fail:
 	nr = trans->blocks_used;
-	err = btrfs_commit_transaction(trans, root);
+	err = btrfs_commit_transaction(trans, new_root);
 	if (err && !ret)
 		ret = err;
 fail_commit:
@@ -2253,10 +2262,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	down_write(&root->snap_sem);
-	freeze_bdev(root->fs_info->sb->s_bdev);
-	thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
@@ -2264,6 +2269,9 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
+	err = btrfs_commit_transaction(trans, root);
+
+	trans = btrfs_start_transaction(root, 1);
 
 	ret = btrfs_update_inode(trans, root, root->inode);
 	if (ret)
@@ -2272,9 +2280,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
 	if (ret)
-		goto fail;
-
-	memcpy(&new_root_item, &root->root_item,
+		goto fail; memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
 	key.objectid = objectid;
@@ -2285,12 +2291,20 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	free_extent_buffer(tmp);
 
+	/* write the ordered inodes to force all delayed allocations to
+	 * be filled.  Once this is done, we can copy the root
+	 */
+	mutex_lock(&root->fs_info->trans_mutex);
+	btrfs_write_ordered_inodes(trans, root);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
 	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
 
 	btrfs_set_root_bytenr(&new_root_item, tmp->start);
 	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
+printk("new root %Lu node %Lu\n", objectid, tmp->start);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
@@ -2321,7 +2335,6 @@ fail:
 		ret = err;
 fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2608,6 +2621,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->ordered_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 00000000000..411aba84d30
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+
+struct tree_entry {
+	u64 root_objectid;
+	u64 objectid;
+	struct rb_node rb_node;
+};
+
+/*
+ * returns > 0 if the (root_objectid, objectid) pair passed in is > entry,
+ * < 0 if the pair is < entry, and zero if they are equal
+ */
+static int comp_entry(struct tree_entry *entry, u64 root_objectid,
+		      u64 objectid)
+{
+	if (root_objectid < entry->root_objectid)
+		return -1;
+	if (root_objectid > entry->root_objectid)
+		return 1;
+	if (objectid < entry->objectid)
+		return -1;
+	if (objectid > entry->objectid)
+		return 1;
+	return 0;
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
+				   u64 objectid, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+	int comp;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		comp = comp_entry(entry, root_objectid, objectid);
+		if (comp < 0)
+			p = &(*p)->rb_left;
+		else if (comp > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
+				     u64 objectid, struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+	int comp;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+		comp = comp_entry(entry, root_objectid, objectid);
+
+		if (comp < 0)
+			n = n->rb_left;
+		else if (comp > 0)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+static inline struct rb_node *tree_search(struct rb_root *root,
+					  u64 root_objectid, u64 objectid)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, root_objectid, objectid, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 root_objectid = root->root_key.objectid;
+	u64 transid = root->fs_info->running_transaction->transid;
+	struct tree_entry *entry;
+	struct rb_node *node;
+	struct btrfs_ordered_inode_tree *tree;
+
+	if (transid <= BTRFS_I(inode)->ordered_trans)
+		return 0;
+
+	tree = &root->fs_info->running_transaction->ordered_inode_tree;
+
+	read_lock(&tree->lock);
+	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
+	read_unlock(&tree->lock);
+	if (node) {
+		return 0;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	write_lock(&tree->lock);
+	entry->objectid = inode->i_ino;
+	entry->root_objectid = root_objectid;
+
+	node = tree_insert(&tree->tree, root_objectid,
+			   inode->i_ino, &entry->rb_node);
+
+	BTRFS_I(inode)->ordered_trans = transid;
+
+	write_unlock(&tree->lock);
+	if (node)
+		kfree(entry);
+	return 0;
+}
+
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+	entry = rb_entry(node, struct tree_entry, rb_node);
+
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	write_unlock(&tree->lock);
+	return 1;
+}
+
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	rb_erase(node, &tree->tree);
+	write_unlock(&tree->lock);
+	kfree(entry);
+	return 1;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 00000000000..aaf9eb14271
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+struct btrfs_ordered_inode_tree {
+	rwlock_t lock;
+	struct rb_root tree;
+};
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+	rwlock_init(&t->lock);
+	t->tree.rb_node = NULL;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode);
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 02721eea9a7..3ed5868e7c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,6 +67,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_map_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -473,6 +474,60 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct inode *inode;
+	u64 root_objectid = 0;
+	u64 objectid = 0;
+	u64 transid = trans->transid;
+	int ret;
+
+printk("write ordered trans %Lu\n", transid);
+	while(1) {
+		ret = btrfs_find_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_fdatawrite(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	while(1) {
+		root_objectid = 0;
+		objectid = 0;
+		ret = btrfs_find_del_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_write_and_wait(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+printk("done write ordered trans %Lu\n", transid);
+	return 0;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -550,10 +605,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
+		ret = btrfs_write_ordered_inodes(trans, root);
+
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
 	WARN_ON(cur_trans != trans->transaction);
+
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eef840bca91..c157ddbe9d1 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,9 +16,10 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __TRANSACTION__
-#define __TRANSACTION__
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -30,6 +31,7 @@ struct btrfs_transaction {
 	struct list_head list;
 	struct extent_map_tree dirty_pages;
 	unsigned long start_time;
+	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
@@ -90,4 +92,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From 3063d29f2a4d4a4e9fa1ec77c124514f287c6da7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:30 -0500
Subject: Btrfs: Move snapshot creation to commit time

It is very difficult to create a consistent snapshot of the btree when
other writers may update the btree before the commit is done.

This changes the snapshot creation to happen during the commit, while
no other updates are possible.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c       | 79 ++++++++++--------------------------------------
 fs/btrfs/transaction.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.h |  7 +++++
 3 files changed, 100 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d6e1ac0a9a..10cece11dbd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2250,13 +2250,10 @@ fail_commit:
 
 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 {
+	struct btrfs_pending_snapshot *pending_snapshot;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_key key;
-	struct btrfs_root_item new_root_item;
-	struct extent_buffer *tmp;
 	int ret;
 	int err;
-	u64 objectid;
 	unsigned long nr = 0;
 
 	if (!root->ref_cows)
@@ -2267,72 +2264,26 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail_unlock;
 
+	pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
+	if (!pending_snapshot) {
+		ret = -ENOMEM;
+		goto fail_unlock;
+	}
+	pending_snapshot->name = kstrndup(name, namelen, GFP_NOFS);
+	if (!pending_snapshot->name) {
+		ret = -ENOMEM;
+		kfree(pending_snapshot);
+		goto fail_unlock;
+	}
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
-	err = btrfs_commit_transaction(trans, root);
-
-	trans = btrfs_start_transaction(root, 1);
 
+	pending_snapshot->root = root;
+	list_add(&pending_snapshot->list,
+		 &trans->transaction->pending_snapshots);
 	ret = btrfs_update_inode(trans, root, root->inode);
-	if (ret)
-		goto fail;
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail; memcpy(&new_root_item, &root->root_item,
-	       sizeof(new_root_item));
-
-	key.objectid = objectid;
-	key.offset = 1;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-
-	extent_buffer_get(root->node);
-	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-	free_extent_buffer(tmp);
-
-	/* write the ordered inodes to force all delayed allocations to
-	 * be filled.  Once this is done, we can copy the root
-	 */
-	mutex_lock(&root->fs_info->trans_mutex);
-	btrfs_write_ordered_inodes(trans, root);
-	mutex_unlock(&root->fs_info->trans_mutex);
-
-	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
-
-	btrfs_set_root_bytenr(&new_root_item, tmp->start);
-	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				&new_root_item);
-printk("new root %Lu node %Lu\n", objectid, tmp->start);
-	free_extent_buffer(tmp);
-	if (ret)
-		goto fail;
-
-	/*
-	 * insert the directory item
-	 */
-	key.offset = (u64)-1;
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen,
-				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, BTRFS_FT_DIR);
-
-	if (ret)
-		goto fail;
-
-	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
-			     name, namelen, objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
-
-	if (ret)
-		goto fail;
-fail:
-	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
 
-	if (err && !ret)
-		ret = err;
 fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3ed5868e7c0..dc9865323e3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -66,6 +66,7 @@ static int join_transaction(struct btrfs_root *root)
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
+		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_map_tree_init(&cur_trans->dirty_pages,
@@ -481,10 +482,8 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	struct inode *inode;
 	u64 root_objectid = 0;
 	u64 objectid = 0;
-	u64 transid = trans->transid;
 	int ret;
 
-printk("write ordered trans %Lu\n", transid);
 	while(1) {
 		ret = btrfs_find_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
@@ -524,7 +523,80 @@ printk("write ordered trans %Lu\n", transid);
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
-printk("done write ordered trans %Lu\n", transid);
+	return 0;
+}
+
+static int create_pending_snapshot(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info,
+				   struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_key key;
+	struct btrfs_root_item new_root_item;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root = pending->root;
+	struct extent_buffer *tmp;
+	int ret;
+	u64 objectid;
+
+	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+	if (ret)
+		goto fail;
+
+	memcpy(&new_root_item, &root->root_item, sizeof(new_root_item));
+
+	key.objectid = objectid;
+	key.offset = 1;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+	extent_buffer_get(root->node);
+	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
+	free_extent_buffer(tmp);
+
+	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+
+	btrfs_set_root_bytenr(&new_root_item, tmp->start);
+	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&new_root_item);
+	free_extent_buffer(tmp);
+	if (ret)
+		goto fail;
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    pending->name, strlen(pending->name),
+				    root->fs_info->sb->s_root->d_inode->i_ino,
+				    &key, BTRFS_FT_DIR);
+
+	if (ret)
+		goto fail;
+
+	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
+			     pending->name, strlen(pending->name), objectid,
+			     root->fs_info->sb->s_root->d_inode->i_ino);
+fail:
+	return ret;
+}
+
+static int create_pending_snapshots(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_pending_snapshot *pending;
+	struct list_head *head = &trans->transaction->pending_snapshots;
+	int ret;
+
+	while(!list_empty(head)) {
+		pending = list_entry(head->next,
+				     struct btrfs_pending_snapshot, list);
+		ret = create_pending_snapshot(trans, fs_info, pending);
+		BUG_ON(ret);
+		list_del(&pending->list);
+		kfree(pending->name);
+		kfree(pending);
+	}
 	return 0;
 }
 
@@ -610,6 +682,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
+	ret = create_pending_snapshots(trans, root->fs_info);
+	BUG_ON(ret);
+
 	WARN_ON(cur_trans != trans->transaction);
 
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c157ddbe9d1..fd52e9b2392 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -34,6 +34,7 @@ struct btrfs_transaction {
 	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
+	struct list_head pending_snapshots;
 };
 
 struct btrfs_trans_handle {
@@ -46,6 +47,12 @@ struct btrfs_trans_handle {
 	u64 alloc_exclude_nr;
 };
 
+struct btrfs_pending_snapshot {
+	struct btrfs_root *root;
+	char *name;
+	struct list_head list;
+};
+
 
 static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 					       struct inode *inode)
-- 
cgit v1.2.3


From e2008b61401ecb467a8ce1788fcd2116ae1cfbc1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:30 -0500
Subject: Btrfs: Add some simple throttling to wait for data=ordered and
 snapshot deletion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  7 +++++++
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       | 12 ++++++++++++
 fs/btrfs/transaction.c |  4 ++++
 6 files changed, 26 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b55dba58dfa..f0fb1978553 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -345,6 +345,7 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
+	unsigned long throttles;
 
 	u64 total_pinned;
 	spinlock_t delalloc_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 34cf1f1f47b..e0940a39ff0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -631,6 +631,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->sb = sb;
+	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
 	fs_info->max_extent = (u64)-1;
 	fs_info->delalloc_bytes = 0;
@@ -889,6 +890,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
 }
 
+void btrfs_throttle(struct btrfs_root *root)
+{
+	if (root->fs_info->throttles)
+		congestion_wait(WRITE, HZ/10);
+}
+
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
 	balance_dirty_pages_ratelimited_nr(
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index dae9fba8efc..828f3a2081b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -64,4 +64,5 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
+void btrfs_throttle(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0a5f4defe59..897242e87fa 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -800,6 +800,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
 		if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
+		btrfs_throttle(root);
 		cond_resched();
 	}
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 10cece11dbd..2cb2dd32407 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -610,6 +610,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return ret;
 }
 
@@ -644,6 +645,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 
 	if (ret && !err)
 		err = ret;
@@ -1010,6 +1012,7 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return;
 
 no_delete_lock:
@@ -1017,6 +1020,7 @@ no_delete_lock:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 no_delete:
 	clear_inode(inode);
 }
@@ -1574,6 +1578,7 @@ fail:
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return err;
 }
 
@@ -1633,6 +1638,7 @@ fail:
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return err;
 }
 
@@ -1684,6 +1690,7 @@ fail:
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return err;
 }
 
@@ -1752,6 +1759,7 @@ out_unlock:
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return err;
 }
 
@@ -2117,6 +2125,7 @@ static void btrfs_truncate(struct inode *inode)
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 }
 
 static int noinline create_subvol(struct btrfs_root *root, char *name,
@@ -2245,6 +2254,7 @@ fail:
 fail_commit:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return ret;
 }
 
@@ -2287,6 +2297,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return ret;
 }
 
@@ -2827,6 +2838,7 @@ out_fail:
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
 	return err;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc9865323e3..614903f5c88 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -424,6 +424,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 
 		num_bytes = btrfs_root_used(&dirty->root->root_item);
 		root = dirty->latest_root;
+		root->fs_info->throttles++;
 
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
@@ -447,6 +448,7 @@ static int drop_dirty_roots(struct btrfs_root *tree_root,
 			mutex_lock(&tree_root->fs_info->fs_mutex);
 		}
 		BUG_ON(ret);
+		root->fs_info->throttles--;
 
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
@@ -484,6 +486,7 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	u64 objectid = 0;
 	int ret;
 
+	root->fs_info->throttles++;
 	while(1) {
 		ret = btrfs_find_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
@@ -523,6 +526,7 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
+	root->fs_info->throttles--;
 	return 0;
 }
 
-- 
cgit v1.2.3


From b0331a4c4c339ba7786472b137d6ece9e7f810ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:31 -0500
Subject: Btrfs: Disable btree reada during extent backref lookups.

Readahead (reada) of btree blocks is generally not effective during these lookups.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 68137cd8506..660b05a4baf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -630,6 +630,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	path->reada = 0;
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -649,6 +650,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(root->fs_info->extent_root, path);
 
+	path->reada = 0;
 	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
 					  path, bytenr, root_objectid,
 					  ref_generation, owner, owner_offset);
@@ -680,6 +682,7 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 
 	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
+	path->reada = 0;
 	key.objectid = bytenr;
 	key.offset = num_bytes;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1240,6 +1243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
+	path->reada = 0;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, root_objectid,
 				    ref_generation,
-- 
cgit v1.2.3


From c31f8830f0dfd31e7e196b85ca1b39aef8b77d57 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jan 2008 15:46:31 -0500
Subject: Btrfs: online shrinking fixes

While shrinking the FS, the allocation functions need to make sure
they don't try to allocate bytes past the end of the FS.

nodatacow needed an extra check to force a COW when the existing extents are
past the end of the FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 32 +++++++++++++++++++++++---------
 fs/btrfs/inode.c       | 15 +++++++++++++--
 2 files changed, 36 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 660b05a4baf..99a8b0f0d31 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -192,11 +192,13 @@ static u64 noinline find_search_start(struct btrfs_root *root,
 	u64 start = 0;
 	u64 end = 0;
 	u64 cache_miss = 0;
+	u64 total_fs_bytes;
 	int wrapped = 0;
 
 	if (!cache) {
 		goto out;
 	}
+	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
@@ -223,6 +225,8 @@ again:
 		if (data != BTRFS_BLOCK_GROUP_MIXED &&
 		    start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
+		if (start + num  > total_fs_bytes)
+			goto new_group;
 		return start;
 	}
 out:
@@ -239,7 +243,7 @@ new_group:
 	last = cache->key.objectid + cache->key.offset;
 wrapped:
 	cache = btrfs_lookup_block_group(root->fs_info, last);
-	if (!cache) {
+	if (!cache || cache->key.objectid >= total_fs_bytes) {
 no_cache:
 		if (!wrapped) {
 			wrapped = 1;
@@ -287,6 +291,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	u64 end;
 	u64 free_check;
 	u64 ptr;
+	u64 total_fs_bytes;
 	int bit;
 	int ret;
 	int full_search = 0;
@@ -294,6 +299,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int data_swap = 0;
 
 	block_group_cache = &info->block_group_cache;
+	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
 	if (!owner)
 		factor = 8;
@@ -306,7 +312,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	else
 		bit = BLOCK_GROUP_METADATA;
 
-	if (search_start) {
+	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
 		if (shint && (shint->data == data ||
@@ -318,8 +324,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && (hint->data == data ||
-		     hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
+	if (hint && hint->key.objectid < total_fs_bytes &&
+	    (hint->data == data || hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
@@ -333,6 +339,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 		else
 			hint_last = search_start;
 
+		if (hint_last >= total_fs_bytes)
+			hint_last = search_start;
 		last = hint_last;
 	}
 again:
@@ -350,6 +358,9 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
+		if (cache->key.objectid > total_fs_bytes)
+			break;
+
 		if (full_search)
 			free_check = cache->key.offset;
 		else
@@ -1420,8 +1431,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
-	if (search_end == (u64)-1)
-		search_end = btrfs_super_total_bytes(&info->super_copy);
+	search_end = min(search_end,
+			 btrfs_super_total_bytes(&info->super_copy));
 	if (hint_byte) {
 		block_group = btrfs_lookup_block_group(info, hint_byte);
 		if (!block_group)
@@ -1617,7 +1628,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	int pending_ret;
-	u64 super_used, root_used;
+	u64 super_used;
+	u64 root_used;
 	u64 search_start = 0;
 	u64 new_hint;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -1636,6 +1648,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
+if (ret)
+printk("find free extent returns %d\n", ret);
 	BUG_ON(ret);
 	if (ret)
 		return ret;
@@ -2292,8 +2306,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	while(1) {
 		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 
-		BUG_ON(ret == 0);
-
 		if (ret < 0)
 			goto out;
 
@@ -2340,6 +2352,8 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	int progress = 0;
 
 	btrfs_set_super_total_bytes(&info->super_copy, new_size);
+	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
+			   GFP_NOFS);
 	block_group_cache = &info->block_group_cache;
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2cb2dd32407..281757007dd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -148,6 +148,7 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 	u64 bytenr;
 	u64 cow_end;
 	u64 loops = 0;
+	u64 total_fs_bytes;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	int found_type;
@@ -157,6 +158,7 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 	int err;
 	struct btrfs_key found_key;
 
+	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 again:
@@ -189,8 +191,10 @@ again:
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		extent_end = extent_start +
-		       btrfs_file_extent_num_bytes(leaf, item);
+		u64 extent_num_bytes;
+
+		extent_num_bytes = btrfs_file_extent_num_bytes(leaf, item);
+		extent_end = extent_start + extent_num_bytes;
 		err = 0;
 
 		if (loops && start != extent_start)
@@ -204,6 +208,13 @@ again:
 		if (bytenr == 0)
 			goto not_found;
 
+		/*
+		 * we may be called by the resizer, make sure we're inside
+		 * the limits of the FS
+		 */
+		if (bytenr + extent_num_bytes > total_fs_bytes)
+			goto not_found;
+
 		if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1) {
 			goto not_found;
 		}
-- 
cgit v1.2.3


From b98b6767a0f6f9baa1d00eff64a43d58412ecdeb Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 8 Jan 2008 15:54:37 -0500
Subject: Btrfs: Add inode flags support

This patch adds NODATASUM & NODATACOW inode flags support.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       | 13 ++++++++++++-
 fs/btrfs/inode.c       | 24 +++++++++++-------------
 3 files changed, 24 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f27e633f174..9fc99883c70 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,7 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	u32 flags;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f0fb1978553..7eda51542d1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -447,7 +447,18 @@ struct btrfs_root {
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
-
+/*
+ * Inode flags
+ */
+#define BTRFS_INODE_NODATASUM 		0x1
+#define BTRFS_INODE_NODATACOW 		0x2
+
+#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
+					 ~BTRFS_INODE_##flag)
+#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
+					 BTRFS_INODE_##flag)
+#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
+					 BTRFS_INODE_##flag)
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 281757007dd..fba0478a95f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -243,9 +243,9 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 num_bytes;
 	int ret;
-
 	mutex_lock(&root->fs_info->fs_mutex);
-	if (btrfs_test_opt(root, NODATACOW))
+	if (btrfs_test_opt(root, NODATACOW) ||
+	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, start, end);
 	else
 		ret = cow_file_range(inode, start, end);
@@ -273,10 +273,9 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	int ret = 0;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	size_t offset = start - page_start;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
@@ -299,10 +298,9 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
 	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
@@ -335,10 +333,9 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
 	unsigned long flags;
-
-	if (btrfs_test_opt(root, NODATASUM))
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
 	ret = get_state_private(em_tree, start, &private);
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
@@ -415,7 +412,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
-
+	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 	if (!BTRFS_I(inode)->block_group) {
 		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
 						         NULL, 0, 0, 0);
@@ -484,6 +481,7 @@ static void fill_inode_item(struct extent_buffer *leaf,
 	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
 	btrfs_set_inode_generation(leaf, item, inode->i_generation);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
+	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
 	btrfs_set_inode_block_group(leaf, item,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
@@ -1454,7 +1452,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 1;
 	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
-
+	BTRFS_I(inode)->flags = 0;
 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
 	if (ret)
 		goto fail;
-- 
cgit v1.2.3


From bcd987feefe8da66bc59b4e6bd51761a9820588c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Jan 2008 06:28:28 -0500
Subject: Btrfs: Remove extent_map debugging message

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f53d8c97621..9d6aefa937c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -273,7 +273,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	rb = tree_insert(&tree->map, em->end, &em->rb_node);
 	if (rb) {
 		prev = rb_entry(rb, struct extent_map, rb_node);
-		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
 		ret = -EEXIST;
 		goto out;
 	}
-- 
cgit v1.2.3


From 21ad10cf3e9c1ef42e725e5c3a593c49f779a16b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Jan 2008 09:23:21 -0500
Subject: Btrfs: Add flush barriers on commit

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 5 +++--
 fs/btrfs/disk-io.c | 5 +++++
 fs/btrfs/super.c   | 9 ++++++++-
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7eda51542d1..0c0edacbc14 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -440,8 +440,9 @@ struct btrfs_root {
  */
 #define BTRFS_STRING_ITEM_KEY	253
 
-#define BTRFS_MOUNT_NODATASUM		0x1
-#define BTRFS_MOUNT_NODATACOW		0x2
+#define BTRFS_MOUNT_NODATASUM		(1 << 0)
+#define BTRFS_MOUNT_NODATACOW		(1 << 1)
+#define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0940a39ff0..a481b970608 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -762,10 +762,15 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	struct extent_buffer *super = root->fs_info->sb_buffer;
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct super_block *sb = root->fs_info->sb;
 
+	if (!btrfs_test_opt(root, NOBARRIER))
+		blkdev_issue_flush(sb->s_bdev, NULL);
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super);
 	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
 				     super->start, super->len);
+	if (!btrfs_test_opt(root, NOBARRIER))
+		blkdev_issue_flush(sb->s_bdev, NULL);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f1b56eb77b1..9ab4086d463 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -64,13 +64,14 @@ static void btrfs_put_super (struct super_block * sb)
 
 enum {
 	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
-	Opt_alloc_start, Opt_err,
+	Opt_alloc_start, Opt_nobarrier, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
+	{Opt_nobarrier, "nobarrier"},
 	{Opt_max_extent, "max_extent=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_err, NULL}
@@ -148,6 +149,12 @@ static int parse_options (char * options,
 				btrfs_set_opt(info->mount_opt, NODATASUM);
 			}
 			break;
+		case Opt_nobarrier:
+			if (info) {
+				printk("btrfs: turning off barriers\n");
+				btrfs_set_opt(info->mount_opt, NOBARRIER);
+			}
+			break;
 		case Opt_max_extent:
 			if (info) {
 				char *num = match_strdup(&args[0]);
-- 
cgit v1.2.3


From 55c69072d6bd5be170a85467f64a20963cddf490 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Jan 2008 15:55:33 -0500
Subject: Btrfs: Fix extent_buffer usage when nodesize != leafsize

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 26 +++++++++++++++++++++++---
 fs/btrfs/extent-tree.c | 13 ++++++++++---
 fs/btrfs/extent_map.c  | 30 +++++++++++++++++++++---------
 3 files changed, 54 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a481b970608..0338f8fd382 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -197,9 +197,23 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (found_start != start) {
 		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
 		       start, found_start, len);
+		WARN_ON(1);
+		goto err;
+	}
+	if (eb->first_page != page) {
+		printk("bad first page %lu %lu\n", eb->first_page->index,
+		       page->index);
+		WARN_ON(1);
+		goto err;
+	}
+	if (!PageUptodate(page)) {
+		printk("csum not up to date page %lu\n", page->index);
+		WARN_ON(1);
+		goto err;
 	}
 	found_level = btrfs_header_level(eb);
 	csum_tree_block(root, eb, 0);
+err:
 	free_extent_buffer(eb);
 out:
 	return 0;
@@ -368,7 +382,10 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		     struct extent_buffer *buf)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
+	if (btrfs_header_generation(buf) ==
+	    root->fs_info->running_transaction->transid)
+		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree,
+					  buf);
 	return 0;
 }
 
@@ -897,8 +914,11 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 void btrfs_throttle(struct btrfs_root *root)
 {
-	if (root->fs_info->throttles)
-		congestion_wait(WRITE, HZ/10);
+	struct backing_dev_info *bdi;
+
+	bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	if (root->fs_info->throttles && bdi_write_congested(bdi))
+		congestion_wait(WRITE, HZ/20);
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 99a8b0f0d31..2c569b4d59d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1212,6 +1212,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 				u64 header_transid =
 					btrfs_header_generation(buf);
 				if (header_transid == transid) {
+					clean_tree_block(NULL, root, buf);
 					free_extent_buffer(buf);
 					return 1;
 				}
@@ -1249,7 +1250,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1648,8 +1648,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
-if (ret)
-printk("find free extent returns %d\n", ret);
 	BUG_ON(ret);
 	if (ret)
 		return ret;
@@ -1764,7 +1762,16 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 				  0, 0, 0);
 		return ERR_PTR(-ENOMEM);
 	}
+	btrfs_set_header_generation(buf, trans->transid);
+	clean_tree_block(trans, root, buf);
+	wait_on_tree_block_writeback(root, buf);
 	btrfs_set_buffer_uptodate(buf);
+
+	if (PageDirty(buf->first_page)) {
+		printk("page %lu dirty\n", buf->first_page->index);
+		WARN_ON(1);
+	}
+
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 9d6aefa937c..f3a384ed700 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1663,6 +1663,13 @@ void set_page_extent_mapped(struct page *page)
 	}
 }
 
+void set_page_extent_head(struct page *page, unsigned long len)
+{
+	WARN_ON(page->private && page->private == EXTENT_PAGE_PRIVATE &&
+		PageDirty(page));
+	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+}
+
 /*
  * basic readpage implementation.  Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
@@ -2490,8 +2497,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
 		WARN_ON(!PageUptodate(page0));
-		set_page_private(page0, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
-				 len << 2);
+		set_page_extent_head(page0, len);
 	} else {
 		i = 0;
 	}
@@ -2505,8 +2511,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
 		mark_page_accessed(p);
 		if (i == 0) {
 			eb->first_page = p;
-			set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
-					 len << 2);
+			set_page_extent_head(p, len);
 		} else {
 			set_page_private(p, EXTENT_PAGE_PRIVATE);
 		}
@@ -2569,8 +2574,7 @@ struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
 
 		if (i == 0) {
 			eb->first_page = p;
-			set_page_private(p, EXTENT_PAGE_PRIVATE_FIRST_PAGE |
-					 len << 2);
+			set_page_extent_head(p, len);
 		} else {
 			set_page_private(p, EXTENT_PAGE_PRIVATE);
 		}
@@ -2643,6 +2647,11 @@ int clear_extent_buffer_dirty(struct extent_map_tree *tree,
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		lock_page(page);
+		if (i == 0)
+			set_page_extent_head(page, eb->len);
+		else
+			set_page_private(page, EXTENT_PAGE_PRIVATE);
+
 		/*
 		 * if we're on the last page or the first page and the
 		 * block isn't aligned on a page boundary, do extra checks
@@ -2697,9 +2706,12 @@ int set_extent_buffer_dirty(struct extent_map_tree *tree,
 		 */
 		if (i == 0) {
 			lock_page(page);
-			set_page_private(page,
-					 EXTENT_PAGE_PRIVATE_FIRST_PAGE |
-					 eb->len << 2);
+			set_page_extent_head(page, eb->len);
+		} else if (PagePrivate(page) &&
+			   page->private != EXTENT_PAGE_PRIVATE) {
+			lock_page(page);
+			set_page_extent_mapped(page);
+			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
 		if (i == 0)
-- 
cgit v1.2.3


From b3a0d8d28c607cb2531a68742afc2b967b1f6083 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Jan 2008 13:22:53 -0500
Subject: Btrfs: Remove verbose WARN_ON

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f3a384ed700..d71aed341ab 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1665,8 +1665,6 @@ void set_page_extent_mapped(struct page *page)
 
 void set_page_extent_head(struct page *page, unsigned long len)
 {
-	WARN_ON(page->private && page->private == EXTENT_PAGE_PRIVATE &&
-		PageDirty(page));
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
 
-- 
cgit v1.2.3


From 744f52f997b21de2a01931f79cf6c198bf688120 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 14 Jan 2008 13:26:08 -0500
Subject: Btrfs: Implement ACLs setting and getting

ACLs are stored but not used for permission checks (yet)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c      | 73 ++++++++++++++++++++++++++++++++---------------------
 fs/btrfs/dir-item.c |  3 +++
 fs/btrfs/xattr.c    | 16 +++---------
 3 files changed, 51 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 889ba89d3f9..b4f85ebe1bd 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,53 +22,68 @@
 #include <linux/posix_acl_xattr.h>
 #include "ctree.h"
 #include "xattr.h"
+static int btrfs_xattr_set_acl(struct inode *inode, int type,
+			       const void *value, size_t size)
+{
+	int ret = 0;
+	struct posix_acl *acl;
 
-/*
- * FIXME: At this point this is all place holder stuff, we just return
- * -EOPNOTSUPP so cp won't complain when it tries to copy over a file with an
- *  acl on it.
- */
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (acl == NULL) {
+			value = NULL;
+			size = 0;
+		} else if (IS_ERR(acl)) {
+			ret = PTR_ERR(acl);
+		} else {
+			ret = posix_acl_valid(acl);
+			posix_acl_release(acl);
+		}
+		if (ret)
+			return ret;
+	}
+	return btrfs_xattr_set(inode, type, "", value, size, 0);
+}
 
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+			       void *value, size_t size)
+{
+	return btrfs_xattr_get(inode, type, "", value, size);
+}
 static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
 				      void *value, size_t size)
 {
-	/*
-	return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name,
-			       value, size);
-	*/
-	return -EOPNOTSUPP;
+	if (*name != '\0')
+	       return -EINVAL;
+	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
+				   value, size);
 }
-
 static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
 				      const void *value, size_t size, int flags)
 {
-	/*
-	return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS, name,
-			       value, size, flags);
-	*/
-	return -EOPNOTSUPP;
+	if (*name != '\0')
+	       return -EINVAL;
+	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
+				   value, size);
 }
-
 static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 				       void *value, size_t size)
 {
-	/*
-	return btrfs_xattr_get(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-			       name, value, size);
-	*/
-	return -EOPNOTSUPP;
+	if (*name != '\0')
+	       return -EINVAL;
+	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
+				   value, size);
 }
-
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
 				       const void *value, size_t size, int flags)
 {
-	/*
-	return btrfs_xattr_set(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-			       name, value, size, flags);
-	*/
-	return -EOPNOTSUPP;
+	if (*name != '\0')
+	       return -EINVAL;
+	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
+				   value, size);
 }
-
 struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.list	= btrfs_xattr_generic_list,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index ddbe12ae0d6..5247a9a41f0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -76,6 +76,9 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+	if (name_len + data_len + sizeof(struct btrfs_dir_item) >
+	    BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
+		return -ENOSPC;
 
 	data_size = sizeof(*dir_item) + name_len + data_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 33caaf5b4b0..85ac42605f8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -141,11 +141,6 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 
 	if (!handler)
 		return -EOPNOTSUPP;
-
-	/* just in case... */
-	if (*attr_name == '\0')
-		return -EINVAL;
-
 	name = get_name(attr_name, name_index);
 	if (!name)
 		return -ENOMEM;
@@ -201,14 +196,8 @@ int btrfs_xattr_set(struct inode *inode, int name_index,
 	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
 	char *name;
 	int ret = 0, mod = 0;
-
 	if (!handler)
 		return -EOPNOTSUPP;
-
-	/* just in case... */
-	if (*attr_name == '\0')
-		return -EINVAL;
-
 	name = get_name(attr_name, name_index);
 	if (!name)
 		return -ENOMEM;
@@ -454,15 +443,18 @@ static int btrfs_xattr_##name##_get(struct inode *inode,		\
 				    const char *name, void *value,	\
 				    size_t size)			\
 {									\
+	if (*name == '\0')						\
+		return -EINVAL;						\
 	return btrfs_xattr_get(inode, index, name, value, size);	\
 }									\
 static int btrfs_xattr_##name##_set(struct inode *inode,		\
 				    const char *name, const void *value,\
 				    size_t size, int flags)		\
 {									\
+	if (*name == '\0')						\
+		return -EINVAL;						\
 	return btrfs_xattr_set(inode, index, name, value, size, flags);	\
 }									\
-
 BTRFS_XATTR_SETGET_FUNCS(security, BTRFS_XATTR_INDEX_SECURITY);
 BTRFS_XATTR_SETGET_FUNCS(system, BTRFS_XATTR_INDEX_SYSTEM);
 BTRFS_XATTR_SETGET_FUNCS(user, BTRFS_XATTR_INDEX_USER);
-- 
cgit v1.2.3


From fdebe2bd70047e057827cba85ba31b2545e31900 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 14 Jan 2008 13:26:08 -0500
Subject: Btrfs: Add readonly inode flag

This patch adds readonly inode flag support.  A file with this flag
can't be modified, but can be deleted.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  6 +++---
 fs/btrfs/inode.c | 16 ++++++++++++----
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0c0edacbc14..245952cf765 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -451,9 +451,9 @@ struct btrfs_root {
 /*
  * Inode flags
  */
-#define BTRFS_INODE_NODATASUM 		0x1
-#define BTRFS_INODE_NODATACOW 		0x2
-
+#define BTRFS_INODE_NODATASUM		(1 << 0)
+#define BTRFS_INODE_NODATACOW		(1 << 1)
+#define BTRFS_INODE_READONLY		(1 << 2)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fba0478a95f..fd0e0d80333 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2850,6 +2850,13 @@ out_fail:
 	btrfs_throttle(root);
 	return err;
 }
+static int btrfs_permission(struct inode *inode, int mask,
+			    struct nameidata *nd)
+{
+	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
+		return -EACCES;
+	return generic_permission(inode, mask, NULL);
+}
 
 static struct inode_operations btrfs_dir_inode_operations = {
 	.lookup		= btrfs_lookup,
@@ -2866,12 +2873,12 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.getxattr	= generic_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= generic_removexattr,
+	.permission	= btrfs_permission,
 };
-
 static struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
+	.permission	= btrfs_permission,
 };
-
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
@@ -2916,15 +2923,16 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.getxattr	= generic_getxattr,
 	.listxattr      = btrfs_listxattr,
 	.removexattr	= generic_removexattr,
+	.permission	= btrfs_permission,
 };
-
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
+	.permission	= btrfs_permission,
 };
-
 static struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.permission	= btrfs_permission,
 };
-- 
cgit v1.2.3


From 69a32ac5175ec9bcfb407e8619a024e5eaea87b7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Jan 2008 14:33:35 -0500
Subject: Btrfs: Change magic string to reflect new format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/xattr.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 245952cf765..f65b258344d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -36,7 +36,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
-#define BTRFS_MAGIC "_B2RfS_M"
+#define BTRFS_MAGIC "_B3RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 85ac42605f8..4b3bd24af72 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -454,7 +454,8 @@ static int btrfs_xattr_##name##_set(struct inode *inode,		\
 	if (*name == '\0')						\
 		return -EINVAL;						\
 	return btrfs_xattr_set(inode, index, name, value, size, flags);	\
-}									\
+}
+
 BTRFS_XATTR_SETGET_FUNCS(security, BTRFS_XATTR_INDEX_SECURITY);
 BTRFS_XATTR_SETGET_FUNCS(system, BTRFS_XATTR_INDEX_SYSTEM);
 BTRFS_XATTR_SETGET_FUNCS(user, BTRFS_XATTR_INDEX_USER);
-- 
cgit v1.2.3


From b99aa6cb548ae9461c5dc0e38f88ebc618ea68d7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Jan 2008 14:41:16 -0500
Subject: Btrfs: Only delete roots from sysfs when they were added to sysfs

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0338f8fd382..87daed632f3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -795,7 +795,8 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
-	btrfs_sysfs_del_root(root);
+	if (root->in_sysfs)
+		btrfs_sysfs_del_root(root);
 	if (root->inode)
 		iput(root->inode);
 	if (root->node)
-- 
cgit v1.2.3


From 8c416c9e0d35b0462e89b76b7581127efe03b8b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Jan 2008 15:10:26 -0500
Subject: Btrfs: Delete any remaining extent_maps before freeing the inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fd0e0d80333..5260b470815 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2601,6 +2601,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	btrfs_drop_extent_cache(inode, 0, (u64)-1);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-- 
cgit v1.2.3


From 61295eb8665e723e77af91d0a1e655a4bd28344f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Jan 2008 16:24:38 -0500
Subject: Btrfs: Add drop inode func to avoid data=ordered deadlock

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  1 +
 fs/btrfs/inode.c | 19 +++++++++++++++++++
 fs/btrfs/super.c |  1 +
 3 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f65b258344d..a2c2d6d82c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1144,6 +1144,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
 void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5260b470815..e53d2033164 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -990,6 +990,25 @@ out:
 fail:
 	return err;
 }
+
+void btrfs_drop_inode(struct inode *inode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		generic_drop_inode(inode);
+		return;
+	}
+	/* nasty, but it prevents a deadlock with data=ordered by preventing
+	 * a commit until after this inode is done
+	 */
+	trans = btrfs_start_transaction(root, 1);
+	generic_drop_inode(inode);
+	/* note, the inode is now untouchable */
+	btrfs_end_transaction(trans, root);
+}
+
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9ab4086d463..4deea393ca9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -425,6 +425,7 @@ static struct file_system_type btrfs_fs_type = {
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
+	.drop_inode	= btrfs_drop_inode,
 	.put_super	= btrfs_put_super,
 	.read_inode	= btrfs_read_locked_inode,
 	.write_super	= btrfs_write_super,
-- 
cgit v1.2.3


From cee36a03e8f7c6e14aefd497d3acf01bcd3ef153 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Jan 2008 08:40:48 -0500
Subject: Rework btrfs_drop_inode to avoid scheduling

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  1 +
 fs/btrfs/disk-io.c      |  1 +
 fs/btrfs/inode.c        | 13 +++----------
 fs/btrfs/ordered-data.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |  1 +
 fs/btrfs/transaction.c  |  2 ++
 6 files changed, 44 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a2c2d6d82c5..1e19f2d8633 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -349,6 +349,7 @@ struct btrfs_fs_info {
 
 	u64 total_pinned;
 	spinlock_t delalloc_lock;
+	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 };
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87daed632f3..cd29922d407 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -639,6 +639,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	INIT_LIST_HEAD(&fs_info->hashers);
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
+	spin_lock_init(&fs_info->new_trans_lock);
 
 	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
 	init_completion(&fs_info->kobj_unregister);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e53d2033164..008e3445748 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -993,20 +993,13 @@ fail:
 
 void btrfs_drop_inode(struct inode *inode)
 {
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
+	if (!BTRFS_I(inode)->ordered_trans || inode->i_nlink) {
 		generic_drop_inode(inode);
 		return;
 	}
-	/* nasty, but it prevents a deadlock with data=ordered by preventing
-	 * a commit until after this inode is done
-	 */
-	trans = btrfs_start_transaction(root, 1);
+	/* FIXME, make sure this delete actually ends up in the transaction */
+	btrfs_del_ordered_inode(inode);
 	generic_drop_inode(inode);
-	/* note, the inode is now untouchable */
-	btrfs_end_transaction(trans, root);
 }
 
 void btrfs_delete_inode(struct inode *inode)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 411aba84d30..b56011baa17 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -219,3 +219,39 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 	kfree(entry);
 	return 1;
 }
+
+static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				     u64 root_objectid, u64 objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+	struct rb_node *prev;
+
+	write_lock(&tree->lock);
+	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+	rb_erase(node, &tree->tree);
+	write_unlock(&tree->lock);
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	kfree(entry);
+	return 1;
+}
+
+int btrfs_del_ordered_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 root_objectid = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->new_trans_lock);
+	if (root->fs_info->running_transaction) {
+		struct btrfs_ordered_inode_tree *tree;
+		tree = &root->fs_info->running_transaction->ordered_inode_tree;
+		__btrfs_del_ordered_inode(tree, root_objectid, inode->i_ino);
+	}
+	spin_unlock(&root->fs_info->new_trans_lock);
+	return 0;
+}
+
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index aaf9eb14271..26b26212865 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -36,4 +36,5 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid);
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid);
+int btrfs_del_ordered_inode(struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 614903f5c88..a3205808ab2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -699,7 +699,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	cur_trans = root->fs_info->running_transaction;
+	spin_lock(&root->fs_info->new_trans_lock);
 	root->fs_info->running_transaction = NULL;
+	spin_unlock(&root->fs_info->new_trans_lock);
 	btrfs_set_super_generation(&root->fs_info->super_copy,
 				   cur_trans->transid);
 	btrfs_set_super_root(&root->fs_info->super_copy,
-- 
cgit v1.2.3


From 9cce6c3bfca85bf92e8c9358542a18dfa6c232be Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Jan 2008 08:44:06 -0500
Subject: Btrfs: Disable delalloc accounting for now

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 008e3445748..f83f88ca8ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -80,6 +80,8 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	u64 thresh;
 	int ret = 0;
 
+	return 0;
+
 	if (for_del)
 		thresh = total * 90;
 	else
-- 
cgit v1.2.3


From 2da98f003f4788b0a72c5f87bc55b061f65f30fa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Jan 2008 11:44:43 -0500
Subject: Btrfs: Run igrab on data=ordered inodes to prevent deadlocks during
 writeout

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  2 +-
 fs/btrfs/inode.c        | 32 ++++++++++++++++++++++++++------
 fs/btrfs/ordered-data.c | 10 ++++++++--
 fs/btrfs/super.c        |  2 +-
 fs/btrfs/transaction.c  |  1 +
 5 files changed, 37 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1e19f2d8633..fa65fe027e4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1145,7 +1145,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
-void btrfs_drop_inode(struct inode *inode);
+void btrfs_put_inode(struct inode *inode);
 void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f83f88ca8ac..c1ac0bcbb46 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -601,6 +601,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
+	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -617,6 +618,18 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
 
+	if (inode->i_nlink == 0) {
+		int found;
+		/* if the inode isn't linked anywhere,
+		 * we don't need to worry about
+		 * data=ordered
+		 */
+		found = btrfs_del_ordered_inode(inode);
+		if (found == 1) {
+			atomic_dec(&inode->i_count);
+		}
+	}
+
 	btrfs_end_transaction(trans, root);
 fail:
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -993,15 +1006,22 @@ fail:
 	return err;
 }
 
-void btrfs_drop_inode(struct inode *inode)
+void btrfs_put_inode(struct inode *inode)
 {
-	if (!BTRFS_I(inode)->ordered_trans || inode->i_nlink) {
-		generic_drop_inode(inode);
+	int ret;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
 		return;
+
+	ret = btrfs_del_ordered_inode(inode);
+	if (ret == 1) {
+		atomic_dec(&inode->i_count);
 	}
-	/* FIXME, make sure this delete actually ends up in the transaction */
-	btrfs_del_ordered_inode(inode);
-	generic_drop_inode(inode);
 }
 
 void btrfs_delete_inode(struct inode *inode)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b56011baa17..cba2b623d02 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,6 +153,8 @@ int btrfs_add_ordered_inode(struct inode *inode)
 	write_unlock(&tree->lock);
 	if (node)
 		kfree(entry);
+	else
+		igrab(inode);
 	return 0;
 }
 
@@ -221,6 +223,7 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 }
 
 static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				     struct inode *inode,
 				     u64 root_objectid, u64 objectid)
 {
 	struct tree_entry *entry;
@@ -234,6 +237,7 @@ static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 		return 0;
 	}
 	rb_erase(node, &tree->tree);
+	BTRFS_I(inode)->ordered_trans = 0;
 	write_unlock(&tree->lock);
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	kfree(entry);
@@ -244,14 +248,16 @@ int btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
+	int ret = 0;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
 		struct btrfs_ordered_inode_tree *tree;
 		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		__btrfs_del_ordered_inode(tree, root_objectid, inode->i_ino);
+		ret = __btrfs_del_ordered_inode(tree, inode, root_objectid,
+						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return 0;
+	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4deea393ca9..e506de3168b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -425,7 +425,7 @@ static struct file_system_type btrfs_fs_type = {
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.drop_inode	= btrfs_drop_inode,
+	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.read_inode	= btrfs_read_locked_inode,
 	.write_super	= btrfs_write_super,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a3205808ab2..08f7a188dc3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -521,6 +521,7 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		if (inode) {
 			if (S_ISREG(inode->i_mode))
 				filemap_write_and_wait(inode->i_mapping);
+			atomic_dec(&inode->i_count);
 			iput(inode);
 		}
 		mutex_lock(&root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From 4d5e74bc0aec3f54b7e429d77b7c35de042c507d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Jan 2008 16:09:22 -0500
Subject: Btrfs: Fix data=ordered vs wait_on_inode deadlock on older kernels

Using ilookup5 during data=ordered writeback could deadlock on I_LOCK.  This
saves a pointer to the inode instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.c | 12 ++++++++++--
 fs/btrfs/ordered-data.h |  6 ++++--
 fs/btrfs/transaction.c  | 30 +++++++++++++-----------------
 3 files changed, 27 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index cba2b623d02..3ee51e10c18 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
 struct tree_entry {
 	u64 root_objectid;
 	u64 objectid;
+	struct inode *inode;
 	struct rb_node rb_node;
 };
 
@@ -144,6 +145,7 @@ int btrfs_add_ordered_inode(struct inode *inode)
 	write_lock(&tree->lock);
 	entry->objectid = inode->i_ino;
 	entry->root_objectid = root_objectid;
+	entry->inode = inode;
 
 	node = tree_insert(&tree->tree, root_objectid,
 			   inode->i_ino, &entry->rb_node);
@@ -159,7 +161,8 @@ int btrfs_add_ordered_inode(struct inode *inode)
 }
 
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid)
+				   u64 *root_objectid, u64 *objectid,
+				   struct inode **inode)
 {
 	struct tree_entry *entry;
 	struct rb_node *node;
@@ -184,13 +187,16 @@ int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 	}
 
 	*root_objectid = entry->root_objectid;
+	*inode = entry->inode;
+	atomic_inc(&entry->inode->i_count);
 	*objectid = entry->objectid;
 	write_unlock(&tree->lock);
 	return 1;
 }
 
 int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid)
+				       u64 *root_objectid, u64 *objectid,
+				       struct inode **inode)
 {
 	struct tree_entry *entry;
 	struct rb_node *node;
@@ -216,6 +222,8 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 
 	*root_objectid = entry->root_objectid;
 	*objectid = entry->objectid;
+	*inode = entry->inode;
+	atomic_inc(&entry->inode->i_count);
 	rb_erase(node, &tree->tree);
 	write_unlock(&tree->lock);
 	kfree(entry);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 26b26212865..f25c6771ec6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,8 +33,10 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 
 int btrfs_add_ordered_inode(struct inode *inode);
 int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid);
+				       u64 *root_objectid, u64 *objectid,
+				       struct inode **inode);
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid);
+				       u64 *root_objectid, u64 *objectid,
+				       struct inode **inode);
 int btrfs_del_ordered_inode(struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 08f7a188dc3..b6bbfc179c2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -490,19 +490,17 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	while(1) {
 		ret = btrfs_find_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid);
+				&root_objectid, &objectid, &inode);
 		if (!ret)
 			break;
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		inode = btrfs_ilookup(root->fs_info->sb, objectid,
-				      root_objectid);
-		if (inode) {
-			if (S_ISREG(inode->i_mode))
-				filemap_fdatawrite(inode->i_mapping);
-			iput(inode);
-		}
+
+		if (S_ISREG(inode->i_mode))
+			filemap_fdatawrite(inode->i_mapping);
+		iput(inode);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
@@ -511,19 +509,17 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		objectid = 0;
 		ret = btrfs_find_del_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid);
+				&root_objectid, &objectid, &inode);
 		if (!ret)
 			break;
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		inode = btrfs_ilookup(root->fs_info->sb, objectid,
-				      root_objectid);
-		if (inode) {
-			if (S_ISREG(inode->i_mode))
-				filemap_write_and_wait(inode->i_mapping);
-			atomic_dec(&inode->i_count);
-			iput(inode);
-		}
+
+		if (S_ISREG(inode->i_mode))
+			filemap_write_and_wait(inode->i_mapping);
+		atomic_dec(&inode->i_count);
+		iput(inode);
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
-- 
cgit v1.2.3


From caaca38b8fdcf63a17647fddae2195b189e19e37 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 17 Jan 2008 11:59:48 -0500
Subject: Btrfs: Fix compile on kernel without ACLs enabled

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 5 ++++-
 fs/btrfs/xattr.c  | 6 ++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ab9a9f8edbf..72cc3e16741 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,8 +5,11 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o
 
+ifeq ($(CONFIG_FS_POSIX_ACL),y)
+btrfs-y += acl.o
+endif
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
 #	  inode-map.o \
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 4b3bd24af72..0a4950400f8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -26,20 +26,22 @@
 #include "transaction.h"
 #include "xattr.h"
 #include "disk-io.h"
-
 static struct xattr_handler *btrfs_xattr_handler_map[] = {
 	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
 	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
 	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+#endif
 	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
 	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
 	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
 };
-
 struct xattr_handler *btrfs_xattr_handlers[] = {
 	&btrfs_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
+#endif
 	&btrfs_xattr_trusted_handler,
 	&btrfs_xattr_security_handler,
 	&btrfs_xattr_system_handler,
-- 
cgit v1.2.3


From fb4bc1e0565a842d704846005ac0ec739e608ede Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 17 Jan 2008 11:59:51 -0500
Subject: Btrfs: Fix compile on 2.6.22 kernel

This patch fixes compile error on kernel-2.6.22

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 5 +++++
 fs/btrfs/inode.c | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index b4f85ebe1bd..27bb841fafe 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,11 @@
 #include <linux/posix_acl_xattr.h>
 #include "ctree.h"
 #include "xattr.h"
+#ifndef is_owner_or_cap
+#define is_owner_or_cap(inode)	\
+	((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
+#endif
+
 static int btrfs_xattr_set_acl(struct inode *inode, int type,
 			       const void *value, size_t size)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c1ac0bcbb46..51fc06dc4a5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2322,15 +2322,16 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 		ret = -ENOMEM;
 		goto fail_unlock;
 	}
-	pending_snapshot->name = kstrndup(name, namelen, GFP_NOFS);
+	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
 	if (!pending_snapshot->name) {
 		ret = -ENOMEM;
 		kfree(pending_snapshot);
 		goto fail_unlock;
 	}
+	memcpy(pending_snapshot->name, name, namelen);
+	pending_snapshot->name[namelen] = '\0';
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
-
 	pending_snapshot->root = root;
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-- 
cgit v1.2.3


From 04005cc7a1feef94237204566ccac38571af7991 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jan 2008 12:01:41 -0500
Subject: Btrfs: Use blk_congestion_wait on older kernels

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cd29922d407..731a534f81f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -919,8 +919,13 @@ void btrfs_throttle(struct btrfs_root *root)
 	struct backing_dev_info *bdi;
 
 	bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-	if (root->fs_info->throttles && bdi_write_congested(bdi))
+	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
+#else
+		blk_congestion_wait(WRITE, HZ/20);
+#endif
+	}
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
-- 
cgit v1.2.3


From e18e4809b10e6c9efb5fe10c1ddcb4ebb690d517 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jan 2008 10:54:22 -0500
Subject: Btrfs: Add mount -o ssd, which includes optimizations for seek free
 storage

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/disk-io.c     |  3 +++
 fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++-
 fs/btrfs/super.c       |  9 ++++++++-
 fs/btrfs/transaction.c |  1 +
 fs/btrfs/tree-defrag.c |  3 +++
 6 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fa65fe027e4..7a588ba2b74 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -351,6 +351,7 @@ struct btrfs_fs_info {
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
+	u64 last_alloc;
 };
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
@@ -444,6 +445,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NODATASUM		(1 << 0)
 #define BTRFS_MOUNT_NODATACOW		(1 << 1)
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
+#define BTRFS_MOUNT_SSD			(1 << 3)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 731a534f81f..5d1f9bca271 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -193,6 +193,7 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1);
+	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
 		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
@@ -676,6 +677,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 	fs_info->total_pinned = 0;
+	fs_info->last_alloc = 0;
+
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c569b4d59d..b69a46691a9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1431,6 +1431,19 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
+	/* for SSD, cluster allocations together as much as possible */
+	if (btrfs_test_opt(root, SSD)) {
+		if (!data) {
+			if (root->fs_info->last_alloc)
+				hint_byte = root->fs_info->last_alloc;
+			else {
+				hint_byte = hint_byte &
+					~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
+				empty_size += 16 * 1024 * 1024;
+			}
+		}
+	}
+
 	search_end = min(search_end,
 			 btrfs_super_total_bytes(&info->super_copy));
 	if (hint_byte) {
@@ -1456,6 +1469,19 @@ check_failed:
 	}
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data);
+
+	if (!data && btrfs_test_opt(root, SSD) && info->last_alloc &&
+	    search_start != info->last_alloc) {
+		info->last_alloc = 0;
+		if (!empty_size) {
+			empty_size += 16 * 1024 * 1024;
+			total_needed += empty_size;
+		}
+		search_start = find_search_start(root, &block_group,
+						 search_start, total_needed,
+						 data);
+	}
+
 	search_start = stripe_align(root, search_start);
 	cached_start = search_start;
 	btrfs_init_path(path);
@@ -1610,6 +1636,8 @@ enospc:
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	if (btrfs_test_opt(root, SSD) && !ret && !data)
+		info->last_alloc = ins->objectid + ins->offset;
 	return ret;
 }
 /*
@@ -1778,7 +1806,8 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 			buf->start, buf->start + buf->len - 1,
 			EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
-	btrfs_set_buffer_defrag(buf);
+	if (!btrfs_test_opt(root, SSD))
+		btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e506de3168b..b6982f0133c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -64,7 +64,7 @@ static void btrfs_put_super (struct super_block * sb)
 
 enum {
 	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
-	Opt_alloc_start, Opt_nobarrier, Opt_err,
+	Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -74,6 +74,7 @@ static match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_max_extent, "max_extent=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
+	{Opt_ssd, "ssd"},
 	{Opt_err, NULL}
 };
 
@@ -149,6 +150,12 @@ static int parse_options (char * options,
 				btrfs_set_opt(info->mount_opt, NODATASUM);
 			}
 			break;
+		case Opt_ssd:
+			if (info) {
+				printk("btrfs: use ssd allocation scheme\n");
+				btrfs_set_opt(info->mount_opt, SSD);
+			}
+			break;
 		case Opt_nobarrier:
 			if (info) {
 				printk("btrfs: turning off barriers\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b6bbfc179c2..163c01a2449 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -57,6 +57,7 @@ static int join_transaction(struct btrfs_root *root)
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
+		root->fs_info->last_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5c58630dce0..5935cbd8f2b 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -179,6 +179,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
 
+	if (btrfs_test_opt(root, SSD))
+		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3


From b17854275cd297d40c6c5562fa44a472314c6c7f Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 22 Jan 2008 12:46:56 -0500
Subject: Fix for btrfs_find_free_objectid

btrfs_find_free_objectid may return a used objectid due to arithmetic
underflow. This bug may happen when the 'root' parameter is the tree root,
so it may cause serious problems when creating a snapshot or sub-volume.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode-map.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ab74977adf5..a0925eabdaa 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -62,7 +62,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
-	u64 hole_size = 0;
 	int slot = 0;
 	u64 last_ino = 0;
 	int start_found;
@@ -109,8 +108,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			if (start_found) {
 				if (last_ino < search_start)
 					last_ino = search_start;
-				hole_size = key.objectid - last_ino;
-				if (hole_size > 0) {
+				if (key.objectid > last_ino) {
 					*objectid = last_ino;
 					goto found;
 				}
-- 
cgit v1.2.3


From f0c5da1446cc500856a5e31c9a0e2a7bdd30e663 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 22 Jan 2008 12:46:56 -0500
Subject: Btrfs: Fix for test_range_bit

test_range_bit doesn't properly handle the case where there's a hole at
the end of the range and no other extent_state after the range.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index d71aed341ab..485cf0719b3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1336,6 +1336,11 @@ int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
 		if (start > end)
 			break;
 		node = rb_next(node);
+		if (!node) {
+			if (filled)
+				bitset = 0;
+			break;
+		}
 	}
 	read_unlock_irq(&tree->lock);
 	return bitset;
-- 
cgit v1.2.3


From ed0dab6b86cee64e9072107f65fa40629f162b38 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 22 Jan 2008 12:46:56 -0500
Subject: Btrfs: Add basic lockfs calls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b6982f0133c..8b52c69fda2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -429,6 +429,17 @@ static struct file_system_type btrfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+static void btrfs_write_super_lockfs(struct super_block *sb)
+{
+	struct btrfs_root *root = btrfs_sb(sb);
+	btrfs_transaction_flush_work(root);
+}
+
+static void btrfs_unlockfs(struct super_block *sb)
+{
+	struct btrfs_root *root = btrfs_sb(sb);
+	btrfs_transaction_queue_work(root, HZ * 30);
+}
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
@@ -442,8 +453,9 @@ static struct super_operations btrfs_super_ops = {
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
+	.write_super_lockfs = btrfs_write_super_lockfs,
+	.unlockfs	= btrfs_unlockfs,
 };
-
 static int __init init_btrfs_fs(void)
 {
 	int err;
-- 
cgit v1.2.3


From c1e32da616a17813f11b701a7a87775d35c12e3a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jan 2008 12:46:56 -0500
Subject: Btrfs: Include sched.h in the acl code for current (fixes compile on
 2.6.23)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 27bb841fafe..ff0fcc72848 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
 #include "ctree.h"
 #include "xattr.h"
 #ifndef is_owner_or_cap
-- 
cgit v1.2.3


From 5f56406aabdf5444d040c5955effc665b1d0dbaf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jan 2008 16:47:59 -0500
Subject: Btrfs: Fix hole insertion corner cases

There were a few places that could cause duplicate extent insertion;
this adjusts the code that creates holes to avoid it.

lookup_extent_mapping is changed to correctly return all of the extents
in a range, even when there are none matching at the start of the range.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      |  1 +
 fs/btrfs/extent_map.c | 45 +++++++++++++++++++++++------
 fs/btrfs/file.c       | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/inode.c      | 15 ++++++++--
 4 files changed, 126 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7a588ba2b74..b9f2975b55c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1170,6 +1170,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct inode *inode);
 /* file.c */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 485cf0719b3..010a287fbd7 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -204,10 +204,12 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 }
 
 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
-				   struct rb_node **prev_ret)
+				     struct rb_node **prev_ret,
+				     struct rb_node **next_ret)
 {
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
+	struct rb_node *orig_prev = NULL;
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
@@ -223,13 +225,25 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 		else
 			return n;
 	}
-	if (!prev_ret)
-		return NULL;
-	while(prev && offset > prev_entry->end) {
-		prev = rb_next(prev);
+
+	if (prev_ret) {
+		orig_prev = prev;
+		while(prev && offset > prev_entry->end) {
+			prev = rb_next(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*prev_ret = prev;
+		prev = orig_prev;
+	}
+
+	if (next_ret) {
 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		while(prev && offset < prev_entry->start) {
+			prev = rb_prev(prev);
+			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		}
+		*next_ret = prev;
 	}
-	*prev_ret = prev;
 	return NULL;
 }
 
@@ -237,7 +251,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 {
 	struct rb_node *prev;
 	struct rb_node *ret;
-	ret = __tree_search(root, offset, &prev);
+	ret = __tree_search(root, offset, &prev, NULL);
 	if (!ret)
 		return prev;
 	return ret;
@@ -248,7 +262,7 @@ static int tree_delete(struct rb_root *root, u64 offset)
 	struct rb_node *node;
 	struct tree_entry *entry;
 
-	node = __tree_search(root, offset, NULL);
+	node = __tree_search(root, offset, NULL, NULL);
 	if (!node)
 		return -ENOENT;
 	entry = rb_entry(node, struct tree_entry, rb_node);
@@ -314,9 +328,21 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
+	struct rb_node *prev = NULL;
+	struct rb_node *next = NULL;
 
 	read_lock_irq(&tree->lock);
-	rb_node = tree_search(&tree->map, start);
+	rb_node = __tree_search(&tree->map, start, &prev, &next);
+	if (!rb_node && prev) {
+		em = rb_entry(prev, struct extent_map, rb_node);
+		if (em->start <= end && em->end >= start)
+			goto found;
+	}
+	if (!rb_node && next) {
+		em = rb_entry(next, struct extent_map, rb_node);
+		if (em->start <= end && em->end >= start)
+			goto found;
+	}
 	if (!rb_node) {
 		em = NULL;
 		goto out;
@@ -330,6 +356,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 		em = NULL;
 		goto out;
 	}
+found:
 	atomic_inc(&em->refs);
 out:
 	read_unlock_irq(&tree->lock);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 897242e87fa..1cd8c908811 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -278,7 +278,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		u64 hole_size;
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		hole_size = (end_pos - last_pos_in_file + mask) & ~mask;
 
 		if (last_pos_in_file < start_pos) {
 			err = btrfs_drop_extents(trans, root, inode,
@@ -293,6 +293,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       inode->i_ino,
 						       last_pos_in_file,
 						       0, 0, hole_size);
+			btrfs_check_file(root, inode);
 		}
 		if (err)
 			goto failed;
@@ -378,6 +379,80 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	return 0;
 }
 
+int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
+{
+	return 0;
+#if 0
+	struct btrfs_path *path;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *extent;
+	u64 last_offset = 0;
+	int nritems;
+	int slot;
+	int found_type;
+	int ret;
+	int err = 0;
+	u64 extent_end = 0;
+
+	path = btrfs_alloc_path();
+	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
+				       last_offset, 0);
+	while(1) {
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret)
+				goto out;
+			nritems = btrfs_header_nritems(path->nodes[0]);
+		}
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (found_key.objectid != inode->i_ino)
+			break;
+		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto out;
+
+		if (found_key.offset != last_offset) {
+			WARN_ON(1);
+			btrfs_print_leaf(root, leaf);
+			printk("inode %lu found offset %Lu expected %Lu\n",
+			       inode->i_ino, found_key.offset, last_offset);
+			err = 1;
+			goto out;
+		}
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(leaf, extent);
+		if (found_type == BTRFS_FILE_EXTENT_REG) {
+			extent_end = found_key.offset +
+			     btrfs_file_extent_num_bytes(leaf, extent);
+		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+			struct btrfs_item *item;
+			item = btrfs_item_nr(leaf, slot);
+			extent_end = found_key.offset +
+			     btrfs_file_extent_inline_len(leaf, item);
+			extent_end = (extent_end + root->sectorsize - 1) &
+				~((u64)root->sectorsize -1 );
+		}
+		last_offset = extent_end;
+		path->slots[0]++;
+	}
+	if (last_offset < inode->i_size) {
+		WARN_ON(1);
+		btrfs_print_leaf(root, leaf);
+		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
+		       last_offset, inode->i_size);
+		err = 1;
+
+	}
+out:
+	btrfs_free_path(path);
+	return err;
+#endif
+}
+
 /*
  * this is very complex, but the basic idea is to drop all extents
  * in the range start - end.  hint_block is filled in with a block number
@@ -436,6 +511,7 @@ next_slot:
 		slot = path->slots[0];
 		ret = 0;
 		btrfs_item_key_to_cpu(leaf, &key, slot);
+
 		if (key.offset >= end || key.objectid != inode->i_ino) {
 			goto out;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51fc06dc4a5..67005480e13 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -133,6 +133,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
 					       ins.offset);
+		btrfs_check_file(root, inode);
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
@@ -965,12 +966,18 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		u64 mask = root->sectorsize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
 		u64 block_end = attr->ia_size | mask;
+		u64 hole_start;
 		u64 hole_size;
 		u64 alloc_hint = 0;
 
 		if (attr->ia_size <= pos)
 			goto out;
 
+		if (pos != inode->i_size)
+			hole_start = pos + root->sectorsize;
+		else
+			hole_start = pos;
+
 		mutex_lock(&root->fs_info->fs_mutex);
 		err = btrfs_check_free_space(root, 1, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
@@ -980,19 +987,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
 		lock_extent(em_tree, pos, block_end, GFP_NOFS);
-		hole_size = (attr->ia_size - pos + mask) & ~mask;
+		hole_size = block_end - hole_start;
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_drop_extents(trans, root, inode,
-					 pos, pos + hole_size, pos,
+					 pos, block_end, pos,
 					 &alloc_hint);
 
 		if (alloc_hint != EXTENT_MAP_INLINE) {
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
-						       pos, 0, 0, hole_size);
+						       hole_start, 0, 0,
+						       hole_size);
+			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From d1310b2e0cd98eb1348553e69b73827b436dca7b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jan 2008 16:13:08 -0500
Subject: Btrfs: Split the extent_map code into two parts

There is now extent_map for mapping offsets in the file to disk and
extent_io for state tracking, IO submission and extent_buffers.

The new extent_map code shifts from [start,end] pairs to [start,len], and
pushes the locking out into the caller.  This allows a few performance
optimizations and is easier to use.

A number of extent_map usage bugs were fixed, mostly failures
to remove extent_map entries when changing the file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    2 +
 fs/btrfs/ctree.h       |   15 +-
 fs/btrfs/disk-io.c     |  123 +-
 fs/btrfs/extent-tree.c |   43 +-
 fs/btrfs/extent_io.c   | 3089 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h   |  193 +++
 fs/btrfs/extent_map.c  | 3119 ++----------------------------------------------
 fs/btrfs/extent_map.h  |  214 +---
 fs/btrfs/file.c        |   29 +-
 fs/btrfs/inode.c       |  175 +--
 fs/btrfs/super.c       |   10 +-
 fs/btrfs/transaction.c |    8 +-
 fs/btrfs/transaction.h |    2 +-
 14 files changed, 3612 insertions(+), 3413 deletions(-)
 create mode 100644 fs/btrfs/extent_io.c
 create mode 100644 fs/btrfs/extent_io.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 72cc3e16741..9c61609f371 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,8 @@ obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
+	   extent_io.o
 
 ifeq ($(CONFIG_FS_POSIX_ACL),y)
 btrfs-y += acl.o
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9fc99883c70..9a0647dc5a0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -20,6 +20,7 @@
 #define __BTRFS_I__
 
 #include "extent_map.h"
+#include "extent_io.h"
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -27,6 +28,7 @@ struct btrfs_inode {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key location;
 	struct extent_map_tree extent_tree;
+	struct extent_io_tree io_tree;
 	struct inode vfs_inode;
 
 	u64 ordered_trans;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9f2975b55c..6c65473e0fe 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
+#include "extent_io.h"
 #include "extent_map.h"
 
 struct btrfs_trans_handle;
@@ -314,11 +315,11 @@ struct btrfs_fs_info {
 	struct btrfs_root *tree_root;
 	struct radix_tree_root fs_roots_radix;
 
-	struct extent_map_tree free_space_cache;
-	struct extent_map_tree block_group_cache;
-	struct extent_map_tree pinned_extents;
-	struct extent_map_tree pending_del;
-	struct extent_map_tree extent_ins;
+	struct extent_io_tree free_space_cache;
+	struct extent_io_tree block_group_cache;
+	struct extent_io_tree pinned_extents;
+	struct extent_io_tree pending_del;
+	struct extent_io_tree extent_ins;
 
 	u64 generation;
 	u64 last_trans_committed;
@@ -956,7 +957,7 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 				  u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr);
@@ -1001,7 +1002,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      u64 owner_objectid, u64 owner_offset, int pin);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct extent_map_tree *unpin);
+			       struct extent_io_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5d1f9bca271..4c4ebea0b2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,14 +43,14 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 }
 #endif
 
-static struct extent_map_ops btree_extent_map_ops;
+static struct extent_io_ops btree_extent_io_ops;
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
-	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 				bytenr, blocksize, GFP_NOFS);
 	return eb;
 }
@@ -61,13 +61,13 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
 
-	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
 				 bytenr, blocksize, NULL, GFP_NOFS);
 	return eb;
 }
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 end,
+				    size_t page_offset, u64 start, u64 len,
 				    int create)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -75,7 +75,9 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	int ret;
 
 again:
-	em = lookup_extent_mapping(em_tree, start, end);
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	spin_unlock(&em_tree->lock);
 	if (em) {
 		goto out;
 	}
@@ -85,11 +87,14 @@ again:
 		goto out;
 	}
 	em->start = 0;
-	em->end = (i_size_read(inode) & ~((u64)PAGE_CACHE_SIZE -1)) - 1;
+	em->len = i_size_read(inode);
 	em->block_start = 0;
-	em->block_end = em->end;
 	em->bdev = inode->i_sb->s_bdev;
+
+	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
 	if (ret == -EEXIST) {
 		free_extent_map(em);
 		em = NULL;
@@ -175,13 +180,13 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 found_start;
 	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 	if (page->private == EXTENT_PAGE_PRIVATE)
 		goto out;
@@ -230,16 +235,16 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_write_full_page(tree, page, btree_get_extent, wbc);
 }
 
 static int btree_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
@@ -264,18 +269,20 @@ static int btree_writepages(struct address_space *mapping,
 
 int btree_readpage(struct file *file, struct page *page)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_read_full_page(tree, page, btree_get_extent);
 }
 
 static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
+	struct extent_map_tree *map;
 	int ret;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(tree, page);
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	map = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(map, tree, page);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
@@ -286,8 +293,8 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 static void btree_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 }
@@ -331,7 +338,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return 0;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 				 buf, 0, 0);
 	free_extent_buffer(buf);
 	return ret;
@@ -342,40 +349,39 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_map_tree *extent_tree;
+	struct extent_io_tree *io_tree;
 	u64 end;
 	int ret;
 
-	extent_tree = &BTRFS_I(btree_inode)->extent_tree;
+	io_tree = &BTRFS_I(btree_inode)->io_tree;
 
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
-				 buf, 0, 1);
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1);
 
 	if (buf->flags & EXTENT_CSUM)
 		return buf;
 
 	end = buf->start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
+	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
 		return buf;
 	}
 
-	lock_extent(extent_tree, buf->start, end, GFP_NOFS);
+	lock_extent(io_tree, buf->start, end, GFP_NOFS);
 
-	if (test_range_bit(extent_tree, buf->start, end, EXTENT_CSUM, 1)) {
+	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
 		goto out_unlock;
 	}
 
 	ret = csum_tree_block(root, buf, 1);
-	set_extent_bits(extent_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
+	set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
 
 out_unlock:
-	unlock_extent(extent_tree, buf->start, end, GFP_NOFS);
+	unlock_extent(io_tree, buf->start, end, GFP_NOFS);
 	return buf;
 }
 
@@ -385,7 +391,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid)
-		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree,
+		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	return 0;
 }
@@ -394,7 +400,7 @@ int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->extent_tree,
+	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree,
 					buf);
 	return 0;
 }
@@ -659,20 +665,23 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
-	BTRFS_I(fs_info->btree_inode)->extent_tree.ops = &btree_extent_map_ops;
+	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
+			     GFP_NOFS);
+
+	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
-	extent_map_tree_init(&fs_info->free_space_cache,
+	extent_io_tree_init(&fs_info->free_space_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->block_group_cache,
+	extent_io_tree_init(&fs_info->block_group_cache,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->pinned_extents,
+	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->pending_del,
+	extent_io_tree_init(&fs_info->pending_del,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_map_tree_init(&fs_info->extent_ins,
+	extent_io_tree_init(&fs_info->extent_ins,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
@@ -787,7 +796,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, super);
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
 	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
 				     super->start, super->len);
 	if (!btrfs_test_opt(root, NOBARRIER))
@@ -864,12 +873,12 @@ int close_ctree(struct btrfs_root *root)
 
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
-	extent_map_tree_empty_lru(&fs_info->free_space_cache);
-	extent_map_tree_empty_lru(&fs_info->block_group_cache);
-	extent_map_tree_empty_lru(&fs_info->pinned_extents);
-	extent_map_tree_empty_lru(&fs_info->pending_del);
-	extent_map_tree_empty_lru(&fs_info->extent_ins);
-	extent_map_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+	extent_io_tree_empty_lru(&fs_info->free_space_cache);
+	extent_io_tree_empty_lru(&fs_info->block_group_cache);
+	extent_io_tree_empty_lru(&fs_info->pinned_extents);
+	extent_io_tree_empty_lru(&fs_info->pending_del);
+	extent_io_tree_empty_lru(&fs_info->extent_ins);
+	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
@@ -892,13 +901,13 @@ int close_ctree(struct btrfs_root *root)
 int btrfs_buffer_uptodate(struct extent_buffer *buf)
 {
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree, buf);
+	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->extent_tree,
+	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 }
 
@@ -914,7 +923,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			transid, root->fs_info->generation);
 		WARN_ON(1);
 	}
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->extent_tree, buf);
+	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
 }
 
 void btrfs_throttle(struct btrfs_root *root)
@@ -941,7 +950,7 @@ void btrfs_set_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
 }
 
@@ -949,7 +958,7 @@ void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->extent_tree, buf->start,
+	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
 			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
 			GFP_NOFS);
 }
@@ -958,7 +967,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
 }
 
@@ -966,7 +975,7 @@ int btrfs_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->extent_tree,
+	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG_DONE, 0);
 }
@@ -975,7 +984,7 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG_DONE, GFP_NOFS);
 }
@@ -984,7 +993,7 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->extent_tree,
+	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
 		     buf->start, buf->start + buf->len - 1,
 		     EXTENT_DEFRAG, GFP_NOFS);
 }
@@ -993,10 +1002,10 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
+	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 					buf, 0, 1);
 }
 
-static struct extent_map_ops btree_extent_map_ops = {
+static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b69a46691a9..1cf125ab782 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -63,7 +63,7 @@ static int cache_block_group(struct btrfs_root *root,
 	int ret;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct extent_map_tree *free_space_cache;
+	struct extent_io_tree *free_space_cache;
 	int slot;
 	u64 last = 0;
 	u64 hole_size;
@@ -158,7 +158,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *block_group = NULL;
 	u64 ptr;
 	u64 start;
@@ -281,7 +281,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
@@ -951,7 +951,7 @@ fail:
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *cache;
 	int ret;
 	int err = 0;
@@ -1107,12 +1107,12 @@ static int update_pinned_extents(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
+int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 {
 	u64 last = 0;
 	u64 start;
 	u64 end;
-	struct extent_map_tree *pinned_extents = &root->fs_info->pinned_extents;
+	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
 	while(1) {
@@ -1128,12 +1128,12 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy)
 
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       struct extent_map_tree *unpin)
+			       struct extent_io_tree *unpin)
 {
 	u64 start;
 	u64 end;
 	int ret;
-	struct extent_map_tree *free_space_cache;
+	struct extent_io_tree *free_space_cache;
 	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
@@ -1329,8 +1329,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	int err = 0;
 	u64 start;
 	u64 end;
-	struct extent_map_tree *pending_del;
-	struct extent_map_tree *pinned_extents;
+	struct extent_io_tree *pending_del;
+	struct extent_io_tree *pinned_extents;
 
 	pending_del = &extent_root->fs_info->pending_del;
 	pinned_extents = &extent_root->fs_info->pinned_extents;
@@ -1802,7 +1802,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
-	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->extent_tree,
+	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree,
 			buf->start, buf->start + buf->len - 1,
 			EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
@@ -2166,7 +2166,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	unsigned long i;
 	struct page *page;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2195,15 +2195,14 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 		delalloc_start = page_start;
-		existing_delalloc =
-			count_range_bits(&BTRFS_I(inode)->extent_tree,
-					 &delalloc_start, page_end,
-					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
+		existing_delalloc = count_range_bits(io_tree,
+					     &delalloc_start, page_end,
+					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 
-		set_extent_delalloc(em_tree, page_start,
+		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -2211,7 +2210,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 						 existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
 
-		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2379,7 +2378,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	u64 cur_byte;
 	u64 total_found;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -2561,7 +2560,7 @@ int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_block_group_item *item;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int ret;
@@ -2645,7 +2644,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_map_tree *block_group_cache;
+	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
new file mode 100644
index 00000000000..15cc158a049
--- /dev/null
+++ b/fs/btrfs/extent_io.c
@@ -0,0 +1,3089 @@
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/version.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "extent_io.h"
+#include "extent_map.h"
+
+/* temporary define until extent_map moves out of btrfs */
+struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+				       unsigned long extra_flags,
+				       void (*ctor)(void *, struct kmem_cache *,
+						    unsigned long));
+
+static struct kmem_cache *extent_state_cache;
+static struct kmem_cache *extent_buffer_cache;
+
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
+static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
+#define BUFFER_LRU_MAX 64
+
+struct tree_entry {
+	u64 start;		/* lowest offset covered by this node */
+	u64 end;		/* highest offset covered, inclusive */
+	int in_tree;		/* set to 1 by tree_insert() once linked */
+	struct rb_node rb_node;	/* linkage used by the rb-tree helpers below */
+};
+
+struct extent_page_data {	/* NOTE(review): users are outside this chunk */
+	struct bio *bio;
+	struct extent_io_tree *tree;
+	get_extent_t *get_extent;
+};
+
+/*
+ * Create the extent_state and extent_buffer kmem caches; -ENOMEM on failure.
+ */
+int __init extent_io_init(void)
+{
+	extent_state_cache = btrfs_cache_create("extent_state",
+					sizeof(struct extent_state), 0, NULL);
+	if (!extent_state_cache)
+		return -ENOMEM;
+
+	extent_buffer_cache = btrfs_cache_create("extent_buffers",
+					sizeof(struct extent_buffer), 0, NULL);
+	if (extent_buffer_cache)
+		return 0;
+
+	/* second cache failed: undo the first before reporting the error */
+	kmem_cache_destroy(extent_state_cache);
+	return -ENOMEM;
+}
+
+/* Report and free any leaked extent_state structs, then destroy both caches. */
+void extent_io_exit(void)
+{
+	struct extent_state *leaked;
+
+	while (!list_empty(&states)) {
+		leaked = list_entry(states.next, struct extent_state, list);
+		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", leaked->start, leaked->end, leaked->state, leaked->in_tree, atomic_read(&leaked->refs));
+		list_del(&leaked->list);
+		kmem_cache_free(extent_state_cache, leaked);
+	}
+
+	if (extent_state_cache)
+		kmem_cache_destroy(extent_state_cache);
+	if (extent_buffer_cache)
+		kmem_cache_destroy(extent_buffer_cache);
+}
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			  struct address_space *mapping, gfp_t mask)
+{	/* NOTE: 'mask' is accepted but not used */
+	tree->state.rb_node = NULL;	/* empty extent_state tree */
+	tree->ops = NULL;		/* no hooks by default */
+	tree->dirty_bytes = 0;
+	rwlock_init(&tree->lock);
+	spin_lock_init(&tree->lru_lock);
+	tree->mapping = mapping;
+	INIT_LIST_HEAD(&tree->buffer_lru);	/* empty extent_buffer LRU */
+	tree->lru_size = 0;
+}
+EXPORT_SYMBOL(extent_io_tree_init);
+
+/* Unlink each extent_buffer on the tree's LRU and free_extent_buffer() it. */
+void extent_io_tree_empty_lru(struct extent_io_tree *tree)
+{
+	struct list_head *head = &tree->buffer_lru;
+	while (!list_empty(head)) {
+		struct list_head *pos = head->next;
+		list_del_init(pos);
+		free_extent_buffer(list_entry(pos, struct extent_buffer, lru));
+	}
+}
+EXPORT_SYMBOL(extent_io_tree_empty_lru);
+
+/* Allocate an extent_state with one reference and track it on 'states'. */
+struct extent_state *alloc_extent_state(gfp_t mask)
+{
+	struct extent_state *es = kmem_cache_alloc(extent_state_cache, mask);
+	unsigned long irq_flags;
+
+	if (!es || IS_ERR(es))
+		return es;
+	es->state = 0;
+	es->in_tree = 0;
+	es->private = 0;
+	atomic_set(&es->refs, 1);
+	init_waitqueue_head(&es->wq);
+
+	/* every live state sits on the global list so leaks can be reported */
+	spin_lock_irqsave(&state_lock, irq_flags);
+	list_add(&es->list, &states);
+	spin_unlock_irqrestore(&state_lock, irq_flags);
+	return es;
+}
+EXPORT_SYMBOL(alloc_extent_state);
+
+/* Drop one reference on @state (NULL is ignored); free it on the last put. */
+void free_extent_state(struct extent_state *state)
+{
+	unsigned long irq_flags;
+
+	if (!state || !atomic_dec_and_test(&state->refs))
+		return;
+	WARN_ON(state->in_tree);
+	spin_lock_irqsave(&state_lock, irq_flags);
+	list_del(&state->list);
+	spin_unlock_irqrestore(&state_lock, irq_flags);
+	kmem_cache_free(extent_state_cache, state);
+}
+EXPORT_SYMBOL(free_extent_state);
+
+/*
+ * Insert @node at @offset; returns NULL, or the node already covering @offset.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
+				   struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *entry;
+
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+		if (offset >= entry->start && offset <= entry->end)
+			return parent;
+		link = offset < entry->start ? &parent->rb_left :
+					       &parent->rb_right;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry->in_tree = 1;
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/*
+ * Find the node containing @offset.  On a miss return NULL and, despite the
+ * names, set *prev_ret to the node after @offset, *next_ret to the one before.
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+				     struct rb_node **prev_ret,
+				     struct rb_node **next_ret)
+{
+	struct rb_node *cur = root->rb_node;
+	struct rb_node *last = NULL;
+	struct rb_node *walk;
+	struct tree_entry *entry;
+
+	while (cur) {
+		entry = rb_entry(cur, struct tree_entry, rb_node);
+		last = cur;
+		if (offset < entry->start)
+			cur = cur->rb_left;
+		else if (offset > entry->end)
+			cur = cur->rb_right;
+		else
+			return cur;
+	}
+
+	if (prev_ret) {
+		for (walk = last; walk; walk = rb_next(walk)) {
+			entry = rb_entry(walk, struct tree_entry, rb_node);
+			if (offset <= entry->end)
+				break;
+		}
+		*prev_ret = walk;
+	}
+
+	if (next_ret) {
+		for (walk = last; walk; walk = rb_prev(walk)) {
+			entry = rb_entry(walk, struct tree_entry, rb_node);
+			if (offset >= entry->start)
+				break;
+		}
+		*next_ret = walk;
+	}
+	return NULL;
+}
+
+/* Node containing @offset, else the first node ending after it, else NULL. */
+static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+{
+	struct rb_node *after = NULL;
+	struct rb_node *hit;
+
+	hit = __tree_search(root, offset, &after, NULL);
+	return hit ? hit : after;
+}
+
+/*
+ * utility function to merge 'state' with its immediate neighbours when they
+ * are exactly adjacent and carry identical state bits.  States with any of
+ * EXTENT_IOBITS set are not merged because the end_io handlers need to do
+ * operations on them without sleeping (or doing allocations/splits).
+ * 'state' itself is removed and released when folded into the next extent.
+ *
+ * This should be called with the tree lock held.
+ */
+static int merge_state(struct extent_io_tree *tree,
+		       struct extent_state *state)
+{
+	struct extent_state *other;
+	struct rb_node *other_node;
+
+	if (state->state & EXTENT_IOBITS)
+		return 0;
+
+	other_node = rb_prev(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->end == state->start - 1 &&
+		    other->state == state->state) {
+			state->start = other->start;
+			other->in_tree = 0;
+			rb_erase(&other->rb_node, &tree->state);
+			free_extent_state(other);
+		}
+	}
+	other_node = rb_next(&state->rb_node);
+	if (other_node) {
+		other = rb_entry(other_node, struct extent_state, rb_node);
+		if (other->start == state->end + 1 &&
+		    other->state == state->state) {
+			other->start = state->start;
+			state->in_tree = 0;
+			rb_erase(&state->rb_node, &tree->state);
+			free_extent_state(state);
+		}
+	}
+	return 0;
+}
+
+/*
+ * insert an extent_state struct covering [start, end] into the tree.
+ * 'bits' are set on the struct before it is inserted.
+ *
+ * This may return -EEXIST if the extent is already there, in which case the
+ * state struct is freed and the tree's dirty byte count is left untouched.
+ *
+ * The tree lock is not taken internally.  This is a utility function and
+ * probably isn't what you want to call (see set/clear_extent_bit).
+ */
+static int insert_state(struct extent_io_tree *tree,
+			struct extent_state *state, u64 start, u64 end,
+			int bits)
+{
+	struct rb_node *node;
+
+	if (end < start) {
+		printk("end < start %Lu %Lu\n", end, start);
+		WARN_ON(1);
+	}
+	state->state |= bits;
+	state->start = start;
+	state->end = end;
+	node = tree_insert(&tree->state, end, &state->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		free_extent_state(state);
+		return -EEXIST;
+	}
+	if (bits & EXTENT_DIRTY)
+		tree->dirty_bytes += end - start + 1;
+	merge_state(tree, state);
+	return 0;
+}
+
+/*
+ * split a given extent state struct in two, inserting the preallocated
+ * struct 'prealloc' as the newly created first half.  'split' indicates an
+ * offset inside 'orig' where it should be split.
+ *
+ * Before calling,
+ * the tree has 'orig' at [orig->start, orig->end].  After calling, there
+ * are two extent state structs in the tree, both carrying orig's state bits:
+ * prealloc: [orig->start, split - 1]
+ * orig: [ split, orig->end ]
+ *
+ * The tree locks are not taken by this function. They need to be held
+ * by the caller.  Returns 0, or -EEXIST (releasing 'prealloc') on a collision.
+ */
+static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
+		       struct extent_state *prealloc, u64 split)
+{
+	struct rb_node *node;
+	prealloc->start = orig->start;
+	prealloc->end = split - 1;
+	prealloc->state = orig->state;
+	orig->start = split;
+
+	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+	if (node) {
+		struct extent_state *found;
+		found = rb_entry(node, struct extent_state, rb_node);
+		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
+		free_extent_state(prealloc);
+		return -EEXIST;
+	}
+	return 0;
+}
+
+/*
+ * utility function to clear some bits in an extent state struct.
+ * it will optionally wake up anyone waiting on this state (wake == 1), or
+ * forcibly remove the state from the tree (delete == 1).  Returns the
+ * subset of 'bits' that was set beforehand.
+ *
+ * If no bits remain set, the struct is removed from the tree and released.
+ */
+static int clear_state_bit(struct extent_io_tree *tree,
+			    struct extent_state *state, int bits, int wake,
+			    int delete)
+{
+	int ret = state->state & bits;
+	/* a dirty state losing EXTENT_DIRTY shrinks tree->dirty_bytes */
+	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+		u64 range = state->end - state->start + 1;
+		WARN_ON(range > tree->dirty_bytes);
+		tree->dirty_bytes -= range;
+	}
+	state->state &= ~bits;
+	if (wake)
+		wake_up(&state->wq);
+	if (delete || state->state == 0) {
+		if (state->in_tree) {
+			rb_erase(&state->rb_node, &tree->state);
+			state->in_tree = 0;
+			free_extent_state(state);
+		} else {
+			WARN_ON(1);
+		}
+	} else {
+		merge_state(tree, state);
+	}
+	return ret;
+}
+
+/*
+ * clear some bits on a range in the tree.  This may require splitting
+ * extents, which needs a spare extent_state: it is allocated up front
+ * with 'mask' when that allows sleeping, otherwise atomically on demand.
+ *
+ * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
+ * the given range from the tree regardless of state (ie for truncate).
+ *
+ * the range [start, end] is inclusive.
+ *
+ * This takes the tree lock, and returns -ENOMEM if an extent_state cannot
+ * be allocated, > 0 if any of the bits were already set, or zero if none were.
+ */
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	unsigned long flags;
+	int err;
+	int set = 0;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irqsave(&tree->lock, flags);
+	/* this search will find the extents that end after our range starts */
+	node = tree_search(&tree->state, start);
+	if (!node)
+		goto out;
+	state = rb_entry(node, struct extent_state, rb_node);
+	if (state->start > end)
+		goto out;
+	WARN_ON(state->end < start);
+
+	/*
+	 * A split consumes 'prealloc'.  Non-sleeping callers arrive without
+	 * one, so get it atomically instead of passing NULL to split_state().
+	 */
+	if (!prealloc && (state->start < start || state->end > end)) {
+		prealloc = alloc_extent_state(GFP_ATOMIC);
+		if (!prealloc) {
+			set = -ENOMEM;
+			goto out;
+		}
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 *  | state | or
+	 *  | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on the
+	 * second half.  If the extent extends past our range, we just split
+	 * and search again; it'll get split again the next time through.
+	 * If the extent is inside our range, we clear the desired bit on it.
+	 */
+	if (state->start < start) {
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			start = state->end + 1;
+			set |= clear_state_bit(tree, state, bits,
+					wake, delete);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and clear the bit on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+		/* sleepers queued on the original struct, which is 'state' */
+		if (wake)
+			wake_up(&state->wq);
+		set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+		prealloc = NULL;
+		goto out;
+	}
+
+	start = state->end + 1;
+	set |= clear_state_bit(tree, state, bits, wake, delete);
+	goto search_again;
+
+out:
+	write_unlock_irqrestore(&tree->lock, flags);
+	free_extent_state(prealloc);
+
+	return set;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(clear_extent_bit);
+
+static int wait_on_state(struct extent_io_tree *tree,
+			 struct extent_state *state)
+{
+	DEFINE_WAIT(wait);
+	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
+	read_unlock_irq(&tree->lock);	/* drop the tree lock while sleeping */
+	schedule();			/* sleeps until state->wq is woken */
+	read_lock_irq(&tree->lock);	/* caller continues under the lock */
+	finish_wait(&state->wq, &wait);
+	return 0;
+}
+
+/*
+ * waits for one or more bits to clear on a range in the state tree.
+ * The range [start, end] is inclusive.  Always returns 0.
+ * The tree lock is taken by this function and dropped while sleeping.
+ */
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+
+	read_lock_irq(&tree->lock);
+again:
+	while (1) {
+		/*
+		 * this search will find all the extents that end after
+		 * our range starts
+		 */
+		node = tree_search(&tree->state, start);
+		if (!node)
+			break;
+
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		if (state->start > end)
+			goto out;
+		/* pin the state: wait_on_state() drops the tree lock */
+		if (state->state & bits) {
+			start = state->start;
+			atomic_inc(&state->refs);
+			wait_on_state(tree, state);
+			free_extent_state(state);
+			goto again;
+		}
+		start = state->end + 1;
+
+		if (start > end)
+			break;
+
+		if (need_resched()) {
+			read_unlock_irq(&tree->lock);
+			cond_resched();
+			read_lock_irq(&tree->lock);
+		}
+	}
+out:
+	read_unlock_irq(&tree->lock);
+	return 0;
+}
+EXPORT_SYMBOL(wait_extent_bit);
+
+/*
+ * OR 'bits' into state->state.  If EXTENT_DIRTY is requested and the
+ * state was not dirty yet, the byte length of the state is added to
+ * tree->dirty_bytes first.
+ */
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   int bits)
+{
+	int newly_dirty = (bits & EXTENT_DIRTY) &&
+			  !(state->state & EXTENT_DIRTY);
+
+	if (newly_dirty)
+		tree->dirty_bytes += state->end - state->start + 1;
+
+	state->state |= bits;
+}
+
+/*
+ * set some bits on a range in the tree.  This may require allocations
+ * or sleeping, so the gfp mask is used to indicate what is allowed.
+ *
+ * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
+ * range already has the desired bits set.  The start of the existing
+ * range is returned in failed_start in this case.
+ *
+ * [start, end] is inclusive
+ * This takes the tree lock.
+ *
+ * Each pass handles the first state found at or after 'start', advances
+ * 'start', drops the lock and searches again until start > end.
+ *
+ * Returns 0 on success, -ENOMEM if the up-front extent_state allocation
+ * fails, -EEXIST in the exclusive case described above, or the error
+ * reported by insert_state()/split_state() on the paths that check it.
+ *
+ * NOTE(review): a spare extent_state is only preallocated when 'mask'
+ * contains __GFP_WAIT; otherwise 'prealloc' is NULL when it is handed to
+ * insert_state()/split_state() -- confirm callers without __GFP_WAIT
+ * never reach those paths.
+ */
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+		   int exclusive, u64 *failed_start, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	unsigned long flags;
+	int err = 0;
+	int set;
+	u64 last_start;
+	u64 last_end;	/* assigned below but not read in this function */
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	write_lock_irqsave(&tree->lock, flags);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, start);
+	if (!node) {
+		/* nothing at or after 'start': one new state covers it all */
+		err = insert_state(tree, prealloc, start, end, bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+
+	state = rb_entry(node, struct extent_state, rb_node);
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		set = state->state & bits;
+		if (set && exclusive) {
+			*failed_start = state->start;
+			err = -EEXIST;
+			goto out;
+		}
+		set_state_bits(tree, state, bits);
+		start = state->end + 1;
+		merge_state(tree, state);
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, bits);
+			start = state->end + 1;
+			merge_state(tree, state);
+		} else {
+			start = state->start;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		/* fill the hole up to 'end' or to just before the state */
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start -1;
+		err = insert_state(tree, prealloc, start, this_end,
+				   bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		if (err)
+			goto out;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		set = state->state & bits;
+		if (exclusive && set) {
+			*failed_start = start;
+			err = -EEXIST;
+			goto out;
+		}
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		/*
+		 * the bits go on 'prealloc' here, not on 'state'; errors
+		 * other than -EEXIST from split_state() are not checked
+		 * on this path
+		 */
+		set_state_bits(tree, prealloc, bits);
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	write_unlock_irqrestore(&tree->lock, flags);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+EXPORT_SYMBOL(set_extent_bit);
+
+/* wrappers around set/clear extent bit */
+/* set EXTENT_DIRTY on [start, end], non-exclusively */
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int bits = EXTENT_DIRTY;
+
+	return set_extent_bit(tree, start, end, bits, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_dirty);
+
+/* set caller-chosen 'bits' on [start, end], non-exclusively */
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask)
+{
+	const int exclusive = 0;
+
+	/* failed_start is only written for exclusive requests */
+	return set_extent_bit(tree, start, end, bits, exclusive, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_bits);
+
+/* clear caller-chosen 'bits' on [start, end]; both flag arguments are 0 */
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask)
+{
+	int ret;
+
+	ret = clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+	return ret;
+}
+EXPORT_SYMBOL(clear_extent_bits);
+
+/* set EXTENT_DELALLOC and EXTENT_DIRTY together on [start, end] */
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int bits = EXTENT_DELALLOC | EXTENT_DIRTY;
+
+	return set_extent_bit(tree, start, end, bits, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_delalloc);
+
+/* clear EXTENT_DIRTY and EXTENT_DELALLOC together on [start, end] */
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	const int bits = EXTENT_DIRTY | EXTENT_DELALLOC;
+
+	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_dirty);
+
+/* set EXTENT_NEW on [start, end], non-exclusively */
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask)
+{
+	const int bits = EXTENT_NEW;
+
+	return set_extent_bit(tree, start, end, bits, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_new);
+
+/* clear EXTENT_NEW on [start, end]; both flag arguments are 0 */
+int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	const int bits = EXTENT_NEW;
+
+	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_new);
+
+/* set EXTENT_UPTODATE on [start, end], non-exclusively */
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			gfp_t mask)
+{
+	const int bits = EXTENT_UPTODATE;
+
+	return set_extent_bit(tree, start, end, bits, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_uptodate);
+
+/* clear EXTENT_UPTODATE on [start, end]; both flag arguments are 0 */
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  gfp_t mask)
+{
+	const int bits = EXTENT_UPTODATE;
+
+	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_uptodate);
+
+/* set EXTENT_WRITEBACK on [start, end], non-exclusively */
+int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	const int bits = EXTENT_WRITEBACK;
+
+	return set_extent_bit(tree, start, end, bits, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_writeback);
+
+/*
+ * clear EXTENT_WRITEBACK on [start, end].  The first flag argument is 1
+ * here (presumably "wake waiters" -- confirm against clear_extent_bit()).
+ */
+int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+			   gfp_t mask)
+{
+	const int bits = EXTENT_WRITEBACK;
+
+	return clear_extent_bit(tree, start, end, bits, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_writeback);
+
+/* wait until EXTENT_WRITEBACK is clear everywhere in [start, end] */
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	const int bits = EXTENT_WRITEBACK;
+
+	return wait_extent_bit(tree, start, end, bits);
+}
+EXPORT_SYMBOL(wait_on_extent_writeback);
+
+/*
+ * Take EXTENT_LOCKED on the inclusive range [start, end].
+ *
+ * The bit is requested exclusively.  When part of the range is already
+ * locked and 'mask' allows waiting (__GFP_WAIT), we wait for the lock bit
+ * to clear from the reported conflict point up to 'end' and retry from
+ * there, so the range is locked in ascending order.  Without __GFP_WAIT
+ * the -EEXIST from set_extent_bit() is returned to the caller.
+ */
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+	u64 failed_start;
+	int err;
+
+	for (;;) {
+		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+				     &failed_start, mask);
+		/* done, failed for another reason, or not allowed to wait */
+		if (err != -EEXIST || !(mask & __GFP_WAIT))
+			break;
+
+		wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
+		start = failed_start;
+		WARN_ON(start > end);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_extent);
+
+/*
+ * clear EXTENT_LOCKED on [start, end].  The first flag argument is 1
+ * here (presumably "wake waiters" -- confirm against clear_extent_bit()).
+ */
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		  gfp_t mask)
+{
+	const int bits = EXTENT_LOCKED;
+
+	return clear_extent_bit(tree, start, end, bits, 1, 0, mask);
+}
+EXPORT_SYMBOL(unlock_extent);
+
+/*
+ * Mark every page cache page that overlaps [start, end] dirty, then set
+ * EXTENT_DIRTY on the range in the tree.  Each page must already be in
+ * tree->mapping (BUG otherwise).  Always returns 0.
+ */
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		__set_page_dirty_nobuffers(page);
+		/* drop the reference find_get_page() handed us */
+		page_cache_release(page);
+	}
+	set_extent_dirty(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_dirty);
+
+/*
+ * Put every page cache page that overlaps [start, end] under writeback,
+ * then set EXTENT_WRITEBACK on the range in the tree.  Each page must
+ * already be in tree->mapping (BUG otherwise).  Always returns 0.
+ */
+int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		BUG_ON(!page);
+		set_page_writeback(page);
+		/* drop the reference find_get_page() handed us */
+		page_cache_release(page);
+	}
+	set_extent_writeback(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(set_range_writeback);
+
+/*
+ * Find the first state ending at or after 'start' that has any of 'bits'
+ * set.  On a hit its range is stored in *start_ret / *end_ret and 0 is
+ * returned; if nothing matches, 1 is returned and the outputs are left
+ * untouched.
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+	int ret = 1;
+
+	read_lock_irq(&tree->lock);
+
+	/* the search returns the first extent that ends after 'start' */
+	node = tree_search(&tree->state, start);
+	if (node && !IS_ERR(node)) {
+		for (; node; node = rb_next(node)) {
+			state = rb_entry(node, struct extent_state, rb_node);
+			if (state->end < start || !(state->state & bits))
+				continue;
+
+			*start_ret = state->start;
+			*end_ret = state->end;
+			ret = 0;
+			break;
+		}
+	}
+
+	read_unlock_irq(&tree->lock);
+	return ret;
+}
+EXPORT_SYMBOL(find_first_extent_bit);
+
+/*
+ * Find a run of EXTENT_DELALLOC states beginning with the first state
+ * that ends at or after *start, and set EXTENT_LOCKED on each of them.
+ *
+ * On return *start holds the start of the first state locked and *end the
+ * end of the last one.  The return value is the number of states locked.
+ * When nothing is locked, *end is set to (u64)-1 if the search found no
+ * state at all, or to the end of the non-delalloc state that was found.
+ *
+ * The walk stops at the first state that is not delalloc, at the first
+ * state that does not start exactly where the previous one ended, or once
+ * the bytes counted so far reach max_bytes.
+ *
+ * If a state in the run already has EXTENT_LOCKED set, we sleep on its
+ * wait queue with the tree lock dropped and then search again.
+ */
+u64 find_lock_delalloc_range(struct extent_io_tree *tree,
+			     u64 *start, u64 *end, u64 max_bytes)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 found = 0;
+	u64 total_bytes = 0;
+
+	write_lock_irq(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+search_again:
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node)) {
+		*end = (u64)-1;
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		/* once something is locked, only accept adjacent states */
+		if (found && state->start != cur_start) {
+			goto out;
+		}
+		if (!(state->state & EXTENT_DELALLOC)) {
+			if (!found)
+				*end = state->end;
+			goto out;
+		}
+		if (!found) {
+			/*
+			 * first hit: step backwards over preceding states
+			 * that also have EXTENT_DELALLOC set, so the run
+			 * starts at the earliest of them
+			 */
+			struct extent_state *prev_state;
+			struct rb_node *prev_node = node;
+			while(1) {
+				prev_node = rb_prev(prev_node);
+				if (!prev_node)
+					break;
+				prev_state = rb_entry(prev_node,
+						      struct extent_state,
+						      rb_node);
+				if (!(prev_state->state & EXTENT_DELALLOC))
+					break;
+				state = prev_state;
+				node = prev_node;
+			}
+		}
+		if (state->state & EXTENT_LOCKED) {
+			DEFINE_WAIT(wait);
+			/* hold a reference on the state across the sleep */
+			atomic_inc(&state->refs);
+			prepare_to_wait(&state->wq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			write_unlock_irq(&tree->lock);
+			schedule();
+			write_lock_irq(&tree->lock);
+			finish_wait(&state->wq, &wait);
+			free_extent_state(state);
+			goto search_again;
+		}
+		state->state |= EXTENT_LOCKED;
+		if (!found)
+			*start = state->start;
+		found++;
+		*end = state->end;
+		cur_start = state->end + 1;
+		node = rb_next(node);
+		if (!node)
+			break;
+		/* counted only when there is a next node to look at */
+		total_bytes += state->end - state->start + 1;
+		if (total_bytes >= max_bytes)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return found;
+}
+
+/*
+ * Count the bytes in [*start, search_end] that are covered by states with
+ * any of 'bits' set.
+ *
+ * The walk stops as soon as the running total reaches max_bytes.  *start
+ * is updated to the start of the first matching state, unless that very
+ * state already pushed the total to max_bytes.
+ *
+ * Shortcut: when *start is 0 and bits is exactly EXTENT_DIRTY the cached
+ * tree->dirty_bytes counter is returned without walking the tree.
+ *
+ * Returns the byte count, or 0 (after a warning) if search_end <= *start.
+ */
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end, u64 max_bytes,
+		     unsigned long bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 cur_start = *start;
+	u64 total_bytes = 0;
+	int found = 0;
+
+	if (search_end <= cur_start) {
+		/* u64 is not unsigned long long everywhere: cast for %Lu */
+		printk("search_end %Lu start %Lu\n",
+		       (unsigned long long)search_end,
+		       (unsigned long long)cur_start);
+		WARN_ON(1);
+		return 0;
+	}
+
+	write_lock_irq(&tree->lock);
+	if (cur_start == 0 && bits == EXTENT_DIRTY) {
+		total_bytes = tree->dirty_bytes;
+		goto out;
+	}
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(&tree->state, cur_start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start > search_end)
+			break;
+		if (state->end >= cur_start && (state->state & bits)) {
+			/* only the part inside [cur_start, search_end] counts */
+			total_bytes += min(search_end, state->end) + 1 -
+				       max(cur_start, state->start);
+			if (total_bytes >= max_bytes)
+				break;
+			if (!found) {
+				*start = state->start;
+				found = 1;
+			}
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	write_unlock_irq(&tree->lock);
+	return total_bytes;
+}
+/*
+ * Lock the pages backing [start, end] and then the extent range itself.
+ * Pages are taken first, in ascending index order, via grab_cache_page().
+ *
+ * Returns 0 on success.  If a page cannot be obtained, the pages already
+ * taken are unlocked again and a negative errno is returned; the extent
+ * range is not locked in that case.
+ */
+int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long first = start >> PAGE_CACHE_SHIFT;
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long index;
+	unsigned long undo;
+	struct page *page;
+	int err;
+
+	for (index = first; index <= last; index++) {
+		page = grab_cache_page(tree->mapping, index);
+		if (!page) {
+			err = -ENOMEM;
+			goto failed;
+		}
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto failed;
+		}
+	}
+	lock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+
+failed:
+	/*
+	 * 'index' is the page we could not get; unwind everything before
+	 * it, leaving that page itself alone
+	 */
+	for (undo = first; undo < index; undo++) {
+		page = find_get_page(tree->mapping, undo);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	return err;
+}
+EXPORT_SYMBOL(lock_range);
+
+/*
+ * Unlock every page cache page that overlaps [start, end], then clear the
+ * lock on the extent range in the tree.  Always returns 0.
+ */
+int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	unsigned long last = end >> PAGE_CACHE_SHIFT;
+	unsigned long idx;
+	struct page *page;
+
+	for (idx = start >> PAGE_CACHE_SHIFT; idx <= last; idx++) {
+		page = find_get_page(tree->mapping, idx);
+		unlock_page(page);
+		/* drop the reference find_get_page() handed us */
+		page_cache_release(page);
+	}
+	unlock_extent(tree, start, end, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(unlock_range);
+
+/*
+ * Store 'private' in the state that begins exactly at 'start'.
+ * Returns 0 on success, or -ENOENT when the search finds nothing or the
+ * state found does not begin at 'start'.
+ */
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+	int ret = -ENOENT;
+
+	write_lock_irq(&tree->lock);
+
+	/* the search returns the first extent that ends after 'start' */
+	node = tree_search(&tree->state, start);
+	if (node && !IS_ERR(node)) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start == start) {
+			state->private = private;
+			ret = 0;
+		}
+	}
+
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+/*
+ * Fetch the private value of the state that begins exactly at 'start'
+ * into *private.  Returns 0 on success, or -ENOENT (leaving *private
+ * untouched) when the search finds nothing or the state found does not
+ * begin at 'start'.
+ */
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
+{
+	struct extent_state *state;
+	struct rb_node *node;
+	int ret = -ENOENT;
+
+	read_lock_irq(&tree->lock);
+
+	/* the search returns the first extent that ends after 'start' */
+	node = tree_search(&tree->state, start);
+	if (node && !IS_ERR(node)) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->start == start) {
+			*private = state->private;
+			ret = 0;
+		}
+	}
+
+	read_unlock_irq(&tree->lock);
+	return ret;
+}
+
+/*
+ * searches a range in the state tree for a given mask.
+ * If 'filled' == 1, this returns 1 only if every extent overlapping
+ * [start, end] has at least one of the bits set and the extents cover
+ * the whole range without holes.  Otherwise, 1 is returned if any bit
+ * in the range is found set.
+ *
+ * Returns 0 in all other cases, including when the search finds no
+ * extent at all.
+ */
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int filled)
+{
+	struct extent_state *state = NULL;
+	struct rb_node *node;
+	int bitset = 0;
+	unsigned long flags;
+
+	read_lock_irqsave(&tree->lock, flags);
+	node = tree_search(&tree->state, start);
+	while (node && start <= end) {
+		state = rb_entry(node, struct extent_state, rb_node);
+
+		/* a gap before this extent means the range is not filled */
+		if (filled && state->start > start) {
+			bitset = 0;
+			break;
+		}
+
+		if (state->start > end)
+			break;
+
+		if (state->state & bits) {
+			bitset = 1;
+			if (!filled)
+				break;
+		} else if (filled) {
+			bitset = 0;
+			break;
+		}
+		start = state->end + 1;
+		if (start > end)
+			break;
+		node = rb_next(node);
+		if (!node) {
+			/* ran out of extents before reaching 'end' */
+			if (filled)
+				bitset = 0;
+			break;
+		}
+	}
+	read_unlock_irqrestore(&tree->lock, flags);
+	return bitset;
+}
+EXPORT_SYMBOL(test_range_bit);
+
+/*
+ * Mark 'page' up to date when the whole byte range it covers is filled
+ * with EXTENT_UPTODATE in the tree.  Always returns 0.
+ */
+static int check_page_uptodate(struct extent_io_tree *tree,
+			       struct page *page)
+{
+	u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 last = first + PAGE_CACHE_SIZE - 1;
+	int all_uptodate;
+
+	all_uptodate = test_range_bit(tree, first, last, EXTENT_UPTODATE, 1);
+	if (all_uptodate)
+		SetPageUptodate(page);
+	return 0;
+}
+
+/*
+ * Unlock 'page' when no part of the byte range it covers still has
+ * EXTENT_LOCKED set in the tree.  Always returns 0.
+ */
+static int check_page_locked(struct extent_io_tree *tree,
+			     struct page *page)
+{
+	u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 last = first + PAGE_CACHE_SIZE - 1;
+	int any_locked;
+
+	any_locked = test_range_bit(tree, first, last, EXTENT_LOCKED, 0);
+	if (!any_locked)
+		unlock_page(page);
+	return 0;
+}
+
+/*
+ * End writeback on 'page' when no part of the byte range it covers still
+ * has EXTENT_WRITEBACK set in the tree.  Always returns 0.
+ */
+static int check_page_writeback(struct extent_io_tree *tree,
+			     struct page *page)
+{
+	u64 first = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 last = first + PAGE_CACHE_SIZE - 1;
+	int any_writeback;
+
+	any_writeback = test_range_bit(tree, first, last,
+				       EXTENT_WRITEBACK, 0);
+	if (!any_writeback)
+		end_page_writeback(page);
+	return 0;
+}
+
+/* lots and lots of room for performance fixes in the end_bio funcs */
+
+/*
+ * after a writepage IO is done, we need to:
+ * clear the uptodate bits on error
+ * clear the writeback bits in the extent tree for this IO
+ * end_page_writeback if the page has no more pending IO
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * The bio's vectors are processed from the last one back to the first.
+ * The extent tree is found through bio->bi_private.  On kernels up to
+ * 2.6.23 the handler returns 1 while bio->bi_size is still non-zero and
+ * 0 once the bio has been processed; on newer kernels it returns nothing.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_writepage(struct bio *bio, int err)
+#else
+static int end_bio_extent_writepage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* byte range in the file covered by this vector */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			 bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		/* step to the previous vector and prefetch its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (!uptodate) {
+			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		/* partial pages only finish once no writeback bit remains */
+		if (whole_page)
+			end_page_writeback(page);
+		else
+			check_page_writeback(tree, page);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start, end);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+/*
+ * after a readpage IO is done, we need to:
+ * clear the uptodate bits on error
+ * set the uptodate bits if things worked
+ * set the page up to date if all extents in the tree are uptodate
+ * clear the lock bit in the extent tree
+ * unlock the page if there are no other extents locked for it
+ *
+ * Scheduling is not allowed, so the extent state tree is expected
+ * to have one and only one object corresponding to this IO.
+ *
+ * The bio's vectors are processed from the last one back to the first.
+ * If the optional readpage_end_io_hook returns non-zero for a vector,
+ * that vector and all vectors processed after it are treated as failed,
+ * because 'uptodate' is cleared for the rest of the loop.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_readpage(struct bio *bio, int err)
+#else
+static int end_bio_extent_readpage(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+	int whole_page;
+	int ret;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* byte range in the file covered by this vector */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
+			whole_page = 1;
+		else
+			whole_page = 0;
+
+		/* step to the previous vector and prefetch its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+			ret = tree->ops->readpage_end_io_hook(page, start, end);
+			if (ret)
+				uptodate = 0;
+		}
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+			if (whole_page)
+				SetPageUptodate(page);
+			else
+				check_page_uptodate(tree, page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+		/* partial pages only unlock once no lock bit remains */
+		if (whole_page)
+			unlock_page(page);
+		else
+			check_page_locked(tree, page);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+/*
+ * IO done from prepare_write is pretty simple, we just unlock
+ * the structs in the extent tree when done, and set the uptodate bits
+ * as appropriate.
+ *
+ * Unlike the readpage handler, this one never unlocks the page or marks
+ * it up to date; on error it clears the page's uptodate flag and sets
+ * its error flag.  Vectors are processed from last to first.
+ */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_extent_preparewrite(struct bio *bio, int err)
+#else
+static int end_bio_extent_preparewrite(struct bio *bio,
+				       unsigned int bytes_done, int err)
+#endif
+{
+	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_io_tree *tree = bio->bi_private;
+	u64 start;
+	u64 end;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	do {
+		struct page *page = bvec->bv_page;
+		/* byte range in the file covered by this vector */
+		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+			bvec->bv_offset;
+		end = start + bvec->bv_len - 1;
+
+		/* step to the previous vector and prefetch its page flags */
+		if (--bvec >= bio->bi_io_vec)
+			prefetchw(&bvec->bv_page->flags);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		unlock_extent(tree, start, end, GFP_ATOMIC);
+
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
+/*
+ * Allocate a bio with room for nr_vecs vectors and point it at 'bdev' /
+ * 'first_sector'.
+ *
+ * If the allocation fails and the current task has PF_MEMALLOC set, the
+ * request is retried with the vector count halved each time until one
+ * succeeds or the count reaches zero.  Returns NULL when no bio could be
+ * allocated.
+ */
+static struct bio *
+extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
+		 gfp_t gfp_flags)
+{
+	struct bio *bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		do {
+			nr_vecs /= 2;
+			if (!nr_vecs)
+				break;
+			bio = bio_alloc(gfp_flags, nr_vecs);
+		} while (!bio);
+	}
+
+	if (!bio)
+		return NULL;
+
+	bio->bi_bdev = bdev;
+	bio->bi_sector = first_sector;
+	return bio;
+}
+
+/*
+ * Hand 'bio' to the block layer.
+ *
+ * A warning is printed (but the bio is still submitted) when its start
+ * sector lies beyond the size of the backing device's inode.
+ *
+ * Returns 0, or -EOPNOTSUPP if the bio is flagged BIO_EOPNOTSUPP after
+ * submit_bio() returns.
+ */
+static int submit_one_bio(int rw, struct bio *bio)
+{
+	u64 maxsector;
+	int ret = 0;
+
+	/* extra reference so the flags can still be read after submit_bio() */
+	bio_get(bio);
+
+	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+	if (maxsector < bio->bi_sector) {
+		/* u64 is not unsigned long long everywhere: cast for %Lu */
+		printk("sector too large max %Lu got %llu\n",
+		       (unsigned long long)maxsector,
+		       (unsigned long long)bio->bi_sector);
+		WARN_ON(1);
+	}
+
+	submit_bio(rw, bio);
+	if (bio_flagged(bio, BIO_EOPNOTSUPP))
+		ret = -EOPNOTSUPP;
+	bio_put(bio);
+	return ret;
+}
+
+/*
+ * Queue one page fragment for IO, merging it into a pending bio when
+ * possible.
+ *
+ * If *bio_ret holds a bio, the fragment is appended to it provided it is
+ * contiguous on disk with the bio's current end and bio_add_page() takes
+ * all of it.  Otherwise the pending bio is submitted and a new one is
+ * allocated for this fragment.  With a non-NULL bio_ret the new bio is
+ * handed back through *bio_ret for the caller to submit later; with a
+ * NULL bio_ret it is submitted right away.
+ *
+ * Returns 0 on success, -ENOMEM if no bio could be allocated (in which
+ * case *bio_ret, if given, is reset to NULL), or the result of
+ * submit_one_bio().
+ */
+static int submit_extent_page(int rw, struct extent_io_tree *tree,
+			      struct page *page, sector_t sector,
+			      size_t size, unsigned long offset,
+			      struct block_device *bdev,
+			      struct bio **bio_ret,
+			      unsigned long max_pages,
+			      bio_end_io_t end_io_func)
+{
+	int ret = 0;
+	struct bio *bio;
+	int nr;
+
+	if (bio_ret && *bio_ret) {
+		bio = *bio_ret;
+		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		    bio_add_page(bio, page, size, offset) < size) {
+			ret = submit_one_bio(rw, bio);
+			bio = NULL;
+		} else {
+			return 0;
+		}
+	}
+	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
+	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+	if (!bio) {
+		printk("failed to allocate bio nr %d\n", nr);
+		/*
+		 * Never touch a NULL bio.  Also forget any bio that was
+		 * just submitted above so the caller cannot submit it a
+		 * second time.
+		 */
+		if (bio_ret)
+			*bio_ret = NULL;
+		return -ENOMEM;
+	}
+	bio_add_page(bio, page, size, offset);
+	bio->bi_end_io = end_io_func;
+	bio->bi_private = tree;
+	if (bio_ret) {
+		*bio_ret = bio;
+	} else {
+		ret = submit_one_bio(rw, bio);
+	}
+
+	return ret;
+}
+
+/*
+ * Tag 'page' as managed by the extent code: set PagePrivate, store
+ * EXTENT_PAGE_PRIVATE in page->private and take an extra page reference.
+ * Pages that already have PagePrivate set are left alone.
+ */
+void set_page_extent_mapped(struct page *page)
+{
+	if (PagePrivate(page))
+		return;
+
+	SetPagePrivate(page);
+	WARN_ON(!page->mapping->a_ops->invalidatepage);
+	set_page_private(page, EXTENT_PAGE_PRIVATE);
+	page_cache_get(page);
+}
+
+/*
+ * Store EXTENT_PAGE_PRIVATE_FIRST_PAGE in page->private, with 'len'
+ * packed in above the two low bits.
+ */
+void set_page_extent_head(struct page *page, unsigned long len)
+{
+	unsigned long shifted_len = len << 2;
+
+	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | shifted_len);
+}
+
+/*
+ * basic readpage implementation.  Locked extent state structs are inserted
+ * into the tree that are removed when the IO is done (by the end_io
+ * handlers)
+ *
+ * The page's byte range is locked in the tree and then walked one extent
+ * mapping at a time.  Parts at or beyond i_size and holes are zero filled,
+ * marked uptodate and unlocked right here.  Parts that are already
+ * uptodate in the tree are just unlocked.  Everything else is handed to
+ * submit_extent_page(), accumulating into *bio.
+ *
+ * If nothing was handed to the IO path the page is unlocked here (and
+ * marked uptodate unless it has the error flag); otherwise unlocking is
+ * left to end_bio_extent_readpage().  Always returns 0; failures are
+ * reported through SetPageError().
+ */
+static int __extent_read_full_page(struct extent_io_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio)
+{
+	struct inode *inode = page->mapping->host;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 cur_end;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;	/* number of chunks that reached the IO path */
+	size_t page_offset = 0;
+	size_t iosize;
+	size_t blocksize = inode->i_sb->s_blocksize;
+
+	set_page_extent_mapped(page);
+
+	end = page_end;
+	lock_extent(tree, start, end, GFP_NOFS);
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			/* past i_size: zero the rest of the page and stop */
+			char *userpage;
+			iosize = PAGE_CACHE_SIZE - page_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			break;
+		}
+		em = get_extent(inode, page, page_offset, cur,
+				end - cur + 1, 0);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			unlock_extent(tree, cur, end, GFP_NOFS);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(extent_map_end(em) <= cur);
+		BUG_ON(end < cur);
+
+		/* clamp to the mapping and the page, then round up to blocks */
+		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		cur_end = min(extent_map_end(em) - 1, end);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* we've found a hole, just zero and go on */
+		if (block_start == EXTENT_MAP_HOLE) {
+			char *userpage;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + page_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+
+			set_extent_uptodate(tree, cur, cur + iosize - 1,
+					    GFP_NOFS);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		/* the get_extent function already copied into the page */
+		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		ret = 0;
+		if (tree->ops && tree->ops->readpage_io_hook) {
+			ret = tree->ops->readpage_io_hook(page, cur,
+							  cur + iosize - 1);
+		}
+		if (!ret) {
+			/*
+			 * this 'nr' shadows the outer counter; it is the
+			 * max_pages hint: pages from this one through the
+			 * page holding i_size
+			 */
+			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+			nr -= page->index;
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset,
+					 bdev, bio, nr,
+					 end_bio_extent_readpage);
+		}
+		if (ret)
+			SetPageError(page);
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+	if (!nr) {
+		/* no IO was queued for this page, so finish it here */
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	return 0;
+}
+
+/*
+ * Read one full page: run __extent_read_full_page() and then submit
+ * whatever bio it left pending.  Returns the helper's result.
+ */
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			    get_extent_t *get_extent)
+{
+	struct bio *pending = NULL;
+	int ret = __extent_read_full_page(tree, page, get_extent, &pending);
+
+	if (pending)
+		submit_one_bio(READ, pending);
+	return ret;
+}
+EXPORT_SYMBOL(extent_read_full_page);
+
+/*
+ * the writepage semantics are similar to regular writepage.  extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback.  Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ *
+ * 'data' is a struct extent_page_data carrying the tree, the get_extent
+ * callback and the bio being accumulated across calls.
+ *
+ * Steps, in order:
+ *  - pages entirely beyond i_size are cleaned in the tree and unlocked
+ *  - the tail of the page containing i_size is zeroed
+ *  - delalloc ranges from the page start onwards are found, passed to
+ *    ops->fill_delalloc() and then have LOCKED|DELALLOC cleared
+ *  - the page range is locked and walked one extent mapping at a time;
+ *    holes and inline extents are only cleaned, everything else is
+ *    marked writeback and queued through submit_extent_page()
+ *
+ * Always returns 0; failures are reported through SetPageError().
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+			      void *data)
+{
+	struct inode *inode = page->mapping->host;
+	struct extent_page_data *epd = data;
+	struct extent_io_tree *tree = epd->tree;
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 delalloc_start;
+	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 end;
+	u64 cur = start;
+	u64 extent_offset;
+	u64 last_byte = i_size_read(inode);
+	u64 block_start;
+	u64 iosize;
+	sector_t sector;
+	struct extent_map *em;
+	struct block_device *bdev;
+	int ret;
+	int nr = 0;	/* number of chunks that reached the IO path */
+	size_t page_offset = 0;
+	size_t blocksize;
+	loff_t i_size = i_size_read(inode);
+	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	u64 nr_delalloc;
+	u64 delalloc_end;
+
+	WARN_ON(!PageLocked(page));
+	if (page->index > end_index) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_page(page);
+		return 0;
+	}
+
+	if (page->index == end_index) {
+		char *userpage;
+
+		/* zero everything in the page from i_size onwards */
+		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
+
+		userpage = kmap_atomic(page, KM_USER0);
+		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap_atomic(userpage, KM_USER0);
+	}
+
+	set_page_extent_mapped(page);
+
+	delalloc_start = start;
+	delalloc_end = 0;
+	while(delalloc_end < page_end) {
+		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+						       &delalloc_end,
+						       128 * 1024 * 1024);
+		if (nr_delalloc == 0) {
+			/* nothing locked: skip past what the search saw */
+			delalloc_start = delalloc_end + 1;
+			continue;
+		}
+		/*
+		 * NOTE(review): tree->ops is dereferenced without the NULL
+		 * check used for the other hooks in this function
+		 */
+		tree->ops->fill_delalloc(inode, delalloc_start,
+					 delalloc_end);
+		clear_extent_bit(tree, delalloc_start,
+				 delalloc_end,
+				 EXTENT_LOCKED | EXTENT_DELALLOC,
+				 1, 0, GFP_NOFS);
+		delalloc_start = delalloc_end + 1;
+	}
+	lock_extent(tree, start, page_end, GFP_NOFS);
+
+	end = page_end;
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
+		printk("found delalloc bits after lock_extent\n");
+	}
+
+	if (last_byte <= start) {
+		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		goto done;
+	}
+
+	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
+	blocksize = inode->i_sb->s_blocksize;
+
+	while (cur <= end) {
+		if (cur >= last_byte) {
+			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			break;
+		}
+		em = epd->get_extent(inode, page, page_offset, cur,
+				     end - cur + 1, 1);
+		if (IS_ERR(em) || !em) {
+			SetPageError(page);
+			break;
+		}
+
+		extent_offset = cur - em->start;
+		BUG_ON(extent_map_end(em) <= cur);
+		BUG_ON(end < cur);
+		/* clamp to the mapping and the page, then round up to blocks */
+		iosize = min(extent_map_end(em) - cur, end - cur + 1);
+		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		sector = (em->block_start + extent_offset) >> 9;
+		bdev = em->bdev;
+		block_start = em->block_start;
+		free_extent_map(em);
+		em = NULL;
+
+		/* holes and inline extents are cleaned, no IO is issued */
+		if (block_start == EXTENT_MAP_HOLE ||
+		    block_start == EXTENT_MAP_INLINE) {
+			clear_extent_dirty(tree, cur,
+					   cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+
+		/* leave this out until we have a page_mkwrite call */
+		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
+				   EXTENT_DIRTY, 0)) {
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
+		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
+		if (tree->ops && tree->ops->writepage_io_hook) {
+			ret = tree->ops->writepage_io_hook(page, cur,
+						cur + iosize - 1);
+		} else {
+			ret = 0;
+		}
+		if (ret)
+			SetPageError(page);
+		else {
+			/* max_pages hint: pages up to the one holding i_size */
+			unsigned long max_nr = end_index + 1;
+			set_range_writeback(tree, cur, cur + iosize - 1);
+			if (!PageWriteback(page)) {
+				printk("warning page %lu not writeback, "
+				       "cur %llu end %llu\n", page->index,
+				       (unsigned long long)cur,
+				       (unsigned long long)end);
+			}
+
+			ret = submit_extent_page(WRITE, tree, page, sector,
+						 iosize, page_offset, bdev,
+						 &epd->bio, max_nr,
+						 end_bio_extent_writepage);
+			if (ret)
+				SetPageError(page);
+		}
+		cur = cur + iosize;
+		page_offset += iosize;
+		nr++;
+	}
+done:
+	if (nr == 0) {
+		/* make sure the mapping tag for page dirty gets cleared */
+		set_page_writeback(page);
+		end_page_writeback(page);
+	}
+	unlock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_page(page);
+	return 0;
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+
+/* Taken directly from 2.6.23 for 2.6.18 back port */
+typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
+                                void *data);
+
+/**
+ * write_cache_pages - walk the list of dirty pages of the given address space
+ * and write all of them.
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @writepage: function called for each page
+ * @data: data passed to writepage function
+ *
+ * If a page is already under I/O, write_cache_pages() skips it, even
+ * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
+ * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
+ * and msync() need to guarantee that all the data which was dirty at the time
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
+ *
+ * Returns the value of the most recent @writepage call, with
+ * AOP_WRITEPAGE_ACTIVATE mapped to 0, or 0 if @writepage was never called.
+ * Setting 'done' only stops the outer lookup loop: the remaining pages of
+ * the current pagevec are still processed and may overwrite 'ret'.
+ */
+static int write_cache_pages(struct address_space *mapping,
+		      struct writeback_control *wbc, writepage_t writepage,
+		      void *data)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int range_whole = 0;
+
+	/* non-blocking callers bail out before touching any page */
+	if (wbc->nonblocking && bdi_write_congested(bdi)) {
+		wbc->encountered_congestion = 1;
+		return 0;
+	}
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
+		/* an explicit range is never wrapped around, see below */
+		scanned = 1;
+	}
+retry:
+	while (!done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+			lock_page(page);
+
+			if (unlikely(page->mapping != mapping)) {
+				unlock_page(page);
+				continue;
+			}
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				unlock_page(page);
+				continue;
+			}
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			/*
+			 * skip pages still under writeback and pages for which
+			 * clear_page_dirty_for_io() returns 0
+			 */
+			if (PageWriteback(page) ||
+			    !clear_page_dirty_for_io(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			/* the page is passed to @writepage still locked */
+			ret = (*writepage)(page, wbc, data);
+
+			/*
+			 * AOP_WRITEPAGE_ACTIVATE is not treated as an error:
+			 * the page is unlocked here and ret is reset
+			 */
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+				unlock_page(page);
+				ret = 0;
+			}
+			if (ret || (--(wbc->nr_to_write) <= 0))
+				done = 1;
+			if (wbc->nonblocking && bdi_write_congested(bdi)) {
+				wbc->encountered_congestion = 1;
+				done = 1;
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	/* remember where to resume the next cyclic / whole-file pass */
+	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+		mapping->writeback_index = index;
+	return ret;
+}
+#endif
+
+/*
+ * Write @page through __extent_writepage(), then make one WB_SYNC_NONE
+ * write_cache_pages() pass over up to 64 dirty pages that follow it in the
+ * same mapping.  Both steps share a single extent_page_data, and whatever
+ * bio is still pending in it afterwards is submitted here.
+ *
+ * Returns the result of __extent_writepage() for @page; the result of the
+ * follow-up pass is ignored.
+ */
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc)
+{
+	/* sampled up front, before the page is handed to writepage */
+	struct address_space *mapping = page->mapping;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+	struct writeback_control trailing_wbc = {
+		.bdi		= wbc->bdi,
+		.sync_mode	= WB_SYNC_NONE,
+		.older_than_this = NULL,
+		.nr_to_write	= 64,
+		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
+		.range_end	= (loff_t)-1,
+	};
+	int page_ret;
+
+	page_ret = __extent_writepage(page, wbc, &epd);
+
+	write_cache_pages(mapping, &trailing_wbc, __extent_writepage, &epd);
+
+	if (epd.bio != NULL)
+		submit_one_bio(WRITE, epd.bio);
+
+	return page_ret;
+}
+EXPORT_SYMBOL(extent_write_full_page);
+
+
+/*
+ * writepages helper: let write_cache_pages() feed the dirty pages selected
+ * by @wbc to __extent_writepage(), then submit the bio that may still be
+ * pending in the shared extent_page_data.
+ *
+ * Returns the result of write_cache_pages().
+ */
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc)
+{
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+	};
+	int ret;
+
+	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+
+	if (epd.bio != NULL)
+		submit_one_bio(WRITE, epd.bio);
+
+	return ret;
+}
+EXPORT_SYMBOL(extent_writepages);
+
+/*
+ * readpages helper: take @nr_pages pages off the tail of @pages, insert each
+ * into @mapping's page cache and the LRU, and start a read for it through
+ * __extent_read_full_page().  All reads share one bio pointer, which is
+ * submitted at the end if anything is still pending.
+ *
+ * Pages that cannot be added to the page cache are released and skipped.
+ * Always returns 0.
+ */
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent)
+{
+	struct bio *bio = NULL;
+	struct pagevec lru_pvec;
+	unsigned remaining;
+
+	pagevec_init(&lru_pvec, 0);
+
+	for (remaining = nr_pages; remaining > 0; remaining--) {
+		struct page *page;
+
+		/* always consume the entry at the tail of the list */
+		page = list_entry(pages->prev, struct page, lru);
+		prefetchw(&page->flags);
+		list_del(&page->lru);
+
+		/*
+		 * what we want to do here is call add_to_page_cache_lru,
+		 * but that isn't exported, so we reproduce it here
+		 */
+		if (add_to_page_cache(page, mapping, page->index,
+				      GFP_KERNEL)) {
+			/* insertion failed, nothing to read for this page */
+			page_cache_release(page);
+			continue;
+		}
+
+		/* open coding of lru_cache_add, also not exported */
+		page_cache_get(page);
+		if (!pagevec_add(&lru_pvec, page))
+			__pagevec_lru_add(&lru_pvec);
+
+		__extent_read_full_page(tree, page, get_extent, &bio);
+		page_cache_release(page);
+	}
+
+	/* flush whatever is left in the partially filled pagevec */
+	if (pagevec_count(&lru_pvec))
+		__pagevec_lru_add(&lru_pvec);
+
+	BUG_ON(!list_empty(pages));
+
+	if (bio)
+		submit_one_bio(READ, bio);
+	return 0;
+}
+EXPORT_SYMBOL(extent_readpages);
+
+/*
+ * Basic invalidatepage implementation.
+ *
+ * @offset is rounded up to the next block boundary inside the page; the
+ * byte range from there to the end of the page is locked in @tree, any
+ * writeback on it is waited for, and the LOCKED, DIRTY and DELALLOC bits
+ * are then cleared via clear_extent_bit().  If the rounded offset lies
+ * beyond the page nothing is done.  Always returns 0.
+ */
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset)
+{
+	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+	u64 page_start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 start;
+
+	/* first block boundary at or after @offset */
+	start = page_start + ((offset + blocksize - 1) & ~(blocksize - 1));
+	if (start > end)
+		return 0;
+
+	lock_extent(tree, start, end, GFP_NOFS);
+	wait_on_extent_writeback(tree, start, end);
+	clear_extent_bit(tree, start, end,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+			 1, 1, GFP_NOFS);
+	return 0;
+}
+EXPORT_SYMBOL(extent_invalidatepage);
+
+/*
+ * Simple commit_write implementation: tag the page as extent mapped, mark
+ * it dirty with set_page_dirty(), and grow i_size when the write ended past
+ * the current end of file.  @tree and @from are not used.  Always returns 0.
+ */
+int extent_commit_write(struct extent_io_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to)
+{
+	/* file offset one past the last byte written */
+	loff_t write_end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	set_page_extent_mapped(page);
+	set_page_dirty(page);
+
+	if (write_end <= inode->i_size)
+		return 0;
+
+	i_size_write(inode, write_end);
+	mark_inode_dirty(inode);
+	return 0;
+}
+EXPORT_SYMBOL(extent_commit_write);
+
+/*
+ * Prepare the bytes [from, to) of @page for a buffered write.
+ *
+ * The whole page range is locked in @tree, then every block touched by the
+ * write is looked up through @get_extent (create == 1):
+ *  - for blocks reported new by clear_extent_new(), the bytes outside
+ *    [from, to) are zeroed when the page is not uptodate;
+ *  - mapped, not-new blocks that are only partially overwritten and not yet
+ *    uptodate are read in, and the reads are waited for before returning;
+ *  - everything else is marked uptodate and unlocked.
+ *
+ * Returns 0 on success, or a negative errno when the extent lookup fails
+ * (PTR_ERR() of the returned pointer, -EIO for a NULL map).
+ */
+int extent_prepare_write(struct extent_io_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent)
+{
+	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 block_start;
+	u64 orig_block_start;
+	u64 block_end;
+	u64 cur_end;
+	struct extent_map *em;
+	unsigned blocksize = 1 << inode->i_blkbits;
+	size_t page_offset = 0;
+	size_t block_off_start;
+	size_t block_off_end;
+	int err = 0;
+	int iocount = 0;
+	int ret = 0;
+	int isnew;
+
+	set_page_extent_mapped(page);
+
+	/* widen [from, to) to whole blocks */
+	block_start = (page_start + from) & ~((u64)blocksize - 1);
+	block_end = (page_start + to - 1) | (blocksize - 1);
+	orig_block_start = block_start;
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	while(block_start <= block_end) {
+		em = get_extent(inode, page, page_offset, block_start,
+				block_end - block_start + 1, 1);
+		if (IS_ERR(em) || !em) {
+			/*
+			 * Report the failure to the caller instead of
+			 * returning success.
+			 * NOTE(review): the range from block_start on is
+			 * still locked in the tree at this point.
+			 */
+			err = em ? PTR_ERR(em) : -EIO;
+			goto err;
+		}
+		cur_end = min(block_end, extent_map_end(em) - 1);
+		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
+		block_off_end = block_off_start + blocksize;
+		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
+
+		/* new block: zero the parts the write will not cover */
+		if (!PageUptodate(page) && isnew &&
+		    (block_off_end > to || block_off_start < from)) {
+			void *kaddr;
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			if (block_off_end > to)
+				memset(kaddr + to, 0, block_off_end - to);
+			if (block_off_start < from)
+				memset(kaddr + block_off_start, 0,
+				       from - block_off_start);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		/* existing, partially overwritten, stale block: read it */
+		if ((em->block_start != EXTENT_MAP_HOLE &&
+		     em->block_start != EXTENT_MAP_INLINE) &&
+		    !isnew && !PageUptodate(page) &&
+		    (block_off_end > to || block_off_start < from) &&
+		    !test_range_bit(tree, block_start, cur_end,
+				    EXTENT_UPTODATE, 1)) {
+			u64 sector;
+			u64 extent_offset = block_start - em->start;
+			size_t iosize;
+			sector = (em->block_start + extent_offset) >> 9;
+			iosize = (cur_end - block_start + blocksize) &
+				~((u64)blocksize - 1);
+			/*
+			 * we've already got the extent locked, but we
+			 * need to split the state such that our end_bio
+			 * handler can clear the lock.
+			 */
+			set_extent_bit(tree, block_start,
+				       block_start + iosize - 1,
+				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+			/* NOTE(review): the submit result is not acted upon */
+			ret = submit_extent_page(READ, tree, page,
+					 sector, iosize, page_offset, em->bdev,
+					 NULL, 1,
+					 end_bio_extent_preparewrite);
+			iocount++;
+			block_start = block_start + iosize;
+		} else {
+			set_extent_uptodate(tree, block_start, cur_end,
+					    GFP_NOFS);
+			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
+			block_start = cur_end + 1;
+		}
+		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
+		free_extent_map(em);
+	}
+	/* wait for the LOCKED bit to clear on everything we submitted */
+	if (iocount) {
+		wait_extent_bit(tree, orig_block_start,
+				block_end, EXTENT_LOCKED);
+	}
+	check_page_uptodate(tree, page);
+err:
+	/* FIXME, zero out newly allocated blocks on error */
+	return err;
+}
+EXPORT_SYMBOL(extent_prepare_write);
+
+/*
+ * Helper for releasepage.
+ *
+ * Walks the extent maps overlapping the page and removes every one whose
+ * byte range has no EXTENT_LOCKED bit set in @tree.  Afterwards, if nothing
+ * in the page's range is locked, the EXTENT_UPTODATE state for the page is
+ * cleared as well.
+ *
+ * Returns 1 when the page's range was unlocked (state cleared), 0 when part
+ * of it is still locked.
+ */
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page)
+{
+	const u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	const u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 cur = page_start;
+	struct extent_map *em;
+
+	while (cur <= page_end) {
+		spin_lock(&map->lock);
+		em = lookup_extent_mapping(map, cur, page_end);
+		if (!em || IS_ERR(em)) {
+			spin_unlock(&map->lock);
+			break;
+		}
+
+		if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
+				    EXTENT_LOCKED, 0)) {
+			remove_extent_mapping(map, em);
+			/* once for the rb tree */
+			free_extent_map(em);
+		}
+		/* continue right after this mapping */
+		cur = extent_map_end(em);
+		spin_unlock(&map->lock);
+
+		/* once for us */
+		free_extent_map(em);
+	}
+
+	if (test_range_bit(tree, page_start, page_end, EXTENT_LOCKED, 0))
+		return 0;
+
+	clear_extent_bit(tree, page_start, page_end, EXTENT_UPTODATE,
+			 1, 1, GFP_NOFS);
+	return 1;
+}
+EXPORT_SYMBOL(try_release_extent_mapping);
+
+/*
+ * bmap helper: translate file block @iblock into a block number on disk.
+ *
+ * The extent covering the block is looked up through @get_extent with
+ * create == 0.  Returns 0 when the lookup fails or when the extent is a
+ * hole or inline; otherwise the byte address of the block divided by the
+ * inode's block size.
+ */
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent)
+{
+	struct inode *inode = mapping->host;
+	/*
+	 * widen before shifting so the byte offset cannot be truncated when
+	 * sector_t is narrower than u64
+	 */
+	u64 start = (u64)iblock << inode->i_blkbits;
+	sector_t sector = 0;
+	struct extent_map *em;
+
+	em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
+	if (!em || IS_ERR(em))
+		return 0;
+
+	if (em->block_start == EXTENT_MAP_INLINE ||
+	    em->block_start == EXTENT_MAP_HOLE)
+		goto out;
+
+	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
+out:
+	free_extent_map(em);
+	return sector;
+}
+
+/*
+ * Put @eb at the front of the tree's buffer LRU.
+ *
+ * A buffer that is not yet on the list gets an extra reference for the
+ * list; when that makes lru_size reach BUFFER_LRU_MAX the entry at the tail
+ * is taken off the list and its reference dropped.  A buffer already on
+ * the list is only moved to the front.  The callers in this file hold
+ * tree->lru_lock.  Always returns 0.
+ */
+static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
+{
+	struct extent_buffer *victim;
+
+	if (!list_empty(&eb->lru)) {
+		list_move(&eb->lru, &tree->buffer_lru);
+		return 0;
+	}
+
+	extent_buffer_get(eb);
+	list_add(&eb->lru, &tree->buffer_lru);
+	tree->lru_size++;
+	if (tree->lru_size < BUFFER_LRU_MAX)
+		return 0;
+
+	/* list is full: drop the entry at the tail */
+	victim = list_entry(tree->buffer_lru.prev, struct extent_buffer, lru);
+	tree->lru_size--;
+	list_del_init(&victim->lru);
+	free_extent_buffer(victim);
+	return 0;
+}
+/*
+ * Search the tree's buffer LRU for a buffer whose start and len both match.
+ * On a hit a reference is taken with extent_buffer_get() and the buffer is
+ * returned; otherwise NULL.  The caller in this file holds tree->lru_lock.
+ */
+static struct extent_buffer *find_lru(struct extent_io_tree *tree,
+				      u64 start, unsigned long len)
+{
+	struct list_head *head = &tree->buffer_lru;
+	struct list_head *pos;
+	struct extent_buffer *eb;
+
+	/* on an empty list head->next == head and the body never runs */
+	for (pos = head->next; pos != head; pos = pos->next) {
+		eb = list_entry(pos, struct extent_buffer, lru);
+		if (eb->start != start || eb->len != len)
+			continue;
+		extent_buffer_get(eb);
+		return eb;
+	}
+	return NULL;
+}
+
+/* number of page cache pages spanned by the bytes [start, start + len) */
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	u64 first_index = start >> PAGE_CACHE_SHIFT;
+	u64 past_last_index;
+
+	/* index of the first page that lies entirely after the range */
+	past_last_index = (start + len + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT;
+
+	return past_last_index - first_index;
+}
+
+/*
+ * Return page number @i of the buffer.  Page 0 is cached in eb->first_page;
+ * every other page is looked up in the first page's mapping under the
+ * mapping's tree_lock.  No reference is taken on the returned page, and the
+ * result of a failed radix tree lookup is passed through as is.
+ */
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
+{
+	struct address_space *mapping;
+	struct page *found;
+	unsigned long index;
+
+	if (i == 0)
+		return eb->first_page;
+
+	/* page cache index = buffer-relative page number + first index */
+	index = i + (eb->start >> PAGE_CACHE_SHIFT);
+	mapping = eb->first_page->mapping;
+
+	read_lock_irq(&mapping->tree_lock);
+	found = radix_tree_lookup(&mapping->page_tree, index);
+	read_unlock_irq(&mapping->tree_lock);
+
+	return found;
+}
+
+/*
+ * Return an extent_buffer for [start, start + len).
+ *
+ * A matching buffer found on the tree's LRU is returned with an extra
+ * reference (taken in find_lru()).  Otherwise a zeroed buffer is allocated
+ * from extent_buffer_cache with @mask and returned with a reference count
+ * of one; it is not put on the LRU here.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
+						   u64 start,
+						   unsigned long len,
+						   gfp_t mask)
+{
+	struct extent_buffer *eb = NULL;
+
+	spin_lock(&tree->lru_lock);
+	eb = find_lru(tree, start, len);
+	spin_unlock(&tree->lru_lock);
+	if (eb) {
+		return eb;
+	}
+
+	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+	/* the slab allocation can fail; callers already handle NULL */
+	if (!eb)
+		return NULL;
+
+	INIT_LIST_HEAD(&eb->lru);
+	eb->start = start;
+	eb->len = len;
+	atomic_set(&eb->refs, 1);
+
+	return eb;
+}
+
+/*
+ * Hand the extent_buffer structure back to extent_buffer_cache.  Only the
+ * structure itself is freed; page references and LRU membership are the
+ * caller's business.
+ */
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+	kmem_cache_free(extent_buffer_cache, eb);
+}
+
+/*
+ * Get the extent_buffer for [start, start + len), reusing one cached on the
+ * tree's LRU when possible, and make sure its pages exist in tree->mapping.
+ *
+ * @page0: optional page to use as the first page of the buffer.  When
+ *	   given, a reference is taken on it and it is not looked up or
+ *	   unlocked here.
+ * @mask:  allocation flags for the buffer and, with __GFP_HIGHMEM added,
+ *	   for find_or_create_page().
+ *
+ * Returns the buffer with a reference for the caller, or NULL on failure.
+ * A buffer that is already flagged EXTENT_BUFFER_FILLED skips the page
+ * setup and is only moved to the front of the LRU.
+ */
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+					  u64 start, unsigned long len,
+					  struct page *page0,
+					  gfp_t mask)
+{
+	unsigned long num_pages = num_extent_pages(start, len);
+	unsigned long i;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct extent_buffer *eb;
+	struct page *p;
+	struct address_space *mapping = tree->mapping;
+	int uptodate = 1;
+
+	eb = __alloc_extent_buffer(tree, start, len, mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	if (eb->flags & EXTENT_BUFFER_FILLED)
+		goto lru_add;
+
+	if (page0) {
+		/* caller supplied the first page; start the loop at page 1 */
+		eb->first_page = page0;
+		i = 1;
+		index++;
+		page_cache_get(page0);
+		mark_page_accessed(page0);
+		set_page_extent_mapped(page0);
+		WARN_ON(!PageUptodate(page0));
+		set_page_extent_head(page0, len);
+	} else {
+		i = 0;
+	}
+	for (; i < num_pages; i++, index++) {
+		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+		if (!p) {
+			WARN_ON(1);
+			goto fail;
+		}
+		set_page_extent_mapped(p);
+		mark_page_accessed(p);
+		/* the first page is tagged differently from the others */
+		if (i == 0) {
+			eb->first_page = p;
+			set_page_extent_head(p, len);
+		} else {
+			set_page_private(p, EXTENT_PAGE_PRIVATE);
+		}
+		if (!PageUptodate(p))
+			uptodate = 0;
+		unlock_page(p);
+	}
+	/* only flag the buffer uptodate if every page seen in the loop was */
+	if (uptodate)
+		eb->flags |= EXTENT_UPTODATE;
+	eb->flags |= EXTENT_BUFFER_FILLED;
+
+lru_add:
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
+	return eb;
+
+fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
+	/* not the last reference: leave the buffer alone */
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+	/*
+	 * drop the references on the pages set up so far (indexes below i);
+	 * page 0 goes last because the other lookups go through it
+	 */
+	for (index = 1; index < i; index++) {
+		page_cache_release(extent_buffer_page(eb, index));
+	}
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
+	__free_extent_buffer(eb);
+	return NULL;
+}
+EXPORT_SYMBOL(alloc_extent_buffer);
+
+/*
+ * Like alloc_extent_buffer(), but never creates page cache pages: every
+ * page of [start, start + len) must already be present in tree->mapping.
+ *
+ * Returns the buffer with a reference for the caller, or NULL when the
+ * buffer cannot be allocated or one of its pages is missing.
+ */
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 u64 start, unsigned long len,
+					 gfp_t mask)
+{
+	struct address_space *mapping = tree->mapping;
+	unsigned long num_pages = num_extent_pages(start, len);
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long i;
+	struct extent_buffer *eb;
+	struct page *page;
+	int all_uptodate = 1;
+
+	eb = __alloc_extent_buffer(tree, start, len, mask);
+	if (!eb || IS_ERR(eb))
+		return NULL;
+
+	/* a buffer that was filled before only needs its LRU slot refreshed */
+	if (!(eb->flags & EXTENT_BUFFER_FILLED)) {
+		for (i = 0; i < num_pages; i++, index++) {
+			page = find_lock_page(mapping, index);
+			if (!page)
+				goto fail;
+
+			set_page_extent_mapped(page);
+			mark_page_accessed(page);
+
+			if (i != 0) {
+				set_page_private(page, EXTENT_PAGE_PRIVATE);
+			} else {
+				eb->first_page = page;
+				set_page_extent_head(page, len);
+			}
+
+			if (!PageUptodate(page))
+				all_uptodate = 0;
+			unlock_page(page);
+		}
+		if (all_uptodate)
+			eb->flags |= EXTENT_UPTODATE;
+		eb->flags |= EXTENT_BUFFER_FILLED;
+	}
+
+	spin_lock(&tree->lru_lock);
+	add_lru(tree, eb);
+	spin_unlock(&tree->lru_lock);
+	return eb;
+
+fail:
+	spin_lock(&tree->lru_lock);
+	list_del_init(&eb->lru);
+	spin_unlock(&tree->lru_lock);
+
+	if (!atomic_dec_and_test(&eb->refs))
+		return NULL;
+
+	/* release the pages found so far, the first page last */
+	for (index = 1; index < i; index++)
+		page_cache_release(extent_buffer_page(eb, index));
+	if (i > 0)
+		page_cache_release(extent_buffer_page(eb, 0));
+
+	__free_extent_buffer(eb);
+	return NULL;
+}
+EXPORT_SYMBOL(find_extent_buffer);
+
+/*
+ * Drop one reference on @eb (NULL is accepted).  When the last reference
+ * goes away the page references held by the buffer are released and the
+ * structure is freed; the buffer is expected to be off the LRU by then.
+ */
+void free_extent_buffer(struct extent_buffer *eb)
+{
+	unsigned long nr;
+	unsigned long i;
+
+	if (!eb || !atomic_dec_and_test(&eb->refs))
+		return;
+
+	WARN_ON(!list_empty(&eb->lru));
+
+	nr = num_extent_pages(eb->start, eb->len);
+
+	/* the first page goes last: the other lookups go through it */
+	i = 1;
+	while (i < nr) {
+		page_cache_release(extent_buffer_page(eb, i));
+		i++;
+	}
+	page_cache_release(extent_buffer_page(eb, 0));
+
+	__free_extent_buffer(eb);
+}
+EXPORT_SYMBOL(free_extent_buffer);
+
+/*
+ * Clear EXTENT_DIRTY for the buffer's byte range in @tree and clean the
+ * pages backing it.
+ *
+ * Each page is locked, has its page->private tag refreshed, and is then
+ * cleaned with clear_page_dirty_for_io(); if it is no longer dirty after
+ * that, the dirty tag in the mapping's radix tree is cleared too.  A first
+ * or last page that the buffer only partially covers is left alone while
+ * any part of it still has EXTENT_DIRTY set in @tree.
+ *
+ * Always returns 0; the result of clear_extent_dirty() is stored in 'set'
+ * but not used.
+ */
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+			      struct extent_buffer *eb)
+{
+	int set;
+	unsigned long i;
+	unsigned long num_pages;
+	struct page *page;
+
+	u64 start = eb->start;
+	u64 end = start + eb->len - 1;
+
+	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
+	num_pages = num_extent_pages(eb->start, eb->len);
+
+	for (i = 0; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		lock_page(page);
+		if (i == 0)
+			set_page_extent_head(page, eb->len);
+		else
+			set_page_private(page, EXTENT_PAGE_PRIVATE);
+
+		/*
+		 * if we're on the last page or the first page and the
+		 * block isn't aligned on a page boundary, do extra checks
+		 * to make sure we don't clean page that is partially dirty
+		 */
+		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
+		    ((i == num_pages - 1) &&
+		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
+			/* start/end are reused here for the page's own range */
+			start = (u64)page->index << PAGE_CACHE_SHIFT;
+			end  = start + PAGE_CACHE_SIZE - 1;
+			if (test_range_bit(tree, start, end,
+					   EXTENT_DIRTY, 0)) {
+				unlock_page(page);
+				continue;
+			}
+		}
+		clear_page_dirty_for_io(page);
+		/* drop the radix tree dirty tag unless the page is dirty again */
+		write_lock_irq(&page->mapping->tree_lock);
+		if (!PageDirty(page)) {
+			radix_tree_tag_clear(&page->mapping->page_tree,
+						page_index(page),
+						PAGECACHE_TAG_DIRTY);
+		}
+		write_unlock_irq(&page->mapping->tree_lock);
+		unlock_page(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(clear_extent_buffer_dirty);
+
+/*
+ * Wait for writeback on the byte range covered by @eb; returns whatever
+ * wait_on_extent_writeback() returns.
+ */
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+				    struct extent_buffer *eb)
+{
+	u64 first = eb->start;
+	u64 last = first + eb->len - 1;
+
+	return wait_on_extent_writeback(tree, first, last);
+}
+EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
+
+/*
+ * Mark every page of the buffer dirty with __set_page_dirty_nobuffers() and
+ * then set EXTENT_DIRTY on the buffer's byte range in @tree.
+ *
+ * Returns the result of set_extent_dirty().
+ */
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb)
+{
+	unsigned long i;
+	unsigned long num_pages;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = extent_buffer_page(eb, i);
+		/* writepage may need to do something special for the
+		 * first page, we have to make sure page->private is
+		 * properly set.  releasepage may drop page->private
+		 * on us if the page isn't already dirty.
+		 */
+		if (i == 0) {
+			/* stays locked until the page has been dirtied below */
+			lock_page(page);
+			set_page_extent_head(page, eb->len);
+		} else if (PagePrivate(page) &&
+			   page->private != EXTENT_PAGE_PRIVATE) {
+			/* unexpected tag on a later page: set it again */
+			lock_page(page);
+			set_page_extent_mapped(page);
+			unlock_page(page);
+		}
+		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+		if (i == 0)
+			unlock_page(page);
+	}
+	return set_extent_dirty(tree, eb->start,
+				eb->start + eb->len - 1, GFP_NOFS);
+}
+EXPORT_SYMBOL(set_extent_buffer_dirty);
+
+/*
+ * Mark the buffer's byte range uptodate in @tree and propagate that to its
+ * pages.  Pages fully covered by the buffer get SetPageUptodate(); a first
+ * or last page that the buffer covers only partially is handed to
+ * check_page_uptodate() instead.  Always returns 0.
+ */
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb)
+{
+	const u64 eb_end = eb->start + eb->len;
+	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
+	unsigned long i;
+
+	set_extent_uptodate(tree, eb->start, eb_end - 1, GFP_NOFS);
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = extent_buffer_page(eb, i);
+		int partial_head;
+		int partial_tail;
+
+		/* buffer starts / ends in the middle of this page? */
+		partial_head = (i == 0) &&
+			(eb->start & (PAGE_CACHE_SIZE - 1));
+		partial_tail = (i == num_pages - 1) &&
+			(eb_end & (PAGE_CACHE_SIZE - 1));
+
+		if (partial_head || partial_tail)
+			check_page_uptodate(tree, page);
+		else
+			SetPageUptodate(page);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(set_extent_buffer_uptodate);
+
+/*
+ * Report whether the buffer is uptodate: 1 right away when the buffer
+ * itself carries EXTENT_UPTODATE, otherwise the result of testing the
+ * EXTENT_UPTODATE bit over its byte range in @tree.
+ */
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+			     struct extent_buffer *eb)
+{
+	u64 last = eb->start + eb->len - 1;
+
+	if (!(eb->flags & EXTENT_UPTODATE))
+		return test_range_bit(tree, eb->start, last,
+				      EXTENT_UPTODATE, 1);
+	return 1;
+}
+EXPORT_SYMBOL(extent_buffer_uptodate);
+
+/*
+ * Start reads for the pages of @eb that are not uptodate, through the
+ * mapping's ->readpage().
+ *
+ * @start: when non-zero, a byte offset in the same address space as
+ *	   eb->start; reading begins at the page containing it.  Zero means
+ *	   start at the first page of the buffer.
+ * @wait:  when zero, pages that are already locked are skipped and the
+ *	   function returns right after submitting; when non-zero, pages are
+ *	   locked unconditionally and the reads are waited for.
+ *
+ * Returns 0 when the buffer is already flagged uptodate or when all went
+ * well, the last non-zero ->readpage() result, or -EIO if a page is still
+ * not uptodate after waiting.  On a successful waited read the buffer is
+ * flagged EXTENT_UPTODATE.
+ */
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb,
+			     u64 start,
+			     int wait)
+{
+	unsigned long i;
+	unsigned long start_i;
+	struct page *page;
+	int err;
+	int ret = 0;
+	unsigned long num_pages;
+
+	if (eb->flags & EXTENT_UPTODATE)
+		return 0;
+
+	/* disabled by the constant 0: the tree state is not consulted */
+	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+			   EXTENT_UPTODATE, 1)) {
+		return 0;
+	}
+
+	if (start) {
+		WARN_ON(start < eb->start);
+		start_i = (start >> PAGE_CACHE_SHIFT) -
+			(eb->start >> PAGE_CACHE_SHIFT);
+	} else {
+		start_i = 0;
+	}
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = start_i; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		if (PageUptodate(page)) {
+			continue;
+		}
+		if (!wait) {
+			if (TestSetPageLocked(page)) {
+				continue;
+			}
+		} else {
+			lock_page(page);
+		}
+		/* recheck now that we hold the page lock */
+		if (!PageUptodate(page)) {
+			/*
+			 * the page is not unlocked on this path; presumably
+			 * ->readpage() unlocks it on completion -- confirm
+			 */
+			err = page->mapping->a_ops->readpage(NULL, page);
+			if (err) {
+				ret = err;
+			}
+		} else {
+			unlock_page(page);
+		}
+	}
+
+	if (ret || !wait) {
+		return ret;
+	}
+
+	for (i = start_i; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		wait_on_page_locked(page);
+		if (!PageUptodate(page)) {
+			ret = -EIO;
+		}
+	}
+	if (!ret)
+		eb->flags |= EXTENT_UPTODATE;
+	return ret;
+}
+EXPORT_SYMBOL(read_extent_buffer_pages);
+
+/*
+ * Copy @len bytes starting at buffer-relative offset @start out of @eb
+ * into the memory at @dstv, one page at a time.
+ *
+ * A request that reaches past the end of the buffer, or a page that is not
+ * uptodate, triggers a warning; the copy is attempted regardless.
+ */
+void read_extent_buffer(struct extent_buffer *eb, void *dstv,
+			unsigned long start,
+			unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *dst = (char *)dstv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
+
+	/* @start and @len are relative to the buffer, so bound by eb->len */
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->len);
+
+	/* offset of the first byte inside page i */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		if (!PageUptodate(page)) {
+			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
+			WARN_ON(1);
+		}
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		kaddr = kmap_atomic(page, KM_USER1);
+		memcpy(dst, kaddr + offset, cur);
+		kunmap_atomic(kaddr, KM_USER1);
+
+		dst += cur;
+		len -= cur;
+		/* every page after the first is read from its beginning */
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(read_extent_buffer);
+
+/*
+ * Map the single page of @eb that holds the bytes
+ * [start, start + min_len) with kmap_atomic() slot @km.
+ *
+ * On success:
+ *   *token     - address returned by kmap_atomic(), to be passed to
+ *                unmap_extent_buffer()
+ *   *map       - address inside the mapping where the buffer's data on
+ *                this page begins
+ *   *map_start - buffer-relative offset of the byte at *map
+ *   *map_len   - number of bytes from *map to the end of the page
+ *
+ * Returns 0 on success and -EINVAL when the requested bytes straddle a
+ * page boundary.  A request reaching past eb->len only logs a warning.
+ */
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
+			       unsigned long min_len, char **token, char **map,
+			       unsigned long *map_start,
+			       unsigned long *map_len, int km)
+{
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	unsigned long end_i = (start_offset + start + min_len - 1) >>
+		PAGE_CACHE_SHIFT;
+	size_t offset;
+	struct page *p;
+	char *kaddr;
+
+	if (i != end_i)
+		return -EINVAL;
+
+	if (i != 0) {
+		/* later pages belong to the buffer from their first byte */
+		offset = 0;
+		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+	} else {
+		/* on the first page the buffer begins at start_offset */
+		offset = start_offset;
+		*map_start = 0;
+	}
+
+	if (start + min_len > eb->len) {
+		printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
+		       eb->start, eb->len, start, min_len);
+		WARN_ON(1);
+	}
+
+	p = extent_buffer_page(eb, i);
+	WARN_ON(!PageUptodate(p));
+
+	kaddr = kmap_atomic(p, km);
+	*token = kaddr;
+	*map = kaddr + offset;
+	*map_len = PAGE_CACHE_SIZE - offset;
+	return 0;
+}
+EXPORT_SYMBOL(map_private_extent_buffer);
+
+/*
+ * Same as map_private_extent_buffer(), but cooperating with the mapping
+ * cached in @eb: if the buffer currently has a cached map token, that
+ * mapping is torn down first and, when the new mapping succeeds, the new
+ * one is cached in its place.  A buffer without a cached token is mapped
+ * without touching the cache fields.
+ *
+ * Returns the result of map_private_extent_buffer().
+ */
+int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
+		      unsigned long min_len,
+		      char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km)
+{
+	int had_cached_map = 0;
+	int err;
+
+	if (eb->map_token) {
+		unmap_extent_buffer(eb, eb->map_token, km);
+		eb->map_token = NULL;
+		had_cached_map = 1;
+	}
+
+	err = map_private_extent_buffer(eb, start, min_len, token, map,
+				       map_start, map_len, km);
+	if (err || !had_cached_map)
+		return err;
+
+	/* replace the cached mapping with the one just created */
+	eb->map_token = *token;
+	eb->kaddr = *map;
+	eb->map_start = *map_start;
+	eb->map_len = *map_len;
+	return err;
+}
+EXPORT_SYMBOL(map_extent_buffer);
+
+/*
+ * Undo a mapping made by map_extent_buffer() / map_private_extent_buffer():
+ * @token is the value those functions stored in *token and @km the
+ * kmap_atomic() slot that was used.  @eb itself is not touched.
+ */
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
+{
+	kunmap_atomic(token, km);
+}
+EXPORT_SYMBOL(unmap_extent_buffer);
+
+/*
+ * Compare @len bytes at @ptrv with the bytes of @eb starting at
+ * buffer-relative offset @start, one page at a time.
+ *
+ * Returns 0 when they are equal, otherwise the memcmp() result (memory at
+ * @ptrv versus buffer contents) for the first page chunk that differs.
+ * Out-of-range requests and stale pages only trigger warnings.
+ */
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *ptr = (char *)ptrv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	/* @start and @len are relative to the buffer, so bound by eb->len */
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->len);
+
+	/* offset of the first byte inside page i */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		ret = memcmp(ptr, kaddr + offset, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+		if (ret)
+			break;
+
+		ptr += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(memcmp_extent_buffer);
+
+/*
+ * Copy @len bytes from @srcv into @eb at buffer-relative offset @start, one
+ * page at a time.  Only the page contents are changed; nothing is marked
+ * dirty here.  Out-of-range requests and stale pages only trigger warnings.
+ */
+void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
+			 unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *src = (char *)srcv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	/* @start and @len are relative to the buffer, so bound by eb->len */
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->len);
+
+	/* offset of the first byte inside page i */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = kmap_atomic(page, KM_USER1);
+		memcpy(kaddr + offset, src, cur);
+		kunmap_atomic(kaddr, KM_USER1);
+
+		src += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(write_extent_buffer);
+
+/*
+ * Fill @len bytes of @eb, starting at buffer-relative offset @start, with
+ * the byte @c, one page at a time.  Out-of-range requests and stale pages
+ * only trigger warnings.
+ */
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+
+	/* @start and @len are relative to the buffer, so bound by eb->len */
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->len);
+
+	/* offset of the first byte inside page i */
+	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+	while(len > 0) {
+		page = extent_buffer_page(eb, i);
+		WARN_ON(!PageUptodate(page));
+
+		cur = min(len, PAGE_CACHE_SIZE - offset);
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + offset, c, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+}
+EXPORT_SYMBOL(memset_extent_buffer);
+
+/*
+ * Copy @len bytes from @src (at buffer-relative @src_offset) into @dst (at
+ * buffer-relative @dst_offset).  The destination is mapped page by page
+ * and filled through read_extent_buffer().  The two buffers are expected
+ * to have the same length; a mismatch only triggers a warning.
+ */
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len)
+{
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_pos = start_offset + dst_offset;
+	unsigned long i = dst_pos >> PAGE_CACHE_SHIFT;
+	size_t offset = dst_pos & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	struct page *page;
+	char *kaddr;
+	size_t cur;
+
+	WARN_ON(src->len != dst->len);
+
+	while (len > 0) {
+		page = extent_buffer_page(dst, i);
+		WARN_ON(!PageUptodate(page));
+
+		/* never run past the end of the current destination page */
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		read_extent_buffer(src, kaddr + offset, src_offset, cur);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		src_offset += cur;
+		len -= cur;
+		i++;
+		offset = 0;
+	}
+}
+EXPORT_SYMBOL(copy_extent_buffer);
+
+/*
+ * Copy @len bytes from @src_page + @src_off to @dst_page + @dst_off.
+ * Within one page memmove() is used; across two pages the bytes are copied
+ * one at a time from the last byte down to the first.
+ */
+static void move_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *src_kaddr;
+	char *to;
+	char *from;
+
+	if (dst_page == src_page) {
+		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+		kunmap_atomic(dst_kaddr, KM_USER0);
+		return;
+	}
+
+	src_kaddr = kmap_atomic(src_page, KM_USER1);
+
+	/* both cursors start one past the end and walk backwards */
+	to = dst_kaddr + dst_off + len;
+	from = src_kaddr + src_off + len;
+	for (; len > 0; len--) {
+		to--;
+		from--;
+		*to = *from;
+	}
+
+	kunmap_atomic(src_kaddr, KM_USER1);
+	kunmap_atomic(dst_kaddr, KM_USER0);
+}
+
+/*
+ * Copy @len bytes from @src_page + @src_off to @dst_page + @dst_off.
+ *
+ * When source and destination are the same page the two ranges may
+ * overlap: memmove_extent_buffer() forwards moves with
+ * dst_offset < src_offset to memcpy_extent_buffer(), which ends up here.
+ * memcpy() gives no guarantee for overlapping ranges, so the same-page case
+ * uses memmove(); distinct pages cannot overlap and keep using memcpy().
+ */
+static void copy_pages(struct page *dst_page, struct page *src_page,
+		       unsigned long dst_off, unsigned long src_off,
+		       unsigned long len)
+{
+	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
+	char *src_kaddr;
+
+	if (dst_page != src_page) {
+		src_kaddr = kmap_atomic(src_page, KM_USER1);
+		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+		kunmap_atomic(src_kaddr, KM_USER1);
+	} else {
+		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
+	}
+	kunmap_atomic(dst_kaddr, KM_USER0);
+}
+
+/*
+ * Copy @len bytes inside @dst from buffer-relative @src_offset to
+ * buffer-relative @dst_offset, walking forward.  Each step copies as much
+ * as fits before either the source or the destination position crosses a
+ * page boundary.
+ *
+ * A range reaching past dst->len is a fatal error (BUG).
+ */
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	size_t cur;
+	size_t dst_off_in_page;
+	size_t src_off_in_page;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long dst_i;
+	unsigned long src_i;
+
+	if (src_offset + len > dst->len) {
+		printk("memcpy bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memcpy bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+
+	while(len > 0) {
+		dst_off_in_page = (start_offset + dst_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+		src_off_in_page = (start_offset + src_offset) &
+			((unsigned long)PAGE_CACHE_SIZE - 1);
+
+		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+
+		/* stay inside both the source and the destination page */
+		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+					       src_off_in_page));
+		cur = min_t(unsigned long, cur,
+			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+
+		copy_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
+			   dst_off_in_page, src_off_in_page, cur);
+
+		src_offset += cur;
+		dst_offset += cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memcpy_extent_buffer);
+
+/*
+ * Move @len bytes inside @dst from buffer-relative @src_offset to
+ * buffer-relative @dst_offset.
+ *
+ * A move towards lower offsets is delegated to memcpy_extent_buffer().
+ * Otherwise the data is copied from the end of the range backwards, each
+ * step covering as much as lies in the current source and destination
+ * pages.  A range reaching past dst->len is a fatal error (BUG).
+ */
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len)
+{
+	const unsigned long page_mask = (unsigned long)PAGE_CACHE_SIZE - 1;
+	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	/* buffer-relative offsets of the last byte still to be moved */
+	unsigned long dst_end = dst_offset + len - 1;
+	unsigned long src_end = src_offset + len - 1;
+
+	if (src_offset + len > dst->len) {
+		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
+		       src_offset, len, dst->len);
+		BUG_ON(1);
+	}
+	if (dst_offset + len > dst->len) {
+		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
+		       dst_offset, len, dst->len);
+		BUG_ON(1);
+	}
+
+	if (dst_offset < src_offset) {
+		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
+		return;
+	}
+
+	while (len > 0) {
+		unsigned long dst_pos = start_offset + dst_end;
+		unsigned long src_pos = start_offset + src_end;
+		unsigned long dst_i = dst_pos >> PAGE_CACHE_SHIFT;
+		unsigned long src_i = src_pos >> PAGE_CACHE_SHIFT;
+		size_t dst_off_in_page = dst_pos & page_mask;
+		size_t src_off_in_page = src_pos & page_mask;
+		size_t cur;
+
+		/* bytes available going backwards within both pages */
+		cur = min_t(unsigned long, len, src_off_in_page + 1);
+		cur = min(cur, dst_off_in_page + 1);
+
+		move_pages(extent_buffer_page(dst, dst_i),
+			   extent_buffer_page(dst, src_i),
+			   dst_off_in_page - cur + 1,
+			   src_off_in_page - cur + 1, cur);
+
+		dst_end -= cur;
+		src_end -= cur;
+		len -= cur;
+	}
+}
+EXPORT_SYMBOL(memmove_extent_buffer);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
new file mode 100644
index 00000000000..06be1fe84b2
--- /dev/null
+++ b/fs/btrfs/extent_io.h
@@ -0,0 +1,193 @@
+#ifndef __EXTENTIO__
+#define __EXTENTIO__
+
+#include <linux/rbtree.h>
+
+/* bits for the extent state */
+#define EXTENT_DIRTY 1
+#define EXTENT_WRITEBACK (1 << 1)
+#define EXTENT_UPTODATE (1 << 2)
+#define EXTENT_LOCKED (1 << 3)
+#define EXTENT_NEW (1 << 4)
+#define EXTENT_DELALLOC (1 << 5)
+#define EXTENT_DEFRAG (1 << 6)
+#define EXTENT_DEFRAG_DONE (1 << 7)
+#define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_CSUM (1 << 9)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+
+/*
+ * page->private values.  Every page that is controlled by the extent
+ * map has page->private set to one.
+ */
+#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
+
+struct extent_io_ops {
+	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
+	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
+};
+
+struct extent_io_tree {
+	struct rb_root state;
+	struct address_space *mapping;
+	u64 dirty_bytes;
+	rwlock_t lock;
+	struct extent_io_ops *ops;
+	spinlock_t lru_lock;
+	struct list_head buffer_lru;
+	int lru_size;
+};
+
+struct extent_state {
+	u64 start;
+	u64 end; /* inclusive */
+	int in_tree;
+	struct rb_node rb_node;
+	wait_queue_head_t wq;
+	atomic_t refs;
+	unsigned long state;
+
+	/* for use by the FS */
+	u64 private;
+
+	struct list_head list;
+};
+
+struct extent_buffer {
+	u64 start;	/* low bits give the in-page offset used by memmove_extent_buffer() */
+	unsigned long len;	/* memmove_extent_buffer() bounds-checks offset + len against this */
+	char *map_token;	/* presumably state of a cached map_extent_buffer() mapping - TODO confirm */
+	char *kaddr;	/* presumably the mapped address of that cached mapping - TODO confirm */
+	unsigned long map_start;	/* presumably start of the cached mapping - TODO confirm */
+	unsigned long map_len;	/* presumably length of the cached mapping - TODO confirm */
+	struct page *first_page;
+	struct list_head lru;	/* presumably links into extent_io_tree.buffer_lru - TODO confirm */
+	atomic_t refs;	/* incremented by extent_buffer_get() */
+	int flags;
+};
+
+struct extent_map_tree;
+
+typedef struct extent_map *(get_extent_t)(struct inode *inode,
+					  struct page *page,
+					  size_t page_offset,
+					  u64 start, u64 len,
+					  int create);
+
+void extent_io_tree_init(struct extent_io_tree *tree,
+			  struct address_space *mapping, gfp_t mask);
+void extent_io_tree_empty_lru(struct extent_io_tree *tree);
+int try_release_extent_mapping(struct extent_map_tree *map,
+			       struct extent_io_tree *tree, struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent);
+int __init extent_io_init(void);
+void extent_io_exit(void);
+
+u64 count_range_bits(struct extent_io_tree *tree,
+		     u64 *start, u64 search_end,
+		     u64 max_bytes, unsigned long bits);
+
+int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int filled);
+int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		      int bits, gfp_t mask);
+int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+		    int bits, gfp_t mask);
+int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			gfp_t mask);
+int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+		   gfp_t mask);
+int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits);
+int extent_invalidatepage(struct extent_io_tree *tree,
+			  struct page *page, unsigned long offset);
+int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
+			  get_extent_t *get_extent,
+			  struct writeback_control *wbc);
+int extent_writepages(struct extent_io_tree *tree,
+		      struct address_space *mapping,
+		      get_extent_t *get_extent,
+		      struct writeback_control *wbc);
+int extent_readpages(struct extent_io_tree *tree,
+		     struct address_space *mapping,
+		     struct list_head *pages, unsigned nr_pages,
+		     get_extent_t get_extent);
+int extent_prepare_write(struct extent_io_tree *tree,
+			 struct inode *inode, struct page *page,
+			 unsigned from, unsigned to, get_extent_t *get_extent);
+int extent_commit_write(struct extent_io_tree *tree,
+			struct inode *inode, struct page *page,
+			unsigned from, unsigned to);
+sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
+		get_extent_t *get_extent);
+int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
+int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
+int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
+void set_page_extent_mapped(struct page *page);
+
+struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+					  u64 start, unsigned long len,
+					  struct page *page0,
+					  gfp_t mask);
+struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
+					 u64 start, unsigned long len,
+					  gfp_t mask);
+void free_extent_buffer(struct extent_buffer *eb);
+int read_extent_buffer_pages(struct extent_io_tree *tree,
+			     struct extent_buffer *eb, u64 start, int wait);
+
+static inline void extent_buffer_get(struct extent_buffer *eb)
+{
+	atomic_inc(&eb->refs);
+}
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len);
+void read_extent_buffer(struct extent_buffer *eb, void *dst,
+			unsigned long start,
+			unsigned long len);
+void write_extent_buffer(struct extent_buffer *eb, const void *src,
+			 unsigned long start, unsigned long len);
+void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
+			unsigned long dst_offset, unsigned long src_offset,
+			unsigned long len);
+void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
+			   unsigned long src_offset, unsigned long len);
+void memset_extent_buffer(struct extent_buffer *eb, char c,
+			  unsigned long start, unsigned long len);
+int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
+				    struct extent_buffer *eb);
+int clear_extent_buffer_dirty(struct extent_io_tree *tree,
+			      struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_io_tree *tree,
+			     struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_io_tree *tree,
+			       struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_io_tree *tree,
+			   struct extent_buffer *eb);
+int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
+int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
+		      unsigned long min_len, char **token, char **map,
+		      unsigned long *map_start,
+		      unsigned long *map_len, int km);
+void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 010a287fbd7..268ad8facf6 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,17 +1,10 @@
-#include <linux/bitops.h>
-#include <linux/slab.h>
-#include <linux/bio.h>
-#include <linux/mm.h>
+#include <linux/err.h>
 #include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/page-flags.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>
-#include <linux/swap.h>
 #include <linux/version.h>
-#include <linux/writeback.h>
-#include <linux/pagevec.h>
+#include <linux/hardirq.h>
 #include "extent_map.h"
 
 /* temporary define until extent_map moves out of btrfs */
@@ -21,27 +14,6 @@ struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 						    unsigned long));
 
 static struct kmem_cache *extent_map_cache;
-static struct kmem_cache *extent_state_cache;
-static struct kmem_cache *extent_buffer_cache;
-
-static LIST_HEAD(buffers);
-static LIST_HEAD(states);
-
-static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
-#define BUFFER_LRU_MAX 64
-
-struct tree_entry {
-	u64 start;
-	u64 end;
-	int in_tree;
-	struct rb_node rb_node;
-};
-
-struct extent_page_data {
-	struct bio *bio;
-	struct extent_map_tree *tree;
-	get_extent_t *get_extent;
-};
 
 int __init extent_map_init(void)
 {
@@ -50,72 +22,23 @@ int __init extent_map_init(void)
 					    NULL);
 	if (!extent_map_cache)
 		return -ENOMEM;
-	extent_state_cache = btrfs_cache_create("extent_state",
-					    sizeof(struct extent_state), 0,
-					    NULL);
-	if (!extent_state_cache)
-		goto free_map_cache;
-	extent_buffer_cache = btrfs_cache_create("extent_buffers",
-					    sizeof(struct extent_buffer), 0,
-					    NULL);
-	if (!extent_buffer_cache)
-		goto free_state_cache;
 	return 0;
-
-free_state_cache:
-	kmem_cache_destroy(extent_state_cache);
-free_map_cache:
-	kmem_cache_destroy(extent_map_cache);
-	return -ENOMEM;
 }
 
 void extent_map_exit(void)
 {
-	struct extent_state *state;
-
-	while (!list_empty(&states)) {
-		state = list_entry(states.next, struct extent_state, list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
-		list_del(&state->list);
-		kmem_cache_free(extent_state_cache, state);
-
-	}
-
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
-	if (extent_state_cache)
-		kmem_cache_destroy(extent_state_cache);
-	if (extent_buffer_cache)
-		kmem_cache_destroy(extent_buffer_cache);
 }
 
-void extent_map_tree_init(struct extent_map_tree *tree,
-			  struct address_space *mapping, gfp_t mask)
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
-	tree->state.rb_node = NULL;
-	tree->ops = NULL;
-	tree->dirty_bytes = 0;
-	rwlock_init(&tree->lock);
-	spin_lock_init(&tree->lru_lock);
-	tree->mapping = mapping;
-	INIT_LIST_HEAD(&tree->buffer_lru);
-	tree->lru_size = 0;
+	tree->last = NULL;
+	spin_lock_init(&tree->lock);
 }
 EXPORT_SYMBOL(extent_map_tree_init);
 
-void extent_map_tree_empty_lru(struct extent_map_tree *tree)
-{
-	struct extent_buffer *eb;
-	while(!list_empty(&tree->buffer_lru)) {
-		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
-				lru);
-		list_del_init(&eb->lru);
-		free_extent_buffer(eb);
-	}
-}
-EXPORT_SYMBOL(extent_map_tree_empty_lru);
-
 struct extent_map *alloc_extent_map(gfp_t mask)
 {
 	struct extent_map *em;
@@ -123,6 +46,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 	if (!em || IS_ERR(em))
 		return em;
 	em->in_tree = 0;
+	em->flags = 0;
 	atomic_set(&em->refs, 1);
 	return em;
 }
@@ -132,6 +56,7 @@ void free_extent_map(struct extent_map *em)
 {
 	if (!em)
 		return;
+	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
 		kmem_cache_free(extent_map_cache, em);
@@ -139,64 +64,28 @@ void free_extent_map(struct extent_map *em)
 }
 EXPORT_SYMBOL(free_extent_map);
 
-
-struct extent_state *alloc_extent_state(gfp_t mask)
-{
-	struct extent_state *state;
-	unsigned long flags;
-
-	state = kmem_cache_alloc(extent_state_cache, mask);
-	if (!state || IS_ERR(state))
-		return state;
-	state->state = 0;
-	state->in_tree = 0;
-	state->private = 0;
-
-	spin_lock_irqsave(&state_lock, flags);
-	list_add(&state->list, &states);
-	spin_unlock_irqrestore(&state_lock, flags);
-
-	atomic_set(&state->refs, 1);
-	init_waitqueue_head(&state->wq);
-	return state;
-}
-EXPORT_SYMBOL(alloc_extent_state);
-
-void free_extent_state(struct extent_state *state)
-{
-	unsigned long flags;
-	if (!state)
-		return;
-	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->in_tree);
-		spin_lock_irqsave(&state_lock, flags);
-		list_del(&state->list);
-		spin_unlock_irqrestore(&state_lock, flags);
-		kmem_cache_free(extent_state_cache, state);
-	}
-}
-EXPORT_SYMBOL(free_extent_state);
-
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
-	struct tree_entry *entry;
+	struct extent_map *entry;
 
 	while(*p) {
 		parent = *p;
-		entry = rb_entry(parent, struct tree_entry, rb_node);
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		WARN_ON(!entry->in_tree);
 
 		if (offset < entry->start)
 			p = &(*p)->rb_left;
-		else if (offset > entry->end)
+		else if (offset >= extent_map_end(entry))
 			p = &(*p)->rb_right;
 		else
 			return parent;
 	}
 
-	entry = rb_entry(node, struct tree_entry, rb_node);
+	entry = rb_entry(node, struct extent_map, rb_node);
 	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
@@ -210,17 +99,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
-	struct tree_entry *entry;
-	struct tree_entry *prev_entry = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
 
 	while(n) {
-		entry = rb_entry(n, struct tree_entry, rb_node);
+		entry = rb_entry(n, struct extent_map, rb_node);
 		prev = n;
 		prev_entry = entry;
 
+		WARN_ON(!entry->in_tree);
+
 		if (offset < entry->start)
 			n = n->rb_left;
-		else if (offset > entry->end)
+		else if (offset >= extent_map_end(entry))
 			n = n->rb_right;
 		else
 			return n;
@@ -228,19 +119,19 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset > prev_entry->end) {
+		while(prev && offset >= extent_map_end(prev_entry)) {
 			prev = rb_next(prev);
-			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
 		*prev_ret = prev;
 		prev = orig_prev;
 	}
 
 	if (next_ret) {
-		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		while(prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
-			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
 		*next_ret = prev;
 	}
@@ -257,22 +148,26 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 	return ret;
 }
 
-static int tree_delete(struct rb_root *root, u64 offset)
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 {
-	struct rb_node *node;
-	struct tree_entry *entry;
-
-	node = __tree_search(root, offset, NULL, NULL);
-	if (!node)
-		return -ENOENT;
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	entry->in_tree = 0;
-	rb_erase(node, root);
+	if (extent_map_end(prev) == next->start &&
+	    prev->flags == next->flags &&
+	    prev->bdev == next->bdev &&
+	    ((next->block_start == EXTENT_MAP_HOLE &&
+	      prev->block_start == EXTENT_MAP_HOLE) ||
+	     (next->block_start == EXTENT_MAP_INLINE &&
+	      prev->block_start == EXTENT_MAP_INLINE) ||
+	     (next->block_start == EXTENT_MAP_DELALLOC &&
+	      prev->block_start == EXTENT_MAP_DELALLOC) ||
+	     (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
+	      next->block_start == extent_map_block_end(prev)))) {
+		return 1;
+	}
 	return 0;
 }
 
 /*
- * add_extent_mapping tries a simple backward merge with existing
+ * add_extent_mapping tries a simple forward/backward merge with existing
  * mappings.  The extent_map struct passed in will be inserted into
  * the tree directly (no copies made, just a reference taken).
  */
@@ -280,13 +175,12 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *prev = NULL;
+	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
-	write_lock_irq(&tree->lock);
-	rb = tree_insert(&tree->map, em->end, &em->rb_node);
+	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
-		prev = rb_entry(rb, struct extent_map, rb_node);
+		merge = rb_entry(rb, struct extent_map, rb_node);
 		ret = -EEXIST;
 		goto out;
 	}
@@ -294,53 +188,60 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
 		if (rb)
-			prev = rb_entry(rb, struct extent_map, rb_node);
-		if (prev && prev->end + 1 == em->start &&
-		    ((em->block_start == EXTENT_MAP_HOLE &&
-		      prev->block_start == EXTENT_MAP_HOLE) ||
-		     (em->block_start == EXTENT_MAP_INLINE &&
-		      prev->block_start == EXTENT_MAP_INLINE) ||
-		     (em->block_start == EXTENT_MAP_DELALLOC &&
-		      prev->block_start == EXTENT_MAP_DELALLOC) ||
-		     (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
-		      em->block_start == prev->block_end + 1))) {
-			em->start = prev->start;
-			em->block_start = prev->block_start;
-			rb_erase(&prev->rb_node, &tree->map);
-			prev->in_tree = 0;
-			free_extent_map(prev);
+			merge = rb_entry(rb, struct extent_map, rb_node);
+		if (rb && mergable_maps(merge, em)) {
+			em->start = merge->start;
+			em->len += merge->len;
+			em->block_start = merge->block_start;
+			merge->in_tree = 0;
+			rb_erase(&merge->rb_node, &tree->map);
+			free_extent_map(merge);
 		}
 	 }
+	rb = rb_next(&em->rb_node);
+	if (rb)
+		merge = rb_entry(rb, struct extent_map, rb_node);
+	if (rb && mergable_maps(em, merge)) {
+		em->len += merge->len;
+		rb_erase(&merge->rb_node, &tree->map);
+		merge->in_tree = 0;
+		free_extent_map(merge);
+	}
+	tree->last = em;
 out:
-	write_unlock_irq(&tree->lock);
 	return ret;
 }
 EXPORT_SYMBOL(add_extent_mapping);
 
+/* return start + len, saturated to (u64)-1 when the u64 sum wraps around */
+static u64 range_end(u64 start, u64 len)
+{
+	u64 end = start + len;
+	return end < start ? (u64)-1 : end;
+}
+
 /*
  * lookup_extent_mapping returns the first extent_map struct in the
- * tree that intersects the [start, end] (inclusive) range.  There may
+ * tree that intersects the [start, start + len) range.  There may
  * be additional objects in the tree that intersect, so check the object
  * returned carefully to make sure you don't need additional lookups.
  */
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 end)
+					 u64 start, u64 len)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
+	struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found;
 
-	read_lock_irq(&tree->lock);
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
-		if (em->start <= end && em->end >= start)
+		if (end > em->start && start < extent_map_end(em))
 			goto found;
 	}
 	if (!rb_node && next) {
 		em = rb_entry(next, struct extent_map, rb_node);
-		if (em->start <= end && em->end >= start)
+		if (end > em->start && start < extent_map_end(em))
 			goto found;
 	}
 	if (!rb_node) {
@@ -352,14 +253,16 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (em->end < start || em->start > end) {
-		em = NULL;
-		goto out;
-	}
+	if (end > em->start && start < extent_map_end(em))
+		goto found;
+
+	em = NULL;
+	goto out;
+
 found:
 	atomic_inc(&em->refs);
+	tree->last = em;
 out:
-	read_unlock_irq(&tree->lock);
 	return em;
 }
 EXPORT_SYMBOL(lookup_extent_mapping);
@@ -370,2866 +273,12 @@ EXPORT_SYMBOL(lookup_extent_mapping);
  */
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret;
+	int ret = 0;
 
-	write_lock_irq(&tree->lock);
-	ret = tree_delete(&tree->map, em->end);
-	write_unlock_irq(&tree->lock);
+	rb_erase(&em->rb_node, &tree->map);
+	em->in_tree = 0;
+	if (tree->last == em)
+		tree->last = NULL;
 	return ret;
 }
 EXPORT_SYMBOL(remove_extent_mapping);
-
-/*
- * utility function to look for merge candidates inside a given range.
- * Any extents with matching state are merged together into a single
- * extent in the tree.  Extents with EXTENT_IO in their state field
- * are not merged because the end_io handlers need to be able to do
- * operations on them without sleeping (or doing allocations/splits).
- *
- * This should be called with the tree lock held.
- */
-static int merge_state(struct extent_map_tree *tree,
-		       struct extent_state *state)
-{
-	struct extent_state *other;
-	struct rb_node *other_node;
-
-	if (state->state & EXTENT_IOBITS)
-		return 0;
-
-	other_node = rb_prev(&state->rb_node);
-	if (other_node) {
-		other = rb_entry(other_node, struct extent_state, rb_node);
-		if (other->end == state->start - 1 &&
-		    other->state == state->state) {
-			state->start = other->start;
-			other->in_tree = 0;
-			rb_erase(&other->rb_node, &tree->state);
-			free_extent_state(other);
-		}
-	}
-	other_node = rb_next(&state->rb_node);
-	if (other_node) {
-		other = rb_entry(other_node, struct extent_state, rb_node);
-		if (other->start == state->end + 1 &&
-		    other->state == state->state) {
-			other->start = state->start;
-			state->in_tree = 0;
-			rb_erase(&state->rb_node, &tree->state);
-			free_extent_state(state);
-		}
-	}
-	return 0;
-}
-
-/*
- * insert an extent_state struct into the tree.  'bits' are set on the
- * struct before it is inserted.
- *
- * This may return -EEXIST if the extent is already there, in which case the
- * state struct is freed.
- *
- * The tree lock is not taken internally.  This is a utility function and
- * probably isn't what you want to call (see set/clear_extent_bit).
- */
-static int insert_state(struct extent_map_tree *tree,
-			struct extent_state *state, u64 start, u64 end,
-			int bits)
-{
-	struct rb_node *node;
-
-	if (end < start) {
-		printk("end < start %Lu %Lu\n", end, start);
-		WARN_ON(1);
-	}
-	if (bits & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
-	state->state |= bits;
-	state->start = start;
-	state->end = end;
-	node = tree_insert(&tree->state, end, &state->rb_node);
-	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
-		free_extent_state(state);
-		return -EEXIST;
-	}
-	merge_state(tree, state);
-	return 0;
-}
-
-/*
- * split a given extent state struct in two, inserting the preallocated
- * struct 'prealloc' as the newly created second half.  'split' indicates an
- * offset inside 'orig' where it should be split.
- *
- * Before calling,
- * the tree has 'orig' at [orig->start, orig->end].  After calling, there
- * are two extent state structs in the tree:
- * prealloc: [orig->start, split - 1]
- * orig: [ split, orig->end ]
- *
- * The tree locks are not taken by this function. They need to be held
- * by the caller.
- */
-static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
-		       struct extent_state *prealloc, u64 split)
-{
-	struct rb_node *node;
-	prealloc->start = orig->start;
-	prealloc->end = split - 1;
-	prealloc->state = orig->state;
-	orig->start = split;
-
-	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
-	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
-		free_extent_state(prealloc);
-		return -EEXIST;
-	}
-	return 0;
-}
-
-/*
- * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
- *
- * If no bits are set on the state struct after clearing things, the
- * struct is freed and removed from the tree
- */
-static int clear_state_bit(struct extent_map_tree *tree,
-			    struct extent_state *state, int bits, int wake,
-			    int delete)
-{
-	int ret = state->state & bits;
-
-	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
-		u64 range = state->end - state->start + 1;
-		WARN_ON(range > tree->dirty_bytes);
-		tree->dirty_bytes -= range;
-	}
-	state->state &= ~bits;
-	if (wake)
-		wake_up(&state->wq);
-	if (delete || state->state == 0) {
-		if (state->in_tree) {
-			rb_erase(&state->rb_node, &tree->state);
-			state->in_tree = 0;
-			free_extent_state(state);
-		} else {
-			WARN_ON(1);
-		}
-	} else {
-		merge_state(tree, state);
-	}
-	return ret;
-}
-
-/*
- * clear some bits on a range in the tree.  This may require splitting
- * or inserting elements in the tree, so the gfp mask is used to
- * indicate which allocations or sleeping are allowed.
- *
- * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
- * the given range from the tree regardless of state (ie for truncate).
- *
- * the range [start, end] is inclusive.
- *
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
- */
-int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, gfp_t mask)
-{
-	struct extent_state *state;
-	struct extent_state *prealloc = NULL;
-	struct rb_node *node;
-	unsigned long flags;
-	int err;
-	int set = 0;
-
-again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		if (!prealloc)
-			return -ENOMEM;
-	}
-
-	write_lock_irqsave(&tree->lock, flags);
-	/*
-	 * this search will find the extents that end after
-	 * our range starts
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node)
-		goto out;
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start > end)
-		goto out;
-	WARN_ON(state->end < start);
-
-	/*
-	 *     | ---- desired range ---- |
-	 *  | state | or
-	 *  | ------------- state -------------- |
-	 *
-	 * We need to split the extent we found, and may flip
-	 * bits on second half.
-	 *
-	 * If the extent we found extends past our range, we
-	 * just split and search again.  It'll get split again
-	 * the next time though.
-	 *
-	 * If the extent we found is inside our range, we clear
-	 * the desired bit on it.
-	 */
-
-	if (state->start < start) {
-		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
-		if (err)
-			goto out;
-		if (state->end <= end) {
-			start = state->end + 1;
-			set |= clear_state_bit(tree, state, bits,
-					wake, delete);
-		} else {
-			start = state->start;
-		}
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *                        | state |
-	 * We need to split the extent, and clear the bit
-	 * on the first half
-	 */
-	if (state->start <= end && state->end > end) {
-		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
-
-		if (wake)
-			wake_up(&state->wq);
-		set |= clear_state_bit(tree, prealloc, bits,
-				       wake, delete);
-		prealloc = NULL;
-		goto out;
-	}
-
-	start = state->end + 1;
-	set |= clear_state_bit(tree, state, bits, wake, delete);
-	goto search_again;
-
-out:
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (prealloc)
-		free_extent_state(prealloc);
-
-	return set;
-
-search_again:
-	if (start > end)
-		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (mask & __GFP_WAIT)
-		cond_resched();
-	goto again;
-}
-EXPORT_SYMBOL(clear_extent_bit);
-
-static int wait_on_state(struct extent_map_tree *tree,
-			 struct extent_state *state)
-{
-	DEFINE_WAIT(wait);
-	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	read_unlock_irq(&tree->lock);
-	schedule();
-	read_lock_irq(&tree->lock);
-	finish_wait(&state->wq, &wait);
-	return 0;
-}
-
-/*
- * waits for one or more bits to clear on a range in the state tree.
- * The range [start, end] is inclusive.
- * The tree lock is taken by this function
- */
-int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
-{
-	struct extent_state *state;
-	struct rb_node *node;
-
-	read_lock_irq(&tree->lock);
-again:
-	while (1) {
-		/*
-		 * this search will find all the extents that end after
-		 * our range starts
-		 */
-		node = tree_search(&tree->state, start);
-		if (!node)
-			break;
-
-		state = rb_entry(node, struct extent_state, rb_node);
-
-		if (state->start > end)
-			goto out;
-
-		if (state->state & bits) {
-			start = state->start;
-			atomic_inc(&state->refs);
-			wait_on_state(tree, state);
-			free_extent_state(state);
-			goto again;
-		}
-		start = state->end + 1;
-
-		if (start > end)
-			break;
-
-		if (need_resched()) {
-			read_unlock_irq(&tree->lock);
-			cond_resched();
-			read_lock_irq(&tree->lock);
-		}
-	}
-out:
-	read_unlock_irq(&tree->lock);
-	return 0;
-}
-EXPORT_SYMBOL(wait_extent_bit);
-
-static void set_state_bits(struct extent_map_tree *tree,
-			   struct extent_state *state,
-			   int bits)
-{
-	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
-		u64 range = state->end - state->start + 1;
-		tree->dirty_bytes += range;
-	}
-	state->state |= bits;
-}
-
-/*
- * set some bits on a range in the tree.  This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
- *
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set.  The start of the existing
- * range is returned in failed_start in this case.
- *
- * [start, end] is inclusive
- * This takes the tree lock.
- */
-int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
-		   int exclusive, u64 *failed_start, gfp_t mask)
-{
-	struct extent_state *state;
-	struct extent_state *prealloc = NULL;
-	struct rb_node *node;
-	unsigned long flags;
-	int err = 0;
-	int set;
-	u64 last_start;
-	u64 last_end;
-again:
-	if (!prealloc && (mask & __GFP_WAIT)) {
-		prealloc = alloc_extent_state(mask);
-		if (!prealloc)
-			return -ENOMEM;
-	}
-
-	write_lock_irqsave(&tree->lock, flags);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node) {
-		err = insert_state(tree, prealloc, start, end, bits);
-		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
-		goto out;
-	}
-
-	state = rb_entry(node, struct extent_state, rb_node);
-	last_start = state->start;
-	last_end = state->end;
-
-	/*
-	 * | ---- desired range ---- |
-	 * | state |
-	 *
-	 * Just lock what we found and keep going
-	 */
-	if (state->start == start && state->end <= end) {
-		set = state->state & bits;
-		if (set && exclusive) {
-			*failed_start = state->start;
-			err = -EEXIST;
-			goto out;
-		}
-		set_state_bits(tree, state, bits);
-		start = state->end + 1;
-		merge_state(tree, state);
-		goto search_again;
-	}
-
-	/*
-	 *     | ---- desired range ---- |
-	 * | state |
-	 *   or
-	 * | ------------- state -------------- |
-	 *
-	 * We need to split the extent we found, and may flip bits on
-	 * second half.
-	 *
-	 * If the extent we found extends past our
-	 * range, we just split and search again.  It'll get split
-	 * again the next time though.
-	 *
-	 * If the extent we found is inside our range, we set the
-	 * desired bit on it.
-	 */
-	if (state->start < start) {
-		set = state->state & bits;
-		if (exclusive && set) {
-			*failed_start = start;
-			err = -EEXIST;
-			goto out;
-		}
-		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
-		prealloc = NULL;
-		if (err)
-			goto out;
-		if (state->end <= end) {
-			set_state_bits(tree, state, bits);
-			start = state->end + 1;
-			merge_state(tree, state);
-		} else {
-			start = state->start;
-		}
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *     | state | or               | state |
-	 *
-	 * There's a hole, we need to insert something in it and
-	 * ignore the extent we found.
-	 */
-	if (state->start > start) {
-		u64 this_end;
-		if (end < last_start)
-			this_end = end;
-		else
-			this_end = last_start -1;
-		err = insert_state(tree, prealloc, start, this_end,
-				   bits);
-		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
-		if (err)
-			goto out;
-		start = this_end + 1;
-		goto search_again;
-	}
-	/*
-	 * | ---- desired range ---- |
-	 *                        | state |
-	 * We need to split the extent, and set the bit
-	 * on the first half
-	 */
-	if (state->start <= end && state->end > end) {
-		set = state->state & bits;
-		if (exclusive && set) {
-			*failed_start = start;
-			err = -EEXIST;
-			goto out;
-		}
-		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
-
-		set_state_bits(tree, prealloc, bits);
-		merge_state(tree, prealloc);
-		prealloc = NULL;
-		goto out;
-	}
-
-	goto search_again;
-
-out:
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (prealloc)
-		free_extent_state(prealloc);
-
-	return err;
-
-search_again:
-	if (start > end)
-		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
-	if (mask & __GFP_WAIT)
-		cond_resched();
-	goto again;
-}
-EXPORT_SYMBOL(set_extent_bit);
-
-/* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_dirty);
-
-int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, bits, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_bits);
-
-int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_bits);
-
-int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_delalloc);
-
-int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_dirty);
-
-int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_new);
-
-int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_new);
-
-int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
-			      mask);
-}
-EXPORT_SYMBOL(set_extent_uptodate);
-
-int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			  gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_uptodate);
-
-int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
-			 gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
-			      0, NULL, mask);
-}
-EXPORT_SYMBOL(set_extent_writeback);
-
-int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
-			   gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
-}
-EXPORT_SYMBOL(clear_extent_writeback);
-
-int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
-}
-EXPORT_SYMBOL(wait_on_extent_writeback);
-
-/*
- * locks a range in ascending order, waiting for any locked regions
- * it hits on the way.  [start,end] are inclusive, and this will sleep.
- */
-int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
-{
-	int err;
-	u64 failed_start;
-	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
-				     &failed_start, mask);
-		if (err == -EEXIST && (mask & __GFP_WAIT)) {
-			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
-			start = failed_start;
-		} else {
-			break;
-		}
-		WARN_ON(start > end);
-	}
-	return err;
-}
-EXPORT_SYMBOL(lock_extent);
-
-int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
-		  gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
-}
-EXPORT_SYMBOL(unlock_extent);
-
-/*
- * helper function to set pages and extents in the tree dirty
- */
-int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
-		__set_page_dirty_nobuffers(page);
-		page_cache_release(page);
-		index++;
-	}
-	set_extent_dirty(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(set_range_dirty);
-
-/*
- * helper function to set both pages and extents in the tree writeback
- */
-int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
-		set_page_writeback(page);
-		page_cache_release(page);
-		index++;
-	}
-	set_extent_writeback(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(set_range_writeback);
-
-int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	read_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	read_unlock_irq(&tree->lock);
-	return ret;
-}
-EXPORT_SYMBOL(find_first_extent_bit);
-
-u64 find_lock_delalloc_range(struct extent_map_tree *tree,
-			     u64 *start, u64 *end, u64 max_bytes)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	u64 cur_start = *start;
-	u64 found = 0;
-	u64 total_bytes = 0;
-
-	write_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-search_again:
-	node = tree_search(&tree->state, cur_start);
-	if (!node || IS_ERR(node)) {
-		*end = (u64)-1;
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (found && state->start != cur_start) {
-			goto out;
-		}
-		if (!(state->state & EXTENT_DELALLOC)) {
-			if (!found)
-				*end = state->end;
-			goto out;
-		}
-		if (!found) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if (!(prev_state->state & EXTENT_DELALLOC))
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			write_unlock_irq(&tree->lock);
-			schedule();
-			write_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		state->state |= EXTENT_LOCKED;
-		if (!found)
-			*start = state->start;
-		found++;
-		*end = state->end;
-		cur_start = state->end + 1;
-		node = rb_next(node);
-		if (!node)
-			break;
-		total_bytes += state->end - state->start + 1;
-		if (total_bytes >= max_bytes)
-			break;
-	}
-out:
-	write_unlock_irq(&tree->lock);
-	return found;
-}
-
-u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 search_end, u64 max_bytes,
-		     unsigned long bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	u64 cur_start = *start;
-	u64 total_bytes = 0;
-	int found = 0;
-
-	if (search_end <= cur_start) {
-		printk("search_end %Lu start %Lu\n", search_end, cur_start);
-		WARN_ON(1);
-		return 0;
-	}
-
-	write_lock_irq(&tree->lock);
-	if (cur_start == 0 && bits == EXTENT_DIRTY) {
-		total_bytes = tree->dirty_bytes;
-		goto out;
-	}
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, cur_start);
-	if (!node || IS_ERR(node)) {
-		goto out;
-	}
-
-	while(1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->start > search_end)
-			break;
-		if (state->end >= cur_start && (state->state & bits)) {
-			total_bytes += min(search_end, state->end) + 1 -
-				       max(cur_start, state->start);
-			if (total_bytes >= max_bytes)
-				break;
-			if (!found) {
-				*start = state->start;
-				found = 1;
-			}
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	write_unlock_irq(&tree->lock);
-	return total_bytes;
-}
-/*
- * helper function to lock both pages and extents in the tree.
- * pages must be locked first.
- */
-int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-	int err;
-
-	while (index <= end_index) {
-		page = grab_cache_page(tree->mapping, index);
-		if (!page) {
-			err = -ENOMEM;
-			goto failed;
-		}
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto failed;
-		}
-		index++;
-	}
-	lock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-
-failed:
-	/*
-	 * we failed above in getting the page at 'index', so we undo here
-	 * up to but not including the page at 'index'
-	 */
-	end_index = index;
-	index = start >> PAGE_CACHE_SHIFT;
-	while (index < end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	return err;
-}
-EXPORT_SYMBOL(lock_range);
-
-/*
- * helper function to unlock both pages and extents in the tree.
- */
-int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
-{
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
-	struct page *page;
-
-	while (index <= end_index) {
-		page = find_get_page(tree->mapping, index);
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
-	}
-	unlock_extent(tree, start, end, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(unlock_range);
-
-int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 0;
-
-	write_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start != start) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state->private = private;
-out:
-	write_unlock_irq(&tree->lock);
-	return ret;
-}
-
-int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 0;
-
-	read_lock_irq(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(&tree->state, start);
-	if (!node || IS_ERR(node)) {
-		ret = -ENOENT;
-		goto out;
-	}
-	state = rb_entry(node, struct extent_state, rb_node);
-	if (state->start != start) {
-		ret = -ENOENT;
-		goto out;
-	}
-	*private = state->private;
-out:
-	read_unlock_irq(&tree->lock);
-	return ret;
-}
-
-/*
- * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if ever extent in the tree
- * has the bits set.  Otherwise, 1 is returned if any bit in the
- * range is found set.
- */
-int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		   int bits, int filled)
-{
-	struct extent_state *state = NULL;
-	struct rb_node *node;
-	int bitset = 0;
-
-	read_lock_irq(&tree->lock);
-	node = tree_search(&tree->state, start);
-	while (node && start <= end) {
-		state = rb_entry(node, struct extent_state, rb_node);
-
-		if (filled && state->start > start) {
-			bitset = 0;
-			break;
-		}
-
-		if (state->start > end)
-			break;
-
-		if (state->state & bits) {
-			bitset = 1;
-			if (!filled)
-				break;
-		} else if (filled) {
-			bitset = 0;
-			break;
-		}
-		start = state->end + 1;
-		if (start > end)
-			break;
-		node = rb_next(node);
-		if (!node) {
-			if (filled)
-				bitset = 0;
-			break;
-		}
-	}
-	read_unlock_irq(&tree->lock);
-	return bitset;
-}
-EXPORT_SYMBOL(test_range_bit);
-
-/*
- * helper function to set a given page up to date if all the
- * extents in the tree for that page are up to date
- */
-static int check_page_uptodate(struct extent_map_tree *tree,
-			       struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
-		SetPageUptodate(page);
-	return 0;
-}
-
-/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static int check_page_locked(struct extent_map_tree *tree,
-			     struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
-		unlock_page(page);
-	return 0;
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static int check_page_writeback(struct extent_map_tree *tree,
-			     struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
-		end_page_writeback(page);
-	return 0;
-}
-
-/* lots and lots of room for performance fixes in the end_bio funcs */
-
-/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
- *
- * Scheduling is not allowed, so the extent state tree is expected
- * to have one and only one object corresponding to this IO.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_writepage(struct bio *bio, int err)
-#else
-static int end_bio_extent_writepage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-	int whole_page;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			 bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
-		if (whole_page)
-			end_page_writeback(page);
-		else
-			check_page_writeback(tree, page);
-		if (tree->ops && tree->ops->writepage_end_io_hook)
-			tree->ops->writepage_end_io_hook(page, start, end);
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
- *
- * Scheduling is not allowed, so the extent state tree is expected
- * to have one and only one object corresponding to this IO.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_readpage(struct bio *bio, int err)
-#else
-static int end_bio_extent_readpage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
-{
-	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-	int whole_page;
-	int ret;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
-			ret = tree->ops->readpage_end_io_hook(page, start, end);
-			if (ret)
-				uptodate = 0;
-		}
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
-			if (whole_page)
-				SetPageUptodate(page);
-			else
-				check_page_uptodate(tree, page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-
-		unlock_extent(tree, start, end, GFP_ATOMIC);
-
-		if (whole_page)
-			unlock_page(page);
-		else
-			check_page_locked(tree, page);
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-/*
- * IO done from prepare_write is pretty simple, we just unlock
- * the structs in the extent tree when done, and set the uptodate bits
- * as appropriate.
- */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void end_bio_extent_preparewrite(struct bio *bio, int err)
-#else
-static int end_bio_extent_preparewrite(struct bio *bio,
-				       unsigned int bytes_done, int err)
-#endif
-{
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_map_tree *tree = bio->bi_private;
-	u64 start;
-	u64 end;
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
-	do {
-		struct page *page = bvec->bv_page;
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
-
-		if (--bvec >= bio->bi_io_vec)
-			prefetchw(&bvec->bv_page->flags);
-
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
-
-		unlock_extent(tree, start, end, GFP_ATOMIC);
-
-	} while (bvec >= bio->bi_io_vec);
-
-	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
-}
-
-static struct bio *
-extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
-		 gfp_t gfp_flags)
-{
-	struct bio *bio;
-
-	bio = bio_alloc(gfp_flags, nr_vecs);
-
-	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
-	}
-
-	if (bio) {
-		bio->bi_bdev = bdev;
-		bio->bi_sector = first_sector;
-	}
-	return bio;
-}
-
-static int submit_one_bio(int rw, struct bio *bio)
-{
-	u64 maxsector;
-	int ret = 0;
-
-	bio_get(bio);
-
-        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
-	if (maxsector < bio->bi_sector) {
-		printk("sector too large max %Lu got %llu\n", maxsector,
-			(unsigned long long)bio->bi_sector);
-		WARN_ON(1);
-	}
-
-	submit_bio(rw, bio);
-	if (bio_flagged(bio, BIO_EOPNOTSUPP))
-		ret = -EOPNOTSUPP;
-	bio_put(bio);
-	return ret;
-}
-
-static int submit_extent_page(int rw, struct extent_map_tree *tree,
-			      struct page *page, sector_t sector,
-			      size_t size, unsigned long offset,
-			      struct block_device *bdev,
-			      struct bio **bio_ret,
-			      unsigned long max_pages,
-			      bio_end_io_t end_io_func)
-{
-	int ret = 0;
-	struct bio *bio;
-	int nr;
-
-	if (bio_ret && *bio_ret) {
-		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio);
-			bio = NULL;
-		} else {
-			return 0;
-		}
-	}
-	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
-	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
-	if (!bio) {
-		printk("failed to allocate bio nr %d\n", nr);
-	}
-	bio_add_page(bio, page, size, offset);
-	bio->bi_end_io = end_io_func;
-	bio->bi_private = tree;
-	if (bio_ret) {
-		*bio_ret = bio;
-	} else {
-		ret = submit_one_bio(rw, bio);
-	}
-
-	return ret;
-}
-
-void set_page_extent_mapped(struct page *page)
-{
-	if (!PagePrivate(page)) {
-		SetPagePrivate(page);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
-		page_cache_get(page);
-	}
-}
-
-void set_page_extent_head(struct page *page, unsigned long len)
-{
-	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
-}
-
-/*
- * basic readpage implementation.  Locked extent state structs are inserted
- * into the tree that are removed when the IO is done (by the end_io
- * handlers)
- */
-static int __extent_read_full_page(struct extent_map_tree *tree,
-				   struct page *page,
-				   get_extent_t *get_extent,
-				   struct bio **bio)
-{
-	struct inode *inode = page->mapping->host;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
-	u64 end;
-	u64 cur = start;
-	u64 extent_offset;
-	u64 last_byte = i_size_read(inode);
-	u64 block_start;
-	u64 cur_end;
-	sector_t sector;
-	struct extent_map *em;
-	struct block_device *bdev;
-	int ret;
-	int nr = 0;
-	size_t page_offset = 0;
-	size_t iosize;
-	size_t blocksize = inode->i_sb->s_blocksize;
-
-	set_page_extent_mapped(page);
-
-	end = page_end;
-	lock_extent(tree, start, end, GFP_NOFS);
-
-	while (cur <= end) {
-		if (cur >= last_byte) {
-			char *userpage;
-			iosize = PAGE_CACHE_SIZE - page_offset;
-			userpage = kmap_atomic(page, KM_USER0);
-			memset(userpage + page_offset, 0, iosize);
-			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			break;
-		}
-		em = get_extent(inode, page, page_offset, cur, end, 0);
-		if (IS_ERR(em) || !em) {
-			SetPageError(page);
-			unlock_extent(tree, cur, end, GFP_NOFS);
-			break;
-		}
-
-		extent_offset = cur - em->start;
-		BUG_ON(em->end < cur);
-		BUG_ON(end < cur);
-
-		iosize = min(em->end - cur, end - cur) + 1;
-		cur_end = min(em->end, end);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
-		bdev = em->bdev;
-		block_start = em->block_start;
-		free_extent_map(em);
-		em = NULL;
-
-		/* we've found a hole, just zero and go on */
-		if (block_start == EXTENT_MAP_HOLE) {
-			char *userpage;
-			userpage = kmap_atomic(page, KM_USER0);
-			memset(userpage + page_offset, 0, iosize);
-			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
-
-			set_extent_uptodate(tree, cur, cur + iosize - 1,
-					    GFP_NOFS);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-		/* the get_extent function already copied into the page */
-		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-
-		ret = 0;
-		if (tree->ops && tree->ops->readpage_io_hook) {
-			ret = tree->ops->readpage_io_hook(page, cur,
-							  cur + iosize - 1);
-		}
-		if (!ret) {
-			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-			nr -= page->index;
-			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
-					 bdev, bio, nr,
-					 end_bio_extent_readpage);
-		}
-		if (ret)
-			SetPageError(page);
-		cur = cur + iosize;
-		page_offset += iosize;
-		nr++;
-	}
-	if (!nr) {
-		if (!PageError(page))
-			SetPageUptodate(page);
-		unlock_page(page);
-	}
-	return 0;
-}
-
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
-			    get_extent_t *get_extent)
-{
-	struct bio *bio = NULL;
-	int ret;
-
-	ret = __extent_read_full_page(tree, page, get_extent, &bio);
-	if (bio)
-		submit_one_bio(READ, bio);
-	return ret;
-}
-EXPORT_SYMBOL(extent_read_full_page);
-
-/*
- * the writepage semantics are similar to regular writepage.  extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback.  Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
- */
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
-			      void *data)
-{
-	struct inode *inode = page->mapping->host;
-	struct extent_page_data *epd = data;
-	struct extent_map_tree *tree = epd->tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 delalloc_start;
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
-	u64 end;
-	u64 cur = start;
-	u64 extent_offset;
-	u64 last_byte = i_size_read(inode);
-	u64 block_start;
-	u64 iosize;
-	sector_t sector;
-	struct extent_map *em;
-	struct block_device *bdev;
-	int ret;
-	int nr = 0;
-	size_t page_offset = 0;
-	size_t blocksize;
-	loff_t i_size = i_size_read(inode);
-	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
-	u64 nr_delalloc;
-	u64 delalloc_end;
-
-	WARN_ON(!PageLocked(page));
-	if (page->index > end_index) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		unlock_page(page);
-		return 0;
-	}
-
-	if (page->index == end_index) {
-		char *userpage;
-
-		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-
-		userpage = kmap_atomic(page, KM_USER0);
-		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
-		kunmap_atomic(userpage, KM_USER0);
-	}
-
-	set_page_extent_mapped(page);
-
-	delalloc_start = start;
-	delalloc_end = 0;
-	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
-						       &delalloc_end,
-						       128 * 1024 * 1024);
-		if (nr_delalloc == 0) {
-			delalloc_start = delalloc_end + 1;
-			continue;
-		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
-		delalloc_start = delalloc_end + 1;
-	}
-	lock_extent(tree, start, page_end, GFP_NOFS);
-
-	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after lock_extent\n");
-	}
-
-	if (last_byte <= start) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
-		goto done;
-	}
-
-	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
-	blocksize = inode->i_sb->s_blocksize;
-
-	while (cur <= end) {
-		if (cur >= last_byte) {
-			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
-			break;
-		}
-		em = epd->get_extent(inode, page, page_offset, cur, end, 1);
-		if (IS_ERR(em) || !em) {
-			SetPageError(page);
-			break;
-		}
-
-		extent_offset = cur - em->start;
-		BUG_ON(em->end < cur);
-		BUG_ON(end < cur);
-		iosize = min(em->end - cur, end - cur) + 1;
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
-		bdev = em->bdev;
-		block_start = em->block_start;
-		free_extent_map(em);
-		em = NULL;
-
-		if (block_start == EXTENT_MAP_HOLE ||
-		    block_start == EXTENT_MAP_INLINE) {
-			clear_extent_dirty(tree, cur,
-					   cur + iosize - 1, GFP_NOFS);
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-
-		/* leave this out until we have a page_mkwrite call */
-		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
-				   EXTENT_DIRTY, 0)) {
-			cur = cur + iosize;
-			page_offset += iosize;
-			continue;
-		}
-		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
-		if (tree->ops && tree->ops->writepage_io_hook) {
-			ret = tree->ops->writepage_io_hook(page, cur,
-						cur + iosize - 1);
-		} else {
-			ret = 0;
-		}
-		if (ret)
-			SetPageError(page);
-		else {
-			unsigned long max_nr = end_index + 1;
-			set_range_writeback(tree, cur, cur + iosize - 1);
-			if (!PageWriteback(page)) {
-				printk("warning page %lu not writeback, "
-				       "cur %llu end %llu\n", page->index,
-				       (unsigned long long)cur,
-				       (unsigned long long)end);
-			}
-
-			ret = submit_extent_page(WRITE, tree, page, sector,
-						 iosize, page_offset, bdev,
-						 &epd->bio, max_nr,
-						 end_bio_extent_writepage);
-			if (ret)
-				SetPageError(page);
-		}
-		cur = cur + iosize;
-		page_offset += iosize;
-		nr++;
-	}
-done:
-	if (nr == 0) {
-		/* make sure the mapping tag for page dirty gets cleared */
-		set_page_writeback(page);
-		end_page_writeback(page);
-	}
-	unlock_extent(tree, start, page_end, GFP_NOFS);
-	unlock_page(page);
-	return 0;
-}
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-
-/* Taken directly from 2.6.23 for 2.6.18 back port */
-typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
-                                void *data);
-
-/**
- * write_cache_pages - walk the list of dirty pages of the given address space
- * and write all of them.
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @writepage: function called for each page
- * @data: data passed to writepage function
- *
- * If a page is already under I/O, write_cache_pages() skips it, even
- * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
- * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
- * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them.  If wbc->sync_mode is
- * WB_SYNC_ALL then we were called for data integrity and we must wait for
- * existing IO to complete.
- */
-static int write_cache_pages(struct address_space *mapping,
-		      struct writeback_control *wbc, writepage_t writepage,
-		      void *data)
-{
-	struct backing_dev_info *bdi = mapping->backing_dev_info;
-	int ret = 0;
-	int done = 0;
-	struct pagevec pvec;
-	int nr_pages;
-	pgoff_t index;
-	pgoff_t end;		/* Inclusive */
-	int scanned = 0;
-	int range_whole = 0;
-
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
-	pagevec_init(&pvec, 0);
-	if (wbc->range_cyclic) {
-		index = mapping->writeback_index; /* Start from prev offset */
-		end = -1;
-	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-		scanned = 1;
-	}
-retry:
-	while (!done && (index <= end) &&
-	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
-		unsigned i;
-
-		scanned = 1;
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/*
-			 * At this point we hold neither mapping->tree_lock nor
-			 * lock on the page itself: the page may be truncated or
-			 * invalidated (changing page->mapping to NULL), or even
-			 * swizzled back from swapper_space to tmpfs file
-			 * mapping
-			 */
-			lock_page(page);
-
-			if (unlikely(page->mapping != mapping)) {
-				unlock_page(page);
-				continue;
-			}
-
-			if (!wbc->range_cyclic && page->index > end) {
-				done = 1;
-				unlock_page(page);
-				continue;
-			}
-
-			if (wbc->sync_mode != WB_SYNC_NONE)
-				wait_on_page_writeback(page);
-
-			if (PageWriteback(page) ||
-			    !clear_page_dirty_for_io(page)) {
-				unlock_page(page);
-				continue;
-			}
-
-			ret = (*writepage)(page, wbc, data);
-
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
-				unlock_page(page);
-				ret = 0;
-			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-	if (!scanned && !done) {
-		/*
-		 * We hit the last page and there is more work to be done: wrap
-		 * back to the start of the file
-		 */
-		scanned = 1;
-		index = 0;
-		goto retry;
-	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
-	return ret;
-}
-#endif
-
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
-			  struct writeback_control *wbc)
-{
-	int ret;
-	struct address_space *mapping = page->mapping;
-	struct extent_page_data epd = {
-		.bio = NULL,
-		.tree = tree,
-		.get_extent = get_extent,
-	};
-	struct writeback_control wbc_writepages = {
-		.bdi		= wbc->bdi,
-		.sync_mode	= WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.nr_to_write	= 64,
-		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end	= (loff_t)-1,
-	};
-
-
-	ret = __extent_writepage(page, wbc, &epd);
-
-	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
-	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(extent_write_full_page);
-
-
-int extent_writepages(struct extent_map_tree *tree,
-		      struct address_space *mapping,
-		      get_extent_t *get_extent,
-		      struct writeback_control *wbc)
-{
-	int ret = 0;
-	struct extent_page_data epd = {
-		.bio = NULL,
-		.tree = tree,
-		.get_extent = get_extent,
-	};
-
-	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
-	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(extent_writepages);
-
-int extent_readpages(struct extent_map_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent)
-{
-	struct bio *bio = NULL;
-	unsigned page_idx;
-	struct pagevec pvec;
-
-	pagevec_init(&pvec, 0);
-	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
-
-		prefetchw(&page->flags);
-		list_del(&page->lru);
-		/*
-		 * what we want to do here is call add_to_page_cache_lru,
-		 * but that isn't exported, so we reproduce it here
-		 */
-		if (!add_to_page_cache(page, mapping,
-					page->index, GFP_KERNEL)) {
-
-			/* open coding of lru_cache_add, also not exported */
-			page_cache_get(page);
-			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add(&pvec);
-			__extent_read_full_page(tree, page, get_extent, &bio);
-		}
-		page_cache_release(page);
-	}
-	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
-	BUG_ON(!list_empty(pages));
-	if (bio)
-		submit_one_bio(READ, bio);
-	return 0;
-}
-EXPORT_SYMBOL(extent_readpages);
-
-/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
- * records from the tree
- */
-int extent_invalidatepage(struct extent_map_tree *tree,
-			  struct page *page, unsigned long offset)
-{
-	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
-
-	start += (offset + blocksize -1) & ~(blocksize - 1);
-	if (start > end)
-		return 0;
-
-	lock_extent(tree, start, end, GFP_NOFS);
-	wait_on_extent_writeback(tree, start, end);
-	clear_extent_bit(tree, start, end,
-			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-			 1, 1, GFP_NOFS);
-	return 0;
-}
-EXPORT_SYMBOL(extent_invalidatepage);
-
-/*
- * simple commit_write call, set_range_dirty is used to mark both
- * the pages and the extent records as dirty
- */
-int extent_commit_write(struct extent_map_tree *tree,
-			struct inode *inode, struct page *page,
-			unsigned from, unsigned to)
-{
-	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
-	set_page_extent_mapped(page);
-	set_page_dirty(page);
-
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(extent_commit_write);
-
-int extent_prepare_write(struct extent_map_tree *tree,
-			 struct inode *inode, struct page *page,
-			 unsigned from, unsigned to, get_extent_t *get_extent)
-{
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	u64 block_start;
-	u64 orig_block_start;
-	u64 block_end;
-	u64 cur_end;
-	struct extent_map *em;
-	unsigned blocksize = 1 << inode->i_blkbits;
-	size_t page_offset = 0;
-	size_t block_off_start;
-	size_t block_off_end;
-	int err = 0;
-	int iocount = 0;
-	int ret = 0;
-	int isnew;
-
-	set_page_extent_mapped(page);
-
-	block_start = (page_start + from) & ~((u64)blocksize - 1);
-	block_end = (page_start + to - 1) | (blocksize - 1);
-	orig_block_start = block_start;
-
-	lock_extent(tree, page_start, page_end, GFP_NOFS);
-	while(block_start <= block_end) {
-		em = get_extent(inode, page, page_offset, block_start,
-				block_end, 1);
-		if (IS_ERR(em) || !em) {
-			goto err;
-		}
-		cur_end = min(block_end, em->end);
-		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
-		block_off_end = block_off_start + blocksize;
-		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
-
-		if (!PageUptodate(page) && isnew &&
-		    (block_off_end > to || block_off_start < from)) {
-			void *kaddr;
-
-			kaddr = kmap_atomic(page, KM_USER0);
-			if (block_off_end > to)
-				memset(kaddr + to, 0, block_off_end - to);
-			if (block_off_start < from)
-				memset(kaddr + block_off_start, 0,
-				       from - block_off_start);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr, KM_USER0);
-		}
-		if ((em->block_start != EXTENT_MAP_HOLE &&
-		     em->block_start != EXTENT_MAP_INLINE) &&
-		    !isnew && !PageUptodate(page) &&
-		    (block_off_end > to || block_off_start < from) &&
-		    !test_range_bit(tree, block_start, cur_end,
-				    EXTENT_UPTODATE, 1)) {
-			u64 sector;
-			u64 extent_offset = block_start - em->start;
-			size_t iosize;
-			sector = (em->block_start + extent_offset) >> 9;
-			iosize = (cur_end - block_start + blocksize) &
-				~((u64)blocksize - 1);
-			/*
-			 * we've already got the extent locked, but we
-			 * need to split the state such that our end_bio
-			 * handler can clear the lock.
-			 */
-			set_extent_bit(tree, block_start,
-				       block_start + iosize - 1,
-				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
-			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset, em->bdev,
-					 NULL, 1,
-					 end_bio_extent_preparewrite);
-			iocount++;
-			block_start = block_start + iosize;
-		} else {
-			set_extent_uptodate(tree, block_start, cur_end,
-					    GFP_NOFS);
-			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
-			block_start = cur_end + 1;
-		}
-		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
-		free_extent_map(em);
-	}
-	if (iocount) {
-		wait_extent_bit(tree, orig_block_start,
-				block_end, EXTENT_LOCKED);
-	}
-	check_page_uptodate(tree, page);
-err:
-	/* FIXME, zero out newly allocated blocks on error */
-	return err;
-}
-EXPORT_SYMBOL(extent_prepare_write);
-
-/*
- * a helper for releasepage.  As long as there are no locked extents
- * in the range corresponding to the page, both state records and extent
- * map records are removed
- */
-int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
-{
-	struct extent_map *em;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	u64 orig_start = start;
-	int ret = 1;
-
-	while (start <= end) {
-		em = lookup_extent_mapping(tree, start, end);
-		if (!em || IS_ERR(em))
-			break;
-		if (!test_range_bit(tree, em->start, em->end,
-				    EXTENT_LOCKED, 0)) {
-			remove_extent_mapping(tree, em);
-			/* once for the rb tree */
-			free_extent_map(em);
-		}
-		start = em->end + 1;
-		/* once for us */
-		free_extent_map(em);
-	}
-	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
-		ret = 0;
-	else
-		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-				 1, 1, GFP_NOFS);
-	return ret;
-}
-EXPORT_SYMBOL(try_release_extent_mapping);
-
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-		get_extent_t *get_extent)
-{
-	struct inode *inode = mapping->host;
-	u64 start = iblock << inode->i_blkbits;
-	u64 end = start + (1 << inode->i_blkbits) - 1;
-	sector_t sector = 0;
-	struct extent_map *em;
-
-	em = get_extent(inode, NULL, 0, start, end, 0);
-	if (!em || IS_ERR(em))
-		return 0;
-
-	if (em->block_start == EXTENT_MAP_INLINE ||
-	    em->block_start == EXTENT_MAP_HOLE)
-		goto out;
-
-	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-out:
-	free_extent_map(em);
-	return sector;
-}
-
-static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
-{
-	if (list_empty(&eb->lru)) {
-		extent_buffer_get(eb);
-		list_add(&eb->lru, &tree->buffer_lru);
-		tree->lru_size++;
-		if (tree->lru_size >= BUFFER_LRU_MAX) {
-			struct extent_buffer *rm;
-			rm = list_entry(tree->buffer_lru.prev,
-					struct extent_buffer, lru);
-			tree->lru_size--;
-			list_del_init(&rm->lru);
-			free_extent_buffer(rm);
-		}
-	} else
-		list_move(&eb->lru, &tree->buffer_lru);
-	return 0;
-}
-static struct extent_buffer *find_lru(struct extent_map_tree *tree,
-				      u64 start, unsigned long len)
-{
-	struct list_head *lru = &tree->buffer_lru;
-	struct list_head *cur = lru->next;
-	struct extent_buffer *eb;
-
-	if (list_empty(lru))
-		return NULL;
-
-	do {
-		eb = list_entry(cur, struct extent_buffer, lru);
-		if (eb->start == start && eb->len == len) {
-			extent_buffer_get(eb);
-			return eb;
-		}
-		cur = cur->next;
-	} while (cur != lru);
-	return NULL;
-}
-
-static inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
-}
-
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
-{
-	struct page *p;
-	struct address_space *mapping;
-
-	if (i == 0)
-		return eb->first_page;
-	i += eb->start >> PAGE_CACHE_SHIFT;
-	mapping = eb->first_page->mapping;
-	read_lock_irq(&mapping->tree_lock);
-	p = radix_tree_lookup(&mapping->page_tree, i);
-	read_unlock_irq(&mapping->tree_lock);
-	return p;
-}
-
-static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
-						   u64 start,
-						   unsigned long len,
-						   gfp_t mask)
-{
-	struct extent_buffer *eb = NULL;
-
-	spin_lock(&tree->lru_lock);
-	eb = find_lru(tree, start, len);
-	spin_unlock(&tree->lru_lock);
-	if (eb) {
-		return eb;
-	}
-
-	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
-	INIT_LIST_HEAD(&eb->lru);
-	eb->start = start;
-	eb->len = len;
-	atomic_set(&eb->refs, 1);
-
-	return eb;
-}
-
-static void __free_extent_buffer(struct extent_buffer *eb)
-{
-	kmem_cache_free(extent_buffer_cache, eb);
-}
-
-struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0,
-					  gfp_t mask)
-{
-	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	struct extent_buffer *eb;
-	struct page *p;
-	struct address_space *mapping = tree->mapping;
-	int uptodate = 1;
-
-	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
-		return NULL;
-
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
-	if (page0) {
-		eb->first_page = page0;
-		i = 1;
-		index++;
-		page_cache_get(page0);
-		mark_page_accessed(page0);
-		set_page_extent_mapped(page0);
-		WARN_ON(!PageUptodate(page0));
-		set_page_extent_head(page0, len);
-	} else {
-		i = 0;
-	}
-	for (; i < num_pages; i++, index++) {
-		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
-		if (!p) {
-			WARN_ON(1);
-			goto fail;
-		}
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
-		if (!PageUptodate(p))
-			uptodate = 0;
-		unlock_page(p);
-	}
-	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
-
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
-	return eb;
-
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
-	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
-		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-	return NULL;
-}
-EXPORT_SYMBOL(alloc_extent_buffer);
-
-struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
-					 u64 start, unsigned long len,
-					  gfp_t mask)
-{
-	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	struct extent_buffer *eb;
-	struct page *p;
-	struct address_space *mapping = tree->mapping;
-	int uptodate = 1;
-
-	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
-		return NULL;
-
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_lock_page(mapping, index);
-		if (!p) {
-			goto fail;
-		}
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
-
-		if (!PageUptodate(p))
-			uptodate = 0;
-		unlock_page(p);
-	}
-	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
-
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
-	return eb;
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
-	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
-		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-	return NULL;
-}
-EXPORT_SYMBOL(find_extent_buffer);
-
-void free_extent_buffer(struct extent_buffer *eb)
-{
-	unsigned long i;
-	unsigned long num_pages;
-
-	if (!eb)
-		return;
-
-	if (!atomic_dec_and_test(&eb->refs))
-		return;
-
-	WARN_ON(!list_empty(&eb->lru));
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	for (i = 1; i < num_pages; i++) {
-		page_cache_release(extent_buffer_page(eb, i));
-	}
-	page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-}
-EXPORT_SYMBOL(free_extent_buffer);
-
-int clear_extent_buffer_dirty(struct extent_map_tree *tree,
-			      struct extent_buffer *eb)
-{
-	int set;
-	unsigned long i;
-	unsigned long num_pages;
-	struct page *page;
-
-	u64 start = eb->start;
-	u64 end = start + eb->len - 1;
-
-	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		lock_page(page);
-		if (i == 0)
-			set_page_extent_head(page, eb->len);
-		else
-			set_page_private(page, EXTENT_PAGE_PRIVATE);
-
-		/*
-		 * if we're on the last page or the first page and the
-		 * block isn't aligned on a page boundary, do extra checks
-		 * to make sure we don't clean page that is partially dirty
-		 */
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			start = (u64)page->index << PAGE_CACHE_SHIFT;
-			end  = start + PAGE_CACHE_SIZE - 1;
-			if (test_range_bit(tree, start, end,
-					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
-				continue;
-			}
-		}
-		clear_page_dirty_for_io(page);
-		write_lock_irq(&page->mapping->tree_lock);
-		if (!PageDirty(page)) {
-			radix_tree_tag_clear(&page->mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
-		}
-		write_unlock_irq(&page->mapping->tree_lock);
-		unlock_page(page);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(clear_extent_buffer_dirty);
-
-int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
-				    struct extent_buffer *eb)
-{
-	return wait_on_extent_writeback(tree, eb->start,
-					eb->start + eb->len - 1);
-}
-EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
-
-int set_extent_buffer_dirty(struct extent_map_tree *tree,
-			     struct extent_buffer *eb)
-{
-	unsigned long i;
-	unsigned long num_pages;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		/* writepage may need to do something special for the
-		 * first page, we have to make sure page->private is
-		 * properly set.  releasepage may drop page->private
-		 * on us if the page isn't already dirty.
-		 */
-		if (i == 0) {
-			lock_page(page);
-			set_page_extent_head(page, eb->len);
-		} else if (PagePrivate(page) &&
-			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
-			set_page_extent_mapped(page);
-			unlock_page(page);
-		}
-		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
-	}
-	return set_extent_dirty(tree, eb->start,
-				eb->start + eb->len - 1, GFP_NOFS);
-}
-EXPORT_SYMBOL(set_extent_buffer_dirty);
-
-int set_extent_buffer_uptodate(struct extent_map_tree *tree,
-				struct extent_buffer *eb)
-{
-	unsigned long i;
-	struct page *page;
-	unsigned long num_pages;
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			    GFP_NOFS);
-	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			check_page_uptodate(tree, page);
-			continue;
-		}
-		SetPageUptodate(page);
-	}
-	return 0;
-}
-EXPORT_SYMBOL(set_extent_buffer_uptodate);
-
-int extent_buffer_uptodate(struct extent_map_tree *tree,
-			     struct extent_buffer *eb)
-{
-	if (eb->flags & EXTENT_UPTODATE)
-		return 1;
-	return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1);
-}
-EXPORT_SYMBOL(extent_buffer_uptodate);
-
-int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb,
-			     u64 start,
-			     int wait)
-{
-	unsigned long i;
-	unsigned long start_i;
-	struct page *page;
-	int err;
-	int ret = 0;
-	unsigned long num_pages;
-
-	if (eb->flags & EXTENT_UPTODATE)
-		return 0;
-
-	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			   EXTENT_UPTODATE, 1)) {
-		return 0;
-	}
-
-	if (start) {
-		WARN_ON(start < eb->start);
-		start_i = (start >> PAGE_CACHE_SHIFT) -
-			(eb->start >> PAGE_CACHE_SHIFT);
-	} else {
-		start_i = 0;
-	}
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		if (PageUptodate(page)) {
-			continue;
-		}
-		if (!wait) {
-			if (TestSetPageLocked(page)) {
-				continue;
-			}
-		} else {
-			lock_page(page);
-		}
-		if (!PageUptodate(page)) {
-			err = page->mapping->a_ops->readpage(NULL, page);
-			if (err) {
-				ret = err;
-			}
-		} else {
-			unlock_page(page);
-		}
-	}
-
-	if (ret || !wait) {
-		return ret;
-	}
-
-	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		wait_on_page_locked(page);
-		if (!PageUptodate(page)) {
-			ret = -EIO;
-		}
-	}
-	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
-	return ret;
-}
-EXPORT_SYMBOL(read_extent_buffer_pages);
-
-void read_extent_buffer(struct extent_buffer *eb, void *dstv,
-			unsigned long start,
-			unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *dst = (char *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		if (!PageUptodate(page)) {
-			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
-			WARN_ON(1);
-		}
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
-		kaddr = kmap_atomic(page, KM_USER1);
-		memcpy(dst, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER1);
-
-		dst += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(read_extent_buffer);
-
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
-			       unsigned long min_len, char **token, char **map,
-			       unsigned long *map_start,
-			       unsigned long *map_len, int km)
-{
-	size_t offset = start & (PAGE_CACHE_SIZE - 1);
-	char *kaddr;
-	struct page *p;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long end_i = (start_offset + start + min_len - 1) >>
-		PAGE_CACHE_SHIFT;
-
-	if (i != end_i)
-		return -EINVAL;
-
-	if (i == 0) {
-		offset = start_offset;
-		*map_start = 0;
-	} else {
-		offset = 0;
-		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
-	}
-	if (start + min_len > eb->len) {
-printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
-		WARN_ON(1);
-	}
-
-	p = extent_buffer_page(eb, i);
-	WARN_ON(!PageUptodate(p));
-	kaddr = kmap_atomic(p, km);
-	*token = kaddr;
-	*map = kaddr + offset;
-	*map_len = PAGE_CACHE_SIZE - offset;
-	return 0;
-}
-EXPORT_SYMBOL(map_private_extent_buffer);
-
-int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
-		      unsigned long min_len,
-		      char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km)
-{
-	int err;
-	int save = 0;
-	if (eb->map_token) {
-		unmap_extent_buffer(eb, eb->map_token, km);
-		eb->map_token = NULL;
-		save = 1;
-	}
-	err = map_private_extent_buffer(eb, start, min_len, token, map,
-				       map_start, map_len, km);
-	if (!err && save) {
-		eb->map_token = *token;
-		eb->kaddr = *map;
-		eb->map_start = *map_start;
-		eb->map_len = *map_len;
-	}
-	return err;
-}
-EXPORT_SYMBOL(map_extent_buffer);
-
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
-{
-	kunmap_atomic(token, km);
-}
-EXPORT_SYMBOL(unmap_extent_buffer);
-
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *ptr = (char *)ptrv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	int ret = 0;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
-
-		kaddr = kmap_atomic(page, KM_USER0);
-		ret = memcmp(ptr, kaddr + offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-		if (ret)
-			break;
-
-		ptr += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(memcmp_extent_buffer);
-
-void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
-			 unsigned long start, unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	char *src = (char *)srcv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER1);
-		memcpy(kaddr + offset, src, cur);
-		kunmap_atomic(kaddr, KM_USER1);
-
-		src += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(write_extent_buffer);
-
-void memset_extent_buffer(struct extent_buffer *eb, char c,
-			  unsigned long start, unsigned long len)
-{
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(start > eb->len);
-	WARN_ON(start + len > eb->start + eb->len);
-
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, PAGE_CACHE_SIZE - offset);
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr + offset, c, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(memset_extent_buffer);
-
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
-			unsigned long dst_offset, unsigned long src_offset,
-			unsigned long len)
-{
-	u64 dst_len = dst->len;
-	size_t cur;
-	size_t offset;
-	struct page *page;
-	char *kaddr;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
-
-	WARN_ON(src->len != dst_len);
-
-	offset = (start_offset + dst_offset) &
-		((unsigned long)PAGE_CACHE_SIZE - 1);
-
-	while(len > 0) {
-		page = extent_buffer_page(dst, i);
-		WARN_ON(!PageUptodate(page));
-
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
-
-		kaddr = kmap_atomic(page, KM_USER0);
-		read_extent_buffer(src, kaddr + offset, src_offset, cur);
-		kunmap_atomic(kaddr, KM_USER0);
-
-		src_offset += cur;
-		len -= cur;
-		offset = 0;
-		i++;
-	}
-}
-EXPORT_SYMBOL(copy_extent_buffer);
-
-static void move_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
-	if (dst_page == src_page) {
-		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
-	} else {
-		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
-		char *p = dst_kaddr + dst_off + len;
-		char *s = src_kaddr + src_off + len;
-
-		while (len--)
-			*--p = *--s;
-
-		kunmap_atomic(src_kaddr, KM_USER1);
-	}
-	kunmap_atomic(dst_kaddr, KM_USER0);
-}
-
-static void copy_pages(struct page *dst_page, struct page *src_page,
-		       unsigned long dst_off, unsigned long src_off,
-		       unsigned long len)
-{
-	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
-	char *src_kaddr;
-
-	if (dst_page != src_page)
-		src_kaddr = kmap_atomic(src_page, KM_USER1);
-	else
-		src_kaddr = dst_kaddr;
-
-	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
-	kunmap_atomic(dst_kaddr, KM_USER0);
-	if (dst_page != src_page)
-		kunmap_atomic(src_kaddr, KM_USER1);
-}
-
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len)
-{
-	size_t cur;
-	size_t dst_off_in_page;
-	size_t src_off_in_page;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long dst_i;
-	unsigned long src_i;
-
-	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
-		BUG_ON(1);
-	}
-
-	while(len > 0) {
-		dst_off_in_page = (start_offset + dst_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = (start_offset + src_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-
-		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
-
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
-					       src_off_in_page));
-		cur = min_t(unsigned long, cur,
-			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
-
-		copy_pages(extent_buffer_page(dst, dst_i),
-			   extent_buffer_page(dst, src_i),
-			   dst_off_in_page, src_off_in_page, cur);
-
-		src_offset += cur;
-		dst_offset += cur;
-		len -= cur;
-	}
-}
-EXPORT_SYMBOL(memcpy_extent_buffer);
-
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len)
-{
-	size_t cur;
-	size_t dst_off_in_page;
-	size_t src_off_in_page;
-	unsigned long dst_end = dst_offset + len - 1;
-	unsigned long src_end = src_offset + len - 1;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long dst_i;
-	unsigned long src_i;
-
-	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
-		BUG_ON(1);
-	}
-	if (dst_offset < src_offset) {
-		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
-		return;
-	}
-	while(len > 0) {
-		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
-
-		dst_off_in_page = (start_offset + dst_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-		src_off_in_page = (start_offset + src_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
-
-		cur = min_t(unsigned long, len, src_off_in_page + 1);
-		cur = min(cur, dst_off_in_page + 1);
-		move_pages(extent_buffer_page(dst, dst_i),
-			   extent_buffer_page(dst, src_i),
-			   dst_off_in_page - cur + 1,
-			   src_off_in_page - cur + 1, cur);
-
-		dst_end -= cur;
-		src_end -= cur;
-		len -= cur;
-	}
-}
-EXPORT_SYMBOL(memmove_extent_buffer);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ea60f5447b5..56314217cfc 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,215 +3,53 @@
 
 #include <linux/rbtree.h>
 
+#define EXTENT_MAP_LAST_BYTE (u64)-4
 #define EXTENT_MAP_HOLE (u64)-3
 #define EXTENT_MAP_INLINE (u64)-2
 #define EXTENT_MAP_DELALLOC (u64)-1
 
-/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_DEFRAG_DONE (1 << 7)
-#define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_CSUM (1 << 9)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-
-/*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
- */
-#define EXTENT_PAGE_PRIVATE 1
-#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
-
-
-struct extent_map_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
-	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
-};
-
-struct extent_map_tree {
-	struct rb_root map;
-	struct rb_root state;
-	struct address_space *mapping;
-	u64 dirty_bytes;
-	rwlock_t lock;
-	struct extent_map_ops *ops;
-	spinlock_t lru_lock;
-	struct list_head buffer_lru;
-	int lru_size;
-};
-
-/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
 struct extent_map {
-	u64 start;
-	u64 end; /* inclusive */
-	int in_tree;
 	struct rb_node rb_node;
-	/* block_start and block_end are in bytes */
+
+	/* all of these are in bytes */
+	u64 start;
+	u64 len;
 	u64 block_start;
-	u64 block_end; /* inclusive */
+	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-};
-
-/* note, this must start with the same fields as fs/extent_map.c:tree_entry */
-struct extent_state {
-	u64 start;
-	u64 end; /* inclusive */
 	int in_tree;
-	struct rb_node rb_node;
-	wait_queue_head_t wq;
-	atomic_t refs;
-	unsigned long state;
-
-	/* for use by the FS */
-	u64 private;
-
-	struct list_head list;
 };
 
-struct extent_buffer {
-	u64 start;
-	unsigned long len;
-	char *map_token;
-	char *kaddr;
-	unsigned long map_start;
-	unsigned long map_len;
-	struct page *first_page;
-	struct list_head lru;
-	atomic_t refs;
-	int flags;
+struct extent_map_tree {
+	struct rb_root map;
+	struct extent_map *last;
+	spinlock_t lock;
 };
 
-typedef struct extent_map *(get_extent_t)(struct inode *inode,
-					  struct page *page,
-					  size_t page_offset,
-					  u64 start, u64 end,
-					  int create);
+static inline u64 extent_map_end(struct extent_map *em)
+{
+	u64 end = em->start + em->len;
+
+	return end < em->start ? (u64)-1 : end;	/* clamp on wrap */
+}
+
+static inline u64 extent_map_block_end(struct extent_map *em)
+{
+	u64 end = em->block_start + em->len;
+
+	return end < em->block_start ? (u64)-1 : end;	/* clamp on wrap */
+}
 
-void extent_map_tree_init(struct extent_map_tree *tree,
-			  struct address_space *mapping, gfp_t mask);
-void extent_map_tree_empty_lru(struct extent_map_tree *tree);
+void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 end);
+					 u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
-int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page);
-int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
-int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask);
+
 struct extent_map *alloc_extent_map(gfp_t mask);
 void free_extent_map(struct extent_map *em);
-int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-
-u64 count_range_bits(struct extent_map_tree *tree,
-		     u64 *start, u64 search_end,
-		     u64 max_bytes, unsigned long bits);
-
-int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
-		   int bits, int filled);
-int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask);
-int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask);
-int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
-			gfp_t mask);
-int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
-		   gfp_t mask);
-int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
-int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
-		       gfp_t mask);
-int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
-int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
-int extent_invalidatepage(struct extent_map_tree *tree,
-			  struct page *page, unsigned long offset);
-int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
-			  get_extent_t *get_extent,
-			  struct writeback_control *wbc);
-int extent_writepages(struct extent_map_tree *tree,
-		      struct address_space *mapping,
-		      get_extent_t *get_extent,
-		      struct writeback_control *wbc);
-int extent_readpages(struct extent_map_tree *tree,
-		     struct address_space *mapping,
-		     struct list_head *pages, unsigned nr_pages,
-		     get_extent_t get_extent);
-int extent_prepare_write(struct extent_map_tree *tree,
-			 struct inode *inode, struct page *page,
-			 unsigned from, unsigned to, get_extent_t *get_extent);
-int extent_commit_write(struct extent_map_tree *tree,
-			struct inode *inode, struct page *page,
-			unsigned from, unsigned to);
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
-		get_extent_t *get_extent);
-int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end);
-int set_state_private(struct extent_map_tree *tree, u64 start, u64 private);
-int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private);
-void set_page_extent_mapped(struct page *page);
-
-struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0,
-					  gfp_t mask);
-struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
-					 u64 start, unsigned long len,
-					  gfp_t mask);
-void free_extent_buffer(struct extent_buffer *eb);
-int read_extent_buffer_pages(struct extent_map_tree *tree,
-			     struct extent_buffer *eb, u64 start, int wait);
-
-static inline void extent_buffer_get(struct extent_buffer *eb)
-{
-	atomic_inc(&eb->refs);
-}
-
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
-			  unsigned long start,
-			  unsigned long len);
-void read_extent_buffer(struct extent_buffer *eb, void *dst,
-			unsigned long start,
-			unsigned long len);
-void write_extent_buffer(struct extent_buffer *eb, const void *src,
-			 unsigned long start, unsigned long len);
-void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
-			unsigned long dst_offset, unsigned long src_offset,
-			unsigned long len);
-void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len);
-void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
-			   unsigned long src_offset, unsigned long len);
-void memset_extent_buffer(struct extent_buffer *eb, char c,
-			  unsigned long start, unsigned long len);
-int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
-				    struct extent_buffer *eb);
-int clear_extent_buffer_dirty(struct extent_map_tree *tree,
-			      struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_map_tree *tree,
-			     struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_map_tree *tree,
-			       struct extent_buffer *eb);
-int extent_buffer_uptodate(struct extent_map_tree *tree,
-			   struct extent_buffer *eb);
-int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
-		      unsigned long min_len, char **token, char **map,
-		      unsigned long *map_start,
-		      unsigned long *map_len, int km);
-void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1cd8c908811..c5bb00f9239 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -233,8 +233,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	int err = 0;
 	int i;
 	struct inode *inode = fdentry(file)->d_inode;
-	struct extent_map *em;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	u64 hint_byte;
 	u64 num_bytes;
 	u64 start_pos;
@@ -242,11 +241,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 end_pos = pos + write_bytes;
 	u64 inline_size;
 	loff_t isize = i_size_read(inode);
-	em = alloc_extent_map(GFP_NOFS);
-	if (!em)
-		return -ENOMEM;
-
-	em->bdev = inode->i_sb->s_bdev;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
 	num_bytes = (write_bytes + pos - start_pos +
@@ -254,7 +248,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	end_of_last_block = start_pos + num_bytes - 1;
 
-	lock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
@@ -268,7 +262,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	if ((end_of_last_block & 4095) == 0) {
 		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
 	}
-	set_extent_uptodate(em_tree, start_pos, end_of_last_block, GFP_NOFS);
+	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* FIXME...EIEIO, ENOSPC and more */
 
@@ -293,6 +287,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       inode->i_ino,
 						       last_pos_in_file,
 						       0, 0, hole_size);
+			btrfs_drop_extent_cache(inode, last_pos_in_file,
+					last_pos_in_file + hole_size -1);
 			btrfs_check_file(root, inode);
 		}
 		if (err)
@@ -320,12 +316,12 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		last_end += PAGE_CACHE_SIZE - 1;
 		if (start_pos < isize) {
 			u64 delalloc_start = start_pos;
-			existing_delalloc = count_range_bits(em_tree,
+			existing_delalloc = count_range_bits(io_tree,
 					     &delalloc_start,
 					     end_of_last_block, (u64)-1,
 					     EXTENT_DELALLOC);
 		}
-		set_extent_delalloc(em_tree, start_pos, end_of_last_block,
+		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
 		spin_lock(&root->fs_info->delalloc_lock);
 		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
@@ -346,6 +342,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		inline_size -= start_pos;
 		err = insert_inline_extent(trans, root, inode, start_pos,
 					   inline_size, pages, 0, num_pages);
+		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
 	}
 	if (end_pos > isize) {
@@ -356,8 +353,7 @@ failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	unlock_extent(em_tree, start_pos, end_of_last_block, GFP_NOFS);
-	free_extent_map(em);
+	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
 
@@ -367,10 +363,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 
 	while(1) {
+		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, end);
-		if (!em)
+		if (!em) {
+			spin_unlock(&em_tree->lock);
 			break;
+		}
 		remove_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+
 		/* once for us */
 		free_extent_map(em);
 		/* once for the tree*/
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67005480e13..16d3aef45d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -53,7 +53,7 @@ static struct inode_operations btrfs_file_inode_operations;
 static struct address_space_operations btrfs_aops;
 static struct address_space_operations btrfs_symlink_aops;
 static struct file_operations btrfs_dir_file_operations;
-static struct extent_map_ops btrfs_extent_map_ops;
+static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
@@ -104,6 +104,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
+	u64 orig_start = start;
+	u64 orig_num_bytes;
 	struct btrfs_key ins;
 	int ret;
 
@@ -115,6 +117,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	num_bytes = max(blocksize,  num_bytes);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 start, start + num_bytes, start, &alloc_hint);
+	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
@@ -138,6 +141,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+	btrfs_drop_extent_cache(inode, orig_start,
+				orig_start + orig_num_bytes - 1);
 	btrfs_add_ordered_inode(inode);
 out:
 	btrfs_end_transaction(trans, root);
@@ -297,7 +302,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	int ret = 0;
 	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
 	u32 csum;
@@ -317,7 +322,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
 			   BTRFS_CRC32_SIZE);
-	set_state_private(em_tree, start, csum);
+	set_state_private(io_tree, start, csum);
 out:
 	if (path)
 		btrfs_free_path(path);
@@ -329,17 +334,19 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
 	u64 private;
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
 	unsigned long flags;
+
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-	ret = get_state_private(em_tree, start, &private);
+
+	ret = get_state_private(io_tree, start, &private);
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
 	if (ret) {
@@ -428,7 +435,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		break;
@@ -873,7 +880,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 			      size_t zero_start)
 {
 	char *kaddr;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -884,12 +891,12 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 	WARN_ON(!PageLocked(page));
 	set_page_extent_mapped(page);
 
-	lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 	delalloc_start = page_start;
-	existing_delalloc = count_range_bits(&BTRFS_I(inode)->extent_tree,
+	existing_delalloc = count_range_bits(&BTRFS_I(inode)->io_tree,
 					     &delalloc_start, page_end,
 					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-	set_extent_delalloc(&BTRFS_I(inode)->extent_tree, page_start,
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
 			    page_end, GFP_NOFS);
 
 	spin_lock(&root->fs_info->delalloc_lock);
@@ -903,7 +910,7 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 		kunmap(page);
 	}
 	set_page_dirty(page);
-	unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	return ret;
 }
@@ -961,7 +968,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
 		struct btrfs_trans_handle *trans;
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 		u64 mask = root->sectorsize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
@@ -986,7 +993,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(em_tree, pos, block_end, GFP_NOFS);
+		lock_extent(io_tree, pos, block_end, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -1001,11 +1008,13 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						       inode->i_ino,
 						       hole_start, 0, 0,
 						       hole_size);
+			btrfs_drop_extent_cache(inode, hole_start,
+				hole_start + hole_size - 1); /* inclusive end */
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		unlock_extent(em_tree, pos, block_end, GFP_NOFS);
+		unlock_extent(io_tree, pos, block_end, GFP_NOFS);
 		if (err)
 			return err;
 	}
@@ -1189,7 +1198,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
 	return 0;
 }
@@ -1485,7 +1495,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
 	BTRFS_I(inode)->root = root;
 
@@ -1672,9 +1683,10 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -1816,7 +1828,7 @@ out_unlock:
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 end,
+				    size_t page_offset, u64 start, u64 len,
 				    int create)
 {
 	int ret;
@@ -1826,7 +1838,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	u32 found_type;
-	int failed_insert = 0;
 	struct btrfs_path *path;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *item;
@@ -1834,6 +1845,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
 
 	path = btrfs_alloc_path();
@@ -1841,24 +1853,26 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	mutex_lock(&root->fs_info->fs_mutex);
 
 again:
-	em = lookup_extent_mapping(em_tree, start, end);
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	spin_unlock(&em_tree->lock);
+
 	if (em) {
 		if (em->start > start) {
-			printk("get_extent start %Lu em start %Lu\n",
-			       start, em->start);
+			printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n",
+			       start, len, em->start, em->len);
 			WARN_ON(1);
 		}
 		goto out;
 	}
+	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
-		em = alloc_extent_map(GFP_NOFS);
-		if (!em) {
-			err = -ENOMEM;
-			goto out;
-		}
-		em->start = EXTENT_MAP_HOLE;
-		em->end = EXTENT_MAP_HOLE;
+		err = -ENOMEM;
+		goto out;
 	}
+
+	em->start = EXTENT_MAP_HOLE;
+	em->len = (u64)-1;
 	em->bdev = inode->i_sb->s_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
@@ -1893,28 +1907,25 @@ again:
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
-				if (end < extent_start)
+				if (start + len <= extent_start)
 					goto not_found;
-				em->end = extent_end - 1;
+				em->len = extent_end - extent_start;
 			} else {
-				em->end = end;
+				em->len = len;
 			}
 			goto not_found_em;
 		}
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->start = extent_start;
-			em->end = extent_end - 1;
+			em->len = extent_end - extent_start;
 			em->block_start = EXTENT_MAP_HOLE;
-			em->block_end = EXTENT_MAP_HOLE;
 			goto insert;
 		}
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		em->block_start = bytenr;
-		em->block_end = em->block_start +
-			btrfs_file_extent_num_bytes(leaf, item) - 1;
 		em->start = extent_start;
-		em->end = extent_end - 1;
+		em->len = extent_end - extent_start;
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		unsigned long ptr;
@@ -1925,25 +1936,24 @@ again:
 
 		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
 						    path->slots[0]));
-		extent_end = (extent_start + size - 1) |
-			((u64)root->sectorsize - 1);
+		extent_end = (extent_start + size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
 			em->start = start;
 			if (start < extent_start) {
-				if (end < extent_start)
+				if (start + len <= extent_start)
 					goto not_found;
-				em->end = extent_end;
+				em->len = extent_end - extent_start;
 			} else {
-				em->end = end;
+				em->len = len;
 			}
 			goto not_found_em;
 		}
 		em->block_start = EXTENT_MAP_INLINE;
-		em->block_end = EXTENT_MAP_INLINE;
 
 		if (!page) {
 			em->start = extent_start;
-			em->end = extent_start + size - 1;
+			em->len = size;
 			goto out;
 		}
 
@@ -1952,8 +1962,7 @@ again:
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->end = (em->start + copy_size -1) |
-			((u64)root->sectorsize -1);
+		em->len = copy_size;
 		map = kmap(page);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
@@ -1974,7 +1983,8 @@ again:
 			btrfs_mark_buffer_dirty(leaf);
 		}
 		kunmap(page);
-		set_extent_uptodate(em_tree, em->start, em->end, GFP_NOFS);
+		set_extent_uptodate(io_tree, em->start,
+				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
 	} else {
 		printk("unkknown found_type %d\n", found_type);
@@ -1982,33 +1992,29 @@ again:
 	}
 not_found:
 	em->start = start;
-	em->end = end;
+	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
-	em->block_end = EXTENT_MAP_HOLE;
 insert:
 	btrfs_release_path(root, path);
-	if (em->start > start || em->end < start) {
-		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->end, start, end);
+	if (em->start > start || extent_map_end(em) <= start) {
+		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
 		err = -EIO;
 		goto out;
 	}
+
+	err = 0;
+	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
 		free_extent_map(em);
-		em = NULL;
-		if (0 && failed_insert == 1) {
-			btrfs_drop_extent_cache(inode, start, end);
-		}
-		failed_insert++;
-		if (failed_insert > 5) {
-			printk("failing to insert %Lu %Lu\n", start, end);
+		em = lookup_extent_mapping(em_tree, start, len);
+		if (!em) {
 			err = -EIO;
-			goto out;
+			printk("failing to insert %Lu %Lu\n", start, len);
 		}
-		goto again;
 	}
-	err = 0;
+	spin_unlock(&em_tree->lock);
 out:
 	btrfs_free_path(path);
 	if (trans) {
@@ -2032,14 +2038,14 @@ static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 
 int btrfs_readpage(struct file *file, struct page *page)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_read_full_page(tree, page, btrfs_get_extent);
 }
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 
 
 	if (current->flags & PF_MEMALLOC) {
@@ -2047,15 +2053,15 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		return 0;
 	}
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
 static int btrfs_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 }
 
@@ -2063,19 +2069,21 @@ static int
 btrfs_readpages(struct file *file, struct address_space *mapping,
 		struct list_head *pages, unsigned nr_pages)
 {
-	struct extent_map_tree *tree;
-	tree = &BTRFS_I(mapping->host)->extent_tree;
+	struct extent_io_tree *tree;
+	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
 
 static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
+	struct extent_map_tree *map;
 	int ret;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(tree, page);
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	map = &BTRFS_I(page->mapping->host)->extent_tree;
+	ret = try_release_extent_mapping(map, tree, page);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
@@ -2086,9 +2094,9 @@ static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
-	struct extent_map_tree *tree;
+	struct extent_io_tree *tree;
 
-	tree = &BTRFS_I(page->mapping->host)->extent_tree;
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btrfs_releasepage(page, GFP_NOFS);
 }
@@ -2374,7 +2382,7 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 int btrfs_defrag_file(struct file *file) {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct page *page;
 	unsigned long last_index;
 	unsigned long ra_index = 0;
@@ -2414,13 +2422,13 @@ int btrfs_defrag_file(struct file *file) {
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-		lock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		delalloc_start = page_start;
 		existing_delalloc =
-			count_range_bits(&BTRFS_I(inode)->extent_tree,
+			count_range_bits(&BTRFS_I(inode)->io_tree,
 					 &delalloc_start, page_end,
 					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-		set_extent_delalloc(em_tree, page_start,
+		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		spin_lock(&root->fs_info->delalloc_lock);
@@ -2428,7 +2436,7 @@ int btrfs_defrag_file(struct file *file) {
 						 existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
 
-		unlock_extent(em_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
@@ -2842,9 +2850,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &btrfs_aops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree,
+		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
-		BTRFS_I(inode)->extent_tree.ops = &btrfs_extent_map_ops;
+		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2934,7 +2943,7 @@ static struct file_operations btrfs_dir_file_operations = {
 #endif
 };
 
-static struct extent_map_ops btrfs_extent_map_ops = {
+static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.writepage_io_hook = btrfs_writepage_io_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8b52c69fda2..f8a1016600b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -468,10 +468,15 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_init_cachep();
 	if (err)
 		goto free_transaction_sys;
-	err = extent_map_init();
+
+	err = extent_io_init();
 	if (err)
 		goto free_cachep;
 
+	err = extent_map_init();
+	if (err)
+		goto free_extent_io;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto free_extent_map;
@@ -479,6 +484,8 @@ static int __init init_btrfs_fs(void)
 
 free_extent_map:
 	extent_map_exit();
+free_extent_io:
+	extent_io_exit();
 free_cachep:
 	btrfs_destroy_cachep();
 free_transaction_sys:
@@ -492,6 +499,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
 	extent_map_exit();
+	extent_io_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 163c01a2449..b4a1bc62a78 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -70,7 +70,7 @@ static int join_transaction(struct btrfs_root *root)
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
 		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
-		extent_map_tree_init(&cur_trans->dirty_pages,
+		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
 	} else {
@@ -153,7 +153,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	int ret;
 	int err;
 	int werr = 0;
-	struct extent_map_tree *dirty_pages;
+	struct extent_io_tree *dirty_pages;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 start;
@@ -610,7 +610,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
 	struct list_head dirty_fs_roots;
-	struct extent_map_tree *pinned_copy;
+	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
 
@@ -639,7 +639,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	if (!pinned_copy)
 		return -ENOMEM;
 
-	extent_map_tree_init(pinned_copy,
+	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	trans->transaction->in_commit = 1;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index fd52e9b2392..c3172ddb332 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -29,7 +29,7 @@ struct btrfs_transaction {
 	int use_count;
 	int commit_done;
 	struct list_head list;
-	struct extent_map_tree dirty_pages;
+	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
 	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
-- 
cgit v1.2.3


From bea495e5b4d9746f26cf5b4100d24c77c3dcbd84 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jan 2008 16:13:14 -0500
Subject: Btrfs: Tune readahead during defrag to avoid reading too much at once

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1cf125ab782..c4fe37791d3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1856,27 +1856,44 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 }
 
 static void noinline reada_walk_down(struct btrfs_root *root,
-				     struct extent_buffer *node)
+				     struct extent_buffer *node,
+				     int slot)
 {
-	int i;
-	u32 nritems;
 	u64 bytenr;
-	int ret;
+	u64 last = 0;
+	u32 nritems;
 	u32 refs;
-	int level;
 	u32 blocksize;
+	int ret;
+	int i;
+	int level;
+	int skipped = 0;
 
 	nritems = btrfs_header_nritems(node);
 	level = btrfs_header_level(node);
-	for (i = 0; i < nritems; i++) {
+	if (level)
+		return;
+
+	for (i = slot; i < nritems && skipped < 32; i++) {
 		bytenr = btrfs_node_blockptr(node, i);
-		blocksize = btrfs_level_size(root, level - 1);
-		ret = lookup_extent_ref(NULL, root, bytenr, blocksize, &refs);
-		BUG_ON(ret);
-		if (refs != 1)
+		if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
+			     (last > bytenr && last - bytenr > 32 * 1024))) {
+			skipped++;
 			continue;
+		}
+		blocksize = btrfs_level_size(root, level - 1);
+		if (i != slot) {
+			ret = lookup_extent_ref(NULL, root, bytenr,
+						blocksize, &refs);
+			BUG_ON(ret);
+			if (refs != 1) {
+				skipped++;
+				continue;
+			}
+		}
 		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = readahead_tree_block(root, bytenr, blocksize);
+		last = bytenr + blocksize;
 		cond_resched();
 		mutex_lock(&root->fs_info->fs_mutex);
 		if (ret)
@@ -1919,9 +1936,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
 
-		if (*level > 0 && path->slots[*level] == 0)
-			reada_walk_down(root, cur);
-
 		if (btrfs_header_level(cur) != *level)
 			WARN_ON(1);
 
@@ -1951,6 +1965,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next)) {
 			free_extent_buffer(next);
+			reada_walk_down(root, cur, path->slots[*level]);
 			mutex_unlock(&root->fs_info->fs_mutex);
 			next = read_tree_block(root, bytenr, blocksize);
 			mutex_lock(&root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From c2a8b6e11009398ca9363d8ba8d4e7e40fb897fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 09:10:26 -0500
Subject: Btrfs: Force f_pos to the max when a readdir hits the end of the
 directory.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16d3aef45d1..026bc9c4c90 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1430,7 +1430,7 @@ read_dir_items:
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
 	}
-	filp->f_pos++;
+	filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
 nopos:
 	ret = 0;
 err:
-- 
cgit v1.2.3


From aadfeb6e39ad6bde080cb3ab23f4da57ccb25f4a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 09:10:27 -0500
Subject: Btrfs: Add some extra debugging around file data checksum failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 4 ++++
 fs/btrfs/inode.c     | 8 +++++---
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 614176e5285..3ebbc058d08 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -261,6 +261,10 @@ csum:
 found:
 	csum_result = btrfs_csum_data(root, data, csum_result, len);
 	btrfs_csum_final(csum_result, (char *)&csum_result);
+	if (csum_result == 0) {
+		printk("csum result is 0 for inode %lu offset %Lu\n", inode->i_ino, offset);
+	}
+
 	write_extent_buffer(leaf, &csum_result, (unsigned long)item,
 			    BTRFS_CRC32_SIZE);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 026bc9c4c90..48455127410 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -318,6 +318,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
 		csum = 0;
+		printk("no csum found for inode %lu start %Lu\n", inode->i_ino, start);
 		goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -336,7 +337,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
-	u64 private;
+	u64 private = ~(u32)0;
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
@@ -362,8 +363,9 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	return 0;
 
 zeroit:
-	printk("btrfs csum failed ino %lu off %llu\n",
-	       page->mapping->host->i_ino, (unsigned long long)start);
+	printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
+	       page->mapping->host->i_ino, (unsigned long long)start, csum,
+	       private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
-- 
cgit v1.2.3


From 70dec8079d78691e476cc6c7cede40656078ad30 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 09:59:12 -0500
Subject: Btrfs: extent_io and extent_state optimizations

The end_bio routines are changed to take a pointer to the extent state
struct, and the state tree is walked in order to set/clear appropriate
bits as IO completes.  This greatly reduces the number of rbtree searches
done by the end_bio handlers, and reduces lock contention.

The extent_io releasepage function is changed to avoid expensive searches
for locked state.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |   4 +-
 fs/btrfs/extent_io.c | 358 +++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent_io.h |  15 ++-
 fs/btrfs/inode.c     |  35 +++--
 4 files changed, 298 insertions(+), 114 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4c4ebea0b2a..e29a0716f6a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -274,7 +274,7 @@ int btree_readpage(struct file *file, struct page *page)
 	return extent_read_full_page(tree, page, btree_get_extent);
 }
 
-static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -282,7 +282,7 @@ static int btree_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(map, tree, page);
+	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 15cc158a049..1f734c34dc2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -27,13 +27,11 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
-static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
 #define BUFFER_LRU_MAX 64
 
 struct tree_entry {
 	u64 start;
 	u64 end;
-	int in_tree;
 	struct rb_node rb_node;
 };
 
@@ -69,7 +67,7 @@ void extent_io_exit(void)
 
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
+		printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
 		list_del(&state->list);
 		kmem_cache_free(extent_state_cache, state);
 
@@ -87,7 +85,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 	tree->state.rb_node = NULL;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
-	rwlock_init(&tree->lock);
+	spin_lock_init(&tree->lock);
 	spin_lock_init(&tree->lru_lock);
 	tree->mapping = mapping;
 	INIT_LIST_HEAD(&tree->buffer_lru);
@@ -110,18 +108,13 @@ EXPORT_SYMBOL(extent_io_tree_empty_lru);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-	unsigned long flags;
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state || IS_ERR(state))
 		return state;
 	state->state = 0;
-	state->in_tree = 0;
 	state->private = 0;
-
-	spin_lock_irqsave(&state_lock, flags);
-	list_add(&state->list, &states);
-	spin_unlock_irqrestore(&state_lock, flags);
+	state->tree = NULL;
 
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
@@ -131,14 +124,10 @@ EXPORT_SYMBOL(alloc_extent_state);
 
 void free_extent_state(struct extent_state *state)
 {
-	unsigned long flags;
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->in_tree);
-		spin_lock_irqsave(&state_lock, flags);
-		list_del(&state->list);
-		spin_unlock_irqrestore(&state_lock, flags);
+		WARN_ON(state->tree);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -164,7 +153,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 	}
 
 	entry = rb_entry(node, struct tree_entry, rb_node);
-	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
 	return NULL;
@@ -216,8 +204,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 {
-	struct rb_node *prev;
+	struct rb_node *prev = NULL;
 	struct rb_node *ret;
+
 	ret = __tree_search(root, offset, &prev, NULL);
 	if (!ret)
 		return prev;
@@ -248,7 +237,7 @@ static int merge_state(struct extent_io_tree *tree,
 		if (other->end == state->start - 1 &&
 		    other->state == state->state) {
 			state->start = other->start;
-			other->in_tree = 0;
+			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
 			free_extent_state(other);
 		}
@@ -259,7 +248,7 @@ static int merge_state(struct extent_io_tree *tree,
 		if (other->start == state->end + 1 &&
 		    other->state == state->state) {
 			other->start = state->start;
-			state->in_tree = 0;
+			state->tree = NULL;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
 		}
@@ -300,6 +289,7 @@ static int insert_state(struct extent_io_tree *tree,
 		free_extent_state(state);
 		return -EEXIST;
 	}
+	state->tree = tree;
 	merge_state(tree, state);
 	return 0;
 }
@@ -335,6 +325,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
+	prealloc->tree = tree;
 	return 0;
 }
 
@@ -361,9 +352,9 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	if (wake)
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
-		if (state->in_tree) {
+		if (state->tree) {
 			rb_erase(&state->rb_node, &tree->state);
-			state->in_tree = 0;
+			state->tree = NULL;
 			free_extent_state(state);
 		} else {
 			WARN_ON(1);
@@ -404,7 +395,7 @@ again:
 			return -ENOMEM;
 	}
 
-	write_lock_irqsave(&tree->lock, flags);
+	spin_lock_irqsave(&tree->lock, flags);
 	/*
 	 * this search will find the extents that end after
 	 * our range starts
@@ -434,6 +425,8 @@ again:
 	 */
 
 	if (state->start < start) {
+		if (!prealloc)
+			prealloc = alloc_extent_state(GFP_ATOMIC);
 		err = split_state(tree, state, prealloc, start);
 		BUG_ON(err == -EEXIST);
 		prealloc = NULL;
@@ -455,6 +448,8 @@ again:
 	 * on the first half
 	 */
 	if (state->start <= end && state->end > end) {
+		if (!prealloc)
+			prealloc = alloc_extent_state(GFP_ATOMIC);
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
@@ -471,7 +466,7 @@ again:
 	goto search_again;
 
 out:
-	write_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -480,7 +475,7 @@ out:
 search_again:
 	if (start > end)
 		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
@@ -492,9 +487,9 @@ static int wait_on_state(struct extent_io_tree *tree,
 {
 	DEFINE_WAIT(wait);
 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	read_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	schedule();
-	read_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	finish_wait(&state->wq, &wait);
 	return 0;
 }
@@ -509,7 +504,7 @@ int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 	struct extent_state *state;
 	struct rb_node *node;
 
-	read_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 again:
 	while (1) {
 		/*
@@ -538,13 +533,13 @@ again:
 			break;
 
 		if (need_resched()) {
-			read_unlock_irq(&tree->lock);
+			spin_unlock_irq(&tree->lock);
 			cond_resched();
-			read_lock_irq(&tree->lock);
+			spin_lock_irq(&tree->lock);
 		}
 	}
 out:
-	read_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return 0;
 }
 EXPORT_SYMBOL(wait_extent_bit);
@@ -589,7 +584,7 @@ again:
 			return -ENOMEM;
 	}
 
-	write_lock_irqsave(&tree->lock, flags);
+	spin_lock_irqsave(&tree->lock, flags);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -709,7 +704,7 @@ again:
 	goto search_again;
 
 out:
-	write_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -718,7 +713,7 @@ out:
 search_again:
 	if (start > end)
 		goto out;
-	write_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
@@ -817,10 +812,6 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(wait_on_extent_writeback);
 
-/*
- * locks a range in ascending order, waiting for any locked regions
- * it hits on the way.  [start,end] are inclusive, and this will sleep.
- */
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 {
 	int err;
@@ -896,7 +887,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	struct extent_state *state;
 	int ret = 1;
 
-	read_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -919,7 +910,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			break;
 	}
 out:
-	read_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return ret;
 }
 EXPORT_SYMBOL(find_first_extent_bit);
@@ -933,7 +924,7 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 found = 0;
 	u64 total_bytes = 0;
 
-	write_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -976,9 +967,9 @@ search_again:
 			atomic_inc(&state->refs);
 			prepare_to_wait(&state->wq, &wait,
 					TASK_UNINTERRUPTIBLE);
-			write_unlock_irq(&tree->lock);
+			spin_unlock_irq(&tree->lock);
 			schedule();
-			write_lock_irq(&tree->lock);
+			spin_lock_irq(&tree->lock);
 			finish_wait(&state->wq, &wait);
 			free_extent_state(state);
 			goto search_again;
@@ -997,7 +988,7 @@ search_again:
 			break;
 	}
 out:
-	write_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return found;
 }
 
@@ -1017,7 +1008,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		return 0;
 	}
 
-	write_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
 		total_bytes = tree->dirty_bytes;
 		goto out;
@@ -1050,7 +1041,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 			break;
 	}
 out:
-	write_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return total_bytes;
 }
 /*
@@ -1122,7 +1113,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	struct extent_state *state;
 	int ret = 0;
 
-	write_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -1139,7 +1130,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	}
 	state->private = private;
 out:
-	write_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return ret;
 }
 
@@ -1149,7 +1140,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	struct extent_state *state;
 	int ret = 0;
 
-	read_lock_irq(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -1166,13 +1157,13 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	}
 	*private = state->private;
 out:
-	read_unlock_irq(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return ret;
 }
 
 /*
  * searches a range in the state tree for a given mask.
- * If 'filled' == 1, this returns 1 only if ever extent in the tree
+ * If 'filled' == 1, this returns 1 only if every extent in the tree
  * has the bits set.  Otherwise, 1 is returned if any bit in the
  * range is found set.
  */
@@ -1184,7 +1175,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;
 	unsigned long flags;
 
-	read_lock_irqsave(&tree->lock, flags);
+	spin_lock_irqsave(&tree->lock, flags);
 	node = tree_search(&tree->state, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
@@ -1215,7 +1206,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			break;
 		}
 	}
-	read_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	return bitset;
 }
 EXPORT_SYMBOL(test_range_bit);
@@ -1282,16 +1273,19 @@ static int end_bio_extent_writepage(struct bio *bio,
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_io_tree *tree = bio->bi_private;
+	struct extent_state *state = bio->bi_private;
+	struct extent_io_tree *tree = state->tree;
+	struct rb_node *node;
 	u64 start;
 	u64 end;
+	u64 cur;
 	int whole_page;
+	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
 		return 1;
 #endif
-
 	do {
 		struct page *page = bvec->bv_page;
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1311,16 +1305,80 @@ static int end_bio_extent_writepage(struct bio *bio,
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
-		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+		if (tree->ops && tree->ops->writepage_end_io_hook) {
+			tree->ops->writepage_end_io_hook(page, start, end,
+							 state);
+		}
+
+		/*
+		 * bios can get merged in funny ways, and so we need to
+		 * be careful with the state variable.  We know the
+		 * state won't be merged with others because it has
+		 * WRITEBACK set, but we can't be sure each biovec is
+		 * sequential in the file.  So, if our cached state
+		 * doesn't match the expected end, search the tree
+		 * for the correct one.
+		 */
+
+		spin_lock_irqsave(&tree->lock, flags);
+		if (!state || state->end != end) {
+			state = NULL;
+			node = __tree_search(&tree->state, start, NULL, NULL);
+			if (node) {
+				state = rb_entry(node, struct extent_state,
+						 rb_node);
+				if (state->end != end ||
+				    !(state->state & EXTENT_WRITEBACK))
+					state = NULL;
+			}
+			if (!state) {
+				spin_unlock_irqrestore(&tree->lock, flags);
+				clear_extent_writeback(tree, start,
+						       end, GFP_ATOMIC);
+				goto next_io;
+			}
+		}
+		cur = end;
+		while(1) {
+			struct extent_state *clear = state;
+			cur = state->start;
+			node = rb_prev(&state->rb_node);
+			if (node) {
+				state = rb_entry(node,
+						 struct extent_state,
+						 rb_node);
+			} else {
+				state = NULL;
+			}
+
+			clear_state_bit(tree, clear, EXTENT_WRITEBACK,
+					1, 0);
+			if (cur == start)
+				break;
+			if (cur < start) {
+				WARN_ON(1);
+				break;
+			}
+			if (!node)
+				break;
+		}
+		/*
+		 * before releasing the lock, keep the cached state for the
+		 * next bvec only if it ends right before this range and still
+		 * has WRITEBACK set; otherwise the next pass searches the tree
+		 */
+		if (state && (state->end + 1 != start ||
+		    !(state->state & EXTENT_WRITEBACK)))
+			state = NULL;
+		spin_unlock_irqrestore(&tree->lock, flags);
+next_io:
 
 		if (whole_page)
 			end_page_writeback(page);
 		else
 			check_page_writeback(tree, page);
-		if (tree->ops && tree->ops->writepage_end_io_hook)
-			tree->ops->writepage_end_io_hook(page, start, end);
 	} while (bvec >= bio->bi_io_vec);
-
 	bio_put(bio);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -1347,9 +1405,13 @@ static int end_bio_extent_readpage(struct bio *bio,
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_io_tree *tree = bio->bi_private;
+	struct extent_state *state = bio->bi_private;
+	struct extent_io_tree *tree = state->tree;
+	struct rb_node *node;
 	u64 start;
 	u64 end;
+	u64 cur;
+	unsigned long flags;
 	int whole_page;
 	int ret;
 
@@ -1373,27 +1435,83 @@ static int end_bio_extent_readpage(struct bio *bio,
 			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
-			ret = tree->ops->readpage_end_io_hook(page, start, end);
+			ret = tree->ops->readpage_end_io_hook(page, start, end,
+							      state);
 			if (ret)
 				uptodate = 0;
 		}
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
-			if (whole_page)
-				SetPageUptodate(page);
-			else
-				check_page_uptodate(tree, page);
-		} else {
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
 
-		unlock_extent(tree, start, end, GFP_ATOMIC);
+		spin_lock_irqsave(&tree->lock, flags);
+		if (!state || state->end != end) {
+			state = NULL;
+			node = __tree_search(&tree->state, start, NULL, NULL);
+			if (node) {
+				state = rb_entry(node, struct extent_state,
+						 rb_node);
+				if (state->end != end ||
+				    !(state->state & EXTENT_LOCKED))
+					state = NULL;
+			}
+			if (!state) {
+				spin_unlock_irqrestore(&tree->lock, flags);
+				/* failed reads must not become uptodate */
+				if (uptodate)
+					set_extent_uptodate(tree, start, end,
+							    GFP_ATOMIC);
+				unlock_extent(tree, start, end, GFP_ATOMIC);
+				goto next_io;
+			}
+		}
 
-		if (whole_page)
+		cur = end;
+		while(1) {
+			struct extent_state *clear = state;
+			cur = state->start;
+			node = rb_prev(&state->rb_node);
+			if (node)
+				state = rb_entry(node, struct extent_state,
+						 rb_node);
+			else
+				state = NULL;
+			if (uptodate)
+				clear->state |= EXTENT_UPTODATE;
+			clear_state_bit(tree, clear, EXTENT_LOCKED, 1, 0);
+			if (cur == start)
+				break;
+			if (cur < start) {
+				WARN_ON(1);
+				break;
+			}
+			if (!node)
+				break;
+		}
+		/*
+		 * before releasing the lock, keep the cached state for the
+		 * next bvec only if it ends right before this range and still
+		 * has LOCKED set; otherwise the next pass searches the tree
+		 */
+		if (state && (state->end + 1 != start ||
+		    !(state->state & EXTENT_LOCKED)))
+			state = NULL;
+		spin_unlock_irqrestore(&tree->lock, flags);
+next_io:
+		if (whole_page) {
+			if (uptodate) {
+				SetPageUptodate(page);
+			} else {
+				ClearPageUptodate(page);
+				SetPageError(page);
+			}
 			unlock_page(page);
-		else
+		} else {
+			if (uptodate) {
+				check_page_uptodate(tree, page);
+			} else {
+				ClearPageUptodate(page);
+				SetPageError(page);
+			}
 			check_page_locked(tree, page);
+		}
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
@@ -1416,7 +1534,8 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_io_tree *tree = bio->bi_private;
+	struct extent_state *state = bio->bi_private;
+	struct extent_io_tree *tree = state->tree;
 	u64 start;
 	u64 end;
 
@@ -1475,6 +1594,29 @@ static int submit_one_bio(int rw, struct bio *bio)
 {
 	u64 maxsector;
 	int ret = 0;
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct page *page = bvec->bv_page;
+	struct extent_io_tree *tree = bio->bi_private;
+	struct rb_node *node;
+	struct extent_state *state;
+	u64 start;
+	u64 end;
+
+	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+	end = start + bvec->bv_len - 1;
+
+	spin_lock_irq(&tree->lock);
+	node = __tree_search(&tree->state, start, NULL, NULL);
+	BUG_ON(!node);
+	state = rb_entry(node, struct extent_state, rb_node);
+	while(state->end < end) {
+		node = rb_next(node);
+		state = rb_entry(node, struct extent_state, rb_node);
+	}
+	BUG_ON(state->end != end);
+	spin_unlock_irq(&tree->lock);
+
+	bio->bi_private = state;
 
 	bio_get(bio);
 
@@ -1519,9 +1661,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
+
+
 	bio_add_page(bio, page, size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
+
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
@@ -1635,6 +1780,16 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			page_offset += iosize;
 			continue;
 		}
+		/* we have an inline extent but it didn't get marked up
+		 * to date.  Error out
+		 */
+		if (block_start == EXTENT_MAP_INLINE) {
+			SetPageError(page);
+			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			cur = cur + iosize;
+			page_offset += iosize;
+			continue;
+		}
 
 		ret = 0;
 		if (tree->ops && tree->ops->readpage_io_hook) {
@@ -2205,7 +2360,8 @@ EXPORT_SYMBOL(extent_prepare_write);
  * map records are removed
  */
 int try_release_extent_mapping(struct extent_map_tree *map,
-			       struct extent_io_tree *tree, struct page *page)
+			       struct extent_io_tree *tree, struct page *page,
+			       gfp_t mask)
 {
 	struct extent_map *em;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -2213,30 +2369,42 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	u64 orig_start = start;
 	int ret = 1;
 
-	while (start <= end) {
-		spin_lock(&map->lock);
-		em = lookup_extent_mapping(map, start, end);
-		if (!em || IS_ERR(em)) {
+	if ((mask & __GFP_WAIT) &&
+	    page->mapping->host->i_size > 16 * 1024 * 1024) {
+		while (start <= end) {
+			spin_lock(&map->lock);
+			em = lookup_extent_mapping(map, start, end);
+			if (!em || IS_ERR(em)) {
+				spin_unlock(&map->lock);
+				break;
+			}
+			if (em->start != start) {
+				spin_unlock(&map->lock);
+				free_extent_map(em);
+				break;
+			}
+			if (!test_range_bit(tree, em->start,
+					    extent_map_end(em) - 1,
+					    EXTENT_LOCKED, 0)) {
+				remove_extent_mapping(map, em);
+				/* once for the rb tree */
+				free_extent_map(em);
+			}
+			start = extent_map_end(em);
 			spin_unlock(&map->lock);
-			break;
-		}
-		if (!test_range_bit(tree, em->start, extent_map_end(em) - 1,
-				    EXTENT_LOCKED, 0)) {
-			remove_extent_mapping(map, em);
-			/* once for the rb tree */
+
+			/* once for us */
 			free_extent_map(em);
 		}
-		start = extent_map_end(em);
-		spin_unlock(&map->lock);
-
-		/* once for us */
-		free_extent_map(em);
 	}
-	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
+	if (test_range_bit(tree, orig_start, end, EXTENT_IOBITS, 0))
 		ret = 0;
-	else
+	else {
+		if ((mask & GFP_NOFS) == GFP_NOFS)
+			mask = GFP_NOFS;
 		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-				 1, 1, GFP_NOFS);
+				 1, 1, mask);
+	}
 	return ret;
 }
 EXPORT_SYMBOL(try_release_extent_mapping);
@@ -2553,13 +2721,13 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			}
 		}
 		clear_page_dirty_for_io(page);
-		write_lock_irq(&page->mapping->tree_lock);
+		read_lock_irq(&page->mapping->tree_lock);
 		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&page->mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-		write_unlock_irq(&page->mapping->tree_lock);
+		read_unlock_irq(&page->mapping->tree_lock);
 		unlock_page(page);
 	}
 	return 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 06be1fe84b2..f1e8bf251f3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -23,19 +23,23 @@
 #define EXTENT_PAGE_PRIVATE 1
 #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
 
+struct extent_state;
+
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
+				    struct extent_state *state);
+	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+				      struct extent_state *state);
 };
 
 struct extent_io_tree {
 	struct rb_root state;
 	struct address_space *mapping;
 	u64 dirty_bytes;
-	rwlock_t lock;
+	spinlock_t lock;
 	struct extent_io_ops *ops;
 	spinlock_t lru_lock;
 	struct list_head buffer_lru;
@@ -45,8 +49,8 @@ struct extent_io_tree {
 struct extent_state {
 	u64 start;
 	u64 end; /* inclusive */
-	int in_tree;
 	struct rb_node rb_node;
+	struct extent_io_tree *tree;
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
@@ -82,7 +86,8 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask);
 void extent_io_tree_empty_lru(struct extent_io_tree *tree);
 int try_release_extent_mapping(struct extent_map_tree *map,
-			       struct extent_io_tree *tree, struct page *page);
+			       struct extent_io_tree *tree, struct page *page,
+			       gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48455127410..bac8722e14e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -331,7 +331,8 @@ out:
 	return ret;
 }
 
-int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
+int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+			       struct extent_state *state)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
@@ -347,7 +348,12 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end)
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
 
-	ret = get_state_private(io_tree, start, &private);
+	if (state->start == start) {
+		private = state->private;
+		ret = 0;
+	} else {
+		ret = get_state_private(io_tree, start, &private);
+	}
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
 	if (ret) {
@@ -1830,7 +1836,7 @@ out_unlock:
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 len,
+				    size_t pg_offset, u64 start, u64 len,
 				    int create)
 {
 	int ret;
@@ -1865,7 +1871,10 @@ again:
 			       start, len, em->start, em->len);
 			WARN_ON(1);
 		}
-		goto out;
+		if (em->block_start == EXTENT_MAP_INLINE && page)
+			free_extent_map(em);
+		else
+			goto out;
 	}
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -1930,6 +1939,7 @@ again:
 		em->len = extent_end - extent_start;
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		u64 page_start;
 		unsigned long ptr;
 		char *map;
 		size_t size;
@@ -1959,16 +1969,17 @@ again:
 			goto out;
 		}
 
-		extent_offset = ((u64)page->index << PAGE_CACHE_SHIFT) -
-			extent_start + page_offset;
-		copy_size = min_t(u64, PAGE_CACHE_SIZE - page_offset,
+		page_start = page_offset(page) + pg_offset;
+		extent_offset = page_start - extent_start;
+		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->len = copy_size;
+		em->len = (copy_size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 		map = kmap(page);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
-			read_extent_buffer(leaf, map + page_offset, ptr,
+			read_extent_buffer(leaf, map + pg_offset, ptr,
 					   copy_size);
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
@@ -1980,7 +1991,7 @@ again:
 				trans = btrfs_start_transaction(root, 1);
 				goto again;
 			}
-			write_extent_buffer(leaf, map + page_offset, ptr,
+			write_extent_buffer(leaf, map + pg_offset, ptr,
 					    copy_size);
 			btrfs_mark_buffer_dirty(leaf);
 		}
@@ -2077,7 +2088,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
 				btrfs_get_extent);
 }
 
-static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -2085,7 +2096,7 @@ static int btrfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(map, tree, page);
+	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
-- 
cgit v1.2.3


From 85e21bac165b4ba1f6f90431ad6fc658ffcbaf3a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 15:11:36 -0500
Subject: Btrfs: During deletes and truncate, remove many items at once from
 the tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     |  38 +++++++++--------
 fs/btrfs/ctree.h     |  12 +++++-
 fs/btrfs/extent_io.c |   1 -
 fs/btrfs/inode.c     | 114 +++++++++++++++++++++++++++++----------------------
 4 files changed, 95 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 43d23148a4f..84ad53e06b3 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2514,34 +2514,36 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * delete the item at the leaf level in path.  If that empties
  * the leaf, remove it from the tree
  */
-int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct btrfs_path *path)
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct btrfs_path *path, int slot, int nr)
 {
-	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
-	int doff;
-	int dsize;
+	int last_off;
+	int dsize = 0;
 	int ret = 0;
 	int wret;
+	int i;
 	u32 nritems;
 
 	leaf = path->nodes[0];
-	slot = path->slots[0];
-	doff = btrfs_item_offset_nr(leaf, slot);
-	dsize = btrfs_item_size_nr(leaf, slot);
+	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
+
+	for (i = 0; i < nr; i++)
+		dsize += btrfs_item_size_nr(leaf, slot + i);
+
 	nritems = btrfs_header_nritems(leaf);
 
-	if (slot != nritems - 1) {
+	if (slot + nr != nritems) {
 		int i;
 		int data_end = leaf_data_end(root, leaf);
 
 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 			      data_end + dsize,
 			      btrfs_leaf_data(leaf) + data_end,
-			      doff - data_end);
+			      last_off - data_end);
 
-		for (i = slot + 1; i < nritems; i++) {
+		for (i = slot + nr; i < nritems; i++) {
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
@@ -2562,12 +2564,12 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
-			      btrfs_item_nr_offset(slot + 1),
+			      btrfs_item_nr_offset(slot + nr),
 			      sizeof(struct btrfs_item) *
-			      (nritems - slot - 1));
+			      (nritems - slot - nr));
 	}
-	btrfs_set_header_nritems(leaf, nritems - 1);
-	nritems--;
+	btrfs_set_header_nritems(leaf, nritems - nr);
+	nritems -= nr;
 
 	/* delete the leaf if we've emptied it */
 	if (nritems == 0) {
@@ -2600,7 +2602,7 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
@@ -2608,13 +2610,13 @@ int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			slot = path->slots[1];
 			extent_buffer_get(leaf);
 
-			wret = push_leaf_right(trans, root, path, 1, 1);
+			wret = push_leaf_left(trans, root, path, 1, 1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_left(trans, root, path, 1, 1);
+				wret = push_leaf_right(trans, root, path, 1, 1);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c65473e0fe..098cf088315 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1038,8 +1038,16 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
-int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct btrfs_path *path);
+int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		   struct btrfs_path *path, int slot, int nr);
+
+static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path)
+{
+	return btrfs_del_items(trans, root, path, path->slots[0], 1);
+}
+
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, void *data, u32 data_size);
 int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1f734c34dc2..8aec72253a1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2863,7 +2863,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (ret || !wait) {
 		return ret;
 	}
-
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bac8722e14e..0a2fe51c412 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -692,27 +692,6 @@ fail:
 	return err;
 }
 
-static int btrfs_free_inode(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct inode *inode)
-{
-	struct btrfs_path *path;
-	int ret;
-
-	clear_inode(inode);
-
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-	ret = btrfs_lookup_inode(trans, root, path,
-				 &BTRFS_I(inode)->location, -1);
-	if (ret > 0)
-		ret = -ENOENT;
-	if (!ret)
-		ret = btrfs_del_item(trans, root, path);
-	btrfs_free_path(path);
-	return ret;
-}
-
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -723,7 +702,8 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
  */
 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
-				   struct inode *inode)
+				   struct inode *inode,
+				   u32 min_type)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -739,6 +719,8 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	u64 root_owner = 0;
 	int found_extent;
 	int del_item;
+	int pending_del_nr = 0;
+	int pending_del_slot = 0;
 	int extent_type = -1;
 
 	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
@@ -751,17 +733,19 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
+	btrfs_init_path(path);
+search_again:
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0) {
+		goto error;
+	}
+	if (ret > 0) {
+		BUG_ON(path->slots[0] == 0);
+		path->slots[0]--;
+	}
+
 	while(1) {
-		btrfs_init_path(path);
 		fi = NULL;
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0) {
-			goto error;
-		}
-		if (ret > 0) {
-			BUG_ON(path->slots[0] == 0);
-			path->slots[0]--;
-		}
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
@@ -769,10 +753,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		if (found_key.objectid != inode->i_ino)
 			break;
 
-		if (found_type != BTRFS_CSUM_ITEM_KEY &&
-		    found_type != BTRFS_DIR_ITEM_KEY &&
-		    found_type != BTRFS_DIR_INDEX_KEY &&
-		    found_type != BTRFS_EXTENT_DATA_KEY)
+		if (found_type < min_type)
 			break;
 
 		item_end = found_key.offset;
@@ -801,14 +782,17 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
 				found_type = BTRFS_CSUM_ITEM_KEY;
+			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
+				found_type = BTRFS_XATTR_ITEM_KEY;
+			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
+				found_type = BTRFS_INODE_REF_KEY;
 			} else if (found_type) {
 				found_type--;
 			} else {
 				break;
 			}
 			btrfs_set_key_type(&key, found_type);
-			btrfs_release_path(root, path);
-			continue;
+			goto next;
 		}
 		if (found_key.offset >= inode->i_size)
 			del_item = 1;
@@ -860,13 +844,21 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 		}
 delete:
 		if (del_item) {
-			ret = btrfs_del_item(trans, root, path);
-			if (ret)
-				goto error;
+			if (!pending_del_nr) {
+				/* no pending yet, add ourselves */
+				pending_del_slot = path->slots[0];
+				pending_del_nr = 1;
+			} else if (pending_del_nr &&
+				   path->slots[0] + 1 == pending_del_slot) {
+				/* hop on the pending chunk */
+				pending_del_nr++;
+				pending_del_slot = path->slots[0];
+			} else {
+				printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
+			}
 		} else {
 			break;
 		}
-		btrfs_release_path(root, path);
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
@@ -875,9 +867,36 @@ delete:
 						found_key.offset, 0);
 			BUG_ON(ret);
 		}
+next:
+		if (path->slots[0] == 0) {
+			if (pending_del_nr)
+				goto del_pending;
+			btrfs_release_path(root, path);
+			goto search_again;
+		}
+
+		path->slots[0]--;
+		if (pending_del_nr &&
+		    path->slots[0] + 1 != pending_del_slot) {
+			struct btrfs_key debug;
+del_pending:
+			btrfs_item_key_to_cpu(path->nodes[0], &debug,
+					      pending_del_slot);
+			ret = btrfs_del_items(trans, root, path,
+					      pending_del_slot,
+					      pending_del_nr);
+			BUG_ON(ret);
+			pending_del_nr = 0;
+			btrfs_release_path(root, path);
+			goto search_again;
+		}
 	}
 	ret = 0;
 error:
+	if (pending_del_nr) {
+		ret = btrfs_del_items(trans, root, path, pending_del_slot,
+				      pending_del_nr);
+	}
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	inode->i_sb->s_dirt = 1;
@@ -1067,16 +1086,12 @@ void btrfs_delete_inode(struct inode *inode)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
-	ret = btrfs_truncate_in_trans(trans, root, inode);
-	if (ret)
-		goto no_delete_lock;
-	ret = btrfs_delete_xattrs(trans, root, inode);
-	if (ret)
-		goto no_delete_lock;
-	ret = btrfs_free_inode(trans, root, inode);
+	ret = btrfs_truncate_in_trans(trans, root, inode, 0);
 	if (ret)
 		goto no_delete_lock;
+
 	nr = trans->blocks_used;
+	clear_inode(inode);
 
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -2190,7 +2205,8 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_set_trans_block_group(trans, inode);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
-	ret = btrfs_truncate_in_trans(trans, root, inode);
+	ret = btrfs_truncate_in_trans(trans, root, inode,
+				      BTRFS_EXTENT_DATA_KEY);
 	btrfs_update_inode(trans, root, inode);
 	nr = trans->blocks_used;
 
-- 
cgit v1.2.3


From 9c58309d6cf22471dacbcb6de54d00cef9ca20d4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 15:15:18 -0500
Subject: Btrfs: Add inode item and backref in one insert, reducing cpu usage

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 39 ++++++++++++++++----------
 fs/btrfs/ctree.h | 17 ++++++++++--
 fs/btrfs/inode.c | 85 +++++++++++++++++++++++++++++++++++++++++---------------
 3 files changed, 102 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 84ad53e06b3..fb2e2bd506c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2332,27 +2332,34 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
  * Given a key and some data, insert an item into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
-int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 data_size)
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
 	int ret = 0;
 	int slot;
 	int slot_orig;
+	int i;
 	u32 nritems;
+	u32 total_size = 0;
+	u32 total_data = 0;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
 
-	btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+	for (i = 0; i < nr; i++) {
+		total_data += data_size[i];
+	}
 
 	/* create a root if there isn't one */
 	if (!root->node)
 		BUG();
 
-	ret = btrfs_search_slot(trans, root, cpu_key, path, data_size, 1);
+	total_size = total_data + (nr - 1) * sizeof(struct btrfs_item);
+	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0) {
 		return -EEXIST;
 	}
@@ -2366,10 +2373,10 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 	data_end = leaf_data_end(root, leaf);
 
 	if (btrfs_leaf_free_space(root, leaf) <
-	    sizeof(struct btrfs_item) + data_size) {
+	    sizeof(struct btrfs_item) + total_size) {
 		btrfs_print_leaf(root, leaf);
 		printk("not enough freespace need %u have %d\n",
-		       data_size, btrfs_leaf_free_space(root, leaf));
+		       total_size, btrfs_leaf_free_space(root, leaf));
 		BUG();
 	}
 
@@ -2404,7 +2411,7 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 			}
 
 			ioff = btrfs_item_offset(leaf, item);
-			btrfs_set_item_offset(leaf, item, ioff - data_size);
+			btrfs_set_item_offset(leaf, item, ioff - total_data);
 		}
 		if (leaf->map_token) {
 			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -2412,23 +2419,27 @@ int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 		}
 
 		/* shift the items */
-		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
 			      btrfs_item_nr_offset(slot),
 			      (nritems - slot) * sizeof(struct btrfs_item));
 
 		/* shift the data */
 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end - data_size, btrfs_leaf_data(leaf) +
+			      data_end - total_data, btrfs_leaf_data(leaf) +
 			      data_end, old_data - data_end);
 		data_end = old_data;
 	}
 
 	/* setup the item for the new data */
-	btrfs_set_item_key(leaf, &disk_key, slot);
-	item = btrfs_item_nr(leaf, slot);
-	btrfs_set_item_offset(leaf, item, data_end - data_size);
-	btrfs_set_item_size(leaf, item, data_size);
-	btrfs_set_header_nritems(leaf, nritems + 1);
+	for (i = 0; i < nr; i++) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+		btrfs_set_item_key(leaf, &disk_key, slot + i);
+		item = btrfs_item_nr(leaf, slot + i);
+		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+		data_end -= data_size[i];
+		btrfs_set_item_size(leaf, item, data_size[i]);
+	}
+	btrfs_set_header_nritems(leaf, nritems + nr);
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 098cf088315..127c86f795d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1050,9 +1050,20 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, void *data, u32 data_size);
-int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			    *root, struct btrfs_path *path, struct btrfs_key
-			    *cpu_key, u32 data_size);
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     struct btrfs_key *cpu_key, u32 *data_size, int nr);
+
+static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  struct btrfs_key *key,
+					  u32 data_size)
+{
+	return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1);
+}
+
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0a2fe51c412..413b1012de5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1500,6 +1500,8 @@ void btrfs_dirty_inode(struct inode *inode)
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
+				     const char *name, int name_len,
+				     u64 ref_objectid,
 				     u64 objectid,
 				     struct btrfs_block_group_cache *group,
 				     int mode)
@@ -1508,6 +1510,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_key *location;
 	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_key key[2];
+	u32 sizes[2];
+	unsigned long ptr;
 	int ret;
 	int owner;
 
@@ -1530,10 +1536,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	group = btrfs_find_block_group(root, group, 0, 0, owner);
 	BTRFS_I(inode)->block_group = group;
 	BTRFS_I(inode)->flags = 0;
-	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
-	if (ret)
+
+	key[0].objectid = objectid;
+	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+	key[0].offset = 0;
+
+	key[1].objectid = objectid;
+	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+	key[1].offset = ref_objectid;
+
+	sizes[0] = sizeof(struct btrfs_inode_item);
+	sizes[1] = name_len + sizeof(*ref);
+
+	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+	if (ret != 0)
 		goto fail;
 
+	if (objectid > root->highest_inode)
+		root->highest_inode = objectid;
+
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
 	inode->i_mode = mode;
@@ -1543,6 +1564,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
 	fill_inode_item(path->nodes[0], inode_item, inode);
+
+	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+			     struct btrfs_inode_ref);
+	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+	ptr = (unsigned long)(ref + 1);
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
@@ -1564,7 +1592,8 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 }
 
 static int btrfs_add_link(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode)
+			    struct dentry *dentry, struct inode *inode,
+			    int add_backref)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1580,11 +1609,13 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode));
 	if (ret == 0) {
-		ret = btrfs_insert_inode_ref(trans, root,
-				     dentry->d_name.name,
-				     dentry->d_name.len,
-				     inode->i_ino,
-				     dentry->d_parent->d_inode->i_ino);
+		if (add_backref) {
+			ret = btrfs_insert_inode_ref(trans, root,
+					     dentry->d_name.name,
+					     dentry->d_name.len,
+					     inode->i_ino,
+					     dentry->d_parent->d_inode->i_ino);
+		}
 		parent_inode = dentry->d_parent->d_inode;
 		parent_inode->i_size += dentry->d_name.len * 2;
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -1595,9 +1626,10 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 }
 
 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode)
+			    struct dentry *dentry, struct inode *inode,
+			    int backref)
 {
-	int err = btrfs_add_link(trans, dentry, inode);
+	int err = btrfs_add_link(trans, dentry, inode, backref);
 	if (!err) {
 		d_instantiate(dentry, inode);
 		return 0;
@@ -1635,14 +1667,16 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid,
+	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+				dentry->d_name.len,
+				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode);
+	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -1692,14 +1726,16 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid,
-				BTRFS_I(dir)->block_group, mode);
+	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+				dentry->d_name.len,
+				dentry->d_parent->d_inode->i_ino,
+				objectid, BTRFS_I(dir)->block_group, mode);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode);
+	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -1755,7 +1791,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
-	err = btrfs_add_nondir(trans, dentry, inode);
+	err = btrfs_add_nondir(trans, dentry, inode, 1);
 
 	if (err)
 		drop_inode = 1;
@@ -1810,7 +1846,9 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid,
+	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+				dentry->d_name.len,
+				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
@@ -1827,7 +1865,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
-	err = btrfs_add_link(trans, dentry, inode);
+	err = btrfs_add_link(trans, dentry, inode, 0);
 	if (err)
 		goto out_fail;
 
@@ -2320,7 +2358,8 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	inode = btrfs_new_inode(trans, new_root, new_dirid,
+	inode = btrfs_new_inode(trans, new_root, "..", 2, new_dirid,
+				new_dirid,
 				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
 	if (IS_ERR(inode))
 		goto fail;
@@ -2817,7 +2856,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		if (ret)
 			goto out_fail;
 	}
-	ret = btrfs_add_link(trans, new_dentry, old_inode);
+	ret = btrfs_add_link(trans, new_dentry, old_inode, 1);
 	if (ret)
 		goto out_fail;
 
@@ -2865,14 +2904,16 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, objectid,
+	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+				dentry->d_name.len,
+				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode);
+	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
 		drop_inode = 1;
 	else {
-- 
cgit v1.2.3


From 291d673e6a22d9c6834e939f66c7cfef90669021 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 15:55:23 -0500
Subject: Btrfs: Do delalloc accounting via hooks in the extent_state code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  6 -----
 fs/btrfs/extent_io.c   | 25 +++++++++++++++++++++
 fs/btrfs/extent_io.h   |  4 ++++
 fs/btrfs/file.c        |  4 ----
 fs/btrfs/inode.c       | 60 ++++++++++++++++++++------------------------------
 5 files changed, 53 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c4fe37791d3..90718f77bbc 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2180,7 +2180,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	unsigned long last_index;
 	unsigned long i;
 	struct page *page;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
 
@@ -2220,11 +2219,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE -
-						 existing_delalloc;
-		spin_unlock(&root->fs_info->delalloc_lock);
-
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8aec72253a1..e81b35b0618 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -256,6 +256,26 @@ static int merge_state(struct extent_io_tree *tree,
 	return 0;
 }
 
+static void set_state_cb(struct extent_io_tree *tree,
+			 struct extent_state *state,
+			 unsigned long bits)
+{
+	if (tree->ops && tree->ops->set_bit_hook) {
+		tree->ops->set_bit_hook(tree->mapping->host, state->start,
+					state->end, bits);
+	}
+}
+
+static void clear_state_cb(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   unsigned long bits)
+{
+	if (tree->ops && tree->ops->clear_bit_hook) { /* hook is optional */
+		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
+					  state->end, bits);
+	}
+}
+
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
  * struct before it is inserted.
@@ -281,6 +301,7 @@ static int insert_state(struct extent_io_tree *tree,
 	state->state |= bits;
 	state->start = start;
 	state->end = end;
+	set_state_cb(tree, state, bits);
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -349,6 +370,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
 		tree->dirty_bytes -= range;
 	}
 	state->state &= ~bits;
+	clear_state_cb(tree, state, bits);
 	if (wake)
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
@@ -553,6 +575,7 @@ static void set_state_bits(struct extent_io_tree *tree,
 		tree->dirty_bytes += range;
 	}
 	state->state |= bits;
+	set_state_cb(tree, state, bits);
 }
 
 /*
@@ -975,6 +998,7 @@ search_again:
 			goto search_again;
 		}
 		state->state |= EXTENT_LOCKED;
+		set_state_cb(tree, state, EXTENT_LOCKED);
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1474,6 +1498,7 @@ static int end_bio_extent_readpage(struct bio *bio,
 				state = NULL;
 			}
 			clear->state |= EXTENT_UPTODATE;
+			set_state_cb(tree, clear, EXTENT_UPTODATE);
 			clear_state_bit(tree, clear, EXTENT_LOCKED,
 					1, 0);
 			if (cur == start)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f1e8bf251f3..a96c5a14134 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -33,6 +33,10 @@ struct extent_io_ops {
 				    struct extent_state *state);
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state);
+	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+			    unsigned long bits);
+	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
+			    unsigned long bits);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c5bb00f9239..8e210616d70 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -323,10 +323,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		}
 		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
-					  start_pos) - existing_delalloc;
-		spin_unlock(&root->fs_info->delalloc_lock);
 		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 413b1012de5..5a38443a24e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -80,8 +80,6 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	u64 thresh;
 	int ret = 0;
 
-	return 0;
-
 	if (for_del)
 		thresh = total * 90;
 	else
@@ -249,7 +247,6 @@ not_found:
 static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 num_bytes;
 	int ret;
 	mutex_lock(&root->fs_info->fs_mutex);
 	if (btrfs_test_opt(root, NODATACOW) ||
@@ -258,20 +255,34 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	else
 		ret = cow_file_range(inode, start, end);
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	num_bytes = end + 1 - start;
-	if (root->fs_info->delalloc_bytes < num_bytes) {
-		printk("delalloc accounting error total %llu sub %llu\n",
-		       root->fs_info->delalloc_bytes, num_bytes);
-	} else {
-		root->fs_info->delalloc_bytes -= num_bytes;
-	}
-	spin_unlock(&root->fs_info->delalloc_lock);
-
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
+int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+		       unsigned long bits)
+{
+	if ((bits & EXTENT_DELALLOC)) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->delalloc_bytes += end - start + 1;
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
+	return 0;
+}
+
+int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+			 unsigned long bits)
+{
+	if ((bits & EXTENT_DELALLOC)) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		spin_lock(&root->fs_info->delalloc_lock);
+		root->fs_info->delalloc_bytes -= end - start + 1;
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
+	return 0;
+}
+
 int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
@@ -908,28 +919,17 @@ static int btrfs_cow_one_page(struct inode *inode, struct page *page,
 {
 	char *kaddr;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	u64 existing_delalloc;
-	u64 delalloc_start;
 	int ret = 0;
 
 	WARN_ON(!PageLocked(page));
 	set_page_extent_mapped(page);
 
 	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-	delalloc_start = page_start;
-	existing_delalloc = count_range_bits(&BTRFS_I(inode)->io_tree,
-					     &delalloc_start, page_end,
-					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
 			    page_end, GFP_NOFS);
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE - existing_delalloc;
-	spin_unlock(&root->fs_info->delalloc_lock);
-
 	if (zero_start != PAGE_CACHE_SIZE) {
 		kaddr = kmap(page);
 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
@@ -2456,8 +2456,6 @@ int btrfs_defrag_file(struct file *file) {
 	unsigned long ra_index = 0;
 	u64 page_start;
 	u64 page_end;
-	u64 delalloc_start;
-	u64 existing_delalloc;
 	unsigned long i;
 	int ret;
 
@@ -2491,19 +2489,9 @@ int btrfs_defrag_file(struct file *file) {
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-		delalloc_start = page_start;
-		existing_delalloc =
-			count_range_bits(&BTRFS_I(inode)->io_tree,
-					 &delalloc_start, page_end,
-					 PAGE_CACHE_SIZE, EXTENT_DELALLOC);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes += PAGE_CACHE_SIZE -
-						 existing_delalloc;
-		spin_unlock(&root->fs_info->delalloc_lock);
-
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
 		unlock_page(page);
-- 
cgit v1.2.3


From 6f568d35a045dbb8a13fe71bfc32e85e39a986cb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 16:03:38 -0500
Subject: Btrfs: mount -o max_inline=size to control the maximum inline extent
 size

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  1 +
 fs/btrfs/disk-io.c   |  1 +
 fs/btrfs/extent_io.c |  1 -
 fs/btrfs/file.c      |  3 ++-
 fs/btrfs/super.c     | 19 ++++++++++++++++++-
 5 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 127c86f795d..8f93e066bf8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -325,6 +325,7 @@ struct btrfs_fs_info {
 	u64 last_trans_committed;
 	unsigned long mount_opt;
 	u64 max_extent;
+	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e29a0716f6a..6647b90ef74 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -659,6 +659,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
 	fs_info->max_extent = (u64)-1;
+	fs_info->max_inline = 8192 * 1024;
 	fs_info->delalloc_bytes = 0;
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e81b35b0618..6999b819db6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2451,7 +2451,6 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		goto out;
 
 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-printk("bmap finds %Lu %Lu block %Lu\n", em->start, em->len, em->block_start);
 out:
 	free_extent_map(em);
 	return sector;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8e210616d70..7c10a90362f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -301,7 +301,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	 */
 	inline_size = end_pos;
 	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > 8192 ||
+	    inline_size > root->fs_info->max_inline ||
+	    (inline_size & (root->sectorsize -1)) == 0 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
 		u64 existing_delalloc = 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8a1016600b..a46300c4753 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -64,7 +64,7 @@ static void btrfs_put_super (struct super_block * sb)
 
 enum {
 	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
-	Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err,
+	Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -73,6 +73,7 @@ static match_table_t tokens = {
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_max_extent, "max_extent=%s"},
+	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_ssd, "ssd"},
 	{Opt_err, NULL}
@@ -178,6 +179,22 @@ static int parse_options (char * options,
 				}
 			}
 			break;
+		case Opt_max_inline:
+			if (info) {
+				char *num = match_strdup(&args[0]);
+				if (num) {
+					info->max_inline =
+						btrfs_parse_size(num);
+					kfree(num);
+
+					info->max_inline = max_t(u64,
+							 info->max_inline,
+							 root->sectorsize);
+					printk("btrfs: max_inline at %Lu\n",
+					       info->max_inline);
+				}
+			}
+			break;
 		case Opt_alloc_start:
 			if (info) {
 				char *num = match_strdup(&args[0]);
-- 
cgit v1.2.3


From b3236e68bf86b3ae87f58984a1822369225211cb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jan 2008 16:06:37 -0500
Subject: Btrfs: Leave on the tree defragger in mount -o ssd, it still helps
 there

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-defrag.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5935cbd8f2b..5c58630dce0 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -179,9 +179,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
 
-	if (btrfs_test_opt(root, SSD))
-		goto out;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 5a01a2e3a9d4dc9cb4871dde4d832a3b8de9f748 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 Jan 2008 11:43:54 -0500
Subject: Btrfs: Copy correct tree when inserting into slot 0

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fb2e2bd506c..cf32651876b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2443,13 +2443,16 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 
 	ret = 0;
-	if (slot == 0)
+	if (slot == 0) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
 		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+	}
 
 	if (btrfs_leaf_free_space(root, leaf) < 0) {
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
+
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From b1632b10c0133992fd0290b4836c47661bed4396 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 30 Jan 2008 11:54:04 -0500
Subject: Btrfs: Align extent length to sectorsize in

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5a38443a24e..de17c61dfa0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -823,6 +823,8 @@ search_again:
 					btrfs_file_extent_num_bytes(leaf, fi);
 				extent_num_bytes = inode->i_size -
 					found_key.offset + root->sectorsize - 1;
+				extent_num_bytes = extent_num_bytes &
+					~((u64)root->sectorsize - 1);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
-- 
cgit v1.2.3


From f392a938f3cc36a2abe68db9286bd864c3b2f7dd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 Jan 2008 11:54:05 -0500
Subject: Properly align the hole size in btrfs_setattr

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index de17c61dfa0..515f8b01ab0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1001,7 +1001,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		u64 mask = root->sectorsize - 1;
 		u64 pos = (inode->i_size + mask) & ~mask;
-		u64 block_end = attr->ia_size | mask;
+		u64 block_end = (attr->ia_size + mask) & ~mask;
 		u64 hole_start;
 		u64 hole_size;
 		u64 alloc_hint = 0;
@@ -1022,7 +1022,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, pos, block_end, GFP_NOFS);
+		lock_extent(io_tree, pos, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -1043,7 +1043,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		unlock_extent(io_tree, pos, block_end, GFP_NOFS);
+		unlock_extent(io_tree, pos, block_end - 1, GFP_NOFS);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 1b0f7c29e2f4f41e1367e7581d3a9c8c70f0394a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 Jan 2008 14:33:02 -0500
Subject: Fix hole start calculation in btrfs_setattr

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 515f8b01ab0..bcf3b35fb65 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1000,20 +1000,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 		u64 mask = root->sectorsize - 1;
-		u64 pos = (inode->i_size + mask) & ~mask;
+		u64 hole_start = (inode->i_size + mask) & ~mask;
 		u64 block_end = (attr->ia_size + mask) & ~mask;
-		u64 hole_start;
 		u64 hole_size;
 		u64 alloc_hint = 0;
 
-		if (attr->ia_size <= pos)
+		if (attr->ia_size <= hole_start)
 			goto out;
 
-		if (pos != inode->i_size)
-			hole_start = pos + root->sectorsize;
-		else
-			hole_start = pos;
-
 		mutex_lock(&root->fs_info->fs_mutex);
 		err = btrfs_check_free_space(root, 1, 0);
 		mutex_unlock(&root->fs_info->fs_mutex);
@@ -1022,14 +1016,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, pos, block_end - 1, GFP_NOFS);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
 		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_drop_extents(trans, root, inode,
-					 pos, block_end, pos,
+					 hole_start, block_end, hole_start,
 					 &alloc_hint);
 
 		if (alloc_hint != EXTENT_MAP_INLINE) {
@@ -1043,7 +1037,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 		btrfs_end_transaction(trans, root);
 		mutex_unlock(&root->fs_info->fs_mutex);
-		unlock_extent(io_tree, pos, block_end - 1, GFP_NOFS);
+		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 0181e58f91c1edbf835edb7a87d6dfe81374709d Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Wed, 30 Jan 2008 14:39:54 -0500
Subject: btrfs_drop_extents fix for inline items > 8K

When truncating an inline extent, btrfs_drop_extents doesn't properly
handle the case "key.offset > inline_limit". This bug can only happen
when the max inline size is larger than 8K.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7c10a90362f..7cbf8d8d7e2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -591,8 +591,7 @@ next_slot:
 				}
 			}
 			bookend = 1;
-			if (found_inline && start <= key.offset &&
-			    inline_limit < extent_end)
+			if (found_inline && start <= key.offset)
 				keep = 1;
 		}
 		/* truncate existing extent */
@@ -672,11 +671,10 @@ next_slot:
 			if (!bookend)
 				continue;
 		}
-		if (bookend && found_inline && start <= key.offset &&
-		    inline_limit < extent_end && key.offset <= inline_limit) {
+		if (bookend && found_inline && start <= key.offset) {
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
-						   extent_end - inline_limit);
+						   extent_end - end);
 			btrfs_truncate_item(trans, root, path, new_size, 0);
 		}
 		/* create bookend, splitting the extent in two */
-- 
cgit v1.2.3


From b0c68f8bed058d9f2023b067b16ed06a8c439544 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jan 2008 11:05:37 -0500
Subject: Btrfs: Enable delalloc accounting

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  4 ++++
 fs/btrfs/extent_io.c | 14 +++++++-------
 fs/btrfs/extent_io.h |  4 ++--
 fs/btrfs/inode.c     | 18 +++++++++++++-----
 4 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6647b90ef74..ad72e219201 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -861,6 +861,10 @@ int close_ctree(struct btrfs_root *root)
 	write_ctree_super(NULL, root);
 	mutex_unlock(&fs_info->fs_mutex);
 
+	if (fs_info->delalloc_bytes) {
+		printk("btrfs: at unmount delalloc count %Lu\n",
+		       fs_info->delalloc_bytes);
+	}
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6999b819db6..624aabc3e6f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -262,7 +262,7 @@ static void set_state_cb(struct extent_io_tree *tree,
 {
 	if (tree->ops && tree->ops->set_bit_hook) {
 		tree->ops->set_bit_hook(tree->mapping->host, state->start,
-					state->end, bits);
+					state->end, state->state, bits);
 	}
 }
 
@@ -272,7 +272,7 @@ static void clear_state_cb(struct extent_io_tree *tree,
 {
 	if (tree->ops && tree->ops->set_bit_hook) {
 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
-					  state->end, bits);
+					  state->end, state->state, bits);
 	}
 }
 
@@ -298,10 +298,10 @@ static int insert_state(struct extent_io_tree *tree,
 	}
 	if (bits & EXTENT_DIRTY)
 		tree->dirty_bytes += end - start + 1;
+	set_state_cb(tree, state, bits);
 	state->state |= bits;
 	state->start = start;
 	state->end = end;
-	set_state_cb(tree, state, bits);
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -369,8 +369,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
 		WARN_ON(range > tree->dirty_bytes);
 		tree->dirty_bytes -= range;
 	}
-	state->state &= ~bits;
 	clear_state_cb(tree, state, bits);
+	state->state &= ~bits;
 	if (wake)
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
@@ -574,8 +574,8 @@ static void set_state_bits(struct extent_io_tree *tree,
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
-	state->state |= bits;
 	set_state_cb(tree, state, bits);
+	state->state |= bits;
 }
 
 /*
@@ -997,8 +997,8 @@ search_again:
 			free_extent_state(state);
 			goto search_again;
 		}
-		state->state |= EXTENT_LOCKED;
 		set_state_cb(tree, state, EXTENT_LOCKED);
+		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1497,8 +1497,8 @@ static int end_bio_extent_readpage(struct bio *bio,
 			} else {
 				state = NULL;
 			}
-			clear->state |= EXTENT_UPTODATE;
 			set_state_cb(tree, clear, EXTENT_UPTODATE);
+			clear->state |= EXTENT_UPTODATE;
 			clear_state_bit(tree, clear, EXTENT_LOCKED,
 					1, 0);
 			if (cur == start)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a96c5a14134..6fd5e2c9061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -34,9 +34,9 @@ struct extent_io_ops {
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long bits);
+			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long bits);
+			    unsigned long old, unsigned long bits);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bcf3b35fb65..f13b1dbc061 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -260,9 +260,9 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 }
 
 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-		       unsigned long bits)
+		       unsigned long old, unsigned long bits)
 {
-	if ((bits & EXTENT_DELALLOC)) {
+	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		spin_lock(&root->fs_info->delalloc_lock);
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -272,12 +272,18 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 }
 
 int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
-			 unsigned long bits)
+			 unsigned long old, unsigned long bits)
 {
-	if ((bits & EXTENT_DELALLOC)) {
+	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes -= end - start + 1;
+		if (end - start + 1 > root->fs_info->delalloc_bytes) {
+			printk("warning: delalloc account %Lu %Lu\n",
+			       end - start + 1, root->fs_info->delalloc_bytes);
+			root->fs_info->delalloc_bytes = 0;
+		} else {
+			root->fs_info->delalloc_bytes -= end - start + 1;
+		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
 	return 0;
@@ -3002,6 +3008,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.writepage_io_hook = btrfs_writepage_io_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.set_bit_hook = btrfs_set_bit_hook,
+	.clear_bit_hook = btrfs_clear_bit_hook,
 };
 
 static struct address_space_operations btrfs_aops = {
-- 
cgit v1.2.3


From 4529ba495c6fd0d79247784d0df55ae6512fa883 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jan 2008 16:45:07 -0500
Subject: Btrfs: Add data block hints to SSD mode too

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 28 ++++++++++++++++------------
 fs/btrfs/transaction.c |  1 +
 4 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8f93e066bf8..35e9a7af10a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -354,6 +354,7 @@ struct btrfs_fs_info {
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 	u64 last_alloc;
+	u64 last_data_alloc;
 };
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ad72e219201..dda8ad6c007 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -688,6 +688,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->closing = 0;
 	fs_info->total_pinned = 0;
 	fs_info->last_alloc = 0;
+	fs_info->last_data_alloc = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 90718f77bbc..fa54ea59007 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1410,6 +1410,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	int slot = 0;
 	u64 last_byte = 0;
+	u64 *last_ptr = NULL;
 	u64 orig_search_start = search_start;
 	int start_found;
 	struct extent_buffer *l;
@@ -1433,14 +1434,17 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 	/* for SSD, cluster allocations together as much as possible */
 	if (btrfs_test_opt(root, SSD)) {
-		if (!data) {
-			if (root->fs_info->last_alloc)
-				hint_byte = root->fs_info->last_alloc;
-			else {
-				hint_byte = hint_byte &
-					~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
-				empty_size += 16 * 1024 * 1024;
-			}
+		if (data)
+			last_ptr = &root->fs_info->last_data_alloc;
+		else
+			last_ptr = &root->fs_info->last_alloc;
+		if (*last_ptr) {
+			hint_byte = *last_ptr;
+		}
+		else {
+			hint_byte = hint_byte &
+				~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
+			empty_size += 16 * 1024 * 1024;
 		}
 	}
 
@@ -1470,8 +1474,8 @@ check_failed:
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data);
 
-	if (!data && btrfs_test_opt(root, SSD) && info->last_alloc &&
-	    search_start != info->last_alloc) {
+	if (btrfs_test_opt(root, SSD) && *last_ptr &&
+	    search_start != *last_ptr) {
 		info->last_alloc = 0;
 		if (!empty_size) {
 			empty_size += 16 * 1024 * 1024;
@@ -1609,6 +1613,8 @@ check_pending:
 	}
 	ins->offset = num_bytes;
 	btrfs_free_path(path);
+	if (btrfs_test_opt(root, SSD))
+		*last_ptr = ins->objectid + ins->offset;
 	return 0;
 
 new_group:
@@ -1636,8 +1642,6 @@ enospc:
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	if (btrfs_test_opt(root, SSD) && !ret && !data)
-		info->last_alloc = ins->objectid + ins->offset;
 	return ret;
 }
 /*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b4a1bc62a78..3f64d0c7ddb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -58,6 +58,7 @@ static int join_transaction(struct btrfs_root *root)
 		root->fs_info->generation++;
 		root->fs_info->running_transaction = cur_trans;
 		root->fs_info->last_alloc = 0;
+		root->fs_info->last_data_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
-- 
cgit v1.2.3


From 80ea96b1f3bd2431e0d71c9df6ab45c3de0c5840 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 14:51:59 -0500
Subject: Btrfs: Add a lookup cache to the extent state tree

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 57 ++++++++++++++++++++++++++++++++++++----------------
 fs/btrfs/extent_io.h |  1 +
 2 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 624aabc3e6f..5f2fbf2054f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -90,6 +90,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 	tree->mapping = mapping;
 	INIT_LIST_HEAD(&tree->buffer_lru);
 	tree->lru_size = 0;
+	tree->last = NULL;
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
@@ -158,16 +159,23 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
-static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
+static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
 {
+	struct rb_root *root = &tree->state;
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
+	if (tree->last) {
+		struct extent_state *state;
+		state = tree->last;
+		if (state->start <= offset && offset <= state->end)
+			return &tree->last->rb_node;
+	}
 	while(n) {
 		entry = rb_entry(n, struct tree_entry, rb_node);
 		prev = n;
@@ -177,8 +185,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 			n = n->rb_left;
 		else if (offset > entry->end)
 			n = n->rb_right;
-		else
+		else {
+			tree->last = rb_entry(n, struct extent_state, rb_node);
 			return n;
+		}
 	}
 
 	if (prev_ret) {
@@ -202,14 +212,20 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
-static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+					  u64 offset)
 {
 	struct rb_node *prev = NULL;
 	struct rb_node *ret;
 
-	ret = __tree_search(root, offset, &prev, NULL);
-	if (!ret)
+	ret = __etree_search(tree, offset, &prev, NULL);
+	if (!ret) {
+		if (prev) {
+			tree->last = rb_entry(prev, struct extent_state,
+					      rb_node);
+		}
 		return prev;
+	}
 	return ret;
 }
 
@@ -238,6 +254,8 @@ static int merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			state->start = other->start;
 			other->tree = NULL;
+			if (tree->last == other)
+				tree->last = NULL;
 			rb_erase(&other->rb_node, &tree->state);
 			free_extent_state(other);
 		}
@@ -249,6 +267,8 @@ static int merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			other->start = state->start;
 			state->tree = NULL;
+			if (tree->last == state)
+				tree->last = NULL;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
 		}
@@ -311,6 +331,7 @@ static int insert_state(struct extent_io_tree *tree,
 		return -EEXIST;
 	}
 	state->tree = tree;
+	tree->last = state;
 	merge_state(tree, state);
 	return 0;
 }
@@ -375,6 +396,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
 		if (state->tree) {
+			if (tree->last == state)
+				tree->last = NULL;
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
 			free_extent_state(state);
@@ -422,7 +445,7 @@ again:
 	 * this search will find the extents that end after
 	 * our range starts
 	 */
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	if (!node)
 		goto out;
 	state = rb_entry(node, struct extent_state, rb_node);
@@ -533,7 +556,7 @@ again:
 		 * this search will find all the extents that end after
 		 * our range starts
 		 */
-		node = tree_search(&tree->state, start);
+		node = tree_search(tree, start);
 		if (!node)
 			break;
 
@@ -612,7 +635,7 @@ again:
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	if (!node) {
 		err = insert_state(tree, prealloc, start, end, bits);
 		prealloc = NULL;
@@ -915,7 +938,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	if (!node || IS_ERR(node)) {
 		goto out;
 	}
@@ -953,7 +976,7 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 search_again:
-	node = tree_search(&tree->state, cur_start);
+	node = tree_search(tree, cur_start);
 	if (!node || IS_ERR(node)) {
 		*end = (u64)-1;
 		goto out;
@@ -1041,7 +1064,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-	node = tree_search(&tree->state, cur_start);
+	node = tree_search(tree, cur_start);
 	if (!node || IS_ERR(node)) {
 		goto out;
 	}
@@ -1142,7 +1165,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	if (!node || IS_ERR(node)) {
 		ret = -ENOENT;
 		goto out;
@@ -1169,7 +1192,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	if (!node || IS_ERR(node)) {
 		ret = -ENOENT;
 		goto out;
@@ -1200,7 +1223,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	unsigned long flags;
 
 	spin_lock_irqsave(&tree->lock, flags);
-	node = tree_search(&tree->state, start);
+	node = tree_search(tree, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
 
@@ -1348,7 +1371,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 		spin_lock_irqsave(&tree->lock, flags);
 		if (!state || state->end != end) {
 			state = NULL;
-			node = __tree_search(&tree->state, start, NULL, NULL);
+			node = __etree_search(tree, start, NULL, NULL);
 			if (node) {
 				state = rb_entry(node, struct extent_state,
 						 rb_node);
@@ -1468,7 +1491,7 @@ static int end_bio_extent_readpage(struct bio *bio,
 		spin_lock_irqsave(&tree->lock, flags);
 		if (!state || state->end != end) {
 			state = NULL;
-			node = __tree_search(&tree->state, start, NULL, NULL);
+			node = __etree_search(tree, start, NULL, NULL);
 			if (node) {
 				state = rb_entry(node, struct extent_state,
 						 rb_node);
@@ -1631,7 +1654,7 @@ static int submit_one_bio(int rw, struct bio *bio)
 	end = start + bvec->bv_len - 1;
 
 	spin_lock_irq(&tree->lock);
-	node = __tree_search(&tree->state, start, NULL, NULL);
+	node = __etree_search(tree, start, NULL, NULL);
 	BUG_ON(!node);
 	state = rb_entry(node, struct extent_state, rb_node);
 	while(state->end < end) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6fd5e2c9061..160da3f9d78 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -48,6 +48,7 @@ struct extent_io_tree {
 	spinlock_t lru_lock;
 	struct list_head buffer_lru;
 	int lru_size;
+	struct extent_state *last;
 };
 
 struct extent_state {
-- 
cgit v1.2.3


From 47e4bb988c853d9af79d76fc5135aee9eeffed77 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 14:51:59 -0500
Subject: Btrfs: Insert extent record and the first backref in a single balance

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fa54ea59007..0fc8dfd58da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1664,12 +1664,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	u64 root_used;
 	u64 search_start = 0;
 	u64 new_hint;
+	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
-	struct btrfs_extent_item extent_item;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_ref *ref;
 	struct btrfs_path *path;
-
-	btrfs_set_stack_extent_refs(&extent_item, 1);
+	struct btrfs_key keys[2];
 
 	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
@@ -1707,20 +1708,37 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	WARN_ON(trans->alloc_exclude_nr);
 	trans->alloc_exclude_start = ins->objectid;
 	trans->alloc_exclude_nr = ins->offset;
-	ret = btrfs_insert_item(trans, extent_root, ins, &extent_item,
-				sizeof(extent_item));
 
-	trans->alloc_exclude_start = 0;
-	trans->alloc_exclude_nr = 0;
-	BUG_ON(ret);
+	memcpy(&keys[0], ins, sizeof(*ins));
+	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
+					 owner, owner_offset);
+	keys[1].objectid = ins->objectid;
+	keys[1].type = BTRFS_EXTENT_REF_KEY;
+	sizes[0] = sizeof(*extent_item);
+	sizes[1] = sizeof(*ref);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	ret = btrfs_insert_extent_backref(trans, extent_root, path,
-					  ins->objectid, root_objectid,
-					  ref_generation, owner, owner_offset);
+
+	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
+				       sizes, 2);
 
 	BUG_ON(ret);
+	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				     struct btrfs_extent_item);
+	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+			     struct btrfs_extent_ref);
+
+	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
+	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
+	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
+	btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
+	trans->alloc_exclude_start = 0;
+	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
-- 
cgit v1.2.3


From 21a4989d26d5ce43aac452fd67be592463a5996d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 14:51:59 -0500
Subject: Btrfs: Hash in the offset and owner for file extent backref keys

This makes searches for backrefs and backref insertion much more efficient
when there are many backrefs for a single extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0fc8dfd58da..8761aec59e3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -397,13 +397,12 @@ static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(ref_generation);
 	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
-
-#if 0
-	lenum = cpu_to_le64(owner);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
-	lenum = cpu_to_le64(owner_offset);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
-#endif
+	if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
+		lenum = cpu_to_le64(owner);
+		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+		lenum = cpu_to_le64(owner_offset);
+		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	}
 	return ((u64)high_crc << 32) | (u64)low_crc;
 }
 
-- 
cgit v1.2.3


From 60cde612c8eb51ba79dabb4088f149c6a9802fdf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 14:51:59 -0500
Subject: Btrfs: Use last_alloc optimizations for metadata, even without -o ssd

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8761aec59e3..6f161abe286 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1431,19 +1431,19 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
-	/* for SSD, cluster allocations together as much as possible */
-	if (btrfs_test_opt(root, SSD)) {
-		if (data)
-			last_ptr = &root->fs_info->last_data_alloc;
-		else
-			last_ptr = &root->fs_info->last_alloc;
-		if (*last_ptr) {
+	if (!data)
+		last_ptr = &root->fs_info->last_alloc;
+
+	if (data && btrfs_test_opt(root, SSD))
+		last_ptr = &root->fs_info->last_data_alloc;
+
+	if (last_ptr) {
+		if (*last_ptr)
 			hint_byte = *last_ptr;
-		}
 		else {
 			hint_byte = hint_byte &
 				~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
-			empty_size += 16 * 1024 * 1024;
+			empty_size += 2 * 1024 * 1024;
 		}
 	}
 
@@ -1473,9 +1473,8 @@ check_failed:
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data);
 
-	if (btrfs_test_opt(root, SSD) && *last_ptr &&
-	    search_start != *last_ptr) {
-		info->last_alloc = 0;
+	if (last_ptr && *last_ptr && search_start != *last_ptr) {
+		*last_ptr = 0;
 		if (!empty_size) {
 			empty_size += 16 * 1024 * 1024;
 			total_needed += empty_size;
@@ -1612,7 +1611,7 @@ check_pending:
 	}
 	ins->offset = num_bytes;
 	btrfs_free_path(path);
-	if (btrfs_test_opt(root, SSD))
+	if (last_ptr)
 		*last_ptr = ins->objectid + ins->offset;
 	return 0;
 
@@ -1827,8 +1826,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 			buf->start, buf->start + buf->len - 1,
 			EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
-	if (!btrfs_test_opt(root, SSD))
-		btrfs_set_buffer_defrag(buf);
+	btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
 }
-- 
cgit v1.2.3


From 2f375ab9c55bf22bf77ed9e3082b93e77ea10f55 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 1 Feb 2008 14:58:07 -0500
Subject: Call btrfs_cow_block while lowering tree level.

When freeing the root block of a tree, btrfs_free_extent's parameter
'ref_generation' is taken from the root block itself. When freeing a non-root
block, 'ref_generation' is taken from its parent. So when converting a
non-root block to the root block, we must guarantee its generation is
equal to its parent's generation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cf32651876b..ef8809c1e69 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -720,6 +720,9 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
 		BUG_ON(!child);
+		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+		BUG_ON(ret);
+
 		root->node = child;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
-- 
cgit v1.2.3


From ae9d12853b44f4e0c06732166e8e3399ec01a680 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 15:42:15 -0500
Subject: Btrfs: Fix delalloc account on state deletion

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5f2fbf2054f..5d2fb527fd9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -396,6 +396,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
 		if (state->tree) {
+			clear_state_cb(tree, state, state->state);
 			if (tree->last == state)
 				tree->last = NULL;
 			rb_erase(&state->rb_node, &tree->state);
-- 
cgit v1.2.3


From 80b6794d1153ed91a040d873396efb9bd60969fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Feb 2008 16:35:04 -0500
Subject: Btrfs: Lower stack usage in transaction.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3f64d0c7ddb..e9a0983897f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -33,7 +33,7 @@ static struct workqueue_struct *trans_wq;
 #define BTRFS_ROOT_TRANS_TAG 0
 #define BTRFS_ROOT_DEFRAG_TAG 1
 
-static void put_transaction(struct btrfs_transaction *transaction)
+static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(transaction->use_count == 0);
 	transaction->use_count--;
@@ -46,7 +46,7 @@ static void put_transaction(struct btrfs_transaction *transaction)
 	}
 }
 
-static int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
 	cur_trans = root->fs_info->running_transaction;
@@ -82,7 +82,7 @@ static int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
-static int record_root_in_trans(struct btrfs_root *root)
+static noinline int record_root_in_trans(struct btrfs_root *root)
 {
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
@@ -225,8 +225,8 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int wait_for_commit(struct btrfs_root *root,
-			   struct btrfs_transaction *commit)
+static noinline int wait_for_commit(struct btrfs_root *root,
+				    struct btrfs_transaction *commit)
 {
 	DEFINE_WAIT(wait);
 	mutex_lock(&root->fs_info->trans_mutex);
@@ -265,9 +265,9 @@ int btrfs_add_dead_root(struct btrfs_root *root,
 	return 0;
 }
 
-static int add_dirty_roots(struct btrfs_trans_handle *trans,
-			   struct radix_tree_root *radix,
-			   struct list_head *list)
+static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
+				    struct radix_tree_root *radix,
+				    struct list_head *list)
 {
 	struct dirty_root *dirty;
 	struct btrfs_root *gang[8];
@@ -406,8 +406,8 @@ int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
 	return err;
 }
 
-static int drop_dirty_roots(struct btrfs_root *tree_root,
-			    struct list_head *list)
+static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
+				     struct list_head *list)
 {
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
@@ -529,23 +529,28 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int create_pending_snapshot(struct btrfs_trans_handle *trans,
+static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
 {
 	struct btrfs_key key;
-	struct btrfs_root_item new_root_item;
+	struct btrfs_root_item *new_root_item;
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
 	struct extent_buffer *tmp;
 	int ret;
 	u64 objectid;
 
+	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
+	if (!new_root_item) {
+		ret = -ENOMEM;
+		goto fail;
+	}
 	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
 	if (ret)
 		goto fail;
 
-	memcpy(&new_root_item, &root->root_item, sizeof(new_root_item));
+	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
 	key.offset = 1;
@@ -557,10 +562,10 @@ static int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
 
-	btrfs_set_root_bytenr(&new_root_item, tmp->start);
-	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
+	btrfs_set_root_bytenr(new_root_item, tmp->start);
+	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				&new_root_item);
+				new_root_item);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
@@ -581,11 +586,12 @@ static int create_pending_snapshot(struct btrfs_trans_handle *trans,
 			     pending->name, strlen(pending->name), objectid,
 			     root->fs_info->sb->s_root->d_inode->i_ino);
 fail:
+	kfree(new_root_item);
 	return ret;
 }
 
-static int create_pending_snapshots(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info)
+static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
+					     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_pending_snapshot *pending;
 	struct list_head *head = &trans->transaction->pending_snapshots;
-- 
cgit v1.2.3


From c2e639f02cff721172569add21ba5d058fd1a81c Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 4 Feb 2008 08:57:25 -0500
Subject: Btrfs: Fix typo in extent_io.c ---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 4 ++--
 fs/btrfs/inode.c     | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5d2fb527fd9..36a943e5180 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1416,7 +1416,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 		 * to the correct offsets in the file
 		 */
 		if (state && (state->end + 1 != start ||
-		    !state->state & EXTENT_WRITEBACK)) {
+		    !(state->state & EXTENT_WRITEBACK))) {
 			state = NULL;
 		}
 		spin_unlock_irqrestore(&tree->lock, flags);
@@ -1539,7 +1539,7 @@ static int end_bio_extent_readpage(struct bio *bio,
 		 * to the correct offsets in the file
 		 */
 		if (state && (state->end + 1 != start ||
-		    !state->state & EXTENT_WRITEBACK)) {
+		    !(state->state & EXTENT_LOCKED))) {
 			state = NULL;
 		}
 		spin_unlock_irqrestore(&tree->lock, flags);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f13b1dbc061..44fc94fee2c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -364,8 +364,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
-
-	if (state->start == start) {
+	if (state && state->start == start) {
 		private = state->private;
 		ret = 0;
 	} else {
-- 
cgit v1.2.3


From f594706643607d7d24901479e985ec62e0ce129a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Feb 2008 10:10:13 -0500
Subject: Btrfs: Add debugging for block group update failure

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6f161abe286..a340eeedd6c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1751,7 +1751,11 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 update_block:
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
 				 data);
-	BUG_ON(ret);
+	if (ret) {
+		printk("update block group failed for %Lu %Lu\n",
+		       ins->objectid, ins->offset);
+		BUG();
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 47b0c4f8c717890877058f30e07a30e05f74a7bb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Feb 2008 10:10:43 -0500
Subject: Btrfs: Update magic

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 35e9a7af10a..dfb700efcd6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,7 +37,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
-#define BTRFS_MAGIC "_B3RfS_M"
+#define BTRFS_MAGIC "_B4RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
-- 
cgit v1.2.3


From dacb473ad81d5c1650cb73d1cb9b866202f6bab0 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 5 Feb 2008 09:07:49 -0500
Subject: Btrfs: Fix hole creation in file_write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7cbf8d8d7e2..184bde1cfc3 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -265,16 +265,14 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* FIXME...EIEIO, ENOSPC and more */
-
 	/* insert any holes we need to create */
-	if (inode->i_size < start_pos) {
+	if (isize < end_pos) {
 		u64 last_pos_in_file;
 		u64 hole_size;
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (end_pos - last_pos_in_file + mask) & ~mask;
-
-		if (last_pos_in_file < start_pos) {
+		if (last_pos_in_file < end_pos) {
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
-- 
cgit v1.2.3


From 068fe39fa18e8921062e2e16e6aa2b46768b2a35 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Feb 2008 10:20:17 -0500
Subject: Btrfs: Add checks for last byte in disk to allocator grouping

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a340eeedd6c..9e0869959cd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1611,8 +1611,13 @@ check_pending:
 	}
 	ins->offset = num_bytes;
 	btrfs_free_path(path);
-	if (last_ptr)
+	if (last_ptr) {
 		*last_ptr = ins->objectid + ins->offset;
+		if (*last_ptr ==
+		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
+			*last_ptr = 0;
+		}
+	}
 	return 0;
 
 new_group:
-- 
cgit v1.2.3


From 7261009ceaee65e4683be6e474d1ea85a1b1b26c Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 5 Feb 2008 15:40:36 -0500
Subject: btrfs_drop_extents: handle BTRFS_INODE_REF_KEY types. It's possible
 "key.type == BTRFS_INODE_REF_KEY" and "key.offset >= end".

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 184bde1cfc3..bfa4149c053 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -507,11 +507,12 @@ next_slot:
 		slot = path->slots[0];
 		ret = 0;
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-
-		if (key.offset >= end || key.objectid != inode->i_ino) {
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
+		    key.offset >= end) {
 			goto out;
 		}
-		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY) {
+		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid != inode->i_ino) {
 			goto out;
 		}
 		if (recow) {
-- 
cgit v1.2.3


From 5d196fc15dbe1cc54b81801694aafe218121e054 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 6 Feb 2008 10:00:57 -0500
Subject: Btrfs: Use 2MB as the empty_size for clustered allocations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9e0869959cd..220e9be9ad8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1476,7 +1476,7 @@ check_failed:
 	if (last_ptr && *last_ptr && search_start != *last_ptr) {
 		*last_ptr = 0;
 		if (!empty_size) {
-			empty_size += 16 * 1024 * 1024;
+			empty_size += 2 * 1024 * 1024;
 			total_needed += empty_size;
 		}
 		search_start = find_search_start(root, &block_group,
-- 
cgit v1.2.3


From 961d023213b94861480f5d6533f7adc8332eb527 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 6 Feb 2008 11:01:42 -0500
Subject: Btrfs: Don't cast unsigned long to int in bio submission

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 36a943e5180..b68e57f66a9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1705,7 +1705,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			return 0;
 		}
 	}
-	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
+	nr = bio_get_nr_vecs(bdev);
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
-- 
cgit v1.2.3


From a86c12c73d982c545a37a8ecdd66528ab260b770 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 7 Feb 2008 10:50:54 -0500
Subject: Btrfs: Create larger bios for btree blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 10 ++++++----
 fs/btrfs/extent_io.c | 12 +++++++++---
 fs/btrfs/extent_io.h |  3 ++-
 3 files changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dda8ad6c007..6f0825b617d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -197,7 +197,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1);
+	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
+				 btree_get_extent);
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -339,7 +340,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0);
+				 buf, 0, 0, btree_get_extent);
 	free_extent_buffer(buf);
 	return ret;
 }
@@ -358,7 +359,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1);
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
+				 btree_get_extent);
 
 	if (buf->flags & EXTENT_CSUM)
 		return buf;
@@ -1009,7 +1011,7 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-					buf, 0, 1);
+					buf, 0, 1, btree_get_extent);
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b68e57f66a9..fcae999ab71 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2859,8 +2859,8 @@ EXPORT_SYMBOL(extent_buffer_uptodate);
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb,
-			     u64 start,
-			     int wait)
+			     u64 start, int wait,
+			     get_extent_t *get_extent)
 {
 	unsigned long i;
 	unsigned long start_i;
@@ -2868,6 +2868,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int err;
 	int ret = 0;
 	unsigned long num_pages;
+	struct bio *bio = NULL;
+
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2899,7 +2901,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			lock_page(page);
 		}
 		if (!PageUptodate(page)) {
-			err = page->mapping->a_ops->readpage(NULL, page);
+			err = __extent_read_full_page(tree, page,
+						      get_extent, &bio);
 			if (err) {
 				ret = err;
 			}
@@ -2908,6 +2911,9 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		}
 	}
 
+	if (bio)
+		submit_one_bio(READ, bio);
+
 	if (ret || !wait) {
 		return ret;
 	}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 160da3f9d78..151fdada4dc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -157,7 +157,8 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					  gfp_t mask);
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb, u64 start, int wait);
+			     struct extent_buffer *eb, u64 start, int wait,
+			     get_extent_t *get_extent);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
-- 
cgit v1.2.3


From 9069218d448ea547dbad5f1cbd537e88d6519d66 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 8 Feb 2008 13:49:28 -0500
Subject: Btrfs: Fix i_blocks accounting

Now that delayed allocation accounting works, i_blocks accounting is changed
to only modify i_blocks when extents are inserted or removed.

The fillattr call is changed to include the delayed allocation byte count
in the i_blocks result.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       |  9 +++++++++
 fs/btrfs/file.c        | 17 +++++++++++------
 fs/btrfs/inode.c       | 43 +++++++++++++++++++++++++++++--------------
 4 files changed, 50 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9a0647dc5a0..9b9db9cbc01 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,6 +36,7 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	u64 delalloc_bytes;
 	u32 flags;
 };
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dfb700efcd6..52144b04f1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1161,6 +1161,15 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+static inline void dec_i_blocks(struct inode *inode, u64 dec)
+{
+	dec = dec >> 9;
+	if (dec <= inode->i_blocks)
+		inode->i_blocks -= dec;
+	else
+		inode->i_blocks = 0;
+}
+
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index bfa4149c053..f8939608254 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -175,6 +175,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 			leaf = path->nodes[0];
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			inode->i_blocks += (offset + size - found_end) >> 9;
 		}
 		if (found_end < offset) {
 			ptr = btrfs_file_extent_inline_start(ei) + found_size;
@@ -184,6 +185,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 insert:
 		btrfs_release_path(root, path);
 		datasize = offset + size - key.offset;
+		inode->i_blocks += datasize >> 9;
 		datasize = btrfs_file_extent_calc_inline_size(datasize);
 		ret = btrfs_insert_empty_item(trans, root, path, &key,
 					      datasize);
@@ -256,7 +258,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		goto out_unlock;
 	}
 	btrfs_set_trans_block_group(trans, inode);
-	inode->i_blocks += num_bytes >> 9;
 	hint_byte = 0;
 
 	if ((end_of_last_block & 4095) == 0) {
@@ -410,7 +411,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
 			goto out;
 
-		if (found_key.offset != last_offset) {
+		if (found_key.offset < last_offset) {
 			WARN_ON(1);
 			btrfs_print_leaf(root, leaf);
 			printk("inode %lu found offset %Lu expected %Lu\n",
@@ -435,7 +436,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		last_offset = extent_end;
 		path->slots[0]++;
 	}
-	if (last_offset < inode->i_size) {
+	if (0 && last_offset < inode->i_size) {
 		WARN_ON(1);
 		btrfs_print_leaf(root, leaf);
 		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
@@ -608,8 +609,7 @@ next_slot:
 								      extent);
 				if (btrfs_file_extent_disk_bytenr(leaf,
 								  extent)) {
-					inode->i_blocks -=
-						(old_num - new_num) >> 9;
+					dec_i_blocks(inode, old_num - new_num);
 				}
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
@@ -620,6 +620,8 @@ next_slot:
 				u32 new_size;
 				new_size = btrfs_file_extent_calc_inline_size(
 						   inline_limit - key.offset);
+				dec_i_blocks(inode, (extent_end - key.offset) -
+					(inline_limit - key.offset));
 				btrfs_truncate_item(trans, root, path,
 						    new_size, 1);
 			}
@@ -653,7 +655,7 @@ next_slot:
 			btrfs_release_path(root, path);
 			extent = NULL;
 			if (found_extent && disk_bytenr != 0) {
-				inode->i_blocks -= extent_num_bytes >> 9;
+				dec_i_blocks(inode, extent_num_bytes);
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr,
 						disk_num_bytes,
@@ -674,6 +676,8 @@ next_slot:
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
+			dec_i_blocks(inode, (extent_end - key.offset) -
+					(extent_end - end));
 			btrfs_truncate_item(trans, root, path, new_size, 0);
 		}
 		/* create bookend, splitting the extent in two */
@@ -718,6 +722,7 @@ next_slot:
 	}
 out:
 	btrfs_free_path(path);
+	btrfs_check_file(root, inode);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44fc94fee2c..913ab128eee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
 					       ins.offset);
+		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -142,6 +143,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	btrfs_drop_extent_cache(inode, orig_start,
 				orig_start + orig_num_bytes - 1);
 	btrfs_add_ordered_inode(inode);
+	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -265,6 +267,7 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		spin_lock(&root->fs_info->delalloc_lock);
+		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -281,8 +284,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			printk("warning: delalloc account %Lu %Lu\n",
 			       end - start + 1, root->fs_info->delalloc_bytes);
 			root->fs_info->delalloc_bytes = 0;
+			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
 			root->fs_info->delalloc_bytes -= end - start + 1;
+			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
@@ -833,32 +838,37 @@ search_again:
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
-					   extent_num_bytes) >> 9;
-				if (extent_start != 0) {
-					inode->i_blocks -= num_dec;
-				}
+					   extent_num_bytes);
+				if (extent_start != 0)
+					dec_i_blocks(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_bytes =
 					btrfs_file_extent_disk_num_bytes(leaf,
 									 fi);
 				/* FIXME blocksize != 4096 */
-				num_dec = btrfs_file_extent_num_bytes(leaf,
-								       fi) >> 9;
+				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					inode->i_blocks -= num_dec;
+					dec_i_blocks(inode, num_dec);
 				}
 				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
-		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE &&
-			   !del_item) {
-			u32 newsize = inode->i_size - found_key.offset;
-			newsize = btrfs_file_extent_calc_inline_size(newsize);
-			ret = btrfs_truncate_item(trans, root, path,
-						  newsize, 1);
-			BUG_ON(ret);
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			if (!del_item) {
+				u32 newsize = inode->i_size - found_key.offset;
+				dec_i_blocks(inode, item_end + 1 -
+					    found_key.offset - newsize);
+				newsize =
+				    btrfs_file_extent_calc_inline_size(newsize);
+				ret = btrfs_truncate_item(trans, root, path,
+							  newsize, 1);
+				BUG_ON(ret);
+			} else {
+				dec_i_blocks(inode, item_end + 1 -
+					     found_key.offset);
+			}
 		}
 delete:
 		if (del_item) {
@@ -1222,6 +1232,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
+	BTRFS_I(inode)->delalloc_bytes = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1528,6 +1539,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
 	if (mode & S_IFDIR)
@@ -1746,6 +1758,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2797,6 +2810,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 	stat->blksize = PAGE_CACHE_SIZE;
+	stat->blocks = inode->i_blocks + (BTRFS_I(inode)->delalloc_bytes >> 9);
 	return 0;
 }
 
@@ -2912,6 +2926,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
-- 
cgit v1.2.3


From 39b5637f6f195852259004bb27b58e2dcf9fb378 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Fri, 15 Feb 2008 10:40:50 -0500
Subject: Btrfs: Fix "no csum found for inode" issue.

A few pieces of code were not properly updated for the extent map
changes.  This may be the cause of the "no csum found for inode" issue.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 5 +++--
 fs/btrfs/file.c      | 5 ++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fcae999ab71..9262ab37a7c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2417,12 +2417,13 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	u64 orig_start = start;
 	int ret = 1;
-
 	if ((mask & __GFP_WAIT) &&
 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
+		u64 len;
 		while (start <= end) {
+			len = end - start + 1;
 			spin_lock(&map->lock);
-			em = lookup_extent_mapping(map, start, end);
+			em = lookup_extent_mapping(map, start, len);
 			if (!em || IS_ERR(em)) {
 				spin_unlock(&map->lock);
 				break;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f8939608254..b0352b5958a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -357,10 +357,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 {
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 len = end - start + 1;
 
+	if (end == (u64)-1)
+		len = (u64)-1;
 	while(1) {
 		spin_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, end);
+		em = lookup_extent_mapping(em_tree, start, len);
 		if (!em) {
 			spin_unlock(&em_tree->lock);
 			break;
-- 
cgit v1.2.3


From df68b8a7ad4a18c9e63f1c12015a59c3b7031adb Mon Sep 17 00:00:00 2001
From: David Miller <davem@davemloft.net>
Date: Fri, 15 Feb 2008 10:40:52 -0500
Subject: Btrfs: unaligned access fixes

Btrfs set/get macros lose type information needed to avoid
unaligned accesses on sparc64.
Here is a patch for the kernel bits which fixes most of the
unaligned accesses on sparc64.

btrfs_name_hash is modified to return the hash value instead
of getting a return location via a (potentially unaligned)
pointer.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        | 17 ++++++-----------
 fs/btrfs/dir-item.c     | 12 ++++--------
 fs/btrfs/hash.c         | 17 ++++++++---------
 fs/btrfs/hash.h         |  2 +-
 fs/btrfs/struct-funcs.c | 30 ++++++++++++++----------------
 5 files changed, 33 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 52144b04f1a..086cb0525cd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -495,22 +495,17 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 {									\
-	char *kaddr = kmap_atomic(eb->first_page, KM_USER0);		\
-	unsigned long offset = offsetof(type, member);			\
-	u##bits res;							\
-	__le##bits *tmp = (__le##bits *)(kaddr + offset);		\
-	res = le##bits##_to_cpu(*tmp);					\
-	kunmap_atomic(kaddr, KM_USER0);					\
+	type *p = kmap_atomic(eb->first_page, KM_USER0);		\
+	u##bits res = le##bits##_to_cpu(p->member);			\
+	kunmap_atomic(p, KM_USER0);					\
 	return res;							\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    u##bits val)			\
 {									\
-	char *kaddr = kmap_atomic(eb->first_page, KM_USER0);		\
-	unsigned long offset = offsetof(type, member);			\
-	__le##bits *tmp = (__le##bits *)(kaddr + offset);		\
-	*tmp = cpu_to_le##bits(val);					\
-	kunmap_atomic(kaddr, KM_USER0);					\
+	type *p = kmap_atomic(eb->first_page, KM_USER0);		\
+	p->member = cpu_to_le##bits(val);				\
+	kunmap_atomic(p, KM_USER0);					\
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 5247a9a41f0..7a73dc59dc4 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -71,8 +71,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
-	ret = btrfs_name_hash(name, name_len, &key.offset);
-	BUG_ON(ret);
+	key.offset = btrfs_name_hash(name, name_len);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -125,8 +124,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
-	ret = btrfs_name_hash(name, name_len, &key.offset);
-	BUG_ON(ret);
+	key.offset = btrfs_name_hash(name, name_len);
 	path = btrfs_alloc_path();
 	data_size = sizeof(*dir_item) + name_len;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
@@ -199,8 +197,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 
-	ret = btrfs_name_hash(name, name_len, &key.offset);
-	BUG_ON(ret);
+	key.offset = btrfs_name_hash(name, name_len);
 
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
@@ -261,8 +258,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
-	ret = btrfs_name_hash(name, name_len, &key.offset);
-	BUG_ON(ret);
+	key.offset = btrfs_name_hash(name, name_len);
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index e5c76903d72..d5252f5c9d0 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -76,19 +76,18 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 		*buf++ = pad;
 }
 
-int btrfs_name_hash(const char *name, int len, u64 *hash_result)
+u64 btrfs_name_hash(const char *name, int len)
 {
 	__u32	hash;
 	__u32	minor_hash = 0;
 	const char	*p;
 	__u32		in[8], buf[2];
+	u64		hash_result;
 
 	if (len == 1 && *name == '.') {
-		*hash_result = 1;
-		return 0;
+		return 1;
 	} else if (len == 2 && name[0] == '.' && name[1] == '.') {
-		*hash_result = 2;
-		return 0;
+		return 2;
 	}
 
 	/* Initialize the default seed for the hash checksum functions */
@@ -106,8 +105,8 @@ int btrfs_name_hash(const char *name, int len, u64 *hash_result)
 	}
 	hash = buf[0];
 	minor_hash = buf[1];
-	*hash_result = buf[0];
-	*hash_result <<= 32;
-	*hash_result |= buf[1];
-	return 0;
+	hash_result = buf[0];
+	hash_result <<= 32;
+	hash_result |= buf[1];
+	return hash_result;
 }
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index d3be0267058..868ee17ca77 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -18,5 +18,5 @@
 
 #ifndef __HASH__
 #define __HASH__
-int btrfs_name_hash(const char *name, int len, u64 *hash_result);
+u64 btrfs_name_hash(const char *name, int len);
 #endif
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c5715a60554..ad03a32d111 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -21,16 +21,15 @@
 u##bits btrfs_##name(struct extent_buffer *eb,				\
 				   type *s)				\
 {									\
-	unsigned long offset = (unsigned long)s +			\
-				offsetof(type, member);			\
-	__le##bits *tmp;						\
+	unsigned long part_offset = (unsigned long)s;			\
+	unsigned long offset = part_offset + offsetof(type, member);	\
+	type *p;							\
 	/* ugly, but we want the fast path here */			\
 	if (eb->map_token && offset >= eb->map_start &&			\
 	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
 	    eb->map_len) {						\
-		tmp = (__le##bits *)(eb->kaddr + offset -		\
-				     eb->map_start);			\
-		return le##bits##_to_cpu(*tmp);				\
+		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
+		return le##bits##_to_cpu(p->member);			\
 	}								\
 	{								\
 		int err;						\
@@ -48,8 +47,8 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 			read_eb_member(eb, s, type, member, &res);	\
 			return le##bits##_to_cpu(res);			\
 		}							\
-		tmp = (__le##bits *)(kaddr + offset - map_start);	\
-		res = le##bits##_to_cpu(*tmp);				\
+		p = (type *)(kaddr + part_offset - map_start);		\
+		res = le##bits##_to_cpu(p->member);			\
 		if (unmap_on_exit)					\
 			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 		return res;						\
@@ -58,16 +57,15 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 void btrfs_set_##name(struct extent_buffer *eb,				\
 				    type *s, u##bits val)		\
 {									\
-	unsigned long offset = (unsigned long)s +			\
-				offsetof(type, member);			\
-	__le##bits *tmp;						\
+	unsigned long part_offset = (unsigned long)s;			\
+	unsigned long offset = part_offset + offsetof(type, member);	\
+	type *p;							\
 	/* ugly, but we want the fast path here */			\
 	if (eb->map_token && offset >= eb->map_start &&			\
 	    offset + sizeof(((type *)0)->member) <= eb->map_start +	\
 	    eb->map_len) {						\
-		tmp = (__le##bits *)(eb->kaddr + offset -		\
-				     eb->map_start);			\
-		*tmp = cpu_to_le##bits(val);				\
+		p = (type *)(eb->kaddr + part_offset - eb->map_start);	\
+		p->member = cpu_to_le##bits(val);			\
 		return;							\
 	}								\
 	{								\
@@ -86,8 +84,8 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 			write_eb_member(eb, s, type, member, &val);	\
 			return;						\
 		}							\
-		tmp = (__le##bits *)(kaddr + offset - map_start);	\
-		*tmp = cpu_to_le##bits(val);				\
+		p = (type *)(kaddr + part_offset - map_start);		\
+		p->member = cpu_to_le##bits(val);			\
 		if (unmap_on_exit)					\
 			unmap_extent_buffer(eb, map_token, KM_USER1);	\
 	}								\
-- 
cgit v1.2.3


From 9afbb0b752ef30a429c45b9de6706e28ad1a36e1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Feb 2008 13:19:35 -0500
Subject: Btrfs: Disable tree defrag in SSD mode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 ++-
 fs/btrfs/tree-defrag.c | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 220e9be9ad8..af5d4a08528 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1835,7 +1835,8 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 			buf->start, buf->start + buf->len - 1,
 			EXTENT_CSUM, GFP_NOFS);
 	buf->flags |= EXTENT_CSUM;
-	btrfs_set_buffer_defrag(buf);
+	if (!btrfs_test_opt(root, SSD))
+		btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5c58630dce0..5935cbd8f2b 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -179,6 +179,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
 
+	if (btrfs_test_opt(root, SSD))
+		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3


From d7fc640e6fed46932f7c74e14f9b58b8637c66cf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 Feb 2008 12:12:38 -0500
Subject: Btrfs: Allocator improvements

Reduce CPU time searching for free blocks by optimizing find_first_extent_bit

Fix find_free_extent to make better use of the last_alloc hint.  Before it
was often finding blocks just before the hint.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/extent-tree.c | 34 ++++++++++++++++++++++++----------
 fs/btrfs/extent_io.c   | 38 ++++++++++++++++++++++++++++++++++----
 fs/btrfs/extent_io.h   | 11 +++++++++++
 4 files changed, 70 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f0825b617d..88e21bdbc47 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -951,7 +951,7 @@ void btrfs_throttle(struct btrfs_root *root)
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
 	balance_dirty_pages_ratelimited_nr(
-			root->fs_info->btree_inode->i_mapping, 1);
+				   root->fs_info->btree_inode->i_mapping, 1);
 }
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index af5d4a08528..239e9d8669c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -188,9 +188,10 @@ static u64 noinline find_search_start(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
+	struct extent_io_tree *free_space_cache;
+	struct extent_state *state;
 	u64 last;
 	u64 start = 0;
-	u64 end = 0;
 	u64 cache_miss = 0;
 	u64 total_fs_bytes;
 	int wrapped = 0;
@@ -199,6 +200,8 @@ static u64 noinline find_search_start(struct btrfs_root *root,
 		goto out;
 	}
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	free_space_cache = &root->fs_info->free_space_cache;
+
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
@@ -206,22 +209,27 @@ again:
 
 	last = max(search_start, cache->key.objectid);
 
+	spin_lock_irq(&free_space_cache->lock);
+	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
 	while(1) {
-		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
-					    last, &start, &end, EXTENT_DIRTY);
-		if (ret) {
+		if (!state) {
 			if (!cache_miss)
 				cache_miss = last;
+			spin_unlock_irq(&free_space_cache->lock);
 			goto new_group;
 		}
 
-		start = max(last, start);
-		last = end + 1;
+		start = max(last, state->start);
+		last = state->end + 1;
 		if (last - start < num) {
 			if (last == cache->key.objectid + cache->key.offset)
 				cache_miss = start;
+			do {
+				state = extent_state_next(state);
+			} while(state && !(state->state & EXTENT_DIRTY));
 			continue;
 		}
+		spin_unlock_irq(&free_space_cache->lock);
 		if (data != BTRFS_BLOCK_GROUP_MIXED &&
 		    start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
@@ -1420,6 +1428,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	int empty_cluster;
 	u64 cached_start;
 
 	WARN_ON(num_bytes < root->sectorsize);
@@ -1431,11 +1440,15 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
-	if (!data)
+	if (!data) {
 		last_ptr = &root->fs_info->last_alloc;
+		empty_cluster = 128 * 1024;
+	}
 
-	if (data && btrfs_test_opt(root, SSD))
+	if (data && btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->last_data_alloc;
+		empty_cluster = 2 * 1024 * 1024;
+	}
 
 	if (last_ptr) {
 		if (*last_ptr)
@@ -1443,8 +1456,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		else {
 			hint_byte = hint_byte &
 				~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
-			empty_size += 2 * 1024 * 1024;
+			empty_size += empty_cluster;
 		}
+		search_start = max(search_start, hint_byte);
 	}
 
 	search_end = min(search_end,
@@ -1476,7 +1490,7 @@ check_failed:
 	if (last_ptr && *last_ptr && search_start != *last_ptr) {
 		*last_ptr = 0;
 		if (!empty_size) {
-			empty_size += 2 * 1024 * 1024;
+			empty_size += empty_cluster;
 			total_needed += empty_size;
 		}
 		search_start = find_search_start(root, &block_group,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9262ab37a7c..fb6400895ed 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -255,7 +255,7 @@ static int merge_state(struct extent_io_tree *tree,
 			state->start = other->start;
 			other->tree = NULL;
 			if (tree->last == other)
-				tree->last = NULL;
+				tree->last = state;
 			rb_erase(&other->rb_node, &tree->state);
 			free_extent_state(other);
 		}
@@ -268,7 +268,7 @@ static int merge_state(struct extent_io_tree *tree,
 			other->start = state->start;
 			state->tree = NULL;
 			if (tree->last == state)
-				tree->last = NULL;
+				tree->last = other;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
 		}
@@ -397,8 +397,9 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	if (delete || state->state == 0) {
 		if (state->tree) {
 			clear_state_cb(tree, state, state->state);
-			if (tree->last == state)
-				tree->last = NULL;
+			if (tree->last == state) {
+				tree->last = extent_state_next(state);
+			}
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
 			free_extent_state(state);
@@ -962,6 +963,35 @@ out:
 }
 EXPORT_SYMBOL(find_first_extent_bit);
 
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+						 u64 start, int bits)
+{
+	struct rb_node *node;
+	struct extent_state *state;
+
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node || IS_ERR(node)) {
+		goto out;
+	}
+
+	while(1) {
+		state = rb_entry(node, struct extent_state, rb_node);
+		if (state->end >= start && (state->state & bits)) {
+			return state;
+		}
+		node = rb_next(node);
+		if (!node)
+			break;
+	}
+out:
+	return NULL;
+}
+EXPORT_SYMBOL(find_first_extent_bit_state);
+
 u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 			     u64 *start, u64 *end, u64 max_bytes)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 151fdada4dc..fcc4bb078c2 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -81,6 +81,15 @@ struct extent_buffer {
 
 struct extent_map_tree;
 
+static inline struct extent_state *extent_state_next(struct extent_state *state)
+{
+	struct rb_node *node;
+	node = rb_next(&state->rb_node);
+	if (!node)
+		return NULL;
+	return rb_entry(node, struct extent_state, rb_node);
+}
+
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
 					  struct page *page,
 					  size_t page_offset,
@@ -122,6 +131,8 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
+struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
+						 u64 start, int bits);
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
-- 
cgit v1.2.3


From 952fccac50350481742425cac0c80f36ba8b83f2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 Feb 2008 16:33:44 -0500
Subject: Btrfs: Remove extent back refs in batches, and avoid duplicate
 searches

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 239e9d8669c..8f441783a6e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1251,6 +1251,9 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_root *extent_root = info->extent_root;
 	struct extent_buffer *leaf;
 	int ret;
+	int extent_slot = 0;
+	int found_extent = 0;
+	int num_to_del = 1;
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
@@ -1267,7 +1270,24 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 				    ref_generation,
 				    owner_objectid, owner_offset, 1);
 	if (ret == 0) {
-		ret = btrfs_del_item(trans, extent_root, path);
+		struct btrfs_key found_key;
+		extent_slot = path->slots[0];
+		while(extent_slot > 0) {
+			extent_slot--;
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      extent_slot);
+			if (found_key.objectid != bytenr)
+				break;
+			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    found_key.offset == num_bytes) {
+				found_extent = 1;
+				break;
+			}
+			if (path->slots[0] - extent_slot > 5)
+				break;
+		}
+		if (!found_extent)
+			ret = btrfs_del_item(trans, extent_root, path);
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
@@ -1276,21 +1296,46 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		       root_objectid, ref_generation, owner_objectid,
 		       owner_offset);
 	}
-	btrfs_release_path(extent_root, path);
-	ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-	if (ret < 0)
-		return ret;
-	BUG_ON(ret);
+	if (!found_extent) {
+		btrfs_release_path(extent_root, path);
+		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+		if (ret < 0)
+			return ret;
+		BUG_ON(ret);
+		extent_slot = path->slots[0];
+	}
 
 	leaf = path->nodes[0];
-	ei = btrfs_item_ptr(leaf, path->slots[0],
+	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
 	BUG_ON(refs == 0);
 	refs -= 1;
 	btrfs_set_extent_refs(leaf, ei, refs);
+
 	btrfs_mark_buffer_dirty(leaf);
 
+	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+		/* if the back ref and the extent are next to each other
+		 * they get deleted below in one shot
+		 */
+		path->slots[0] = extent_slot;
+		num_to_del = 2;
+	} else if (found_extent) {
+		/* otherwise delete the extent back ref */
+		ret = btrfs_del_item(trans, extent_root, path);
+		BUG_ON(ret);
+		/* if refs are 0, we need to setup the path for deletion */
+		if (refs == 0) {
+			btrfs_release_path(extent_root, path);
+			ret = btrfs_search_slot(trans, extent_root, &key, path,
+						-1, 1);
+			if (ret < 0)
+				return ret;
+			BUG_ON(ret);
+		}
+	}
+
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
@@ -1311,8 +1356,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		root_used = btrfs_root_used(&root->root_item);
 		btrfs_set_root_used(&root->root_item,
 					   root_used - num_bytes);
-
-		ret = btrfs_del_item(trans, extent_root, path);
+		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
+				      num_to_del);
 		if (ret) {
 			return ret;
 		}
-- 
cgit v1.2.3


From 0762704b196d41941a9b439e9165efaf85c6609e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 19 Feb 2008 11:29:24 -0500
Subject: Btrfs: Properly clear dirty and delalloc extent bits while preparing
 the file for write

Yan Zheng noticed that we don't clear the extent state tree dirty and delalloc
bits when we clear the dirty bits on the page during file write.

This leads to csum errors later on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b0352b5958a..1a47251a9d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -762,6 +762,13 @@ static int prepare_pages(struct btrfs_root *root, struct file *file,
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
+	if (start_pos < inode->i_size) {
+		u64 last_pos;
+		last_pos = (index + num_pages) << PAGE_CACHE_SHIFT;
+		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
+				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+				  GFP_NOFS);
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 5e591a0703bc03047d37938586f97fd8ca960c08 Mon Sep 17 00:00:00 2001
From: Yan Zheng <yanzheng@21cn.com>
Date: Tue, 19 Feb 2008 11:41:02 -0500
Subject: Btrfs: Fix looping on readdir of the subvol roots

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 913ab128eee..3e79572a5cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1465,7 +1465,10 @@ read_dir_items:
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
 	}
-	filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+	if (key_type == BTRFS_DIR_INDEX_KEY)
+		filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+	else
+		filp->f_pos++;
 nopos:
 	ret = 0;
 err:
-- 
cgit v1.2.3


From d99cb30a11ea117ad0f58cda09aa43438ebbf81c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 19 Feb 2008 12:55:05 -0500
Subject: Btrfs: Take the extent lock before dropping the delalloc bits

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1a47251a9d4..51466dcd44e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -765,9 +765,13 @@ static int prepare_pages(struct btrfs_root *root, struct file *file,
 	if (start_pos < inode->i_size) {
 		u64 last_pos;
 		last_pos = (index + num_pages) << PAGE_CACHE_SHIFT;
+		lock_extent(&BTRFS_I(inode)->io_tree,
+			    start_pos, last_pos - 1, GFP_NOFS);
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
 				  GFP_NOFS);
+		unlock_extent(&BTRFS_I(inode)->io_tree,
+			      start_pos, last_pos - 1, GFP_NOFS);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 0740c82bc7585a23e100c9a59d5e2abbf99459fb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 19 Feb 2008 16:24:18 -0500
Subject: Btrfs: Properly cast before shifting

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 51466dcd44e..5fa450452f9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -764,7 +764,7 @@ static int prepare_pages(struct btrfs_root *root, struct file *file,
 	}
 	if (start_pos < inode->i_size) {
 		u64 last_pos;
-		last_pos = (index + num_pages) << PAGE_CACHE_SHIFT;
+		last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
-- 
cgit v1.2.3


From 065631f6dccea07bfad48d8981369f6d9cfd6e2b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Feb 2008 12:07:25 -0500
Subject: Btrfs: checksum file data at bio submission time instead of during
 writepage

When we checksum file data during writepage, the checksumming is done one
page at a time, making it difficult to do bulk metadata modifications
to insert checksums for large ranges of the file at once.

This patch changes btrfs to checksum on a per-bio basis instead.  The
bios are checksummed before they are handed off to the block layer, so
each bio is contiguous and only has pages from the same inode.

Checksumming on a bio basis allows us to insert and modify the file
checksum items in large groups.  It also allows the checksumming to
be done more easily by async worker threads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  8 +++-----
 fs/btrfs/extent_io.c |  2 ++
 fs/btrfs/extent_io.h |  1 +
 fs/btrfs/file-item.c | 50 +++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/inode.c     | 32 ++++++++++++++++++++++++++++++--
 5 files changed, 75 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 086cb0525cd..92d892f9207 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1142,11 +1142,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
-int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct inode *inode,
-			  u64 objectid, u64 offset,
-			  char *data, size_t len);
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct inode *inode,
+			   struct bio *bio);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fb6400895ed..e8130c87633 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1705,6 +1705,8 @@ static int submit_one_bio(int rw, struct bio *bio)
 			(unsigned long long)bio->bi_sector);
 		WARN_ON(1);
 	}
+	if (tree->ops && tree->ops->submit_bio_hook)
+		tree->ops->submit_bio_hook(rw, bio);
 
 	submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fcc4bb078c2..9d665466708 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,6 +28,7 @@ struct extent_state;
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*submit_bio_hook)(int rw, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3ebbc058d08..3f0e71b0e5d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -16,6 +16,9 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -131,28 +134,35 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_csum_file_block(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct inode *inode,
-			  u64 objectid, u64 offset,
-			  char *data, size_t len)
+int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct inode *inode,
+			   struct bio *bio)
 {
+	u64 objectid = inode->i_ino;
+	u64 offset;
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
-	u64 next_offset = (u64)-1;
-	int found_next = 0;
+	u64 next_offset;
+	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
+	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
-	u32 csum_result = ~(u32)0;
+	u32 csum_result;
 	u32 nritems;
 	u32 ins_size;
+	int bio_index = 0;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	char *data;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-
+again:
+	next_offset = (u64)-1;
+	found_next = 0;
+	offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
@@ -259,7 +269,15 @@ csum:
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * BTRFS_CRC32_SIZE);
 found:
-	csum_result = btrfs_csum_data(root, data, csum_result, len);
+	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
+	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
+				      btrfs_item_size_nr(leaf, path->slots[0]));
+next_bvec:
+	data = kmap_atomic(bvec->bv_page, KM_IRQ0);
+	csum_result = ~(u32)0;
+	csum_result = btrfs_csum_data(root, data + bvec->bv_offset,
+				      csum_result, bvec->bv_len);
+	kunmap_atomic(data, KM_IRQ0);
 	btrfs_csum_final(csum_result, (char *)&csum_result);
 	if (csum_result == 0) {
 		printk("csum result is 0 for inode %lu offset %Lu\n", inode->i_ino, offset);
@@ -267,9 +285,19 @@ found:
 
 	write_extent_buffer(leaf, &csum_result, (unsigned long)item,
 			    BTRFS_CRC32_SIZE);
+	bio_index++;
+	bvec++;
+	if (bio_index < bio->bi_vcnt) {
+		item = (struct btrfs_csum_item *)((char *)item + BTRFS_CRC32_SIZE);
+		if (item < item_end)
+			goto next_bvec;
+	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	if (bio_index < bio->bi_vcnt) {
+		btrfs_release_path(root, path);
+		goto again;
+	}
 fail:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e79572a5cb..bbb71fad861 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
@@ -294,6 +295,32 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+int btrfs_submit_bio_hook(int rw, struct bio *bio)
+{
+	// struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct inode *inode = bvec->bv_page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (rw != WRITE)
+		return 0;
+
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	btrfs_csum_file_blocks(trans, root, inode, bio);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+#if 0
 int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
@@ -318,7 +345,7 @@ int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
-
+#endif
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -3022,7 +3049,8 @@ static struct file_operations btrfs_dir_file_operations = {
 
 static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
-	.writepage_io_hook = btrfs_writepage_io_hook,
+	// .writepage_io_hook = btrfs_writepage_io_hook,
+	.submit_bio_hook = btrfs_submit_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
-- 
cgit v1.2.3


From 6e92f5e651a34f24ab31ebdf3f113c7d23a36000 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Feb 2008 12:07:25 -0500
Subject: Btrfs: While doing checksums on bios, cache the extent_buffer mapping

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3f0e71b0e5d..ee25e50e7c0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -156,6 +156,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	int bio_index = 0;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	char *data;
+	char *eb_map;
+	char *eb_token;
+	unsigned long map_len;
+	unsigned long map_start;
+
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -272,6 +277,7 @@ found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
+	eb_token = NULL;
 next_bvec:
 	data = kmap_atomic(bvec->bv_page, KM_IRQ0);
 	csum_result = ~(u32)0;
@@ -283,15 +289,39 @@ next_bvec:
 		printk("csum result is 0 for inode %lu offset %Lu\n", inode->i_ino, offset);
 	}
 
-	write_extent_buffer(leaf, &csum_result, (unsigned long)item,
-			    BTRFS_CRC32_SIZE);
+	if (!eb_token ||
+	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
+		int err;
+
+		if (eb_token)
+			unmap_extent_buffer(leaf, eb_token, KM_IRQ1);
+		eb_token = NULL;
+		err = map_private_extent_buffer(leaf, (unsigned long)item,
+						BTRFS_CRC32_SIZE,
+						&eb_token, &eb_map,
+						&map_start, &map_len, KM_IRQ1);
+		if (err)
+			eb_token = NULL;
+	}
+	if (eb_token) {
+		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
+		       &csum_result, BTRFS_CRC32_SIZE);
+	} else {
+		write_extent_buffer(leaf, &csum_result, (unsigned long)item,
+				    BTRFS_CRC32_SIZE);
+	}
 	bio_index++;
 	bvec++;
 	if (bio_index < bio->bi_vcnt) {
-		item = (struct btrfs_csum_item *)((char *)item + BTRFS_CRC32_SIZE);
+		item = (struct btrfs_csum_item *)((char *)item +
+						  BTRFS_CRC32_SIZE);
 		if (item < item_end)
 			goto next_bvec;
 	}
+	if (eb_token) {
+		unmap_extent_buffer(leaf, eb_token, KM_IRQ1);
+		eb_token = NULL;
+	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (bio_index < bio->bi_vcnt) {
 		btrfs_release_path(root, path);
-- 
cgit v1.2.3


From e3fe4e7120bc753552b071773022efcff704e34b Mon Sep 17 00:00:00 2001
From: Greg KH <greg@kroah.com>
Date: Wed, 20 Feb 2008 14:14:16 -0500
Subject: btrfs: fixes for kobject changes in mainline

Here's a patch against the unstable tree that gets the code to build
against Linus's current tree (2.6.24-git12).  This is needed as the
kobject/kset api has changed there.

I tried to make the smallest changes needed, and it builds and loads
successfully, but I don't have a btrfs volume anywhere (yet) to try to
see if things still work properly :)

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/sysfs.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index cd673ca9bb9..973d56e0569 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -184,7 +184,8 @@ static struct kobj_type btrfs_super_ktype = {
 	.release	= btrfs_super_release,
 };
 
-static struct kset btrfs_kset;
+/* /sys/fs/btrfs/ entry */
+static struct kset *btrfs_kset;
 
 int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 {
@@ -208,14 +209,9 @@ int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 	}
 	name[len] = '\0';
 
-	fs->super_kobj.kset = &btrfs_kset;
-	fs->super_kobj.ktype = &btrfs_super_ktype;
-
-	error = kobject_set_name(&fs->super_kobj, "%s", name);
-	if (error)
-		goto fail;
-
-	error = kobject_register(&fs->super_kobj);
+	fs->super_kobj.kset = btrfs_kset;
+	error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
+				     NULL, "%s", name);
 	if (error)
 		goto fail;
 
@@ -232,15 +228,9 @@ int btrfs_sysfs_add_root(struct btrfs_root *root)
 {
 	int error;
 
-	root->root_kobj.ktype = &btrfs_root_ktype;
-	root->root_kobj.parent = &root->fs_info->super_kobj;
-
-	error = kobject_set_name(&root->root_kobj, "%s", root->name);
-	if (error) {
-		goto fail;
-	}
-
-	error = kobject_register(&root->root_kobj);
+	error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
+				     &root->fs_info->super_kobj,
+				     "%s", root->name);
 	if (error)
 		goto fail;
 
@@ -253,24 +243,25 @@ fail:
 
 void btrfs_sysfs_del_root(struct btrfs_root *root)
 {
-	kobject_unregister(&root->root_kobj);
+	kobject_put(&root->root_kobj);
 	wait_for_completion(&root->kobj_unregister);
 }
 
 void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
 {
-	kobject_unregister(&fs->super_kobj);
+	kobject_put(&fs->super_kobj);
 	wait_for_completion(&fs->kobj_unregister);
 }
 
 int btrfs_init_sysfs()
 {
-	kobj_set_kset_s(&btrfs_kset, fs_subsys);
-	kobject_set_name(&btrfs_kset.kobj, "btrfs");
-	return kset_register(&btrfs_kset);
+	btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
+	if (!btrfs_kset)
+		return -ENOMEM;
+	return 0;
 }
 
 void btrfs_exit_sysfs()
 {
-	kset_unregister(&btrfs_kset);
+	kset_unregister(btrfs_kset);
 }
-- 
cgit v1.2.3


From 6885f308b557020efce6ef9c1fcc96adaabecbb6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Feb 2008 16:11:05 -0500
Subject: Btrfs: Misc 2.6.25 updates

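Only register the btrfs read_inode method on kernels older than 2.6.25;
on 2.6.25 and later, call save_mount_options at mount time and use
generic_show_options for show_options.  Also drop the disabled
btrfs_writepage_io_hook code.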

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 29 +----------------------------
 fs/btrfs/super.c | 11 ++++++++++-
 2 files changed, 11 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bbb71fad861..315dd550d56 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -297,7 +297,6 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 
 int btrfs_submit_bio_hook(int rw, struct bio *bio)
 {
-	// struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	struct inode *inode = bvec->bv_page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -320,32 +319,7 @@ int btrfs_submit_bio_hook(int rw, struct bio *bio)
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
-#if 0
-int btrfs_writepage_io_hook(struct page *page, u64 start, u64 end)
-{
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-	char *kaddr;
-	int ret = 0;
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	size_t offset = start - page_start;
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-	kaddr = kmap(page);
-	btrfs_csum_file_block(trans, root, inode, inode->i_ino,
-			      start, kaddr + offset, end - start + 1);
-	kunmap(page);
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-#endif
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -3049,7 +3023,6 @@ static struct file_operations btrfs_dir_file_operations = {
 
 static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
-	// .writepage_io_hook = btrfs_writepage_io_hook,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a46300c4753..4423a91206a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -272,6 +272,11 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 
 	sb->s_root = root_dentry;
 	btrfs_transaction_queue_work(tree_root, HZ * 30);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
+	save_mount_options(sb, data);
+#endif
+
 	return 0;
 
 fail_close:
@@ -462,9 +467,13 @@ static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
 	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
-	.read_inode	= btrfs_read_locked_inode,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	.read_inode     = btrfs_read_locked_inode,
+#else
+	.show_options	= generic_show_options,
+#endif
 	.write_inode	= btrfs_write_inode,
 	.dirty_inode	= btrfs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
-- 
cgit v1.2.3


From 2e1a992e3103624af48f1501aaad4e4d8317f88c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Feb 2008 15:44:32 -0500
Subject: Btrfs: Make sure bio pages are adjacent during bulk csumming

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index ee25e50e7c0..e17cce33fe9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -161,7 +161,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	unsigned long map_len;
 	unsigned long map_start;
 
-
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 again:
@@ -315,8 +314,11 @@ next_bvec:
 	if (bio_index < bio->bi_vcnt) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  BTRFS_CRC32_SIZE);
-		if (item < item_end)
+		if (item < item_end && offset + PAGE_CACHE_SIZE ==
+		    page_offset(bvec->bv_page)) {
+			offset = page_offset(bvec->bv_page);
 			goto next_bvec;
+		}
 	}
 	if (eb_token) {
 		unmap_extent_buffer(leaf, eb_token, KM_IRQ1);
-- 
cgit v1.2.3


From 55d474148982785dd8995dc690febad3a16a2c46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Feb 2008 16:02:51 -0500
Subject: Btrfs: Disable sysfs files on older kernels

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/sysfs.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 973d56e0569..1cf4f4f8d2c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,6 +28,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
@@ -265,3 +266,36 @@ void btrfs_exit_sysfs()
 {
 	kset_unregister(btrfs_kset);
 }
+
+#else
+
+int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
+{
+	return 0;
+}
+
+int btrfs_sysfs_add_root(struct btrfs_root *root)
+{
+	return 0;
+}
+
+void btrfs_sysfs_del_root(struct btrfs_root *root)
+{
+	return;
+}
+
+void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
+{
+	return;
+}
+
+int btrfs_init_sysfs()
+{
+	return 0;
+}
+
+void btrfs_exit_sysfs()
+{
+	return;
+}
+#endif
-- 
cgit v1.2.3


From eb20978f318ab5e360ef9c1b24b5dea14d0fee6a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 21 Feb 2008 09:30:08 -0500
Subject: Btrfs: Use KM_USERN instead of KM_IRQ during data summing

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index e17cce33fe9..374d0245b29 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -278,11 +278,11 @@ found:
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
 next_bvec:
-	data = kmap_atomic(bvec->bv_page, KM_IRQ0);
+	data = kmap_atomic(bvec->bv_page, KM_USER0);
 	csum_result = ~(u32)0;
 	csum_result = btrfs_csum_data(root, data + bvec->bv_offset,
 				      csum_result, bvec->bv_len);
-	kunmap_atomic(data, KM_IRQ0);
+	kunmap_atomic(data, KM_USER0);
 	btrfs_csum_final(csum_result, (char *)&csum_result);
 	if (csum_result == 0) {
 		printk("csum result is 0 for inode %lu offset %Lu\n", inode->i_ino, offset);
@@ -293,12 +293,12 @@ next_bvec:
 		int err;
 
 		if (eb_token)
-			unmap_extent_buffer(leaf, eb_token, KM_IRQ1);
+			unmap_extent_buffer(leaf, eb_token, KM_USER1);
 		eb_token = NULL;
 		err = map_private_extent_buffer(leaf, (unsigned long)item,
 						BTRFS_CRC32_SIZE,
 						&eb_token, &eb_map,
-						&map_start, &map_len, KM_IRQ1);
+						&map_start, &map_len, KM_USER1);
 		if (err)
 			eb_token = NULL;
 	}
@@ -321,7 +321,7 @@ next_bvec:
 		}
 	}
 	if (eb_token) {
-		unmap_extent_buffer(leaf, eb_token, KM_IRQ1);
+		unmap_extent_buffer(leaf, eb_token, KM_USER1);
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-- 
cgit v1.2.3


From 7f93bf8d27653726e3721c01fefc523487ecf2af Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:28 -0400
Subject: Match the extent tree code to btrfs-progs for multi-device merging

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 59 ++++++--------------------------------------------
 1 file changed, 7 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8f441783a6e..ebfd304138c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -209,27 +209,22 @@ again:
 
 	last = max(search_start, cache->key.objectid);
 
-	spin_lock_irq(&free_space_cache->lock);
-	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
 	while(1) {
-		if (!state) {
+		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
+					    last, &start, &end, EXTENT_DIRTY);
+		if (ret) {
 			if (!cache_miss)
 				cache_miss = last;
-			spin_unlock_irq(&free_space_cache->lock);
 			goto new_group;
 		}
 
-		start = max(last, state->start);
-		last = state->end + 1;
+		start = max(last, start);
+		last = end + 1;
 		if (last - start < num) {
 			if (last == cache->key.objectid + cache->key.offset)
 				cache_miss = start;
-			do {
-				state = extent_state_next(state);
-			} while(state && !(state->state & EXTENT_DIRTY));
 			continue;
 		}
-		spin_unlock_irq(&free_space_cache->lock);
 		if (data != BTRFS_BLOCK_GROUP_MIXED &&
 		    start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
@@ -1485,29 +1480,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_MIXED;
 	}
 
-	if (!data) {
-		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 128 * 1024;
-	}
-
-	if (data && btrfs_test_opt(root, SSD)) {
-		last_ptr = &root->fs_info->last_data_alloc;
-		empty_cluster = 2 * 1024 * 1024;
-	}
-
-	if (last_ptr) {
-		if (*last_ptr)
-			hint_byte = *last_ptr;
-		else {
-			hint_byte = hint_byte &
-				~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
-			empty_size += empty_cluster;
-		}
-		search_start = max(search_start, hint_byte);
-	}
-
-	search_end = min(search_end,
-			 btrfs_super_total_bytes(&info->super_copy));
+	if (search_end == (u64)-1)
+		search_end = btrfs_super_total_bytes(&info->super_copy);
 	if (hint_byte) {
 		block_group = btrfs_lookup_block_group(info, hint_byte);
 		if (!block_group)
@@ -1531,18 +1505,6 @@ check_failed:
 	}
 	search_start = find_search_start(root, &block_group, search_start,
 					 total_needed, data);
-
-	if (last_ptr && *last_ptr && search_start != *last_ptr) {
-		*last_ptr = 0;
-		if (!empty_size) {
-			empty_size += empty_cluster;
-			total_needed += empty_size;
-		}
-		search_start = find_search_start(root, &block_group,
-						 search_start, total_needed,
-						 data);
-	}
-
 	search_start = stripe_align(root, search_start);
 	cached_start = search_start;
 	btrfs_init_path(path);
@@ -1670,13 +1632,6 @@ check_pending:
 	}
 	ins->offset = num_bytes;
 	btrfs_free_path(path);
-	if (last_ptr) {
-		*last_ptr = ins->objectid + ins->offset;
-		if (*last_ptr ==
-		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
-			*last_ptr = 0;
-		}
-	}
 	return 0;
 
 new_group:
-- 
cgit v1.2.3


From 0b86a832a1f38abec695864ec2eaedc9d2383f1b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:56 -0400
Subject: Btrfs: Add support for multiple devices per filesystem

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.c       |  38 ++-
 fs/btrfs/ctree.h       | 313 ++++++++++++++++--
 fs/btrfs/disk-io.c     |  97 +++++-
 fs/btrfs/disk-io.h     |   2 +
 fs/btrfs/extent-tree.c | 429 +++++++------------------
 fs/btrfs/extent_io.c   |   6 +-
 fs/btrfs/extent_io.h   |   2 +-
 fs/btrfs/inode.c       |  26 +-
 fs/btrfs/print-tree.c  |  48 +++
 fs/btrfs/transaction.c |  50 ++-
 fs/btrfs/volumes.c     | 852 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h     |  78 +++++
 13 files changed, 1563 insertions(+), 380 deletions(-)
 create mode 100644 fs/btrfs/volumes.c
 create mode 100644 fs/btrfs/volumes.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9c61609f371..5ac6f2946c2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o
+	   extent_io.o volumes.o
 
 ifeq ($(CONFIG_FS_POSIX_ACL),y)
 btrfs-y += acl.o
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ef8809c1e69..70b6ddfe15a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -70,6 +70,14 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	memset(p, 0, sizeof(*p));
 }
 
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+	if (root->track_dirty && list_empty(&root->dirty_list)) {
+		list_add(&root->dirty_list,
+			 &root->fs_info->dirty_cowonly_roots);
+	}
+}
+
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -196,6 +204,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					  root_gen, 0, 0, 1);
 		}
 		free_extent_buffer(buf);
+		add_root_to_dirty_list(root);
 	} else {
 		root_gen = btrfs_header_generation(parent);
 		btrfs_set_node_blockptr(parent, parent_slot,
@@ -241,7 +250,7 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	search_start = buf->start & ~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
+	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
 	return ret;
@@ -724,6 +733,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 
 		root->node = child;
+		add_root_to_dirty_list(root);
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
 		wait_on_tree_block_writeback(root, mid);
@@ -1369,6 +1379,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(root->node);
 	root->node = c;
+	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
 	path->slots[level] = 0;
@@ -2777,3 +2788,28 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	}
 	return 0;
 }
+
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	while(1) {
+		if (path->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.type == type)
+			return 0;
+	}
+	return 1;
+}
+
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 92d892f9207..1453d995fef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,12 +40,44 @@ extern struct kmem_cache *btrfs_path_cachep;
 #define BTRFS_MAGIC "_B4RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
+
+/* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/* one per subvolume, storing files and directories */
 #define BTRFS_FS_TREE_OBJECTID 3ULL
+
+/* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
+
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 5ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 6ULL
+
+/*
+ * All files have objectids higher than this.
+ */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
 
+
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -95,6 +127,81 @@ struct btrfs_key {
 	u64 offset;
 } __attribute__ ((__packed__));
 
+struct btrfs_mapping_tree {
+	struct extent_map_tree map_tree;
+};
+
+#define BTRFS_DEV_UUID_SIZE 16
+struct btrfs_dev_item {
+	/* the internal btrfs device id */
+	__le64 devid;
+
+	/* size of the device */
+	__le64 total_bytes;
+
+	/* bytes used */
+	__le64 bytes_used;
+
+	/* optimal io alignment for this device */
+	__le32 io_align;
+
+	/* optimal io width for this device */
+	__le32 io_width;
+
+	/* minimal io size for this device */
+	__le32 sector_size;
+
+	/* the kernel device number */
+	__le64 rdev;
+
+	/* type and info about this device */
+	__le64 type;
+
+	/* partition number, 0 for whole dev */
+	__le32 partition;
+
+	/* length of the name data at the end of the item */
+	__le16 name_len;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+	__le64 devid;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+	__le64 owner;
+	__le64 stripe_len;
+	__le64 type;
+
+	/* optimal io alignment for this chunk */
+	__le32 io_align;
+
+	/* optimal io width for this chunk */
+	__le32 io_width;
+
+	/* minimal io size for this chunk */
+	__le32 sector_size;
+
+	/* 2^16 stripes is quite a lot, a second limit is the size of a single
+	 * item in the btree
+	 */
+	__le16 num_stripes;
+	struct btrfs_stripe stripe;
+	/* additional stripes go here */
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+	BUG_ON(num_stripes == 0);
+	return sizeof(struct btrfs_chunk) +
+		sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
 #define BTRFS_FSID_SIZE 16
 /*
  * every tree block (leaf or node) starts with this header.
@@ -119,6 +226,13 @@ struct btrfs_header {
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -131,6 +245,7 @@ struct btrfs_super_block {
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
+	__le64 chunk_root;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -138,7 +253,10 @@ struct btrfs_super_block {
 	__le32 nodesize;
 	__le32 leafsize;
 	__le32 stripesize;
+	__le32 sys_chunk_array_size;
 	u8 root_level;
+	u8 chunk_root_level;
+	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
 /*
@@ -208,12 +326,22 @@ struct btrfs_extent_ref {
 	__le64 offset;
 } __attribute__ ((__packed__));
 
+/* dev extents record free space on individual devices.  The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent
+ */
+struct btrfs_dev_extent {
+	__le64 owner;
+	__le64 length;
+} __attribute__ ((__packed__));
+
+
 struct btrfs_inode_ref {
 	__le16 name_len;
 	/* name goes here */
 } __attribute__ ((__packed__));
 
-struct btrfs_inode_timespec {
+struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
 } __attribute__ ((__packed__));
@@ -231,13 +359,13 @@ struct btrfs_inode_item {
 	__le32 uid;
 	__le32 gid;
 	__le32 mode;
-	__le32 rdev;
+	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
-	struct btrfs_inode_timespec atime;
-	struct btrfs_inode_timespec ctime;
-	struct btrfs_inode_timespec mtime;
-	struct btrfs_inode_timespec otime;
+	struct btrfs_timespec atime;
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec mtime;
+	struct btrfs_timespec otime;
 } __attribute__ ((__packed__));
 
 struct btrfs_dir_item {
@@ -290,29 +418,34 @@ struct btrfs_csum_item {
 	u8 csum;
 } __attribute__ ((__packed__));
 
-/* tag for the radix tree of block groups in ram */
-#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
-
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 
-#define BTRFS_BLOCK_GROUP_DATA 1
-#define BTRFS_BLOCK_GROUP_MIXED 2
 
 struct btrfs_block_group_item {
 	__le64 used;
-	u8 flags;
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	int data;
-	int cached;
 	u64 pinned;
+	u64 flags;
+	int cached;
 };
+
+struct btrfs_device;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct radix_tree_root fs_roots_radix;
 
 	struct extent_io_tree free_space_cache;
@@ -321,6 +454,9 @@ struct btrfs_fs_info {
 	struct extent_io_tree pending_del;
 	struct extent_io_tree extent_ins;
 
+	/* logical->physical extent mapping */
+	struct btrfs_mapping_tree mapping_tree;
+
 	u64 generation;
 	u64 last_trans_committed;
 	unsigned long mount_opt;
@@ -330,6 +466,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
+	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
 	spinlock_t hash_lock;
@@ -350,12 +487,17 @@ struct btrfs_fs_info {
 	unsigned long throttles;
 
 	u64 total_pinned;
+	struct list_head dirty_cowonly_roots;
+
+	struct list_head devices;
+	struct list_head *last_device;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
 };
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -387,14 +529,19 @@ struct btrfs_root {
 	u64 highest_inode;
 	u64 last_inode_alloc;
 	int ref_cows;
+	int track_dirty;
 	struct btrfs_key defrag_progress;
 	int defrag_running;
 	int defrag_level;
 	char *name;
 	int in_sysfs;
+
+	/* the dirty list is only used by non-reference counted roots */
+	struct list_head dirty_list;
 };
 
 /*
+
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -439,6 +586,10 @@ struct btrfs_root {
  */
 #define BTRFS_BLOCK_GROUP_ITEM_KEY 50
 
+#define BTRFS_DEV_EXTENT_KEY	75
+#define BTRFS_DEV_ITEM_KEY	76
+#define BTRFS_CHUNK_ITEM_KEY	77
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -518,13 +669,104 @@ static inline void btrfs_set_##name(type *s, u##bits val)		\
 	s->member = cpu_to_le##bits(val);				\
 }
 
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64);
+BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32);
+BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16);
+
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline char *btrfs_device_name(struct btrfs_dev_item *d)
+{
+	return (char *)(d + 1);	/* first byte past the fixed-size dev item */
+}
+
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+			 stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+/* address of stripe slot @nr, counted from the 'stripe' member of chunk @c */
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+						   int nr)
+{
+	unsigned long base = (unsigned long)c;
+	base += offsetof(struct btrfs_chunk, stripe);
+	return (struct btrfs_stripe *)(base + nr * sizeof(struct btrfs_stripe));
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
 /* struct btrfs_block_group_item */
 BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item,
-		   flags, 8);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+			struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd,
+		   struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+		   struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+			struct btrfs_block_group_item, flags, 64);
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
@@ -538,49 +780,53 @@ BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
 BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
 BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
 		   compat_flags, 16);
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, atime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, mtime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, ctime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, otime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64);
-BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
 /* struct btrfs_extent_item */
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
 /* struct btrfs_extent_ref */
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
@@ -846,8 +1092,14 @@ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+			 struct btrfs_super_block, sys_chunk_array_size, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 			 root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+			 chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+			 chunk_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -1009,7 +1261,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size);
 /* ctree.c */
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type);
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 88e21bdbc47..8e37fa120cc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 #include "print-tree.h"
 
 #if 0
@@ -234,6 +235,19 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
 	return 0;
 }
 
+/* submit hook: super block bio goes to sb->s_bdev, others to btrfs_map_bio */
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 offset = (u64)bio->bi_sector << 9;	/* sector_t may be 32 bits */
+	if (offset == BTRFS_SUPER_INFO_OFFSET) {
+		bio->bi_bdev = root->fs_info->sb->s_bdev;
+		submit_bio(rw, bio);
+		return 0;
+	}
+	return btrfs_map_bio(root, rw, bio);
+}
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
@@ -345,6 +359,23 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	return ret;
 }
 
+/* Unlink every btrfs_device on fs_info->devices and kfree it; returns 0. */
+static int close_all_devices(struct btrfs_fs_info *fs_info)
+{
+	struct list_head *head = &fs_info->devices;
+	struct list_head *cur;
+	struct list_head *tmp;
+	struct btrfs_device *dev;
+
+	list_for_each_safe(cur, tmp, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		list_del(cur);
+		kfree(dev->name);
+		kfree(dev);
+	}
+	return 0;
+}
+
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize)
 {
@@ -420,6 +451,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->leafsize = leafsize;
 	root->stripesize = stripesize;
 	root->ref_cows = 0;
+	root->track_dirty = 0;
+
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
@@ -427,6 +460,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
+
+	INIT_LIST_HEAD(&root->dirty_list);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -634,6 +669,10 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
+	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
+						GFP_NOFS);
+	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
+					      GFP_NOFS);
 	int ret;
 	int err = -EIO;
 	struct btrfs_super_block *disk_super;
@@ -657,6 +696,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->chunk_root = chunk_root;
+	fs_info->dev_root = dev_root;
+	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+	INIT_LIST_HEAD(&fs_info->devices);
+	btrfs_mapping_init(&fs_info->mapping_tree);
+	fs_info->last_device = &fs_info->devices;
 	fs_info->sb = sb;
 	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
@@ -714,12 +759,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		goto fail_iput;
 	}
 #endif
-	__setup_root(512, 512, 512, 512, tree_root,
+	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	fs_info->sb_buffer = read_tree_block(tree_root,
 					     BTRFS_SUPER_INFO_OFFSET,
-					     512);
+					     4096);
 
 	if (!fs_info->sb_buffer)
 		goto fail_iput;
@@ -730,6 +775,7 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
 			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
 			   BTRFS_FSID_SIZE);
+
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
@@ -753,23 +799,47 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 		goto fail_sb_buffer;
 	}
 
+	mutex_lock(&fs_info->fs_mutex);
+	ret = btrfs_read_sys_array(tree_root);
+	BUG_ON(ret);
+
+	blocksize = btrfs_level_size(tree_root,
+				     btrfs_super_chunk_root_level(disk_super));
+
+	__setup_root(nodesize, leafsize, sectorsize, stripesize,
+		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+	chunk_root->node = read_tree_block(chunk_root,
+					   btrfs_super_chunk_root(disk_super),
+					   blocksize);
+	BUG_ON(!chunk_root->node);
+
+	ret = btrfs_read_chunk_tree(chunk_root);
+	BUG_ON(ret);
+
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 
+
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
-	mutex_lock(&fs_info->fs_mutex);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-	if (ret) {
-		mutex_unlock(&fs_info->fs_mutex);
+	if (ret)
 		goto fail_tree_root;
-	}
+	extent_root->track_dirty = 1;
+
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_DEV_TREE_OBJECTID, dev_root);
+	dev_root->track_dirty = 1;
+
+	if (ret)
+		goto fail_extent_root;
 
 	btrfs_read_block_groups(extent_root);
 
@@ -777,7 +847,10 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
+fail_extent_root:
+	free_extent_buffer(extent_root->node);
 fail_tree_root:
+	mutex_unlock(&fs_info->fs_mutex);
 	free_extent_buffer(tree_root->node);
 fail_sb_buffer:
 	free_extent_buffer(fs_info->sb_buffer);
@@ -874,6 +947,12 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
 
+	if (root->fs_info->chunk_root->node)
+		free_extent_buffer(root->fs_info->chunk_root->node);
+
+	if (root->fs_info->dev_root->node)
+		free_extent_buffer(root->fs_info->dev_root->node);
+
 	free_extent_buffer(fs_info->sb_buffer);
 
 	btrfs_free_block_groups(root->fs_info);
@@ -901,8 +980,13 @@ int close_ctree(struct btrfs_root *root)
 		kfree(hasher);
 	}
 #endif
+	close_all_devices(fs_info);
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
 	return 0;
 }
 
@@ -1016,4 +1100,5 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 
 static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
+	.submit_bio_hook = btree_submit_bio_hook,
 };
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 828f3a2081b..206cb48638f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -20,6 +20,7 @@
 #define __DISKIO__
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+struct btrfs_device;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize);
@@ -65,4 +66,5 @@ int btrfs_read_buffer(struct extent_buffer *buf);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 void btrfs_throttle(struct btrfs_root *root);
+int btrfs_open_device(struct btrfs_device *dev);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ebfd304138c..2cd957d6e8d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,37 +24,19 @@
 #include "disk-io.h"
 #include "print-tree.h"
 #include "transaction.h"
+#include "volumes.h"
 
-#define BLOCK_GROUP_DATA EXTENT_WRITEBACK
+#define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
+#define BLOCK_GROUP_SYSTEM   EXTENT_NEW
+
 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
 
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
-static int find_previous_extent(struct btrfs_root *root,
-				struct btrfs_path *path)
-{
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	int ret;
 
-	while(1) {
-		if (path->slots[0] == 0) {
-			ret = btrfs_prev_leaf(root, path);
-			if (ret != 0)
-				return ret;
-		} else {
-			path->slots[0]--;
-		}
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
-			return 0;
-	}
-	return 1;
-}
 
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
@@ -91,7 +73,7 @@ static int cache_block_group(struct btrfs_root *root,
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
-	ret = find_previous_extent(root, path);
+	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		return ret;
 	if (ret == 0) {
@@ -168,7 +150,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
 				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA);
+				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				    BLOCK_GROUP_SYSTEM);
 	if (ret) {
 		return NULL;
 	}
@@ -182,23 +165,38 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 		return block_group;
 	return NULL;
 }
-static u64 noinline find_search_start(struct btrfs_root *root,
+
+/* 1 if a BLOCK_GROUP_* bit in @bits has its BTRFS_ flag in cache->flags */
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+	u64 wanted = 0;
+
+	if (bits & BLOCK_GROUP_DATA)
+		wanted |= BTRFS_BLOCK_GROUP_DATA;
+	if (bits & BLOCK_GROUP_METADATA)
+		wanted |= BTRFS_BLOCK_GROUP_METADATA;
+	if (bits & BLOCK_GROUP_SYSTEM)
+		wanted |= BTRFS_BLOCK_GROUP_SYSTEM;
+	return (cache->flags & wanted) != 0;
+}
+
+static int noinline find_search_start(struct btrfs_root *root,
 			      struct btrfs_block_group_cache **cache_ret,
-			      u64 search_start, int num, int data)
+			      u64 *start_ret, int num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	struct extent_io_tree *free_space_cache;
-	struct extent_state *state;
 	u64 last;
 	u64 start = 0;
+	u64 end = 0;
 	u64 cache_miss = 0;
 	u64 total_fs_bytes;
+	u64 search_start = *start_ret;
 	int wrapped = 0;
 
-	if (!cache) {
+	if (!cache)
 		goto out;
-	}
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -208,6 +206,9 @@ again:
 		goto out;
 
 	last = max(search_start, cache->key.objectid);
+	if (!block_group_bits(cache, data)) {
+		goto new_group;
+	}
 
 	while(1) {
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
@@ -225,22 +226,20 @@ again:
 				cache_miss = start;
 			continue;
 		}
-		if (data != BTRFS_BLOCK_GROUP_MIXED &&
-		    start + num > cache->key.objectid + cache->key.offset)
+		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		if (start + num  > total_fs_bytes)
 			goto new_group;
-		return start;
+		*start_ret = start;
+		return 0;
 	}
 out:
 	cache = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!cache) {
-		printk("Unable to find block group for %Lu\n",
-		       search_start);
+		printk("Unable to find block group for %Lu\n", search_start);
 		WARN_ON(1);
-		return search_start;
 	}
-	return search_start;
+	return -ENOSPC;
 
 new_group:
 	last = cache->key.objectid + cache->key.offset;
@@ -251,7 +250,6 @@ no_cache:
 		if (!wrapped) {
 			wrapped = 1;
 			last = search_start;
-			data = BTRFS_BLOCK_GROUP_MIXED;
 			goto wrapped;
 		}
 		goto out;
@@ -299,7 +297,6 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int ret;
 	int full_search = 0;
 	int factor = 8;
-	int data_swap = 0;
 
 	block_group_cache = &info->block_group_cache;
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
@@ -307,19 +304,12 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (!owner)
 		factor = 8;
 
-	if (data == BTRFS_BLOCK_GROUP_MIXED) {
-		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-		factor = 10;
-	} else if (data)
-		bit = BLOCK_GROUP_DATA;
-	else
-		bit = BLOCK_GROUP_METADATA;
+	bit = data;
 
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint && (shint->data == data ||
-			      shint->data == BTRFS_BLOCK_GROUP_MIXED)) {
+		if (shint && block_group_bits(shint, data)) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
@@ -327,8 +317,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && hint->key.objectid < total_fs_bytes &&
-	    (hint->data == data || hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
+	if (hint && block_group_bits(hint, data) &&
+	    hint->key.objectid < total_fs_bytes) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
@@ -379,12 +369,6 @@ again:
 		full_search = 1;
 		goto again;
 	}
-	if (!data_swap) {
-		data_swap = 1;
-		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-		last = search_start;
-		goto again;
-	}
 found:
 	return found_group;
 }
@@ -1002,7 +986,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free, int data)
+			      int mark_free)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -1027,41 +1011,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
-			if (cache->data != data &&
-			    old_val < (cache->key.offset >> 1)) {
-				int bit_to_clear;
-				int bit_to_set;
-				cache->data = data;
-				if (data) {
-					bit_to_clear = BLOCK_GROUP_METADATA;
-					bit_to_set = BLOCK_GROUP_DATA;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_MIXED;
-					cache->item.flags |=
-						BTRFS_BLOCK_GROUP_DATA;
-				} else {
-					bit_to_clear = BLOCK_GROUP_DATA;
-					bit_to_set = BLOCK_GROUP_METADATA;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_MIXED;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_DATA;
-				}
-				clear_extent_bits(&info->block_group_cache,
-						  start, end, bit_to_clear,
-						  GFP_NOFS);
-				set_extent_bits(&info->block_group_cache,
-						start, end, bit_to_set,
-						GFP_NOFS);
-			} else if (cache->data != data &&
-				   cache->data != BTRFS_BLOCK_GROUP_MIXED) {
-				cache->data = BTRFS_BLOCK_GROUP_MIXED;
-				set_extent_bits(&info->block_group_cache,
-						start, end,
-						BLOCK_GROUP_DATA |
-						BLOCK_GROUP_METADATA,
-						GFP_NOFS);
-			}
 			old_val += num_bytes;
 		} else {
 			old_val -= num_bytes;
@@ -1357,7 +1306,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			return ret;
 		}
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-					 mark_free, 0);
+					 mark_free);
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
@@ -1450,38 +1399,21 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     u64 exclude_start, u64 exclude_nr,
 				     int data)
 {
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	u64 hole_size = 0;
-	u64 aligned;
 	int ret;
-	int slot = 0;
-	u64 last_byte = 0;
-	u64 *last_ptr = NULL;
 	u64 orig_search_start = search_start;
-	int start_found;
-	struct extent_buffer *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
-	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
-	int empty_cluster;
-	u64 cached_start;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
-	level = btrfs_header_level(root->node);
-
-	if (num_bytes >= 32 * 1024 * 1024 && hint_byte) {
-		data = BTRFS_BLOCK_GROUP_MIXED;
-	}
-
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
+
 	if (hint_byte) {
 		block_group = btrfs_lookup_block_group(info, hint_byte);
 		if (!block_group)
@@ -1495,7 +1427,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	}
 
 	total_needed += empty_size;
-	path = btrfs_alloc_path();
+
 check_failed:
 	if (!block_group) {
 		block_group = btrfs_lookup_block_group(info, search_start);
@@ -1503,135 +1435,49 @@ check_failed:
 			block_group = btrfs_lookup_block_group(info,
 						       orig_search_start);
 	}
-	search_start = find_search_start(root, &block_group, search_start,
-					 total_needed, data);
-	search_start = stripe_align(root, search_start);
-	cached_start = search_start;
-	btrfs_init_path(path);
-	ins->objectid = search_start;
-	ins->offset = 0;
-	start_found = 0;
-	path->reada = 2;
-
-	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
-	if (ret < 0)
-		goto error;
-	ret = find_previous_extent(root, path);
-	if (ret < 0)
+	ret = find_search_start(root, &block_group, &search_start,
+				total_needed, data);
+	if (ret)
 		goto error;
-	l = path->nodes[0];
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	while (1) {
-		l = path->nodes[0];
-		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(l)) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
-				continue;
-			if (ret < 0)
-				goto error;
 
-			search_start = max(search_start,
-					   block_group->key.objectid);
-			if (!start_found) {
-				aligned = stripe_align(root, search_start);
-				ins->objectid = aligned;
-				if (aligned >= search_end) {
-					ret = -ENOSPC;
-					goto error;
-				}
-				ins->offset = search_end - aligned;
-				start_found = 1;
-				goto check_pending;
-			}
-			ins->objectid = stripe_align(root,
-						     last_byte > search_start ?
-						     last_byte : search_start);
-			if (search_end <= ins->objectid) {
-				ret = -ENOSPC;
-				goto error;
-			}
-			ins->offset = search_end - ins->objectid;
-			BUG_ON(ins->objectid >= search_end);
-			goto check_pending;
-		}
-		btrfs_item_key_to_cpu(l, &key, slot);
-
-		if (key.objectid >= search_start && key.objectid > last_byte &&
-		    start_found) {
-			if (last_byte < search_start)
-				last_byte = search_start;
-			aligned = stripe_align(root, last_byte);
-			hole_size = key.objectid - aligned;
-			if (key.objectid > aligned && hole_size >= num_bytes) {
-				ins->objectid = aligned;
-				ins->offset = hole_size;
-				goto check_pending;
-			}
-		}
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
-			if (!start_found && btrfs_key_type(&key) ==
-			    BTRFS_BLOCK_GROUP_ITEM_KEY) {
-				last_byte = key.objectid;
-				start_found = 1;
-			}
-			goto next;
-		}
-
-
-		start_found = 1;
-		last_byte = key.objectid + key.offset;
-
-		if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
-		    last_byte >= block_group->key.objectid +
-		    block_group->key.offset) {
-			btrfs_release_path(root, path);
-			search_start = block_group->key.objectid +
-				block_group->key.offset;
-			goto new_group;
-		}
-next:
-		path->slots[0]++;
-		cond_resched();
-	}
-check_pending:
-	/* we have to make sure we didn't find an extent that has already
-	 * been allocated by the map tree or the original allocation
-	 */
-	btrfs_release_path(root, path);
-	BUG_ON(ins->objectid < search_start);
+	search_start = stripe_align(root, search_start);
+	ins->objectid = search_start;
+	ins->offset = num_bytes;
 
 	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
-	if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
-	    ins->objectid + num_bytes > block_group->
-	    key.objectid + block_group->key.offset) {
+
+	if (ins->objectid + num_bytes >
+	    block_group->key.objectid + block_group->key.offset) {
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
 		goto new_group;
 	}
+
 	if (test_range_bit(&info->extent_ins, ins->objectid,
 			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
 		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
+
 	if (test_range_bit(&info->pinned_extents, ins->objectid,
 			   ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
 		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
+
 	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
 	}
-	if (!data) {
+
+	if (!(data & BLOCK_GROUP_DATA)) {
 		block_group = btrfs_lookup_block_group(info, ins->objectid);
 		if (block_group)
 			trans->block_group = block_group;
 	}
 	ins->offset = num_bytes;
-	btrfs_free_path(path);
 	return 0;
 
 new_group:
@@ -1646,7 +1492,6 @@ enospc:
 			if (!full_scan)
 				total_needed -= empty_size;
 			full_scan = 1;
-			data = BTRFS_BLOCK_GROUP_MIXED;
 		} else
 			wrapped = 1;
 	}
@@ -1657,8 +1502,6 @@ enospc:
 	goto check_failed;
 
 error:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
 	return ret;
 }
 /*
@@ -1689,6 +1532,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	if (data)
+		data = BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		data = BLOCK_GROUP_SYSTEM;
+	else
+		data = BLOCK_GROUP_METADATA;
+
 	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
 		hint_byte = new_hint;
@@ -1718,7 +1568,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
-		WARN_ON(data == 1);
 		goto update_block;
 	}
 
@@ -1768,8 +1617,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 
 update_block:
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
-				 data);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
 	if (ret) {
 		printk("update block group failed for %Lu %Lu\n",
 		       ins->objectid, ins->offset);
@@ -2457,7 +2305,7 @@ again:
 	if (ret < 0)
 		goto out;
 
-	ret = find_previous_extent(root, path);
+	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		goto out;
 	if (ret == 0) {
@@ -2604,95 +2452,48 @@ out:
 int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 new_size)
 {
-	struct btrfs_path *path;
-	u64 nr = 0;
-	u64 cur_byte;
-	u64 old_size;
-	unsigned long rem;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_block_group_item *item;
-	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_key key;
-	struct extent_buffer *leaf;
-	int ret;
-	int bit;
-
-	old_size = btrfs_super_total_bytes(&info->super_copy);
-	block_group_cache = &info->block_group_cache;
-
-	root = info->extent_root;
-
-	cache = btrfs_lookup_block_group(root->fs_info, old_size - 1);
-
-	cur_byte = cache->key.objectid + cache->key.offset;
-	if (cur_byte >= new_size)
-		goto set_size;
-
-	key.offset = BTRFS_BLOCK_GROUP_SIZE;
-	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size);
+	return 0;
+}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
+			   struct btrfs_key *key)
+{
+	int ret;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int slot;
 
-	while(cur_byte < new_size) {
-		key.objectid = cur_byte;
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-				        sizeof(struct btrfs_block_group_item));
-		BUG_ON(ret);
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	while(1) {
+		slot = path->slots[0];
 		leaf = path->nodes[0];
-		item = btrfs_item_ptr(leaf, path->slots[0],
-				      struct btrfs_block_group_item);
-
-		btrfs_set_disk_block_group_used(leaf, item, 0);
-		div_long_long_rem(nr, 3, &rem);
-		if (rem) {
-			btrfs_set_disk_block_group_flags(leaf, item,
-						 BTRFS_BLOCK_GROUP_DATA);
-		} else {
-			btrfs_set_disk_block_group_flags(leaf, item, 0);
-		}
-		nr++;
-
-		cache = kmalloc(sizeof(*cache), GFP_NOFS);
-		BUG_ON(!cache);
-
-		read_extent_buffer(leaf, &cache->item, (unsigned long)item,
-				   sizeof(cache->item));
-
-		memcpy(&cache->key, &key, sizeof(key));
-		cache->cached = 0;
-		cache->pinned = 0;
-		cur_byte = key.objectid + key.offset;
-		btrfs_release_path(root, path);
-
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
-			bit = BLOCK_GROUP_DATA;
-			cache->data = BTRFS_BLOCK_GROUP_DATA;
-		} else {
-			bit = BLOCK_GROUP_METADATA;
-			cache->data = 0;
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
 		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-		/* use EXTENT_LOCKED to prevent merging */
-		set_extent_bits(block_group_cache, key.objectid,
-				key.objectid + key.offset - 1,
-				bit | EXTENT_LOCKED, GFP_NOFS);
-		set_state_private(block_group_cache, key.objectid,
-				  (unsigned long)cache);
+		if (found_key.objectid >= key->objectid &&
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
+			return 0;
+		path->slots[0]++;
 	}
-	btrfs_free_path(path);
-set_size:
-	btrfs_set_super_total_bytes(&info->super_copy, new_size);
-	return 0;
+	ret = -ENOENT;
+error:
+	return ret;
 }
 
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	int ret;
-	int err = 0;
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -2702,28 +2503,28 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	struct extent_buffer *leaf;
 
 	block_group_cache = &info->block_group_cache;
-
 	root = info->extent_root;
 	key.objectid = 0;
-	key.offset = BTRFS_BLOCK_GROUP_SIZE;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	while(1) {
-		ret = btrfs_search_slot(NULL, info->extent_root,
-					&key, path, 0, 0);
-		if (ret != 0) {
-			err = ret;
-			break;
+		ret = find_first_block_group(root, path, &key);
+		if (ret > 0) {
+			ret = 0;
+			goto error;
 		}
+		if (ret != 0)
+			goto error;
+
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		cache = kmalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
-			err = -1;
+			ret = -ENOMEM;
 			break;
 		}
 
@@ -2733,18 +2534,17 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->cached = 0;
 		cache->pinned = 0;
+
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
-
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_MIXED) {
-			bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-			cache->data = BTRFS_BLOCK_GROUP_MIXED;
-		} else if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
+		cache->flags = btrfs_block_group_flags(&cache->item);
+		bit = 0;
+		if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
 			bit = BLOCK_GROUP_DATA;
-			cache->data = BTRFS_BLOCK_GROUP_DATA;
-		} else {
+		} else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+			bit = BLOCK_GROUP_SYSTEM;
+		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
-			cache->data = 0;
 		}
 
 		/* use EXTENT_LOCKED to prevent merging */
@@ -2758,7 +2558,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
 	}
-
+	ret = 0;
+error:
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e8130c87633..7e3a1ebde9f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1706,9 +1706,9 @@ static int submit_one_bio(int rw, struct bio *bio)
 		WARN_ON(1);
 	}
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(rw, bio);
-
-	submit_bio(rw, bio);
+		tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+	else
+		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9d665466708..0dca89328f9 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,7 @@ struct extent_state;
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*submit_bio_hook)(int rw, struct bio *bio);
+	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 315dd550d56..17063cd2cb7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -40,6 +40,7 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "volumes.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -295,20 +296,20 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
-int btrfs_submit_bio_hook(int rw, struct bio *bio)
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
-	struct bio_vec *bvec = bio->bi_io_vec;
-	struct inode *inode = bvec->bv_page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (rw != WRITE)
-		return 0;
+	if (rw != WRITE) {
+		goto mapit;
+	}
 
 	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
+	    btrfs_test_flag(inode, NODATASUM)) {
+		goto mapit;
+	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -317,7 +318,8 @@ int btrfs_submit_bio_hook(int rw, struct bio *bio)
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
+mapit:
+	return btrfs_map_bio(root, rw, bio);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
@@ -406,7 +408,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_inode_timespec *tspec;
+	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
 	u64 alloc_group_block;
@@ -455,7 +457,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 	if (!BTRFS_I(inode)->block_group) {
 		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
-						         NULL, 0, 0, 0);
+						 NULL, 0,
+						 BTRFS_BLOCK_GROUP_METADATA, 0);
 	}
 	btrfs_free_path(path);
 	inode_item = NULL;
@@ -1550,7 +1553,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	group = btrfs_find_block_group(root, group, 0, 0, owner);
+	group = btrfs_find_block_group(root, group, 0,
+				       BTRFS_BLOCK_GROUP_METADATA, owner);
 	BTRFS_I(inode)->block_group = group;
 	BTRFS_I(inode)->flags = 0;
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index da0b4dcf361..9c1335dad40 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -20,6 +20,40 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
+/*
+ * print a chunk item: one summary line (owner, type, stripe count)
+ * followed by one line per stripe with its devid and offset
+ */
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+	int nr = btrfs_chunk_num_stripes(eb, chunk);
+	int stripe = 0;
+	unsigned long long owner = btrfs_chunk_owner(eb, chunk);
+	unsigned long long type = btrfs_chunk_type(eb, chunk);
+
+	printk("\t\tchunk owner %llu type %llu num_stripes %d\n",
+	       owner, type, nr);
+	while (stripe < nr) {
+		unsigned long long devid;
+		unsigned long long offset;
+
+		devid = btrfs_stripe_devid_nr(eb, chunk, stripe);
+		offset = btrfs_stripe_offset_nr(eb, chunk, stripe);
+		printk("\t\t\tstripe %d devid %llu offset %llu\n", stripe,
+		       devid, offset);
+		stripe++;
+	}
+}
+/*
+ * print a device item.  The name is copied out of the extent buffer
+ * into a temporary allocation so it can be handed to printk.
+ */
+static void print_dev_item(struct extent_buffer *eb,
+			   struct btrfs_dev_item *dev_item)
+{
+	int len = btrfs_device_name_len(eb, dev_item);
+	char *buf = kmalloc(len, GFP_NOFS);
+	unsigned long src;
+
+	/* a failed allocation is tolerated, buf goes to printk as is */
+	if (buf != NULL) {
+		src = (unsigned long)btrfs_device_name(dev_item);
+		read_extent_buffer(eb, buf, src, len);
+	}
+	printk("\t\tdev item name %.*s devid %llu "
+	       "total_bytes %llu bytes used %Lu\n", len, buf,
+	       (unsigned long long)btrfs_device_id(eb, dev_item),
+	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
+	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+	kfree(buf);
+}
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
@@ -34,6 +68,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_extent_ref *ref;
+	struct btrfs_dev_extent *dev_extent;
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
@@ -106,6 +141,19 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			printk("\t\tblock group used %llu\n",
 			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
 			break;
+		case BTRFS_CHUNK_ITEM_KEY:
+			print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
+			break;
+		case BTRFS_DEV_ITEM_KEY:
+			print_dev_item(l, btrfs_item_ptr(l, i,
+					struct btrfs_dev_item));
+			break;
+		case BTRFS_DEV_EXTENT_KEY:
+			dev_extent = btrfs_item_ptr(l, i,
+						    struct btrfs_dev_extent);
+			printk("\t\tdev extent owner %llu length %llu\n",
+			       (unsigned long long)btrfs_dev_extent_owner(l, dev_extent),
+			       (unsigned long long)btrfs_dev_extent_length(l, dev_extent));
 		};
 	}
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e9a0983897f..5e9f69244f9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -198,29 +198,42 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
 {
 	int ret;
-	u64 old_extent_block;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	u64 old_root_bytenr;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 
-	btrfs_write_dirty_block_groups(trans, extent_root);
+	btrfs_write_dirty_block_groups(trans, root);
 	while(1) {
-		old_extent_block = btrfs_root_bytenr(&extent_root->root_item);
-		if (old_extent_block == extent_root->node->start)
+		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+		if (old_root_bytenr == root->node->start)
 			break;
-		btrfs_set_root_bytenr(&extent_root->root_item,
-				      extent_root->node->start);
-		btrfs_set_root_level(&extent_root->root_item,
-				     btrfs_header_level(extent_root->node));
+		btrfs_set_root_bytenr(&root->root_item,
+				       root->node->start);
+		btrfs_set_root_level(&root->root_item,
+				     btrfs_header_level(root->node));
 		ret = btrfs_update_root(trans, tree_root,
-					&extent_root->root_key,
-					&extent_root->root_item);
+					&root->root_key,
+					&root->root_item);
 		BUG_ON(ret);
-		btrfs_write_dirty_block_groups(trans, extent_root);
+		btrfs_write_dirty_block_groups(trans, root);
+	}
+	return 0;
+}
+
+int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head *next;
+
+	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
+		next = fs_info->dirty_cowonly_roots.next;
+		list_del_init(next);
+		root = list_entry(next, struct btrfs_root, dirty_list);
+		update_cowonly_root(trans, root);
 	}
 	return 0;
 }
@@ -616,6 +629,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
 	struct list_head dirty_fs_roots;
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
@@ -714,6 +728,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_set_super_root_level(&root->fs_info->super_copy,
 			   btrfs_header_level(root->fs_info->tree_root->node));
 
+	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
+				   chunk_root->node->start);
+	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
+					 btrfs_header_level(chunk_root->node));
 	write_extent_buffer(root->fs_info->sb_buffer,
 			    &root->fs_info->super_copy, 0,
 			    sizeof(root->fs_info->super_copy));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 00000000000..90a8d45dc6d
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+
+struct map_lookup {
+	struct btrfs_device *dev;
+	u64 physical;
+};
+
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ *
+ * Walks the items for device->devid in device->dev_root, looking for a
+ * hole of at least num_bytes between dev extents, or for the space that
+ * follows the last dev extent seen.
+ *
+ * On success returns 0 and stores the chosen offset in *start.  Returns
+ * -ENOSPC when no usable offset below device->total_bytes is found, or
+ * the negative value returned by btrfs_search_slot, btrfs_previous_item
+ * or btrfs_next_leaf.  The path is released on every exit.
+ */
+static int find_free_dev_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_device *device,
+				struct btrfs_path *path,
+				u64 num_bytes, u64 *start)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	u64 hole_size = 0;
+	u64 last_byte = 0;
+	u64 search_start = 0;
+	u64 search_end = device->total_bytes;
+	int ret;
+	int slot = 0;
+	int start_found;
+	struct extent_buffer *l;
+
+	start_found = 0;
+	path->reada = 2;
+
+	/* FIXME use last free of some kind */
+
+	key.objectid = device->devid;
+	key.offset = search_start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	/*
+	 * presumably this steps back so an extent that begins before
+	 * search_start is also seen -- TODO confirm against
+	 * btrfs_previous_item.  A positive return is not treated as an
+	 * error here.
+	 */
+	ret = btrfs_previous_item(root, path, 0, key.type);
+	if (ret < 0)
+		goto error;
+	/*
+	 * l and key are both reloaded at the top of the loop before they
+	 * are used again
+	 */
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			/*
+			 * reached when the tree has no more leaves, and by
+			 * goto from below once an item with a larger
+			 * objectid than our devid shows up
+			 */
+no_more_items:
+			if (!start_found) {
+				/* no dev extent was seen at all */
+				if (search_start >= search_end) {
+					ret = -ENOSPC;
+					goto error;
+				}
+				*start = search_start;
+				start_found = 1;
+				goto check_pending;
+			}
+			/* start after the end of the last extent seen */
+			*start = last_byte > search_start ?
+				last_byte : search_start;
+			if (search_end <= *start) {
+				ret = -ENOSPC;
+				goto error;
+			}
+			goto check_pending;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			goto no_more_items;
+
+		/*
+		 * once a previous extent has been recorded, the range from
+		 * last_byte up to this item's offset is a hole; take it
+		 * when it is at least num_bytes long
+		 */
+		if (key.offset >= search_start && key.offset > last_byte &&
+		    start_found) {
+			if (last_byte < search_start)
+				last_byte = search_start;
+			hole_size = key.offset - last_byte;
+			if (key.offset > last_byte &&
+			    hole_size >= num_bytes) {
+				*start = last_byte;
+				goto check_pending;
+			}
+		}
+		/* only dev extent items move last_byte forward */
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+			goto next;
+		}
+
+		start_found = 1;
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+check_pending:
+	/* we have to make sure we didn't find an extent that has already
+	 * been allocated by the map tree or the original allocation
+	 */
+	btrfs_release_path(root, path);
+	BUG_ON(*start < search_start);
+
+	/*
+	 * NOTE(review): with >= a range that ends exactly at search_end is
+	 * rejected as well -- confirm this is intended
+	 */
+	if (*start + num_bytes >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+	/* check for pending inserts here */
+	return 0;
+
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * find a free range of num_bytes on the device and insert a dev extent
+ * item for it into the device's dev_root.  The chosen offset is handed
+ * back through start.  Returns 0 on success, -ENOMEM if no path could
+ * be allocated, or the error from find_free_dev_extent.
+ */
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start)
+{
+	struct btrfs_root *dev_root = device->dev_root;
+	struct btrfs_dev_extent *dext;
+	struct extent_buffer *eb;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
+	if (ret == 0) {
+		key.objectid = device->devid;
+		key.type = BTRFS_DEV_EXTENT_KEY;
+		key.offset = *start;
+		ret = btrfs_insert_empty_item(trans, dev_root, path, &key,
+					      sizeof(*dext));
+		BUG_ON(ret);
+
+		eb = path->nodes[0];
+		dext = btrfs_item_ptr(eb, path->slots[0],
+				      struct btrfs_dev_extent);
+		btrfs_set_dev_extent_owner(eb, dext, owner);
+		btrfs_set_dev_extent_length(eb, dext, num_bytes);
+		btrfs_mark_buffer_dirty(eb);
+	}
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * compute the logical start for the next chunk: the end (objectid +
+ * offset) of the last chunk item in the tree, or 0 when the tree holds
+ * no chunk items.
+ *
+ * Returns 0 on success with the result in *objectid, -ENOMEM if no path
+ * could be allocated, or the negative value returned by the tree search.
+ */
+static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = (u64)-1;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	/* an exact match on the largest possible key is never expected */
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+	/* a failed walk must not be mistaken for an empty chunk tree */
+	if (ret < 0)
+		goto error;
+	if (ret) {
+		*objectid = 0;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.objectid + found_key.offset;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * return the device that follows 'last' on the circular device list,
+ * skipping over the list head itself.  Returns NULL for an empty list.
+ */
+static struct btrfs_device *next_device(struct list_head *head,
+					struct list_head *last)
+{
+	struct list_head *pos = last->next;
+
+	if (list_empty(head))
+		return NULL;
+
+	/* the head is not embedded in a device, wrap to the first entry */
+	if (pos == head)
+		pos = head->next;
+
+	return list_entry(pos, struct btrfs_device, dev_list);
+}
+
+/*
+ * pick the next device id: one more than the offset of the last dev
+ * item key in the tree, or 1 when no dev item is found.  The path is
+ * released before returning.  Returns 0 or the negative value from
+ * btrfs_search_slot.
+ */
+static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 *objectid)
+{
+	struct btrfs_key last;
+	struct btrfs_key search;
+	int ret;
+
+	search.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	search.type = BTRFS_DEV_ITEM_KEY;
+	search.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root, &search, path, 0, 0);
+	if (ret >= 0) {
+		BUG_ON(ret == 0);
+
+		if (btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+					BTRFS_DEV_ITEM_KEY)) {
+			*objectid = 1;
+		} else {
+			btrfs_item_key_to_cpu(path->nodes[0], &last,
+					      path->slots[0]);
+			*objectid = last.offset + 1;
+		}
+		ret = 0;
+	}
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * insert a dev item describing the device into the chunk root.  The
+ * item key uses the next unused devid found in the tree as its offset;
+ * the item body is filled from the btrfs_device struct, which the
+ * caller should have fully filled in.
+ *
+ * returns 0 on success or a negative errno
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device)
+{
+	struct btrfs_dev_item *item;
+	struct extent_buffer *eb;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	unsigned long dst;
+	u64 next_devid;
+	int ret;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	ret = find_next_devid(root, path, &next_devid);
+	if (ret == 0) {
+		key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+		key.type = BTRFS_DEV_ITEM_KEY;
+		key.offset = next_devid;
+
+		/* the device name is stored inline after the fixed part */
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      sizeof(*item) + device->name_len);
+	}
+	if (ret == 0) {
+		eb = path->nodes[0];
+		item = btrfs_item_ptr(eb, path->slots[0],
+				      struct btrfs_dev_item);
+
+		btrfs_set_device_id(eb, item, device->devid);
+		btrfs_set_device_type(eb, item, device->type);
+		btrfs_set_device_io_align(eb, item, device->io_align);
+		btrfs_set_device_io_width(eb, item, device->io_width);
+		btrfs_set_device_sector_size(eb, item, device->sector_size);
+		btrfs_set_device_rdev(eb, item, device->rdev);
+		btrfs_set_device_partition(eb, item, device->partition);
+		btrfs_set_device_name_len(eb, item, device->name_len);
+		btrfs_set_device_total_bytes(eb, item, device->total_bytes);
+		btrfs_set_device_bytes_used(eb, item, device->bytes_used);
+
+		dst = (unsigned long)btrfs_device_name(item);
+		write_extent_buffer(eb, device->name, dst, device->name_len);
+
+		dst = (unsigned long)btrfs_device_uuid(item);
+		write_extent_buffer(eb, device->uuid, dst,
+				    BTRFS_DEV_UUID_SIZE);
+		btrfs_mark_buffer_dirty(eb);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+/*
+ * rewrite the dev item for device->devid in the chunk root from the
+ * in-memory btrfs_device.  The name, name length and uuid stored in the
+ * item are left untouched.
+ *
+ * returns 0 on success, -ENOENT if the item does not exist, or another
+ * negative errno
+ */
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+			struct btrfs_device *device)
+{
+	struct btrfs_root *chunk_root;
+	struct btrfs_dev_item *item;
+	struct extent_buffer *eb;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	chunk_root = device->dev_root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, chunk_root, &key, path, 0, 1);
+	if (ret != 0) {
+		/* a positive return means the key was not found */
+		if (ret > 0)
+			ret = -ENOENT;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(eb, item, device->devid);
+	btrfs_set_device_type(eb, item, device->type);
+	btrfs_set_device_io_align(eb, item, device->io_align);
+	btrfs_set_device_io_width(eb, item, device->io_width);
+	btrfs_set_device_sector_size(eb, item, device->sector_size);
+	btrfs_set_device_rdev(eb, item, device->rdev);
+	btrfs_set_device_partition(eb, item, device->partition);
+	btrfs_set_device_total_bytes(eb, item, device->total_bytes);
+	btrfs_set_device_bytes_used(eb, item, device->bytes_used);
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * append a chunk item to the system chunk array kept in the in-memory
+ * super block copy.  Each array entry is a disk key immediately
+ * followed by the chunk item, and the recorded array size grows by the
+ * size of both.
+ *
+ * returns 0 on success or -EFBIG when the key plus the item do not fit
+ * in the space left in the array
+ */
+int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_key *key,
+			   struct btrfs_chunk *chunk, int item_size)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_disk_key disk_key;
+	u32 array_size;
+	u8 *ptr;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+	/* both the disk key and the chunk are copied in, so both must fit */
+	if (array_size + sizeof(disk_key) + item_size >
+	    BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+		return -EFBIG;
+
+	ptr = super_copy->sys_chunk_array + array_size;
+	btrfs_cpu_key_to_disk(&disk_key, key);
+	memcpy(ptr, &disk_key, sizeof(disk_key));
+	ptr += sizeof(disk_key);
+	memcpy(ptr, chunk, item_size);
+	item_size += sizeof(disk_key);
+	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+	return 0;
+}
+
+/*
+ * allocate a new chunk: take the next logical start from the chunk
+ * tree, allocate a dev extent on the next device in the rotation,
+ * insert the chunk item into the chunk root and add the
+ * logical->physical mapping to the in-memory mapping tree.
+ *
+ * The chunk size is fixed at 1GB and a single stripe is used.  On
+ * success returns 0 with the logical start in *start and the size in
+ * *num_bytes; otherwise returns a negative errno.  Failures of the dev
+ * extent allocation, device update and chunk item insert are BUGs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type)
+{
+	u64 dev_offset;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_stripe *stripes;
+	struct btrfs_device *device = NULL;
+	struct btrfs_chunk *chunk;
+	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct extent_map_tree *em_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 physical = 0;
+	u64 calc_size = 1024 * 1024 * 1024;
+	int num_stripes;
+	int ret;
+	int index = 0;
+	struct btrfs_key key;
+
+
+	ret = find_next_chunk(chunk_root, &key.objectid);
+	if (ret)
+		return ret;
+
+	num_stripes = 1;
+	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	stripes = &chunk->stripe;
+
+	*num_bytes = calc_size;
+	while(index < num_stripes) {
+		/* rotate through the devices, remembering where we stopped */
+		device = next_device(dev_list, last_dev);
+		BUG_ON(!device);
+		last_dev = &device->dev_list;
+		extent_root->fs_info->last_device = last_dev;
+
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     key.objectid,
+					     calc_size, &dev_offset);
+		BUG_ON(ret);
+
+		device->bytes_used += calc_size;
+		ret = btrfs_update_device(trans, device);
+		BUG_ON(ret);
+
+		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
+		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		physical = dev_offset;
+		index++;
+	}
+
+	/* key.objectid was set above */
+	key.offset = *num_bytes;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_type(chunk, type);
+	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+				btrfs_chunk_item_size(num_stripes));
+	BUG_ON(ret);
+	*start = key.objectid;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		ret = -ENOMEM;
+		goto out_chunk;
+	}
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		ret = -ENOMEM;
+		goto out_em;
+	}
+
+	/* the bdev pointer of the extent map carries our map_lookup */
+	em->bdev = (struct block_device *)map;
+	em->start = key.objectid;
+	em->len = key.offset;
+	em->block_start = 0;
+
+	map->physical = physical;
+	map->dev = device;
+
+	if (!map->dev) {
+		kfree(map);
+		ret = -EIO;
+		goto out_em;
+	}
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&em_tree->lock);
+out_em:
+	/* drops our reference on em */
+	free_extent_map(em);
+out_chunk:
+	/* the temporary chunk item is freed on success and failure alike */
+	kfree(chunk);
+	return ret;
+}
+
+/* set up the extent map tree embedded in a mapping tree */
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map_tree *em_tree = &tree->map_tree;
+
+	extent_map_tree_init(em_tree, GFP_NOFS);
+}
+
+/*
+ * empty a mapping tree: every extent map is removed from the tree, the
+ * map_lookup hanging off its bdev pointer is freed and both references
+ * on the extent map are dropped
+ */
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map_tree *em_tree = &tree->map_tree;
+	struct extent_map *em;
+
+	for (;;) {
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, 0, (u64)-1);
+		if (em != NULL)
+			remove_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+
+		/* nothing left in the tree */
+		if (em == NULL)
+			return;
+
+		kfree(em->bdev);
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+}
+
+/*
+ * translate a logical byte offset through the mapping tree.  The
+ * incoming *length is used as the lookup length.  On return *phys holds
+ * the matching physical offset, *dev the device, and *length the number
+ * of bytes from logical to the end of the mapping.  A missing mapping
+ * is a BUG.  Always returns 0.
+ */
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct map_lookup *lookup;
+	struct extent_map *em;
+	u64 delta;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, *length);
+	BUG_ON(!em);
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+
+	/* the map_lookup is stored in the bdev pointer of the extent map */
+	lookup = (struct map_lookup *)em->bdev;
+	delta = logical - em->start;
+	*dev = lookup->dev;
+	*phys = lookup->physical + delta;
+	*length = em->len - delta;
+
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return 0;
+}
+
+/*
+ * remap a bio from its logical byte address to the device and physical
+ * offset recorded in the mapping tree, then submit it.  The whole bio
+ * has to fit inside a single mapping, anything else is a BUG.
+ * Always returns 0.
+ */
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+{
+	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_device *dev;
+	/* widen before shifting, sector_t can be narrower than 64 bits */
+	u64 logical = (u64)bio->bi_sector << 9;
+	u64 physical;
+	u64 length = 0;
+	u64 map_length;
+	struct bio_vec *bvec;
+	int i;
+
+	/* total number of bytes carried by the bio */
+	bio_for_each_segment(bvec, bio, i) {
+		length += bvec->bv_len;
+	}
+	map_tree = &root->fs_info->mapping_tree;
+	map_length = length;
+	btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	BUG_ON(map_length < length);
+	bio->bi_sector = physical >> 9;
+	bio->bi_bdev = dev->bdev;
+	submit_bio(rw, bio);
+	return 0;
+}
+
+/*
+ * look up a device by devid on the fs_info device list
+ * returns NULL when no device on the list has that id
+ */
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+{
+	struct list_head *head = &root->fs_info->devices;
+	struct list_head *pos;
+
+	for (pos = head->next; pos != head; pos = pos->next) {
+		struct btrfs_device *candidate;
+
+		candidate = list_entry(pos, struct btrfs_device, dev_list);
+		if (candidate->devid == devid)
+			return candidate;
+	}
+	return NULL;
+}
+
+/*
+ * add the logical->physical mapping for one chunk item to the in-memory
+ * mapping tree.  The chunk key gives the logical start (objectid) and
+ * length (offset); only stripe 0 of the chunk is used for the physical
+ * offset and device.
+ *
+ * returns 0 when the mapping was added or the logical start is already
+ * mapped, -ENOMEM on allocation failure, or -EIO when the device of the
+ * stripe is not on the device list
+ */
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+			  struct extent_buffer *leaf,
+			  struct btrfs_chunk *chunk)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 logical;
+	u64 length;
+	u64 devid;
+	int ret;
+
+	logical = key->objectid;
+	length = key->offset;
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+
+	/* already mapped? */
+	if (em && em->start <= logical && em->start + em->len > logical) {
+		free_extent_map(em);
+		spin_unlock(&map_tree->map_tree.lock);
+		return 0;
+	} else if (em) {
+		free_extent_map(em);
+	}
+	spin_unlock(&map_tree->map_tree.lock);
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	/* allocated exactly once; every field of map is assigned below */
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	/* the bdev pointer of the extent map carries our map_lookup */
+	em->bdev = (struct block_device *)map;
+	em->start = logical;
+	em->len = length;
+	em->block_start = 0;
+
+	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
+	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
+	map->dev = btrfs_find_device(root, devid);
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+
+	spin_lock(&map_tree->map_tree.lock);
+	ret = add_extent_mapping(&map_tree->map_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&map_tree->map_tree.lock);
+	/* drops our reference on em */
+	free_extent_map(em);
+
+	return 0;
+}
+
+static int fill_device_from_item(struct extent_buffer *leaf,
+				 struct btrfs_dev_item *dev_item,
+				 struct btrfs_device *device)
+{
+	unsigned long ptr;
+	char *name;
+
+	device->devid = btrfs_device_id(leaf, dev_item);
+	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+	device->type = btrfs_device_type(leaf, dev_item);
+	device->io_align = btrfs_device_io_align(leaf, dev_item);
+	device->io_width = btrfs_device_io_width(leaf, dev_item);
+	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	device->rdev = btrfs_device_rdev(leaf, dev_item);
+	device->partition = btrfs_device_partition(leaf, dev_item);
+	device->name_len = btrfs_device_name_len(leaf, dev_item);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+
+	name = kmalloc(device->name_len + 1, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+	device->name = name;
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	read_extent_buffer(leaf, name, ptr, device->name_len);
+	name[device->name_len] = '\0';
+	return 0;
+}
+
+static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+			struct extent_buffer *leaf,
+			struct btrfs_dev_item *dev_item)
+{
+	struct btrfs_device *device;
+	u64 devid;
+	int ret;
+
+	devid = btrfs_device_id(leaf, dev_item);
+	if (btrfs_find_device(root, devid))
+		return 0;
+
+	device = kmalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return -ENOMEM;
+
+	fill_device_from_item(leaf, dev_item, device);
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = root->fs_info->sb->s_bdev;
+	list_add(&device->dev_list, &root->fs_info->devices);
+	memcpy(&device->dev_key, key, sizeof(*key));
+	ret = 0;
+#if 0
+	ret = btrfs_open_device(device);
+	if (ret) {
+		kfree(device);
+	}
+#endif
+	return ret;
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	u32 cur;
+	int ret;
+	int dev_only = 1;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	/*
+	 * we do this loop twice, once for the device items and
+	 * once for all of the chunks.  This way there are device
+	 * structs filled in for every chunk
+	 */
+again:
+	ptr = super_copy->sys_chunk_array;
+	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
+		    key.type == BTRFS_DEV_ITEM_KEY) {
+			dev_item = (struct btrfs_dev_item *)sb_ptr;
+			if (dev_only) {
+				ret = read_one_dev(root, &key, sb, dev_item);
+				BUG_ON(ret);
+			}
+			len = sizeof(*dev_item);
+			len += btrfs_device_name_len(sb, dev_item);
+		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+
+			chunk = (struct btrfs_chunk *)sb_ptr;
+			if (!dev_only) {
+				ret = read_one_chunk(root, &key, sb, chunk);
+				BUG_ON(ret);
+			}
+			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+			len = btrfs_chunk_item_size(num_stripes);
+		} else {
+			BUG();
+		}
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+	}
+	if (dev_only == 1) {
+		dev_only = 0;
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * Load the chunk tree in two passes: every device item first, then every
+ * chunk item, so chunk mappings can reference all the devices that were
+ * found.  Returns 0 or a negative errno; the path is freed on every exit.
+ */
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key, found_key;
+	int ret, slot;
+
+	root = root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	while(1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+				break;
+			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+				struct btrfs_dev_item *dev_item;
+				dev_item = btrfs_item_ptr(leaf, slot,
+						  struct btrfs_dev_item);
+				ret = read_one_dev(root, &found_key, leaf,
+						   dev_item);
+				BUG_ON(ret);
+			}
+		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+			struct btrfs_chunk *chunk;
+			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
+		}
+		path->slots[0]++;
+	}
+	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+		key.objectid = 0;	/* pass two: chunk items */
+		btrfs_release_path(root, path);
+		goto again;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 00000000000..4a47dcb00a7
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+struct btrfs_device {
+	struct list_head dev_list;
+	struct btrfs_root *dev_root;
+	struct btrfs_key dev_key;
+
+	struct block_device *bdev;
+
+	/* the internal btrfs device id */
+	u64 devid;
+
+	/* size of the device */
+	u64 total_bytes;
+
+	/* bytes used */
+	u64 bytes_used;
+
+	/* optimal io alignment for this device */
+	u32 io_align;
+
+	/* optimal io width for this device */
+	u32 io_width;
+
+	/* minimal io size for this device */
+	u32 sector_size;
+
+	/* the kernel device number */
+	u64 rdev;
+
+	/* type and info about this device */
+	u64 type;
+
+	/* partition number, 0 for whole dev */
+	int partition;
+
+	/* length of the name data at the end of the item */
+	int name_len;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_DEV_UUID_SIZE];
+
+	char *name;
+};
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+#endif
-- 
cgit v1.2.3


From 6324fbf334f4586325057197da7752f4ffa409d3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:59 -0400
Subject: Btrfs: Dynamic chunk and block group allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  12 ++-
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 208 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/inode.c       |   9 ++-
 fs/btrfs/volumes.c     |  87 +++++++++++++--------
 fs/btrfs/volumes.h     |   2 +-
 6 files changed, 263 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1453d995fef..a877105f5c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -431,9 +431,19 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+struct btrfs_space_info {
+	u64 flags;
+	u64 total_bytes;
+	u64 bytes_used;
+	u64 bytes_pinned;
+	int full;
+	struct list_head list;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct btrfs_space_info *space_info;
 	u64 pinned;
 	u64 flags;
 	int cached;
@@ -490,7 +500,7 @@ struct btrfs_fs_info {
 	struct list_head dirty_cowonly_roots;
 
 	struct list_head devices;
-	struct list_head *last_device;
+	struct list_head space_info;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e37fa120cc..2a239ae49f7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -700,8 +700,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->dev_root = dev_root;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->devices);
+	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	fs_info->last_device = &fs_info->devices;
 	fs_info->sb = sb;
 	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cd957d6e8d..15082b1087b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -36,6 +36,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size);
 
 
 static int cache_block_group(struct btrfs_root *root,
@@ -168,16 +172,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
-	if ((bits & BLOCK_GROUP_DATA) &&
-	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))
-		return 1;
-	if ((bits & BLOCK_GROUP_METADATA) &&
-	     (cache->flags & BTRFS_BLOCK_GROUP_METADATA))
-		return 1;
-	if ((bits & BLOCK_GROUP_SYSTEM) &&
-	     (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
-		return 1;
-	return 0;
+	return (cache->flags & bits);
 }
 
 static int noinline find_search_start(struct btrfs_root *root,
@@ -276,6 +271,18 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
+static int block_group_state_bits(u64 flags)
+{
+	int bits = 0;
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		bits |= BLOCK_GROUP_DATA;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		bits |= BLOCK_GROUP_METADATA;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		bits |= BLOCK_GROUP_SYSTEM;
+	return bits;
+}
+
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
@@ -304,7 +311,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (!owner)
 		factor = 8;
 
-	bit = data;
+	bit = block_group_state_bits(data);
 
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
@@ -358,10 +365,15 @@ again:
 			free_check = cache->key.offset;
 		else
 			free_check = div_factor(cache->key.offset, factor);
+
 		if (used + cache->pinned < free_check) {
 			found_group = cache;
 			goto found;
 		}
+		if (full_search) {
+			printk("failed on cache %Lu used %Lu total %Lu\n",
+			       cache->key.objectid, used, cache->key.offset);
+		}
 		cond_resched();
 	}
 	if (!full_search) {
@@ -983,6 +995,58 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+						  u64 flags)
+{
+	struct list_head *head = &info->space_info;
+	struct list_head *cur;
+	struct btrfs_space_info *found;
+	list_for_each(cur, head) {
+		found = list_entry(cur, struct btrfs_space_info, list);
+		if (found->flags == flags)
+			return found;
+	}
+	return NULL;
+
+}
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags)
+{
+	struct btrfs_space_info *space_info;
+	u64 thresh;
+	u64 start;
+	u64 num_bytes;
+	int ret;
+
+	space_info = __find_space_info(extent_root->fs_info, flags);
+	BUG_ON(!space_info);
+
+	if (space_info->full)
+		return 0;
+
+	thresh = div_factor(space_info->total_bytes, 7);
+	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
+	    thresh)
+		return 0;
+
+	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
+	if (ret == -ENOSPC) {
+printk("space info full %Lu\n", flags);
+		space_info->full = 1;
+		return 0;
+	}
+
+	BUG_ON(ret);
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
+		     extent_root->fs_info->chunk_root->root_key.objectid,
+		     start, num_bytes);
+	BUG_ON(ret);
+	return 0;
+}
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
@@ -1012,8 +1076,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
 			old_val += num_bytes;
+			cache->space_info->bytes_used += num_bytes;
 		} else {
 			old_val -= num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
 			if (mark_free) {
 				set_extent_dirty(&info->free_space_cache,
 						 bytenr, bytenr + num_bytes - 1,
@@ -1026,6 +1092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 }
+
 static int update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1047,9 +1114,11 @@ static int update_pinned_extents(struct btrfs_root *root,
 			  (bytenr - cache->key.objectid));
 		if (pin) {
 			cache->pinned += len;
+			cache->space_info->bytes_pinned += len;
 			fs_info->total_pinned += len;
 		} else {
 			cache->pinned -= len;
+			cache->space_info->bytes_pinned -= len;
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1472,7 +1541,7 @@ check_failed:
 		goto new_group;
 	}
 
-	if (!(data & BLOCK_GROUP_DATA)) {
+	if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
 		block_group = btrfs_lookup_block_group(info, ins->objectid);
 		if (block_group)
 			trans->block_group = block_group;
@@ -1532,12 +1601,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
-	if (data)
-		data = BLOCK_GROUP_DATA;
-	else if (root == root->fs_info->chunk_root)
-		data = BLOCK_GROUP_SYSTEM;
-	else
-		data = BLOCK_GROUP_METADATA;
+	if (data) {
+		data = BTRFS_BLOCK_GROUP_DATA;
+	} else if (root == root->fs_info->chunk_root) {
+		data = BTRFS_BLOCK_GROUP_SYSTEM;
+	} else {
+		data = BTRFS_BLOCK_GROUP_METADATA;
+	}
+
+	if (root->ref_cows) {
+		if (data != BTRFS_BLOCK_GROUP_METADATA) {
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     num_bytes,
+					     BTRFS_BLOCK_GROUP_METADATA);
+			BUG_ON(ret);
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     num_bytes, data);
+		BUG_ON(ret);
+	}
 
 	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
@@ -2490,6 +2572,34 @@ error:
 	return ret;
 }
 
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *found;
+
+	found = __find_space_info(info, flags);
+	if (found) {
+		found->total_bytes += total_bytes;
+		found->bytes_used += bytes_used;
+		WARN_ON(found->total_bytes < found->bytes_used);
+		*space_info = found;
+		return 0;
+	}
+	found = kmalloc(sizeof(*found), GFP_NOFS);
+	if (!found)
+		return -ENOMEM;
+
+	list_add(&found->list, &info->space_info);
+	found->flags = flags;
+	found->total_bytes = total_bytes;
+	found->bytes_used = bytes_used;
+	found->bytes_pinned = 0;
+	found->full = 0;
+	*space_info = found;
+	return 0;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -2497,6 +2607,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *space_info;
 	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -2547,6 +2658,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			bit = BLOCK_GROUP_METADATA;
 		}
 
+		ret = update_space_info(info, cache->flags, found_key.offset,
+					btrfs_block_group_used(&cache->item),
+					&space_info);
+		BUG_ON(ret);
+		cache->space_info = space_info;
+
 		/* use EXTENT_LOCKED to prevent merging */
 		set_extent_bits(block_group_cache, found_key.objectid,
 				found_key.objectid + found_key.offset - 1,
@@ -2563,3 +2680,58 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size)
+{
+	int ret;
+	int bit = 0;
+	struct btrfs_root *extent_root;
+	struct btrfs_block_group_cache *cache;
+	struct extent_io_tree *block_group_cache;
+
+	extent_root = root->fs_info->extent_root;
+	block_group_cache = &root->fs_info->block_group_cache;
+
+	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	BUG_ON(!cache);
+	cache->key.objectid = chunk_objectid;
+	cache->key.offset = size;
+	cache->cached = 0;
+	cache->pinned = 0;
+	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	memset(&cache->item, 0, sizeof(cache->item));
+	btrfs_set_block_group_used(&cache->item, bytes_used);
+	btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree);
+	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+	cache->flags = type;
+	btrfs_set_block_group_flags(&cache->item, type);
+
+	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+				&cache->space_info);
+	BUG_ON(ret);
+
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		bit = BLOCK_GROUP_DATA;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		bit = BLOCK_GROUP_SYSTEM;
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		bit = BLOCK_GROUP_METADATA;
+	}
+	set_extent_bits(block_group_cache, chunk_objectid,
+			chunk_objectid + size - 1,
+			bit | EXTENT_LOCKED, GFP_NOFS);
+	set_state_private(block_group_cache, chunk_objectid,
+			  (unsigned long)cache);
+
+	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
+				sizeof(cache->item));
+	BUG_ON(ret);
+
+	finish_current_insert(trans, extent_root);
+	ret = del_pending_extents(trans, extent_root);
+	BUG_ON(ret);
+	return 0;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17063cd2cb7..109576b57f6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1527,6 +1527,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
+	struct btrfs_block_group_cache *new_inode_group;
 	struct btrfs_key *location;
 	struct btrfs_path *path;
 	struct btrfs_inode_ref *ref;
@@ -1553,9 +1554,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	group = btrfs_find_block_group(root, group, 0,
+	new_inode_group = btrfs_find_block_group(root, group, 0,
 				       BTRFS_BLOCK_GROUP_METADATA, owner);
-	BTRFS_I(inode)->block_group = group;
+	if (!new_inode_group) {
+		printk("find_block group failed\n");
+		new_inode_group = group;
+	}
+	BTRFS_I(inode)->block_group = new_inode_group;
 	BTRFS_I(inode)->flags = 0;
 
 	key[0].objectid = objectid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 90a8d45dc6d..a52a13f365d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -131,7 +131,7 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
-	if (*start + num_bytes >= search_end) {
+	if (*start + num_bytes > search_end) {
 		ret = -ENOSPC;
 		goto error;
 	}
@@ -159,8 +159,9 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret)
+	if (ret) {
 		goto err;
+	}
 
 	key.objectid = device->devid;
 	key.offset = *start;
@@ -214,22 +215,6 @@ error:
 	return ret;
 }
 
-static struct btrfs_device *next_device(struct list_head *head,
-					struct list_head *last)
-{
-	struct list_head *next = last->next;
-	struct btrfs_device *dev;
-
-	if (list_empty(head))
-		return NULL;
-
-	if (next == head)
-		next = next->next;
-
-	dev = list_entry(next, struct btrfs_device, dev_list);
-	return dev;
-}
-
 static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
 			   u64 *objectid)
 {
@@ -397,31 +382,63 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u32 type)
+		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
+	struct list_head private_devs;
 	struct list_head *dev_list = &extent_root->fs_info->devices;
-	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	int num_stripes;
+	u64 avail;
+	u64 max_avail = 0;
+	int num_stripes = 1;
+	int looped = 0;
 	int ret;
-	int index = 0;
+	int index;
 	struct btrfs_key key;
 
+	if (list_empty(dev_list))
+		return -ENOSPC;
+again:
+	INIT_LIST_HEAD(&private_devs);
+	cur = dev_list->next;
+	index = 0;
+	/* build a private list of devices we will allocate from */
+	while(index < num_stripes) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		avail = device->total_bytes - device->bytes_used;
+		cur = cur->next;
+		if (avail > max_avail)
+			max_avail = avail;
+		if (avail >= calc_size) {
+			list_move_tail(&device->dev_list, &private_devs);
+			index++;
+		}
+		if (cur == dev_list)
+			break;
+	}
+	if (index < num_stripes) {
+		list_splice(&private_devs, dev_list);
+		if (!looped && max_avail > 0) {
+			looped = 1;
+			calc_size = max_avail;
+			goto again;
+		}
+		return -ENOSPC;
+	}
 
 	ret = find_next_chunk(chunk_root, &key.objectid);
 	if (ret)
 		return ret;
 
-	num_stripes = 1;
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
 	if (!chunk)
 		return -ENOMEM;
@@ -429,11 +446,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripes = &chunk->stripe;
 
 	*num_bytes = calc_size;
+	index = 0;
 	while(index < num_stripes) {
-		device = next_device(dev_list, last_dev);
-		BUG_ON(!device);
-		last_dev = &device->dev_list;
-		extent_root->fs_info->last_device = last_dev;
+		BUG_ON(list_empty(&private_devs));
+		cur = private_devs.next;
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -449,6 +467,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		physical = dev_offset;
 		index++;
 	}
+	BUG_ON(!list_empty(&private_devs));
 
 	/* key.objectid was set above */
 	key.offset = *num_bytes;
@@ -692,17 +711,17 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 
 	devid = btrfs_device_id(leaf, dev_item);
-	if (btrfs_find_device(root, devid))
-		return 0;
-
-	device = kmalloc(sizeof(*device), GFP_NOFS);
-	if (!device)
-		return -ENOMEM;
+	device = btrfs_find_device(root, devid);
+	if (!device) {
+		device = kmalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			return -ENOMEM;
+		list_add(&device->dev_list, &root->fs_info->devices);
+	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	list_add(&device->dev_list, &root->fs_info->devices);
 	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4a47dcb00a7..4ab9891878a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -71,7 +71,7 @@ int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u32 type);
+		      u64 *num_bytes, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
-- 
cgit v1.2.3


From 7d1660d411a24f9b5a2f38e41c9896d7151c934f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:03 -0400
Subject: Btrfs: Bring back find_free_extent CPU usage optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 15082b1087b..5e392804ab1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -182,9 +182,9 @@ static int noinline find_search_start(struct btrfs_root *root,
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	struct extent_io_tree *free_space_cache;
+	struct extent_state *state;
 	u64 last;
 	u64 start = 0;
-	u64 end = 0;
 	u64 cache_miss = 0;
 	u64 total_fs_bytes;
 	u64 search_start = *start_ret;
@@ -205,30 +205,34 @@ again:
 		goto new_group;
 	}
 
+	spin_lock_irq(&free_space_cache->lock);
+	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
 	while(1) {
-		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
-					    last, &start, &end, EXTENT_DIRTY);
-		if (ret) {
+		if (!state) {
 			if (!cache_miss)
 				cache_miss = last;
+			spin_unlock_irq(&free_space_cache->lock);
 			goto new_group;
 		}
 
-		start = max(last, start);
-		last = end + 1;
+		start = max(last, state->start);
+		last = state->end + 1;
 		if (last - start < num) {
 			if (last == cache->key.objectid + cache->key.offset)
 				cache_miss = start;
+			do {
+				state = extent_state_next(state);
+			} while(state && !(state->state & EXTENT_DIRTY));
 			continue;
 		}
+		spin_unlock_irq(&free_space_cache->lock);
 		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		if (start + num  > total_fs_bytes)
 			goto new_group;
 		*start_ret = start;
 		return 0;
-	}
-out:
+	} out:
 	cache = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!cache) {
 		printk("Unable to find block group for %Lu\n", search_start);
-- 
cgit v1.2.3


From a9218f6b00ada101c9772ee9c44af04fa5125d2a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:04 -0400
Subject: Add /dev/btrfs-control for device scanning ioctls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  4 ----
 fs/btrfs/super.c       | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e392804ab1..14eb8fc8701 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -374,10 +374,6 @@ again:
 			found_group = cache;
 			goto found;
 		}
-		if (full_search) {
-			printk("failed on cache %Lu used %Lu total %Lu\n",
-			       cache->key.objectid, used, cache->key.offset);
-		}
 		cond_resched();
 	}
 	if (!full_search) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4423a91206a..67ed216df47 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -36,6 +36,7 @@
 #include <linux/parser.h>
 #include <linux/ctype.h>
 #include <linux/namei.h>
+#include <linux/miscdevice.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -444,6 +445,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	printk("btrfs control ioctl %d\n", cmd);
+	return 0;
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -451,6 +459,7 @@ static struct file_system_type btrfs_fs_type = {
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
+
 static void btrfs_write_super_lockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
@@ -482,6 +491,30 @@ static struct super_operations btrfs_super_ops = {
 	.write_super_lockfs = btrfs_write_super_lockfs,
 	.unlockfs	= btrfs_unlockfs,
 };
+
+static const struct file_operations btrfs_ctl_fops = {
+	.unlocked_ioctl	 = btrfs_control_ioctl,
+	.compat_ioctl = btrfs_control_ioctl,
+	.owner	 = THIS_MODULE,
+};
+
+static struct miscdevice btrfs_misc = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "btrfs-control",
+	.fops		= &btrfs_ctl_fops
+};
+
+static int btrfs_interface_init(void)
+{
+	return misc_register(&btrfs_misc);
+}
+
+void btrfs_interface_exit(void)
+{
+	if (misc_deregister(&btrfs_misc) < 0)
+		printk("misc_deregister failed for control device");
+}
+
 static int __init init_btrfs_fs(void)
 {
 	int err;
@@ -503,11 +536,16 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_extent_io;
 
-	err = register_filesystem(&btrfs_fs_type);
+	err = btrfs_interface_init();
 	if (err)
 		goto free_extent_map;
+	err = register_filesystem(&btrfs_fs_type);
+	if (err)
+		goto unregister_ioctl;
 	return 0;
 
+unregister_ioctl:
+	btrfs_interface_exit();
 free_extent_map:
 	extent_map_exit();
 free_extent_io:
@@ -526,6 +564,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_destroy_cachep();
 	extent_map_exit();
 	extent_io_exit();
+	btrfs_interface_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 }
-- 
cgit v1.2.3


From e085def2c4cc2d7c0c316376b4b66b86b10e3a4b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Make the FS tree the last objectid in the tree of tree roots

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a877105f5c4..285fb7e4610 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -47,24 +47,23 @@ extern struct kmem_cache *btrfs_path_cachep;
 /* stores information about which extents are in use, and reference counts */
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
 
-/* one per subvolume, storing files and directories */
-#define BTRFS_FS_TREE_OBJECTID 3ULL
-
-/* directory objectid inside the root tree */
-#define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
-
-
 /*
  * chunk tree stores translations from logical -> physical block numbering
  * the super block points to the chunk tree
  */
-#define BTRFS_CHUNK_TREE_OBJECTID 5ULL
+#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
 
 /*
  * stores information about which areas of a given device are in use.
  * one per device.  The tree of tree roots points to the device tree
  */
-#define BTRFS_DEV_TREE_OBJECTID 6ULL
+#define BTRFS_DEV_TREE_OBJECTID 4ULL
+
+/* one per subvolume, storing files and directories */
+#define BTRFS_FS_TREE_OBJECTID 5ULL
+
+/* directory objectid inside the root tree */
+#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
 /*
  * All files have objectids higher than this.
-- 
cgit v1.2.3


From 0d81ba5dbedef0c3970d6aa318aa84920943e6e3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Move device information into the super block so it can be
 scanned

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      | 21 ++----------------
 fs/btrfs/disk-io.c    |  4 +++-
 fs/btrfs/print-tree.c | 15 ++-----------
 fs/btrfs/volumes.c    | 61 +++++++++++++--------------------------------------
 fs/btrfs/volumes.h    | 13 +----------
 5 files changed, 23 insertions(+), 91 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 285fb7e4610..96a49321786 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -150,21 +150,11 @@ struct btrfs_dev_item {
 	/* minimal io size for this device */
 	__le32 sector_size;
 
-	/* the kernel device number */
-	__le64 rdev;
-
 	/* type and info about this device */
 	__le64 type;
 
-	/* partition number, 0 for whole dev */
-	__le32 partition;
-
-	/* length of the name data at the end of the item */
-	__le16 name_len;
-
-	/* physical drive uuid (or lvm uuid) */
+	/* btrfs generated uuid for this device */
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
-	/* name goes here */
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
@@ -255,6 +245,7 @@ struct btrfs_super_block {
 	__le32 sys_chunk_array_size;
 	u8 root_level;
 	u8 chunk_root_level;
+	struct btrfs_dev_item dev_item;
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -685,20 +676,12 @@ BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64);
-BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32);
-BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
-static inline char *btrfs_device_name(struct btrfs_dev_item *d)
-{
-	return (char *)(d + 1);
-}
-
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
 BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2a239ae49f7..26185d46712 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -370,7 +370,6 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 		next = list->next;
 		list_del(next);
 		device = list_entry(next, struct btrfs_device, dev_list);
-		kfree(device->name);
 		kfree(device);
 	}
 	return 0;
@@ -800,6 +799,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 
 	mutex_lock(&fs_info->fs_mutex);
+	ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer);
+	BUG_ON(ret);
+
 	ret = btrfs_read_sys_array(tree_root);
 	BUG_ON(ret);
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9c1335dad40..ee0de112cf5 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -37,22 +37,11 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 static void print_dev_item(struct extent_buffer *eb,
 			   struct btrfs_dev_item *dev_item)
 {
-	char *name;
-	int name_len;
-
-	name_len = btrfs_device_name_len(eb, dev_item);
-	name = kmalloc(name_len, GFP_NOFS);
-	if (name) {
-		read_extent_buffer(eb, name,
-				   (unsigned long)btrfs_device_name(dev_item),
-				   name_len);
-	}
-	printk("\t\tdev item name %.*s devid %llu "
-	       "total_bytes %llu bytes used %Lu\n", name_len, name,
+	printk("\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %Lu\n",
 	       (unsigned long long)btrfs_device_id(eb, dev_item),
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
-	kfree(name);
 }
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a52a13f365d..ae22d01ecf5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	key.offset = free_devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      sizeof(*dev_item) + device->name_len);
+				      sizeof(*dev_item));
 	if (ret)
 		goto out;
 
@@ -290,15 +290,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
-	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	write_extent_buffer(leaf, device->name, ptr, device->name_len);
-
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
@@ -345,8 +339,6 @@ int btrfs_update_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 	btrfs_mark_buffer_dirty(leaf);
@@ -676,7 +668,6 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 				 struct btrfs_device *device)
 {
 	unsigned long ptr;
-	char *name;
 
 	device->devid = btrfs_device_id(leaf, dev_item);
 	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
@@ -685,24 +676,14 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
-	device->rdev = btrfs_device_rdev(leaf, dev_item);
-	device->partition = btrfs_device_partition(leaf, dev_item);
-	device->name_len = btrfs_device_name_len(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 
-	name = kmalloc(device->name_len + 1, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
-	device->name = name;
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	read_extent_buffer(leaf, name, ptr, device->name_len);
-	name[device->name_len] = '\0';
 	return 0;
 }
 
-static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
 {
@@ -722,7 +703,6 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -733,12 +713,20 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	return ret;
 }
 
+/* Read the dev_item embedded in the super block; buf should hold the super. */
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+	/* not a real pointer: the dev_item's byte offset inside the super */
+	unsigned long off = offsetof(struct btrfs_super_block, dev_item);
+
+	return read_one_dev(root, buf, (struct btrfs_dev_item *)off);
+}
+
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
-	struct btrfs_dev_item *dev_item;
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	u32 num_stripes;
@@ -748,7 +736,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	unsigned long sb_ptr;
 	u32 cur;
 	int ret;
-	int dev_only = 1;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -757,7 +744,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	 * once for all of the chunks.  This way there are device
 	 * structs filled in for every chunk
 	 */
-again:
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
@@ -771,22 +757,10 @@ again:
 		sb_ptr += len;
 		cur += len;
 
-		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
-		    key.type == BTRFS_DEV_ITEM_KEY) {
-			dev_item = (struct btrfs_dev_item *)sb_ptr;
-			if (dev_only) {
-				ret = read_one_dev(root, &key, sb, dev_item);
-				BUG_ON(ret);
-			}
-			len = sizeof(*dev_item);
-			len += btrfs_device_name_len(sb, dev_item);
-		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
-
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
-			if (!dev_only) {
-				ret = read_one_chunk(root, &key, sb, chunk);
-				BUG_ON(ret);
-			}
+			ret = read_one_chunk(root, &key, sb, chunk);
+			BUG_ON(ret);
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
@@ -796,10 +770,6 @@ again:
 		sb_ptr += len;
 		cur += len;
 	}
-	if (dev_only == 1) {
-		dev_only = 0;
-		goto again;
-	}
 	return 0;
 }
 
@@ -846,8 +816,7 @@ again:
 				struct btrfs_dev_item *dev_item;
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-				ret = read_one_dev(root, &found_key, leaf,
-						   dev_item);
+				ret = read_one_dev(root, leaf, dev_item);
 				BUG_ON(ret);
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4ab9891878a..77fa6efd79c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,7 +21,6 @@
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
-	struct btrfs_key dev_key;
 
 	struct block_device *bdev;
 
@@ -43,22 +42,11 @@ struct btrfs_device {
 	/* minimal io size for this device */
 	u32 sector_size;
 
-	/* the kernel device number */
-	u64 rdev;
-
 	/* type and info about this device */
 	u64 type;
 
-	/* partition number, 0 for whole dev */
-	int partition;
-
-	/* length of the name data at the end of the item */
-	int name_len;
-
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
-
-	char *name;
 };
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -75,4 +63,5 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 #endif
-- 
cgit v1.2.3


From 239b14b32dc39232ebf9cce29ff77c4c564355fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Bring back mount -o ssd optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/extent_io.c   |  2 ++
 fs/btrfs/extent_io.h   |  2 ++
 fs/btrfs/inode.c       | 29 +++++++++++++++++++++++++
 fs/btrfs/volumes.c     |  5 +++++
 fs/btrfs/volumes.h     |  3 +++
 8 files changed, 103 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96a49321786..acf22ad6115 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1405,6 +1405,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio);
+
 static inline void dec_i_blocks(struct inode *inode, u64 dec)
 {
 	dec = dec >> 9;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 26185d46712..4890151cd68 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1103,4 +1103,6 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
+	/* note we're sharing with inode.c for the merge bio hook */
+	.merge_bio_hook = btrfs_merge_bio_hook,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 14eb8fc8701..e9ef644ff56 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1473,13 +1473,31 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
+	u64 *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	int empty_cluster = 2 * 1024 * 1024;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
+	if (data & BTRFS_BLOCK_GROUP_METADATA) {
+		last_ptr = &root->fs_info->last_alloc;
+	}
+
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->last_data_alloc;
+	}
+
+	if (last_ptr) {
+		if (*last_ptr)
+			hint_byte = *last_ptr;
+		else {
+			empty_size += empty_cluster;
+		}
+	}
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
@@ -1489,11 +1507,14 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			hint_byte = search_start;
 		block_group = btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
+		if (last_ptr && *last_ptr == 0 && block_group)
+			hint_byte = block_group->key.objectid;
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group,
 						     search_start, data, 1);
 	}
+	search_start = max(search_start, hint_byte);
 
 	total_needed += empty_size;
 
@@ -1506,9 +1527,36 @@ check_failed:
 	}
 	ret = find_search_start(root, &block_group, &search_start,
 				total_needed, data);
+	if (ret == -ENOSPC && last_ptr && *last_ptr) {
+		*last_ptr = 0;
+		block_group = btrfs_lookup_block_group(info,
+						       orig_search_start);
+		search_start = orig_search_start;
+		ret = find_search_start(root, &block_group, &search_start,
+					total_needed, data);
+	}
+	if (ret == -ENOSPC)
+		goto enospc;
 	if (ret)
 		goto error;
 
+	if (last_ptr && *last_ptr && search_start != *last_ptr) {
+		*last_ptr = 0;
+		if (!empty_size) {
+			empty_size += empty_cluster;
+			total_needed += empty_size;
+		}
+		block_group = btrfs_lookup_block_group(info,
+						       orig_search_start);
+		search_start = orig_search_start;
+		ret = find_search_start(root, &block_group,
+					&search_start, total_needed, data);
+		if (ret == -ENOSPC)
+			goto enospc;
+		if (ret)
+			goto error;
+	}
+
 	search_start = stripe_align(root, search_start);
 	ins->objectid = search_start;
 	ins->offset = num_bytes;
@@ -1547,6 +1595,13 @@ check_failed:
 			trans->block_group = block_group;
 	}
 	ins->offset = num_bytes;
+	if (last_ptr) {
+		*last_ptr = ins->objectid + ins->offset;
+		if (*last_ptr ==
+		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
+			*last_ptr = 0;
+		}
+	}
 	return 0;
 
 new_group:
@@ -1612,12 +1667,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (root->ref_cows) {
 		if (data != BTRFS_BLOCK_GROUP_METADATA) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     num_bytes,
+					     2 * 1024 * 1024,
 					     BTRFS_BLOCK_GROUP_METADATA);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes, data);
+				     num_bytes + 2 * 1024 * 1024, data);
 		BUG_ON(ret);
 	}
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7e3a1ebde9f..6dab664529c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1730,6 +1730,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
 		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		    (tree->ops && tree->ops->merge_bio_hook &&
+		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
 		    bio_add_page(bio, page, size, offset) < size) {
 			ret = submit_one_bio(rw, bio);
 			bio = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0dca89328f9..8b5319db251 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,6 +29,8 @@ struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
+	int (*merge_bio_hook)(struct page *page, unsigned long offset,
+			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 109576b57f6..5140d680184 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -296,6 +296,34 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+/* extent_io merge hook: nonzero means "do not add this page to bio". */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio)
+{
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct btrfs_device *dev;
+	/* widen before shifting; bi_sector's type may be narrower than u64 */
+	u64 logical = (u64)bio->bi_sector << 9;
+	u64 physical = 0, length = 0, map_length;
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment(bvec, bio, i)
+		length += bvec->bv_len;
+	map_length = length;
+	/* nonzero return (presumably an error; confirm) also blocks merging */
+	if (btrfs_map_block(map_tree, logical, &physical, &map_length, &dev) ||
+	    map_length < length + size) {
+		printk("merge bio hook logical %Lu bio len %Lu physical %Lu "
+		       "len %Lu\n", (unsigned long long)logical,
+		       (unsigned long long)length, (unsigned long long)physical,
+		       (unsigned long long)map_length);
+		return 1;
+	}
+	return 0;
+}
+
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3033,6 +3061,7 @@ static struct file_operations btrfs_dir_file_operations = {
 static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
+	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ae22d01ecf5..16fb6bbe6e2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -578,6 +578,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+		       "len %Lu\n", logical, length, physical, map_length);
+		BUG();
+	}
 	BUG_ON(map_length < length);
 	bio->bi_sector = physical >> 9;
 	bio->bi_bdev = dev->bdev;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 77fa6efd79c..20259128152 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -64,4 +64,7 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev);
 #endif
-- 
cgit v1.2.3


From 8a4b83cc8bd75fca29ac68615896d9e92820e7c2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Add support for device scanning and detection ioctls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  21 ++++-
 fs/btrfs/disk-io.c |  24 ++++--
 fs/btrfs/disk-io.h |   4 +-
 fs/btrfs/ioctl.h   |   6 +-
 fs/btrfs/super.c   |  61 ++++++++++----
 fs/btrfs/volumes.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/volumes.h |  25 ++++++
 7 files changed, 333 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index acf22ad6115..7556f8319c6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,7 +37,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
-#define BTRFS_MAGIC "_B4RfS_M"
+#define BTRFS_MAGIC "_B5RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -238,6 +238,7 @@ struct btrfs_super_block {
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
+	__le64 num_devices;
 	__le32 sectorsize;
 	__le32 nodesize;
 	__le32 leafsize;
@@ -440,6 +441,7 @@ struct btrfs_block_group_cache {
 };
 
 struct btrfs_device;
+struct btrfs_fs_devices;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
@@ -489,7 +491,7 @@ struct btrfs_fs_info {
 	u64 total_pinned;
 	struct list_head dirty_cowonly_roots;
 
-	struct list_head devices;
+	struct btrfs_fs_devices *fs_devices;
 	struct list_head space_info;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
@@ -677,6 +679,19 @@ BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+			 total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+			 bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
@@ -1106,6 +1121,8 @@ BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
 			 stripesize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
 			 root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+			 num_devices, 64);
 
 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4890151cd68..f971a29e4f2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -365,12 +365,12 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	struct list_head *next;
 	struct btrfs_device *device;
 
-	list = &fs_info->devices;
-	while(!list_empty(list)) {
-		next = list->next;
-		list_del(next);
+	list = &fs_info->fs_devices->devices;
+	list_for_each(next, list) {
 		device = list_entry(next, struct btrfs_device, dev_list);
-		kfree(device);
+		if (device->bdev && device->bdev != fs_info->sb->s_bdev)
+			close_bdev_excl(device->bdev);
+		device->bdev = NULL;
 	}
 	return 0;
 }
@@ -655,7 +655,8 @@ static int add_hasher(struct btrfs_fs_info *info, char *type) {
 	return 0;
 }
 #endif
-struct btrfs_root *open_ctree(struct super_block *sb)
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct btrfs_fs_devices *fs_devices)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -697,8 +698,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
 	fs_info->dev_root = dev_root;
+	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
-	INIT_LIST_HEAD(&fs_info->devices);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	fs_info->sb = sb;
@@ -779,6 +780,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
+	if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) {
+		printk("Btrfs: wanted %llu devices, but found %llu\n",
+		       (unsigned long long)btrfs_super_num_devices(disk_super),
+		       (unsigned long long)fs_devices->num_devices);
+		goto fail_sb_buffer;
+	}
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
@@ -799,8 +806,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 
 	mutex_lock(&fs_info->fs_mutex);
-	ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer);
-	BUG_ON(ret);
 
 	ret = btrfs_read_sys_array(tree_root);
 	BUG_ON(ret);
@@ -859,6 +864,7 @@ fail_sb_buffer:
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
+	close_all_devices(fs_info);
 	kfree(extent_root);
 	kfree(tree_root);
 	kfree(fs_info);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 206cb48638f..b7cbc58a555 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,6 +21,7 @@
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 struct btrfs_device;
+struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize);
@@ -29,7 +30,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb);
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct btrfs_fs_devices *fs_devices);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8c6290665d4..4551e82013c 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,8 +22,10 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_PATH_NAME_MAX 4095
+
 struct btrfs_ioctl_vol_args {
-	char name[BTRFS_VOL_NAME_MAX + 1];
+	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
@@ -32,4 +34,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67ed216df47..9624923a33d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -44,6 +44,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "xattr.h"
+#include "volumes.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -216,7 +217,9 @@ static int parse_options (char * options,
 	return 1;
 }
 
-static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
+static int btrfs_fill_super(struct super_block * sb,
+			    struct btrfs_fs_devices *fs_devices,
+			    void * data, int silent)
 {
 	struct inode * inode;
 	struct dentry * root_dentry;
@@ -231,7 +234,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 
-	tree_root = open_ctree(sb);
+	tree_root = open_ctree(sb, fs_devices);
 
 	if (!tree_root || IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
@@ -334,18 +337,23 @@ static int test_bdev_super(struct super_block *s, void *data)
 
 int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt, const char *subvol)
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
 	struct dentry *root;
+	struct btrfs_fs_devices *fs_devices = NULL;
 	int error = 0;
 
-	bdev = open_bdev_excl(dev_name, flags, fs_type);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
+	if (error)
+		return error;
 
+	error = btrfs_open_devices(fs_devices, flags, fs_type);
+	if (error)
+		return error;
+
+	bdev = fs_devices->lowest_bdev;
 	/*
 	 * once the super is inserted into the list by sget, s_umount
 	 * will protect the lockfs code from trying to start a snapshot
@@ -372,7 +380,8 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		error = btrfs_fill_super(s, fs_devices, data,
+					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -408,7 +417,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_excl(bdev);
+	btrfs_close_devices(fs_devices);
 error:
 	return error;
 }
@@ -421,8 +430,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	char *subvol_name = NULL;
 
 	parse_options((char *)data, NULL, &subvol_name);
-	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
-			btrfs_fill_super, mnt,
+	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt,
 			subvol_name ? subvol_name : "default");
 	if (subvol_name)
 		kfree(subvol_name);
@@ -445,13 +453,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
-				unsigned long arg)
-{
-	printk("btrfs control ioctl %d\n", cmd);
-	return 0;
-}
-
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -460,6 +461,31 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+/* Control ioctl: copy the args in, dispatch on cmd, return the result. */
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct btrfs_ioctl_vol_args *vol;
+	struct btrfs_fs_devices *fs_devices;
+	int ret = 0;	/* commands not handled below still report success */
+
+	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+	if (!vol)
+		return -ENOMEM;
+	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
+		kfree(vol);
+		return -EFAULT;
+	}
+	switch (cmd) {
+	case BTRFS_IOC_SCAN_DEV:
+		ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
+					    &btrfs_fs_type, &fs_devices);
+		break;
+	}
+	kfree(vol);
+	return ret;
+}
+
 static void btrfs_write_super_lockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
@@ -567,6 +593,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_interface_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
+	btrfs_cleanup_fs_uuids();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 16fb6bbe6e2..263f01cc3db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -28,6 +29,215 @@ struct map_lookup {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+static DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+/* Module teardown: close and free every scanned device, then unlink and
+ * free each fs_devices entry so fs_uuids ends up empty.  Returns 0. */
+int btrfs_cleanup_fs_uuids(void)
+{
+	struct btrfs_fs_devices *fs_devices, *next;
+	struct btrfs_device *dev;
+
+	list_for_each_entry_safe(fs_devices, next, &fs_uuids, list) {
+		while (!list_empty(&fs_devices->devices)) {
+			dev = list_entry(fs_devices->devices.next,
+					 struct btrfs_device, dev_list);
+			printk("uuid cleanup finds %s\n", dev->name);
+			if (dev->bdev) {
+				printk("closing\n");
+				close_bdev_excl(dev->bdev);
+			}
+			list_del(&dev->dev_list);
+			kfree(dev->name);	/* kstrdup'd in device_list_add */
+			kfree(dev);
+		}
+		list_del(&fs_devices->list);
+		kfree(fs_devices);
+	}
+	return 0;
+}
+
+/* Return the device on @head whose devid equals @devid, or NULL if none. */
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+{
+	struct btrfs_device *found;
+
+	list_for_each_entry(found, head, dev_list) {
+		if (found->devid == devid)
+			return found;
+	}
+	/* walked the whole list without a match */
+	return NULL;
+}
+
+/* Look up the fs_uuids entry whose fsid matches @fsid; NULL when absent. */
+static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+	struct btrfs_fs_devices *entry;
+
+	list_for_each_entry(entry, &fs_uuids, list) {
+		if (!memcmp(fsid, entry->fsid, BTRFS_FSID_SIZE))
+			return entry;
+	}
+	/* no entry on fs_uuids carries this fsid */
+	return NULL;
+}
+
+/* Record the device at @path (id @devid) under the fs_devices matching
+ * @disk_super's fsid, creating either as needed.  Returns 0 or -ENOMEM. */
+static int device_list_add(const char *path,
+			   struct btrfs_super_block *disk_super,
+			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_device *device = NULL;
+	struct btrfs_fs_devices *fs_devices;
+	u64 found_transid = btrfs_super_generation(disk_super);
+
+	fs_devices = find_fsid(disk_super->fsid);
+	if (!fs_devices) {
+		/* zeroed so latest_bdev/lowest_bdev start out NULL */
+		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+		if (!fs_devices)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&fs_devices->devices);
+		list_add(&fs_devices->list, &fs_uuids);
+		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+		fs_devices->lowest_devid = (u64)-1;
+	} else {
+		device = __find_device(&fs_devices->devices, devid);
+	}
+	if (!device) {
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		/* we can safely leave the fs_devices entry around */
+		if (!device)
+			return -ENOMEM;
+		device->devid = devid;
+		device->name = kstrdup(path, GFP_NOFS);
+		if (!device->name) {
+			kfree(device);
+			return -ENOMEM;
+		}
+		list_add(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
+	}
+
+	if (found_transid > fs_devices->latest_trans) {
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+	}
+	if (fs_devices->lowest_devid > devid) {
+		fs_devices->lowest_devid = devid;
+		printk("lowest devid now %Lu\n", devid);
+	}
+	*fs_devices_ret = fs_devices;
+	return 0;
+}
+
+/* Close every open bdev in @fs_devices under uuid_mutex, leaving each
+ * device on the list with bdev reset to NULL.  Always returns 0. */
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *dev;
+
+	mutex_lock(&uuid_mutex);
+	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+		if (dev->bdev) {
+			close_bdev_excl(dev->bdev);
+			printk("close devices closes %s\n", dev->name);
+		}
+		/* the entry stays listed; only the handle is dropped */
+		dev->bdev = NULL;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
+
+/*
+ * Open every device in @fs_devices exclusively for @holder and remember the
+ * bdevs belonging to latest_devid and lowest_devid.  If any open fails, all
+ * devices are closed again and that error is returned; otherwise returns 0.
+ */
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	struct btrfs_device *dev;
+	struct block_device *bdev;
+
+	mutex_lock(&uuid_mutex);
+	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+		bdev = open_bdev_excl(dev->name, flags, holder);
+		/* logged before the error check, so failed opens show up too */
+		printk("opening %s devid %Lu\n", dev->name, dev->devid);
+		if (IS_ERR(bdev)) {
+			printk("open %s failed\n", dev->name);
+			mutex_unlock(&uuid_mutex);
+			/* btrfs_close_devices() takes uuid_mutex itself */
+			btrfs_close_devices(fs_devices);
+			return PTR_ERR(bdev);
+		}
+		if (dev->devid == fs_devices->latest_devid)
+			fs_devices->latest_bdev = bdev;
+		if (dev->devid == fs_devices->lowest_devid) {
+			fs_devices->lowest_bdev = bdev;
+			printk("lowest bdev %s\n", dev->name);
+		}
+		dev->bdev = bdev;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
+
+/* Briefly open @path, read the super block at BTRFS_SUPER_INFO_OFFSET and,
+ * when it carries the btrfs magic, record the device via device_list_add().
+ * The bdev is closed again before returning.  0 or a negative errno. */
+int btrfs_scan_one_device(const char *path, int flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_super_block *super;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	u64 devid;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	printk("scan one opens %s\n", path);
+	bdev = open_bdev_excl(path, flags, holder);
+	if (IS_ERR(bdev)) {
+		printk("open failed\n");
+		ret = PTR_ERR(bdev);
+		goto out_unlock;
+	}
+	ret = set_blocksize(bdev, 4096);
+	if (ret)
+		goto out_close;
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto out_close;
+	}
+	super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&super->magic), BTRFS_MAGIC, sizeof(super->magic))) {
+		printk("no btrfs found on %s\n", path);
+		ret = -ENOENT;
+		goto out_brelse;
+	}
+	devid = le64_to_cpu(super->dev_item.devid);
+	printk("found device %Lu on %s\n", devid, path);
+	ret = device_list_add(path, super, devid, fs_devices_ret);
+
+	/* the success path falls through the cleanup labels as well */
+out_brelse:
+	brelse(bh);
+out_close:
+	close_bdev_excl(bdev);
+	printk("scan one closes bdev %s\n", path);
+out_unlock:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
 
 /*
  * this uses a pretty simple search, the expectation is that it is
@@ -56,6 +266,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
 	/* FIXME use last free of some kind */
 
+	/* we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max((u64)1024 * 1024, search_start);
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -285,6 +499,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
+	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
@@ -382,7 +597,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -449,7 +664,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-
+printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -592,17 +807,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
 {
-	struct btrfs_device *dev;
-	struct list_head *cur = root->fs_info->devices.next;
-	struct list_head *head = &root->fs_info->devices;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	while(cur != head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
-			return dev;
-		cur = cur->next;
-	}
-	return NULL;
+	return __find_device(head, devid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -699,15 +906,16 @@ static int read_one_dev(struct btrfs_root *root,
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
+		printk("warning devid %Lu not found already\n", devid);
 		device = kmalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list, &root->fs_info->devices);
+		list_add(&device->dev_list,
+			 &root->fs_info->fs_devices->devices);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
-	device->bdev = root->fs_info->sb->s_bdev;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 20259128152..12f297eb055 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
+	char *name;
+
 	/* the internal btrfs device id */
 	u64 devid;
 
@@ -49,6 +51,20 @@ struct btrfs_device {
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
 };
 
+struct btrfs_fs_devices {
+	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+	/* the device with this id has the most recent copy of the super */
+	u64 latest_devid;
+	u64 latest_trans;	/* generation of the super on latest_devid */
+	u64 lowest_devid;	/* smallest devid added so far */
+	u64 num_devices;	/* bumped for each entry added to devices */
+	struct block_device *latest_bdev; /* set by btrfs_open_devices */
+	struct block_device *lowest_bdev; /* set by btrfs_open_devices */
+	struct list_head devices;	/* btrfs_device.dev_list entries */
+	struct list_head list;		/* link on the global fs_uuids list */
+};
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
@@ -67,4 +83,13 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 		    u64 logical, u64 *phys, u64 *length,
 		    struct btrfs_device **dev);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder);
+int btrfs_scan_one_device(const char *path, int flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device);
+int btrfs_cleanup_fs_uuids(void);
 #endif
-- 
cgit v1.2.3


From 593060d756e0c2382d59cf00d5f3b9e3a336c408 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 25 Mar 2008 16:50:33 -0400
Subject: Btrfs: Implement raid0 when multiple devices are present

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   3 ++
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/extent-tree.c | 102 +++++++++++++++++++++++++-------------
 fs/btrfs/inode.c       |   2 -
 fs/btrfs/volumes.c     | 130 +++++++++++++++++++++++++++++++++++++------------
 5 files changed, 173 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7556f8319c6..f43dfe7ffe6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -413,6 +413,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
+#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 
 
 struct btrfs_block_group_item {
@@ -498,6 +499,8 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
+	int extra_data_alloc_bits;
+	int extra_alloc_bits;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f971a29e4f2..d8474796dea 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -736,6 +736,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->total_pinned = 0;
 	fs_info->last_alloc = 0;
 	fs_info->last_data_alloc = 0;
+	fs_info->extra_alloc_bits = 0;
+	fs_info->extra_data_alloc_bits = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e9ef644ff56..c226656f29b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -172,7 +172,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
-	return (cache->flags & bits);
+	return (cache->flags & bits) == bits;
 }
 
 static int noinline find_search_start(struct btrfs_root *root,
@@ -1010,6 +1010,35 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
 }
 
+/* Account @total_bytes/@bytes_used in the space_info found for @flags,
+ * allocating and listing a fresh one when none exists yet.  The entry is
+ * stored in *@space_info.  Returns 0 or -ENOMEM. */
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *sinfo = __find_space_info(info, flags);
+
+	if (!sinfo) {
+		sinfo = kmalloc(sizeof(*sinfo), GFP_NOFS);
+		if (!sinfo)
+			return -ENOMEM;
+		sinfo->flags = flags;
+		sinfo->total_bytes = total_bytes;
+		sinfo->bytes_used = bytes_used;
+		sinfo->bytes_pinned = 0;
+		sinfo->full = 0;
+		list_add(&sinfo->list, &info->space_info);
+	} else {
+		sinfo->total_bytes += total_bytes;
+		sinfo->bytes_used += bytes_used;
+		WARN_ON(sinfo->total_bytes < sinfo->bytes_used);
+	}
+	*space_info = sinfo;
+	return 0;
+}
+
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1021,6 +1050,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	int ret;
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
+	if (!space_info) {
+		ret = update_space_info(extent_root->fs_info, flags,
+					0, 0, &space_info);
+		BUG_ON(ret);
+	}
 	BUG_ON(!space_info);
 
 	if (space_info->full)
@@ -1044,6 +1078,17 @@ printk("space info full %Lu\n", flags);
 		     extent_root->fs_info->chunk_root->root_key.objectid,
 		     start, num_bytes);
 	BUG_ON(ret);
+
+	if (flags & BTRFS_BLOCK_GROUP_RAID0) {
+		if (flags & BTRFS_BLOCK_GROUP_DATA) {
+			extent_root->fs_info->extra_data_alloc_bits =
+				BTRFS_BLOCK_GROUP_RAID0;
+		}
+		if (flags & BTRFS_BLOCK_GROUP_METADATA) {
+			extent_root->fs_info->extra_alloc_bits =
+				BTRFS_BLOCK_GROUP_RAID0;
+		}
+	}
 	return 0;
 }
 
@@ -1655,24 +1700,31 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_ref *ref;
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
+	int extra_chunk_alloc_bits = 0;
 
 	if (data) {
-		data = BTRFS_BLOCK_GROUP_DATA;
+		data = BTRFS_BLOCK_GROUP_DATA | info->extra_data_alloc_bits;
 	} else if (root == root->fs_info->chunk_root) {
 		data = BTRFS_BLOCK_GROUP_SYSTEM;
 	} else {
-		data = BTRFS_BLOCK_GROUP_METADATA;
+		data = BTRFS_BLOCK_GROUP_METADATA | info->extra_alloc_bits;
 	}
+	if (btrfs_super_num_devices(&info->super_copy) > 1 &&
+	    !(data & BTRFS_BLOCK_GROUP_SYSTEM))
+		extra_chunk_alloc_bits = BTRFS_BLOCK_GROUP_RAID0;
 
 	if (root->ref_cows) {
-		if (data != BTRFS_BLOCK_GROUP_METADATA) {
+		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
-					     BTRFS_BLOCK_GROUP_METADATA);
+					     BTRFS_BLOCK_GROUP_METADATA |
+					     info->extra_alloc_bits |
+					     extra_chunk_alloc_bits);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data);
+				     num_bytes + 2 * 1024 * 1024, data |
+				     extra_chunk_alloc_bits);
 		BUG_ON(ret);
 	}
 
@@ -2627,34 +2679,6 @@ error:
 	return ret;
 }
 
-static int update_space_info(struct btrfs_fs_info *info, u64 flags,
-			     u64 total_bytes, u64 bytes_used,
-			     struct btrfs_space_info **space_info)
-{
-	struct btrfs_space_info *found;
-
-	found = __find_space_info(info, flags);
-	if (found) {
-		found->total_bytes += total_bytes;
-		found->bytes_used += bytes_used;
-		WARN_ON(found->total_bytes < found->bytes_used);
-		*space_info = found;
-		return 0;
-	}
-	found = kmalloc(sizeof(*found), GFP_NOFS);
-	if (!found)
-		return -ENOMEM;
-
-	list_add(&found->list, &info->space_info);
-	found->flags = flags;
-	found->total_bytes = total_bytes;
-	found->bytes_used = bytes_used;
-	found->bytes_pinned = 0;
-	found->full = 0;
-	*space_info = found;
-	return 0;
-}
-
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -2712,6 +2736,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
 		}
+		if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) {
+			if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
+				info->extra_data_alloc_bits =
+					BTRFS_BLOCK_GROUP_RAID0;
+			}
+			if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
+				info->extra_alloc_bits =
+					BTRFS_BLOCK_GROUP_RAID0;
+			}
+		}
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5140d680184..db60d85598c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -317,8 +317,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	map_length = length;
 	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
 	if (map_length < length + size) {
-		printk("merge bio hook logical %Lu bio len %Lu physical %Lu "
-		       "len %Lu\n", logical, length, physical, map_length);
 		return 1;
 	}
 	return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 263f01cc3db..d8fce32a3bb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -25,10 +26,24 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct map_lookup {
+struct stripe {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+
+struct map_lookup {
+	u64 type;
+	int io_align;
+	int io_width;
+	int stripe_len;
+	int sector_size;
+	int num_stripes;
+	struct stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+			    (sizeof(struct stripe) * (n)))
+
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
@@ -592,6 +607,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
@@ -610,10 +626,18 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int looped = 0;
 	int ret;
 	int index;
+	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
 	if (list_empty(dev_list))
 		return -ENOSPC;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID0)
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & BTRFS_BLOCK_GROUP_DATA)
+		stripe_len = 64 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
+		stripe_len = 32 * 1024;
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -650,9 +674,15 @@ again:
 	if (!chunk)
 		return -ENOMEM;
 
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		kfree(chunk);
+		return -ENOMEM;
+	}
+
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size;
+	*num_bytes = calc_size * num_stripes;
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -669,6 +699,8 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
+		map->stripes[index].dev = device;
+		map->stripes[index].physical = dev_offset;
 		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
 		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
 		physical = dev_offset;
@@ -680,12 +712,18 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	key.offset = *num_bytes;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
 	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -695,25 +733,11 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
-	if (!map) {
-		free_extent_map(em);
-		return -ENOMEM;
-	}
-
 	em->bdev = (struct block_device *)map;
 	em->start = key.objectid;
 	em->len = key.offset;
 	em->block_start = 0;
 
-	map->physical = physical;
-	map->dev = device;
-
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
-	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -758,6 +782,9 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	u64 offset;
+	u64 stripe_offset;
+	u64 stripe_nr;
+	int stripe_index;
 
 
 	spin_lock(&em_tree->lock);
@@ -767,9 +794,40 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
-	*phys = map->physical + offset;
-	*length = em->len - offset;
-	*dev = map->dev;
+
+	stripe_nr = offset;
+	/*
+	 * stripe_nr counts the total number of stripes we have to stride
+	 * to get to this block
+	 */
+	do_div(stripe_nr, map->stripe_len);
+
+	stripe_offset = stripe_nr * map->stripe_len;
+	BUG_ON(offset < stripe_offset);
+
+	/* stripe_offset is the offset of this block in its stripe*/
+	stripe_offset = offset - stripe_offset;
+
+	/*
+	 * after this do_div call, stripe_nr is the number of stripes
+	 * on this device we have to walk to find the data, and
+	 * stripe_index is the number of our device in the stripe array
+	 */
+	stripe_index = do_div(stripe_nr, map->num_stripes);
+
+	BUG_ON(stripe_index >= map->num_stripes);
+
+	*phys = map->stripes[stripe_index].physical + stripe_offset +
+		stripe_nr * map->stripe_len;
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	*dev = map->stripes[stripe_index].dev;
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -822,7 +880,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	int num_stripes;
 	int ret;
+	int i;
 
 	logical = key->objectid;
 	length = key->offset;
@@ -846,7 +906,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		free_extent_map(em);
 		return -ENOMEM;
@@ -857,13 +918,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->len = length;
 	em->block_start = 0;
 
-	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
-	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
-	map->dev = btrfs_find_device(root, devid);
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
+	map->num_stripes = num_stripes;
+	map->io_width = btrfs_chunk_io_width(leaf, chunk);
+	map->io_align = btrfs_chunk_io_align(leaf, chunk);
+	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	map->type = btrfs_chunk_type(leaf, chunk);
+	for (i = 0; i < num_stripes; i++) {
+		map->stripes[i].physical =
+			btrfs_stripe_offset_nr(leaf, chunk, i);
+		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+		map->stripes[i].dev = btrfs_find_device(root, devid);
+		if (!map->stripes[i].dev) {
+			kfree(map);
+			free_extent_map(em);
+			return -EIO;
+		}
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
-- 
cgit v1.2.3


From 04160088141a6be06d5cb9e53e34076b51803ffc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 Mar 2008 10:28:07 -0400
Subject: Create a btrfs backing dev info

This allows intelligent versions of unplug and congestion functions

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/disk-io.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode.c   |  5 +++++
 3 files changed, 60 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f43dfe7ffe6..fec96ba7c23 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
+#include <linux/backing-dev.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
 #include "extent_io.h"
@@ -472,6 +473,7 @@ struct btrfs_fs_info {
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
+	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8474796dea..6a70e4916ab 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -655,6 +655,55 @@ static int add_hasher(struct btrfs_fs_info *info, char *type) {
 	return 0;
 }
 #endif
+
+/* congested_fn hook: returns 1 as soon as one device on the fs_devices
+ * list has a backing dev that bdi_congested() reports congested for
+ * @bdi_bits, and 0 otherwise.  Devices without a bdi are skipped. */
+static int btrfs_congested_fn(void *congested_data, int bdi_bits)
+{
+	struct btrfs_fs_info *info = congested_data;
+	struct backing_dev_info *bdi;
+	struct btrfs_device *dev;
+
+	list_for_each_entry(dev, &info->fs_devices->devices, dev_list) {
+		bdi = blk_get_backing_dev_info(dev->bdev);
+		if (!bdi)
+			continue;
+		if (bdi_congested(bdi, bdi_bits))
+			return 1;
+	}
+	return 0;
+}
+
+/* unplug_io_fn for the btrfs bdi: forward the unplug to every device's own
+ * backing dev.  Devices with no bdi or no unplug hook are skipped. */
+void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+	struct btrfs_fs_info *info = bdi->unplug_io_data;
+	struct backing_dev_info *dev_bdi;
+	struct btrfs_device *device;
+
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+		dev_bdi = blk_get_backing_dev_info(device->bdev);
+		/* may be NULL; btrfs_congested_fn guards the same lookup */
+		if (dev_bdi && dev_bdi->unplug_io_fn)
+			dev_bdi->unplug_io_fn(dev_bdi, page);
+	}
+}
+
+static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
+{
+	bdi_init(bdi);
+	bdi->ra_pages	= default_backing_dev_info.ra_pages;
+	bdi->state		= 0;
+	bdi->capabilities	= default_backing_dev_info.capabilities;
+	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
+	bdi->unplug_io_data	= info;	/* read back by btrfs_unplug_io_fn */
+	bdi->congested_fn	= btrfs_congested_fn;
+	bdi->congested_data	= info;	/* presumably handed to congested_fn */
+	return 0;	/* NOTE(review): any bdi_init() status is ignored */
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices)
 {
@@ -708,11 +757,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->delalloc_bytes = 0;
+	setup_bdi(fs_info, &fs_info->bdi);
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
+
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
@@ -992,6 +1044,7 @@ int close_ctree(struct btrfs_root *root)
 #endif
 	close_all_devices(fs_info);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	bdi_destroy(&fs_info->bdi);
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index db60d85598c..0fa7cf227f1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -494,6 +494,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
@@ -508,6 +509,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
+		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		break;
 	default:
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -1792,6 +1794,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
@@ -2960,6 +2963,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		drop_inode = 1;
 	else {
 		inode->i_mapping->a_ops = &btrfs_aops;
+		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
@@ -2999,6 +3003,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	inode->i_size = name_len - 1;
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
-- 
cgit v1.2.3


From 83041add611056e830e29fda913029e37e857239 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 Mar 2008 12:02:55 -0400
Subject: Btrfs: Use a higher default ra pages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a70e4916ab..134b722587e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -694,7 +694,7 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	bdi_init(bdi);
-	bdi->ra_pages	= default_backing_dev_info.ra_pages;
+	bdi->ra_pages	= default_backing_dev_info.ra_pages * 4;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
-- 
cgit v1.2.3


From 2d2ae547979854d10b75d557b3abdb3eb7511bbc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 26 Mar 2008 16:24:23 -0400
Subject: Btrfs: Add leak debugging for extent_buffer and extent_state

This also fixes one leak around the super block when failing to mount the
FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  2 ++
 fs/btrfs/extent_io.c | 28 ++++++++++++++++++++++++++--
 fs/btrfs/extent_io.h |  3 ++-
 3 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 134b722587e..1c5e097a4df 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -915,12 +915,14 @@ fail_tree_root:
 	free_extent_buffer(tree_root->node);
 fail_sb_buffer:
 	free_extent_buffer(fs_info->sb_buffer);
+	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
 	close_all_devices(fs_info);
 	kfree(extent_root);
 	kfree(tree_root);
+	bdi_destroy(&fs_info->bdi);
 	kfree(fs_info);
 	return ERR_PTR(err);
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6dab664529c..b99cfabd008 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -26,6 +26,7 @@ static struct kmem_cache *extent_buffer_cache;
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
+static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 
 #define BUFFER_LRU_MAX 64
 
@@ -64,15 +65,22 @@ free_state_cache:
 void extent_io_exit(void)
 {
 	struct extent_state *state;
+	struct extent_buffer *eb;
 
 	while (!list_empty(&states)) {
-		state = list_entry(states.next, struct extent_state, list);
+		state = list_entry(states.next, struct extent_state, leak_list);
 		printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
-		list_del(&state->list);
+		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
 
 	}
 
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+		printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
 	if (extent_state_cache)
 		kmem_cache_destroy(extent_state_cache);
 	if (extent_buffer_cache)
@@ -109,6 +117,7 @@ EXPORT_SYMBOL(extent_io_tree_empty_lru);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
+	unsigned long flags;
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state || IS_ERR(state))
@@ -116,6 +125,9 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
+	spin_lock_irqsave(&leak_lock, flags);
+	list_add(&state->leak_list, &states);
+	spin_unlock_irqrestore(&leak_lock, flags);
 
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
@@ -128,7 +140,11 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
+		unsigned long flags;
 		WARN_ON(state->tree);
+		spin_lock_irqsave(&leak_lock, flags);
+		list_del(&state->leak_list);
+		spin_unlock_irqrestore(&leak_lock, flags);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -2582,6 +2598,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
+	unsigned long flags;
 
 	spin_lock(&tree->lru_lock);
 	eb = find_lru(tree, start, len);
@@ -2594,6 +2611,9 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	INIT_LIST_HEAD(&eb->lru);
 	eb->start = start;
 	eb->len = len;
+	spin_lock_irqsave(&leak_lock, flags);
+	list_add(&eb->leak_list, &buffers);
+	spin_unlock_irqrestore(&leak_lock, flags);
 	atomic_set(&eb->refs, 1);
 
 	return eb;
@@ -2601,6 +2621,10 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+	unsigned long flags;
+	spin_lock_irqsave(&leak_lock, flags);
+	list_del(&eb->leak_list);
+	spin_unlock_irqrestore(&leak_lock, flags);
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8b5319db251..16d67a61a25 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -66,7 +66,7 @@ struct extent_state {
 	/* for use by the FS */
 	u64 private;
 
-	struct list_head list;
+	struct list_head leak_list;
 };
 
 struct extent_buffer {
@@ -80,6 +80,7 @@ struct extent_buffer {
 	struct list_head lru;
 	atomic_t refs;
 	int flags;
+	struct list_head leak_list;
 };
 
 struct extent_map_tree;
-- 
cgit v1.2.3


From 63b10fc4874a014e22bc4c64e3d92b71180661fe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 Apr 2008 11:21:32 -0400
Subject: Reorder the flags field in struct btrfs_header and record a flag on
 writeout

This allows detection of blocks that have already been written in the
running transaction so they can be recowed instead of modified again.
It is step one in trusting the transid field of the block pointers.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 10 ++++++++--
 fs/btrfs/ctree.h   | 28 +++++++++++++++++++++++++---
 fs/btrfs/disk-io.c | 13 +++++++++----
 3 files changed, 42 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 70b6ddfe15a..6ba5394834b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -124,6 +124,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
 	btrfs_set_header_owner(cow, new_root_objectid);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	ret = btrfs_inc_ref(trans, new_root, buf);
@@ -183,6 +184,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
 	btrfs_set_header_owner(cow, root->root_key.objectid);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
@@ -245,11 +247,14 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	header_trans = btrfs_header_generation(buf);
-	if (header_trans == trans->transid) {
+	spin_lock(&root->fs_info->hash_lock);
+	if (header_trans == trans->transid &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
+		spin_unlock(&root->fs_info->hash_lock);
 		return 0;
 	}
-
+	spin_unlock(&root->fs_info->hash_lock);
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
@@ -1494,6 +1499,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_header_bytenr(split, split->start);
 	btrfs_set_header_generation(split, trans->transid);
 	btrfs_set_header_owner(split, root->root_key.objectid);
+	btrfs_set_header_flags(split, 0);
 	write_extent_buffer(split, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(split),
 			    BTRFS_FSID_SIZE);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fec96ba7c23..67d533cf8f4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -193,6 +193,8 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }
 
 #define BTRFS_FSID_SIZE 16
+#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
+
 /*
  * every tree block (leaf or node) starts with this header.
  */
@@ -200,10 +202,10 @@ struct btrfs_header {
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 	__le64 bytenr; /* which block this node is supposed to live in */
+	__le64 flags;
 	__le64 generation;
 	__le64 owner;
 	__le32 nritems;
-	__le16 flags;
 	u8 level;
 } __attribute__ ((__packed__));
 
@@ -229,9 +231,10 @@ struct btrfs_header {
  */
 struct btrfs_super_block {
 	u8 csum[BTRFS_CSUM_SIZE];
-	/* the first 3 fields must match struct btrfs_header */
+	/* the first 4 fields must match struct btrfs_header */
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 bytenr; /* this block number */
+	__le64 flags;
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
@@ -1045,9 +1048,28 @@ BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header,
 			  generation, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
-BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 16);
+BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
 
+static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	return (btrfs_header_flags(eb) & flag) == flag;
+}
+
+static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	u64 flags = btrfs_header_flags(eb);
+	btrfs_set_header_flags(eb, flags | flag);
+	return (flags & flag) == flag;
+}
+
+static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
+{
+	u64 flags = btrfs_header_flags(eb);
+	btrfs_set_header_flags(eb, flags & ~flag);
+	return (flags & flag) == flag;
+}
+
 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_header, fsid);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1c5e097a4df..709e33d08bb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -159,17 +159,19 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			from_this_trans = 1;
 
 		/* FIXME, this is not good */
-		if (from_this_trans == 0 &&
-		    memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
+		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
 			u32 val;
 			u32 found = 0;
 			memcpy(&found, result, BTRFS_CRC32_SIZE);
 
 			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
+			WARN_ON(1);
 			printk("btrfs: %s checksum verify failed on %llu "
-			       "wanted %X found %X from_this_trans %d\n",
+			       "wanted %X found %X from_this_trans %d "
+			       "level %d\n",
 			       root->fs_info->sb->s_id,
-			       buf->start, val, found, from_this_trans);
+			       buf->start, val, found, from_this_trans,
+			       btrfs_header_level(buf));
 			return 1;
 		}
 	} else {
@@ -220,6 +222,9 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
+	spin_lock(&root->fs_info->hash_lock);
+	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+	spin_unlock(&root->fs_info->hash_lock);
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
-- 
cgit v1.2.3


From e58ca0203d32869a01540a293df40ddc480dc378 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 1 Apr 2008 11:21:34 -0400
Subject: Fix btrfs_fill_super to return -EINVAL when no FS found

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 +--
 fs/btrfs/super.c   | 4 ++--
 fs/btrfs/volumes.c | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 709e33d08bb..5547607681f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -728,9 +728,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
 					      GFP_NOFS);
 	int ret;
-	int err = -EIO;
+	int err = -EINVAL;
 	struct btrfs_super_block *disk_super;
-
 	if (!extent_root || !tree_root || !fs_info) {
 		err = -ENOMEM;
 		goto fail;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9624923a33d..89286490688 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -236,9 +236,9 @@ static int btrfs_fill_super(struct super_block * sb,
 
 	tree_root = open_ctree(sb, fs_devices);
 
-	if (!tree_root || IS_ERR(tree_root)) {
+	if (IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
-		return -EIO;
+		return PTR_ERR(tree_root);
 	}
 	sb->s_fs_info = tree_root;
 	disk_super = &tree_root->fs_info->super_copy;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d8fce32a3bb..169be0f7285 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -237,7 +237,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
 		printk("no btrfs found on %s\n", path);
-		ret = -ENOENT;
+		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-- 
cgit v1.2.3


From 2b114d1d33551ab1dcaa1d0b3a956df3776fb767 Mon Sep 17 00:00:00 2001
From: Peter <htmldeveloper@gmail.com>
Date: Tue, 1 Apr 2008 11:21:40 -0400
Subject: Btrfs: Correct usage of IS_ERR() in extent_io.c

Signed-off-by: Peter Teoh <htmldeveloper@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b99cfabd008..776c33f1e6f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -120,7 +120,7 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 	unsigned long flags;
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
-	if (!state || IS_ERR(state))
+	if (!state)
 		return state;
 	state->state = 0;
 	state->private = 0;
@@ -957,7 +957,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		goto out;
 	}
 
@@ -990,7 +990,7 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		goto out;
 	}
 
@@ -1024,7 +1024,7 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	 */
 search_again:
 	node = tree_search(tree, cur_start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		*end = (u64)-1;
 		goto out;
 	}
@@ -1112,7 +1112,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, cur_start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		goto out;
 	}
 
@@ -1213,7 +1213,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1240,7 +1240,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node || IS_ERR(node)) {
+	if (!node) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -2642,7 +2642,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	int uptodate = 1;
 
 	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
+	if (!eb)
 		return NULL;
 
 	if (eb->flags & EXTENT_BUFFER_FILLED)
@@ -2717,7 +2717,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 	int uptodate = 1;
 
 	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb || IS_ERR(eb))
+	if (!eb)
 		return NULL;
 
 	if (eb->flags & EXTENT_BUFFER_FILLED)
-- 
cgit v1.2.3


From 140dfd00ae544cdc0c9951b80ad6df61e62859e2 Mon Sep 17 00:00:00 2001
From: Alex Chiang <achiang@hp.com>
Date: Tue, 1 Apr 2008 11:21:40 -0400
Subject: btrfs-progs: Stop stomping on 'name' input parameter

In btrfs_name_hash, the local variable 'buf' is declared as

	__u32 buf[2];

but we then try to do this:

	buf[0] = 0x67452301;
	buf[1] = 0xefcdab89;
	buf[2] = 0x98badcfe;
	buf[3] = 0x10325476;

Oops. Fix buf to be the proper size.

Signed-off-by: Alex Chiang <achiang@hp.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index d5252f5c9d0..21037cfca9b 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -81,7 +81,7 @@ u64 btrfs_name_hash(const char *name, int len)
 	__u32	hash;
 	__u32	minor_hash = 0;
 	const char	*p;
-	__u32		in[8], buf[2];
+	__u32		in[8], buf[4];
 	u64		hash_result;
 
 	if (len == 1 && *name == '.') {
-- 
cgit v1.2.3


From ecbe2402cb4e4e7413544dc392c1a78d0f290292 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 Apr 2008 11:26:07 -0400
Subject: Btrfs: Keep fs_mutex during reads done by snapshot deletion

There was an optimization to drop the fs_mutex when doing snapshot deletion
reads, but this can lead to false positives on checksumming errors.  Keep
the lock for now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c226656f29b..cf283b0271a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2046,11 +2046,12 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		if (!next || !btrfs_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
-			mutex_unlock(&root->fs_info->fs_mutex);
 			next = read_tree_block(root, bytenr, blocksize);
-			mutex_lock(&root->fs_info->fs_mutex);
 
-			/* we dropped the lock, check one more time */
+			/* we used to drop the lock above, keep the
+			 * code to double check so that we won't forget
+			 * when we drop the lock again in the future
+			 */
 			ret = lookup_extent_ref(trans, root, bytenr,
 						blocksize, &refs);
 			BUG_ON(ret);
-- 
cgit v1.2.3


From 0999df54f850fe1aba29b10d5c869493af107478 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 Apr 2008 13:48:14 -0400
Subject: Btrfs: Verify checksums on tree blocks found without read_tree_block

Checksums were only verified by btrfs_read_tree_block, which meant the
functions to probe the page cache for blocks were not validating checksums.
Normally this is fine because the buffers will only be in cache if they
have already been validated.

However, there is a window while the buffer is being read from disk where
it could be up to date in the cache but not yet verified.  This patch
makes sure all buffers go through checksum verification before they
are used.

This is safer, and it prevents modification of buffers before they go
through the csum code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  5 +++
 fs/btrfs/disk-io.c     | 86 +++++++++++++++++++++++++++++---------------------
 fs/btrfs/disk-io.h     |  2 ++
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/tree-defrag.c |  1 +
 5 files changed, 60 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6ba5394834b..df090bf2eec 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -158,6 +158,8 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else {
 		root_gen = 0;
 	}
+	if (!(buf->flags & EXTENT_CSUM))
+		WARN_ON(1);
 
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
@@ -245,6 +247,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
+	if (!(buf->flags & EXTENT_CSUM))
+		WARN_ON(1);
 
 	header_trans = btrfs_header_generation(buf);
 	spin_lock(&root->fs_info->hash_lock);
@@ -396,6 +400,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (search_start == 0)
 			search_start = last_block;
 
+		btrfs_verify_block_csum(root, cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&tmp, search_start,
 					min(16 * blocksize,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5547607681f..e40fb318ad9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,27 +46,6 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 
 static struct extent_io_ops btree_extent_io_ops;
 
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize)
-{
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_buffer *eb;
-	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-				bytenr, blocksize, GFP_NOFS);
-	return eb;
-}
-
-struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-						 u64 bytenr, u32 blocksize)
-{
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_buffer *eb;
-
-	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-				 bytenr, blocksize, NULL, GFP_NOFS);
-	return eb;
-}
-
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 len,
 				    int create)
@@ -380,36 +359,29 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize)
+int btrfs_verify_block_csum(struct btrfs_root *root,
+			    struct extent_buffer *buf)
 {
-	struct extent_buffer *buf = NULL;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_io_tree *io_tree;
 	u64 end;
 	int ret;
 
-	io_tree = &BTRFS_I(btree_inode)->io_tree;
-
-	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
-	if (!buf)
-		return NULL;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
-				 btree_get_extent);
-
+	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	if (buf->flags & EXTENT_CSUM)
-		return buf;
+		return 0;
 
-	end = buf->start + PAGE_CACHE_SIZE - 1;
+	end = min_t(u64, buf->len, PAGE_CACHE_SIZE);
+	end = buf->start + end - 1;
 	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
-		return buf;
+		return 0;
 	}
 
 	lock_extent(io_tree, buf->start, end, GFP_NOFS);
 
 	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
 		buf->flags |= EXTENT_CSUM;
+		ret = 0;
 		goto out_unlock;
 	}
 
@@ -419,6 +391,48 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 out_unlock:
 	unlock_extent(io_tree, buf->start, end, GFP_NOFS);
+	return ret;
+}
+
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+					    u64 bytenr, u32 blocksize)
+{
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+				bytenr, blocksize, GFP_NOFS);
+	return eb;
+}
+
+struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
+						 u64 bytenr, u32 blocksize)
+{
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_buffer *eb;
+
+	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
+				 bytenr, blocksize, NULL, GFP_NOFS);
+	return eb;
+}
+
+
+struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
+				      u32 blocksize)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree;
+	int ret;
+
+	io_tree = &BTRFS_I(btree_inode)->io_tree;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return NULL;
+	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
+				 btree_get_extent);
+
+	ret = btrfs_verify_block_csum(root, buf);
 	return buf;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b7cbc58a555..05b88d0e75e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -69,4 +69,6 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_open_device(struct btrfs_device *dev);
+int btrfs_verify_block_csum(struct btrfs_root *root,
+			    struct extent_buffer *buf);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cf283b0271a..a34c289aec2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2069,6 +2069,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				BUG_ON(ret);
 				continue;
 			}
+		} else if (next) {
+			btrfs_verify_block_csum(root, next);
 		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5935cbd8f2b..256af1870ee 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -101,6 +101,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 				path->slots[*level]++;
 				continue;
 			}
+			btrfs_verify_block_csum(root, next);
 		} else {
 			next = read_tree_block(root, bytenr,
 				       btrfs_level_size(root, *level - 1));
-- 
cgit v1.2.3


From 0ef8b2428a339e83fe225e82ef8bd0ea3f62b180 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:02 -0400
Subject: Btrfs: Properly dirty buffers in the split corner cases

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index df090bf2eec..3c7891cf0d7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2052,6 +2052,7 @@ again:
 				path->nodes[0] = right;
 				path->slots[0] = 0;
 				path->slots[1] += 1;
+				btrfs_mark_buffer_dirty(right);
 				return ret;
 			}
 			mid = slot;
@@ -2082,6 +2083,7 @@ again:
 					if (wret)
 						ret = wret;
 				}
+				btrfs_mark_buffer_dirty(right);
 				return ret;
 			} else if (extend && slot == 0) {
 				mid = 1;
-- 
cgit v1.2.3


From 8790d502e4401a4a3a4175b83a3a47e8d595c771 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for mirroring across drives

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   9 ++-
 fs/btrfs/disk-io.c     |  20 +++----
 fs/btrfs/extent-tree.c |  93 +++++++++++++++--------------
 fs/btrfs/inode.c       |   4 +-
 fs/btrfs/volumes.c     | 154 ++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/volumes.h     |  11 ++--
 6 files changed, 198 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67d533cf8f4..0a207861472 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -418,6 +418,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
+#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 
 
 struct btrfs_block_group_item {
@@ -504,8 +505,12 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
-	int extra_data_alloc_bits;
-	int extra_alloc_bits;
+	int avail_data_alloc_bits;
+	int avail_metadata_alloc_bits;
+	int avail_system_alloc_bits;
+	int data_alloc_profile;
+	int metadata_alloc_profile;
+	int system_alloc_profile;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e40fb318ad9..ff75ad58676 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -735,7 +735,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
-	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
+	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
 						GFP_NOFS);
@@ -744,6 +744,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	int ret;
 	int err = -EINVAL;
 	struct btrfs_super_block *disk_super;
+
 	if (!extent_root || !tree_root || !fs_info) {
 		err = -ENOMEM;
 		goto fail;
@@ -756,11 +757,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
-	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
 	init_completion(&fs_info->kobj_unregister);
 	sb_set_blocksize(sb, 4096);
-	fs_info->running_transaction = NULL;
-	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -770,11 +768,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	fs_info->sb = sb;
-	fs_info->throttles = 0;
-	fs_info->mount_opt = 0;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	fs_info->delalloc_bytes = 0;
 	setup_bdi(fs_info, &fs_info->bdi);
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -802,12 +797,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	extent_io_tree_init(&fs_info->extent_ins,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
-	fs_info->closing = 0;
-	fs_info->total_pinned = 0;
-	fs_info->last_alloc = 0;
-	fs_info->last_data_alloc = 0;
-	fs_info->extra_alloc_bits = 0;
-	fs_info->extra_data_alloc_bits = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
@@ -923,6 +912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
+	if (btrfs_super_num_devices(disk_super) > 0) {
+		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1;
+		fs_info->system_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a34c289aec2..4ab98d8b73f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -230,9 +230,13 @@ again:
 			goto new_group;
 		if (start + num  > total_fs_bytes)
 			goto new_group;
+		if (!block_group_bits(cache, data)) {
+			printk("block group bits don't match %Lu %Lu\n", cache->flags, data);
+		}
 		*start_ret = start;
 		return 0;
-	} out:
+	}
+out:
 	cache = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!cache) {
 		printk("Unable to find block group for %Lu\n", search_start);
@@ -365,14 +369,17 @@ again:
 		if (cache->key.objectid > total_fs_bytes)
 			break;
 
-		if (full_search)
-			free_check = cache->key.offset;
-		else
-			free_check = div_factor(cache->key.offset, factor);
+		if (block_group_bits(cache, data)) {
+			if (full_search)
+				free_check = cache->key.offset;
+			else
+				free_check = div_factor(cache->key.offset,
+							factor);
 
-		if (used + cache->pinned < free_check) {
-			found_group = cache;
-			goto found;
+			if (used + cache->pinned < free_check) {
+				found_group = cache;
+				goto found;
+			}
 		}
 		cond_resched();
 	}
@@ -1038,6 +1045,19 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	return 0;
 }
 
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
+				   BTRFS_BLOCK_GROUP_RAID1);
+	if (extra_flags) {
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			fs_info->avail_data_alloc_bits |= extra_flags;
+		if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			fs_info->avail_metadata_alloc_bits |= extra_flags;
+		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			fs_info->avail_system_alloc_bits |= extra_flags;
+	}
+}
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -1060,7 +1080,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	if (space_info->full)
 		return 0;
 
-	thresh = div_factor(space_info->total_bytes, 7);
+	thresh = div_factor(space_info->total_bytes, 6);
 	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
 		return 0;
@@ -1079,16 +1099,7 @@ printk("space info full %Lu\n", flags);
 		     start, num_bytes);
 	BUG_ON(ret);
 
-	if (flags & BTRFS_BLOCK_GROUP_RAID0) {
-		if (flags & BTRFS_BLOCK_GROUP_DATA) {
-			extent_root->fs_info->extra_data_alloc_bits =
-				BTRFS_BLOCK_GROUP_RAID0;
-		}
-		if (flags & BTRFS_BLOCK_GROUP_METADATA) {
-			extent_root->fs_info->extra_alloc_bits =
-				BTRFS_BLOCK_GROUP_RAID0;
-		}
-	}
+	set_avail_alloc_bits(extent_root->fs_info, flags);
 	return 0;
 }
 
@@ -1529,6 +1540,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
+		empty_cluster = 256 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
@@ -1693,6 +1705,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	u64 root_used;
 	u64 search_start = 0;
 	u64 new_hint;
+	u64 alloc_profile;
 	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
@@ -1700,31 +1713,32 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_ref *ref;
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
-	int extra_chunk_alloc_bits = 0;
 
 	if (data) {
-		data = BTRFS_BLOCK_GROUP_DATA | info->extra_data_alloc_bits;
+		alloc_profile = info->avail_data_alloc_bits &
+			        info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
 	} else if (root == root->fs_info->chunk_root) {
-		data = BTRFS_BLOCK_GROUP_SYSTEM;
+		alloc_profile = info->avail_system_alloc_bits &
+			        info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
 	} else {
-		data = BTRFS_BLOCK_GROUP_METADATA | info->extra_alloc_bits;
+		alloc_profile = info->avail_metadata_alloc_bits &
+			        info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
-	if (btrfs_super_num_devices(&info->super_copy) > 1 &&
-	    !(data & BTRFS_BLOCK_GROUP_SYSTEM))
-		extra_chunk_alloc_bits = BTRFS_BLOCK_GROUP_RAID0;
 
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
 					     BTRFS_BLOCK_GROUP_METADATA |
-					     info->extra_alloc_bits |
-					     extra_chunk_alloc_bits);
+					     (info->metadata_alloc_profile &
+					      info->avail_metadata_alloc_bits));
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data |
-				     extra_chunk_alloc_bits);
+				     num_bytes + 2 * 1024 * 1024, data);
 		BUG_ON(ret);
 	}
 
@@ -2046,12 +2060,12 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		if (!next || !btrfs_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
+
+			mutex_unlock(&root->fs_info->fs_mutex);
 			next = read_tree_block(root, bytenr, blocksize);
+			mutex_lock(&root->fs_info->fs_mutex);
 
-			/* we used to drop the lock above, keep the
-			 * code to double check so that we won't forget
-			 * when we drop the lock again in the future
-			 */
+			/* we've dropped the lock, double check */
 			ret = lookup_extent_ref(trans, root, bytenr,
 						blocksize, &refs);
 			BUG_ON(ret);
@@ -2739,16 +2753,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
 		}
-		if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) {
-			if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
-				info->extra_data_alloc_bits =
-					BTRFS_BLOCK_GROUP_RAID0;
-			}
-			if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
-				info->extra_alloc_bits =
-					BTRFS_BLOCK_GROUP_RAID0;
-			}
-		}
+		set_avail_alloc_bits(info, cache->flags);
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0fa7cf227f1..a8ae68c6fbb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -306,6 +306,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 physical;
 	u64 length = 0;
 	u64 map_length;
+	int total_devs;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -315,7 +316,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	}
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	ret = btrfs_map_block(map_tree, READ, 0, logical, &physical,
+			      &map_length, &dev, &total_devs);
 	if (map_length < length + size) {
 		return 1;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 169be0f7285..bc3c0b97588 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,13 @@ struct stripe {
 	u64 physical;
 };
 
+struct multi_bio {
+	atomic_t stripes;
+	bio_end_io_t *end_io;
+	void *private;
+	int error;
+};
+
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -632,12 +639,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & BTRFS_BLOCK_GROUP_DATA)
-		stripe_len = 64 * 1024;
-	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
-		stripe_len = 32 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+		num_stripes = min_t(u64, 2,
+				  btrfs_super_num_devices(&info->super_copy));
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -682,7 +689,11 @@ again:
 
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size * num_stripes;
+	if (type & BTRFS_BLOCK_GROUP_RAID1)
+		*num_bytes = calc_size;
+	else
+		*num_bytes = calc_size * num_stripes;
+
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -694,7 +705,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -774,9 +785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev)
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		    int dev_nr, u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev, int *total_devs)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -808,19 +819,39 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	/*
-	 * after this do_div call, stripe_nr is the number of stripes
-	 * on this device we have to walk to find the data, and
-	 * stripe_index is the number of our device in the stripe array
-	 */
-	stripe_index = do_div(stripe_nr, map->num_stripes);
-
+	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		stripe_index = dev_nr;
+		if (rw & (1 << BIO_RW))
+			*total_devs = map->num_stripes;
+		else {
+			int i;
+			u64 least = (u64)-1;
+			struct btrfs_device *cur;
+
+			for (i = 0; i < map->num_stripes; i++) {
+				cur = map->stripes[i].dev;
+				spin_lock(&cur->io_lock);
+				if (cur->total_ios < least) {
+					least = cur->total_ios;
+					stripe_index = i;
+				}
+				spin_unlock(&cur->io_lock);
+			}
+			*total_devs = 1;
+		}
+	} else {
+		/*
+		 * after this do_div call, stripe_nr is the number of stripes
+		 * on this device we have to walk to find the data, and
+		 * stripe_index is the number of our device in the stripe array
+		 */
+		stripe_index = do_div(stripe_nr, map->num_stripes);
+	}
 	BUG_ON(stripe_index >= map->num_stripes);
-
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
@@ -833,33 +864,98 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_multi_stripe(struct bio *bio, int err)
+#else
+static int end_bio_multi_stripe(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	struct multi_bio *multi = bio->bi_private;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+	if (err)
+		multi->error = err;
+
+	if (atomic_dec_and_test(&multi->stripes)) {
+		bio->bi_private = multi->private;
+		bio->bi_end_io = multi->end_io;
+
+		if (!err && multi->error)
+			err = multi->error;
+		kfree(multi);
+
+		bio_endio(bio, err);
+	} else {
+		bio_put(bio);
+	}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
+	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
 	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
+	struct multi_bio *multi = NULL;
 	int i;
 	int ret;
+	int dev_nr = 0;
+	int total_devs = 1;
 
 	bio_for_each_segment(bvec, bio, i) {
 		length += bvec->bv_len;
 	}
+
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
-	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-		       "len %Lu\n", logical, length, physical, map_length);
-		BUG();
+	while(dev_nr < total_devs) {
+		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
+				      &physical, &map_length, &dev,
+				      &total_devs);
+		if (map_length < length) {
+			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+			       "len %Lu\n", logical, length, physical, map_length);
+			BUG();
+		}
+		BUG_ON(map_length < length);
+		if (total_devs > 1) {
+			if (!multi) {
+				multi = kmalloc(sizeof(*multi), GFP_NOFS);
+				atomic_set(&multi->stripes, 1);
+				multi->end_io = bio->bi_end_io;
+				multi->private = first_bio->bi_private;
+				multi->error = 0;
+			} else {
+				atomic_inc(&multi->stripes);
+			}
+			if (dev_nr < total_devs - 1) {
+				bio = bio_clone(first_bio, GFP_NOFS);
+				BUG_ON(!bio);
+			} else {
+				bio = first_bio;
+			}
+			bio->bi_private = multi;
+			bio->bi_end_io = end_bio_multi_stripe;
+		}
+		bio->bi_sector = physical >> 9;
+		bio->bi_bdev = dev->bdev;
+		spin_lock(&dev->io_lock);
+		dev->total_ios++;
+		spin_unlock(&dev->io_lock);
+		submit_bio(rw, bio);
+		dev_nr++;
 	}
-	BUG_ON(map_length < length);
-	bio->bi_sector = physical >> 9;
-	bio->bi_bdev = dev->bdev;
-	submit_bio(rw, bio);
 	return 0;
 }
 
@@ -982,6 +1078,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		device->total_ios = 0;
+		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12f297eb055..35dec3efd78 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -18,12 +18,16 @@
 
 #ifndef __BTRFS_VOLUMES_
 #define __BTRFS_VOLUMES_
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
+	spinlock_t io_lock;
 
 	struct block_device *bdev;
 
+	u64 total_ios;
+
 	char *name;
 
 	/* the internal btrfs device id */
@@ -68,9 +72,9 @@ struct btrfs_fs_devices {
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr,
 		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev);
+		    struct btrfs_device **dev, int *total_stripes);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -80,9 +84,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
 int btrfs_scan_one_device(const char *path, int flags, void *holder,
-- 
cgit v1.2.3


From 611f0e00a27fe0e5a571194a12443ecdc99a43ef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for duplicate blocks on a single spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  8 +++++---
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/volumes.c     | 32 ++++++++++++++++++++++++++++----
 4 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a207861472..72deae63ec2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -419,6 +419,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
+#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 
 
 struct btrfs_block_group_item {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ff75ad58676..42522232fde 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -913,9 +913,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	if (btrfs_super_num_devices(disk_super) > 0) {
-		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
-		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1;
-		fs_info->system_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0 |
+			BTRFS_BLOCK_GROUP_RAID1;
+		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1 |
+			BTRFS_BLOCK_GROUP_DUP;
+		fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ab98d8b73f..1885ec4280c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -231,7 +231,7 @@ again:
 		if (start + num  > total_fs_bytes)
 			goto new_group;
 		if (!block_group_bits(cache, data)) {
-			printk("block group bits don't match %Lu %Lu\n", cache->flags, data);
+			printk("block group bits don't match %Lu %d\n", cache->flags, data);
 		}
 		*start_ret = start;
 		return 0;
@@ -1048,7 +1048,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-				   BTRFS_BLOCK_GROUP_RAID1);
+				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
 			fs_info->avail_data_alloc_bits |= extra_flags;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3c0b97588..b9294e3c05f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
+	u64 min_free = calc_size;
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
@@ -641,6 +642,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		num_stripes = 2;
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
@@ -649,16 +652,23 @@ again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_free = calc_size * 2;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail > max_avail)
 			max_avail = avail;
-		if (avail >= calc_size) {
+		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
+			if (type & BTRFS_BLOCK_GROUP_DUP)
+				index++;
 		}
 		if (cur == dev_list)
 			break;
@@ -689,17 +699,22 @@ again:
 
 	stripes = &chunk->stripe;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID1)
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		list_move_tail(&device->dev_list, dev_list);
+
+		/* loop over this device again if we're doing a dup group */
+		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
+		    (index == num_stripes - 1))
+			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -839,6 +854,14 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			}
 			*total_devs = 1;
 		}
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		if (rw == WRITE) {
+			*total_devs = map->num_stripes;
+			stripe_index = dev_nr;
+		} else {
+			stripe_index = 0;
+			*total_devs = 1;
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -851,7 +874,8 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
-- 
cgit v1.2.3


From 6bc34676c0b5836655ec1c7998e2647cabb933ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 4 Apr 2008 15:40:00 -0400
Subject: Btrfs: Don't allow written blocks from this transaction to be
 reallocated

When a block is freed, it can be immediately reused if it is from
the current transaction.  However, an extra check is required to make
sure the block has not been written yet.  If it were reused after being
written, the transid in the block header might match the transid used
the next time the block is allocated.

The parent node records the transaction ID of the block it is pointing to,
and this is used as part of validating the block on reads.  So, there
can only be one version of a block per transaction.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1885ec4280c..0c0dde6d961 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1288,7 +1288,9 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
 					btrfs_header_generation(buf);
-				if (header_transid == transid) {
+				if (header_transid == transid &&
+				    !btrfs_header_flag(buf,
+					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
 					free_extent_buffer(buf);
 					return 1;
-- 
cgit v1.2.3


From d18a2c447524751137a12cc8ccaf9d1e0b7fa1b3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 4 Apr 2008 15:40:00 -0400
Subject: Btrfs: Fix allocation profile init

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 13 +++++++------
 fs/btrfs/disk-io.c     | 11 ++++-------
 fs/btrfs/extent-tree.c | 10 ++--------
 3 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72deae63ec2..4b3b20459f6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -506,12 +506,13 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
-	int avail_data_alloc_bits;
-	int avail_metadata_alloc_bits;
-	int avail_system_alloc_bits;
-	int data_alloc_profile;
-	int metadata_alloc_profile;
-	int system_alloc_profile;
+
+	u64 avail_data_alloc_bits;
+	u64 avail_metadata_alloc_bits;
+	u64 avail_system_alloc_bits;
+	u64 data_alloc_profile;
+	u64 metadata_alloc_profile;
+	u64 system_alloc_profile;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 42522232fde..bc5bd5abb28 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -912,13 +912,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
-	if (btrfs_super_num_devices(disk_super) > 0) {
-		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0 |
-			BTRFS_BLOCK_GROUP_RAID1;
-		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_DUP;
-		fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
-	}
+	fs_info->data_alloc_profile = (u64)-1;
+	fs_info->metadata_alloc_profile = (u64)-1;
+	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0c0dde6d961..0cb742884cd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1100,7 +1100,6 @@ printk("space info full %Lu\n", flags);
 		     start, num_bytes);
 	BUG_ON(ret);
 
-	set_avail_alloc_bits(extent_root->fs_info, flags);
 	return 0;
 }
 
@@ -2813,13 +2812,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				&cache->space_info);
 	BUG_ON(ret);
 
-	if (type & BTRFS_BLOCK_GROUP_DATA) {
-		bit = BLOCK_GROUP_DATA;
-	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		bit = BLOCK_GROUP_SYSTEM;
-	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		bit = BLOCK_GROUP_METADATA;
-	}
+	bit = block_group_state_bits(type);
 	set_extent_bits(block_group_cache, chunk_objectid,
 			chunk_objectid + size - 1,
 			bit | EXTENT_LOCKED, GFP_NOFS);
@@ -2833,5 +2826,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	finish_current_insert(trans, extent_root);
 	ret = del_pending_extents(trans, extent_root);
 	BUG_ON(ret);
+	set_avail_alloc_bits(extent_root->fs_info, type);
 	return 0;
 }
-- 
cgit v1.2.3


From cea9e4452ebaf18dd0951e90dc84d82a5dee40b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Change btrfs_map_block to return a structure with mappings for all
 stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   |   4 +-
 fs/btrfs/inode.c   |   8 ++--
 fs/btrfs/volumes.c | 135 +++++++++++++++++++++++++++++------------------------
 fs/btrfs/volumes.h |  25 ++++++++--
 4 files changed, 103 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3c7891cf0d7..c92c6b0ee58 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2025,8 +2025,10 @@ again:
 					 root->root_key.objectid,
 					 root_gen, disk_key.objectid, 0,
 					 l->start, 0);
-	if (IS_ERR(right))
+	if (IS_ERR(right)) {
+		BUG_ON(1);
 		return PTR_ERR(right);
+	}
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8ae68c6fbb..7ae677d8a6d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -301,12 +301,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
-	struct btrfs_device *dev;
 	u64 logical = bio->bi_sector << 9;
-	u64 physical;
 	u64 length = 0;
 	u64 map_length;
-	int total_devs;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -316,8 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	}
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, 0, logical, &physical,
-			      &map_length, &dev, &total_devs);
+	ret = btrfs_map_block(map_tree, READ, logical,
+			      &map_length, NULL);
+
 	if (map_length < length + size) {
 		return 1;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9294e3c05f..008d3640e8c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -26,18 +26,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct stripe {
-	struct btrfs_device *dev;
-	u64 physical;
-};
-
-struct multi_bio {
-	atomic_t stripes;
-	bio_end_io_t *end_io;
-	void *private;
-	int error;
-};
-
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -45,11 +33,11 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
-	struct stripe stripes[];
+	struct btrfs_bio_stripe stripes[];
 };
 
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct stripe) * (n)))
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -801,8 +789,8 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    int dev_nr, u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev, int *total_devs)
+		    u64 logical, u64 *length,
+		    struct btrfs_multi_bio **multi_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -810,8 +798,21 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	int stripes_allocated = 8;
 	int stripe_index;
+	int i;
+	struct btrfs_multi_bio *multi = NULL;
 
+	if (multi_ret && !(rw & (1 << BIO_RW))) {
+		stripes_allocated = 1;
+	}
+again:
+	if (multi_ret) {
+		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+				GFP_NOFS);
+		if (!multi)
+			return -ENOMEM;
+	}
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -821,6 +822,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	/* if our multi bio struct is too small, back off and try again */
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
+	    stripes_allocated < map->num_stripes &&
+	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
+	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+		stripes_allocated = map->num_stripes;
+		spin_unlock(&em_tree->lock);
+		free_extent_map(em);
+		kfree(multi);
+		goto again;
+	}
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
@@ -834,10 +846,22 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	if (!multi_ret)
+		goto out;
+
+	multi->num_stripes = 1;
+	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		stripe_index = dev_nr;
 		if (rw & (1 << BIO_RW))
-			*total_devs = map->num_stripes;
+			multi->num_stripes = map->num_stripes;
 		else {
 			int i;
 			u64 least = (u64)-1;
@@ -852,16 +876,10 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				}
 				spin_unlock(&cur->io_lock);
 			}
-			*total_devs = 1;
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw == WRITE) {
-			*total_devs = map->num_stripes;
-			stripe_index = dev_nr;
-		} else {
-			stripe_index = 0;
-			*total_devs = 1;
-		}
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->num_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -871,18 +889,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	*phys = map->stripes[stripe_index].physical + stripe_offset +
-		stripe_nr * map->stripe_len;
-
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			 BTRFS_BLOCK_GROUP_DUP)) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-			      map->stripe_len - stripe_offset);
-	} else {
-		*length = em->len - offset;
+	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		multi->stripes[i].physical =
+			map->stripes[stripe_index].physical + stripe_offset +
+			stripe_nr * map->stripe_len;
+		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		stripe_index++;
 	}
-	*dev = map->stripes[stripe_index].dev;
+	*multi_ret = multi;
+out:
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -895,7 +912,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 				   unsigned int bytes_done, int err)
 #endif
 {
-	struct multi_bio *multi = bio->bi_private;
+	struct btrfs_multi_bio *multi = bio->bi_private;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -904,7 +921,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		multi->error = err;
 
-	if (atomic_dec_and_test(&multi->stripes)) {
+	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
@@ -927,11 +944,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
-	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
-	struct multi_bio *multi = NULL;
+	struct btrfs_multi_bio *multi = NULL;
 	int i;
 	int ret;
 	int dev_nr = 0;
@@ -943,26 +959,22 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
+
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	BUG_ON(ret);
+
+	total_devs = multi->num_stripes;
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu "
+		       "len %Lu\n", logical, length, map_length);
+		BUG();
+	}
+	multi->end_io = first_bio->bi_end_io;
+	multi->private = first_bio->bi_private;
+	atomic_set(&multi->stripes_pending, multi->num_stripes);
+
 	while(dev_nr < total_devs) {
-		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
-				      &physical, &map_length, &dev,
-				      &total_devs);
-		if (map_length < length) {
-			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-			       "len %Lu\n", logical, length, physical, map_length);
-			BUG();
-		}
-		BUG_ON(map_length < length);
 		if (total_devs > 1) {
-			if (!multi) {
-				multi = kmalloc(sizeof(*multi), GFP_NOFS);
-				atomic_set(&multi->stripes, 1);
-				multi->end_io = bio->bi_end_io;
-				multi->private = first_bio->bi_private;
-				multi->error = 0;
-			} else {
-				atomic_inc(&multi->stripes);
-			}
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
 				BUG_ON(!bio);
@@ -972,7 +984,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 			bio->bi_private = multi;
 			bio->bi_end_io = end_bio_multi_stripe;
 		}
-		bio->bi_sector = physical >> 9;
+		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+		dev = multi->stripes[dev_nr].dev;
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
@@ -980,6 +993,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 		submit_bio(rw, bio);
 		dev_nr++;
 	}
+	if (total_devs == 1)
+		kfree(multi);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 35dec3efd78..10ca0104750 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -19,6 +19,8 @@
 #ifndef __BTRFS_VOLUMES_
 #define __BTRFS_VOLUMES_
 
+#include <linux/bio.h>
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
@@ -69,12 +71,29 @@ struct btrfs_fs_devices {
 	struct list_head list;
 };
 
+struct btrfs_bio_stripe {
+	struct btrfs_device *dev;
+	u64 physical;
+};
+
+struct btrfs_multi_bio {
+	atomic_t stripes_pending;
+	bio_end_io_t *end_io;
+	void *private;
+	int error;
+	int num_stripes;
+	struct btrfs_bio_stripe stripes[];
+};
+
+#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev, int *total_stripes);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		    u64 logical, u64 *length,
+		    struct btrfs_multi_bio **multi_ret);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 728131d8e40c2a47c59ca91a806299c4708029f9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Add additional debugging for metadata checksum failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  3 ++-
 fs/btrfs/extent_io.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/extent_io.h |  2 ++
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bc5bd5abb28..e444b99e02d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -283,6 +283,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
 	if (ret == 1) {
+		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -376,7 +377,6 @@ int btrfs_verify_block_csum(struct btrfs_root *root,
 		buf->flags |= EXTENT_CSUM;
 		return 0;
 	}
-
 	lock_extent(io_tree, buf->start, end, GFP_NOFS);
 
 	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
@@ -384,6 +384,7 @@ int btrfs_verify_block_csum(struct btrfs_root *root,
 		ret = 0;
 		goto out_unlock;
 	}
+WARN_ON(buf->flags & EXTENT_CSUM);
 
 	ret = csum_tree_block(root, buf, 1);
 	set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 776c33f1e6f..13cc2360e37 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2592,6 +2592,33 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
+int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
+			  unsigned long len)
+{
+	struct list_head *lru = &tree->buffer_lru;
+	struct list_head *cur = lru->next;
+	struct extent_buffer *eb;
+	int found = 0;
+
+	spin_lock(&tree->lru_lock);
+	if (list_empty(lru))
+		goto out;
+
+	do {
+		eb = list_entry(cur, struct extent_buffer, lru);
+		if (eb->start <= start && eb->start + eb->len > start) {
+			eb->flags &= ~EXTENT_UPTODATE;
+		}
+		if (eb->start == start) {
+			eb->flags &= ~EXTENT_CSUM;
+		}
+		cur = cur->next;
+	} while (cur != lru);
+out:
+	spin_unlock(&tree->lru_lock);
+	return found;
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   u64 start,
 						   unsigned long len,
@@ -2909,10 +2936,32 @@ EXPORT_SYMBOL(set_extent_buffer_uptodate);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
 			     struct extent_buffer *eb)
 {
+	int ret = 0;
+	int ret2;
+	int num_pages;
+	int i;
+	struct page *page;
+	int pg_uptodate = 1;
+
 	if (eb->flags & EXTENT_UPTODATE)
-		return 1;
-	return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+		ret = 1;
+
+	ret2  = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
 			   EXTENT_UPTODATE, 1);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		if (!PageUptodate(page)) {
+			pg_uptodate = 0;
+			break;
+		}
+	}
+	if ((ret || ret2) && !pg_uptodate) {
+printk("uptodate error2 eb %Lu ret %d ret2 %d pg_uptodate %d\n", eb->start, ret, ret2, pg_uptodate);
+		WARN_ON(1);
+	}
+	return (ret || ret2);
 }
 EXPORT_SYMBOL(extent_buffer_uptodate);
 
@@ -2929,7 +2978,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	unsigned long num_pages;
 	struct bio *bio = NULL;
 
-
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 16d67a61a25..d9f5bc4dbd7 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -216,4 +216,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long *map_start,
 		      unsigned long *map_len, int km);
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
+int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
+			  unsigned long len);
 #endif
-- 
cgit v1.2.3


From ce9adaa5a792c2099a83246265eb4055bc38b6b8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Do metadata checksums for reads via a workqueue

Before, metadata checksumming was done by the callers of read_tree_block,
which would set EXTENT_CSUM bits in the extent tree to show that a given
range of pages was already checksummed and didn't need to be verified
again.

But, those bits could go away via try_to_releasepage, and the end
result was bogus checksum failures on pages that never left the cache.

The new code validates checksums when the page is read.  It is a little
tricky because metadata blocks can span pages and a single read may
end up going via multiple bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   5 -
 fs/btrfs/ctree.h       |   4 +
 fs/btrfs/disk-io.c     | 258 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/extent-tree.c |   4 -
 fs/btrfs/extent_io.c   | 100 ++++++++++++++-----
 fs/btrfs/extent_io.h   |   4 +-
 fs/btrfs/transaction.c |   2 +-
 7 files changed, 310 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c92c6b0ee58..efce173a935 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -158,9 +158,6 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else {
 		root_gen = 0;
 	}
-	if (!(buf->flags & EXTENT_CSUM))
-		WARN_ON(1);
-
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
@@ -247,8 +244,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (!(buf->flags & EXTENT_CSUM))
-		WARN_ON(1);
 
 	header_trans = btrfs_header_generation(buf);
 	spin_lock(&root->fs_info->hash_lock);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4b3b20459f6..e803c4daad2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -485,6 +485,10 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
+	struct list_head end_io_work_list;
+	struct work_struct end_io_work;
+	spinlock_t end_io_work_lock;
+
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
 #else
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e444b99e02d..82109204788 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -24,6 +24,7 @@
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
+#include <linux/workqueue.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -45,6 +46,16 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
+static struct workqueue_struct *end_io_workqueue;
+
+struct end_io_wq {
+	struct bio *bio;
+	bio_end_io_t *end_io;
+	void *private;
+	struct btrfs_fs_info *info;
+	int error;
+	struct list_head list;
+};
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 len,
@@ -219,11 +230,108 @@ static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
 	return 0;
 }
 
+int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+			       struct extent_state *state)
+{
+	struct extent_io_tree *tree;
+	u64 found_start;
+	int found_level;
+	unsigned long len;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	int ret;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+	len = page->private >> 2;
+	if (len == 0) {
+		WARN_ON(1);
+	}
+	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
+				 btree_get_extent);
+	btrfs_clear_buffer_defrag(eb);
+	found_start = btrfs_header_bytenr(eb);
+	if (found_start != start) {
+		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
+		       start, found_start, len);
+		WARN_ON(1);
+		goto err;
+	}
+	if (eb->first_page != page) {
+		printk("bad first page %lu %lu\n", eb->first_page->index,
+		       page->index);
+		WARN_ON(1);
+		goto err;
+	}
+	found_level = btrfs_header_level(eb);
+
+	ret = csum_tree_block(root, eb, 1);
+
+	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
+	end = eb->start + end - 1;
+	release_extent_buffer_tail_pages(eb);
+err:
+	free_extent_buffer(eb);
+out:
+	return 0;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_workqueue_bio(struct bio *bio, int err)
+#else
+static int end_workqueue_bio(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	struct end_io_wq *end_io_wq = bio->bi_private;
+	struct btrfs_fs_info *fs_info;
+	unsigned long flags;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+
+	fs_info = end_io_wq->info;
+	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+	end_io_wq->error = err;
+	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
+	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+	queue_work(end_io_workqueue, &fs_info->end_io_work);
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct end_io_wq *end_io_wq;
 	u64 offset;
 	offset = bio->bi_sector << 9;
+
+	if (rw & (1 << BIO_RW)) {
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+	}
+
+	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
+	if (!end_io_wq)
+		return -ENOMEM;
+
+	end_io_wq->private = bio->bi_private;
+	end_io_wq->end_io = bio->bi_end_io;
+	end_io_wq->info = root->fs_info;
+	end_io_wq->error = 0;
+	end_io_wq->bio = bio;
+
+	bio->bi_private = end_io_wq;
+	bio->bi_end_io = end_workqueue_bio;
+
 	if (offset == BTRFS_SUPER_INFO_OFFSET) {
 		bio->bi_bdev = root->fs_info->sb->s_bdev;
 		submit_bio(rw, bio);
@@ -363,36 +471,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 int btrfs_verify_block_csum(struct btrfs_root *root,
 			    struct extent_buffer *buf)
 {
-	struct extent_io_tree *io_tree;
-	u64 end;
-	int ret;
-
-	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
-	if (buf->flags & EXTENT_CSUM)
-		return 0;
-
-	end = min_t(u64, buf->len, PAGE_CACHE_SIZE);
-	end = buf->start + end - 1;
-	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-		buf->flags |= EXTENT_CSUM;
-		return 0;
-	}
-	lock_extent(io_tree, buf->start, end, GFP_NOFS);
-
-	if (test_range_bit(io_tree, buf->start, end, EXTENT_CSUM, 1)) {
-		buf->flags |= EXTENT_CSUM;
-		ret = 0;
-		goto out_unlock;
-	}
-WARN_ON(buf->flags & EXTENT_CSUM);
-
-	ret = csum_tree_block(root, buf, 1);
-	set_extent_bits(io_tree, buf->start, end, EXTENT_CSUM, GFP_NOFS);
-	buf->flags |= EXTENT_CSUM;
-
-out_unlock:
-	unlock_extent(io_tree, buf->start, end, GFP_NOFS);
-	return ret;
+	return btrfs_buffer_uptodate(buf);
 }
 
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
@@ -430,11 +509,15 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
 	if (!buf)
 		return NULL;
-	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, 1,
-				 btree_get_extent);
 
-	ret = btrfs_verify_block_csum(root, buf);
+	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
+				       1, btree_get_extent);
+
+	if (ret == 0) {
+		buf->flags |= EXTENT_UPTODATE;
+	}
 	return buf;
+
 }
 
 int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -724,6 +807,99 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 	return 0;
 }
 
+static int bio_ready_for_csum(struct bio *bio)
+{
+	u64 length = 0;
+	u64 buf_len = 0;
+	u64 start = 0;
+	struct page *page;
+	struct extent_io_tree *io_tree = NULL;
+	struct btrfs_fs_info *info = NULL;
+	struct bio_vec *bvec;
+	int i;
+	int ret;
+
+	bio_for_each_segment(bvec, bio, i) {
+		page = bvec->bv_page;
+		if (page->private == EXTENT_PAGE_PRIVATE) {
+			length += bvec->bv_len;
+			continue;
+		}
+		if (!page->private) {
+			length += bvec->bv_len;
+			continue;
+		}
+		length = bvec->bv_len;
+		buf_len = page->private >> 2;
+		start = page_offset(page) + bvec->bv_offset;
+		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
+		info = BTRFS_I(page->mapping->host)->root->fs_info;
+	}
+	/* are we fully contained in this bio? */
+	if (buf_len <= length)
+		return 1;
+
+	ret = extent_range_uptodate(io_tree, start + length,
+				    start + buf_len - 1);
+	if (ret == 1)
+		return ret;
+	return ret;
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+void btrfs_end_io_csum(void *p)
+#else
+void btrfs_end_io_csum(struct work_struct *work)
+#endif
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	struct btrfs_fs_info *fs_info = p;
+#else
+	struct btrfs_fs_info *fs_info = container_of(work,
+						     struct btrfs_fs_info,
+						     end_io_work);
+#endif
+	unsigned long flags;
+	struct end_io_wq *end_io_wq;
+	struct bio *bio;
+	struct list_head *next;
+	int error;
+	int was_empty;
+
+	while(1) {
+		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+		if (list_empty(&fs_info->end_io_work_list)) {
+			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+					       flags);
+			return;
+		}
+		next = fs_info->end_io_work_list.next;
+		list_del(next);
+		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
+
+		end_io_wq = list_entry(next, struct end_io_wq, list);
+
+		bio = end_io_wq->bio;
+		if (!bio_ready_for_csum(bio)) {
+			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
+			was_empty = list_empty(&fs_info->end_io_work_list);
+			list_add_tail(&end_io_wq->list,
+				      &fs_info->end_io_work_list);
+			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
+					       flags);
+			if (was_empty)
+				return;
+			continue;
+		}
+		error = end_io_wq->error;
+		bio->bi_private = end_io_wq->private;
+		bio->bi_end_io = end_io_wq->end_io;
+		kfree(end_io_wq);
+		bio_endio(bio, error);
+	}
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices)
 {
@@ -750,11 +926,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
+	end_io_workqueue = create_workqueue("btrfs-end-io");
+	BUG_ON(!end_io_workqueue);
+
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
+	INIT_LIST_HEAD(&fs_info->end_io_work_list);
 	spin_lock_init(&fs_info->hash_lock);
+	spin_lock_init(&fs_info->end_io_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -799,6 +980,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
+	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
@@ -1044,6 +1226,8 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+	flush_workqueue(end_io_workqueue);
+	destroy_workqueue(end_io_workqueue);
 
 	iput(fs_info->btree_inode);
 #if 0
@@ -1171,12 +1355,18 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
+	int ret;
+	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
 					buf, 0, 1, btree_get_extent);
+	if (ret == 0) {
+		buf->flags |= EXTENT_UPTODATE;
+	}
+	return ret;
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
+	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
 	/* note we're sharing with inode.c for the merge bio hook */
 	.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0cb742884cd..283b08a32a4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1898,10 +1898,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
-	set_extent_bits(&BTRFS_I(root->fs_info->btree_inode)->io_tree,
-			buf->start, buf->start + buf->len - 1,
-			EXTENT_CSUM, GFP_NOFS);
-	buf->flags |= EXTENT_CSUM;
 	if (!btrfs_test_opt(root, SSD))
 		btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 13cc2360e37..cfc383c17a3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2592,6 +2592,22 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
+int release_extent_buffer_tail_pages(struct extent_buffer *eb)
+{
+	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
+	struct page *page;
+	unsigned long i;
+
+	if (num_pages == 1)
+		return 0;
+	for (i = 1; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		page_cache_release(page);
+	}
+	return 0;
+}
+
+
 int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
 			  unsigned long len)
 {
@@ -2609,9 +2625,6 @@ int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
 		if (eb->start <= start && eb->start + eb->len > start) {
 			eb->flags &= ~EXTENT_UPTODATE;
 		}
-		if (eb->start == start) {
-			eb->flags &= ~EXTENT_CSUM;
-		}
 		cur = cur->next;
 	} while (cur != lru);
 out:
@@ -2682,7 +2695,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		page_cache_get(page0);
 		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
-		WARN_ON(!PageUptodate(page0));
 		set_page_extent_head(page0, len);
 	} else {
 		i = 0;
@@ -2933,13 +2945,39 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(set_extent_buffer_uptodate);
 
+int extent_range_uptodate(struct extent_io_tree *tree,
+			  u64 start, u64 end)
+{
+	struct page *page;
+	int ret;
+	int pg_uptodate = 1;
+	int uptodate;
+	unsigned long index;
+
+	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+	if (ret)
+		return 1;
+	while(start <= end) {
+		index = start >> PAGE_CACHE_SHIFT;
+		page = find_get_page(tree->mapping, index);
+		uptodate = PageUptodate(page);
+		page_cache_release(page);
+		if (!uptodate) {
+			pg_uptodate = 0;
+			break;
+		}
+		start += PAGE_CACHE_SIZE;
+	}
+	return pg_uptodate;
+}
+
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			     struct extent_buffer *eb)
+			   struct extent_buffer *eb)
 {
 	int ret = 0;
 	int ret2;
-	int num_pages;
-	int i;
+	unsigned long num_pages;
+	unsigned long i;
 	struct page *page;
 	int pg_uptodate = 1;
 
@@ -2975,13 +3013,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	struct page *page;
 	int err;
 	int ret = 0;
+	int locked_pages = 0;
+	int all_uptodate = 1;
+	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
 
-	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
 			   EXTENT_UPTODATE, 1)) {
 		return 0;
 	}
@@ -2997,17 +3038,30 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (PageUptodate(page)) {
-			continue;
-		}
 		if (!wait) {
-			if (TestSetPageLocked(page)) {
-				continue;
-			}
+			if (TestSetPageLocked(page))
+				goto unlock_exit;
 		} else {
 			lock_page(page);
 		}
+		locked_pages++;
+		if (!PageUptodate(page)) {
+			all_uptodate = 0;
+		}
+	}
+	if (all_uptodate) {
+		if (start_i == 0)
+			eb->flags |= EXTENT_UPTODATE;
+		goto unlock_exit;
+	}
+
+	for (i = start_i; i < num_pages; i++) {
+		page = extent_buffer_page(eb, i);
+		if (inc_all_pages)
+			page_cache_get(page);
 		if (!PageUptodate(page)) {
+			if (start_i == 0)
+				inc_all_pages = 1;
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio);
 			if (err) {
@@ -3034,6 +3088,16 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (!ret)
 		eb->flags |= EXTENT_UPTODATE;
 	return ret;
+
+unlock_exit:
+	i = start_i;
+	while(locked_pages > 0) {
+		page = extent_buffer_page(eb, i);
+		i++;
+		unlock_page(page);
+		locked_pages--;
+	}
+	return ret;
 }
 EXPORT_SYMBOL(read_extent_buffer_pages);
 
@@ -3048,7 +3112,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	char *dst = (char *)dstv;
 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
-	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
@@ -3057,11 +3120,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
-		if (!PageUptodate(page)) {
-			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
-			WARN_ON(1);
-		}
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 		kaddr = kmap_atomic(page, KM_USER1);
@@ -3105,7 +3163,6 @@ printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len,
 	}
 
 	p = extent_buffer_page(eb, i);
-	WARN_ON(!PageUptodate(p));
 	kaddr = kmap_atomic(p, km);
 	*token = kaddr;
 	*map = kaddr + offset;
@@ -3165,7 +3222,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 	while(len > 0) {
 		page = extent_buffer_page(eb, i);
-		WARN_ON(!PageUptodate(page));
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index d9f5bc4dbd7..9d2991d1d3c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,7 +13,6 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_CSUM (1 << 9)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
@@ -218,4 +217,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
 			  unsigned long len);
+int release_extent_buffer_tail_pages(struct extent_buffer *eb);
+int extent_range_uptodate(struct extent_io_tree *tree,
+			  u64 start, u64 end);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5e9f69244f9..1ed179c020a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -843,7 +843,7 @@ void btrfs_transaction_flush_work(struct btrfs_root *root)
 
 void __init btrfs_init_transaction_sys(void)
 {
-	trans_wq = create_workqueue("btrfs");
+	trans_wq = create_workqueue("btrfs-transaction");
 }
 
 void btrfs_exit_transaction_sys(void)
-- 
cgit v1.2.3


From 22c599485b1fdd95e4476a4752596a6cf6c6629a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Handle data block end_io through the async work queue

Before, this was done by the bio end_io routine. The work queue code is able
to scale much better with faster IO subsystems.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 34 +++++++++++++++++++++++-----------
 fs/btrfs/disk-io.h |  2 ++
 fs/btrfs/inode.c   |  4 +++-
 3 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 82109204788..94ff87d0eae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,6 +54,7 @@ struct end_io_wq {
 	void *private;
 	struct btrfs_fs_info *info;
 	int error;
+	int metadata;
 	struct list_head list;
 };
 
@@ -308,29 +309,40 @@ static int end_workqueue_bio(struct bio *bio,
 #endif
 }
 
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+			int metadata)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct end_io_wq *end_io_wq;
-	u64 offset;
-	offset = bio->bi_sector << 9;
-
-	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
-	}
-
 	end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS);
 	if (!end_io_wq)
 		return -ENOMEM;
 
 	end_io_wq->private = bio->bi_private;
 	end_io_wq->end_io = bio->bi_end_io;
-	end_io_wq->info = root->fs_info;
+	end_io_wq->info = info;
 	end_io_wq->error = 0;
 	end_io_wq->bio = bio;
+	end_io_wq->metadata = metadata;
 
 	bio->bi_private = end_io_wq;
 	bio->bi_end_io = end_workqueue_bio;
+	return 0;
+}
+
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 offset;
+	int ret;
+
+	offset = bio->bi_sector << 9;
+
+	if (rw & (1 << BIO_RW)) {
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+	}
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
+	BUG_ON(ret);
 
 	if (offset == BTRFS_SUPER_INFO_OFFSET) {
 		bio->bi_bdev = root->fs_info->sb->s_bdev;
@@ -880,7 +892,7 @@ void btrfs_end_io_csum(struct work_struct *work)
 		end_io_wq = list_entry(next, struct end_io_wq, list);
 
 		bio = end_io_wq->bio;
-		if (!bio_ready_for_csum(bio)) {
+		if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
 			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
 			was_empty = list_empty(&fs_info->end_io_work_list);
 			list_add_tail(&end_io_wq->list,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 05b88d0e75e..4fac0ccbf8f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,4 +71,6 @@ void btrfs_throttle(struct btrfs_root *root);
 int btrfs_open_device(struct btrfs_device *dev);
 int btrfs_verify_block_csum(struct btrfs_root *root,
 			    struct extent_buffer *buf);
+int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
+			int metadata);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ae677d8a6d..e1ef1acdb35 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -328,7 +328,9 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (rw != WRITE) {
+	if (!(rw & (1 << BIO_RW))) {
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+		BUG_ON(ret);
 		goto mapit;
 	}
 
-- 
cgit v1.2.3


From f188591e987e21b6f7f8864c66a02858b95b530e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Retry metadata reads in the face of checksum failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 11 +++++++-
 fs/btrfs/disk-io.c   | 72 ++++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent_io.c | 50 +++++++++++++++++++++---------------
 fs/btrfs/extent_io.h |  5 ++--
 fs/btrfs/inode.c     |  7 ++---
 fs/btrfs/volumes.c   | 39 +++++++++++++++++++++++++---
 fs/btrfs/volumes.h   |  6 +++--
 7 files changed, 138 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index efce173a935..ff4e9c6859d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -569,7 +569,16 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 static int noinline check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	return 0;
+	u64 found_start;
+	if (btrfs_header_level(path->nodes[level]) != level)
+	    printk("warning: bad level %Lu wanted %d found %d\n",
+		   path->nodes[level]->start, level,
+		   btrfs_header_level(path->nodes[level]));
+	found_start = btrfs_header_bytenr(path->nodes[level]);
+	if (found_start != path->nodes[level]->start) {
+	    printk("warning: bad bytentr %Lu found %Lu\n",
+		   path->nodes[level]->start, found_start);
+	}
 #if 0
 	struct extent_buffer *buf = path->nodes[level];
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94ff87d0eae..59bdf0474be 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -156,7 +156,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			memcpy(&found, result, BTRFS_CRC32_SIZE);
 
 			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
-			WARN_ON(1);
 			printk("btrfs: %s checksum verify failed on %llu "
 			       "wanted %X found %X from_this_trans %d "
 			       "level %d\n",
@@ -171,6 +170,40 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+					  struct extent_buffer *eb,
+					  u64 start)
+{
+	struct extent_io_tree *io_tree;
+	int ret;
+	int num_copies = 0;
+	int mirror_num = 0;
+
+	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+	while (1) {
+		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+					       btree_get_extent, mirror_num);
+		if (!ret) {
+			if (mirror_num)
+printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+			return ret;
+		}
+		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+					      eb->start, eb->len);
+printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+		if (num_copies == 1) {
+printk("reading %Lu failed only one copy\n", eb->start);
+			return ret;
+		}
+		mirror_num++;
+		if (mirror_num > num_copies) {
+printk("bailing at mirror %d of %d\n", mirror_num, num_copies);
+			return ret;
+		}
+	}
+printk("read extent buffer page last\n");
+	return -EIO;
+}
 
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
@@ -180,6 +213,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
+	int ret;
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 	if (page->private == EXTENT_PAGE_PRIVATE)
@@ -191,8 +226,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
-				 btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE);
+	BUG_ON(ret);
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -240,7 +275,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	unsigned long len;
 	struct extent_buffer *eb;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	int ret;
+	int ret = 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	if (page->private == EXTENT_PAGE_PRIVATE)
@@ -252,25 +287,26 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
-				 btree_get_extent);
+
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
-		       start, found_start, len);
-		WARN_ON(1);
+printk("bad start on %Lu found %Lu\n", eb->start, found_start);
+		ret = -EIO;
 		goto err;
 	}
 	if (eb->first_page != page) {
 		printk("bad first page %lu %lu\n", eb->first_page->index,
 		       page->index);
 		WARN_ON(1);
+		ret = -EIO;
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
 
 	ret = csum_tree_block(root, eb, 1);
+	if (ret)
+		ret = -EIO;
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
@@ -278,7 +314,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 err:
 	free_extent_buffer(eb);
 out:
-	return 0;
+	return ret;
 }
 
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
@@ -329,7 +365,8 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 offset;
@@ -338,7 +375,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 	offset = bio->bi_sector << 9;
 
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 	}
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
@@ -349,7 +386,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 		submit_bio(rw, bio);
 		return 0;
 	}
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
@@ -459,7 +496,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent);
+				 buf, 0, 0, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
@@ -522,8 +559,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	if (!buf)
 		return NULL;
 
-	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
-				       1, btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, buf, 0);
 
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
@@ -1366,10 +1402,8 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 int btrfs_read_buffer(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	int ret;
-	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-					buf, 0, 1, btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, buf, 0);
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfc383c17a3..2f159375c87 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1546,7 +1546,7 @@ static int end_bio_extent_readpage(struct bio *bio,
 				    !(state->state & EXTENT_LOCKED))
 					state = NULL;
 			}
-			if (!state) {
+			if (!state && uptodate) {
 				spin_unlock_irqrestore(&tree->lock, flags);
 				set_extent_uptodate(tree, start, end,
 						    GFP_ATOMIC);
@@ -1567,8 +1567,10 @@ static int end_bio_extent_readpage(struct bio *bio,
 			} else {
 				state = NULL;
 			}
-			set_state_cb(tree, clear, EXTENT_UPTODATE);
-			clear->state |= EXTENT_UPTODATE;
+			if (uptodate) {
+				set_state_cb(tree, clear, EXTENT_UPTODATE);
+				clear->state |= EXTENT_UPTODATE;
+			}
 			clear_state_bit(tree, clear, EXTENT_LOCKED,
 					1, 0);
 			if (cur == start)
@@ -1685,7 +1687,7 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 {
 	u64 maxsector;
 	int ret = 0;
@@ -1722,7 +1724,8 @@ static int submit_one_bio(int rw, struct bio *bio)
 		WARN_ON(1);
 	}
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+					   mirror_num);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1737,7 +1740,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct block_device *bdev,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
-			      bio_end_io_t end_io_func)
+			      bio_end_io_t end_io_func,
+			      int mirror_num)
 {
 	int ret = 0;
 	struct bio *bio;
@@ -1749,7 +1753,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		    (tree->ops && tree->ops->merge_bio_hook &&
 		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
 		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio);
+			ret = submit_one_bio(rw, bio, mirror_num);
 			bio = NULL;
 		} else {
 			return 0;
@@ -1769,7 +1773,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio);
+		ret = submit_one_bio(rw, bio, mirror_num);
 	}
 
 	return ret;
@@ -1798,7 +1802,7 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio)
+				   struct bio **bio, int mirror_num)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1901,7 +1905,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset,
 					 bdev, bio, nr,
-					 end_bio_extent_readpage);
+					 end_bio_extent_readpage, mirror_num);
 		}
 		if (ret)
 			SetPageError(page);
@@ -1923,9 +1927,9 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	struct bio *bio = NULL;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, 0);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -2077,7 +2081,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, page_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage);
+						 end_bio_extent_writepage, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2244,7 +2248,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
+		submit_one_bio(WRITE, epd.bio, 0);
 	}
 	return ret;
 }
@@ -2265,7 +2269,7 @@ int extent_writepages(struct extent_io_tree *tree,
 
 	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
+		submit_one_bio(WRITE, epd.bio, 0);
 	}
 	return ret;
 }
@@ -2297,7 +2301,8 @@ int extent_readpages(struct extent_io_tree *tree,
 			page_cache_get(page);
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
-			__extent_read_full_page(tree, page, get_extent, &bio);
+			__extent_read_full_page(tree, page, get_extent,
+						&bio, 0);
 		}
 		page_cache_release(page);
 	}
@@ -2305,7 +2310,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, 0);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2430,7 +2435,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite);
+					 end_bio_extent_preparewrite, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2696,6 +2701,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
 		set_page_extent_head(page0, len);
+		uptodate = PageUptodate(page0);
 	} else {
 		i = 0;
 	}
@@ -3006,7 +3012,7 @@ EXPORT_SYMBOL(extent_buffer_uptodate);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb,
 			     u64 start, int wait,
-			     get_extent_t *get_extent)
+			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
 	unsigned long start_i;
@@ -3062,8 +3068,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		if (!PageUptodate(page)) {
 			if (start_i == 0)
 				inc_all_pages = 1;
+			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
-						      get_extent, &bio);
+						      get_extent, &bio,
+						      mirror_num);
 			if (err) {
 				ret = err;
 			}
@@ -3073,7 +3081,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, mirror_num);
 
 	if (ret || !wait) {
 		return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9d2991d1d3c..8d6b8a14cc3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,7 +27,8 @@ struct extent_state;
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
+	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio,
+			       int mirror_num);
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
@@ -172,7 +173,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
-			     get_extent_t *get_extent);
+			     get_extent_t *get_extent, int mirror_num);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1ef1acdb35..8c2d5d036bd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -314,7 +314,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, READ, logical,
-			      &map_length, NULL);
+			      &map_length, NULL, 0);
 
 	if (map_length < length + size) {
 		return 1;
@@ -322,7 +322,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	return 0;
 }
 
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -347,7 +348,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
 mapit:
-	return btrfs_map_bio(root, rw, bio);
+	return btrfs_map_bio(root, rw, bio, mirror_num);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 008d3640e8c..3b927f69832 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -788,9 +788,31 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
+/*
+ * Copies available for @logical: num_stripes if DUP or RAID1, otherwise 1.
+ */
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	int copies = 1;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	BUG_ON(!em);
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
+		copies = map->num_stripes;
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return copies;
+}
+
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret)
+		    struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -822,6 +844,9 @@ again:
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
 	/* if our multi bio struct is too small, back off and try again */
 	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < map->num_stripes &&
@@ -862,7 +887,9 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
-		else {
+		else if (mirror_num) {
+			stripe_index = mirror_num - 1;
+		} else {
 			int i;
 			u64 least = (u64)-1;
 			struct btrfs_device *cur;
@@ -880,6 +907,8 @@ again:
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
+		else if (mirror_num)
+			stripe_index = mirror_num - 1;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -938,7 +967,8 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -960,7 +990,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+			      mirror_num);
 	BUG_ON(ret);
 
 	total_devs = multi->num_stripes;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 10ca0104750..3d5d0a9cb82 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -93,7 +93,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   u64 owner, u64 num_bytes, u64 *start);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret);
+		    struct btrfs_multi_bio **multi_ret, int mirror_num);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -101,7 +101,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      u64 *num_bytes, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
@@ -112,4 +113,5 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
 int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 #endif
-- 
cgit v1.2.3


From 7e38326f5b125288a2baea1d815c500502ab9fc0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Handle checksumming errors while reading data blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   1 +
 fs/btrfs/extent_io.c   |   9 ++++
 fs/btrfs/extent_io.h   |   3 ++
 fs/btrfs/inode.c       | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 126 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9b9db9cbc01..fe6ef8e3416 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@ struct btrfs_inode {
 	struct btrfs_key location;
 	struct extent_map_tree extent_tree;
 	struct extent_io_tree io_tree;
+	struct extent_io_tree io_failure_tree;
 	struct inode vfs_inode;
 
 	u64 ordered_trans;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2f159375c87..866460c3d72 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1534,6 +1534,15 @@ static int end_bio_extent_readpage(struct bio *bio,
 			if (ret)
 				uptodate = 0;
 		}
+		if (!uptodate && tree->ops &&
+		    tree->ops->readpage_io_failed_hook) {
+			ret = tree->ops->readpage_io_failed_hook(bio, page,
+							 start, end, state);
+			if (ret == 0) {
+				state = NULL;
+				continue;
+			}
+		}
 
 		spin_lock_irqsave(&tree->lock, flags);
 		if (!state || state->end != end) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 8d6b8a14cc3..b47859ccd78 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,9 @@ struct extent_io_ops {
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
+	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
+				       u64 start, u64 end,
+				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
 	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8c2d5d036bd..48f1d1b9645 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -385,6 +385,86 @@ out:
 	return ret;
 }
 
+struct io_failure_record {	/* one failed read being retried on other mirrors */
+	struct page *page;	/* NOTE(review): not assigned by the failed-read hook; confirm use */
+	u64 start;		/* start of the failed range; also its key in io_failure_tree */
+	u64 len;		/* end - start + 1 of the failed range */
+	u64 logical;		/* em->block_start plus the offset of start into the extent */
+	int last_mirror;	/* mirror_num passed to the most recent retry */
+};
+
+/*
+ * Read of [start, end] in @page failed: track it in io_failure_tree and
+ * resubmit the read to the next mirror.  Returns 0 if a retry was
+ * submitted, otherwise -ENOMEM or -EIO (no retry left or possible).
+ */
+int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
+				  struct page *page, u64 start, u64 end,
+				  struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct bio *bio;
+	int num_copies, ret;
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {	/* first failure of this range: build and stash a record */
+		size_t pg_offset = start - page_offset(page);
+		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->last_mirror = 0;
+		em = btrfs_get_extent(inode, NULL, pg_offset, start,
+				      failrec->len, 0);
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		failrec->logical = em->block_start + (start - em->start);
+		free_extent_map(em);
+		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
+				EXTENT_DIRTY, GFP_NOFS);
+		set_state_private(failure_tree, start,	/* via ulong: 32-bit safe */
+				  (u64)(unsigned long)failrec);
+	} else {	/* repeat failure: reuse the record stored the first time */
+		failrec = (struct io_failure_record *)(unsigned long)private;
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	failrec->last_mirror++;
+	if (!state) {
+		spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
+		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+						    failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
+	}
+	if (!state || failrec->last_mirror > num_copies) {
+		set_state_private(failure_tree, failrec->start, 0);
+		clear_extent_bits(failure_tree, failrec->start,
+				  failrec->start + failrec->len - 1,
+				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		kfree(failrec);
+		return -EIO;
+	}
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = failed_bio->bi_bdev;
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
+	return 0;
+}
+
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -419,6 +499,29 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
+
+	/* if the io failure tree for this inode is non-empty,
+	 * check to see if we've recovered from a failed IO
+	 */
+	private = 0;
+	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			     (u64)-1, 1, EXTENT_DIRTY)) {
+		u64 private_failure;
+		struct io_failure_record *failure;
+		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+					start, &private_failure);
+		if (ret == 0) {
+			failure = (struct io_failure_record *)private_failure;
+			set_state_private(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start, 0);
+			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start,
+					  failure->start + failure->len - 1,
+					  EXTENT_DIRTY | EXTENT_LOCKED,
+					  GFP_NOFS);
+			kfree(failure);
+		}
+	}
 	return 0;
 
 zeroit:
@@ -429,7 +532,7 @@ zeroit:
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
-	return 0;
+	return -EIO;
 }
 
 void btrfs_read_locked_inode(struct inode *inode)
@@ -1271,6 +1374,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+			     inode->i_mapping, GFP_NOFS);
 	return 0;
 }
 
@@ -1578,6 +1683,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
+	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+			     inode->i_mapping, GFP_NOFS);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1803,6 +1910,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
@@ -2972,6 +3081,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 				     inode->i_mapping, GFP_NOFS);
+		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
+				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
@@ -3070,6 +3181,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
-- 
cgit v1.2.3


From 85d824c4a407ba8dbd897f4e05bbe21bf24cc071 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Apr 2008 10:23:19 -0400
Subject: Btrfs: Disable extra debugging checks on tree blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ff4e9c6859d..e8bf6c221e4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -570,6 +570,7 @@ static int noinline check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
 	u64 found_start;
+	return 0;
 	if (btrfs_header_level(path->nodes[level]) != level)
 	    printk("warning: bad level %Lu wanted %d found %d\n",
 		   path->nodes[level]->start, level,
-- 
cgit v1.2.3


From 16432985920f3c45af82da214e2498f3e2f9066b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Apr 2008 10:23:21 -0400
Subject: Btrfs: Add O_DIRECT read and write (writes == buffered + cache flush)

This adds basic O_DIRECT read and write support.  In the write case, we
just do a normal buffered write followed by a cache flush.  O_DIRECT +
O_SYNC are required to trigger metadata syncs.

In the read case, there is a basic btrfs_get_block call for use by
the generic O_DIRECT code.  This does honor multi-volume mapping rules
but it skips all checksumming.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  | 11 ++++++++--
 fs/btrfs/inode.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5fa450452f9..5b938645221 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -796,8 +796,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	pinned[0] = NULL;
 	pinned[1] = NULL;
-	if (file->f_flags & O_DIRECT)
-		return -EINVAL;
 
 	pos = *ppos;
 	start_pos = pos;
@@ -909,6 +907,15 @@ out_nolock:
 				      start_pos, num_written);
 		if (err < 0)
 			num_written = err;
+	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
+		do_sync_mapping_range(inode->i_mapping, start_pos,
+				      start_pos + num_written - 1,
+				      SYNC_FILE_RANGE_WRITE |
+				      SYNC_FILE_RANGE_WAIT_AFTER);
+
+		invalidate_mapping_pages(inode->i_mapping,
+		      start_pos >> PAGE_CACHE_SHIFT,
+		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48f1d1b9645..32e778fb014 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2265,6 +2265,70 @@ out:
 	return em;
 }
 
+/*
+ * get_block callback for O_DIRECT reads: map file block @iblock to a device
+ * and sector.  Holes and delalloc extents are left unmapped (returns 0);
+ * inline extents return -EINVAL; a failed extent lookup returns its error.
+ */
+static int btrfs_get_block(struct inode *inode, sector_t iblock,
+			struct buffer_head *bh_result, int create)
+{
+	struct extent_map *em;
+	u64 start = (u64)iblock << inode->i_blkbits;
+	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 len, logical, map_length;
+	int ret = 0;
+
+	em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
+	if (IS_ERR(em))	/* an ERR_PTR must never reach free_extent_map() */
+		return PTR_ERR(em);
+	if (!em)
+		return 0;
+
+	if (em->start > start || em->start + em->len <= start)
+		goto out;
+
+	if (em->block_start == EXTENT_MAP_INLINE) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (em->block_start == EXTENT_MAP_HOLE ||
+	    em->block_start == EXTENT_MAP_DELALLOC)
+		goto out;
+
+	len = em->start + em->len - start;
+	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
+	logical = em->block_start + (start - em->start);
+
+	map_length = len;
+	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+			      logical, &map_length, &multi, 0);
+	BUG_ON(ret);
+	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
+	bh_result->b_size = min(map_length, len);
+	bh_result->b_bdev = multi->stripes[0].dev->bdev;
+	set_buffer_mapped(bh_result);
+	kfree(multi);
+out:
+	free_extent_map(em);
+	return ret;
+}
+
+/* Direct I/O entry: reads are mapped by btrfs_get_block, writes get -EINVAL. */
+static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
+			const struct iovec *iov, loff_t offset,
+			unsigned long nr_segs)
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+	if (rw != WRITE)
+		return blockdev_direct_IO(rw, iocb, inode,
+					  inode->i_sb->s_bdev, iov, offset,
+					  nr_segs, btrfs_get_block, NULL);
+	return -EINVAL;
+}
+
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
 {
 	return extent_bmap(mapping, iblock, btrfs_get_extent);
@@ -3136,6 +3200,7 @@ out_fail:
 	btrfs_throttle(root);
 	return err;
 }
+
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
 {
@@ -3193,6 +3258,7 @@ static struct address_space_operations btrfs_aops = {
 	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
 	.bmap		= btrfs_bmap,
+	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
-- 
cgit v1.2.3


From f29844623de29a12358d7fba35d0959465b64adf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Apr 2008 16:19:33 -0400
Subject: Btrfs: Write out all super blocks on commit, and bring back proper
 barrier support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.h |   1 +
 fs/btrfs/volumes.c |   8 ++--
 fs/btrfs/volumes.h |   3 ++
 4 files changed, 122 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59bdf0474be..cf1de75f088 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -382,7 +382,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	BUG_ON(ret);
 
 	if (offset == BTRFS_SUPER_INFO_OFFSET) {
-		bio->bi_bdev = root->fs_info->sb->s_bdev;
+		bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 		submit_bio(rw, bio);
 		return 0;
 	}
@@ -988,7 +988,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->new_trans_lock);
 
 	init_completion(&fs_info->kobj_unregister);
-	sb_set_blocksize(sb, 4096);
+	sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -1169,14 +1169,121 @@ fail:
 	return ERR_PTR(err);
 }
 
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+			printk(KERN_WARNING "lost page write due to "
+					"I/O error on %s\n",
+				       bdevname(bh->b_bdev, b));
+		}
+		set_buffer_write_io_error(bh);
+		clear_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+int write_all_supers(struct btrfs_root *root)
+{
+	struct list_head *cur;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
+	struct btrfs_device *dev;
+	struct extent_buffer *sb;
+	struct btrfs_dev_item *dev_item;
+	struct buffer_head *bh;
+	int ret;
+	int do_barriers;
+
+	do_barriers = !btrfs_test_opt(root, NOBARRIER);
+
+	sb = root->fs_info->sb_buffer;
+	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+						      dev_item);
+	list_for_each(cur, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		btrfs_set_device_type(sb, dev_item, dev->type);
+		btrfs_set_device_id(sb, dev_item, dev->devid);
+		btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes);
+		btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used);
+		btrfs_set_device_io_align(sb, dev_item, dev->io_align);
+		btrfs_set_device_io_width(sb, dev_item, dev->io_width);
+		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
+		write_extent_buffer(sb, dev->uuid,
+				    (unsigned long)btrfs_device_uuid(dev_item),
+				    BTRFS_DEV_UUID_SIZE);
+
+		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
+		csum_tree_block(root, sb, 0);
+
+		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET /
+			      root->fs_info->sb->s_blocksize,
+			      BTRFS_SUPER_INFO_SIZE);
+
+		read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE);
+		dev->pending_io = bh;
+
+		get_bh(bh);
+		set_buffer_uptodate(bh);
+		lock_buffer(bh);
+		bh->b_end_io = btrfs_end_buffer_write_sync;
+
+		if (do_barriers && dev->barriers) {
+			ret = submit_bh(WRITE_BARRIER, bh);
+			if (ret == -EOPNOTSUPP) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       dev->name);
+				set_buffer_uptodate(bh);
+				dev->barriers = 0;
+				get_bh(bh);
+				lock_buffer(bh);
+				ret = submit_bh(WRITE, bh);
+			}
+		} else {
+			ret = submit_bh(WRITE, bh);
+		}
+		BUG_ON(ret);
+	}
+
+	list_for_each(cur, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		BUG_ON(!dev->pending_io);
+		bh = dev->pending_io;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(dev->pending_io)) {
+			if (do_barriers && dev->barriers) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       dev->name);
+				set_buffer_uptodate(bh);
+				get_bh(bh);
+				lock_buffer(bh);
+				dev->barriers = 0;
+				ret = submit_bh(WRITE, bh);
+				BUG_ON(ret);
+				wait_on_buffer(bh);
+				BUG_ON(!buffer_uptodate(bh));
+			} else {
+				BUG();
+			}
+
+		}
+		dev->pending_io = NULL;
+		brelse(bh);
+	}
+	return 0;
+}
+
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
 	int ret;
-	struct extent_buffer *super = root->fs_info->sb_buffer;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct super_block *sb = root->fs_info->sb;
 
+	ret = write_all_supers(root);
+#if 0
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
@@ -1184,6 +1291,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 				     super->start, super->len);
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
+#endif
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4fac0ccbf8f..60b01902db7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -20,6 +20,7 @@
 #define __DISKIO__
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
 struct btrfs_device;
 struct btrfs_fs_devices;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b927f69832..07d43553141 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->barriers = 1;
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -208,6 +209,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	struct buffer_head *bh;
 	int ret;
 	u64 devid;
+	u64 transid;
 
 	mutex_lock(&uuid_mutex);
 
@@ -236,14 +238,14 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-	printk("found device %Lu on %s\n", devid, path);
+	transid = btrfs_super_generation(disk_super);
+	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_excl(bdev);
-	printk("scan one closes bdev %s\n", path);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1143,7 +1145,7 @@ static int read_one_dev(struct btrfs_root *root,
 	device = btrfs_find_device(root, devid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
-		device = kmalloc(sizeof(*device), GFP_NOFS);
+		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
 		list_add(&device->dev_list,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3d5d0a9cb82..89548837a1c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,9 +21,12 @@
 
 #include <linux/bio.h>
 
+struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
+	struct buffer_head *pending_io;
+	int barriers;
 	spinlock_t io_lock;
 
 	struct block_device *bdev;
-- 
cgit v1.2.3


From 3dd39914bc0737d055c780d29c20a1d117819cdb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Apr 2008 10:51:07 -0400
Subject: Btrfs: Add extra checks to avoid removing extent_state from pages we
 can't free

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cf1de75f088..e22960e2350 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -436,6 +436,12 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	struct extent_map_tree *map;
 	int ret;
 
+	if (page_count(page) > 3) {
+		/* once for page->private, once for the caller,
+		 * once for the page cache
+		 */
+		return 0;
+	}
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
-- 
cgit v1.2.3


From 587f77043a1c86e2a7900ff2ce86bef3c1f4e075 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 11 Apr 2008 12:16:46 -0400
Subject: Btrfs: Fixup a few u64<->pointer casts for 32 bit

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 32e778fb014..46f54dfa2b0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -430,9 +430,10 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
 				EXTENT_DIRTY, GFP_NOFS);
-		set_state_private(failure_tree, start, (u64)failrec);
+		set_state_private(failure_tree, start,
+				 (u64)(unsigned long)failrec);
 	} else {
-		failrec = (struct io_failure_record *)private;
+		failrec = (struct io_failure_record *)(unsigned long)private;
 	}
 	num_copies = btrfs_num_copies(
 			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
@@ -511,7 +512,8 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
 					start, &private_failure);
 		if (ret == 0) {
-			failure = (struct io_failure_record *)private_failure;
+			failure = (struct io_failure_record *)(unsigned long)
+				   private_failure;
 			set_state_private(&BTRFS_I(inode)->io_failure_tree,
 					  failure->start, 0);
 			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-- 
cgit v1.2.3


From a5eb62e345fc1818d0d8b6181463200a9e8dfe39 Mon Sep 17 00:00:00 2001
From: Miguel <miguel.filipe@gmail.com>
Date: Fri, 11 Apr 2008 15:45:51 -0400
Subject: Btrfs: Endianess bug fix for v0.13 with kernels

Fix for an endianness bug when using btrfs v0.13 with kernels older than 2.6.23.

Problem:

As of v0.13, btrfs-progs uses a crc32c.c equivalent to the one found in
linux-2.6.23/lib/libcrc32c.c.  Since crc32c_le() changed in linux-2.6.23,
running btrfs v0.13 with older kernels produces a mismatch between the
versions of crc32c_le() in btrfs-progs and in the kernel's libcrc32c.  This
mismatch causes a bug when using btrfs on big endian machines.

Solution:
Add a btrfs_crc32c() macro that, when compiling for kernels older than 2.6.23,
does endianness conversion on the seed and the return value of crc32c().
This endianness conversion nullifies the differences in implementation
of crc32c_le().
On kernel 2.6.23 or newer, it simply calls crc32c().

Signed-off-by: Miguel Sousa Filipe <miguel.filipe@gmail.com>
---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/crc32c.h      | 17 +++++++++++++++++
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/extent-tree.c | 12 +++++-------
 3 files changed, 24 insertions(+), 9 deletions(-)
 create mode 100644 fs/btrfs/crc32c.h

(limited to 'fs')

diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
new file mode 100644
index 00000000000..a93255b4ee2
--- /dev/null
+++ b/fs/btrfs/crc32c.h
@@ -0,0 +1,17 @@
+#include <asm/byteorder.h>
+#include <linux/crc32c.h>
+#include <linux/version.h>
+
+/**
+ * The implementation of crc32c_le() changed in linux-2.6.23;
+ * as of v0.13, btrfs-progs uses the latest version.
+ * We must work around older implementations of crc32c_le()
+ * found on older kernel versions.
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#define btrfs_crc32c(seed, data, length) \
+	__cpu_to_le32( crc32c( __le32_to_cpu(seed), data, length) )
+#else
+#define btrfs_crc32c(seed, data, length) \
+	crc32c(seed, data, length)
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e22960e2350..0bfcc31d94d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -18,13 +18,13 @@
 
 #include <linux/fs.h>
 #include <linux/blkdev.h>
-#include <linux/crc32c.h>
 #include <linux/scatterlist.h>
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
+#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -100,7 +100,7 @@ out:
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-	return crc32c(seed, data, len);
+	return btrfs_crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 283b08a32a4..a056a2df689 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -15,11 +15,10 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
-
 #include <linux/sched.h>
-#include <linux/crc32c.h>
 #include <linux/pagemap.h>
 #include "hash.h"
+#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -398,16 +397,15 @@ static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 	u32 high_crc = ~(u32)0;
 	u32 low_crc = ~(u32)0;
 	__le64 lenum;
-
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(ref_generation);
-	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 	if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
 		lenum = cpu_to_le64(owner);
-		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 		lenum = cpu_to_le64(owner_offset);
-		low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
 	}
 	return ((u64)high_crc << 32) | (u64)low_crc;
 }
-- 
cgit v1.2.3


From 594994aa3ea4cb52cba9c83a4b6f03e6ff3a646e Mon Sep 17 00:00:00 2001
From: Miguel <miguel.filipe@gmail.com>
Date: Fri, 11 Apr 2008 15:46:48 -0400
Subject: Btrfs: define write_cache_pages for linux kernel <= 2.6.20 instead

write_cache_pages doesn't exist in linux 2.6.20, so change the #if
condition to match that.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 866460c3d72..88322684be6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2109,8 +2109,7 @@ done:
 	return 0;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,20)
 /* Taken directly from 2.6.23 for 2.6.18 back port */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
-- 
cgit v1.2.3


From 73f61b2a6459df982cb4faf0e5cf5ac8b153aaff Mon Sep 17 00:00:00 2001
From: Miguel <miguel.filipe@gmail.com>
Date: Fri, 11 Apr 2008 15:50:59 -0400
Subject: Btrfs: bio_endio support for linux 2.6.23 and older.

bio_endio() changed prototype on linux 2.6.24, support older kernels
using the older prototype.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 ++++-
 fs/btrfs/volumes.c | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0bfcc31d94d..aebe8c21ec8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -949,11 +949,14 @@ void btrfs_end_io_csum(struct work_struct *work)
 		bio->bi_private = end_io_wq->private;
 		bio->bi_end_io = end_io_wq->end_io;
 		kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+		bio_endio(bio, bio->bi_size, err);
+#else
 		bio_endio(bio, error);
+#endif
 	}
 }
 
-
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 07d43553141..82bc6cfc110 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -960,7 +960,11 @@ static int end_bio_multi_stripe(struct bio *bio,
 			err = multi->error;
 		kfree(multi);
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+		bio_endio(bio, bio->bi_size, err);
+#else
 		bio_endio(bio, err);
+#endif
 	} else {
 		bio_put(bio);
 	}
-- 
cgit v1.2.3


From 98d20f67cf99ccda638dbcdf7b3a9ee0a428d932 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Apr 2008 09:46:10 -0400
Subject: Add a min size parameter to btrfs_alloc_extent

On huge machines, delayed allocation may try to allocate massive extents.
This change allows btrfs_alloc_extent to return something smaller than
the caller asked for, and the data allocation routines will loop over
the allocations until they fill the whole delayed alloc.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/extent-tree.c | 12 +++++++++---
 fs/btrfs/inode.c       |  2 ++
 3 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e803c4daad2..09d614fcafb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1286,7 +1286,8 @@ int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 root_objectid, u64 ref_generation,
+		       u64 num_bytes, u64 min_bytes,
+		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, int data);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a056a2df689..e49147e767d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1694,7 +1694,8 @@ error:
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 root_objectid, u64 ref_generation,
+		       u64 num_bytes, u64 min_alloc_size,
+		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, int data)
@@ -1727,7 +1728,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			        info->metadata_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
-
+again:
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -1751,6 +1752,11 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
+	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
+		num_bytes = num_bytes >> 1;
+		num_bytes = max(num_bytes, min_alloc_size);
+		goto again;
+	}
 	BUG_ON(ret);
 	if (ret)
 		return ret;
@@ -1869,7 +1875,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, blocksize,
+	ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
 				 root_objectid, ref_generation,
 				 level, first_objectid, empty_size, hint,
 				 (u64)-1, &ins, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 46f54dfa2b0..7869c0157a1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -125,6 +125,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
+					 root->sectorsize,
 					 root->root_key.objectid,
 					 trans->transid,
 					 inode->i_ino, start, 0,
@@ -133,6 +134,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 			WARN_ON(1);
 			goto out;
 		}
+		cur_alloc_size = ins.offset;
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
 					       ins.offset);
-- 
cgit v1.2.3


From b248a4152956cbae1b23f4c70ef5b51d6ea2ecfb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Apr 2008 09:48:18 -0400
Subject: Btrfs: A few updates for 2.6.18 and versions older than 2.6.25

This includes fixing a missing spinlock init call that caused an oops on
mount for most kernels other than 2.6.25.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 12 ++++++++++--
 fs/btrfs/file.c    |  8 +++++++-
 fs/btrfs/volumes.c | 15 +++++++--------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aebe8c21ec8..9c94dddde70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -850,7 +850,9 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
+#endif
 	bdi->ra_pages	= default_backing_dev_info.ra_pages * 4;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
@@ -950,7 +952,7 @@ void btrfs_end_io_csum(struct work_struct *work)
 		bio->bi_end_io = end_io_wq->end_io;
 		kfree(end_io_wq);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, err);
+		bio_endio(bio, bio->bi_size, error);
 #else
 		bio_endio(bio, error);
 #endif
@@ -1037,10 +1039,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
+	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1173,7 +1176,9 @@ fail:
 	close_all_devices(fs_info);
 	kfree(extent_root);
 	kfree(tree_root);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
+#endif
 	kfree(fs_info);
 	return ERR_PTR(err);
 }
@@ -1407,7 +1412,10 @@ int close_ctree(struct btrfs_root *root)
 #endif
 	close_all_devices(fs_info);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
+#endif
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5b938645221..9fbda655206 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -908,11 +908,17 @@ out_nolock:
 		if (err < 0)
 			num_written = err;
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+		do_sync_file_range(file, start_pos,
+				      start_pos + num_written - 1,
+				      SYNC_FILE_RANGE_WRITE |
+				      SYNC_FILE_RANGE_WAIT_AFTER);
+#else
 		do_sync_mapping_range(inode->i_mapping, start_pos,
 				      start_pos + num_written - 1,
 				      SYNC_FILE_RANGE_WRITE |
 				      SYNC_FILE_RANGE_WAIT_AFTER);
-
+#endif
 		invalidate_mapping_pages(inode->i_mapping,
 		      start_pos >> PAGE_CACHE_SHIFT,
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 82bc6cfc110..f81519f0e4a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -126,6 +126,7 @@ static int device_list_add(const char *path,
 		}
 		device->devid = devid;
 		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -759,8 +760,8 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 	return ret;
 }
@@ -799,6 +800,7 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -808,7 +810,6 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	else
 		ret = 1;
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return ret;
 }
 
@@ -840,6 +841,7 @@ again:
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -855,7 +857,6 @@ again:
 	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
 	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
 		stripes_allocated = map->num_stripes;
-		spin_unlock(&em_tree->lock);
 		free_extent_map(em);
 		kfree(multi);
 		goto again;
@@ -932,7 +933,6 @@ again:
 	*multi_ret = multi;
 out:
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return 0;
 }
 
@@ -1060,16 +1060,15 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	length = key->offset;
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+	spin_unlock(&map_tree->map_tree.lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
 		free_extent_map(em);
-		spin_unlock(&map_tree->map_tree.lock);
 		return 0;
 	} else if (em) {
 		free_extent_map(em);
 	}
-	spin_unlock(&map_tree->map_tree.lock);
 
 	map = kzalloc(sizeof(*map), GFP_NOFS);
 	if (!map)
@@ -1110,8 +1109,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	spin_lock(&map_tree->map_tree.lock);
 	ret = add_extent_mapping(&map_tree->map_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&map_tree->map_tree.lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 
 	return 0;
@@ -1154,7 +1153,7 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
-		device->total_ios = 0;
+		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
 
-- 
cgit v1.2.3


From e17cade25ff8074101d653557a78df09c16ca276 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Apr 2008 15:41:47 -0400
Subject: Btrfs: Add chunk uuids and update multi-device back references

Block headers now store the chunk tree uuid

Chunk items record the device uuid for each stripe

Device extent items record better back refs to the chunk tree

Block groups record better back refs to the chunk tree

The chunk tree format has also changed.  The objectid of BTRFS_CHUNK_ITEM_KEY
used to be the logical offset of the chunk.  Now it is a chunk tree id,
with the logical offset being stored in the offset field of the key.

This allows a single chunk tree to record multiple logical address spaces,
upping the number of bytes indexed by a chunk tree from 2^64 to
2^128.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 12 ++++++++
 fs/btrfs/ctree.h       | 80 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/disk-io.c     |  6 +++-
 fs/btrfs/extent-tree.c | 22 +++++---------
 fs/btrfs/print-tree.c  | 20 +++++++++----
 fs/btrfs/volumes.c     | 76 +++++++++++++++++++++++++++++++----------------
 fs/btrfs/volumes.h     |  6 ++--
 7 files changed, 160 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e8bf6c221e4..618e526c904 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1382,6 +1382,11 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	write_extent_buffer(c, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(c),
 			    BTRFS_FSID_SIZE);
+
+	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
+			    BTRFS_UUID_SIZE);
+
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
 	lower_gen = btrfs_header_generation(lower);
@@ -1513,6 +1518,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	write_extent_buffer(split, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(split),
 			    BTRFS_FSID_SIZE);
+	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
+			    BTRFS_UUID_SIZE);
 
 	mid = (c_nritems + 1) / 2;
 
@@ -2043,6 +2051,10 @@ again:
 	write_extent_buffer(right, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(right),
 			    BTRFS_FSID_SIZE);
+
+	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
+			    BTRFS_UUID_SIZE);
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + space_needed >
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 09d614fcafb..82d67c3db8b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -70,6 +70,7 @@ extern struct kmem_cache *btrfs_path_cachep;
  * All files have objectids higher than this.
  */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
 
 
 /*
@@ -131,7 +132,7 @@ struct btrfs_mapping_tree {
 	struct extent_map_tree map_tree;
 };
 
-#define BTRFS_DEV_UUID_SIZE 16
+#define BTRFS_UUID_SIZE 16
 struct btrfs_dev_item {
 	/* the internal btrfs device id */
 	__le64 devid;
@@ -154,17 +155,32 @@ struct btrfs_dev_item {
 	/* type and info about this device */
 	__le64 type;
 
+	/* grouping information for allocation decisions */
+	__le32 dev_group;
+
+	/* seek speed 0-100 where 100 is fastest */
+	u8 seek_speed;
+
+	/* bandwidth 0-100 where 100 is fastest */
+	u8 bandwidth;
+
 	/* btrfs generated uuid for this device */
-	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	u8 uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
 	__le64 devid;
 	__le64 offset;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_chunk {
+	/* size of this chunk in bytes */
+	__le64 length;
+
+	/* objectid of the root referencing this chunk */
 	__le64 owner;
+
 	__le64 stripe_len;
 	__le64 type;
 
@@ -199,10 +215,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
+	/* these first four must match the super block */
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 	__le64 bytenr; /* which block this node is supposed to live in */
 	__le64 flags;
+
+	/* allowed to be different from the super from here on down */
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	__le64 generation;
 	__le64 owner;
 	__le32 nritems;
@@ -235,6 +255,8 @@ struct btrfs_super_block {
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 bytenr; /* this block number */
 	__le64 flags;
+
+	/* allowed to be different from the btrfs_header from here on down */
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
@@ -323,14 +345,16 @@ struct btrfs_extent_ref {
 
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
- * the extent
+ * the extent.  The chunk tree uuid field is a way to double check the owner
  */
 struct btrfs_dev_extent {
-	__le64 owner;
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 chunk_offset;
 	__le64 length;
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
-
 struct btrfs_inode_ref {
 	__le16 name_len;
 	/* name goes here */
@@ -424,7 +448,6 @@ struct btrfs_csum_item {
 
 struct btrfs_block_group_item {
 	__le64 used;
-	__le64 chunk_tree;
 	__le64 chunk_objectid;
 	__le64 flags;
 } __attribute__ ((__packed__));
@@ -451,6 +474,7 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
@@ -697,6 +721,9 @@ BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
 
 BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
@@ -710,12 +737,19 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
 BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
 			 sector_size, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+			 dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+			 seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+			 bandwidth, 8);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
 BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
@@ -726,6 +760,12 @@ BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
 BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
 
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+	return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
 			 stripe_len, 64);
@@ -781,13 +821,10 @@ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item,
-			 chunk_tree, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item,
-			 chunk_tree, 64);
 BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
 			struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd,
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
 		   struct btrfs_block_group_item, chunk_objectid, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_flags,
 		   struct btrfs_block_group_item, flags, 64);
@@ -850,9 +887,20 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 
 /* struct btrfs_dev_extent */
-BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+		   chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+		   chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+		   chunk_offset, 64);
 BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
 
+static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+	unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+	return (u8 *)((unsigned long)dev + ptr);
+}
+
 /* struct btrfs_extent_ref */
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
@@ -1087,6 +1135,12 @@ static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 	return (u8 *)ptr;
 }
 
+static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+	unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
+	return (u8 *)ptr;
+}
+
 static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
@@ -1311,7 +1365,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c94dddde70..79c284c8728 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1125,6 +1125,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   blocksize);
 	BUG_ON(!chunk_root->node);
 
+	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+		 BTRFS_UUID_SIZE);
+
 	ret = btrfs_read_chunk_tree(chunk_root);
 	BUG_ON(ret);
 
@@ -1229,7 +1233,7 @@ int write_all_supers(struct btrfs_root *root)
 		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
 		write_extent_buffer(sb, dev->uuid,
 				    (unsigned long)btrfs_device_uuid(dev_item),
-				    BTRFS_DEV_UUID_SIZE);
+				    BTRFS_UUID_SIZE);
 
 		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
 		csum_tree_block(root, sb, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e49147e767d..71f045c6349 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
-			   u64 size);
 
 
 static int cache_block_group(struct btrfs_root *root,
@@ -980,7 +976,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		ret = get_state_private(block_group_cache, start, &ptr);
 		if (ret)
 			break;
-
 		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		err = write_one_cache_group(trans, root,
 					    path, cache);
@@ -1094,8 +1089,7 @@ printk("space info full %Lu\n", flags);
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
-		     extent_root->fs_info->chunk_root->root_key.objectid,
-		     start, num_bytes);
+		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
 
 	return 0;
@@ -2782,7 +2776,7 @@ error:
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size)
 {
 	int ret;
@@ -2796,14 +2790,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	cache = kmalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
-	cache->key.objectid = chunk_objectid;
+	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	cache->cached = 0;
 	cache->pinned = 0;
+
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 	memset(&cache->item, 0, sizeof(cache->item));
 	btrfs_set_block_group_used(&cache->item, bytes_used);
-	btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
@@ -2813,12 +2807,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	bit = block_group_state_bits(type);
-	set_extent_bits(block_group_cache, chunk_objectid,
-			chunk_objectid + size - 1,
+	set_extent_bits(block_group_cache, chunk_offset,
+			chunk_offset + size - 1,
 			bit | EXTENT_LOCKED, GFP_NOFS);
-	set_state_private(block_group_cache, chunk_objectid,
-			  (unsigned long)cache);
 
+	set_state_private(block_group_cache, chunk_offset,
+			  (unsigned long)cache);
 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
 				sizeof(cache->item));
 	BUG_ON(ret);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index ee0de112cf5..e99f3249d05 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -24,7 +24,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 {
 	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
 	int i;
-	printk("\t\tchunk owner %llu type %llu num_stripes %d\n",
+	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+	       (unsigned long long)btrfs_chunk_length(eb, chunk),
 	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
 	       (unsigned long long)btrfs_chunk_type(eb, chunk),
 	       num_stripes);
@@ -140,17 +141,24 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_EXTENT_KEY:
 			dev_extent = btrfs_item_ptr(l, i,
 						    struct btrfs_dev_extent);
-			printk("\t\tdev extent owner %llu length %llu\n",
-			       (unsigned long long)btrfs_dev_extent_owner(l, dev_extent),
-			       (unsigned long long)btrfs_dev_extent_length(l, dev_extent));
+			printk("\t\tdev extent chunk_tree %llu\n"
+			       "\t\tchunk objectid %llu chunk offset %llu "
+			       "length %llu\n",
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_tree(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_objectid(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_offset(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_length(l, dev_extent));
 		};
 	}
 }
 
 void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 {
-	int i;
-	u32 nr;
+	int i; u32 nr;
 	struct btrfs_key key;
 	int level;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f81519f0e4a..23ebd95b25e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		bdev = open_bdev_excl(device->name, flags, holder);
-printk("opening %s devid %Lu\n", device->name, device->devid);
+
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			ret = PTR_ERR(bdev);
@@ -190,7 +190,6 @@ printk("opening %s devid %Lu\n", device->name, device->devid);
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
-printk("lowest bdev %s\n", device->name);
 		}
 		device->bdev = bdev;
 	}
@@ -372,7 +371,9 @@ error:
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
-			   u64 owner, u64 num_bytes, u64 *start)
+			   u64 chunk_tree, u64 chunk_objectid,
+			   u64 chunk_offset,
+			   u64 num_bytes, u64 *start)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -400,7 +401,14 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	extent = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_dev_extent);
-	btrfs_set_dev_extent_owner(leaf, extent, owner);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
+		    BTRFS_UUID_SIZE);
+
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
 err:
@@ -408,17 +416,18 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
+	struct btrfs_chunk *chunk;
 	struct btrfs_key found_key;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	key.objectid = (u64)-1;
+	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
@@ -430,11 +439,18 @@ static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
 
 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
 	if (ret) {
-		*objectid = 0;
+		*offset = 0;
 	} else {
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
-		*objectid = found_key.objectid + found_key.offset;
+		if (found_key.objectid != objectid)
+			*offset = 0;
+		else {
+			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					       struct btrfs_chunk);
+			*offset = found_key.offset +
+				btrfs_chunk_length(path->nodes[0], chunk);
+		}
 	}
 	ret = 0;
 error:
@@ -520,9 +536,12 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_set_device_group(leaf, dev_item, 0);
+	btrfs_set_device_seek_speed(leaf, dev_item, 0);
+	btrfs_set_device_bandwidth(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 	ret = 0;
 
@@ -674,7 +693,10 @@ again:
 		return -ENOSPC;
 	}
 
-	ret = find_next_chunk(chunk_root, &key.objectid);
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &key.offset);
 	if (ret)
 		return ret;
 
@@ -696,8 +718,9 @@ again:
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
+		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -708,26 +731,28 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes
 			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-					     key.objectid,
-					     calc_size, &dev_offset);
+			     info->chunk_root->root_key.objectid,
+			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
+			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
 		map->stripes[index].dev = device;
 		map->stripes[index].physical = dev_offset;
-		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
-		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		stripe = stripes + index;
+		btrfs_set_stack_stripe_devid(stripe, device->devid);
+		btrfs_set_stack_stripe_offset(stripe, dev_offset);
+		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		physical = dev_offset;
 		index++;
 	}
 	BUG_ON(!list_empty(&private_devs));
 
-	/* key.objectid was set above */
-	key.offset = *num_bytes;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
+	/* key was set above */
+	btrfs_set_stack_chunk_length(chunk, *num_bytes);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
 	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
@@ -745,14 +770,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
 	BUG_ON(ret);
-	*start = key.objectid;
+	*start = key.offset;;
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
 	em->bdev = (struct block_device *)map;
-	em->start = key.objectid;
-	em->len = key.offset;
+	em->start = key.offset;
+	em->len = *num_bytes;
 	em->block_start = 0;
 
 	kfree(chunk);
@@ -1056,8 +1081,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 	int i;
 
-	logical = key->objectid;
-	length = key->offset;
+	logical = key->offset;
+	length = btrfs_chunk_length(leaf, chunk);
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1131,7 +1156,7 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 
 	return 0;
 }
@@ -1143,7 +1168,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 89548837a1c..f9cae307217 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -57,7 +57,7 @@ struct btrfs_device {
 	u64 type;
 
 	/* physical drive uuid (or lvm uuid) */
-	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	u8 uuid[BTRFS_UUID_SIZE];
 };
 
 struct btrfs_fs_devices {
@@ -93,7 +93,9 @@ struct btrfs_multi_bio {
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
-			   u64 owner, u64 num_bytes, u64 *start);
+			   u64 chunk_tree, u64 chunk_objectid,
+			   u64 chunk_offset,
+			   u64 num_bytes, u64 *start);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
-- 
cgit v1.2.3


From 321aecc65671ae8136bd2ca6879b56f0221f8ac8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 10:49:51 -0400
Subject: Btrfs: Add RAID10 support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  7 +++++++
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/volumes.c     | 46 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82d67c3db8b..a22edcf4917 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -197,6 +197,9 @@ struct btrfs_chunk {
 	 * item in the btree
 	 */
 	__le16 num_stripes;
+
+	/* sub stripes only matter for raid10 */
+	__le16 sub_stripes;
 	struct btrfs_stripe stripe;
 	/* additional stripes go here */
 } __attribute__ ((__packed__));
@@ -444,6 +447,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
+#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
 
 
 struct btrfs_block_group_item {
@@ -757,6 +761,7 @@ BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
 BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
 BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
 BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
 BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
 
@@ -778,6 +783,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
 			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+			 sub_stripes, 16);
 BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71f045c6349..4e5bd62e6e1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1042,6 +1042,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
 				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_RAID10 |
 				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23ebd95b25e..e6417a573d4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,6 +33,7 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
+	int sub_stripes;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -641,6 +642,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
+	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
 	int index;
@@ -658,6 +660,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 	}
+	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		if (num_stripes < 4)
+			return -ENOSPC;
+		num_stripes &= ~(u32)1;
+		sub_stripes = 2;
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -714,6 +723,8 @@ again:
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		*num_bytes = calc_size * num_stripes / sub_stripes;
 	else
 		*num_bytes = calc_size * num_stripes;
 
@@ -760,12 +771,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
 	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
 	map->sector_size = extent_root->sectorsize;
 	map->stripe_len = stripe_len;
 	map->io_align = stripe_len;
 	map->io_width = stripe_len;
 	map->type = type;
 	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -832,6 +845,8 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	map = (struct map_lookup *)em->bdev;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		ret = map->sub_stripes;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -849,6 +864,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 stripe_offset;
 	u64 stripe_nr;
 	int stripes_allocated = 8;
+	int stripes_required = 1;
 	int stripe_index;
 	int i;
 	struct btrfs_multi_bio *multi = NULL;
@@ -877,10 +893,16 @@ again:
 		mirror_num = 0;
 
 	/* if our multi bio struct is too small, back off and try again */
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
-	    stripes_allocated < map->num_stripes &&
-	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
-	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+	if (rw & (1 << BIO_RW)) {
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+				 BTRFS_BLOCK_GROUP_DUP)) {
+			stripes_required = map->num_stripes;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripes_required = map->sub_stripes;
+		}
+	}
+	if (multi_ret && rw == WRITE &&
+	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
 		kfree(multi);
@@ -900,6 +922,7 @@ again:
 	stripe_offset = offset - stripe_offset;
 
 	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_RAID10 |
 			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
@@ -937,6 +960,19 @@ again:
 			multi->num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		int factor = map->num_stripes / map->sub_stripes;
+		int orig_stripe_nr = stripe_nr;
+
+		stripe_index = do_div(stripe_nr, factor);
+		stripe_index *= map->sub_stripes;
+
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->sub_stripes;
+		else if (mirror_num)
+			stripe_index += mirror_num - 1;
+		else
+			stripe_index += orig_stripe_nr % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -946,7 +982,6 @@ again:
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
 
 	for (i = 0; i < multi->num_stripes; i++) {
 		multi->stripes[i].physical =
@@ -1120,6 +1155,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = btrfs_chunk_type(leaf, chunk);
+	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 	for (i = 0; i < num_stripes; i++) {
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
-- 
cgit v1.2.3


From 44b8bd7edda4f63de180d0f7325c9fb704b3806b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 11:14:51 -0400
Subject: Btrfs: Create a work queue for bio writes

This allows checksumming to happen in parallel among many cpus, and
keeps us from bogging down pdflush with the checksumming code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  3 ++
 fs/btrfs/disk-io.c   | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/disk-io.h   |  3 ++
 fs/btrfs/extent_io.h |  5 +--
 fs/btrfs/inode.c     | 28 +++++++++++-----
 5 files changed, 119 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a22edcf4917..ff15b8513f9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -514,8 +514,11 @@ struct btrfs_fs_info {
 	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head end_io_work_list;
+	struct list_head async_submit_work_list;
 	struct work_struct end_io_work;
+	struct work_struct async_submit_work;
 	spinlock_t end_io_work_lock;
+	spinlock_t async_submit_work_lock;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 79c284c8728..9e41ea93ebc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -47,6 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 
 static struct extent_io_ops btree_extent_io_ops;
 static struct workqueue_struct *end_io_workqueue;
+static struct workqueue_struct *async_submit_workqueue;
 
 struct end_io_wq {
 	struct bio *bio;
@@ -58,6 +59,15 @@ struct end_io_wq {
 	struct list_head list;
 };
 
+struct async_submit_bio {
+	struct inode *inode;
+	struct bio *bio;
+	struct list_head list;
+	extent_submit_bio_hook_t *submit_bio_hook;
+	int rw;
+	int mirror_num;
+};
+
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 len,
 				    int create)
@@ -365,7 +375,31 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+			int rw, struct bio *bio, int mirror_num,
+			extent_submit_bio_hook_t *submit_bio_hook)
+{
+	struct async_submit_bio *async;
+
+	async = kmalloc(sizeof(*async), GFP_NOFS);
+	if (!async)
+		return -ENOMEM;
+
+	async->inode = inode;
+	async->rw = rw;
+	async->bio = bio;
+	async->mirror_num = mirror_num;
+	async->submit_bio_hook = submit_bio_hook;
+
+	spin_lock(&fs_info->async_submit_work_lock);
+	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	spin_unlock(&fs_info->async_submit_work_lock);
+
+	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
+	return 0;
+}
+
+static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -389,6 +423,17 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num)
+{
+	if (!(rw & (1 << BIO_RW))) {
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+	}
+	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, mirror_num,
+				   __btree_submit_bio_hook);
+}
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
@@ -903,9 +948,9 @@ static int bio_ready_for_csum(struct bio *bio)
 }
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-void btrfs_end_io_csum(void *p)
+static void btrfs_end_io_csum(void *p)
 #else
-void btrfs_end_io_csum(struct work_struct *work)
+static void btrfs_end_io_csum(struct work_struct *work)
 #endif
 {
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
@@ -959,6 +1004,39 @@ void btrfs_end_io_csum(struct work_struct *work)
 	}
 }
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+static void btrfs_async_submit_work(void *p)
+#else
+static void btrfs_async_submit_work(struct work_struct *work)
+#endif
+{
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	struct btrfs_fs_info *fs_info = p;
+#else
+	struct btrfs_fs_info *fs_info = container_of(work,
+						     struct btrfs_fs_info,
+						     async_submit_work);
+#endif
+	struct async_submit_bio *async;
+	struct list_head *next;
+
+	while(1) {
+		spin_lock(&fs_info->async_submit_work_lock);
+		if (list_empty(&fs_info->async_submit_work_list)) {
+			spin_unlock(&fs_info->async_submit_work_lock);
+			return;
+		}
+		next = fs_info->async_submit_work_list.next;
+		list_del(next);
+		spin_unlock(&fs_info->async_submit_work_lock);
+
+		async = list_entry(next, struct async_submit_bio, list);
+		async->submit_bio_hook(async->inode, async->rw, async->bio,
+				       async->mirror_num);
+		kfree(async);
+	}
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices)
 {
@@ -987,14 +1065,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 	end_io_workqueue = create_workqueue("btrfs-end-io");
 	BUG_ON(!end_io_workqueue);
+	async_submit_workqueue = create_workqueue("btrfs-async-submit");
 
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->end_io_work_list);
+	INIT_LIST_HEAD(&fs_info->async_submit_work_list);
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->end_io_work_lock);
+	spin_lock_init(&fs_info->async_submit_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -1041,9 +1122,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
+	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work,
+		  fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
 	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
+	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1403,6 +1487,9 @@ int close_ctree(struct btrfs_root *root)
 	flush_workqueue(end_io_workqueue);
 	destroy_workqueue(end_io_workqueue);
 
+	flush_workqueue(async_submit_workqueue);
+	destroy_workqueue(async_submit_workqueue);
+
 	iput(fs_info->btree_inode);
 #if 0
 	while(!list_empty(&fs_info->hashers)) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 60b01902db7..5d5f6979c41 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -74,4 +74,7 @@ int btrfs_verify_block_csum(struct btrfs_root *root,
 			    struct extent_buffer *buf);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
+int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+			int rw, struct bio *bio, int mirror_num,
+			extent_submit_bio_hook_t *submit_bio_hook);
 #endif
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b47859ccd78..232300d13be 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -24,11 +24,12 @@
 
 struct extent_state;
 
+typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
+				       struct bio *bio, int mirror_num);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio,
-			       int mirror_num);
+	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7869c0157a1..457daf725d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -324,13 +324,29 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	return 0;
 }
 
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	btrfs_csum_file_blocks(trans, root, inode, bio);
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return btrfs_map_bio(root, rw, bio, mirror_num);
+}
+
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
 	if (!(rw & (1 << BIO_RW))) {
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 		BUG_ON(ret);
@@ -342,13 +358,9 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		goto mapit;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
-	btrfs_csum_file_blocks(trans, root, inode, bio);
-	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+				   inode, rw, bio, mirror_num,
+				   __btrfs_submit_bio_hook);
 mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num);
 }
-- 
cgit v1.2.3


From e015640f9c4fa2417dcc3bbbb3b2b61ad4059ab0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 11:15:20 -0400
Subject: Btrfs: Write bio checksumming outside the FS mutex

This significantly improves streaming write performance by allowing
concurrency in the data checksumming.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  4 +++-
 fs/btrfs/file-item.c | 46 ++++++++++++++++++++++++++++++++--------------
 fs/btrfs/inode.c     | 11 ++++++++++-
 3 files changed, 45 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ff15b8513f9..b56ae195065 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1515,7 +1515,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio);
+			   struct bio *bio, char *sums);
+int btrfs_csum_one_bio(struct btrfs_root *root,
+		       struct bio *bio, char **sums_ret);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 374d0245b29..9259aece6ed 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,9 +134,36 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_csum_one_bio(struct btrfs_root *root,
+		       struct bio *bio, char **sums_ret)
+{
+	u32 *sums;
+	char *data;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int bio_index = 0;
+
+	sums = kmalloc(bio->bi_vcnt * BTRFS_CRC32_SIZE, GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+	*sums_ret = (char *)sums;
+
+	while(bio_index < bio->bi_vcnt) {
+		data = kmap_atomic(bvec->bv_page, KM_USER0);
+		*sums = ~(u32)0;
+		*sums = btrfs_csum_data(root, data + bvec->bv_offset,
+					*sums, bvec->bv_len);
+		kunmap_atomic(data, KM_USER0);
+		btrfs_csum_final(*sums, (char *)sums);
+		sums++;
+		bio_index++;
+		bvec++;
+	}
+	return 0;
+}
+
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio)
+			   struct bio *bio, char *sums)
 {
 	u64 objectid = inode->i_ino;
 	u64 offset;
@@ -150,12 +177,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
-	u32 csum_result;
+	u32 *sums32 = (u32 *)sums;
 	u32 nritems;
 	u32 ins_size;
 	int bio_index = 0;
 	struct bio_vec *bvec = bio->bi_io_vec;
-	char *data;
 	char *eb_map;
 	char *eb_token;
 	unsigned long map_len;
@@ -278,15 +304,6 @@ found:
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
 next_bvec:
-	data = kmap_atomic(bvec->bv_page, KM_USER0);
-	csum_result = ~(u32)0;
-	csum_result = btrfs_csum_data(root, data + bvec->bv_offset,
-				      csum_result, bvec->bv_len);
-	kunmap_atomic(data, KM_USER0);
-	btrfs_csum_final(csum_result, (char *)&csum_result);
-	if (csum_result == 0) {
-		printk("csum result is 0 for inode %lu offset %Lu\n", inode->i_ino, offset);
-	}
 
 	if (!eb_token ||
 	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
@@ -304,13 +321,14 @@ next_bvec:
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &csum_result, BTRFS_CRC32_SIZE);
+		       sums32, BTRFS_CRC32_SIZE);
 	} else {
-		write_extent_buffer(leaf, &csum_result, (unsigned long)item,
+		write_extent_buffer(leaf, sums32, (unsigned long)item,
 				    BTRFS_CRC32_SIZE);
 	}
 	bio_index++;
 	bvec++;
+	sums32++;
 	if (bio_index < bio->bi_vcnt) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  BTRFS_CRC32_SIZE);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 457daf725d2..3e1f21c632c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -330,14 +330,23 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	char *sums = NULL;
+
+	ret = btrfs_csum_one_bio(root, bio, &sums);
+	BUG_ON(ret);
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+
 	btrfs_set_trans_block_group(trans, inode);
-	btrfs_csum_file_blocks(trans, root, inode, bio);
+	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
+
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
+
+	kfree(sums);
+
 	return btrfs_map_bio(root, rw, bio, mirror_num);
 }
 
-- 
cgit v1.2.3


From 699122f55904bec252ea4f8a64228ce98e3d9fda Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 12:59:22 -0400
Subject: Btrfs: Don't wait on tree block writeback before freeing them anymore

This isn't required anymore because we don't reallocate blocks that
have already been written in this transaction.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 5 -----
 fs/btrfs/extent-tree.c | 1 -
 fs/btrfs/inode.c       | 2 ++
 3 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 618e526c904..1c3d9d6fbda 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -751,7 +751,6 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		add_root_to_dirty_list(root);
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
-		wait_on_tree_block_writeback(root, mid);
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
@@ -810,7 +809,6 @@ static int balance_level(struct btrfs_trans_handle *trans,
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
-			wait_on_tree_block_writeback(root, right);
 			free_extent_buffer(right);
 			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
@@ -854,7 +852,6 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
 		clean_tree_block(trans, root, mid);
-		wait_on_tree_block_writeback(root, mid);
 		free_extent_buffer(mid);
 		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
@@ -2638,7 +2635,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		} else {
 			u64 root_gen = btrfs_header_generation(path->nodes[1]);
 			clean_tree_block(trans, root, leaf);
-			wait_on_tree_block_writeback(root, leaf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
@@ -2690,7 +2686,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 							   path->nodes[1]);
 
 				clean_tree_block(trans, root, leaf);
-				wait_on_tree_block_writeback(root, leaf);
 
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e5bd62e6e1..76fd5d7146e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1887,7 +1887,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	}
 	btrfs_set_header_generation(buf, trans->transid);
 	clean_tree_block(trans, root, buf);
-	wait_on_tree_block_writeback(root, buf);
 	btrfs_set_buffer_uptodate(buf);
 
 	if (PageDirty(buf->first_page)) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e1f21c632c..5632ea76007 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -383,9 +383,11 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	struct btrfs_csum_item *item;
 	struct btrfs_path *path = NULL;
 	u32 csum;
+
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-- 
cgit v1.2.3


From 7b859fe7cd383fa5e7e379884a748680d0af7bcc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 13:06:16 -0400
Subject: Btrfs: Only do async bio submission for pdflush

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9e41ea93ebc..76ee7a4ae00 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -381,6 +381,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 {
 	struct async_submit_bio *async;
 
+	/*
+	 * inline writerback should stay inline, only hop to the async
+	 * queue if we're pdflush
+	 */
+	if (!current_is_pdflush())
+		return submit_bio_hook(inode, rw, bio, mirror_num);
+
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 3b951516ed703af0f6d82053937655ad69b60864 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:29:12 -0400
Subject: Btrfs: Use the extent map cache to find the logical disk block during
 data retries

The data read retry code needs to find the logical disk block before it
can resubmit new bios.  But, finding this block isn't allowed to take
the fs_mutex because that will deadlock with a number of different callers.

This changes the retry code to use the extent map cache instead, but
that requires the extent map cache to have the extent we're looking for.
This is a problem because btrfs_drop_extent_cache just drops the entire
extent instead of the little tiny part it is invalidating.

The bulk of the code in this patch changes btrfs_drop_extent_cache to
invalidate only a portion of the extent cache, and changes btrfs_get_extent
to deal with the results.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |   1 +
 fs/btrfs/extent_io.c   |  12 +++--
 fs/btrfs/file.c        |  47 ++++++++++++++++++-
 fs/btrfs/inode.c       | 120 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/volumes.c     |   3 ++
 5 files changed, 168 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 76fd5d7146e..593011e5d45 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1747,6 +1747,7 @@ again:
 			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
+
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
 		num_bytes = max(num_bytes, min_alloc_size);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 88322684be6..21597bea21f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1025,7 +1025,8 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
-		*end = (u64)-1;
+		if (!found)
+			*end = (u64)-1;
 		goto out;
 	}
 
@@ -1540,6 +1541,8 @@ static int end_bio_extent_readpage(struct bio *bio,
 							 start, end, state);
 			if (ret == 0) {
 				state = NULL;
+				uptodate =
+					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				continue;
 			}
 		}
@@ -1555,10 +1558,11 @@ static int end_bio_extent_readpage(struct bio *bio,
 				    !(state->state & EXTENT_LOCKED))
 					state = NULL;
 			}
-			if (!state && uptodate) {
+			if (!state) {
 				spin_unlock_irqrestore(&tree->lock, flags);
-				set_extent_uptodate(tree, start, end,
-						    GFP_ATOMIC);
+				if (uptodate)
+					set_extent_uptodate(tree, start, end,
+							    GFP_ATOMIC);
 				unlock_extent(tree, start, end, GFP_ATOMIC);
 				goto next_io;
 			}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9fbda655206..3f5525f0834 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -356,12 +356,23 @@ out_unlock:
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 {
 	struct extent_map *em;
+	struct extent_map *split = NULL;
+	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 len = end - start + 1;
+	int ret;
+	int testend = 1;
 
-	if (end == (u64)-1)
+	if (end == (u64)-1) {
 		len = (u64)-1;
+		testend = 0;
+	}
 	while(1) {
+		if (!split)
+			split = alloc_extent_map(GFP_NOFS);
+		if (!split2)
+			split2 = alloc_extent_map(GFP_NOFS);
+
 		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (!em) {
@@ -369,6 +380,36 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			break;
 		}
 		remove_extent_mapping(em_tree, em);
+
+		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+		    em->start < start) {
+			split->start = em->start;
+			split->len = start - em->start;
+			split->block_start = em->block_start;
+			split->bdev = em->bdev;
+			split->flags = em->flags;
+			ret = add_extent_mapping(em_tree, split);
+			BUG_ON(ret);
+			free_extent_map(split);
+			split = split2;
+			split2 = NULL;
+		}
+		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+		    testend && em->start + em->len > start + len) {
+			u64 diff = start + len - em->start;
+
+			split->start = start + len;
+			split->len = em->start + em->len - (start + len);
+			split->bdev = em->bdev;
+			split->flags = em->flags;
+
+			split->block_start = em->block_start + diff;
+
+			ret = add_extent_mapping(em_tree, split);
+			BUG_ON(ret);
+			free_extent_map(split);
+			split = NULL;
+		}
 		spin_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -376,6 +417,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 		/* once for the tree*/
 		free_extent_map(em);
 	}
+	if (split)
+		free_extent_map(split);
+	if (split2)
+		free_extent_map(split2);
 	return 0;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5632ea76007..40f8da88409 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,6 +122,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
+	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
@@ -140,6 +142,11 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 					       ins.offset);
 		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
+		if (num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+			       cur_alloc_size);
+			break;
+		}
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
@@ -427,6 +434,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	struct extent_map *em;
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct bio *bio;
 	int num_copies;
 	int ret;
@@ -434,7 +442,6 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
-		size_t pg_offset = start - page_offset(page);
 		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
 		if (!failrec)
 			return -ENOMEM;
@@ -442,8 +449,13 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 		failrec->len = end - start + 1;
 		failrec->last_mirror = 0;
 
-		em = btrfs_get_extent(inode, NULL, pg_offset, start,
-				      failrec->len, 0);
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		spin_unlock(&em_tree->lock);
 
 		if (!em || IS_ERR(em)) {
 			kfree(failrec);
@@ -559,6 +571,8 @@ zeroit:
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
+	if (private == 0)
+		return 0;
 	return -EIO;
 }
 
@@ -908,8 +922,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
+	u64 mask = root->sectorsize - 1;
 
-	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
+	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1212,7 +1227,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						       hole_start, 0, 0,
 						       hole_size);
 			btrfs_drop_extent_cache(inode, hole_start,
-						hole_size - 1);
+						(u64)-1);
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
@@ -2083,6 +2098,68 @@ out_unlock:
 	return err;
 }
 
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+				struct extent_map *existing,
+				struct extent_map *em)
+{
+	u64 start_diff;
+	u64 new_end;
+	int ret = 0;
+	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
+
+	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
+		goto invalid;
+
+	if (!real_blocks && em->block_start != existing->block_start)
+		goto invalid;
+
+	new_end = max(existing->start + existing->len, em->start + em->len);
+
+	if (existing->start >= em->start) {
+		if (em->start + em->len < existing->start)
+			goto invalid;
+
+		start_diff = existing->start - em->start;
+		if (real_blocks && em->block_start + start_diff !=
+		    existing->block_start)
+			goto invalid;
+
+		em->len = new_end - em->start;
+
+		remove_extent_mapping(em_tree, existing);
+		/* free for the tree */
+		free_extent_map(existing);
+		ret = add_extent_mapping(em_tree, em);
+
+	} else if (em->start > existing->start) {
+
+		if (existing->start + existing->len < em->start)
+			goto invalid;
+
+		start_diff = em->start - existing->start;
+		if (real_blocks && existing->block_start + start_diff !=
+		    em->block_start)
+			goto invalid;
+
+		remove_extent_mapping(em_tree, existing);
+		em->block_start = existing->block_start;
+		em->start = existing->start;
+		em->len = new_end - existing->start;
+		free_extent_map(existing);
+
+		ret = add_extent_mapping(em_tree, em);
+	} else {
+		goto invalid;
+	}
+	return ret;
+
+invalid:
+	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
+	       existing->start, existing->len, existing->block_start,
+	       em->start, em->len, em->block_start);
+	return -EIO;
+}
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -2267,12 +2344,35 @@ insert:
 	err = 0;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
+
+	/* it is possible that someone inserted the extent into the tree
+	 * while we had the lock dropped.  It is also possible that
+	 * an overlapping map exists in the tree
+	 */
 	if (ret == -EEXIST) {
-		free_extent_map(em);
-		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em) {
-			err = -EIO;
-			printk("failing to insert %Lu %Lu\n", start, len);
+		struct extent_map *existing;
+		existing = lookup_extent_mapping(em_tree, start, len);
+		if (!existing) {
+			existing = lookup_extent_mapping(em_tree, em->start,
+							 em->len);
+			if (existing) {
+				err = merge_extent_mapping(em_tree, existing,
+							   em);
+				free_extent_map(existing);
+				if (err) {
+					free_extent_map(em);
+					em = NULL;
+				}
+			} else {
+				err = -EIO;
+				printk("failing to insert %Lu %Lu\n",
+				       start, len);
+				free_extent_map(em);
+				em = NULL;
+			}
+		} else {
+			free_extent_map(em);
+			em = existing;
 		}
 	}
 	spin_unlock(&em_tree->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6417a573d4..0e658c1d821 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -883,6 +883,9 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+	if (!em) {
+		printk("unable to find logical %Lu\n", logical);
+	}
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-- 
cgit v1.2.3


From 7bf3b490dfa57bdef5892dffa5021469a0cd2b7f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:58:30 -0400
Subject: Btrfs: Avoid 64 bit div for RAID10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e658c1d821..c11b6fd408a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -724,7 +724,7 @@ again:
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * num_stripes / sub_stripes;
+		*num_bytes = calc_size * (num_stripes / sub_stripes);
 	else
 		*num_bytes = calc_size * num_stripes;
 
-- 
cgit v1.2.3


From 41471e8341a6b476bcd8ecc765a8b297c22a74f9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 14:08:30 -0400
Subject: Btrfs: Remove bogus max_sector warnings from the extent_io code

It was testing the bio before doing logical->physical mapping, so the
test was always wrong.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 21597bea21f..a984a7bc119 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1702,7 +1702,6 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 
 static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 {
-	u64 maxsector;
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
@@ -1730,12 +1729,6 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 
 	bio_get(bio);
 
-        maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
-	if (maxsector < bio->bi_sector) {
-		printk("sector too large max %Lu got %llu\n", maxsector,
-			(unsigned long long)bio->bi_sector);
-		WARN_ON(1);
-	}
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num);
-- 
cgit v1.2.3


From a443755f1ca3e190e12e3a845ddecb3ee1782512 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:38 -0400
Subject: Btrfs: Check device uuids along with devids

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  5 +++++
 fs/btrfs/volumes.c | 30 +++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b56ae195065..d119d95d139 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -800,6 +800,11 @@ static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
 	return (struct btrfs_stripe *)offset;
 }
 
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
 static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
 					 struct btrfs_chunk *c, int nr)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c11b6fd408a..cdf0019cca2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -69,15 +69,18 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
+					  u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
+		if (dev->devid == devid &&
+		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
 			return dev;
+		}
 	}
 	return NULL;
 }
@@ -117,7 +120,8 @@ static int device_list_add(const char *path,
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
-		device = __find_device(&fs_devices->devices, devid);
+		device = __find_device(&fs_devices->devices, devid,
+				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
@@ -126,6 +130,8 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		memcpy(device->uuid, disk_super->dev_item.uuid,
+		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
@@ -1098,11 +1104,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+				       u8 *uuid)
 {
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	return __find_device(head, devid);
+	return __find_device(head, devid, uuid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -1115,6 +1122,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	u8 uuid[BTRFS_UUID_SIZE];
 	int num_stripes;
 	int ret;
 	int i;
@@ -1163,7 +1171,10 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
-		map->stripes[i].dev = btrfs_find_device(root, devid);
+		read_extent_buffer(leaf, uuid, (unsigned long)
+				   btrfs_stripe_dev_uuid_nr(chunk, i),
+				   BTRFS_UUID_SIZE);
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
 		if (!map->stripes[i].dev) {
 			kfree(map);
 			free_extent_map(em);
@@ -1207,8 +1218,13 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+
 	devid = btrfs_device_id(leaf, dev_item);
-	device = btrfs_find_device(root, devid);
+	read_extent_buffer(leaf, dev_uuid,
+			   (unsigned long)btrfs_device_uuid(dev_item),
+			   BTRFS_UUID_SIZE);
+	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
 		device = kzalloc(sizeof(*device), GFP_NOFS);
-- 
cgit v1.2.3


From 7ae9c09d8f001eb19ee2ba219dc5c3d4f6d60614 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:49 -0400
Subject: Btrfs: Add support for labels in the super block

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/volumes.c | 17 +++++++++--------
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d119d95d139..f00c4be59ad 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -247,6 +247,7 @@ struct btrfs_header {
  * room to translate 14 chunks with 3 stripes each.
  */
 #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
 
 /*
  * the super block basically lists the main trees of the FS
@@ -276,6 +277,7 @@ struct btrfs_super_block {
 	u8 root_level;
 	u8 chunk_root_level;
 	struct btrfs_dev_item dev_item;
+	char label[BTRFS_LABEL_SIZE];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdf0019cca2..93aa36e2436 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -57,9 +57,7 @@ int btrfs_cleanup_fs_uuids(void)
 			devices_cur = fs_devices->devices.next;
 			dev = list_entry(devices_cur, struct btrfs_device,
 					 dev_list);
-			printk("uuid cleanup finds %s\n", dev->name);
 			if (dev->bdev) {
-				printk("closing\n");
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
@@ -149,7 +147,6 @@ static int device_list_add(const char *path,
 	}
 	if (fs_devices->lowest_devid > devid) {
 		fs_devices->lowest_devid = devid;
-		printk("lowest devid now %Lu\n", devid);
 	}
 	*fs_devices_ret = fs_devices;
 	return 0;
@@ -166,7 +163,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
-			printk("close devices closes %s\n", device->name);
 		}
 		device->bdev = NULL;
 	}
@@ -220,11 +216,9 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	printk("scan one opens %s\n", path);
 	bdev = open_bdev_excl(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
-		printk("open failed\n");
 		ret = PTR_ERR(bdev);
 		goto error;
 	}
@@ -240,13 +234,20 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
-		printk("no btrfs found on %s\n", path);
 		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
-	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
+	if (disk_super->label[0])
+		printk("device label %s ", disk_super->label);
+	else {
+		/* FIXME, make a readl uuid parser */
+		printk("device fsid %llx-%llx ",
+		       *(unsigned long long *)disk_super->fsid,
+		       *(unsigned long long *)(disk_super->fsid + 8));
+	}
+	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
-- 
cgit v1.2.3


From 7b13b7b119c932a5eca486db4113f4c1fe3b97a8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:50 -0400
Subject: Btrfs: Don't drop extent_map cache during releasepage on the btree
 inode

The btree inode should only have a single extent_map in the cache, so
it doesn't make sense to ever drop it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 23 ++++++++++++++---------
 fs/btrfs/extent_io.c | 38 +++++++++++++++++++++++++++-----------
 fs/btrfs/extent_io.h |  3 +++
 3 files changed, 44 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 76ee7a4ae00..2de2b00afeb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -76,13 +76,12 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	struct extent_map *em;
 	int ret;
 
-again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	spin_unlock(&em_tree->lock);
-	if (em) {
+	if (em)
 		goto out;
-	}
+
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
 		em = ERR_PTR(-ENOMEM);
@@ -95,15 +94,21 @@ again:
 
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
-
 	if (ret == -EEXIST) {
 		free_extent_map(em);
-		em = NULL;
-		goto again;
+		em = lookup_extent_mapping(em_tree, start, len);
+		if (em)
+			ret = 0;
+		else
+			ret = -EIO;
 	} else if (ret) {
-		em = ERR_PTR(ret);
+		free_extent_map(em);
+		em = NULL;
 	}
+	spin_unlock(&em_tree->lock);
+
+	if (ret)
+		em = ERR_PTR(ret);
 out:
 	return em;
 }
@@ -496,7 +501,7 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	}
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
-	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
+	ret = try_release_extent_state(map, tree, page, gfp_flags);
 	if (ret == 1) {
 		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a984a7bc119..81f8b4fd069 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2463,6 +2463,31 @@ err:
 }
 EXPORT_SYMBOL(extent_prepare_write);
 
+/*
+ * a helper for releasepage, this tests for areas of the page that
+ * are locked or under IO and drops the related state bits if it is safe
+ * to drop the page.
+ */
+int try_release_extent_state(struct extent_map_tree *map,
+			     struct extent_io_tree *tree, struct page *page,
+			     gfp_t mask)
+{
+	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	int ret = 1;
+
+	if (test_range_bit(tree, start, end, EXTENT_IOBITS, 0))
+		ret = 0;
+	else {
+		if ((mask & GFP_NOFS) == GFP_NOFS)
+			mask = GFP_NOFS;
+		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
+				 1, 1, mask);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(try_release_extent_state);
+
 /*
  * a helper for releasepage.  As long as there are no locked extents
  * in the range corresponding to the page, both state records and extent
@@ -2475,8 +2500,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	struct extent_map *em;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	u64 orig_start = start;
-	int ret = 1;
+
 	if ((mask & __GFP_WAIT) &&
 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
 		u64 len;
@@ -2507,15 +2531,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			free_extent_map(em);
 		}
 	}
-	if (test_range_bit(tree, orig_start, end, EXTENT_IOBITS, 0))
-		ret = 0;
-	else {
-		if ((mask & GFP_NOFS) == GFP_NOFS)
-			mask = GFP_NOFS;
-		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
-				 1, 1, mask);
-	}
-	return ret;
+	return try_release_extent_state(map, tree, page, mask);
 }
 EXPORT_SYMBOL(try_release_extent_mapping);
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 232300d13be..e4834614751 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -110,6 +110,9 @@ void extent_io_tree_empty_lru(struct extent_io_tree *tree);
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
+int try_release_extent_state(struct extent_map_tree *map,
+			     struct extent_io_tree *tree, struct page *page,
+			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-- 
cgit v1.2.3


From 9b3f68b90674419add8be1c0aa740dcdf04f44cc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:51 -0400
Subject: Btrfs: Calculate appropriate chunk sizes for both small and large
 filesystems

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 93aa36e2436..e3ddd7fb8ed 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,27 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
+static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
+			       int sub_stripes)
+{
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+		return calc_size;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		return calc_size * (num_stripes / sub_stripes);
+	else
+		return calc_size * num_stripes;
+}
+
+
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
 		      u64 *num_bytes, u64 type)
@@ -643,11 +664,14 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
+	int min_chunk_size = 8 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	u64 min_free = calc_size;
+	u64 max_chunk_size = calc_size;
+	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
+	u64 percent_max;
 	int num_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
@@ -666,6 +690,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
+		if (num_stripes < 2)
+			return -ENOSPC;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -674,13 +700,45 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
 	}
+
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		max_chunk_size = 10 * calc_size;
+		min_chunk_size = 256 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		max_chunk_size = 4 * calc_size;
+		min_chunk_size = 64 * 1024 * 1024;
+	} else {
+		min_chunk_size = 32 * 1024 * 1024;
+	}
+
+	/* we don't want a chunk larger than 10% of the FS */
+	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
+	max_chunk_size = min(percent_max, max_chunk_size);
+
+	if (calc_size * num_stripes > max_chunk_size) {
+		calc_size = max_chunk_size;
+		do_div(calc_size, num_stripes);
+		do_div(calc_size, stripe_len);
+		calc_size *= stripe_len;
+	}
+	/* we don't want tiny stripes */
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
+	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
+		          num_stripes, sub_stripes), calc_size);
+
 again:
+	do_div(calc_size, stripe_len);
+	calc_size *= stripe_len;
+
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
 		min_free = calc_size * 2;
+	else
+		min_free = calc_size;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
@@ -727,13 +785,9 @@ again:
 	}
 
 	stripes = &chunk->stripe;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
-		*num_bytes = calc_size;
-	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * (num_stripes / sub_stripes);
-	else
-		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
-- 
cgit v1.2.3


From a40a90a0420abd5ff86a0917facd3293ebb6a9b6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 11:55:51 -0400
Subject: Btrfs: Fix chunk allocation when some devices don't have enough room
 for stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3ddd7fb8ed..fe5b00986d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -664,7 +664,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
-	int min_chunk_size = 8 * 1024 * 1024;
+	int min_stripe_size = 1 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
@@ -673,6 +673,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_avail = 0;
 	u64 percent_max;
 	int num_stripes = 1;
+	int min_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
@@ -683,15 +684,20 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID0))
+	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		min_stripes = 2;
+	}
+	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = 2;
+		min_stripes = 2;
+	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 		if (num_stripes < 2)
 			return -ENOSPC;
+		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -699,22 +705,26 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
+		min_stripes = 4;
 	}
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_chunk_size = 10 * calc_size;
-		min_chunk_size = 256 * 1024 * 1024;
+		min_stripe_size = 64 * 1024 * 1024;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		max_chunk_size = 4 * calc_size;
-		min_chunk_size = 64 * 1024 * 1024;
-	} else {
-		min_chunk_size = 32 * 1024 * 1024;
+		min_stripe_size = 32 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		calc_size = 8 * 1024 * 1024;
+		max_chunk_size = calc_size * 2;
+		min_stripe_size = 1 * 1024 * 1024;
 	}
 
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
 
+again:
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -722,12 +732,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		calc_size *= stripe_len;
 	}
 	/* we don't want tiny stripes */
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
-	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
-		          num_stripes, sub_stripes), calc_size);
+	calc_size = max_t(u64, min_stripe_size, calc_size);
 
-again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
@@ -746,19 +752,27 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
-		if (avail > max_avail)
-			max_avail = avail;
 		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
-		}
+		} else if (avail > max_avail)
+			max_avail = avail;
 		if (cur == dev_list)
 			break;
 	}
 	if (index < num_stripes) {
 		list_splice(&private_devs, dev_list);
+		if (index >= min_stripes) {
+			num_stripes = index;
+			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+				num_stripes /= sub_stripes;
+				num_stripes *= sub_stripes;
+			}
+			looped = 1;
+			goto again;
+		}
 		if (!looped && max_avail > 0) {
 			looped = 1;
 			calc_size = max_avail;
@@ -766,7 +780,6 @@ again:
 		}
 		return -ENOSPC;
 	}
-
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-- 
cgit v1.2.3


From 0afbaf8c8216a0de64b55c2d455de2b6c9dde5eb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 14:17:20 -0400
Subject: Btrfs: Set the btree inode i_size to OFFSET_MAX

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2de2b00afeb..9d648f2f41a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -88,19 +88,35 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 		goto out;
 	}
 	em->start = 0;
-	em->len = i_size_read(inode);
+	em->len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = inode->i_sb->s_bdev;
 
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	if (ret == -EEXIST) {
+		u64 failed_start = em->start;
+		u64 failed_len = em->len;
+
+		printk("failed to insert %Lu %Lu -> %Lu into tree\n",
+		       em->start, em->len, em->block_start);
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em)
+		if (em) {
+			printk("after failing, found %Lu %Lu %Lu\n",
+			       em->start, em->len, em->block_start);
 			ret = 0;
-		else
+		} else {
+			em = lookup_extent_mapping(em_tree, failed_start,
+						   failed_len);
+			if (em) {
+				printk("double failure lookup gives us "
+				       "%Lu %Lu -> %Lu\n", em->start,
+				       em->len, em->block_start);
+				free_extent_map(em);
+			}
 			ret = -EIO;
+		}
 	} else if (ret) {
 		free_extent_map(em);
 		em = NULL;
@@ -1108,7 +1124,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
-	fs_info->btree_inode->i_size = sb->s_bdev->bd_inode->i_size;
+
+	/*
+	 * we set the i_size on the btree inode to the max possible int.
+	 * the real end of the address space is determined by all of
+	 * the devices in the system
+	 */
+	fs_info->btree_inode->i_size = OFFSET_MAX;
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
@@ -1196,9 +1218,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->stripesize = stripesize;
 	sb_set_blocksize(sb, sectorsize);
 
-	i_size_write(fs_info->btree_inode,
-		     btrfs_super_total_bytes(disk_super));
-
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
 		printk("btrfs: valid FS not found on %s\n", sb->s_id);
-- 
cgit v1.2.3


From 9ad6b7bc2e00ba02f915cffd5b6bcd6564bb2c75 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 16:11:30 -0400
Subject: Force page->private removal in btrfs_invalidatepage

btrfs_invalidatepage is not allowed to leave pages around on the lru.
Any such pages will trigger an oops later on because the VM will see
page->private and assume it is a buffer head.

This also forces extra flushes of the async work queues before
dropping all the pages on the btree inode during unmount.  Leftover
items on the work queues are one possible cause of busy state ranges
during truncate_inode_pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 12 ++++++++++++
 fs/btrfs/inode.c   |  6 ++++++
 2 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9d648f2f41a..b479cdfe3ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -533,6 +533,12 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
+	if (PagePrivate(page)) {
+		printk("2invalidate page cleaning up after releasepage\n");
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
 }
 
 #if 0
@@ -1484,6 +1490,8 @@ int close_ctree(struct btrfs_root *root)
 	write_ctree_super(NULL, root);
 	mutex_unlock(&fs_info->fs_mutex);
 
+	btrfs_transaction_flush_work(root);
+
 	if (fs_info->delalloc_bytes) {
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
@@ -1514,7 +1522,11 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
+	flush_workqueue(end_io_workqueue);
+	flush_workqueue(async_submit_workqueue);
+
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+
 	flush_workqueue(end_io_workqueue);
 	destroy_workqueue(end_io_workqueue);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40f8da88409..347cd85db12 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2524,6 +2524,12 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	extent_invalidatepage(tree, page, offset);
 	btrfs_releasepage(page, GFP_NOFS);
+	if (PagePrivate(page)) {
+		printk("invalidate page cleaning up after releasepage\n");
+		ClearPagePrivate(page);
+		set_page_private(page, 0);
+		page_cache_release(page);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 4575c9cceeca2f51c50536850e15e1dc5187f3d9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 16:13:31 -0400
Subject: Btrfs: Scale the bdi ra_pages by the number of devices in the FS

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b479cdfe3ee..46188ee1662 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -932,7 +932,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
 #endif
-	bdi->ra_pages	= default_backing_dev_info.ra_pages * 4;
+	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
 	bdi->unplug_io_fn	= btrfs_unplug_io_fn;
@@ -1214,6 +1214,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		       (unsigned long long)fs_devices->num_devices);
 		goto fail_sb_buffer;
 	}
+	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
-- 
cgit v1.2.3


From ad5bd91ecee2b687071da00094564fd075f934ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 08:28:10 -0400
Subject: Btrfs: Add 1MB to the min_free in alloc_chunk

This properly reflects the first 1MB we skip at the start of the device.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5b00986d2..4fc92cf813d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -746,6 +746,9 @@ again:
 	else
 		min_free = calc_size;
 
+	/* we add 1MB because we never use the first 1MB of the device */
+	min_free += 1024 * 1024;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-- 
cgit v1.2.3


From 4ef64eae288a3644d1f8b748eb075426701e96d1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 08:52:50 -0400
Subject: Btrfs: Remove debugging statements from the invalidatepage calls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/inode.c   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 46188ee1662..c829612c797 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -534,7 +534,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		printk("2invalidate page cleaning up after releasepage\n");
+		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 347cd85db12..5b1514e58de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2510,6 +2510,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
 	if (ret == 1) {
+		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2525,7 +2526,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btrfs_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		printk("invalidate page cleaning up after releasepage\n");
+		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
-- 
cgit v1.2.3


From f2d8d74d7874f8f81222363cd6459a365796e35a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 10:03:05 -0400
Subject: Btrfs: Make an unplug function that doesn't unplug every spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 26 ++++++++++--------
 fs/btrfs/inode.c   |  6 +----
 fs/btrfs/volumes.c | 79 +++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h |  2 ++
 4 files changed, 75 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c829612c797..7f5aca35494 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -913,18 +913,22 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 
 void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct list_head *cur;
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
+	struct inode *inode = page->mapping->host;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	u64 offset = page_offset(page);
 
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn) {
-			bdi->unplug_io_fn(bdi, page);
-		}
-	}
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+	if (!em)
+		return;
+
+	offset = offset - em->start;
+	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
+			  em->block_start + offset, page);
+	free_extent_map(em);
 }
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b1514e58de..e875c7c8a64 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -313,13 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 logical = bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct bio_vec *bvec;
-	int i;
 	int ret;
 
-	bio_for_each_segment(bvec, bio, i) {
-		length += bvec->bv_len;
-	}
+	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, READ, logical,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4fc92cf813d..46024070650 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -930,9 +931,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num)
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+			     u64 logical, u64 *length,
+			     struct btrfs_multi_bio **multi_ret,
+			     int mirror_num, struct page *unplug_page)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -944,6 +946,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripes_required = 1;
 	int stripe_index;
 	int i;
+	int num_stripes;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -960,10 +963,14 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+
+	if (!em && unplug_page)
+		return 0;
+
 	if (!em) {
 		printk("unable to find logical %Lu\n", logical);
+		BUG();
 	}
-	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
@@ -1010,14 +1017,15 @@ again:
 	} else {
 		*length = em->len - offset;
 	}
-	if (!multi_ret)
+
+	if (!multi_ret && !unplug_page)
 		goto out;
 
-	multi->num_stripes = 1;
+	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->num_stripes;
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
@@ -1037,7 +1045,7 @@ again:
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
@@ -1047,8 +1055,8 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->sub_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else
@@ -1063,19 +1071,50 @@ again:
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
-	for (i = 0; i < multi->num_stripes; i++) {
-		multi->stripes[i].physical =
-			map->stripes[stripe_index].physical + stripe_offset +
-			stripe_nr * map->stripe_len;
-		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+	for (i = 0; i < num_stripes; i++) {
+		if (unplug_page) {
+			struct btrfs_device *device;
+			struct backing_dev_info *bdi;
+
+			device = map->stripes[stripe_index].dev;
+			bdi = blk_get_backing_dev_info(device->bdev);
+			if (bdi->unplug_io_fn) {
+				bdi->unplug_io_fn(bdi, unplug_page);
+			}
+		} else {
+			multi->stripes[i].physical =
+				map->stripes[stripe_index].physical +
+				stripe_offset + stripe_nr * map->stripe_len;
+			multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		}
 		stripe_index++;
 	}
-	*multi_ret = multi;
+	if (multi_ret) {
+		*multi_ret = multi;
+		multi->num_stripes = num_stripes;
+	}
 out:
 	free_extent_map(em);
 	return 0;
 }
 
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		      u64 logical, u64 *length,
+		      struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+				 mirror_num, NULL);
+}
+
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+		      u64 logical, struct page *page)
+{
+	u64 length = PAGE_CACHE_SIZE;
+	return __btrfs_map_block(map_tree, READ, logical, &length,
+				 NULL, 0, page);
+}
+
+
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
 #else
@@ -1122,16 +1161,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct bio_vec *bvec;
 	struct btrfs_multi_bio *multi = NULL;
-	int i;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
 
-	bio_for_each_segment(bvec, bio, i) {
-		length += bvec->bv_len;
-	}
+	length = bio->bi_size;
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f9cae307217..dfe7e8a9507 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -119,4 +119,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_device *device);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+		      u64 logical, struct page *page);
 #endif
-- 
cgit v1.2.3


From 3c12ac7205d4bd679fefa722aa9b61385e4b4749 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 12:01:38 -0400
Subject: Btrfs: Simplify device selection for mirrored reads

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  8 ++++----
 fs/btrfs/volumes.c     | 23 +++++++----------------
 2 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 593011e5d45..cc0d7f30c36 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -640,7 +640,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 0;
+	path->reada = 1;
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -660,7 +660,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(root->fs_info->extent_root, path);
 
-	path->reada = 0;
+	path->reada = 1;
 	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
 					  path, bytenr, root_objectid,
 					  ref_generation, owner, owner_offset);
@@ -692,7 +692,7 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 
 	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
-	path->reada = 0;
+	path->reada = 1;
 	key.objectid = bytenr;
 	key.offset = num_bytes;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1328,7 +1328,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 0;
+	path->reada = 1;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, root_objectid,
 				    ref_generation,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 46024070650..107fc74c3ab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1029,19 +1029,8 @@ again:
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
-			int i;
-			u64 least = (u64)-1;
-			struct btrfs_device *cur;
-
-			for (i = 0; i < map->num_stripes; i++) {
-				cur = map->stripes[i].dev;
-				spin_lock(&cur->io_lock);
-				if (cur->total_ios < least) {
-					least = cur->total_ios;
-					stripe_index = i;
-				}
-				spin_unlock(&cur->io_lock);
-			}
+			u64 orig_stripe_nr = stripe_nr;
+			stripe_index = do_div(orig_stripe_nr, num_stripes);
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1050,7 +1039,6 @@ again:
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
-		int orig_stripe_nr = stripe_nr;
 
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
@@ -1059,8 +1047,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += orig_stripe_nr % map->sub_stripes;
+		else {
+			u64 orig_stripe_nr = stripe_nr;
+			stripe_index += do_div(orig_stripe_nr,
+					       map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From b30757178dad19a0388d958ff9eea66e674d39ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 09:22:07 -0400
Subject: Btrfs: Add a special device list for chunk allocations

This allows other code that needs to walk every device in the FS to do so
without locking against allocations.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 15 ++++++++++-----
 fs/btrfs/volumes.h |  7 +++++++
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 107fc74c3ab..5619e50583e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -111,6 +111,7 @@ static int device_list_add(const char *path,
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
+		INIT_LIST_HEAD(&fs_devices->alloc_list);
 		list_add(&fs_devices->list, &fs_uuids);
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
@@ -139,6 +140,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		list_add(&device->dev_list, &fs_devices->devices);
+		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
 		fs_devices->num_devices++;
 	}
 
@@ -660,7 +662,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
+	struct list_head *dev_list;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -682,6 +684,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
@@ -752,12 +755,12 @@ again:
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_list, &private_devs);
+			list_move_tail(&device->dev_alloc_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
@@ -812,12 +815,12 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		/* loop over this device again if we're doing a dup group */
 		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
 		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_list, dev_list);
+			list_move_tail(&device->dev_alloc_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 			     info->chunk_root->root_key.objectid,
@@ -1329,6 +1332,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index dfe7e8a9507..a2660d2b6b3 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,9 +24,12 @@
 struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
+	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+
 	int barriers;
+
 	spinlock_t io_lock;
 
 	struct block_device *bdev;
@@ -70,7 +73,11 @@ struct btrfs_fs_devices {
 	u64 num_devices;
 	struct block_device *latest_bdev;
 	struct block_device *lowest_bdev;
+	/* all of the devices in the FS */
 	struct list_head devices;
+
+	/* devices chunk allocation picks from (off-list while it uses them) */
+	struct list_head alloc_list;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From 38b669880df44cba24ab1b01715b87318d469217 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 09:22:11 -0400
Subject: Deal with page == NULL in the btrfs_unplug_io_fn

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f5aca35494..19c258d2c7e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -911,12 +911,40 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
+/*
+ * Call the bdi unplug_io_fn (when set) of every device on this filesystem's
+ * device list.  btrfs_unplug_io_fn falls back to this when page is NULL.
+ */
+static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+{
+	struct list_head *cur;
+	struct btrfs_device *device;
+	struct btrfs_fs_info *info;
+
+	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
+	list_for_each(cur, &info->fs_devices->devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		bdi = blk_get_backing_dev_info(device->bdev); /* reuses arg */
+		if (bdi->unplug_io_fn) {
+			bdi->unplug_io_fn(bdi, page);
+		}
+	}
+}
+
 void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
-	u64 offset = page_offset(page);
+	u64 offset;
+
+	if (!page) {
+		__unplug_io_fn(bdi, page);
+		return;
+	}
+
+	inode = page->mapping->host;
+	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
 	spin_lock(&em_tree->lock);
-- 
cgit v1.2.3


From 409c6118d39cb2d8666bee3d61a1a9ae5bbd4f5d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 09:24:20 -0400
Subject: Btrfs: Set nodatasum on the inode when written by a nodatasum mount

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3f5525f0834..5d537f26dc8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -863,6 +863,14 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
+	/*
+	 * if this is a nodatasum mount, force summing off for the inode
+	 * all the time.  That way a later mount with summing on won't
+	 * get confused
+	 */
+	if (btrfs_test_opt(root, NODATASUM))
+		btrfs_set_flag(inode, NODATASUM);
+
 	/*
 	 * there are lots of better ways to do this, but this code
 	 * makes sure the first and last page in the file range are
-- 
cgit v1.2.3


From e1c4b7451e22f5b0a9fbccfa560ee7b80c35b8cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 13:26:46 -0400
Subject: Fix btrfs_get_extent and get_block corner cases, and disable O_DIRECT
 reads

The generic O_DIRECT code assumes all the bios have the same bdev,
which isn't true for multi-device btrfs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c |  1 +
 fs/btrfs/inode.c     | 32 +++++++++++++++++++++-----------
 fs/btrfs/volumes.c   |  2 +-
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 81f8b4fd069..6e4bf029c6d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1694,6 +1694,7 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	}
 
 	if (bio) {
+		bio->bi_size = 0;
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e875c7c8a64..abfe86df02d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -494,6 +494,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = failed_bio->bi_bdev;
+	bio->bi_size = 0;
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
 	return 0;
@@ -2187,12 +2188,9 @@ again:
 	spin_unlock(&em_tree->lock);
 
 	if (em) {
-		if (em->start > start) {
-			printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n",
-			       start, len, em->start, em->len);
-			WARN_ON(1);
-		}
-		if (em->block_start == EXTENT_MAP_INLINE && page)
+		if (em->start > start || em->start + em->len <= start)
+			free_extent_map(em);
+		else if (em->block_start == EXTENT_MAP_INLINE && page)
 			free_extent_map(em);
 		else
 			goto out;
@@ -2340,7 +2338,6 @@ insert:
 	err = 0;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
 	 * an overlapping map exists in the tree
@@ -2348,6 +2345,11 @@ insert:
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
 		existing = lookup_extent_mapping(em_tree, start, len);
+		if (existing && (existing->start > start ||
+		    existing->start + existing->len <= start)) {
+			free_extent_map(existing);
+			existing = NULL;
+		}
 		if (!existing) {
 			existing = lookup_extent_mapping(em_tree, em->start,
 							 em->len);
@@ -2388,6 +2390,7 @@ out:
 	return em;
 }
 
+#if 0 /* waiting for O_DIRECT reads */
 static int btrfs_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -2405,22 +2408,24 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	if (!em || IS_ERR(em))
 		goto out;
 
-	if (em->start > start || em->start + em->len <= start)
+	if (em->start > start || em->start + em->len <= start) {
 	    goto out;
+	}
 
 	if (em->block_start == EXTENT_MAP_INLINE) {
 		ret = -EINVAL;
 		goto out;
 	}
 
+	len = em->start + em->len - start;
+	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
+
 	if (em->block_start == EXTENT_MAP_HOLE ||
 	    em->block_start == EXTENT_MAP_DELALLOC) {
+		bh_result->b_size = len;
 		goto out;
 	}
 
-	len = em->start + em->len - start;
-	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
 	logical = start - em->start;
 	logical = em->block_start + logical;
 
@@ -2430,6 +2435,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	BUG_ON(ret);
 	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
 	bh_result->b_size = min(map_length, len);
+
 	bh_result->b_bdev = multi->stripes[0].dev->bdev;
 	set_buffer_mapped(bh_result);
 	kfree(multi);
@@ -2437,11 +2443,14 @@ out:
 	free_extent_map(em);
 	return ret;
 }
+#endif
 
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
+	return -EINVAL;
+#if 0
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
@@ -2450,6 +2459,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, btrfs_get_block, NULL);
+#endif
 }
 
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5619e50583e..bccb5566fd8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1161,7 +1161,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	int total_devs = 1;
 
 	length = bio->bi_size;
-
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
@@ -1192,6 +1191,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
-- 
cgit v1.2.3


From bcbfce8abd5f8d3f84eab60a9df1ec147f81c34f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 13:26:47 -0400
Subject: Btrfs: Fix the unplug_io_fn to grab a consistent copy of
 page->mapping

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 13 ++++++++++++-
 fs/btrfs/inode.c   | 16 ++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 19c258d2c7e..1281c393c7e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -936,14 +936,25 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	struct inode *inode;
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
+	struct address_space *mapping;
 	u64 offset;
 
+	/* the generic O_DIRECT read code calls this with a NULL page */
 	if (!page) {
 		__unplug_io_fn(bdi, page);
 		return;
 	}
 
-	inode = page->mapping->host;
+	/*
+	 * page->mapping may change at any time.  Get a consistent copy
+	 * and use that for everything below
+	 */
+	smp_mb();
+	mapping = page->mapping;
+	if (!mapping)
+		return;
+
+	inode = mapping->host;
 	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index abfe86df02d..d58b4d5a1fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -80,6 +80,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	u64 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	u64 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
 	u64 thresh;
+	unsigned long flags;
 	int ret = 0;
 
 	if (for_del)
@@ -89,10 +90,10 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 
 	do_div(thresh, 100);
 
-	spin_lock(&root->fs_info->delalloc_lock);
+	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
 		ret = -ENOSPC;
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	return ret;
 }
 
@@ -275,12 +276,13 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
+	unsigned long flags;
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		spin_lock(&root->fs_info->delalloc_lock);
+		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
 }
@@ -290,7 +292,9 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 {
 	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		spin_lock(&root->fs_info->delalloc_lock);
+		unsigned long flags;
+
+		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 		if (end - start + 1 > root->fs_info->delalloc_bytes) {
 			printk("warning: delalloc account %Lu %Lu\n",
 			       end - start + 1, root->fs_info->delalloc_bytes);
@@ -300,7 +304,7 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 1514794e4253cf4a32a2acc6de52f2527ca1bdce Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Apr 2008 09:22:51 -0400
Subject: Btrfs: Make sure nodes have enough room for a double split

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1c3d9d6fbda..8ad8b5cc295 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1146,7 +1146,7 @@ again:
 				slot -= 1;
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_header_nritems(b) >=
-			    BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
+			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
 				if (sret)
-- 
cgit v1.2.3


From c448acf0a057debc34507277add0fc09b1b37312 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Apr 2008 09:34:34 -0400
Subject: Btrfs: Fix split_node to require more empty slots in the node as well

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8ad8b5cc295..ac999f0060b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1485,7 +1485,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = push_nodes_for_insert(trans, root, path, level);
 		c = path->nodes[level];
 		if (!ret && btrfs_header_nritems(c) <
-		    BTRFS_NODEPTRS_PER_BLOCK(root) - 1)
+		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
 			return 0;
 		if (ret < 0)
 			return ret;
-- 
cgit v1.2.3


From 971a1f6648bfafd4239234f1ffb9c26e907c2744 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Apr 2008 10:54:32 -0400
Subject: Btrfs: Don't empty the middle buffer in push_nodes_for_insert

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ac999f0060b..319f9f9bf8d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -29,7 +29,7 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
 		      struct btrfs_path *path, int data_size, int extend);
 static int push_node_left(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct extent_buffer *dst,
-			  struct extent_buffer *src);
+			  struct extent_buffer *src, int empty);
 static int balance_node_right(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct extent_buffer *dst_buf,
@@ -789,7 +789,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	/* first, try to make some room in the middle buffer */
 	if (left) {
 		orig_slot += btrfs_header_nritems(left);
-		wret = push_node_left(trans, root, left, mid);
+		wret = push_node_left(trans, root, left, mid, 0);
 		if (wret < 0)
 			ret = wret;
 		if (btrfs_header_nritems(mid) < 2)
@@ -800,7 +800,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right) {
-		wret = push_node_left(trans, root, mid, right);
+		wret = push_node_left(trans, root, mid, right, 1);
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
@@ -941,7 +941,7 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				wret = 1;
 			else {
 				wret = push_node_left(trans, root,
-						      left, mid);
+						      left, mid, 0);
 			}
 		}
 		if (wret < 0)
@@ -1239,7 +1239,7 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
  */
 static int push_node_left(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct extent_buffer *dst,
-			  struct extent_buffer *src)
+			  struct extent_buffer *src, int empty)
 {
 	int push_items = 0;
 	int src_nritems;
@@ -1252,12 +1252,17 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_generation(src) != trans->transid);
 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
 
+	if (!empty && src_nritems <= 2)
+		return 1;
+
 	if (push_items <= 0) {
 		return 1;
 	}
 
-	if (src_nritems < push_items)
-		push_items = src_nritems;
+	if (empty)
+		push_items = min(src_nritems, push_items);
+	else
+		push_items = min(src_nritems - 2, push_items);
 
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
-- 
cgit v1.2.3


From bce4eae9864e6b6ebde5d7f05576ff0a6c3724d0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Apr 2008 14:42:46 -0400
Subject: Btrfs: Fix balance_level to free the middle block if there is room in
 the left one

balance_level starts by trying to empty the middle block, and then
pushes from the right to the middle.  This might empty the right block
and leave a small number of pointers in the middle.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 35 ++++++++++++++++++++++++++++-------
 fs/btrfs/extent-tree.c |  4 ++--
 2 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 319f9f9bf8d..29e5674b043 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -789,7 +789,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	/* first, try to make some room in the middle buffer */
 	if (left) {
 		orig_slot += btrfs_header_nritems(left);
-		wret = push_node_left(trans, root, left, mid, 0);
+		wret = push_node_left(trans, root, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
 		if (btrfs_header_nritems(mid) < 2)
@@ -844,6 +844,11 @@ static int balance_level(struct btrfs_trans_handle *trans,
 			ret = wret;
 			goto enospc;
 		}
+		if (wret == 1) {
+			wret = push_node_left(trans, root, left, mid, 1);
+			if (wret < 0)
+				ret = wret;
+		}
 		BUG_ON(wret == 1);
 	}
 	if (btrfs_header_nritems(mid) == 0) {
@@ -1252,17 +1257,27 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_generation(src) != trans->transid);
 	WARN_ON(btrfs_header_generation(dst) != trans->transid);
 
-	if (!empty && src_nritems <= 2)
+	if (!empty && src_nritems <= 8)
 		return 1;
 
 	if (push_items <= 0) {
 		return 1;
 	}
 
-	if (empty)
+	if (empty) {
 		push_items = min(src_nritems, push_items);
-	else
-		push_items = min(src_nritems - 2, push_items);
+		if (push_items < src_nritems) {
+			/* leave at least 8 pointers in the node if
+			 * we aren't going to empty it
+			 */
+			if (src_nritems - push_items < 8) {
+				if (push_items <= 8)
+					return 1;
+				push_items -= 8;
+			}
+		}
+	} else
+		push_items = min(src_nritems - 8, push_items);
 
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
@@ -1308,13 +1323,19 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
-	if (push_items <= 0)
+	if (push_items <= 0) {
 		return 1;
+	}
+
+	if (src_nritems < 4) {
+		return 1;
+	}
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
-	if (max_push >= src_nritems)
+	if (max_push >= src_nritems) {
 		return 1;
+	}
 
 	if (max_push < push_items)
 		push_items = max_push;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cc0d7f30c36..c49592c5127 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -306,13 +306,13 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	int bit;
 	int ret;
 	int full_search = 0;
-	int factor = 8;
+	int factor = 10;
 
 	block_group_cache = &info->block_group_cache;
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
 	if (!owner)
-		factor = 8;
+		factor = 10;
 
 	bit = block_group_state_bits(data);
 
-- 
cgit v1.2.3


From 81d7ed29ff6bdec903c36c26b386e16c014993b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 08:51:48 -0400
Subject: Btrfs: Throttle file_write when data=ordered is flushing the inode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |  1 +
 fs/btrfs/file.c         |  1 +
 fs/btrfs/inode.c        |  4 ++++
 fs/btrfs/ordered-data.c | 13 +++++++++++++
 fs/btrfs/ordered-data.h |  1 +
 fs/btrfs/transaction.c  | 10 ++++++++--
 6 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fe6ef8e3416..5ba83894c8b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -31,6 +31,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
 	struct inode vfs_inode;
+	atomic_t ordered_writeback;
 
 	u64 ordered_trans;
 	/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5d537f26dc8..8effdf4f5d6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -977,6 +977,7 @@ out_nolock:
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
+	btrfs_ordered_throttle(root, inode);
 	return num_written ? num_written : err;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d58b4d5a1fe..b31f52d4f2c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1419,6 +1419,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1728,6 +1729,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1956,6 +1958,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -3292,6 +3295,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 3ee51e10c18..b474902c90e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -269,3 +269,16 @@ int btrfs_del_ordered_inode(struct inode *inode)
 	return ret;
 }
 
+int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
+{ /* loop while ordered_writeback != 0 and running_transaction is unchanged */
+	struct btrfs_transaction *cur = root->fs_info->running_transaction;
+	while(cur == root->fs_info->running_transaction &&
+	      atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
+		congestion_wait(WRITE, HZ/20); /* then re-check both tests */
+#else
+		blk_congestion_wait(WRITE, HZ/20); /* used on <= 2.6.18 */
+#endif
+	}
+	return 0; /* no failure path */
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f25c6771ec6..29047e0abaa 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,4 +39,5 @@ int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
 int btrfs_del_ordered_inode(struct inode *inode);
+int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1ed179c020a..c85cb48d95e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -512,8 +512,11 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
 
-		if (S_ISREG(inode->i_mode))
+		if (S_ISREG(inode->i_mode)) {
+			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
 			filemap_fdatawrite(inode->i_mapping);
+			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
+		}
 		iput(inode);
 
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -530,8 +533,11 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		mutex_unlock(&root->fs_info->fs_mutex);
 
-		if (S_ISREG(inode->i_mode))
+		if (S_ISREG(inode->i_mode)) {
+			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
 			filemap_write_and_wait(inode->i_mapping);
+			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
+		}
 		atomic_dec(&inode->i_count);
 		iput(inode);
 
-- 
cgit v1.2.3


From 004fb5750615bb3cf53e2aa50f6ef1ea57d97df9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 09:00:55 -0400
Subject: Btrfs: write_extent_pages came in 2.6.23

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6e4bf029c6d..c1c980d1fbb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2107,7 +2107,7 @@ done:
 	return 0;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,20)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
 /* Taken directly from 2.6.23 for 2.6.18 back port */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
-- 
cgit v1.2.3


From 84eed90fac1b927a2657ff3bb7a0f18b9cb688f7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 09:04:37 -0400
Subject: Btrfs: Add failure handling for read_sys_array

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 11 +++++++++--
 fs/btrfs/volumes.c | 16 +++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1281c393c7e..71838264ca6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1278,7 +1278,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_lock(&fs_info->fs_mutex);
 
 	ret = btrfs_read_sys_array(tree_root);
-	BUG_ON(ret);
+	if (ret) {
+		printk("btrfs: failed to read the system array on %s\n",
+		       sb->s_id);
+		goto fail_sys_array;
+	}
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_chunk_root_level(disk_super));
@@ -1335,8 +1339,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
-	mutex_unlock(&fs_info->fs_mutex);
 	free_extent_buffer(tree_root->node);
+fail_sys_array:
+	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
 	free_extent_buffer(fs_info->sb_buffer);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
@@ -1344,6 +1349,8 @@ fail_iput:
 	iput(fs_info->btree_inode);
 fail:
 	close_all_devices(fs_info);
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	kfree(extent_root);
 	kfree(tree_root);
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bccb5566fd8..c63a982e31d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1365,14 +1365,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
-	struct btrfs_key key;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	int ret = 0;
 	u32 num_stripes;
 	u32 array_size;
 	u32 len = 0;
-	u8 *ptr;
-	unsigned long sb_ptr;
 	u32 cur;
-	int ret;
+	struct btrfs_key key;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -1397,17 +1397,19 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
 			ret = read_one_chunk(root, &key, sb, chunk);
-			BUG_ON(ret);
+			if (ret)
+				break;
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
-			BUG();
+			ret = -EIO;
+			break;
 		}
 		ptr += len;
 		sb_ptr += len;
 		cur += len;
 	}
-	return 0;
+	return ret;
 }
 
 int btrfs_read_chunk_tree(struct btrfs_root *root)
-- 
cgit v1.2.3


From 5e478dc9828ad33d7b08dcdf277e13f14a7c1be7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 09:10:45 -0400
Subject: Btrfs: write_cache_pages came in 2.6.22

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c1c980d1fbb..2368536629e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2107,7 +2107,7 @@ done:
 	return 0;
 }
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 /* Taken directly from 2.6.23 for 2.6.18 back port */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
-- 
cgit v1.2.3


From 8f18cf13396caae5a3d7ae91201cfb15181a9642 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 16:53:30 -0400
Subject: Btrfs: Make the resizer work based on shrinking and growing devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/disk-io.c     |   4 +
 fs/btrfs/extent-tree.c | 145 +++++++++-------------
 fs/btrfs/inode.c       |  36 ++++--
 fs/btrfs/volumes.c     | 324 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h     |   5 +
 6 files changed, 407 insertions(+), 108 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f00c4be59ad..ac7106ec535 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -474,6 +474,7 @@ struct btrfs_block_group_cache {
 	u64 pinned;
 	u64 flags;
 	int cached;
+	int ro;
 };
 
 struct btrfs_device;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 71838264ca6..a9ce491d279 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -818,6 +818,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->tree_root;
 	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
 		return fs_info->extent_root;
+	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return fs_info->chunk_root;
+	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+		return fs_info->dev_root;
 
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c49592c5127..6540095544e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -187,6 +187,7 @@ static int noinline find_search_start(struct btrfs_root *root,
 
 	if (!cache)
 		goto out;
+
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -196,7 +197,7 @@ again:
 		goto out;
 
 	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data)) {
+	if (!block_group_bits(cache, data) || cache->ro) {
 		goto new_group;
 	}
 
@@ -221,6 +222,8 @@ again:
 			continue;
 		}
 		spin_unlock_irq(&free_space_cache->lock);
+		if (cache->ro)
+			goto new_group;
 		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		if (start + num  > total_fs_bytes)
@@ -319,7 +322,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data)) {
+		if (shint && block_group_bits(shint, data) && !shint->ro) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
@@ -327,7 +330,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && block_group_bits(hint, data) &&
+	if (hint && !hint->ro && block_group_bits(hint, data) &&
 	    hint->key.objectid < total_fs_bytes) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
@@ -364,7 +367,7 @@ again:
 		if (cache->key.objectid > total_fs_bytes)
 			break;
 
-		if (block_group_bits(cache, data)) {
+		if (!cache->ro && block_group_bits(cache, data)) {
 			if (full_search)
 				free_check = cache->key.offset;
 			else
@@ -1020,6 +1023,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (found) {
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
+		found->full = 0;
 		WARN_ON(found->total_bytes < found->bytes_used);
 		*space_info = found;
 		return 0;
@@ -1700,7 +1704,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	u64 super_used;
 	u64 root_used;
 	u64 search_start = 0;
-	u64 new_hint;
 	u64 alloc_profile;
 	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
@@ -1724,7 +1727,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	if (root->ref_cows) {
+	if (root != root->fs_info->extent_root) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
@@ -1738,10 +1741,6 @@ again:
 		BUG_ON(ret);
 	}
 
-	new_hint = max(hint_byte, root->fs_info->alloc_start);
-	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
-		hint_byte = new_hint;
-
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte, ins,
@@ -2473,15 +2472,16 @@ out:
 	return ret;
 }
 
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
+int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_path *path;
 	u64 cur_byte;
 	u64 total_found;
+	u64 shrink_last_byte;
+	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -2489,17 +2489,29 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	int ret;
 	int progress = 0;
 
-	btrfs_set_super_total_bytes(&info->super_copy, new_size);
-	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
-			   GFP_NOFS);
-	block_group_cache = &info->block_group_cache;
+	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
+						      shrink_start);
+	BUG_ON(!shrink_block_group);
+
+	shrink_last_byte = shrink_start + shrink_block_group->key.offset;
+
+	shrink_block_group->space_info->total_bytes -=
+		shrink_block_group->key.offset;
+printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags);
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
 again:
+	trans = btrfs_start_transaction(root, 1);
+	do_chunk_alloc(trans, root->fs_info->extent_root,
+			btrfs_block_group_used(&shrink_block_group->item) +
+			2 * 1024 * 1024, shrink_block_group->flags);
+	btrfs_end_transaction(trans, root);
+	shrink_block_group->ro = 1;
+
 	total_found = 0;
-	key.objectid = new_size;
+	key.objectid = shrink_start;
 	key.offset = 0;
 	key.type = 0;
 	cur_byte = key.objectid;
@@ -2511,10 +2523,12 @@ again:
 	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		goto out;
+
 	if (ret == 0) {
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid + found_key.offset > new_size) {
+		if (found_key.objectid + found_key.offset > shrink_start &&
+		    found_key.objectid < shrink_last_byte) {
 			cur_byte = found_key.objectid;
 			key.objectid = cur_byte;
 		}
@@ -2543,6 +2557,9 @@ next:
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
+		if (found_key.objectid >= shrink_last_byte)
+			break;
+
 		if (progress && need_resched()) {
 			memcpy(&key, &found_key, sizeof(key));
 			mutex_unlock(&root->fs_info->fs_mutex);
@@ -2583,68 +2600,31 @@ next:
 		goto again;
 	}
 
+	/*
+	 * we've freed all the extents, now remove the block
+	 * group item from the tree
+	 */
 	trans = btrfs_start_transaction(root, 1);
-	key.objectid = new_size;
-	key.offset = 0;
-	key.type = 0;
-	while(1) {
-		u64 ptr;
-
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0)
-			goto out;
-
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-bg_next:
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
-				break;
-			if (ret == 1) {
-				ret = 0;
-				break;
-			}
-			leaf = path->nodes[0];
-			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	memcpy(&key, &shrink_block_group->key, sizeof(key));
 
-			/*
-			 * btrfs_next_leaf doesn't cow buffers, we have to
-			 * do the search again
-			 */
-			memcpy(&key, &found_key, sizeof(key));
-			btrfs_release_path(root, path);
-			goto resched_check;
-		}
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
 
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			printk("shrinker found key %Lu %u %Lu\n",
-				found_key.objectid, found_key.type,
-				found_key.offset);
-			path->slots[0]++;
-			goto bg_next;
-		}
-		ret = get_state_private(&info->block_group_cache,
-					found_key.objectid, &ptr);
-		if (!ret)
-			kfree((void *)(unsigned long)ptr);
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	kfree(shrink_block_group);
 
-		clear_extent_bits(&info->block_group_cache, found_key.objectid,
-				  found_key.objectid + found_key.offset - 1,
-				  (unsigned int)-1, GFP_NOFS);
+	clear_extent_bits(&info->block_group_cache, found_key.objectid,
+			  found_key.objectid + found_key.offset - 1,
+			  (unsigned int)-1, GFP_NOFS);
 
-		key.objectid = found_key.objectid + 1;
-		btrfs_del_item(trans, root, path);
-		btrfs_release_path(root, path);
-resched_check:
-		if (need_resched()) {
-			mutex_unlock(&root->fs_info->fs_mutex);
-			cond_resched();
-			mutex_lock(&root->fs_info->fs_mutex);
-		}
-	}
-	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
+	btrfs_del_item(trans, root, path);
+	clear_extent_dirty(&info->free_space_cache,
+			   shrink_start, shrink_last_byte - 1,
 			   GFP_NOFS);
 	btrfs_commit_transaction(trans, root);
 out:
@@ -2652,13 +2632,6 @@ out:
 	return ret;
 }
 
-int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 new_size)
-{
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size);
-	return 0;
-}
-
 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
 			   struct btrfs_key *key)
 {
@@ -2726,7 +2699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		cache = kmalloc(sizeof(*cache), GFP_NOFS);
+		cache = kzalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
 			ret = -ENOMEM;
 			break;
@@ -2736,8 +2709,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
-		cache->cached = 0;
-		cache->pinned = 0;
 
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
@@ -2789,12 +2760,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
-	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
-	cache->cached = 0;
-	cache->pinned = 0;
 
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 	memset(&cache->item, 0, sizeof(cache->item));
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b31f52d4f2c..4d12aa532c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/kernel.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
@@ -2887,9 +2888,12 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
+	u64 devid = 1;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device = NULL;
 	char *sizestr;
+	char *devstr = NULL;
 	int ret = 0;
 	int namelen;
 	int mod = 0;
@@ -2909,9 +2913,25 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	sizestr = vol_args->name;
+	devstr = strchr(sizestr, ':');
+	if (devstr) {
+		char *end;
+		sizestr = devstr + 1;
+		*devstr = '\0';
+		devstr = vol_args->name;
+		devid = simple_strtoull(devstr, &end, 10);
+printk("resizing devid %Lu\n", devid);
+	}
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		printk("resizer unable to find device %Lu\n", devid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
 	if (!strcmp(sizestr, "max"))
-		new_size = root->fs_info->sb->s_bdev->bd_inode->i_size;
+		new_size = device->bdev->bd_inode->i_size;
 	else {
 		if (sizestr[0] == '-') {
 			mod = -1;
@@ -2923,12 +2943,11 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		new_size = btrfs_parse_size(sizestr);
 		if (new_size == 0) {
 			ret = -EINVAL;
-			goto out;
+			goto out_unlock;
 		}
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	old_size = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	old_size = device->total_bytes;
 
 	if (mod < 0) {
 		if (new_size > old_size) {
@@ -2944,7 +2963,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-	if (new_size > root->fs_info->sb->s_bdev->bd_inode->i_size) {
+	if (new_size > device->bdev->bd_inode->i_size) {
 		ret = -EFBIG;
 		goto out_unlock;
 	}
@@ -2952,13 +2971,14 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	do_div(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
-printk("new size is %Lu\n", new_size);
+printk("new size for %s is %llu\n", device->name, (unsigned long long)new_size);
+
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_grow_extent_tree(trans, root, new_size);
+		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
-		ret = btrfs_shrink_extent_tree(root, new_size);
+		ret = btrfs_shrink_device(device, new_size);
 	}
 
 out_unlock:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c63a982e31d..a2c56de1548 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -77,7 +77,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (dev->devid == devid &&
-		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
+		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
 		}
 	}
@@ -293,6 +293,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	 * so we make sure to start at an offset of at least 1MB
 	 */
 	search_start = max((u64)1024 * 1024, search_start);
+
+	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+		search_start = max(root->fs_info->alloc_start, search_start);
+
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -380,6 +384,33 @@ error:
 	return ret;
 }
 
+/* delete the dev extent item of @device that starts at offset @start */
+int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_device *device, u64 start)
+{
+	struct btrfs_root *dev_root = device->dev_root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_key key;
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	/* the item is looked up by the exact key (devid, DEV_EXTENT, start) */
+	key.objectid = device->devid;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	key.offset = start;
+
+	err = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	BUG_ON(err);	/* any nonzero result is treated as fatal */
+
+	err = btrfs_del_item(trans, dev_root, path);
+	BUG_ON(err);
+
+	btrfs_free_path(path);
+	return err;
+}
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
@@ -560,6 +591,7 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
@@ -606,6 +638,254 @@ out:
 	return ret;
 }
 
+/* grow @device to @new_size and add the growth to the super block total */
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_super_block *sb = &device->dev_root->fs_info->super_copy;
+	u64 diff = new_size - device->total_bytes;
+
+	btrfs_set_super_total_bytes(sb, btrfs_super_total_bytes(sb) + diff);
+	device->total_bytes = new_size;	/* as btrfs_shrink_device does */
+	return btrfs_update_device(trans, device);
+}
+
+/* delete the chunk item keyed (chunk_objectid, CHUNK_ITEM, chunk_offset) */
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 chunk_tree,
+			    u64 chunk_objectid, u64 chunk_offset)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_key key;
+	int err;
+
+	/* @chunk_tree is not consulted, the fs-wide chunk root is always used */
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = chunk_objectid;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
+
+	err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
+	BUG_ON(err);
+
+	err = btrfs_del_item(trans, chunk_root, path);
+	BUG_ON(err);
+
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * remove the entry for (chunk_objectid, chunk_offset) from the copy of the
+ * sys_chunk_array held in fs_info->super_copy and shrink the recorded
+ * array size to match.  Returns 0 (also when nothing matched), or -EIO as
+ * soon as a key that is not a chunk item is seen; removals done before
+ * that point are kept.
+ */
+int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
+			u64 chunk_offset)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	u32 array_size = btrfs_super_sys_array_size(super_copy);
+	u8 *ptr = super_copy->sys_chunk_array;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	u32 cur = 0;
+	u32 len;
+
+	while (cur < array_size) {
+		/* each entry is a disk key directly followed by its chunk */
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+		if (key.type != BTRFS_CHUNK_ITEM_KEY)
+			return -EIO;
+
+		chunk = (struct btrfs_chunk *)(ptr + sizeof(*disk_key));
+		len = sizeof(*disk_key) + btrfs_chunk_item_size(
+				btrfs_stack_chunk_num_stripes(chunk));
+
+		if (key.objectid != chunk_objectid ||
+		    key.offset != chunk_offset) {
+			/* not the one we want, step over it */
+			ptr += len;
+			cur += len;
+			continue;
+		}
+
+		/* slide the tail down over the entry; ptr and cur stay put */
+		memmove(ptr, ptr + len, array_size - (cur + len));
+		array_size -= len;
+		btrfs_set_super_sys_array_size(super_copy, array_size);
+	}
+	return 0;
+}
+
+/* relocate the extents of the chunk at @chunk_offset, then delete it */
+int btrfs_relocate_chunk(struct btrfs_root *root,
+			 u64 chunk_tree, u64 chunk_objectid,
+			 u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree;
+	struct btrfs_root *extent_root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct map_lookup *map;
+	int ret;
+	int i;
+	/* whichever root was passed in, the chunk root is used from here on */
+	root = root->fs_info->chunk_root;
+	extent_root = root->fs_info->extent_root;
+	em_tree = &root->fs_info->mapping_tree.map_tree;
+
+	/* step one, relocate all the extents inside this chunk */
+	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	/*
+	 * step two, delete the device extents and the
+	 * chunk tree entries
+	 */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	spin_unlock(&em_tree->lock);
+	/* NOTE(review): em is dereferenced below without a NULL check */
+	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	map = (struct map_lookup *)em->bdev;	/* map is carried in bdev */
+	/* free the dev extent behind every stripe of the chunk */
+	for (i = 0; i < map->num_stripes; i++) {
+		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+					    map->stripes[i].physical);
+		BUG_ON(ret);
+	}
+	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+			       chunk_offset);
+
+	BUG_ON(ret);
+
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+		BUG_ON(ret);
+		goto out;
+	}
+	/* NOTE(review): system chunks took the goto above, so their mapping
+	 * and map_lookup stay in the tree -- confirm that this is intended
+	 */
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+
+out:
+	/* once for us */
+	free_extent_map(em);
+
+	btrfs_end_transaction(trans, root);
+	return 0;	/* every failure above is a BUG_ON */
+}
+
+/*
+ * shrinking a device means finding all of the device extents that end
+ * past the new size, and then following the back refs to the chunks.
+ * The chunk relocation code actually frees the device extent.
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	u64 old_total = btrfs_super_total_bytes(super_copy);
+	u64 diff = device->total_bytes - new_size;
+
+	/* diff is unsigned: this relies on new_size <= device->total_bytes */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	path->reada = 2;
+	/* first record the smaller size and pass it to btrfs_update_device */
+	device->total_bytes = new_size;
+	ret = btrfs_update_device(trans, device);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		goto done;
+	}
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	btrfs_end_transaction(trans, root);
+	/* then walk this device's dev extents from the highest offset down */
+	key.objectid = device->devid;
+	key.offset = (u64)-1;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto done;
+
+		ret = btrfs_previous_item(root, path, 0, key.type);
+		if (ret < 0)
+			goto done;
+		if (ret) {	/* presumably: no earlier item exists */
+			ret = 0;
+			goto done;
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+		/* stop once the item found belongs to a different devid */
+		if (key.objectid != device->devid)
+			goto done;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+		/* stop at the first extent ending at or below the new size */
+		if (key.offset + length <= new_size)
+			goto done;
+		/* copy the chunk back ref out of the leaf before dropping it */
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+		btrfs_release_path(root, path);
+
+		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+					   chunk_offset);
+		if (ret)
+			goto done;
+	}
+
+done:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
@@ -658,6 +938,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_path *path;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
@@ -724,6 +1005,10 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
@@ -759,11 +1044,19 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
+
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_alloc_list, &private_devs);
-			index++;
-			if (type & BTRFS_BLOCK_GROUP_DUP)
+			u64 ignored_start = 0;
+			ret = find_free_dev_extent(trans, device, path,
+						   min_free,
+						   &ignored_start);
+			if (ret == 0) {
+				list_move_tail(&device->dev_alloc_list,
+					       &private_devs);
 				index++;
+				if (type & BTRFS_BLOCK_GROUP_DUP)
+					index++;
+			}
 		} else if (avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
@@ -785,30 +1078,37 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
+		btrfs_free_path(path);
 		return -ENOSPC;
 	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 			      &key.offset);
-	if (ret)
+	if (ret) {
+		btrfs_free_path(path);
 		return ret;
+	}
 
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk)
+	if (!chunk) {
+		btrfs_free_path(path);
 		return -ENOMEM;
+	}
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		kfree(chunk);
+		btrfs_free_path(path);
 		return -ENOMEM;
 	}
+	btrfs_free_path(path);
+	path = NULL;
 
 	stripes = &chunk->stripe;
 	*num_bytes = chunk_bytes_by_type(type, calc_size,
 					 num_stripes, sub_stripes);
 
-
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
@@ -874,6 +1174,11 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	em->len = *num_bytes;
 	em->block_start = 0;
 
+	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
+				    chunk, btrfs_chunk_item_size(num_stripes));
+		BUG_ON(ret);
+	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -1376,11 +1681,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
-	/*
-	 * we do this loop twice, once for the device items and
-	 * once for all of the chunks.  This way there are device
-	 * structs filled in for every chunk
-	 */
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a2660d2b6b3..6fe8440b37a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -128,4 +128,9 @@ int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+				       u8 *uuid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 #endif
-- 
cgit v1.2.3


From 3bf3d9e9c256e1a249a47bb8ceff682e6430aeff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 26 Apr 2008 11:03:32 -0400
Subject: Btrfs: Avoid recursive chunk allocations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6540095544e..a589912fdd5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1727,7 +1727,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	if (root != root->fs_info->extent_root) {
+	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
-- 
cgit v1.2.3


From 8e7bf94fd5f44fa585e29fbe6a1bfabc04aea7cf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 09:02:36 -0400
Subject: Btrfs: Do more optimal file RA during shrinking and defrag

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 16 +++++++++++++++-
 fs/btrfs/inode.c       | 15 +++++++--------
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a589912fdd5..fe4fe709c31 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2263,6 +2263,12 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	return 0;
 }
 
+static unsigned long calc_ra(unsigned long start, unsigned long last,
+			     unsigned long nr)
+{
+	/* end index of an @nr page window at @start, capped at @last */ return start + nr - 1 < last ? start + nr - 1 : last;
+}
+
 static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 					 u64 len)
 {
@@ -2275,6 +2281,8 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	struct page *page;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
+	unsigned long total_read = 0;
+	unsigned long ra_pages;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
@@ -2282,11 +2290,17 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	i = start >> PAGE_CACHE_SHIFT;
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
+	ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
+
 	file_ra_state_init(ra, inode->i_mapping);
-	btrfs_force_ra(inode->i_mapping, ra, NULL, i, last_index);
 	kfree(ra);
 
 	for (; i <= last_index; i++) {
+		if (total_read % ra_pages == 0) {
+			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+				       calc_ra(i, last_index, ra_pages));
+		}
+		total_read++;
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
 			goto out_unlock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4d12aa532c5..ab707c0930d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2814,14 +2814,12 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index)
 {
-	pgoff_t req_size;
+	pgoff_t req_size = last_index - offset + 1;
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-	req_size = last_index - offset + 1;
 	offset = page_cache_readahead(mapping, ra, file, offset, req_size);
 	return offset;
 #else
-	req_size = min(last_index - offset + 1, (pgoff_t)128);
 	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
 	return offset + req_size;
 #endif
@@ -2833,7 +2831,8 @@ int btrfs_defrag_file(struct file *file) {
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct page *page;
 	unsigned long last_index;
-	unsigned long ra_index = 0;
+	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
+	unsigned long total_read = 0;
 	u64 page_start;
 	u64 page_end;
 	unsigned long i;
@@ -2848,11 +2847,11 @@ int btrfs_defrag_file(struct file *file) {
 	mutex_lock(&inode->i_mutex);
 	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
 	for (i = 0; i <= last_index; i++) {
-		if (i == ra_index) {
-			ra_index = btrfs_force_ra(inode->i_mapping,
-						  &file->f_ra,
-						  file, ra_index, last_index);
+		if (total_read % ra_pages == 0) {
+			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
+				       min(last_index, i + ra_pages - 1));
 		}
+		total_read++;
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
 			goto out_unlock;
-- 
cgit v1.2.3


From 788f20eb5affef584e75ea84bb80a4c3352a2c0e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:42 -0400
Subject: Btrfs: Add new ioctl to add devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/inode.c   | 23 +++++++++++++++++
 fs/btrfs/ioctl.h   |  6 +++++
 fs/btrfs/super.c   |  1 +
 fs/btrfs/volumes.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 6 files changed, 108 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ac7106ec535..b9257b37bb9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -551,6 +551,8 @@ struct btrfs_fs_info {
 	u64 data_alloc_profile;
 	u64 metadata_alloc_profile;
 	u64 system_alloc_profile;
+
+	void *bdev_holder;
 };
 
 /*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ab707c0930d..1a74b501869 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3070,6 +3070,40 @@ static int btrfs_ioctl_defrag(struct file *file)
 	return 0;
 }
 
+/*
+ * BTRFS_IOC_ADD_DEV handler: copy a struct btrfs_ioctl_vol_args in from
+ * userspace and pass the device path in ->name to btrfs_init_new_device().
+ *
+ * Returns -ENOMEM if the args can't be allocated, -EFAULT if they can't be
+ * copied in, and otherwise whatever btrfs_init_new_device() returns.
+ */
+long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/*
+	 * the path comes from userspace and may not be terminated, force
+	 * a NUL so it can be used as a C string (assumes ->name is a
+	 * char array inside the struct)
+	 */
+	vol_args->name[sizeof(vol_args->name) - 1] = '\0';
+
+	ret = btrfs_init_new_device(root, vol_args->name);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3082,6 +3103,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
 		return btrfs_ioctl_resize(root, (void __user *)arg);
+	case BTRFS_IOC_ADD_DEV:
+		return btrfs_ioctl_add_dev(root, (void __user *)arg);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 4551e82013c..8ad35fc4ba5 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -36,4 +36,10 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 89286490688..7153dfaa340 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -388,6 +388,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 			goto error;
 		}
 
+		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		s->s_flags |= MS_ACTIVE;
 	}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2c56de1548..b93c15aa17d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -19,6 +19,8 @@
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
+#include <linux/err.h>
+#include <linux/random.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -592,6 +593,99 @@ out:
 	return ret;
 }
 
+/*
+ * add the block device named by device_path to a mounted filesystem.
+ *
+ * The device is opened exclusively on behalf of fs_info->bdev_holder,
+ * recorded with btrfs_add_device() inside a transaction and then linked
+ * into the in-memory device lists.  The total_bytes and num_devices
+ * fields of the super block copy are bumped to match.
+ *
+ * Returns 0 on success, -EEXIST if the block device is already on the
+ * device list, -ENOMEM if an allocation fails, or the error reported by
+ * open_bdev_excl() or btrfs_add_device().  On any failure the exclusive
+ * open taken here is dropped again.
+ */
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct list_head *cur;
+	struct list_head *devices;
+	u64 total_bytes;
+	u64 num_devices;
+	int ret = 0;
+
+	/* open_bdev_excl() reports failure with ERR_PTR(), not NULL */
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	devices = &root->fs_info->fs_devices->devices;
+	list_for_each(cur, devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev == bdev) {
+			/* still have to undo the open we just did */
+			ret = -EEXIST;
+			goto out_close_bdev;
+		}
+	}
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device) {
+		ret = -ENOMEM;
+		goto out_close_bdev;
+	}
+
+	device->barriers = 1;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->name = kstrdup(device_path, GFP_NOFS);
+	if (!device->name) {
+		ret = -ENOMEM;
+		goto out_free_device;
+	}
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = bdev;
+
+	ret = btrfs_add_device(trans, root, device);
+	if (ret)
+		goto out_free_name;
+
+	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+				    total_bytes + device->total_bytes);
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+				    num_devices + 1);
+
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+out:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+
+	/* the device is not on any list yet, so it is ours to tear down */
+out_free_name:
+	kfree(device->name);
+out_free_device:
+	kfree(device);
+out_close_bdev:
+	close_bdev_excl(bdev);
+	goto out;
+}
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6fe8440b37a..6f173450378 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -133,4 +133,5 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 				       u8 *uuid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
 #endif
-- 
cgit v1.2.3


From ec44a35cbeb26ab2da84cb280d778260f2312feb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:52 -0400
Subject: Btrfs: Add balance ioctl to restripe the chunks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/extent-tree.c | 106 +++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/inode.c       |  11 +++++
 fs/btrfs/volumes.c     | 115 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/volumes.h     |   1 +
 5 files changed, 208 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9257b37bb9..73b92dd150f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1364,7 +1364,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, int data);
+		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe4fe709c31..95aee5a2937 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/pagemap.h>
+#include <linux/writeback.h>
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -1058,6 +1059,31 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
+/*
+ * when more than one replication profile is set in 'flags', reduce them
+ * to a single one: dup is dropped in favour of raid1/raid10, raid1 in
+ * favour of raid10, and raid0 in favour of any of the other three.
+ * All remaining bits are returned untouched.
+ */
+static u64 reduce_alloc_profile(u64 flags)
+{
+	const u64 mirrored = BTRFS_BLOCK_GROUP_RAID1 |
+			     BTRFS_BLOCK_GROUP_RAID10;
+
+	if ((flags & BTRFS_BLOCK_GROUP_DUP) && (flags & mirrored))
+		flags &= ~BTRFS_BLOCK_GROUP_DUP;
+
+	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (flags & BTRFS_BLOCK_GROUP_RAID10))
+		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+
+	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
+	    (flags & (mirrored | BTRFS_BLOCK_GROUP_DUP)))
+		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+
+	return flags;
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1068,6 +1089,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int ret;
 
+	flags = reduce_alloc_profile(flags);
+
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
 		ret = update_space_info(extent_root->fs_info, flags,
@@ -1684,6 +1707,7 @@ enospc:
 error:
 	return ret;
 }
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -1697,7 +1721,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, int data)
+		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
 	int pending_ret;
@@ -1727,6 +1751,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
+	data = reduce_alloc_profile(data);
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -1752,6 +1777,9 @@ again:
 		num_bytes = max(num_bytes, min_alloc_size);
 		goto again;
 	}
+	if (ret) {
+		printk("allocation failed flags %Lu\n", data);
+	}
 	BUG_ON(ret);
 	if (ret)
 		return ret;
@@ -2274,8 +2302,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 {
 	u64 page_start;
 	u64 page_end;
-	u64 delalloc_start;
-	u64 existing_delalloc;
 	unsigned long last_index;
 	unsigned long i;
 	struct page *page;
@@ -2293,7 +2319,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
 
 	file_ra_state_init(ra, inode->i_mapping);
-	kfree(ra);
 
 	for (; i <= last_index; i++) {
 		if (total_read % ra_pages == 0) {
@@ -2313,26 +2338,30 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				goto out_unlock;
 			}
 		}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(page);
+#else
+		cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+		wait_on_page_writeback(page);
+		set_page_extent_mapped(page);
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-		delalloc_start = page_start;
-		existing_delalloc = count_range_bits(io_tree,
-					     &delalloc_start, page_end,
-					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-
+		set_page_dirty(page);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 	}
 
 out_unlock:
+	kfree(ra);
 	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
@@ -2397,8 +2426,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			goto out;
 		}
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
-		/* FIXME, data=ordered will help get rid of this */
-		filemap_fdatawrite(inode->i_mapping);
 		iput(inode);
 		mutex_lock(&extent_root->fs_info->fs_mutex);
 	} else {
@@ -2486,6 +2513,49 @@ out:
 	return ret;
 }
 
+/*
+ * compute the allocation flags to request in place of 'flags', based on
+ * the number of devices recorded in the super block copy.
+ *
+ * One device:  raid0 is turned into plain single device chunks and
+ *              raid1/raid10 into dup; other flags come back unchanged.
+ * Otherwise:   flags that already have a raid0/raid1/raid10 bit come back
+ *              unchanged, dup is turned into raid1 and everything else
+ *              into raid0.
+ */
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+	u64 num_devices;
+	u64 raid_bits = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+	/* flags with every raid and dup bit cleared */
+	u64 stripped = flags & ~(raid_bits | BTRFS_BLOCK_GROUP_DUP);
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	if (num_devices == 1) {
+		/* turn raid0 into single device chunks */
+		if (flags & BTRFS_BLOCK_GROUP_RAID0)
+			return stripped;
+
+		/* turn mirroring into duplication */
+		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+			     BTRFS_BLOCK_GROUP_RAID10))
+			return stripped | BTRFS_BLOCK_GROUP_DUP;
+		return flags;
+	}
+
+	/* they already had raid on here, just return */
+	if (flags & raid_bits)
+		return flags;
+
+	/* switch duplicated blocks with raid1 */
+	if (flags & BTRFS_BLOCK_GROUP_DUP)
+		return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+	/* turn single device chunks into raid0 */
+	return stripped | BTRFS_BLOCK_GROUP_RAID0;
+}
+
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
@@ -2494,6 +2562,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 cur_byte;
 	u64 total_found;
 	u64 shrink_last_byte;
+	u64 new_alloc_flags;
 	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
@@ -2511,17 +2580,20 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 
 	shrink_block_group->space_info->total_bytes -=
 		shrink_block_group->key.offset;
-printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags);
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
 again:
-	trans = btrfs_start_transaction(root, 1);
-	do_chunk_alloc(trans, root->fs_info->extent_root,
+	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+		trans = btrfs_start_transaction(root, 1);
+		new_alloc_flags = update_block_group_flags(root,
+						   shrink_block_group->flags);
+		do_chunk_alloc(trans, root->fs_info->extent_root,
 			btrfs_block_group_used(&shrink_block_group->item) +
-			2 * 1024 * 1024, shrink_block_group->flags);
-	btrfs_end_transaction(trans, root);
+			2 * 1024 * 1024, new_alloc_flags);
+		btrfs_end_transaction(trans, root);
+	}
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1a74b501869..99483447459 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2864,6 +2864,15 @@ int btrfs_defrag_file(struct file *file) {
 				goto out_unlock;
 			}
 		}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(page);
+#else
+		cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+		wait_on_page_writeback(page);
+		set_page_extent_mapped(page);
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -3105,6 +3114,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_resize(root, (void __user *)arg);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_BALANCE:
+		return btrfs_balance(root->fs_info->dev_root);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b93c15aa17d..6476ecbf132 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -869,6 +869,117 @@ out:
 	return 0;
 }
 
+/*
+ * scale num by factor/10: a factor of 1 gives a tenth of num, and a
+ * factor of 10 returns num unchanged without doing the division.
+ */
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
+/*
+ * relocate every chunk in the chunk tree except chunk zero.
+ *
+ * Step one shrinks and regrows each device whose unallocated space is at
+ * or below a small reserve (a tenth of the device, capped at 1MB).  Step
+ * two walks the chunk items from the highest offset down and calls
+ * btrfs_relocate_chunk() on each one.
+ *
+ * fs_mutex is held for the whole operation.  Returns 0 once the walk is
+ * done, or the negative error from btrfs_search_slot().  A failure to
+ * shrink, grow or relocate trips a BUG_ON.
+ */
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+	int ret;
+	struct list_head *cur;
+	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+	struct btrfs_device *device;
+	u64 old_size;
+	u64 size_to_free;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key found_key;
+
+	dev_root = dev_root->fs_info->dev_root;
+
+	mutex_lock(&dev_root->fs_info->fs_mutex);
+	/* step one make some room on all the devices */
+	list_for_each(cur, devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		old_size = device->total_bytes;
+		size_to_free = div_factor(old_size, 1);
+		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+		if (device->total_bytes - device->bytes_used > size_to_free)
+			continue;
+
+		ret = btrfs_shrink_device(device, old_size - size_to_free);
+		BUG_ON(ret);
+
+		trans = btrfs_start_transaction(dev_root, 1);
+		BUG_ON(!trans);
+
+		ret = btrfs_grow_device(trans, device, old_size);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, dev_root);
+	}
+
+	/* step two, relocate all the chunks */
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+
+		/*
+		 * this shouldn't happen, it means the last relocate
+		 * failed
+		 */
+		if (ret == 0)
+			break;
+
+		ret = btrfs_previous_item(chunk_root, path, 0,
+					  BTRFS_CHUNK_ITEM_KEY);
+		if (ret) {
+			break;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		if (found_key.objectid != key.objectid)
+			break;
+		key.offset = found_key.offset;
+		/* chunk zero is special */
+		if (key.offset == 0)
+			break;
+
+		ret = btrfs_relocate_chunk(chunk_root,
+					   chunk_root->root_key.objectid,
+					   found_key.objectid,
+					   found_key.offset);
+		BUG_ON(ret);
+		btrfs_release_path(chunk_root, path);
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->fs_mutex);
+	return ret;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -985,15 +1086,6 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
 static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
 			       int sub_stripes)
 {
@@ -1040,6 +1132,11 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (type & BTRFS_BLOCK_GROUP_DUP)) {
+		WARN_ON(1);
+		type &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6f173450378..3f9a17f2e41 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -134,4 +134,5 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 				       u8 *uuid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
 #endif
-- 
cgit v1.2.3


From 4235298e4fc3c1a09f659cfe2fd285024eeb2241 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 16:40:52 -0400
Subject: Btrfs: Drop some verbose printks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 18 +++++-------------
 fs/btrfs/extent_io.c | 13 +++++--------
 fs/btrfs/volumes.c   |  2 --
 3 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a9ce491d279..1c7d84aff86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -214,25 +214,18 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 					       btree_get_extent, mirror_num);
-		if (!ret) {
-			if (mirror_num)
-printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+		if (!ret)
 			return ret;
-		}
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
-printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
-		if (num_copies == 1) {
-printk("reading %Lu failed only one copy\n", eb->start);
+		if (num_copies == 1)
 			return ret;
-		}
+
 		mirror_num++;
-		if (mirror_num > num_copies) {
-printk("bailing at mirror %d of %d\n", mirror_num, num_copies);
+		if (mirror_num > num_copies)
 			return ret;
-		}
 	}
-printk("read extent buffer page last\n");
 	return -EIO;
 }
 
@@ -322,7 +315,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-printk("bad start on %Lu found %Lu\n", eb->start, found_start);
 		ret = -EIO;
 		goto err;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2368536629e..dd403b426ff 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3003,17 +3003,18 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb)
 {
 	int ret = 0;
-	int ret2;
 	unsigned long num_pages;
 	unsigned long i;
 	struct page *page;
 	int pg_uptodate = 1;
 
 	if (eb->flags & EXTENT_UPTODATE)
-		ret = 1;
+		return 1;
 
-	ret2  = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
 			   EXTENT_UPTODATE, 1);
+	if (ret)
+		return ret;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
@@ -3023,11 +3024,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 			break;
 		}
 	}
-	if ((ret || ret2) && !pg_uptodate) {
-printk("uptodate error2 eb %Lu ret %d ret2 %d pg_uptodate %d\n", eb->start, ret, ret2, pg_uptodate);
-		WARN_ON(1);
-	}
-	return (ret || ret2);
+	return pg_uptodate;
 }
 EXPORT_SYMBOL(extent_buffer_uptodate);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6476ecbf132..9a724113456 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1282,7 +1282,6 @@ again:
 					 num_stripes, sub_stripes);
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
@@ -1299,7 +1298,6 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
 			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
-- 
cgit v1.2.3


From a236aed14ccb0661611d4416f6b573d892bdc60a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 09:38:00 -0400
Subject: Btrfs: Deal with failed writes in mirrored configurations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 17 +++++++++++++++--
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/volumes.c     | 17 ++++++++++++++---
 fs/btrfs/volumes.h     |  3 ++-
 4 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1c7d84aff86..e35e70165b5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1385,7 +1385,10 @@ int write_all_supers(struct btrfs_root *root)
 	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
+	int max_errors;
+	int total_errors = 0;
 
+	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
 	sb = root->fs_info->sb_buffer;
@@ -1433,8 +1436,14 @@ int write_all_supers(struct btrfs_root *root)
 		} else {
 			ret = submit_bh(WRITE, bh);
 		}
-		BUG_ON(ret);
+		if (ret)
+			total_errors++;
 	}
+	if (total_errors > max_errors) {
+		printk("btrfs: %d errors while writing supers\n", total_errors);
+		BUG();
+	}
+	total_errors = 0;
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
@@ -1454,13 +1463,17 @@ int write_all_supers(struct btrfs_root *root)
 				wait_on_buffer(bh);
 				BUG_ON(!buffer_uptodate(bh));
 			} else {
-				BUG();
+				total_errors++;
 			}
 
 		}
 		dev->pending_io = NULL;
 		brelse(bh);
 	}
+	if (total_errors > max_errors) {
+		printk("btrfs: %d errors while writing supers\n", total_errors);
+		BUG();
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 95aee5a2937..f94794a9932 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,8 +315,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	block_group_cache = &info->block_group_cache;
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
-	if (!owner)
-		factor = 10;
+	if (data & BTRFS_BLOCK_GROUP_METADATA)
+		factor = 9;
 
 	bit = block_group_state_bits(data);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a724113456..57ab755aca7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1425,6 +1425,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripe_index;
 	int i;
 	int num_stripes;
+	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -1436,6 +1437,8 @@ again:
 				GFP_NOFS);
 		if (!multi)
 			return -ENOMEM;
+
+		atomic_set(&multi->error, 0);
 	}
 
 	spin_lock(&em_tree->lock);
@@ -1462,8 +1465,10 @@ again:
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			stripes_required = map->num_stripes;
+			max_errors = 1;
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 			stripes_required = map->sub_stripes;
+			max_errors = 1;
 		}
 	}
 	if (multi_ret && rw == WRITE &&
@@ -1561,6 +1566,7 @@ again:
 	if (multi_ret) {
 		*multi_ret = multi;
 		multi->num_stripes = num_stripes;
+		multi->max_errors = max_errors;
 	}
 out:
 	free_extent_map(em);
@@ -1598,14 +1604,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 		return 1;
 #endif
 	if (err)
-		multi->error = err;
+		atomic_inc(&multi->error);
 
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
-		if (!err && multi->error)
-			err = multi->error;
+		/* only send an error to the higher layers if it is
+		 * beyond the tolerance of the multi-bio
+		 */
+		if (atomic_read(&multi->error) > multi->max_errors)
+			err = -EIO;
+		else
+			err = 0;
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3f9a17f2e41..a9663e92bb1 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,7 +90,8 @@ struct btrfs_multi_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
 	void *private;
-	int error;
+	atomic_t error;
+	int max_errors;
 	int num_stripes;
 	struct btrfs_bio_stripe stripes[];
 };
-- 
cgit v1.2.3


From 2fff734fafa742236aecbdcdc8b1ff4d221cbaca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 14:12:09 -0400
Subject: Btrfs: Tune stripe selection for raid1 and raid10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 57ab755aca7..6d8dd4438c7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1509,17 +1509,17 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (unplug_page || (rw & (1 << BIO_RW)))
 			num_stripes = map->num_stripes;
-		else if (mirror_num) {
+		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		} else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index = do_div(orig_stripe_nr, num_stripes);
-		}
+		else
+			stripe_index = current->pid % map->num_stripes;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 
@@ -1530,11 +1530,8 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index += do_div(orig_stripe_nr,
-					       map->sub_stripes);
-		}
+		else
+			stripe_index += current->pid % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From d6bfde8765668c8687de72f7a40f52acdf4f2f19 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 Apr 2008 13:59:35 -0400
Subject: Btrfs: Fixes for 2.6.18 enterprise kernels

2.6.18 seems to get caught in an infinite loop when
cancel_rearming_delayed_workqueue is called more than once, so this switches
to cancel_delayed_work, which is arguably more correct.

Also, balance_dirty_pages can run into problems with 2.6.18 based kernels
because it doesn't have the per-bdi dirty limits.  This avoids calling
balance_dirty_pages on the btree inode unless there is actually something
to balance, which is a good optimization in general.

Finally there's a compile fix for ordered-data.h

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      | 24 +++++++++++++++++++-----
 fs/btrfs/ordered-data.c |  1 +
 fs/btrfs/transaction.c  |  8 ++++++--
 3 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e35e70165b5..fabc31b334b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1548,6 +1548,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
+
 	write_ctree_super(NULL, root);
 	mutex_unlock(&fs_info->fs_mutex);
 
@@ -1583,17 +1584,17 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
-	flush_workqueue(end_io_workqueue);
 	flush_workqueue(async_submit_workqueue);
+	flush_workqueue(end_io_workqueue);
 
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
-	flush_workqueue(end_io_workqueue);
-	destroy_workqueue(end_io_workqueue);
-
 	flush_workqueue(async_submit_workqueue);
 	destroy_workqueue(async_submit_workqueue);
 
+	flush_workqueue(end_io_workqueue);
+	destroy_workqueue(end_io_workqueue);
+
 	iput(fs_info->btree_inode);
 #if 0
 	while(!list_empty(&fs_info->hashers)) {
@@ -1663,8 +1664,21 @@ void btrfs_throttle(struct btrfs_root *root)
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
-	balance_dirty_pages_ratelimited_nr(
+	struct extent_io_tree *tree;
+	u64 num_dirty;
+	u64 start = 0;
+	unsigned long thresh = 16 * 1024 * 1024;
+	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+
+	if (current_is_pdflush())
+		return;
+
+	num_dirty = count_range_bits(tree, &start, (u64)-1,
+				     thresh, EXTENT_DIRTY);
+	if (num_dirty > thresh) {
+		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
+	}
 }
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b474902c90e..3814cb0246b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -18,6 +18,7 @@
 
 #include <linux/gfp.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c85cb48d95e..9826942fa18 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -814,6 +814,9 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	int ret;
 
 	mutex_lock(&root->fs_info->fs_mutex);
+	if (root->fs_info->closing)
+		goto out;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	cur = root->fs_info->running_transaction;
 	if (!cur) {
@@ -838,12 +841,13 @@ out:
 
 void btrfs_transaction_queue_work(struct btrfs_root *root, int delay)
 {
-	queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
+	if (!root->fs_info->closing)
+		queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
 }
 
 void btrfs_transaction_flush_work(struct btrfs_root *root)
 {
-	cancel_rearming_delayed_workqueue(trans_wq, &root->fs_info->trans_work);
+	cancel_delayed_work(&root->fs_info->trans_work);
 	flush_workqueue(trans_wq);
 }
 
-- 
cgit v1.2.3


From f2eb0a241f0e5c135d93243b0236cb1f14c305e0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 2 May 2008 14:43:14 -0400
Subject: Btrfs: Clone file data ioctl

Add a new ioctl to clone file data

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |   4 +-
 fs/btrfs/file-item.c |  12 ++--
 fs/btrfs/file.c      |   2 +-
 fs/btrfs/inode.c     | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/ioctl.h     |   1 +
 5 files changed, 179 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 73b92dd150f..3b6f8524a4a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1516,9 +1516,9 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 /* file-item.c */
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 offset,
+			       u64 objectid, u64 pos, u64 disk_offset,
 			       u64 disk_num_bytes,
-			       u64 num_bytes);
+			     u64 num_bytes, u64 offset);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9259aece6ed..f537eb43c2c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -28,10 +28,10 @@
 			       sizeof(struct btrfs_item) * 2) / \
 			       BTRFS_CRC32_SIZE) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos,
-			       u64 offset, u64 disk_num_bytes,
-			       u64 num_bytes)
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -53,9 +53,9 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
-	btrfs_set_file_extent_disk_bytenr(leaf, item, offset);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
-	btrfs_set_file_extent_offset(leaf, item, 0);
+	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8effdf4f5d6..a50507f3056 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -285,7 +285,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size);
+						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size -1);
 			btrfs_check_file(root, inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 99483447459..c6fae29c0b9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
@@ -141,7 +142,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		cur_alloc_size = ins.offset;
 		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
 					       start, ins.objectid, ins.offset,
-					       ins.offset);
+					       ins.offset, 0);
 		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
 		if (num_bytes < cur_alloc_size) {
@@ -1227,7 +1228,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size);
+						       hole_size, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						(u64)-1);
 			btrfs_check_file(root, inode);
@@ -3100,6 +3101,170 @@ out:
 	return ret;
 }
 
+void dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	struct btrfs_path *cpath = btrfs_alloc_path();
+	int len = btrfs_item_size_nr(leaf, slot);
+	int dstoff;
+	struct btrfs_key ckey = *key;
+	int ret;
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
+	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
+	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
+			   btrfs_item_ptr_offset(leaf, slot),
+			   len);
+	btrfs_release_path(root, cpath);
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int nextret;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);
+	}
+
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		filemap_write_and_wait(src->i_mapping);
+		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		if (BTRFS_I(src)->delalloc_bytes == 0)
+			break;
+		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	pos = 0;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
+					       pos, 0);
+		if (ret < 0)
+			goto out;
+		if (ret > 0) {
+			if (path->slots[0] == 0) {
+				ret = 0;
+				goto out;
+			}
+			path->slots[0]--;
+		}
+next_slot:
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		nritems = btrfs_header_nritems(leaf);
+
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 len = btrfs_file_extent_num_bytes(leaf,
+								      extent);
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
+				u64 off = btrfs_file_extent_offset(leaf,
+								   extent);
+				btrfs_insert_file_extent(trans, root,
+							 inode->i_ino, pos,
+							 ds, dl, len, off);
+				/* ds == 0 means there's a hole */
+				if (ds != 0) {
+					btrfs_inc_extent_ref(trans, root,
+						     ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+				}
+				pos = key.offset + len;
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				dup_item_to_inode(trans, root, path, leaf, slot,
+						  &key, inode->i_ino);
+				pos = key.offset + btrfs_item_size_nr(leaf,
+								      slot);
+			}
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
+			dup_item_to_inode(trans, root, path, leaf, slot, &key,
+					  inode->i_ino);
+
+		if (slot >= nritems - 1) {
+			nextret = btrfs_next_leaf(root, path);
+			if (nextret)
+				goto out;
+		} else {
+			path->slots[0]++;
+		}
+		goto next_slot;
+	}
+
+out:
+	btrfs_free_path(path);
+	ret = 0;
+
+	inode->i_blocks = src->i_blocks;
+	i_size_write(inode, src->i_size);
+	btrfs_update_inode(trans, root, inode);
+
+	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3116,6 +3281,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
+	case BTRFS_IOC_CLONE:
+		return btrfs_ioctl_clone(file, arg);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8ad35fc4ba5..b0e73f51d63 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -36,6 +36,7 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
-- 
cgit v1.2.3


From 006a58a25fe8aaa7559be80dcff94d98aee84309 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 May 2008 14:43:15 -0400
Subject: Btrfs: Compile warning fixup in volumes.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6d8dd4438c7..b3818757310 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -547,7 +547,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid;
+	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
-- 
cgit v1.2.3


From bb8885cc0aa6b1606143d92c70762b16026016a5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 May 2008 14:49:33 -0400
Subject: Btrfs: Fix do_sync_file_range ifdefs (2.6.22)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a50507f3056..c89c3ac4b23 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -961,7 +961,7 @@ out_nolock:
 		if (err < 0)
 			num_written = err;
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 		do_sync_file_range(file, start_pos,
 				      start_pos + num_written - 1,
 				      SYNC_FILE_RANGE_WRITE |
-- 
cgit v1.2.3


From 12fa8ec64f445aa932ba154053fe95432f30f2c6 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Fri, 2 May 2008 15:03:58 -0400
Subject: Btrfs: Add workaround for AppArmor changing remove_suid()

In openSUSE 10.3, AppArmor modifies remove_suid to take a struct path
rather than just a dentry. This patch tests that the kernel is openSUSE
10.3 or newer and adjusts the call accordingly.

Debian/Ubuntu with AppArmor applied will also need a similar patch.
Maintainers of btrfs under those distributions should build on this
patch or, alternatively, alter their package descriptions to add
-DREMOVE_SUID_PATH to the compiler command line.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
- --- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ b/compat.h	2008-02-06 16:46:13.000000000 -0500
@@ -0,0 +1,15 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+
+/*
+ * Even if AppArmor isn't enabled, it still has different prototypes.
+ * Add more distro/version pairs here to declare which has AppArmor applied.
+ */
+#if defined(CONFIG_SUSE_KERNEL)
+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+# define REMOVE_SUID_PATH 1
+# endif
+#endif
+
+#endif /* _COMPAT_H_ */
- --- a/file.c	2008-02-06 11:37:39.000000000 -0500
+++ b/file.c	2008-02-06 16:46:23.000000000 -0500
@@ -37,6 +37,7 @@
 #include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "compat.h"

 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
@@ -790,7 +791,11 @@ static ssize_t btrfs_file_write(struct f
 		goto out_nolock;
 	if (count == 0)
 		goto out_nolock;
+#ifdef REMOVE_SUID_PATH
+	err = remove_suid(&file->f_path);
+#else
 	err = remove_suid(fdentry(file));
+#endif
 	if (err)
 		goto out_nolock;
 	file_update_time(file);

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h | 25 +++++++++++++++++++++++++
 fs/btrfs/file.c   |  5 +++++
 2 files changed, 30 insertions(+)
 create mode 100644 fs/btrfs/compat.h

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
new file mode 100644
index 00000000000..d39a768e65f
--- /dev/null
+++ b/fs/btrfs/compat.h
@@ -0,0 +1,25 @@
+#ifndef _COMPAT_H_
+#define _COMPAT_H_
+
+
+/*
+ * Even if AppArmor isn't enabled, it still has different prototypes.
+ * Add more distro/version pairs here to declare which has AppArmor applied.
+ */
+#if defined(CONFIG_SUSE_KERNEL)
+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+# define REMOVE_SUID_PATH 1
+# endif
+#endif
+
+/*
+ * catch any other distros that have patched in apparmor.  This isn't
+ * 100% reliable because it won't catch people that hand compile their
+ * own distro kernels without apparmor compiled in.  But, it is better
+ * than nothing.
+ */
+#ifdef CONFIG_SECURITY_APPARMOR
+# define REMOVE_SUID_PATH 1
+#endif
+
+#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c89c3ac4b23..df97d470812 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -37,6 +37,7 @@
 #include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "compat.h"
 
 
 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
@@ -852,7 +853,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out_nolock;
 	if (count == 0)
 		goto out_nolock;
+#ifdef REMOVE_SUID_PATH
+	err = remove_suid(&file->f_path);
+#else
 	err = remove_suid(fdentry(file));
+#endif
 	if (err)
 		goto out_nolock;
 	file_update_time(file);
-- 
cgit v1.2.3


From b9d86667c94e5fe4bf9f6aa500e7ff1138e717ff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 May 2008 16:13:49 -0400
Subject: Btrfs: Silence bogus inode.c compiler warnings

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c6fae29c0b9..5204599e430 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2035,12 +2035,12 @@ fail:
 
 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-	struct inode *inode;
+	struct inode *inode = NULL;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int err = 0;
 	int drop_on_err = 0;
-	u64 objectid;
+	u64 objectid = 0;
 	unsigned long nr = 1;
 
 	mutex_lock(&root->fs_info->fs_mutex);
-- 
cgit v1.2.3


From 5d9cd9ecbf40b8bd5045a3c2f1feb35db6a12266 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 May 2008 06:26:21 -0400
Subject: Btrfs: Fix clone ioctl to not hold the path over inserts

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 79 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5204599e430..f7beb9b0d37 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3101,7 +3101,7 @@ out:
 	return ret;
 }
 
-void dup_item_to_inode(struct btrfs_trans_handle *trans,
+int dup_item_to_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct btrfs_path *path,
 		       struct extent_buffer *leaf,
@@ -3109,19 +3109,22 @@ void dup_item_to_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_key *key,
 		       u64 destino)
 {
-	struct btrfs_path *cpath = btrfs_alloc_path();
+	char *dup;
 	int len = btrfs_item_size_nr(leaf, slot);
-	int dstoff;
 	struct btrfs_key ckey = *key;
-	int ret;
+	int ret = 0;
+
+	dup = kmalloc(len, GFP_NOFS);
+	if (!dup)
+		return -ENOMEM;
+
+	read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len);
+	btrfs_release_path(root, path);
 
 	ckey.objectid = destino;
-	ret = btrfs_insert_empty_item(trans, root, cpath, &ckey, len);
-	dstoff = btrfs_item_ptr_offset(cpath->nodes[0], cpath->slots[0]);
-	copy_extent_buffer(cpath->nodes[0], leaf, dstoff,
-			   btrfs_item_ptr_offset(leaf, slot),
-			   len);
-	btrfs_release_path(root, cpath);
+	ret = btrfs_insert_item(trans, root, &ckey, dup, len);
+	kfree(dup);
+	return ret;
 }
 
 long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
@@ -3137,7 +3140,6 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	u32 nritems;
-	int nextret;
 	int slot;
 
 	src_file = fget(src_fd);
@@ -3174,20 +3176,32 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 0);
 	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.objectid = src->i_ino;
 	pos = 0;
+	path->reada = 2;
+
 	while (1) {
-		ret = btrfs_lookup_file_extent(trans, root, path, src->i_ino,
-					       pos, 0);
+		/*
+		 * note the key will change type as we walk through the
+		 * tree.
+		 */
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
-		if (ret > 0) {
-			if (path->slots[0] == 0) {
-				ret = 0;
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
 				goto out;
-			}
-			path->slots[0]--;
+			if (ret > 0)
+				break;
 		}
-next_slot:
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -3195,7 +3209,8 @@ next_slot:
 
 		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
 		    key.objectid != src->i_ino)
-			goto out;
+			break;
+
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			struct btrfs_file_extent_item *extent;
 			int found_type;
@@ -3225,28 +3240,28 @@ next_slot:
 				}
 				pos = key.offset + len;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				dup_item_to_inode(trans, root, path, leaf, slot,
-						  &key, inode->i_ino);
+				ret = dup_item_to_inode(trans, root, path,
+							leaf, slot, &key,
+							inode->i_ino);
+				if (ret)
+					goto out;
 				pos = key.offset + btrfs_item_size_nr(leaf,
 								      slot);
 			}
-		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY)
-			dup_item_to_inode(trans, root, path, leaf, slot, &key,
-					  inode->i_ino);
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			ret = dup_item_to_inode(trans, root, path, leaf,
+						slot, &key, inode->i_ino);
 
-		if (slot >= nritems - 1) {
-			nextret = btrfs_next_leaf(root, path);
-			if (nextret)
+			if (ret)
 				goto out;
-		} else {
-			path->slots[0]++;
 		}
-		goto next_slot;
+		key.offset++;
+		btrfs_release_path(root, path);
 	}
 
+	ret = 0;
 out:
 	btrfs_free_path(path);
-	ret = 0;
 
 	inode->i_blocks = src->i_blocks;
 	i_size_write(inode, src->i_size);
-- 
cgit v1.2.3


From a061fc8da7b990faa41ca503e66faef3ecdeead0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 May 2008 11:43:44 -0400
Subject: Btrfs: Add support for online device removal

This required a few structural changes to the code that manages bdev pointers:

The VFS super block now gets an anon-bdev instead of a pointer to the
lowest bdev.  This allows us to avoid swapping the super block bdev pointer
around at run time.

The code to read in the super block no longer goes through the extent
buffer interface.  Things got ugly keeping the mapping constant.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   3 +-
 fs/btrfs/disk-io.c     | 101 ++++++++++------------
 fs/btrfs/extent-tree.c | 139 +++++++++++++++++++++++++------
 fs/btrfs/inode.c       |  27 +++++-
 fs/btrfs/super.c       |  35 ++------
 fs/btrfs/transaction.c |   5 +-
 fs/btrfs/volumes.c     | 221 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h     |   3 +
 8 files changed, 412 insertions(+), 122 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b6f8524a4a..33ab165591c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,7 +505,7 @@ struct btrfs_fs_info {
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
-	struct extent_buffer *sb_buffer;
+	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1208,6 +1208,7 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fabc31b334b..9d5424ad01a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -78,9 +78,13 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
-	spin_unlock(&em_tree->lock);
-	if (em)
+	if (em) {
+		em->bdev =
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+		spin_unlock(&em_tree->lock);
 		goto out;
+	}
+	spin_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -90,7 +94,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	em->start = 0;
 	em->len = (u64)-1;
 	em->block_start = 0;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
@@ -435,11 +439,6 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	if (offset == BTRFS_SUPER_INFO_OFFSET) {
-		bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-		submit_bio(rw, bio);
-		return 0;
-	}
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
@@ -587,8 +586,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	list = &fs_info->fs_devices->devices;
 	list_for_each(next, list) {
 		device = list_entry(next, struct btrfs_device, dev_list);
-		if (device->bdev && device->bdev != fs_info->sb->s_bdev)
-			close_bdev_excl(device->bdev);
+		close_bdev_excl(device->bdev);
 		device->bdev = NULL;
 	}
 	return 0;
@@ -1118,6 +1116,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 leafsize;
 	u32 blocksize;
 	u32 stripesize;
+	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -1153,7 +1152,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->new_trans_lock);
 
 	init_completion(&fs_info->kobj_unregister);
-	sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -1170,6 +1168,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
 
+	sb->s_blocksize = 4096;
+	sb->s_blocksize_bits = blksize_bits(4096);
+
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
 	 * the real end of the address space is determined by all of
@@ -1229,19 +1230,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-	fs_info->sb_buffer = read_tree_block(tree_root,
-					     BTRFS_SUPER_INFO_OFFSET,
-					     4096);
 
-	if (!fs_info->sb_buffer)
+	bh = __bread(fs_devices->latest_bdev,
+		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh)
 		goto fail_iput;
 
-	read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0,
-			   sizeof(fs_info->super_copy));
+	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+	brelse(bh);
 
-	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
-			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
-			   BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
 
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
@@ -1263,7 +1261,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
-	sb_set_blocksize(sb, sectorsize);
+
+	sb->s_blocksize = sectorsize;
+	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
@@ -1339,7 +1339,6 @@ fail_tree_root:
 fail_sys_array:
 	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
-	free_extent_buffer(fs_info->sb_buffer);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1380,41 +1379,44 @@ int write_all_supers(struct btrfs_root *root)
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
-	struct extent_buffer *sb;
+	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
 	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
+	u32 crc;
+	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
-	sb = root->fs_info->sb_buffer;
-	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
-						      dev_item);
+	sb = &root->fs_info->super_for_commit;
+	dev_item = &sb->dev_item;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		btrfs_set_device_type(sb, dev_item, dev->type);
-		btrfs_set_device_id(sb, dev_item, dev->devid);
-		btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes);
-		btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used);
-		btrfs_set_device_io_align(sb, dev_item, dev->io_align);
-		btrfs_set_device_io_width(sb, dev_item, dev->io_width);
-		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
-		write_extent_buffer(sb, dev->uuid,
-				    (unsigned long)btrfs_device_uuid(dev_item),
-				    BTRFS_UUID_SIZE);
-
-		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
-		csum_tree_block(root, sb, 0);
-
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET /
-			      root->fs_info->sb->s_blocksize,
+		btrfs_set_stack_device_type(dev_item, dev->type);
+		btrfs_set_stack_device_id(dev_item, dev->devid);
+		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		flags = btrfs_super_flags(sb);
+		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+
+		crc = ~(u32)0;
+		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
+				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, sb->csum);
+
+		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
 			      BTRFS_SUPER_INFO_SIZE);
 
-		read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE);
+		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 		dev->pending_io = bh;
 
 		get_bh(bh);
@@ -1483,15 +1485,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 
 	ret = write_all_supers(root);
-#if 0
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
-	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
-				     super->start, super->len);
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-#endif
 	return ret;
 }
 
@@ -1570,8 +1563,6 @@ int close_ctree(struct btrfs_root *root)
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	free_extent_buffer(fs_info->sb_buffer);
-
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 
@@ -1652,7 +1643,7 @@ void btrfs_throttle(struct btrfs_root *root)
 {
 	struct backing_dev_info *bdi;
 
-	bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = &root->fs_info->bdi;
 	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f94794a9932..c0e67bde842 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -147,6 +147,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	u64 end;
 	int ret;
 
+	bytenr = max_t(u64, bytenr,
+		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
 				    bytenr, &start, &end,
@@ -1059,16 +1061,25 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
-static u64 reduce_alloc_profile(u64 flags)
+static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
+	u64 num_devices = root->fs_info->fs_devices->num_devices;
+
+	if (num_devices == 1)
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+	if (num_devices < 4)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10)))
+		      BTRFS_BLOCK_GROUP_RAID10))) {
 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10))
+	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
@@ -1078,7 +1089,6 @@ static u64 reduce_alloc_profile(u64 flags)
 	return flags;
 }
 
-
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1089,7 +1099,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int ret;
 
-	flags = reduce_alloc_profile(flags);
+	flags = reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
@@ -1169,6 +1179,21 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/* start of the first block group bit range from search_start, 0 if none */
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+	int bits = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+		   BLOCK_GROUP_SYSTEM;
+	u64 found_start;
+	u64 found_end;
+	if (find_first_extent_bit(&root->fs_info->block_group_cache,
+				  search_start, &found_start, &found_end,
+				  bits))
+		return 0;
+	return found_start;
+}
+
+
 static int update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1185,16 +1210,25 @@ static int update_pinned_extents(struct btrfs_root *root,
 	}
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		WARN_ON(!cache);
-		len = min(num, cache->key.offset -
-			  (bytenr - cache->key.objectid));
+		if (!cache) {
+			u64 first = first_logical_byte(root, bytenr);
+			WARN_ON(first < bytenr);
+			len = min(first - bytenr, num);
+		} else {
+			len = min(num, cache->key.offset -
+				  (bytenr - cache->key.objectid));
+		}
 		if (pin) {
-			cache->pinned += len;
-			cache->space_info->bytes_pinned += len;
+			if (cache) {
+				cache->pinned += len;
+				cache->space_info->bytes_pinned += len;
+			}
 			fs_info->total_pinned += len;
 		} else {
-			cache->pinned -= len;
-			cache->space_info->bytes_pinned -= len;
+			if (cache) {
+				cache->pinned -= len;
+				cache->space_info->bytes_pinned -= len;
+			}
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1547,7 +1581,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     int data)
 {
 	int ret;
-	u64 orig_search_start = search_start;
+	u64 orig_search_start;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
@@ -1577,6 +1611,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	search_start = max(search_start, first_logical_byte(root, 0));
+	orig_search_start = search_start;
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
@@ -1751,7 +1788,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	data = reduce_alloc_profile(data);
+	data = reduce_alloc_profile(root, data);
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -2309,6 +2346,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	struct file_ra_state *ra;
 	unsigned long total_read = 0;
 	unsigned long ra_pages;
+	struct btrfs_trans_handle *trans;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
@@ -2326,9 +2364,13 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				       calc_ra(i, last_index, ra_pages));
 		}
 		total_read++;
+		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+			goto truncate_racing;
+
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
 			goto out_unlock;
+		}
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
@@ -2350,20 +2392,33 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-		set_page_dirty(page);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
+		set_page_dirty(page);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 	}
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
 
 out_unlock:
 	kfree(ra);
+	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+	if (trans) {
+		btrfs_add_ordered_inode(inode);
+		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+		mark_inode_dirty(inode);
+	}
 	mutex_unlock(&inode->i_mutex);
 	return 0;
+
+truncate_racing:
+	vmtruncate(inode, inode->i_size);
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
+	goto out_unlock;
 }
 
 /*
@@ -2466,6 +2521,27 @@ out:
 	return 0;
 }
 
+/*
+ * delete the item at extent_key from extent_root, -EIO if it isn't there
+ */
+static int noinline del_extent_zero(struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct btrfs_key *extent_key)
+{
+	struct btrfs_trans_handle *trans;
+	int err;
+
+	trans = btrfs_start_transaction(extent_root, 1);
+	err = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+	if (err == 0)
+		err = btrfs_del_item(trans, extent_root, path);
+	else if (err > 0)
+		err = -EIO;	/* search_slot found no exact match */
+
+	btrfs_end_transaction(trans, extent_root);
+	return err;
+}
+
 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key)
@@ -2477,6 +2553,10 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	u32 item_size;
 	int ret = 0;
 
+	if (extent_key->objectid == 0) {
+		ret = del_extent_zero(extent_root, path, extent_key);
+		goto out;
+	}
 	key.objectid = extent_key->objectid;
 	key.type = BTRFS_EXTENT_REF_KEY;
 	key.offset = 0;
@@ -2490,15 +2570,24 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 		ret = 0;
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] == nritems)
-			goto out;
+		if (path->slots[0] == nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			if (ret < 0)
+				goto out;
+		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != extent_key->objectid)
+		if (found_key.objectid != extent_key->objectid) {
 			break;
+		}
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY)
+		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
 			break;
+		}
 
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -2519,7 +2608,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	num_devices = root->fs_info->fs_devices->num_devices;
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -2535,9 +2624,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		return flags;
 	} else {
 		/* they already had raid on here, just return */
-		if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-		    (flags & BTRFS_BLOCK_GROUP_RAID1)) {
-		}
 		if (flags & stripped)
 			return flags;
 
@@ -2570,7 +2656,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
-	int progress = 0;
+	int progress;
 
 	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
 						      shrink_start);
@@ -2597,6 +2683,7 @@ again:
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
+	progress = 0;
 	key.objectid = shrink_start;
 	key.offset = 0;
 	key.type = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f7beb9b0d37..b437d3bdf95 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2194,6 +2194,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
+	if (em)
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
 	spin_unlock(&em_tree->lock);
 
 	if (em) {
@@ -2212,7 +2214,7 @@ again:
 
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -3101,6 +3103,27 @@ out:
 	return ret;
 }
 
+/* BTRFS_IOC_RM_DEV: copy the device name from userspace and remove it */
+long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	if (!vol_args)
+		return -ENOMEM;
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	/* the name comes from userspace, don't trust it to be terminated */
+	vol_args->name[sizeof(vol_args->name) - 1] = '\0';
+	ret = btrfs_rm_device(root, vol_args->name);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
 int dup_item_to_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct btrfs_path *path,
@@ -3294,6 +3317,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_resize(root, (void __user *)arg);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_RM_DEV:
+		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7153dfaa340..020e5a83e31 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -315,24 +315,12 @@ static void btrfs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 }
 
-/*
- * This is almost a copy of get_sb_bdev in fs/super.c.
- * We need the local copy to allow direct mounting of
- * subvolumes, but this could be easily integrated back
- * into the generic version.  --hch
- */
-
-/* start copy & paste */
-static int set_bdev_super(struct super_block *s, void *data)
+static int btrfs_test_super(struct super_block *s, void *data)
 {
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	return 0;
-}
+	struct btrfs_fs_devices *test_fs_devices = data;
+	struct btrfs_root *root = btrfs_sb(s);
 
-static int test_bdev_super(struct super_block *s, void *data)
-{
-	return (void *)s->s_bdev == data;
+	return root->fs_info->fs_devices == test_fs_devices;
 }
 
 int btrfs_get_sb_bdev(struct file_system_type *fs_type,
@@ -354,14 +342,9 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 		return error;
 
 	bdev = fs_devices->lowest_bdev;
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	down(&bdev->bd_mount_sem);
-	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	btrfs_lock_volumes();
+	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+	btrfs_unlock_volumes();
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -373,13 +356,11 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		sb_set_blocksize(s, block_size(bdev));
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -458,7 +439,7 @@ static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
 	.get_sb		= btrfs_get_sb,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9826942fa18..57746c11eae 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -738,9 +738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
-	write_extent_buffer(root->fs_info->sb_buffer,
-			    &root->fs_info->super_copy, 0,
-			    sizeof(root->fs_info->super_copy));
+	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+	       sizeof(root->fs_info->super_copy));
 
 	btrfs_copy_pinned(root, pinned_copy);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b3818757310..55da5f0c56e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -45,6 +45,16 @@ struct map_lookup {
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
+void btrfs_lock_volumes(void)
+{
+	mutex_lock(&uuid_mutex);	/* static in this file */
+}
+
+void btrfs_unlock_volumes(void)
+{
+	mutex_unlock(&uuid_mutex);	/* pairs with btrfs_lock_volumes() */
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -193,12 +203,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = PTR_ERR(bdev);
 			goto fail;
 		}
+		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
 		}
 		device->bdev = bdev;
+
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -393,6 +405,9 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf = NULL;
+	struct btrfs_dev_extent *extent = NULL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -403,8 +418,25 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid,
+					  BTRFS_DEV_EXTENT_KEY);
+		BUG_ON(ret);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+		BUG_ON(found_key.offset > start || found_key.offset +
+		       btrfs_dev_extent_length(leaf, extent) < start);
+		ret = 0;
+	} else if (ret == 0) {
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+	}
 	BUG_ON(ret);
 
+	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -593,6 +625,170 @@ out:
 	return ret;
 }
 
+/*
+ * delete the dev item for 'device' and unhook it from the device lists
+ */
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+			     struct btrfs_device *device)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_fs_info *info;
+	struct btrfs_device *next_dev;
+	struct block_device *old_bdev = device->bdev;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 val;
+	int ret;
+
+	root = root->fs_info->chunk_root;
+	info = root->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/*
+	 * the dev item is gone, so take the device off both lists.  The
+	 * caller is expected to have shrunk it to zero bytes already
+	 */
+	list_del_init(&device->dev_list);
+	list_del_init(&device->dev_alloc_list);
+	/* anything still pointing at the old bdev moves to the next device */
+	fs_devices = info->fs_devices;
+	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
+			      dev_list);
+	if (fs_devices->lowest_bdev == old_bdev)
+		fs_devices->lowest_bdev = next_dev->bdev;
+	if (info->sb->s_bdev == old_bdev)
+		info->sb->s_bdev = next_dev->bdev;
+	if (fs_devices->latest_bdev == old_bdev)
+		fs_devices->latest_bdev = next_dev->bdev;
+
+	val = btrfs_super_total_bytes(&info->super_copy);
+	btrfs_set_super_total_bytes(&info->super_copy,
+				    val - device->total_bytes);
+	val = btrfs_super_num_devices(&info->super_copy);
+	btrfs_set_super_num_devices(&info->super_copy, val - 1);
+out:
+	btrfs_free_path(path);
+	btrfs_commit_transaction(trans, root);
+	return ret;
+}
+
+/*
+ * Remove the device named by device_path from a mounted FS: shrink it
+ * to zero bytes, delete its dev item and clear the magic in its super
+ * block.  Returns 0 on success or a negative errno on failure.
+ */
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 all_avail;
+	u64 devid;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&uuid_mutex);
+
+	all_avail = root->fs_info->avail_data_alloc_bits |
+		root->fs_info->avail_system_alloc_bits |
+		root->fs_info->avail_metadata_alloc_bits;
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+	    root->fs_info->fs_devices->num_devices <= 4) {
+		printk("btrfs: unable to go below four devices on raid10\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+	    root->fs_info->fs_devices->num_devices <= 2) {
+		printk("btrfs: unable to go below two devices on raid1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+	    sizeof(disk_super->magic))) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+
+	/* drop the count up front; error_undo puts it back on failure */
+	root->fs_info->fs_devices->num_devices--;
+	ret = btrfs_shrink_device(device, 0);
+	if (ret)
+		goto error_undo;
+	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+	if (ret)
+		goto error_undo;
+
+	/* make sure this device isn't detected as part of the FS anymore */
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+	/* one close for the device struct or super_block */
+	close_bdev_excl(device->bdev);
+	/* one close for the handle we opened above */
+	close_bdev_excl(bdev);
+
+	kfree(device->name);
+	kfree(device);
+	ret = 0;
+	goto out;
+
+error_undo:
+	root->fs_info->fs_devices->num_devices++;
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+out:
+	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -831,13 +1027,17 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 	spin_unlock(&em_tree->lock);
 
-	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	BUG_ON(em->start > chunk_offset ||
+	       em->start + em->len < chunk_offset);
 	map = (struct map_lookup *)em->bdev;
 
 	for (i = 0; i < map->num_stripes; i++) {
 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
+
+		ret = btrfs_update_device(trans, map->stripes[i].dev);
+		BUG_ON(ret);
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -847,11 +1047,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
 		BUG_ON(ret);
-		goto out;
 	}
 
-
-
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
 	kfree(map);
@@ -861,7 +1058,6 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 
-out:
 	/* once for us */
 	free_extent_map(em);
 
@@ -1449,7 +1645,7 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu\n", logical);
+		printk("unable to find logical %Lu len %Lu\n", logical, *length);
 		BUG();
 	}
 
@@ -1712,6 +1908,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
+
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1845,7 +2042,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
-	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -1857,6 +2054,12 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+					  BTRFS_SUPER_INFO_SIZE);
+	if (!sb)
+		return -ENOMEM;
+	btrfs_set_buffer_uptodate(sb);
+	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
@@ -1867,8 +2070,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		disk_key = (struct btrfs_disk_key *)ptr;
 		btrfs_disk_key_to_cpu(&key, disk_key);
 
-		len = sizeof(*disk_key);
-		ptr += len;
+		len = sizeof(*disk_key); ptr += len;
 		sb_ptr += len;
 		cur += len;
 
@@ -1887,6 +2089,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		sb_ptr += len;
 		cur += len;
 	}
+	free_extent_buffer(sb);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a9663e92bb1..0f94a69e6eb 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -125,6 +125,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
@@ -136,4 +137,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
 #endif
-- 
cgit v1.2.3


From bf4ef67924d87b0addb32f084e83a9283496350e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 13:26:18 -0400
Subject: Btrfs: Properly find the root for snapshotted blocks during chunk
 relocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c | 120 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/root-tree.c   |  46 ++++++++++++++++++-
 3 files changed, 156 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 33ab165591c..f4b4677bec7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1459,6 +1459,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *item);
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+		      u64 *found_objectid);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest_root);
 /* dir-item.c */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0e67bde842..8e69b5acfef 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2421,6 +2421,85 @@ truncate_racing:
 	goto out_unlock;
 }
 
+/*
+ * The back references tell us which tree holds a ref on a block,
+ * but it is possible for the tree root field in the reference to
+ * reflect the original root before a snapshot was made.  In this
+ * case we should search through all the children of a given root
+ * to find potential holders of references on a block.
+ *
+ * Instead, we do something a little less fancy and just search
+ * all the roots for a given key/block combination.
+ */
+static int find_root_for_ref(struct btrfs_root *root,
+			     struct btrfs_path *path,
+			     struct btrfs_key *key0,
+			     int level,
+			     int file_key,
+			     struct btrfs_root **found_root,
+			     u64 bytenr)
+{
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct btrfs_root *candidate = *found_root;
+	struct btrfs_key root_location;
+	u64 next_objectid = BTRFS_FS_TREE_OBJECTID;
+	u64 found_bytenr;
+	int ret;
+	int i;
+
+	root_location.type = BTRFS_ROOT_ITEM_KEY;
+	root_location.offset = (u64)-1;
+	path->lowest_level = level;
+	path->reada = 0;
+	for (;;) {
+		/* what does this candidate root hold at key0? */
+		found_bytenr = 0;
+		ret = btrfs_search_slot(NULL, candidate, key0, path, 0, 0);
+		if (ret == 0 && !file_key) {
+			if (path->nodes[level])
+				found_bytenr = path->nodes[level]->start;
+		} else if (ret == 0) {
+			leaf = path->nodes[0];
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(leaf, fi) ==
+			    BTRFS_FILE_EXTENT_REG)
+				found_bytenr =
+					btrfs_file_extent_disk_bytenr(leaf, fi);
+		}
+
+		/* drop the buffers from level on up, then the rest */
+		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+			if (!path->nodes[i])
+				break;
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+		btrfs_release_path(candidate, path);
+
+		if (found_bytenr == bytenr) {
+			*found_root = candidate;
+			ret = 0;
+			break;
+		}
+		/* no match, move on to the next root in the tree of roots */
+		ret = btrfs_search_root(root->fs_info->tree_root,
+					next_objectid, &next_objectid);
+		if (ret)
+			break;
+		root_location.objectid = next_objectid;
+		candidate = btrfs_read_fs_root_no_name(root->fs_info,
+						       &root_location);
+		if (!candidate) {
+			ret = 1;
+			break;
+		}
+	}
+	path->lowest_level = 0;
+	return ret;
+}
+
 /*
  * note, this releases the path
  */
@@ -2430,13 +2509,15 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 {
 	struct inode *inode;
 	struct btrfs_root *found_root;
-	struct btrfs_key *root_location;
+	struct btrfs_key root_location;
+	struct btrfs_key found_key;
 	struct btrfs_extent_ref *ref;
 	u64 ref_root;
 	u64 ref_gen;
 	u64 ref_objectid;
 	u64 ref_offset;
 	int ret;
+	int level;
 
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			     struct btrfs_extent_ref);
@@ -2446,20 +2527,30 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	ref_offset = btrfs_ref_offset(path->nodes[0], ref);
 	btrfs_release_path(extent_root, path);
 
-	root_location = kmalloc(sizeof(*root_location), GFP_NOFS);
-	root_location->objectid = ref_root;
+	root_location.objectid = ref_root;
 	if (ref_gen == 0)
-		root_location->offset = 0;
+		root_location.offset = 0;
 	else
-		root_location->offset = (u64)-1;
-	root_location->type = BTRFS_ROOT_ITEM_KEY;
+		root_location.offset = (u64)-1;
+	root_location.type = BTRFS_ROOT_ITEM_KEY;
 
 	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
-						root_location);
+						&root_location);
 	BUG_ON(!found_root);
-	kfree(root_location);
 
 	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		found_key.objectid = ref_objectid;
+		found_key.type = BTRFS_EXTENT_DATA_KEY;
+		found_key.offset = ref_offset;
+		level = 0;
+
+		ret = find_root_for_ref(extent_root, path, &found_key,
+					level, 1, &found_root,
+					extent_key->objectid);
+
+		if (ret)
+			goto out;
+
 		mutex_unlock(&extent_root->fs_info->fs_mutex);
 		inode = btrfs_iget_locked(extent_root->fs_info->sb,
 					  ref_objectid, found_root);
@@ -2485,12 +2576,9 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		mutex_lock(&extent_root->fs_info->fs_mutex);
 	} else {
 		struct btrfs_trans_handle *trans;
-		struct btrfs_key found_key;
 		struct extent_buffer *eb;
-		int level;
 		int i;
 
-		trans = btrfs_start_transaction(found_root, 1);
 		eb = read_tree_block(found_root, extent_key->objectid,
 				     extent_key->offset);
 		level = btrfs_header_level(eb);
@@ -2502,6 +2590,15 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 
 		free_extent_buffer(eb);
 
+		ret = find_root_for_ref(extent_root, path, &found_key,
+					level, 0, &found_root,
+					extent_key->objectid);
+
+		if (ret)
+			goto out;
+
+		trans = btrfs_start_transaction(found_root, 1);
+
 		path->lowest_level = level;
 		path->reada = 2;
 		ret = btrfs_search_slot(trans, found_root, &found_key, path,
@@ -2578,6 +2675,7 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 			}
 			if (ret < 0)
 				goto out;
+			leaf = path->nodes[0];
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 08f19ec8809..8bf21ba0a43 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -21,6 +21,51 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
+/*
+ * look past the key (search_start, -1, -1) in the tree root for a root item.
+ * returns 0 and sets *found_objectid on finding something, 1 if no more
+ * roots are there and < 0 on error (-ENOMEM if the path allocation fails)
+ */
+int btrfs_search_root(struct btrfs_root *root, u64 search_start,
+		      u64 *found_objectid)
+{
+	struct btrfs_path *path;
+	struct btrfs_key search_key;
+	int ret;
+
+	root = root->fs_info->tree_root;
+	search_key.objectid = search_start;
+	search_key.type = (u8)-1;
+	search_key.offset = (u64)-1;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret <= 0) {
+		if (ret == 0)
+			ret = 1;
+		goto out;
+	}
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	}
+	btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
+	/* not a root item: step past this key and search again */
+	if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
+		search_key.offset++;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+	ret = 0;
+	*found_objectid = search_key.objectid;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 			struct btrfs_root_item *item, struct btrfs_key *key)
 {
@@ -55,7 +100,6 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	memcpy(key, &found_key, sizeof(found_key));
 	ret = 0;
 out:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
-- 
cgit v1.2.3


From a68d5933a0e409592860229b35230c8e87155472 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 14:11:56 -0400
Subject: Btrfs: Update nodatacow mode to support cloned single files and
 resizing

Before, nodatacow only checked to make sure multiple roots didn't have
references on a single extent.  This check makes sure that multiple
inodes don't have references.

nodatacow needed an extra check to see if the block group was currently
readonly.  This way cows forced by the chunk relocation code are honored.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c |  9 +++++++++
 fs/btrfs/inode.c       | 13 +++++++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f4b4677bec7..6c2c2c4e4d2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1325,7 +1325,7 @@ static inline struct dentry *fdentry(struct file *file) {
 /* extent-tree.c */
 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 				  struct btrfs_path *count_path,
-				  u64 first_extent);
+				  u64 expected_owner, u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8e69b5acfef..30a5094fffa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -721,12 +721,14 @@ out:
 
 u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 				  struct btrfs_path *count_path,
+				  u64 expected_owner,
 				  u64 first_extent)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_path *path;
 	u64 bytenr;
 	u64 found_objectid;
+	u64 found_owner;
 	u64 root_objectid = root->root_key.objectid;
 	u32 total_count = 0;
 	u32 cur_count;
@@ -792,6 +794,13 @@ again:
 			total_count = 2;
 			goto out;
 		}
+		if (level == -1) {
+			found_owner = btrfs_ref_objectid(l, ref_item);
+			if (found_owner != expected_owner) {
+				total_count = 2;
+				goto out;
+			}
+		}
 		total_count = 1;
 		path->slots[0]++;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b437d3bdf95..1bf37d15b17 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -172,6 +172,7 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 	u64 loops = 0;
 	u64 total_fs_bytes;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_group_cache *block_group;
 	struct extent_buffer *leaf;
 	int found_type;
 	struct btrfs_path *path;
@@ -230,16 +231,20 @@ again:
 		if (bytenr == 0)
 			goto not_found;
 
+		if (btrfs_count_snapshots_in_path(root, path, inode->i_ino,
+						  bytenr) != 1) {
+			goto not_found;
+		}
+
 		/*
 		 * we may be called by the resizer, make sure we're inside
 		 * the limits of the FS
 		 */
-		if (bytenr + extent_num_bytes > total_fs_bytes)
+		block_group = btrfs_lookup_block_group(root->fs_info,
+						       bytenr);
+		if (!block_group || block_group->ro)
 			goto not_found;
 
-		if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1) {
-			goto not_found;
-		}
 
 		start = extent_end;
 	} else {
-- 
cgit v1.2.3


From c1c4d91c38e6e5b29543b02d4a4fb7f007e44f70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 15:05:58 -0400
Subject: Btrfs: Only open block devices once during mount -o subvol=

btrfs_open_devices needed a check to see if the device was already
open.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 55da5f0c56e..aa001b08ced 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -196,6 +196,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	mutex_lock(&uuid_mutex);
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
-- 
cgit v1.2.3


From bbaf549e0c3d28399fc5abd68020d4025ae5c3a7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 16:31:21 -0400
Subject: Btrfs: A number of nodatacow fixes

Once part of a delalloc request fails the cow checks, just cow the
entire range.

It is possible for the back references to all be from the same root,
but still have snapshots against an extent.  The checks are now more strict,
forcing cow any time there are multiple refs against the data extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++++++++++
 fs/btrfs/inode.c       |  8 +++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 30a5094fffa..db996f0edf0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -731,6 +731,7 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 	u64 found_owner;
 	u64 root_objectid = root->root_key.objectid;
 	u32 total_count = 0;
+	u32 extent_refs;
 	u32 cur_count;
 	u32 nritems;
 	int ret;
@@ -767,6 +768,7 @@ again:
 	}
 
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+	extent_refs = btrfs_extent_refs(l, item);
 	while (1) {
 		l = path->nodes[0];
 		nritems = btrfs_header_nritems(l);
@@ -800,10 +802,28 @@ again:
 				total_count = 2;
 				goto out;
 			}
+			/*
+			 * nasty.  we don't count a reference held by
+			 * the running transaction.  This allows nodatacow
+			 * to avoid cow most of the time
+			 */
+			if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
+			    btrfs_ref_generation(l, ref_item) ==
+			    root->fs_info->generation) {
+				extent_refs--;
+			}
 		}
 		total_count = 1;
 		path->slots[0]++;
 	}
+	/*
+	 * if there is more than one reference against a data extent,
+	 * we have to assume the other ref is another snapshot
+	 */
+	if (level == -1 && extent_refs > 1) {
+		total_count = 2;
+		goto out;
+	}
 	if (cur_count == 0) {
 		total_count = 0;
 		goto out;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1bf37d15b17..a492fd238c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -207,9 +207,8 @@ again:
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 	found_type = btrfs_key_type(&found_key);
 	if (found_key.objectid != inode->i_ino ||
-	    found_type != BTRFS_EXTENT_DATA_KEY) {
+	    found_type != BTRFS_EXTENT_DATA_KEY)
 		goto not_found;
-	}
 
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
@@ -245,7 +244,6 @@ again:
 		if (!block_group || block_group->ro)
 			goto not_found;
 
-
 		start = extent_end;
 	} else {
 		goto not_found;
@@ -260,8 +258,8 @@ loop:
 	goto again;
 
 not_found:
-	cow_file_range(inode, start, cow_end);
-	start = cow_end + 1;
+	cow_file_range(inode, start, end);
+	start = end + 1;
 	goto loop;
 }
 
-- 
cgit v1.2.3


From 323da79c9f096ed4da04e5ea00f766f75b28aeaa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 May 2008 11:46:48 -0400
Subject: Btrfs: Chunk relocation fine tuning, and add a few printks to show
 progress

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 +++++++++++++++++---
 fs/btrfs/volumes.c     |  2 ++
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db996f0edf0..df95fb660d4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2493,7 +2493,7 @@ static int find_root_for_ref(struct btrfs_root *root,
 					btrfs_file_extent_disk_bytenr(leaf,
 							       file_extent);
 		       }
-		} else if (ret == 0) {
+		} else if (!file_key) {
 			if (path->nodes[level])
 				found_bytenr = path->nodes[level]->start;
 		}
@@ -2797,14 +2797,25 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
+	printk("btrfs relocating block group %llu flags %llu\n",
+	       (unsigned long long)shrink_start,
+	       (unsigned long long)shrink_block_group->flags);
+
 again:
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+		u64 calc;
+
 		trans = btrfs_start_transaction(root, 1);
 		new_alloc_flags = update_block_group_flags(root,
 						   shrink_block_group->flags);
+		if (new_alloc_flags != shrink_block_group->flags) {
+			calc =
+			     btrfs_block_group_used(&shrink_block_group->item);
+		} else {
+			calc = shrink_block_group->key.offset;
+		}
 		do_chunk_alloc(trans, root->fs_info->extent_root,
-			btrfs_block_group_used(&shrink_block_group->item) +
-			2 * 1024 * 1024, new_alloc_flags);
+			       calc + 2 * 1024 * 1024, new_alloc_flags);
 		btrfs_end_transaction(trans, root);
 	}
 	shrink_block_group->ro = 1;
@@ -2888,6 +2899,9 @@ next:
 	btrfs_release_path(root, path);
 
 	if (total_found > 0) {
+		printk("btrfs relocate found %llu last extent was %llu\n",
+		       (unsigned long long)total_found,
+		       (unsigned long long)found_key.objectid);
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aa001b08ced..b5d7bd1915b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1011,6 +1011,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
+	printk("btrfs relocating chunk %llu\n",
+	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
-- 
cgit v1.2.3


From 188de649c51f03df3f8a601aea0110a3870815c2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 May 2008 11:52:25 -0400
Subject: Btrfs: Don't do btree balance_dirty_pages on old kernels, it stalls
 forever

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9d5424ad01a..c3083165b17 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1655,6 +1655,11 @@ void btrfs_throttle(struct btrfs_root *root)
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
+	/*
+	 * looks as though older kernels can get into trouble with
+	 * this code, they end up stuck in balance_dirty_pages forever
+	 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
@@ -1670,6 +1675,9 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
 	}
+#else
+	return;
+#endif
 }
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-- 
cgit v1.2.3


From ca7a79ad8dbe24669a8ced01f9fc0126b872745d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 May 2008 12:59:19 -0400
Subject: Btrfs: Pass down the expected generation number when reading tree
 blocks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 55 +++++++++++++++++++-------------------------------
 fs/btrfs/disk-io.c     | 30 ++++++++++++---------------
 fs/btrfs/disk-io.h     |  7 ++++---
 fs/btrfs/extent-tree.c | 15 ++++++++------
 fs/btrfs/print-tree.c  |  3 ++-
 fs/btrfs/tree-defrag.c |  9 ++++++---
 6 files changed, 55 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 29e5674b043..289d71d8653 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -302,6 +302,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct extent_buffer *cur;
 	struct extent_buffer *tmp;
 	u64 blocknr;
+	u64 gen;
 	u64 search_start = *last_ret;
 	u64 last_block = 0;
 	u64 other;
@@ -354,6 +355,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 		progress_passed = 1;
 		blocknr = btrfs_node_blockptr(parent, i);
+		gen = btrfs_node_ptr_generation(parent, i);
 		if (last_block == 0)
 			last_block = blocknr;
 
@@ -387,15 +389,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
-							 blocksize);
+							 blocksize, gen);
 			} else if (!uptodate) {
-				btrfs_read_buffer(cur);
+				btrfs_read_buffer(cur, gen);
 			}
 		}
 		if (search_start == 0)
 			search_start = last_block;
 
-		btrfs_verify_block_csum(root, cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&tmp, search_start,
 					min(16 * blocksize,
@@ -696,12 +697,17 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 static struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
+	int level = btrfs_header_level(parent);
 	if (slot < 0)
 		return NULL;
 	if (slot >= btrfs_header_nritems(parent))
 		return NULL;
+
+	BUG_ON(level == 0);
+
 	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
-		       btrfs_level_size(root, btrfs_header_level(parent) - 1));
+		       btrfs_level_size(root, level - 1),
+		       btrfs_node_ptr_generation(parent, slot));
 }
 
 static int balance_level(struct btrfs_trans_handle *trans,
@@ -1076,7 +1082,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 		if ((search >= lowest_read && search <= highest_read) ||
 		    (search < lowest_read && lowest_read - search <= 32768) ||
 		    (search > highest_read && search - highest_read <= 32768)) {
-			readahead_tree_block(root, search, blocksize);
+			readahead_tree_block(root, search, blocksize,
+				     btrfs_node_ptr_generation(node, nr));
 			nread += blocksize;
 		}
 		nscan++;
@@ -1109,8 +1116,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
-	u64 bytenr;
-	u64 ptr_gen;
 	int slot;
 	int ret;
 	int level;
@@ -1174,20 +1179,12 @@ again:
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level)
 				break;
-			bytenr = btrfs_node_blockptr(b, slot);
-			ptr_gen = btrfs_node_ptr_generation(b, slot);
+
 			if (should_reada)
 				reada_for_search(root, p, level, slot,
 						 key->objectid);
-			b = read_tree_block(root, bytenr,
-					    btrfs_level_size(root, level - 1));
-			if (ptr_gen != btrfs_header_generation(b)) {
-				printk("block %llu bad gen wanted %llu "
-				       "found %llu\n",
-			        (unsigned long long)b->start,
-				(unsigned long long)ptr_gen,
-			        (unsigned long long)btrfs_header_generation(b));
-			}
+
+			b = read_node_slot(root, b, slot);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1650,8 +1647,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
-	right = read_tree_block(root, btrfs_node_blockptr(upper, slot + 1),
-				root->leafsize);
+	right = read_node_slot(root, upper, slot + 1);
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		free_extent_buffer(right);
@@ -1826,8 +1822,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
-	left = read_tree_block(root, btrfs_node_blockptr(path->nodes[1],
-			       slot - 1), root->leafsize);
+	left = read_node_slot(root, path->nodes[1], slot - 1);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
 		free_extent_buffer(left);
@@ -2742,7 +2737,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
-	u64 bytenr;
 	int slot;
 	int level = 1;
 	struct extent_buffer *c;
@@ -2762,12 +2756,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		}
 		slot--;
 
-		bytenr = btrfs_node_blockptr(c, slot);
 		if (next)
 			free_extent_buffer(next);
 
-		next = read_tree_block(root, bytenr,
-				       btrfs_level_size(root, level - 1));
+		next = read_node_slot(root, c, slot);
 		break;
 	}
 	path->slots[level] = slot;
@@ -2782,8 +2774,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		path->slots[level] = slot;
 		if (!level)
 			break;
-		next = read_tree_block(root, btrfs_node_blockptr(next, slot),
-				       btrfs_level_size(root, level - 1));
+		next = read_node_slot(root, next, slot);
 	}
 	return 0;
 }
@@ -2797,7 +2788,6 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	int slot;
 	int level = 1;
-	u64 bytenr;
 	struct extent_buffer *c;
 	struct extent_buffer *next = NULL;
 
@@ -2814,15 +2804,13 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			continue;
 		}
 
-		bytenr = btrfs_node_blockptr(c, slot);
 		if (next)
 			free_extent_buffer(next);
 
 		if (path->reada)
 			reada_for_search(root, path, level, slot, 0);
 
-		next = read_tree_block(root, bytenr,
-				       btrfs_level_size(root, level -1));
+		next = read_node_slot(root, c, slot);
 		break;
 	}
 	path->slots[level] = slot;
@@ -2836,8 +2824,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			break;
 		if (path->reada)
 			reada_for_search(root, path, level, 0, 0);
-		next = read_tree_block(root, btrfs_node_blockptr(next, 0),
-				       btrfs_level_size(root, level - 1));
+		next = read_node_slot(root, next, 0);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c3083165b17..edee7a44f86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -207,7 +207,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
-					  u64 start)
+					  u64 start, u64 parent_transid)
 {
 	struct extent_io_tree *io_tree;
 	int ret;
@@ -254,7 +254,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE);
+	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
+					     btrfs_header_generation(eb));
 	BUG_ON(ret);
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
@@ -562,7 +563,8 @@ static struct address_space_operations btree_aops = {
 	.sync_page	= block_sync_page,
 };
 
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
@@ -592,12 +594,6 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-int btrfs_verify_block_csum(struct btrfs_root *root,
-			    struct extent_buffer *buf)
-{
-	return btrfs_buffer_uptodate(buf);
-}
-
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -621,7 +617,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize)
+				      u32 blocksize, u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
 	struct inode *btree_inode = root->fs_info->btree_inode;
@@ -634,7 +630,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	if (!buf)
 		return NULL;
 
-	ret = btree_read_extent_buffer_pages(root, buf, 0);
+	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
@@ -715,7 +711,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize);
+				     blocksize, 0);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -771,7 +767,7 @@ out:
 	}
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize);
+				     blocksize, 0);
 	BUG_ON(!root->node);
 insert:
 	root->ref_cows = 1;
@@ -1288,7 +1284,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize);
+					   blocksize, 0);
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
@@ -1304,7 +1300,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize);
+					  blocksize, 0);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
@@ -1732,11 +1728,11 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 		     EXTENT_DEFRAG, GFP_NOFS);
 }
 
-int btrfs_read_buffer(struct extent_buffer *buf)
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	int ret;
-	ret = btree_read_extent_buffer_pages(root, buf, 0);
+	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
 	}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5d5f6979c41..e29c895d523 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,8 +25,9 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
+				      u32 blocksize, u64 parent_transid);
+int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 u64 parent_transid);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -65,7 +66,7 @@ int btrfs_buffer_defrag(struct extent_buffer *buf);
 int btrfs_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
 int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
-int btrfs_read_buffer(struct extent_buffer *buf);
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 void btrfs_throttle(struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index df95fb660d4..db07dde4a87 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1338,7 +1338,8 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					&extent_item, sizeof(extent_item));
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
-		eb = read_tree_block(extent_root, ins.objectid, ins.offset);
+		eb = read_tree_block(extent_root, ins.objectid, ins.offset,
+				     trans->transid);
 		level = btrfs_header_level(eb);
 		if (level == 0) {
 			btrfs_item_key(eb, &first, 0);
@@ -2076,7 +2077,8 @@ static void noinline reada_walk_down(struct btrfs_root *root,
 			}
 		}
 		mutex_unlock(&root->fs_info->fs_mutex);
-		ret = readahead_tree_block(root, bytenr, blocksize);
+		ret = readahead_tree_block(root, bytenr, blocksize,
+					   btrfs_node_ptr_generation(node, i));
 		last = bytenr + blocksize;
 		cond_resched();
 		mutex_lock(&root->fs_info->fs_mutex);
@@ -2096,6 +2098,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	u64 root_owner;
 	u64 root_gen;
 	u64 bytenr;
+	u64 ptr_gen;
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
@@ -2132,6 +2135,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			break;
 		}
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
@@ -2152,7 +2156,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			reada_walk_down(root, cur, path->slots[*level]);
 
 			mutex_unlock(&root->fs_info->fs_mutex);
-			next = read_tree_block(root, bytenr, blocksize);
+			next = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
 			mutex_lock(&root->fs_info->fs_mutex);
 
 			/* we've dropped the lock, double check */
@@ -2173,8 +2178,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				BUG_ON(ret);
 				continue;
 			}
-		} else if (next) {
-			btrfs_verify_block_csum(root, next);
 		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
@@ -2609,7 +2612,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		int i;
 
 		eb = read_tree_block(found_root, extent_key->objectid,
-				     extent_key->offset);
+				     extent_key->offset, 0);
 		level = btrfs_header_level(eb);
 
 		if (level == 0)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index e99f3249d05..14d86372030 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -186,7 +186,8 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 	for (i = 0; i < nr; i++) {
 		struct extent_buffer *next = read_tree_block(root,
 					btrfs_node_blockptr(c, i),
-					btrfs_level_size(root, level - 1));
+					btrfs_level_size(root, level - 1),
+					btrfs_node_ptr_generation(c, i));
 		if (btrfs_is_leaf(next) &&
 		    btrfs_header_level(c) != 1)
 			BUG();
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 256af1870ee..5085e9e693b 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -28,6 +28,7 @@ static void reada_defrag(struct btrfs_root *root,
 	int i;
 	u32 nritems;
 	u64 bytenr;
+	u64 gen;
 	u32 blocksize;
 	int ret;
 
@@ -35,7 +36,8 @@ static void reada_defrag(struct btrfs_root *root,
 	nritems = btrfs_header_nritems(node);
 	for (i = 0; i < nritems; i++) {
 		bytenr = btrfs_node_blockptr(node, i);
-		ret = readahead_tree_block(root, bytenr, blocksize);
+		gen = btrfs_node_ptr_generation(node, i);
+		ret = readahead_tree_block(root, bytenr, blocksize, gen);
 		if (ret)
 			break;
 	}
@@ -101,10 +103,11 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 				path->slots[*level]++;
 				continue;
 			}
-			btrfs_verify_block_csum(root, next);
 		} else {
 			next = read_tree_block(root, bytenr,
-				       btrfs_level_size(root, *level - 1));
+				       btrfs_level_size(root, *level - 1),
+				       btrfs_node_ptr_generation(cur,
+							 path->slots[*level]));
 		}
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
-- 
cgit v1.2.3


From 1259ab75c62462b8ffad90067b5e1f6312786a18 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 May 2008 13:39:03 -0400
Subject: Btrfs: Handle write errors on raid1 and raid10

When duplicate copies exist, writes are allowed to fail to one of those
copies.  This changeset includes a few changes that allow the FS to
continue even when some IOs fail.

It also adds verification of the parent generation number for btree blocks.
This generation is stored in the pointer to a block, and it ensures
that missed writes are detected.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  2 +-
 fs/btrfs/disk-io.c     | 56 ++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/disk-io.h     |  2 +-
 fs/btrfs/extent-tree.c |  4 +--
 fs/btrfs/extent_io.c   | 49 +++++++++++++++++++++++++++++++------
 fs/btrfs/extent_io.h   |  7 +++++-
 fs/btrfs/inode.c       | 66 +++++++++++++++++++++++++++++++-------------------
 fs/btrfs/tree-defrag.c |  7 +++---
 fs/btrfs/volumes.c     | 11 ++++++---
 9 files changed, 155 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 289d71d8653..02e571e6ee6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -379,7 +379,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
-			uptodate = btrfs_buffer_uptodate(cur);
+			uptodate = btrfs_buffer_uptodate(cur, gen);
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index edee7a44f86..574b1245964 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -205,6 +205,42 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+/*
+ * Check that the generation in eb's header matches the generation the
+ * caller expects (parent_transid).  A parent_transid of 0 means the
+ * caller has no expectation and the check is skipped.
+ *
+ * Returns 0 when the buffer is acceptable.  On a mismatch the failure is
+ * logged, the buffer's uptodate state is cleared and 1 is returned.
+ */
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+				 struct extent_buffer *eb, u64 parent_transid)
+{
+	int ret;
+
+	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+		return 0;
+
+	/* recheck under the extent lock before declaring a mismatch */
+	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb) &&
+	    btrfs_header_generation(eb) == parent_transid) {
+		ret = 0;
+		goto out;
+	}
+	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+	       (unsigned long long)eb->start,
+	       (unsigned long long)parent_transid,
+	       (unsigned long long)btrfs_header_generation(eb));
+	ret = 1;
+	/* only a buffer that failed the check may lose its uptodate state */
+	clear_extent_buffer_uptodate(io_tree, eb);
+out:
+	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+		      GFP_NOFS);
+	return ret;
+}
+
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 start, u64 parent_transid)
@@ -218,7 +245,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 					       btree_get_extent, mirror_num);
-		if (!ret)
+		if (!ret &&
+		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
 
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
@@ -330,6 +358,13 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ret = -EIO;
 		goto err;
 	}
+	if (memcmp_extent_buffer(eb, root->fs_info->fsid,
+				 (unsigned long)btrfs_header_fsid(eb),
+				 BTRFS_FSID_SIZE)) {
+		printk("bad fsid on block %Lu\n", eb->start);
+		ret = -EIO;
+		goto err;
+	}
 	found_level = btrfs_header_level(eb);
 
 	ret = csum_tree_block(root, eb, 1);
@@ -1363,7 +1398,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
 		}
-		set_buffer_write_io_error(bh);
+		/* note, we don't set_buffer_write_io_error because we have
+		 * our own ways of dealing with the IO errors
+		 */
 		clear_buffer_uptodate(bh);
 	}
 	unlock_buffer(bh);
@@ -1459,7 +1496,8 @@ int write_all_supers(struct btrfs_root *root)
 				ret = submit_bh(WRITE, bh);
 				BUG_ON(ret);
 				wait_on_buffer(bh);
-				BUG_ON(!buffer_uptodate(bh));
+				if (!buffer_uptodate(bh))
+					total_errors++;
 			} else {
 				total_errors++;
 			}
@@ -1607,10 +1645,18 @@ int close_ctree(struct btrfs_root *root)
 	return 0;
 }
 
-int btrfs_buffer_uptodate(struct extent_buffer *buf)
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 {
+	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	if (!ret)
+		return ret;
+
+	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+				    parent_transid);
+	return !ret;
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e29c895d523..30d1ed293c2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -56,7 +56,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db07dde4a87..605018c6045 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1366,7 +1366,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 	if (!pending) {
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (btrfs_buffer_uptodate(buf)) {
+			if (btrfs_buffer_uptodate(buf, 0)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
@@ -2151,7 +2151,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			continue;
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next)) {
+		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dd403b426ff..2a3624adc0c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1366,7 +1366,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 				   unsigned int bytes_done, int err)
 #endif
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct extent_state *state = bio->bi_private;
 	struct extent_io_tree *tree = state->tree;
@@ -1375,6 +1375,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 	u64 end;
 	u64 cur;
 	int whole_page;
+	int ret;
 	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -1395,17 +1396,30 @@ static int end_bio_extent_writepage(struct bio *bio,
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
 
+		if (tree->ops && tree->ops->writepage_end_io_hook) {
+			ret = tree->ops->writepage_end_io_hook(page, start,
+						       end, state);
+			if (ret)
+				uptodate = 0;
+		}
+
+		if (!uptodate && tree->ops &&
+		    tree->ops->writepage_io_failed_hook) {
+			ret = tree->ops->writepage_io_failed_hook(bio, page,
+							 start, end, state);
+			if (ret == 0) {
+				state = NULL;
+				uptodate = (err == 0);
+				continue;
+			}
+		}
+
 		if (!uptodate) {
 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
 
-		if (tree->ops && tree->ops->writepage_end_io_hook) {
-			tree->ops->writepage_end_io_hook(page, start, end,
-							 state);
-		}
-
 		/*
 		 * bios can get merged in funny ways, and so we need to
 		 * be careful with the state variable.  We know the
@@ -2073,9 +2087,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		} else {
 			ret = 0;
 		}
-		if (ret)
+		if (ret) {
 			SetPageError(page);
-		else {
+		} else {
 			unsigned long max_nr = end_index + 1;
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
@@ -2948,6 +2962,25 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
+/*
+ * clear the uptodate state of @eb: its own flag, its byte range in @tree
+ * and each of its num_extent_pages() pages.  Always returns 0.
+ */
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb)
+{
+	unsigned long nr_pages = num_extent_pages(eb->start, eb->len);
+	unsigned long idx;
+
+	eb->flags &= ~EXTENT_UPTODATE;
+	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
+			      GFP_NOFS);
+
+	for (idx = 0; idx < nr_pages; idx++)
+		ClearPageUptodate(extent_buffer_page(eb, idx));
+	return 0;
+}
+
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 				struct extent_buffer *eb)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e4834614751..f1960dafaa1 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -36,9 +36,12 @@ struct extent_io_ops {
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
 				       struct extent_state *state);
+	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+					u64 start, u64 end,
+				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
@@ -212,6 +215,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a492fd238c8..08760ff9bab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -430,9 +430,9 @@ struct io_failure_record {
 	int last_mirror;
 };
 
-int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
-				  struct page *page, u64 start, u64 end,
-				  struct extent_state *state)
+int btrfs_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 struct extent_state *state)
 {
 	struct io_failure_record *failrec = NULL;
 	u64 private;
@@ -443,6 +443,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	struct bio *bio;
 	int num_copies;
 	int ret;
+	int rw;
 	u64 logical;
 
 	ret = get_state_private(failure_tree, start, &private);
@@ -505,7 +506,41 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	bio->bi_bdev = failed_bio->bi_bdev;
 	bio->bi_size = 0;
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
+	if (failed_bio->bi_rw & (1 << BIO_RW))
+		rw = WRITE;
+	else
+		rw = READ;
+
+	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+						      failrec->last_mirror);
+	return 0;
+}
+
+int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failure;
+	int ret;
+
+	private = 0;
+	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			     (u64)-1, 1, EXTENT_DIRTY)) {
+		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+					start, &private_failure);
+		if (ret == 0) {
+			failure = (struct io_failure_record *)(unsigned long)
+				   private_failure;
+			set_state_private(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start, 0);
+			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start,
+					  failure->start + failure->len - 1,
+					  EXTENT_DIRTY | EXTENT_LOCKED,
+					  GFP_NOFS);
+			kfree(failure);
+		}
+	}
 	return 0;
 }
 
@@ -547,26 +582,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
 	 */
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY)) {
-		u64 private_failure;
-		struct io_failure_record *failure;
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				   private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
+	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -3657,7 +3673,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
-	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5085e9e693b..c02e2bf2f02 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -51,6 +51,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	u64 bytenr;
+	u64 ptr_gen;
 	int ret = 0;
 	int is_extent = 0;
 
@@ -93,11 +94,12 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 			break;
 		}
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 
 		if (cache_only) {
 			next = btrfs_find_tree_block(root, bytenr,
 					   btrfs_level_size(root, *level - 1));
-			if (!next || !btrfs_buffer_uptodate(next) ||
+			if (!next || !btrfs_buffer_uptodate(next, ptr_gen) ||
 			    !btrfs_buffer_defrag(next)) {
 				free_extent_buffer(next);
 				path->slots[*level]++;
@@ -106,8 +108,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		} else {
 			next = read_tree_block(root, bytenr,
 				       btrfs_level_size(root, *level - 1),
-				       btrfs_node_ptr_generation(cur,
-							 path->slots[*level]));
+				       ptr_gen);
 		}
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b5d7bd1915b..5fc7fb48147 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1807,14 +1807,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
-
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-		if (atomic_read(&multi->error) > multi->max_errors)
+		if (atomic_read(&multi->error) > multi->max_errors) {
 			err = -EIO;
-		else
+		} else if (err) {
+			/*
+			 * this bio is actually up to date, we didn't
+			 * go over the max number of errors
+			 */
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			err = 0;
+		}
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-- 
cgit v1.2.3


From dfe25020689bb2d318782d2c9c7141203583fc70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 13:46:40 -0400
Subject: Btrfs: Add mount -o degraded to allow mounts to continue with missing
 devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |   3 +
 fs/btrfs/disk-io.c |  49 ++++++----
 fs/btrfs/disk-io.h |   3 +-
 fs/btrfs/super.c   |  25 +++--
 fs/btrfs/volumes.c | 279 ++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h |   3 +
 6 files changed, 253 insertions(+), 109 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c2c2c4e4d2..a9377fb16c3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -657,6 +657,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NODATACOW		(1 << 1)
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
+#define BTRFS_MOUNT_DEGRADED		(1 << 4)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1606,4 +1607,6 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode);
 /* super.c */
 u64 btrfs_parse_size(char *str);
+int btrfs_parse_options(char *options, struct btrfs_root *root,
+			char **subvol_name);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 574b1245964..38b0d9ecda6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -614,21 +614,6 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	return ret;
 }
 
-static int close_all_devices(struct btrfs_fs_info *fs_info)
-{
-	struct list_head *list;
-	struct list_head *next;
-	struct btrfs_device *device;
-
-	list = &fs_info->fs_devices->devices;
-	list_for_each(next, list) {
-		device = list_entry(next, struct btrfs_device, dev_list);
-		close_bdev_excl(device->bdev);
-		device->bdev = NULL;
-	}
-	return 0;
-}
-
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -927,6 +912,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->bdev)
+			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
 		if (bdi && bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
@@ -1140,7 +1127,8 @@ static void btrfs_async_submit_work(struct work_struct *work)
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices)
+			      struct btrfs_fs_devices *fs_devices,
+			      char *options)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -1276,12 +1264,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
-	if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) {
+	btrfs_parse_options(options, tree_root, NULL);
+
+	if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
 		       (unsigned long long)btrfs_super_num_devices(disk_super),
 		       (unsigned long long)fs_devices->num_devices);
-		goto fail_sb_buffer;
+		if (btrfs_test_opt(tree_root, DEGRADED))
+			printk("continuing in degraded mode\n");
+		else {
+			goto fail_sb_buffer;
+		}
 	}
+
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 
 	nodesize = btrfs_super_nodesize(disk_super);
@@ -1329,6 +1324,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_read_chunk_tree(chunk_root);
 	BUG_ON(ret);
 
+	btrfs_close_extra_devices(fs_devices);
+
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 
@@ -1374,7 +1371,7 @@ fail_sb_buffer:
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
-	close_all_devices(fs_info);
+	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	kfree(extent_root);
@@ -1429,6 +1426,13 @@ int write_all_supers(struct btrfs_root *root)
 	dev_item = &sb->dev_item;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (!dev->bdev) {
+			total_errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata)
+			continue;
+
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
 		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
@@ -1482,6 +1486,11 @@ int write_all_supers(struct btrfs_root *root)
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (!dev->bdev)
+			continue;
+		if (!dev->in_fs_metadata)
+			continue;
+
 		BUG_ON(!dev->pending_io);
 		bh = dev->pending_io;
 		wait_on_buffer(bh);
@@ -1631,7 +1640,7 @@ int close_ctree(struct btrfs_root *root)
 		kfree(hasher);
 	}
 #endif
-	close_all_devices(fs_info);
+	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 30d1ed293c2..2bc64fefe6e 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -33,7 +33,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
 struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices);
+			      struct btrfs_fs_devices *fs_devices,
+			      char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 020e5a83e31..273a5b511f5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -65,11 +65,13 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
-	Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err,
+	Opt_degraded, Opt_subvol, Opt_nodatasum, Opt_nodatacow,
+	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+	Opt_ssd, Opt_err,
 };
 
 static match_table_t tokens = {
+	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
@@ -106,9 +108,8 @@ u64 btrfs_parse_size(char *str)
 	return res;
 }
 
-static int parse_options (char * options,
-			  struct btrfs_root *root,
-			  char **subvol_name)
+int btrfs_parse_options(char *options, struct btrfs_root *root,
+			char **subvol_name)
 {
 	char * p;
 	struct btrfs_fs_info *info = NULL;
@@ -135,6 +136,12 @@ static int parse_options (char * options,
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_degraded:
+			if (info) {
+				printk("btrfs: allowing degraded mounts\n");
+				btrfs_set_opt(info->mount_opt, DEGRADED);
+			}
+			break;
 		case Opt_subvol:
 			if (subvol_name) {
 				*subvol_name = match_strdup(&args[0]);
@@ -234,7 +241,7 @@ static int btrfs_fill_super(struct super_block * sb,
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 
-	tree_root = open_ctree(sb, fs_devices);
+	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
 	if (IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
@@ -267,8 +274,6 @@ static int btrfs_fill_super(struct super_block * sb,
 		goto fail_close;
 	}
 
-	parse_options((char *)data, tree_root, NULL);
-
 	/* this does the super kobj at the same time */
 	err = btrfs_sysfs_add_super(tree_root->fs_info);
 	if (err)
@@ -341,7 +346,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 	if (error)
 		return error;
 
-	bdev = fs_devices->lowest_bdev;
+	bdev = fs_devices->latest_bdev;
 	btrfs_lock_volumes();
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
 	btrfs_unlock_volumes();
@@ -411,7 +416,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	int ret;
 	char *subvol_name = NULL;
 
-	parse_options((char *)data, NULL, &subvol_name);
+	btrfs_parse_options((char *)data, NULL, &subvol_name);
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt,
 			subvol_name ? subvol_name : "default");
 	if (subvol_name)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5fc7fb48147..43f74d17bce 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -73,6 +73,7 @@ int btrfs_cleanup_fs_uuids(void)
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
+			kfree(dev->name);
 			kfree(dev);
 		}
 	}
@@ -127,7 +128,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->lowest_devid = (u64)-1;
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
@@ -159,13 +159,35 @@ static int device_list_add(const char *path,
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
 	}
-	if (fs_devices->lowest_devid > devid) {
-		fs_devices->lowest_devid = devid;
-	}
 	*fs_devices_ret = fs_devices;
 	return 0;
 }
 
+/* close and free every listed device whose in_fs_metadata flag is clear */
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *devices = &fs_devices->devices;
+	struct list_head *pos, *tmp;
+	struct btrfs_device *dev;
+
+	mutex_lock(&uuid_mutex);
+	/* _safe walk: entries get unlinked and freed as we go */
+	list_for_each_safe(pos, tmp, devices) {
+		dev = list_entry(pos, struct btrfs_device, dev_list);
+		if (dev->in_fs_metadata)
+			continue;
+		printk("getting rid of extra dev %s\n", dev->name);
+		if (dev->bdev)
+			close_bdev_excl(dev->bdev);
+		list_del(&dev->dev_list);
+		list_del(&dev->dev_alloc_list);
+		fs_devices->num_devices--;
+		kfree(dev->name);
+		kfree(dev);
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -179,6 +201,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			close_bdev_excl(device->bdev);
 		}
 		device->bdev = NULL;
+		device->in_fs_metadata = 0;
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -199,6 +222,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (device->bdev)
 			continue;
 
+		if (!device->name)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
@@ -209,10 +235,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
-		if (device->devid == fs_devices->lowest_devid) {
-			fs_devices->lowest_bdev = bdev;
-		}
 		device->bdev = bdev;
+		device->in_fs_metadata = 0;
 
 	}
 	mutex_unlock(&uuid_mutex);
@@ -439,7 +463,8 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(ret);
 
-	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+	if (device->bytes_used > 0)
+		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -460,6 +485,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
+	WARN_ON(!device->in_fs_metadata);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -674,8 +700,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 
 	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
 			      dev_list);
-	if (bdev == fs_devices->lowest_bdev)
-		fs_devices->lowest_bdev = next_dev->bdev;
 	if (bdev == root->fs_info->sb->s_bdev)
 		root->fs_info->sb->s_bdev = next_dev->bdev;
 	if (bdev == fs_devices->latest_bdev)
@@ -698,7 +722,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
@@ -712,47 +736,73 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    root->fs_info->fs_devices->num_devices <= 4) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    root->fs_info->fs_devices->num_devices <= 2) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		goto out;
-	}
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *cur;
+		struct list_head *devices;
+		struct btrfs_device *tmp;
 
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
-	if (!bh) {
-		ret = -EIO;
-		goto error_close;
-	}
-	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	devid = le64_to_cpu(disk_super->dev_item.devid);
-	device = btrfs_find_device(root, devid, NULL);
-	if (!device) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
+		device = NULL;
+		devices = &root->fs_info->fs_devices->devices;
+		list_for_each(cur, devices) {
+			tmp = list_entry(cur, struct btrfs_device, dev_list);
+			if (tmp->in_fs_metadata && !tmp->bdev) {
+				device = tmp;
+				break;
+			}
+		}
+		bdev = NULL;
+		bh = NULL;
+		disk_super = NULL;
+		if (!device) {
+			printk("btrfs: no missing devices found to remove\n");
+			goto out;
+		}
+
+	} else {
+		bdev = open_bdev_excl(device_path, 0,
+				      root->fs_info->bdev_holder);
+		if (IS_ERR(bdev)) {
+			ret = PTR_ERR(bdev);
+			goto out;
+		}
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh) {
+			ret = -EIO;
+			goto error_close;
+		}
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic))) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		if (memcmp(disk_super->fsid, root->fs_info->fsid,
+			   BTRFS_FSID_SIZE)) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		device = btrfs_find_device(root, devid, NULL);
+		if (!device) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
 
+	}
 	root->fs_info->fs_devices->num_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
@@ -764,19 +814,25 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_brelse;
 
-	/* make sure this device isn't detected as part of the FS anymore */
-	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-	set_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-
-	brelse(bh);
-
-	/* one close for the device struct or super_block */
-	close_bdev_excl(device->bdev);
+	if (bh) {
+		/* make sure this device isn't detected as part of
+		 * the FS anymore
+		 */
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
 
-	/* one close for us */
-	close_bdev_excl(device->bdev);
+		brelse(bh);
+	}
 
+	if (device->bdev) {
+		/* one close for the device struct or super_block */
+		close_bdev_excl(device->bdev);
+	}
+	if (bdev) {
+		/* one close for us */
+		close_bdev_excl(bdev);
+	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
@@ -785,7 +841,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	if (bdev)
+		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -839,6 +896,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->total_bytes = i_size_read(bdev->bd_inode);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
+	device->in_fs_metadata = 1;
 
 	ret = btrfs_add_device(trans, root, device);
 	if (ret)
@@ -1041,8 +1099,10 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
 
-		ret = btrfs_update_device(trans, map->stripes[i].dev);
-		BUG_ON(ret);
+		if (map->stripes[i].dev) {
+			ret = btrfs_update_device(trans, map->stripes[i].dev);
+			BUG_ON(ret);
+		}
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -1415,10 +1475,13 @@ again:
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
-		avail = device->total_bytes - device->bytes_used;
+		if (device->total_bytes > device->bytes_used)
+			avail = device->total_bytes - device->bytes_used;
+		else
+			avail = 0;
 		cur = cur->next;
 
-		if (avail >= min_free) {
+		if (device->in_fs_metadata && avail >= min_free) {
 			u64 ignored_start = 0;
 			ret = find_free_dev_extent(trans, device, path,
 						   min_free,
@@ -1430,7 +1493,7 @@ again:
 				if (type & BTRFS_BLOCK_GROUP_DUP)
 					index++;
 			}
-		} else if (avail > max_avail)
+		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
 			break;
@@ -1610,6 +1673,22 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
+/* use @optimal if it has a bdev, else the first stripe in range that does */
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+			    int optimal)
+{
+	int idx = first;
+	int end = first + num;
+
+	if (map->stripes[optimal].dev->bdev)
+		return optimal;
+
+	while (idx < end && !map->stripes[idx].dev->bdev)
+		idx++;
+	/* if nothing was live, return @optimal and let the io error code cope */
+	return idx < end ? idx : optimal;
+}
+
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
@@ -1712,8 +1791,11 @@ again:
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		else
-			stripe_index = current->pid % map->num_stripes;
+		else {
+			stripe_index = find_live_mirror(map, 0,
+					    map->num_stripes,
+					    current->pid % map->num_stripes);
+		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1731,8 +1813,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += current->pid % map->sub_stripes;
+		else {
+			stripe_index = find_live_mirror(map, stripe_index,
+					      map->sub_stripes, stripe_index +
+					      current->pid % map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -1749,9 +1834,11 @@ again:
 			struct backing_dev_info *bdi;
 
 			device = map->stripes[stripe_index].dev;
-			bdi = blk_get_backing_dev_info(device->bdev);
-			if (bdi->unplug_io_fn) {
-				bdi->unplug_io_fn(bdi, unplug_page);
+			if (device->bdev) {
+				bdi = blk_get_backing_dev_info(device->bdev);
+				if (bdi->unplug_io_fn) {
+					bdi->unplug_io_fn(bdi, unplug_page);
+				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -1880,12 +1967,21 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
-
-		bio->bi_bdev = dev->bdev;
-		spin_lock(&dev->io_lock);
-		dev->total_ios++;
-		spin_unlock(&dev->io_lock);
-		submit_bio(rw, bio);
+		if (dev && dev->bdev) {
+			bio->bi_bdev = dev->bdev;
+			spin_lock(&dev->io_lock);
+			dev->total_ios++;
+			spin_unlock(&dev->io_lock);
+			submit_bio(rw, bio);
+		} else {
+			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+			bio->bi_sector = logical >> 9;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+			bio_endio(bio, bio->bi_size, -EIO);
+#else
+			bio_endio(bio, -EIO);
+#endif
+		}
 		dev_nr++;
 	}
 	if (total_devs == 1)
@@ -1901,6 +1997,27 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 	return __find_device(head, devid, uuid);
 }
 
+/* add a bdev-less placeholder for @devid to fs_devices; NULL if alloc fails */
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+					    u64 devid, u8 *dev_uuid)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return NULL;
+	list_add(&device->dev_list, &fs_devices->devices);
+	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+	device->barriers = 1;
+	device->dev_root = root->fs_info->dev_root;
+	device->devid = devid;
+	fs_devices->num_devices++;
+	spin_lock_init(&device->io_lock);
+	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+	return device;
+}
+
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -1965,11 +2082,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-		if (!map->stripes[i].dev) {
+
+		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
 			return -EIO;
 		}
+		if (!map->stripes[i].dev) {
+			map->stripes[i].dev =
+				add_missing_dev(root, devid, uuid);
+			if (!map->stripes[i].dev) {
+				kfree(map);
+				free_extent_map(em);
+				return -EIO;
+			}
+		}
+		map->stripes[i].dev->in_fs_metadata = 1;
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
@@ -2016,20 +2144,15 @@ static int read_one_dev(struct btrfs_root *root,
 			   BTRFS_UUID_SIZE);
 	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
-		printk("warning devid %Lu not found already\n", devid);
-		device = kzalloc(sizeof(*device), GFP_NOFS);
+		printk("warning devid %Lu missing\n", devid);
+		device = add_missing_dev(root, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list,
-			 &root->fs_info->fs_devices->devices);
-		list_add(&device->dev_alloc_list,
-			 &root->fs_info->fs_devices->alloc_list);
-		device->barriers = 1;
-		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
+	device->in_fs_metadata = 1;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0f94a69e6eb..454fe810332 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -27,8 +27,10 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+	u64 generation;
 
 	int barriers;
+	int in_fs_metadata;
 
 	spinlock_t io_lock;
 
@@ -122,6 +124,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, int flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
-- 
cgit v1.2.3


From a0af469b58944f6e8c5c8ecbebb42997baf0cb9e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 16:03:06 -0400
Subject: Fix btrfs_open_devices to deal with changes since the scan ioctls

Devices can change after the scan ioctls are done, and btrfs_open_devices
needs to be able to verify them as they are opened and used by the FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c |  4 ++--
 fs/btrfs/volumes.c | 70 +++++++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/volumes.h |  4 ++--
 3 files changed, 63 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38b0d9ecda6..264f297260f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1266,10 +1266,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_parse_options(options, tree_root, NULL);
 
-	if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) {
+	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
 		       (unsigned long long)btrfs_super_num_devices(disk_super),
-		       (unsigned long long)fs_devices->num_devices);
+		       (unsigned long long)fs_devices->open_devices);
 		if (btrfs_test_opt(tree_root, DEGRADED))
 			printk("continuing in degraded mode\n");
 		else {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 43f74d17bce..501d23d3ebf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -71,6 +71,7 @@ int btrfs_cleanup_fs_uuids(void)
 					 dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
+				fs_devices->open_devices--;
 			}
 			list_del(&dev->dev_list);
 			kfree(dev->name);
@@ -174,9 +175,10 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-printk("getting rid of extra dev %s\n", device->name);
-			if (device->bdev)
+			if (device->bdev) {
 				close_bdev_excl(device->bdev);
+				fs_devices->open_devices--;
+			}
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
@@ -188,6 +190,7 @@ printk("getting rid of extra dev %s\n", device->name);
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
+
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -199,10 +202,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
+			fs_devices->open_devices--;
 		}
 		device->bdev = NULL;
 		device->in_fs_metadata = 0;
 	}
+	fs_devices->mounted = 0;
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
@@ -214,9 +219,19 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct list_head *head = &fs_devices->devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int ret;
+	struct block_device *latest_bdev = NULL;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 latest_devid = 0;
+	u64 latest_transid = 0;
+	u64 transid;
+	u64 devid;
+	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
+	if (fs_devices->mounted)
+		goto out;
+
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
@@ -229,21 +244,52 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
-			ret = PTR_ERR(bdev);
-			goto fail;
+			goto error;
 		}
 		set_blocksize(bdev, 4096);
-		if (device->devid == fs_devices->latest_devid)
-			fs_devices->latest_bdev = bdev;
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh)
+			goto error_close;
+
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic)))
+			goto error_brelse;
+
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		if (devid != device->devid)
+			goto error_brelse;
+
+		transid = btrfs_super_generation(disk_super);
+		if (transid > latest_transid) {
+			latest_devid = devid;
+			latest_transid = transid;
+			latest_bdev = bdev;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		fs_devices->open_devices++;
+		continue;
 
+error_brelse:
+		brelse(bh);
+error_close:
+		close_bdev_excl(bdev);
+error:
+		continue;
 	}
+	if (fs_devices->open_devices == 0) {
+		ret = -EIO;
+		goto out;
+	}
+	fs_devices->mounted = 1;
+	fs_devices->latest_bdev = latest_bdev;
+	fs_devices->latest_devid = latest_devid;
+	fs_devices->latest_trans = latest_transid;
+out:
 	mutex_unlock(&uuid_mutex);
-	return 0;
-fail:
-	mutex_unlock(&uuid_mutex);
-	btrfs_close_devices(fs_devices);
 	return ret;
 }
 
@@ -828,6 +874,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
+		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -914,6 +961,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
 	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 454fe810332..4df6b1608f9 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -71,16 +71,16 @@ struct btrfs_fs_devices {
 	/* the device with this id has the most recent coyp of the super */
 	u64 latest_devid;
 	u64 latest_trans;
-	u64 lowest_devid;
 	u64 num_devices;
+	u64 open_devices;
 	struct block_device *latest_bdev;
-	struct block_device *lowest_bdev;
 	/* all of the devices in the FS */
 	struct list_head devices;
 
 	/* devices not currently being allocated */
 	struct list_head alloc_list;
 	struct list_head list;
+	int mounted;
 };
 
 struct btrfs_bio_stripe {
-- 
cgit v1.2.3


From 211c17f51f46dc6c308c742098273dd46b5ca59c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 May 2008 09:13:45 -0400
Subject: Fix corners in writepage and btrfs_truncate_page

The extent_io writepage calls needed an extra check for discarding
pages that started on the last byte in the file.

btrfs_truncate_page needed checks to make sure the page was still part
of the file after reading it, and most importantly, needed to wait for
all IO to the page to finish before freeing the corresponding extents on
disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 15 +++++++++------
 fs/btrfs/file.c      |  8 --------
 fs/btrfs/inode.c     |  9 ++++++++-
 3 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2a3624adc0c..17c508a941e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1988,9 +1988,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 nr_delalloc;
 	u64 delalloc_end;
 
+
 	WARN_ON(!PageLocked(page));
-	if (page->index > end_index) {
-		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
+	if (page->index > end_index ||
+	   (page->index == end_index && !page_offset)) {
+		page->mapping->a_ops->invalidatepage(page, 0);
 		unlock_page(page);
 		return 0;
 	}
@@ -1998,13 +2001,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	if (page->index == end_index) {
 		char *userpage;
 
-		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
-
 		userpage = kmap_atomic(page, KM_USER0);
-		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
-		flush_dcache_page(page);
+		memset(userpage + page_offset, 0,
+		       PAGE_CACHE_SIZE - page_offset);
 		kunmap_atomic(userpage, KM_USER0);
+		flush_dcache_page(page);
 	}
+	page_offset = 0;
 
 	set_page_extent_mapped(page);
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index df97d470812..d968900c697 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -305,7 +305,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	    (inline_size & (root->sectorsize -1)) == 0 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
 		u64 last_end;
-		u64 existing_delalloc = 0;
 
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
@@ -315,13 +314,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		last_end = (u64)(pages[num_pages -1]->index) <<
 				PAGE_CACHE_SHIFT;
 		last_end += PAGE_CACHE_SIZE - 1;
-		if (start_pos < isize) {
-			u64 delalloc_start = start_pos;
-			existing_delalloc = count_range_bits(io_tree,
-					     &delalloc_start,
-					     end_of_last_block, (u64)-1,
-					     EXTENT_DELALLOC);
-		}
 		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
 				 GFP_NOFS);
 		btrfs_add_ordered_inode(inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 08760ff9bab..40b4a8ec17f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1180,19 +1180,26 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 		goto out;
 
 	ret = -ENOMEM;
+again:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
+		if (page->mapping != mapping) {
+			unlock_page(page);
+			page_cache_release(page);
+			goto again;
+		}
 		if (!PageUptodate(page)) {
 			ret = -EIO;
 			goto out;
 		}
 	}
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 
+	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	wait_on_page_writeback(page);
 	ret = btrfs_cow_one_page(inode, page, offset);
 
 	unlock_page(page);
-- 
cgit v1.2.3


From cb03c743c6486e4efb8be6bdf7bf23177826d156 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 15 May 2008 16:15:45 -0400
Subject: Btrfs: Change the congestion functions to meter the number of async
 submits as well

The async submit workqueue was absorbing too many requests, leading to long
stalls for the async submitters.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/disk-io.c | 9 +++++++++
 2 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a9377fb16c3..293464c0c6b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -522,6 +522,7 @@ struct btrfs_fs_info {
 	struct work_struct async_submit_work;
 	spinlock_t end_io_work_lock;
 	spinlock_t async_submit_work_lock;
+	atomic_t nr_async_submits;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 264f297260f..373374340e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -453,6 +453,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	spin_lock(&fs_info->async_submit_work_lock);
 	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	atomic_inc(&fs_info->nr_async_submits);
 	spin_unlock(&fs_info->async_submit_work_lock);
 
 	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
@@ -906,10 +907,16 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
+	int limit = 256 * info->fs_devices->open_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 
+	if ((bdi_bits & (1 << BDI_write_congested)) &&
+	    atomic_read(&info->nr_async_submits) > limit) {
+		return 1;
+	}
+
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->bdev)
@@ -1117,6 +1124,7 @@ static void btrfs_async_submit_work(struct work_struct *work)
 		}
 		next = fs_info->async_submit_work_list.next;
 		list_del(next);
+		atomic_dec(&fs_info->nr_async_submits);
 		spin_unlock(&fs_info->async_submit_work_lock);
 
 		async = list_entry(next, struct async_submit_bio, list);
@@ -1179,6 +1187,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	atomic_set(&fs_info->nr_async_submits, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-- 
cgit v1.2.3


From 1c8cfcc1590d378c82ec75888f71e87fd26551e3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:06:51 -0400
Subject: Btrfs: Enable btree balancing on old kernels again

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 373374340e9..d8a256d3c83 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1719,7 +1719,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
@@ -1735,9 +1734,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
 	}
-#else
 	return;
-#endif
 }
 
 void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-- 
cgit v1.2.3


From 6af5ac3c1aede574acde5fc477f1a1c876fb568e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:14:57 -0400
Subject: Btrfs: Handle transid == 0 while opening devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 501d23d3ebf..3fbf0834e88 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -262,7 +262,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_brelse;
 
 		transid = btrfs_super_generation(disk_super);
-		if (transid > latest_transid) {
+		if (!latest_transid || transid > latest_transid) {
 			latest_devid = devid;
 			latest_transid = transid;
 			latest_bdev = bdev;
-- 
cgit v1.2.3


From 515dc3228c568f9df8b634f82b3471e20205ac16 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:30:15 -0400
Subject: Btrfs: Use kzalloc on the fs_devices allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3fbf0834e88..333e5269577 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -120,7 +120,7 @@ static int device_list_add(const char *path,
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
@@ -129,7 +129,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
-- 
cgit v1.2.3


From 0ef3e66b6700eb8f052daa8b89443ff872fbbdfc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 May 2008 14:04:53 -0400
Subject: Btrfs: Allocator fix variety pack

* Force chunk allocation when find_free_extent has to do a full scan
* Record the max key at the start of defrag so it doesn't run forever
* Block groups might not be contiguous, make a forward search for the
  next block group in extent-tree.c
* Get rid of extra checks for total fs size
* Fix relocate_one_reference to avoid relocating the same file data block
  twice when referenced by an older transaction
* Use the open device count when allocating chunks so that we don't
  try to allocate from devices that don't exist

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   2 +-
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c | 273 +++++++++++++++++++++++++++++++++----------------
 fs/btrfs/tree-defrag.c |  17 +++
 fs/btrfs/volumes.c     |  12 +--
 5 files changed, 209 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02e571e6ee6..1d404bde3cf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -363,7 +363,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			other = btrfs_node_blockptr(parent, i - 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
-		if (close && i < end_slot - 2) {
+		if (!close && i < end_slot - 2) {
 			other = btrfs_node_blockptr(parent, i + 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 293464c0c6b..7b73a9c3d86 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -464,6 +464,7 @@ struct btrfs_space_info {
 	u64 bytes_used;
 	u64 bytes_pinned;
 	int full;
+	int force_alloc;
 	struct list_head list;
 };
 
@@ -589,6 +590,7 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	struct btrfs_key defrag_progress;
+	struct btrfs_key defrag_max;
 	int defrag_running;
 	int defrag_level;
 	char *name;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 605018c6045..41a63462d3e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -136,6 +136,35 @@ err:
 	return 0;
 }
 
+struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
+						       btrfs_fs_info *info,
+							 u64 bytenr)
+{
+	struct extent_io_tree *block_group_cache;
+	struct btrfs_block_group_cache *block_group = NULL;
+	u64 ptr;
+	u64 start;
+	u64 end;
+	int ret;
+
+	bytenr = max_t(u64, bytenr,
+		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
+	block_group_cache = &info->block_group_cache;
+	ret = find_first_extent_bit(block_group_cache,
+				    bytenr, &start, &end,
+				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				    BLOCK_GROUP_SYSTEM);
+	if (ret) {
+		return NULL;
+	}
+	ret = get_state_private(block_group_cache, start, &ptr);
+	if (ret)
+		return NULL;
+
+	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
+	return block_group;
+}
+
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
@@ -175,7 +204,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 
 static int noinline find_search_start(struct btrfs_root *root,
 			      struct btrfs_block_group_cache **cache_ret,
-			      u64 *start_ret, int num, int data)
+			      u64 *start_ret, u64 num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
@@ -188,21 +217,21 @@ static int noinline find_search_start(struct btrfs_root *root,
 	u64 search_start = *start_ret;
 	int wrapped = 0;
 
-	if (!cache)
-		goto out;
-
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
+	if (!cache)
+		goto out;
+
 again:
 	ret = cache_block_group(root, cache);
-	if (ret)
+	if (ret) {
 		goto out;
+	}
 
 	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data) || cache->ro) {
+	if (!block_group_bits(cache, data) || cache->ro)
 		goto new_group;
-	}
 
 	spin_lock_irq(&free_space_cache->lock);
 	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
@@ -217,20 +246,17 @@ again:
 		start = max(last, state->start);
 		last = state->end + 1;
 		if (last - start < num) {
-			if (last == cache->key.objectid + cache->key.offset)
-				cache_miss = start;
 			do {
 				state = extent_state_next(state);
 			} while(state && !(state->state & EXTENT_DIRTY));
 			continue;
 		}
 		spin_unlock_irq(&free_space_cache->lock);
-		if (cache->ro)
+		if (cache->ro) {
 			goto new_group;
+		}
 		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
-		if (start + num  > total_fs_bytes)
-			goto new_group;
 		if (!block_group_bits(cache, data)) {
 			printk("block group bits don't match %Lu %d\n", cache->flags, data);
 		}
@@ -248,7 +274,7 @@ out:
 new_group:
 	last = cache->key.objectid + cache->key.offset;
 wrapped:
-	cache = btrfs_lookup_block_group(root->fs_info, last);
+	cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	if (!cache || cache->key.objectid >= total_fs_bytes) {
 no_cache:
 		if (!wrapped) {
@@ -261,13 +287,13 @@ no_cache:
 	if (cache_miss && !cache->cached) {
 		cache_block_group(root, cache);
 		last = cache_miss;
-		cache = btrfs_lookup_block_group(root->fs_info, last);
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	}
+	cache_miss = 0;
 	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
-	cache_miss = 0;
 	goto again;
 }
 
@@ -303,28 +329,26 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
 	u64 last = 0;
-	u64 hint_last;
 	u64 start;
 	u64 end;
 	u64 free_check;
 	u64 ptr;
-	u64 total_fs_bytes;
 	int bit;
 	int ret;
 	int full_search = 0;
 	int factor = 10;
+	int wrapped = 0;
 
 	block_group_cache = &info->block_group_cache;
-	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA)
 		factor = 9;
 
 	bit = block_group_state_bits(data);
 
-	if (search_start && search_start < total_fs_bytes) {
+	if (search_start) {
 		struct btrfs_block_group_cache *shint;
-		shint = btrfs_lookup_block_group(info, search_start);
+		shint = btrfs_lookup_first_block_group(info, search_start);
 		if (shint && block_group_bits(shint, data) && !shint->ro) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
@@ -333,24 +357,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && !hint->ro && block_group_bits(hint, data) &&
-	    hint->key.objectid < total_fs_bytes) {
+	if (hint && !hint->ro && block_group_bits(hint, data)) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
 		last = hint->key.objectid + hint->key.offset;
-		hint_last = last;
 	} else {
 		if (hint)
-			hint_last = max(hint->key.objectid, search_start);
+			last = max(hint->key.objectid, search_start);
 		else
-			hint_last = search_start;
-
-		if (hint_last >= total_fs_bytes)
-			hint_last = search_start;
-		last = hint_last;
+			last = search_start;
 	}
 again:
 	while(1) {
@@ -360,23 +378,17 @@ again:
 			break;
 
 		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret)
-			break;
+		if (ret) {
+			last = end + 1;
+			continue;
+		}
 
 		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (cache->key.objectid > total_fs_bytes)
-			break;
-
 		if (!cache->ro && block_group_bits(cache, data)) {
-			if (full_search)
-				free_check = cache->key.offset;
-			else
-				free_check = div_factor(cache->key.offset,
-							factor);
-
+			free_check = div_factor(cache->key.offset, factor);
 			if (used + cache->pinned < free_check) {
 				found_group = cache;
 				goto found;
@@ -384,9 +396,15 @@ again:
 		}
 		cond_resched();
 	}
-	if (!full_search) {
+	if (!wrapped) {
+		last = search_start;
+		wrapped = 1;
+		goto again;
+	}
+	if (!full_search && factor < 10) {
 		last = search_start;
 		full_search = 1;
+		factor = 10;
 		goto again;
 	}
 found:
@@ -1070,6 +1088,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_used = bytes_used;
 	found->bytes_pinned = 0;
 	found->full = 0;
+	found->force_alloc = 0;
 	*space_info = found;
 	return 0;
 }
@@ -1120,7 +1139,7 @@ static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags)
+			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	u64 thresh;
@@ -1138,11 +1157,16 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+	if (space_info->force_alloc) {
+		force = 1;
+		space_info->force_alloc = 0;
+	}
 	if (space_info->full)
 		return 0;
 
 	thresh = div_factor(space_info->total_bytes, 6);
-	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
+	if (!force &&
+	   (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
 		return 0;
 
@@ -1152,7 +1176,6 @@ printk("space info full %Lu\n", flags);
 		space_info->full = 1;
 		return 0;
 	}
-
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
@@ -1619,11 +1642,16 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
+	int allowed_chunk_alloc = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
+	if (orig_root->ref_cows || empty_size)
+		allowed_chunk_alloc = 1;
+
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
 		empty_cluster = 256 * 1024;
@@ -1648,7 +1676,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
 	if (hint_byte) {
-		block_group = btrfs_lookup_block_group(info, hint_byte);
+		block_group = btrfs_lookup_first_block_group(info, hint_byte);
 		if (!block_group)
 			hint_byte = search_start;
 		block_group = btrfs_find_block_group(root, block_group,
@@ -1666,17 +1694,28 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 check_failed:
 	if (!block_group) {
-		block_group = btrfs_lookup_block_group(info, search_start);
+		block_group = btrfs_lookup_first_block_group(info,
+							     search_start);
 		if (!block_group)
-			block_group = btrfs_lookup_block_group(info,
+			block_group = btrfs_lookup_first_block_group(info,
 						       orig_search_start);
 	}
+	if (full_scan && !chunk_alloc_done) {
+		if (allowed_chunk_alloc) {
+			do_chunk_alloc(trans, root,
+				     num_bytes + 2 * 1024 * 1024, data, 1);
+			allowed_chunk_alloc = 0;
+		} else if (block_group && block_group_bits(block_group, data)) {
+			block_group->space_info->force_alloc = 1;
+		}
+		chunk_alloc_done = 1;
+	}
 	ret = find_search_start(root, &block_group, &search_start,
 				total_needed, data);
 	if (ret == -ENOSPC && last_ptr && *last_ptr) {
 		*last_ptr = 0;
-		block_group = btrfs_lookup_block_group(info,
-						       orig_search_start);
+		block_group = btrfs_lookup_first_block_group(info,
+							     orig_search_start);
 		search_start = orig_search_start;
 		ret = find_search_start(root, &block_group, &search_start,
 					total_needed, data);
@@ -1692,7 +1731,7 @@ check_failed:
 			empty_size += empty_cluster;
 			total_needed += empty_size;
 		}
-		block_group = btrfs_lookup_block_group(info,
+		block_group = btrfs_lookup_first_block_group(info,
 						       orig_search_start);
 		search_start = orig_search_start;
 		ret = find_search_start(root, &block_group,
@@ -1765,7 +1804,7 @@ enospc:
 		} else
 			wrapped = 1;
 	}
-	block_group = btrfs_lookup_block_group(info, search_start);
+	block_group = btrfs_lookup_first_block_group(info, search_start);
 	cond_resched();
 	block_group = btrfs_find_block_group(root, block_group,
 					     search_start, data, 0);
@@ -1819,17 +1858,21 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 again:
 	data = reduce_alloc_profile(root, data);
-	if (root->ref_cows) {
+	/*
+	 * the only place that sets empty_size is btrfs_realloc_node, which
+	 * is not called recursively on allocations
+	 */
+	if (empty_size || root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     2 * 1024 * 1024,
-					     BTRFS_BLOCK_GROUP_METADATA |
-					     (info->metadata_alloc_profile &
-					      info->avail_metadata_alloc_bits));
+				     2 * 1024 * 1024,
+				     BTRFS_BLOCK_GROUP_METADATA |
+				     (info->metadata_alloc_profile &
+				      info->avail_metadata_alloc_bits), 0);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data);
+				     num_bytes + 2 * 1024 * 1024, data, 0);
 		BUG_ON(ret);
 	}
 
@@ -1842,6 +1885,8 @@ again:
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
 		num_bytes = max(num_bytes, min_alloc_size);
+		do_chunk_alloc(trans, root->fs_info->extent_root,
+			       num_bytes, data, 1);
 		goto again;
 	}
 	if (ret) {
@@ -2537,7 +2582,11 @@ out:
  */
 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 				  struct btrfs_path *path,
-				  struct btrfs_key *extent_key)
+				  struct btrfs_key *extent_key,
+				  u64 *last_file_objectid,
+				  u64 *last_file_offset,
+				  u64 *last_file_root,
+				  u64 last_extent)
 {
 	struct inode *inode;
 	struct btrfs_root *found_root;
@@ -2576,6 +2625,12 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		found_key.offset = ref_offset;
 		level = 0;
 
+		if (last_extent == extent_key->objectid &&
+		    *last_file_objectid == ref_objectid &&
+		    *last_file_offset == ref_offset &&
+		    *last_file_root == ref_root)
+			goto out;
+
 		ret = find_root_for_ref(extent_root, path, &found_key,
 					level, 1, &found_root,
 					extent_key->objectid);
@@ -2583,6 +2638,12 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		if (ret)
 			goto out;
 
+		if (last_extent == extent_key->objectid &&
+		    *last_file_objectid == ref_objectid &&
+		    *last_file_offset == ref_offset &&
+		    *last_file_root == ref_root)
+			goto out;
+
 		mutex_unlock(&extent_root->fs_info->fs_mutex);
 		inode = btrfs_iget_locked(extent_root->fs_info->sb,
 					  ref_objectid, found_root);
@@ -2603,6 +2664,10 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			mutex_lock(&extent_root->fs_info->fs_mutex);
 			goto out;
 		}
+		*last_file_objectid = inode->i_ino;
+		*last_file_root = found_root->root_key.objectid;
+		*last_file_offset = ref_offset;
+
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
 		iput(inode);
 		mutex_lock(&extent_root->fs_info->fs_mutex);
@@ -2643,6 +2708,8 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			path->nodes[i] = NULL;
 		}
 		btrfs_release_path(found_root, path);
+		if (found_root == found_root->fs_info->extent_root)
+			btrfs_extent_post_op(trans, found_root);
 		btrfs_end_transaction(trans, found_root);
 	}
 
@@ -2678,6 +2745,10 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	u64 last_file_objectid = 0;
+	u64 last_file_root = 0;
+	u64 last_file_offset = (u64)-1;
+	u64 last_extent = 0;
 	u32 nritems;
 	u32 item_size;
 	int ret = 0;
@@ -2722,9 +2793,13 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 
-		ret = relocate_one_reference(extent_root, path, extent_key);
+		ret = relocate_one_reference(extent_root, path, extent_key,
+					     &last_file_objectid,
+					     &last_file_offset,
+					     &last_file_root, last_extent);
 		if (ret)
 			goto out;
+		last_extent = extent_key->objectid;
 	}
 	ret = 0;
 out:
@@ -2770,6 +2845,32 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+int __alloc_chunk_for_shrink(struct btrfs_root *root,
+		     struct btrfs_block_group_cache *shrink_block_group,
+		     int force)
+{
+	struct btrfs_trans_handle *trans;
+	u64 new_alloc_flags;
+	u64 calc;
+
+	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+
+		trans = btrfs_start_transaction(root, 1);
+		new_alloc_flags = update_block_group_flags(root,
+						   shrink_block_group->flags);
+		if (new_alloc_flags != shrink_block_group->flags) {
+			calc =
+			     btrfs_block_group_used(&shrink_block_group->item);
+		} else {
+			calc = shrink_block_group->key.offset;
+		}
+		do_chunk_alloc(trans, root->fs_info->extent_root,
+			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+		btrfs_end_transaction(trans, root);
+	}
+	return 0;
+}
+
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
@@ -2778,7 +2879,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 cur_byte;
 	u64 total_found;
 	u64 shrink_last_byte;
-	u64 new_alloc_flags;
 	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
@@ -2792,7 +2892,8 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 						      shrink_start);
 	BUG_ON(!shrink_block_group);
 
-	shrink_last_byte = shrink_start + shrink_block_group->key.offset;
+	shrink_last_byte = shrink_block_group->key.objectid +
+		shrink_block_group->key.offset;
 
 	shrink_block_group->space_info->total_bytes -=
 		shrink_block_group->key.offset;
@@ -2804,23 +2905,10 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	       (unsigned long long)shrink_start,
 	       (unsigned long long)shrink_block_group->flags);
 
+	__alloc_chunk_for_shrink(root, shrink_block_group, 1);
+
 again:
-	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
-		u64 calc;
 
-		trans = btrfs_start_transaction(root, 1);
-		new_alloc_flags = update_block_group_flags(root,
-						   shrink_block_group->flags);
-		if (new_alloc_flags != shrink_block_group->flags) {
-			calc =
-			     btrfs_block_group_used(&shrink_block_group->item);
-		} else {
-			calc = shrink_block_group->key.offset;
-		}
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       calc + 2 * 1024 * 1024, new_alloc_flags);
-		btrfs_end_transaction(trans, root);
-	}
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
@@ -2888,6 +2976,8 @@ next:
 
 		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
 		    found_key.objectid + found_key.offset <= cur_byte) {
+			memcpy(&key, &found_key, sizeof(key));
+			key.offset++;
 			path->slots[0]++;
 			goto next;
 		}
@@ -2897,6 +2987,7 @@ next:
 		key.objectid = cur_byte;
 		btrfs_release_path(root, path);
 		ret = relocate_one_extent(root, path, &found_key);
+		__alloc_chunk_for_shrink(root, shrink_block_group, 0);
 	}
 
 	btrfs_release_path(root, path);
@@ -2930,20 +3021,27 @@ next:
 	if (ret < 0)
 		goto out;
 
-	leaf = path->nodes[0];
-	nritems = btrfs_header_nritems(leaf);
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	kfree(shrink_block_group);
-
-	clear_extent_bits(&info->block_group_cache, found_key.objectid,
-			  found_key.objectid + found_key.offset - 1,
+	clear_extent_bits(&info->block_group_cache, key.objectid,
+			  key.objectid + key.offset - 1,
 			  (unsigned int)-1, GFP_NOFS);
 
+
+	clear_extent_bits(&info->free_space_cache,
+			   key.objectid, key.objectid + key.offset - 1,
+			   (unsigned int)-1, GFP_NOFS);
+
+	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
+	kfree(shrink_block_group);
+
 	btrfs_del_item(trans, root, path);
-	clear_extent_dirty(&info->free_space_cache,
-			   shrink_start, shrink_last_byte - 1,
-			   GFP_NOFS);
 	btrfs_commit_transaction(trans, root);
+
+	/* the code to unpin extents might set a few bits in the free
+	 * space cache for this range again
+	 */
+	clear_extent_bits(&info->free_space_cache,
+			   key.objectid, key.objectid + key.offset - 1,
+			   (unsigned int)-1, GFP_NOFS);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -3081,9 +3179,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
-
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
-	memset(&cache->item, 0, sizeof(cache->item));
+
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
 	cache->flags = type;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index c02e2bf2f02..155961c7b4d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -198,6 +198,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	if (root->defrag_progress.objectid == 0) {
+		u32 nritems;
+
+		nritems = btrfs_header_nritems(root->node);
+		root->defrag_max.objectid = 0;
+		/* from above we know this is not a leaf */
+		btrfs_node_key_to_cpu(root->node, &root->defrag_max,
+				      nritems - 1);
 		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 		BUG_ON(ret);
@@ -254,6 +261,16 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 out:
 	if (path)
 		btrfs_free_path(path);
+	if (ret == -EAGAIN) {
+		if (root->defrag_max.objectid > root->defrag_progress.objectid)
+			goto done;
+		if (root->defrag_max.type > root->defrag_progress.type)
+			goto done;
+		if (root->defrag_max.offset > root->defrag_progress.offset)
+			goto done;
+		ret = 0;
+	}
+done:
 	if (ret != -EAGAIN) {
 		memset(&root->defrag_progress, 0,
 		       sizeof(root->defrag_progress));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 333e5269577..722eb455015 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -750,10 +750,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	if (bdev == fs_devices->latest_bdev)
 		fs_devices->latest_bdev = next_dev->bdev;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
-				    total_bytes - device->total_bytes);
-
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes - 1);
@@ -849,6 +845,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	}
 	root->fs_info->fs_devices->num_devices--;
+	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
@@ -873,7 +870,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
-		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -1450,7 +1446,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1459,13 +1455,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
-				  btrfs_super_num_devices(&info->super_copy));
+			    extent_root->fs_info->fs_devices->open_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
-- 
cgit v1.2.3


From da496f2acf61153e0d4d42ded6a53f3367939db0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 27 May 2008 10:52:17 -0400
Subject: Btrfs: Always use the async submission queue for checksummed writes

This avoids IO stalls and poorly ordered IO from inline writers mixing in
with the async submission queue.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d8a256d3c83..016c0c0226e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -434,13 +434,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 {
 	struct async_submit_bio *async;
 
-	/*
-	 * inline writerback should stay inline, only hop to the async
-	 * queue if we're pdflush
-	 */
-	if (!current_is_pdflush())
-		return submit_bio_hook(inode, rw, bio, mirror_num);
-
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
 		return -ENOMEM;
-- 
cgit v1.2.3


From e1b81e6761bd8419146d4bbe0aadd8b2d348a01c Mon Sep 17 00:00:00 2001
From: Mingming <cmm@us.ibm.com>
Date: Tue, 27 May 2008 10:55:43 -0400
Subject: btrfs delete ordered inode handling fix

Use btrfs_release_file instead of a put_inode call

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c         |  7 +++++++
 fs/btrfs/inode.c        | 24 +-----------------------
 fs/btrfs/ordered-data.c | 21 ++++++++++++++-------
 fs/btrfs/ordered-data.h |  2 +-
 fs/btrfs/super.c        |  1 -
 5 files changed, 23 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d968900c697..c4fa4664a45 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -978,6 +978,12 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
+static int btrfs_release_file (struct inode * inode, struct file * filp)
+{
+	btrfs_del_ordered_inode(inode);
+	return 0;
+}
+
 static int btrfs_sync_file(struct file *file,
 			   struct dentry *dentry, int datasync)
 {
@@ -1044,6 +1050,7 @@ struct file_operations btrfs_file_operations = {
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
+	.release	= btrfs_release_file,
 	.fsync		= btrfs_sync_file,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40b4a8ec17f..1569fb86451 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -857,15 +857,11 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	if (inode->i_nlink == 0) {
-		int found;
 		/* if the inode isn't linked anywhere,
 		 * we don't need to worry about
 		 * data=ordered
 		 */
-		found = btrfs_del_ordered_inode(inode);
-		if (found == 1) {
-			atomic_dec(&inode->i_count);
-		}
+		btrfs_del_ordered_inode(inode);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1271,24 +1267,6 @@ fail:
 	return err;
 }
 
-void btrfs_put_inode(struct inode *inode)
-{
-	int ret;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
-		return;
-
-	ret = btrfs_del_ordered_inode(inode);
-	if (ret == 1) {
-		atomic_dec(&inode->i_count);
-	}
-}
-
 void btrfs_delete_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 3814cb0246b..8dd8180183f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -231,7 +231,7 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 	return 1;
 }
 
-static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				     struct inode *inode,
 				     u64 root_objectid, u64 objectid)
 {
@@ -243,31 +243,38 @@ static int __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
 	if (!node) {
 		write_unlock(&tree->lock);
-		return 0;
+		return;
 	}
 	rb_erase(node, &tree->tree);
 	BTRFS_I(inode)->ordered_trans = 0;
 	write_unlock(&tree->lock);
+	atomic_dec(&inode->i_count);
 	entry = rb_entry(node, struct tree_entry, rb_node);
 	kfree(entry);
-	return 1;
+	return;
 }
 
-int btrfs_del_ordered_inode(struct inode *inode)
+void btrfs_del_ordered_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
-	int ret = 0;
+
+	if (!BTRFS_I(inode)->ordered_trans) {
+		return;
+	}
+
+	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
 	if (root->fs_info->running_transaction) {
 		struct btrfs_ordered_inode_tree *tree;
 		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		ret = __btrfs_del_ordered_inode(tree, inode, root_objectid,
+		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
 						inode->i_ino);
 	}
 	spin_unlock(&root->fs_info->new_trans_lock);
-	return ret;
 }
 
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 29047e0abaa..c515c4b3999 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -38,6 +38,6 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-int btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 273a5b511f5..05029656e42 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -487,7 +487,6 @@ static void btrfs_unlockfs(struct super_block *sb)
 
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
-	.put_inode	= btrfs_put_inode,
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
-- 
cgit v1.2.3


From 48ec2cf8738b0db53dc8361026cb7a328f7ad386 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 9 Jun 2008 09:35:50 -0400
Subject: Btrfs: Fix race in running_transaction checks

When a new transaction was started, the code would incorrectly
set the pointer in fs_info before all the data structures were set up.
Fsync-heavy workloads hit races on the setup of the ordered inode spinlock.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 57746c11eae..1cb084efd6e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -56,7 +56,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 		total_trans++;
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
-		root->fs_info->running_transaction = cur_trans;
 		root->fs_info->last_alloc = 0;
 		root->fs_info->last_data_alloc = 0;
 		cur_trans->num_writers = 1;
@@ -74,6 +73,9 @@ static noinline int join_transaction(struct btrfs_root *root)
 		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
+		spin_lock(&root->fs_info->new_trans_lock);
+		root->fs_info->running_transaction = cur_trans;
+		spin_unlock(&root->fs_info->new_trans_lock);
 	} else {
 		cur_trans->num_writers++;
 		cur_trans->num_joined++;
-- 
cgit v1.2.3


From 3b96362cc8d314c935c335d5c3c42eb93c23166b Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Mon, 9 Jun 2008 21:57:42 -0400
Subject: Btrfs: Invalidate dcache entry after creating snapshot or subvolume

We need to invalidate an existing dcache entry after creating a new
snapshot or subvolume, because a negative dcache entry will stop us from
accessing the new snapshot or subvolume.

---
  ctree.h       |   23 +++++++++++++++++++++++
  inode.c       |    4 ++++
  transaction.c |    4 ++++
  3 files changed, 31 insertions(+)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/inode.c       | 29 +++++++++++++++++++++++++++++
 fs/btrfs/transaction.c |  8 +++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7b73a9c3d86..ad4eacca7f5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1544,6 +1544,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
+				  int namelen);
+
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1569fb86451..31aa4ba06fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2642,6 +2642,31 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_throttle(root);
 }
 
+/*
+ * Invalidate a single dcache entry at the root of the filesystem.
+ * Needed after creation of snapshot or subvolume.
+ */
+void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
+				  int namelen)
+{
+	struct dentry *alias, *entry;
+	struct qstr qstr;
+
+	alias = d_find_alias(root->fs_info->sb->s_root->d_inode);
+	if (alias) {
+		qstr.name = name;
+		qstr.len = namelen;
+		/* change me if btrfs ever gets a d_hash operation */
+		qstr.hash = full_name_hash(qstr.name, qstr.len);
+		entry = d_lookup(alias, &qstr);
+		dput(alias);
+		if (entry) {
+			d_invalidate(entry);
+			dput(entry);
+		}
+	}
+}
+
 static int noinline create_subvol(struct btrfs_root *root, char *name,
 				  int namelen)
 {
@@ -2761,6 +2786,10 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 	ret = btrfs_update_inode(trans, new_root, inode);
 	if (ret)
 		goto fail;
+
+	/* Invalidate existing dcache entry for new subvolume. */
+	btrfs_invalidate_dcache_root(root, name, namelen);
+
 fail:
 	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, new_root);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1cb084efd6e..f04684f7fea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -560,6 +560,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *root = pending->root;
 	struct extent_buffer *tmp;
 	int ret;
+	int namelen;
 	u64 objectid;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -595,8 +596,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	 * insert the directory item
 	 */
 	key.offset = (u64)-1;
+	namelen = strlen(pending->name);
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    pending->name, strlen(pending->name),
+				    pending->name, namelen,
 				    root->fs_info->sb->s_root->d_inode->i_ino,
 				    &key, BTRFS_FT_DIR);
 
@@ -606,6 +608,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     pending->name, strlen(pending->name), objectid,
 			     root->fs_info->sb->s_root->d_inode->i_ino);
+
+	/* Invalidate existing dcache entry for new snapshot. */
+	btrfs_invalidate_dcache_root(root, pending->name, namelen);
+
 fail:
 	kfree(new_root_item);
 	return ret;
-- 
cgit v1.2.3


From f819d837eea9829ed6c356fb88cdd0a170cbd947 Mon Sep 17 00:00:00 2001
From: Linda Knippers <linda.knippers@hp.com>
Date: Mon, 9 Jun 2008 22:17:11 -0400
Subject: btrfsctl -A error code fixup

Send the error back to userland if the ioctl fails

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 05029656e42..77f44494e22 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -453,7 +453,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 {
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
-	int ret;
+	int ret = 0;
 	int len;
 
 	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
@@ -470,7 +470,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	}
 out:
 	kfree(vol);
-	return 0;
+	return ret;
 }
 
 static void btrfs_write_super_lockfs(struct super_block *sb)
-- 
cgit v1.2.3


From 51ebc0d3d5cd1a2728068499a8fc1bdb0a581ab5 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Mon, 9 Jun 2008 22:19:40 -0400
Subject: Btrfs: bdi_init and bdi_destroy come with 2.6.23

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 016c0c0226e..3805e7eab82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -984,7 +984,7 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
 #endif
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
@@ -1378,7 +1378,7 @@ fail:
 
 	kfree(extent_root);
 	kfree(tree_root);
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
 #endif
 	kfree(fs_info);
@@ -1645,7 +1645,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
 #endif
 
-- 
cgit v1.2.3


From eba12c7bfcb4855fc757357e5e5b0b9a474499ba Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Mon, 9 Jun 2008 22:21:46 -0400
Subject: Btrfs: Disable acl xattr handlers

The acl code is not yet complete, and the xattr handlers are causing
problems for cp -p on some distros.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/xattr.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 0a4950400f8..f63488dc2f1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,22 +29,22 @@
 static struct xattr_handler *btrfs_xattr_handler_map[] = {
 	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
-	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+//	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
+//	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
 #endif
 	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
 	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
-	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
+//	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
 };
 struct xattr_handler *btrfs_xattr_handlers[] = {
 	&btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-	&btrfs_xattr_acl_access_handler,
-	&btrfs_xattr_acl_default_handler,
+//	&btrfs_xattr_acl_access_handler,
+//	&btrfs_xattr_acl_default_handler,
 #endif
 	&btrfs_xattr_trusted_handler,
 	&btrfs_xattr_security_handler,
-	&btrfs_xattr_system_handler,
+//	&btrfs_xattr_system_handler,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 6bf13c0cc833bf5ba013d6aa60379484bf48c4e6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 10 Jun 2008 10:07:39 -0400
Subject: Btrfs: transaction ioctls

These ioctls let a user application hold a transaction open while it
performs a series of operations.  A final ioctl does a sync on the fs
(closing the current transaction).  This is the main requirement for
Ceph's OSD to be able to keep the data it's storing in a btrfs volume
consistent, and AFAICS it works just fine.  The application would do
something like

	fd = ::open("some/file", O_RDONLY);
	::ioctl(fd, BTRFS_IOC_TRANS_START);
	/* do a bunch of stuff */
	::ioctl(fd, BTRFS_IOC_TRANS_END);
or just
	::close(fd);

And to ensure it commits to disk,

	::ioctl(fd, BTRFS_IOC_SYNC);

When a transaction is held open, the trans_handle is attached to the
struct file (via private_data) so that it will get cleaned up if the
process dies unexpectedly.  A held transaction is also ended on fsync() to
avoid a deadlock.

A misbehaving application could also deliberately hold a transaction open,
effectively locking up the FS, so it may make sense to restrict these
ioctls to root or similar.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  4 ++++
 fs/btrfs/file.c  |  7 ++++++-
 fs/btrfs/inode.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |  9 ++++++++
 fs/btrfs/super.c |  2 +-
 5 files changed, 83 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad4eacca7f5..1dcf4fb5b68 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1575,6 +1575,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
+long btrfs_ioctl_trans_end(struct file *file);
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
@@ -1595,6 +1596,8 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_release_file(struct inode *inode, struct file *file);
+
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only);
@@ -1615,4 +1618,5 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(char *options, struct btrfs_root *root,
 			char **subvol_name);
+int btrfs_sync_fs(struct super_block *sb, int wait);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c4fa4664a45..73c6d085bd9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -978,9 +978,11 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
-static int btrfs_release_file (struct inode * inode, struct file * filp)
+int btrfs_release_file(struct inode * inode, struct file * filp)
 {
 	btrfs_del_ordered_inode(inode);
+	if (filp->private_data)
+		btrfs_ioctl_trans_end(filp);
 	return 0;
 }
 
@@ -1011,6 +1013,9 @@ static int btrfs_sync_file(struct file *file,
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
+	if (file->private_data)
+		btrfs_ioctl_trans_end(file);
+
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		ret = -ENOMEM;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 31aa4ba06fc..0f14697bece 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3336,6 +3336,61 @@ out_fput:
 	return ret;
 }
 
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_start(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (file->private_data) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+	trans = btrfs_start_transaction(root, 0);
+	if (trans)
+		file->private_data = trans;
+	else
+		ret = -ENOMEM;
+	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = file->private_data;
+	if (!trans) {
+		ret = -EINVAL;
+		goto out;
+	}
+	btrfs_end_transaction(trans, root);
+	file->private_data = 0;
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3356,6 +3411,13 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
 		return btrfs_ioctl_clone(file, arg);
+	case BTRFS_IOC_TRANS_START:
+		return btrfs_ioctl_trans_start(file);
+	case BTRFS_IOC_TRANS_END:
+		return btrfs_ioctl_trans_end(file);
+	case BTRFS_IOC_SYNC:
+		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
 	}
 
 	return -ENOTTY;
@@ -3679,6 +3741,7 @@ static struct file_operations btrfs_dir_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
 #endif
+	.release        = btrfs_release_file,
 };
 
 static struct extent_io_ops btrfs_extent_io_ops = {
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b0e73f51d63..85ed35a775b 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -36,6 +36,14 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 				   struct btrfs_ioctl_vol_args)
+/* trans start and trans end are dangerous, and only for
+ * use by applications that know how to avoid the
+ * resulting deadlocks
+ */
+#define BTRFS_IOC_TRANS_START  _IO(BTRFS_IOCTL_MAGIC, 6)
+#define BTRFS_IOC_TRANS_END    _IO(BTRFS_IOCTL_MAGIC, 7)
+#define BTRFS_IOC_SYNC         _IO(BTRFS_IOCTL_MAGIC, 8)
+
 #define BTRFS_IOC_CLONE        _IOW(BTRFS_IOCTL_MAGIC, 9, int)
 #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
 				   struct btrfs_ioctl_vol_args)
@@ -43,4 +51,5 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
+
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 77f44494e22..39bb86945ed 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -293,7 +293,7 @@ fail_close:
 	return err;
 }
 
-static int btrfs_sync_fs(struct super_block *sb, int wait)
+int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
-- 
cgit v1.2.3


From 95c9eb178b96474e9d896f20f982818c5f2ceaa0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2008 10:20:57 -0400
Subject: btrfs: tiny makefile cleanup

use normal kbuild syntax to build acl.o conditionally and remove the
commented-out lines.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 5ac6f2946c2..e5fc3cfea0a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,13 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o
 
-ifeq ($(CONFIG_FS_POSIX_ACL),y)
-btrfs-y += acl.o
-endif
-#btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
-#	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
-#	  inode-map.o \
-
+btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
 
 # Normal Makefile
-- 
cgit v1.2.3


From 306929f364b993581c91596230807fa1c022268a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2008 10:21:04 -0400
Subject: btrfs: fix strange indentation in lookup_extent_mapping

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 268ad8facf6..ba46f7911d9 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -231,7 +231,13 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
-	struct rb_node *prev = NULL; struct rb_node *next = NULL; u64 end = range_end(start, len); em = tree->last; if (em && end > em->start && start < extent_map_end(em)) goto found;
+	struct rb_node *prev = NULL;
+	struct rb_node *next = NULL;
+	u64 end = range_end(start, len);
+
+	em = tree->last;
+	if (em && end > em->start && start < extent_map_end(em))
+		goto found;
 
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
-- 
cgit v1.2.3


From edf24abe51493ccda384644d487fe2f796ac21c8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2008 10:40:29 -0400
Subject: btrfs: sanity mount option parsing and early mount code

Also adds lots of comments to describe what's going on here.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |   3 +-
 fs/btrfs/disk-io.c |   5 +-
 fs/btrfs/super.c   | 241 ++++++++++++++++++++++++++++++-----------------------
 3 files changed, 141 insertions(+), 108 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1dcf4fb5b68..49cbc62b42f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1616,7 +1616,6 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode);
 /* super.c */
 u64 btrfs_parse_size(char *str);
-int btrfs_parse_options(char *options, struct btrfs_root *root,
-			char **subvol_name);
+int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3805e7eab82..b9a53646ceb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1266,8 +1266,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
-	btrfs_parse_options(options, tree_root, NULL);
+	err = btrfs_parse_options(tree_root, options);
+	if (err)
+		goto fail_sb_buffer;
 
+	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
 		       (unsigned long long)btrfs_super_num_devices(disk_super),
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 39bb86945ed..288300fa584 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -108,15 +108,18 @@ u64 btrfs_parse_size(char *str)
 	return res;
 }
 
-int btrfs_parse_options(char *options, struct btrfs_root *root,
-			char **subvol_name)
+/*
+ * Regular mount options parser.  Everything that is needed only when
+ * reading in a new superblock is parsed here.
+ */
+int btrfs_parse_options(struct btrfs_root *root, char *options)
 {
-	char * p;
-	struct btrfs_fs_info *info = NULL;
+	struct btrfs_fs_info *info = root->fs_info;
 	substring_t args[MAX_OPT_ARGS];
+	char *p, *num;
 
 	if (!options)
-		return 1;
+		return 0;
 
 	/*
 	 * strsep changes the string, duplicate it because parse_options
@@ -126,10 +129,8 @@ int btrfs_parse_options(char *options, struct btrfs_root *root,
 	if (!options)
 		return -ENOMEM;
 
-	if (root)
-		info = root->fs_info;
 
-	while ((p = strsep (&options, ",")) != NULL) {
+	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
 		if (!*p)
 			continue;
@@ -137,83 +138,64 @@ int btrfs_parse_options(char *options, struct btrfs_root *root,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_degraded:
-			if (info) {
-				printk("btrfs: allowing degraded mounts\n");
-				btrfs_set_opt(info->mount_opt, DEGRADED);
-			}
+			printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+			btrfs_set_opt(info->mount_opt, DEGRADED);
 			break;
 		case Opt_subvol:
-			if (subvol_name) {
-				*subvol_name = match_strdup(&args[0]);
-			}
+			/*
+			 * This one is parsed by btrfs_parse_early_options
+			 * and can be happily ignored here.
+			 */
 			break;
 		case Opt_nodatasum:
-			if (info) {
-				printk("btrfs: setting nodatacsum\n");
-				btrfs_set_opt(info->mount_opt, NODATASUM);
-			}
+			printk(KERN_INFO "btrfs: setting nodatacsum\n");
+			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-			if (info) {
-				printk("btrfs: setting nodatacow\n");
-				btrfs_set_opt(info->mount_opt, NODATACOW);
-				btrfs_set_opt(info->mount_opt, NODATASUM);
-			}
+			printk(KERN_INFO "btrfs: setting nodatacow\n");
+			btrfs_set_opt(info->mount_opt, NODATACOW);
+			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_ssd:
-			if (info) {
-				printk("btrfs: use ssd allocation scheme\n");
-				btrfs_set_opt(info->mount_opt, SSD);
-			}
+			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
+			btrfs_set_opt(info->mount_opt, SSD);
 			break;
 		case Opt_nobarrier:
-			if (info) {
-				printk("btrfs: turning off barriers\n");
-				btrfs_set_opt(info->mount_opt, NOBARRIER);
-			}
+			printk(KERN_INFO "btrfs: turning off barriers\n");
+			btrfs_set_opt(info->mount_opt, NOBARRIER);
 			break;
 		case Opt_max_extent:
-			if (info) {
-				char *num = match_strdup(&args[0]);
-				if (num) {
-					info->max_extent =
-						btrfs_parse_size(num);
-					kfree(num);
-
-					info->max_extent = max_t(u64,
-							 info->max_extent,
-							 root->sectorsize);
-					printk("btrfs: max_extent at %Lu\n",
-					       info->max_extent);
-				}
+			num = match_strdup(&args[0]);
+			if (num) {
+				info->max_extent = btrfs_parse_size(num);
+				kfree(num);
+
+				info->max_extent = max_t(u64,
+					info->max_extent, root->sectorsize);
+				printk(KERN_INFO "btrfs: max_extent at %llu\n",
+				       info->max_extent);
 			}
 			break;
 		case Opt_max_inline:
-			if (info) {
-				char *num = match_strdup(&args[0]);
-				if (num) {
-					info->max_inline =
-						btrfs_parse_size(num);
-					kfree(num);
-
-					info->max_inline = max_t(u64,
-							 info->max_inline,
-							 root->sectorsize);
-					printk("btrfs: max_inline at %Lu\n",
-					       info->max_inline);
-				}
+			num = match_strdup(&args[0]);
+			if (num) {
+				info->max_inline = btrfs_parse_size(num);
+				kfree(num);
+
+				info->max_inline = max_t(u64,
+					info->max_inline, root->sectorsize);
+				printk(KERN_INFO "btrfs: max_inline at %llu\n",
+					info->max_inline);
 			}
 			break;
 		case Opt_alloc_start:
-			if (info) {
-				char *num = match_strdup(&args[0]);
-				if (num) {
-					info->alloc_start =
-						btrfs_parse_size(num);
-					kfree(num);
-					printk("btrfs: allocations start at "
-					       "%Lu\n", info->alloc_start);
-				}
+			num = match_strdup(&args[0]);
+			if (num) {
+				info->alloc_start = btrfs_parse_size(num);
+				kfree(num);
+				printk(KERN_INFO
+					"btrfs: allocations start at %llu\n",
+					info->alloc_start);
 			}
 			break;
 		default:
@@ -221,7 +203,61 @@ int btrfs_parse_options(char *options, struct btrfs_root *root,
 		}
 	}
 	kfree(options);
-	return 1;
+	return 0;
+}
+
+/*
+ * Parse mount options that are required early in the mount process.
+ * *subvol_name must be NULL on entry and is set to a kmalloc'd name the
+ * caller frees.  All other options are parsed much later in the mount
+ * process, and only when we need to allocate a new super block.
+ */
+static int btrfs_parse_early_options(const char *options,
+			char **subvol_name)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *opts, *orig, *p;
+	int error = 0;
+
+	if (!options)
+		goto out;
+
+	/*
+	 * strsep changes the string and advances the pointer it is given, so
+	 * walk a copy and keep the start of the allocation in orig for kfree
+	 */
+	orig = kstrdup(options, GFP_KERNEL);
+	if (!orig)
+		return -ENOMEM;
+	opts = orig;
+
+	while ((p = strsep(&opts, ",")) != NULL) {
+		if (!*p)
+			continue;
+		switch (match_token(p, tokens, args)) {
+		case Opt_subvol:
+			/* a repeated subvol= replaces the earlier name */
+			kfree(*subvol_name);
+			*subvol_name = match_strdup(&args[0]);
+			break;
+		default:
+			break;
+		}
+	}
+
+	kfree(orig);
+ out:
+	/*
+	 * If no subvolume name is specified we use the default one.  Allocate
+	 * a copy of the string "default" here so that code later in the
+	 * mount path doesn't care if it's the default volume or another one.
+	 */
+	if (!*subvol_name) {
+		*subvol_name = kstrdup("default", GFP_KERNEL);
+		if (!*subvol_name)
+			return -ENOMEM;
+	}
+	return error;
 }
 
 static int btrfs_fill_super(struct super_block * sb,
@@ -328,23 +364,33 @@ static int btrfs_test_super(struct super_block *s, void *data)
 	return root->fs_info->fs_devices == test_fs_devices;
 }
 
-int btrfs_get_sb_bdev(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data,
-	struct vfsmount *mnt, const char *subvol)
+/*
+ * Find a superblock for the given device / mount point.
+ *
+ * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
+ *	  for multiple device setup.  Make sure to keep it in sync.
+ */
+static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
 {
+	char *subvol_name = NULL;
 	struct block_device *bdev = NULL;
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	int error = 0;
 
+	error = btrfs_parse_early_options(data, &subvol_name);
+	if (error)
+		goto error;
+
 	error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
 	if (error)
-		return error;
+		goto error_free_subvol_name;
 
 	error = btrfs_open_devices(fs_devices, flags, fs_type);
 	if (error)
-		return error;
+		goto error_free_subvol_name;
 
 	bdev = fs_devices->latest_bdev;
 	btrfs_lock_volumes();
@@ -378,51 +424,36 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	if (subvol) {
-		root = lookup_one_len(subvol, s->s_root, strlen(subvol));
-		if (IS_ERR(root)) {
-			up_write(&s->s_umount);
-			deactivate_super(s);
-			error = PTR_ERR(root);
-			goto error;
-		}
-		if (!root->d_inode) {
-			dput(root);
-			up_write(&s->s_umount);
-			deactivate_super(s);
-			error = -ENXIO;
-			goto error;
-		}
-	} else {
-		root = dget(s->s_root);
+	root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+	if (IS_ERR(root)) {
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		error = PTR_ERR(root);
+		goto error;
+	}
+	if (!root->d_inode) {
+		dput(root);
+		up_write(&s->s_umount);
+		deactivate_super(s);
+		error = -ENXIO;
+		goto error;
 	}
 
 	mnt->mnt_sb = s;
 	mnt->mnt_root = root;
+
+	kfree(subvol_name);
 	return 0;
 
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
 	btrfs_close_devices(fs_devices);
+error_free_subvol_name:
+	kfree(subvol_name);
 error:
 	return error;
 }
-/* end copy & paste */
-
-static int btrfs_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	int ret;
-	char *subvol_name = NULL;
-
-	btrfs_parse_options((char *)data, NULL, &subvol_name);
-	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt,
-			subvol_name ? subvol_name : "default");
-	if (subvol_name)
-		kfree(subvol_name);
-	return ret;
-}
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-- 
cgit v1.2.3


From 43e570b08a6c6b1d75f218566a6240542a386fd9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2008 10:40:46 -0400
Subject: btrfs: allow scanning multiple devices during mount

Allow specifying one or more device=/dev/foo options during mount
so that ioctls on the control device can be avoided.  This is especially
useful when trying to mount a multi-device setup as root.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 288300fa584..346932e546b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -65,7 +65,7 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_degraded, Opt_subvol, Opt_nodatasum, Opt_nodatacow,
+	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
 	Opt_ssd, Opt_err,
 };
@@ -73,6 +73,7 @@ enum {
 static match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
+	{Opt_device, "device=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
 	{Opt_nobarrier, "nobarrier"},
@@ -142,8 +143,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, DEGRADED);
 			break;
 		case Opt_subvol:
+		case Opt_device:
 			/*
-			 * This one is parsed by btrfs_parse_early_options
+			 * These are parsed by btrfs_parse_early_options
 			 * and can be happily ignored here.
 			 */
 			break;
@@ -212,8 +214,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
  * All other options will be parsed on much later in the mount process and
  * only when we need to allocate a new super block.
  */
-static int btrfs_parse_early_options(const char *options,
-			char **subvol_name)
+static int btrfs_parse_early_options(const char *options, int flags,
+		void *holder, char **subvol_name,
+		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *opts, *p;
@@ -240,11 +243,18 @@ static int btrfs_parse_early_options(const char *options,
 		case Opt_subvol:
 			*subvol_name = match_strdup(&args[0]);
 			break;
+		case Opt_device:
+			error = btrfs_scan_one_device(match_strdup(&args[0]),
+					flags, holder, fs_devices);
+			if (error)
+				goto out_free_opts;
+			break;
 		default:
 			break;
 		}
 	}
 
+ out_free_opts:
 	kfree(opts);
  out:
 	/*
@@ -380,7 +390,8 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	struct btrfs_fs_devices *fs_devices = NULL;
 	int error = 0;
 
-	error = btrfs_parse_early_options(data, &subvol_name);
+	error = btrfs_parse_early_options(data, flags, fs_type,
+					  &subvol_name, &fs_devices);
 	if (error)
 		goto error;
 
-- 
cgit v1.2.3


From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 16:50:36 -0400
Subject: Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across
other CPUs in the system.  But, workqueues only schedule work on the
same CPU that queued the work, giving them a limited benefit for systems with
higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads,
and changes the bio submission code to queue bios up.  The queueing is
important to make sure large numbers of procs on the system don't
turn streaming workloads into random workloads by sending IO down
concurrently.

The end result of all of this is much higher performance (and CPU usage) when
doing checksumming on large machines.  Two worker pools are created,
one for writes and one for endio processing.  The two could deadlock if
we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/async-thread.h |  78 +++++++++++++
 fs/btrfs/ctree.h        |  14 ++-
 fs/btrfs/disk-io.c      | 200 ++++++++++++++-------------------
 fs/btrfs/inode.c        |   4 +-
 fs/btrfs/volumes.c      | 162 ++++++++++++++++++++++++++-
 fs/btrfs/volumes.h      |  10 +-
 8 files changed, 626 insertions(+), 132 deletions(-)
 create mode 100644 fs/btrfs/async-thread.c
 create mode 100644 fs/btrfs/async-thread.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index e5fc3cfea0a..9dcfc2fe333 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o
+	   extent_io.o volumes.o async-thread.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 00000000000..2911b67bd6f
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+
+/*
+ * container for the kthread task pointer and the list of pending work.
+ * One of these is allocated per thread by btrfs_start_workers.
+ */
+struct btrfs_worker_thread {
+	/* list of struct btrfs_work that are waiting for service */
+	struct list_head pending;
+
+	/* links this thread into btrfs_workers->worker_list */
+	struct list_head worker_list;
+
+	/* kthread running worker_loop */
+	struct task_struct *task;
+
+	/* items queued or still running; dropped only after func returns */
+	atomic_t num_pending;
+
+	/* protects the pending list and working */
+	spinlock_t lock;
+
+	/* set when queued work kicks this thread, cleared before it sleeps */
+	int working;
+};
+
+/*
+ * main loop: drain the pending list, then sleep until woken or stopped
+ */
+static int worker_loop(void *arg)
+{
+	struct btrfs_worker_thread *worker = arg;
+	struct btrfs_work *work;
+	do {
+		spin_lock_irq(&worker->lock);
+		while (!list_empty(&worker->pending)) {
+			work = list_entry(worker->pending.next,
+					  struct btrfs_work, list);
+			list_del(&work->list);
+			clear_bit(0, &work->flags);
+			work->worker = worker;
+			spin_unlock_irq(&worker->lock);
+			work->func(work);
+			atomic_dec(&worker->num_pending);
+			spin_lock_irq(&worker->lock);
+		}
+		worker->working = 0;
+		if (freezing(current)) {
+			/* refrigerator() blocks until thawed: drop the lock */
+			spin_unlock_irq(&worker->lock);
+			refrigerator();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&worker->lock);
+			/* a stop request may have raced with the check below */
+			if (!kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * stop (kthread_stop waits for exit) and free every worker, emptying the pool
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+
+	while (!list_empty(&workers->worker_list)) {
+		worker = list_entry(workers->worker_list.next,
+				    struct btrfs_worker_thread, worker_list);
+		kthread_stop(worker->task);
+		list_del(&worker->worker_list);
+		kfree(worker);
+		workers->num_workers--;
+	}
+	workers->last = NULL;
+	return 0;
+}
+
+/*
+ * set up an empty pool: no threads yet, thread limit taken from max
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, int max)
+{
+	spin_lock_init(&workers->lock);
+	INIT_LIST_HEAD(&workers->worker_list);
+	workers->last = NULL;
+	workers->num_workers = 0;
+	workers->max_workers = max;
+}
+
+/*
+ * starts num_workers new threads without enforcing max_workers.  On
+ * failure the whole pool is stopped and a negative errno is returned.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	struct btrfs_worker_thread *worker;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < num_workers; i++) {
+		worker = kzalloc(sizeof(*worker), GFP_NOFS);
+		if (!worker) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+		INIT_LIST_HEAD(&worker->pending);
+		INIT_LIST_HEAD(&worker->worker_list);
+		spin_lock_init(&worker->lock);
+		atomic_set(&worker->num_pending, 0);
+		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		if (IS_ERR(worker->task)) {
+			ret = PTR_ERR(worker->task);
+			kfree(worker);
+			goto fail;
+		}
+
+		spin_lock_irq(&workers->lock);
+		list_add_tail(&worker->worker_list, &workers->worker_list);
+		workers->last = worker;
+		workers->num_workers++;
+		spin_unlock_irq(&workers->lock);
+	}
+	return 0;
+fail:
+	btrfs_stop_workers(workers);
+	return ret;
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now.  This can return null if the pool is empty, or if we
+ * aren't yet at the thread count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	struct list_head *next;
+	struct list_head *start;
+	int enforce_min = workers->num_workers < workers->max_workers;
+
+	/* an empty pool has no last thread to start from */
+	worker = workers->last;
+	if (!worker)
+		return NULL;
+
+	/* start with the last thread if it isn't busy */
+	if (atomic_read(&worker->num_pending) < 64)
+		goto done;
+
+	next = worker->worker_list.next;
+	start = &worker->worker_list;
+
+	/*
+	 * check all the workers for someone that is bored.  FIXME, do
+	 * something smart here
+	 */
+	while (next != start) {
+		if (next == &workers->worker_list) {
+			next = workers->worker_list.next;
+			continue;
+		}
+		worker = list_entry(next, struct btrfs_worker_thread,
+				    worker_list);
+		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
+			goto done;
+		next = next->next;
+	}
+	/* nobody was bored: at the max thread count, use the last thread */
+	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64)
+		return workers->last;
+	return NULL;
+done:
+	workers->last = worker;
+	return worker;
+}
+
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+
+again:
+	spin_lock_irqsave(&workers->lock, flags);
+	worker = next_worker(workers);
+	spin_unlock_irqrestore(&workers->lock, flags);
+
+	if (!worker) {
+		spin_lock_irqsave(&workers->lock, flags);
+		if (workers->num_workers >= workers->max_workers) {
+			/*
+			 * the pool is at its limit and nobody was idle,
+			 * just return the first thread on the list
+			 */
+			worker = list_entry(workers->worker_list.next,
+				  struct btrfs_worker_thread, worker_list);
+			spin_unlock_irqrestore(&workers->lock, flags);
+		} else {
+			spin_unlock_irqrestore(&workers->lock, flags);
+			/* below the limit: start another (errors unchecked) */
+			btrfs_start_workers(workers, 1);
+			goto again;
+		}
+	}
+	return worker;
+}
+
+/*
+ * btrfs_requeue_work puts a work item back on the tail of the pending list
+ * of the thread recorded in work->worker.  It is meant for long running
+ * work functions that made some progress and want to yield the cpu.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *owner = work->worker;
+	unsigned long irqflags;
+
+	/* bit 0 already set means the item sits on a list, leave it there */
+	if (!test_and_set_bit(0, &work->flags)) {
+		spin_lock_irqsave(&owner->lock, irqflags);
+		atomic_inc(&owner->num_pending);
+		list_add_tail(&work->list, &owner->pending);
+		spin_unlock_irqrestore(&owner->lock, irqflags);
+	}
+
+	return 0;
+}
+
+/*
+ * queue work on the pending list of one of the pool's kthreads and wake
+ * that thread if needed.  Always returns 0, even if work was already queued.
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *target;
+	unsigned long irqflags;
+	int was_working;
+
+	/* bit 0 of flags marks an item that is already on a list */
+	if (test_and_set_bit(0, &work->flags))
+		return 0;
+
+	target = find_worker(workers);
+
+	spin_lock_irqsave(&target->lock, irqflags);
+	atomic_inc(&target->num_pending);
+	list_add_tail(&work->list, &target->pending);
+
+	/*
+	 * remember whether the thread had already been kicked so that
+	 * wake_up_process is only called for a thread that may be asleep
+	 */
+	was_working = target->working;
+	target->working = 1;
+
+	spin_unlock_irqrestore(&target->lock, irqflags);
+
+	if (!was_working)
+		wake_up_process(target->task);
+
+	return 0;
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 00000000000..52fc9da0f9e
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+
+struct btrfs_worker_thread;
+
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work.  There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+	/*
+	 * set func to the function you want called;
+	 * your work struct is passed as the only arg
+	 */
+	void (*func)(struct btrfs_work *work);
+
+	/*
+	 * flags should be set to zero before queueing.  Bit 0 is set while
+	 * the struct is on a pending list so it is only inserted once.
+	 */
+	unsigned long flags;
+
+	/* don't touch these: thread that last ran the item, list linkage */
+	struct btrfs_worker_thread *worker;
+	struct list_head list;
+};
+
+struct btrfs_workers {
+	/* number of threads currently on worker_list */
+	int num_workers;
+
+	/* limit for on-demand thread creation.  set by btrfs_init_workers */
+	int max_workers;
+
+	/* list with all the worker threads */
+	struct list_head worker_list;
+
+	/* thread most recently started or picked for queueing; NULL if none */
+	struct btrfs_worker_thread *last;
+
+	/* lock for finding the next worker thread to queue on */
+	spinlock_t lock;
+};
+
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, int max);
+int btrfs_requeue_work(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49cbc62b42f..6c91a510c96 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include "bit-radix.h"
 #include "extent_io.h"
 #include "extent_map.h"
+#include "async-thread.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -518,13 +519,20 @@ struct btrfs_fs_info {
 	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head end_io_work_list;
-	struct list_head async_submit_work_list;
 	struct work_struct end_io_work;
-	struct work_struct async_submit_work;
 	spinlock_t end_io_work_lock;
-	spinlock_t async_submit_work_lock;
 	atomic_t nr_async_submits;
 
+	/*
+	 * there is a pool of worker threads for checksumming during writes
+	 * and a pool for checksumming after reads.  This is because readers
+	 * can run with FS locks held, and the writers may be waiting for
+	 * those locks.  We don't want ordering in the pending list to cause
+	 * deadlocks, and so the two are serviced separately.
+	 */
+	struct btrfs_workers workers;
+	struct btrfs_workers endio_workers;
+
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
 #else
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9a53646ceb..98ff4fbcb38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "print-tree.h"
+#include "async-thread.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
-static struct workqueue_struct *end_io_workqueue;
-static struct workqueue_struct *async_submit_workqueue;
+static void end_workqueue_fn(struct btrfs_work *work);
 
 struct end_io_wq {
 	struct bio *bio;
@@ -57,6 +57,7 @@ struct end_io_wq {
 	int error;
 	int metadata;
 	struct list_head list;
+	struct btrfs_work work;
 };
 
 struct async_submit_bio {
@@ -66,6 +67,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	struct btrfs_work work;
 };
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio,
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
-	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio,
 #endif
 
 	fs_info = end_io_wq->info;
-	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
 	end_io_wq->error = err;
-	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
-	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-	queue_work(end_io_workqueue, &fs_info->end_io_work);
+	end_io_wq->work.func = end_workqueue_fn;
+	end_io_wq->work.flags = 0;
+	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static void run_one_async_submit(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	atomic_dec(&fs_info->nr_async_submits);
+	async->submit_bio_hook(async->inode, async->rw, async->bio,
+			       async->mirror_num);
+	kfree(async);
+}
+
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook)
@@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_hook = submit_bio_hook;
-
-	spin_lock(&fs_info->async_submit_work_lock);
-	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	async->work.func = run_one_async_submit;
+	async->work.flags = 0;
 	atomic_inc(&fs_info->nr_async_submits);
-	spin_unlock(&fs_info->async_submit_work_lock);
-
-	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
+	btrfs_queue_worker(&fs_info->workers, &async->work);
 	return 0;
 }
 
@@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	offset = bio->bi_sector << 9;
 
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context.  Just jump into btrfs_map_bio
+	 */
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
+	/*
+	 * called for a read, do the setup so that checksum validation
+	 * can happen in the async kernel threads
+	 */
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
+	/*
+	 * kthread helpers are used to submit writes so that checksumming
+	 * can happen in parallel across all CPUs
+	 */
 	if (!(rw & (1 << BIO_RW))) {
 		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
 	}
@@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio)
 	return ret;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_end_io_csum(void *p)
-#else
-static void btrfs_end_io_csum(struct work_struct *work)
-#endif
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions.  This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
 {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     end_io_work);
-#endif
-	unsigned long flags;
-	struct end_io_wq *end_io_wq;
 	struct bio *bio;
-	struct list_head *next;
+	struct end_io_wq *end_io_wq;
+	struct btrfs_fs_info *fs_info;
 	int error;
-	int was_empty;
 
-	while(1) {
-		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-		if (list_empty(&fs_info->end_io_work_list)) {
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			return;
-		}
-		next = fs_info->end_io_work_list.next;
-		list_del(next);
-		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-
-		end_io_wq = list_entry(next, struct end_io_wq, list);
-
-		bio = end_io_wq->bio;
-		if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-			was_empty = list_empty(&fs_info->end_io_work_list);
-			list_add_tail(&end_io_wq->list,
-				      &fs_info->end_io_work_list);
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			if (was_empty)
-				return;
-			continue;
-		}
-		error = end_io_wq->error;
-		bio->bi_private = end_io_wq->private;
-		bio->bi_end_io = end_io_wq->end_io;
-		kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, error);
-#else
-		bio_endio(bio, error);
-#endif
-	}
-}
+	end_io_wq = container_of(work, struct end_io_wq, work);
+	bio = end_io_wq->bio;
+	fs_info = end_io_wq->info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_async_submit_work(void *p)
-#else
-static void btrfs_async_submit_work(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
+	/* metadata bios are special because the whole tree block must
+	 * be checksummed at once.  This makes sure the entire block is in
+	 * ram and up to date before trying to verify things.  For
+	 * blocksize <= pagesize, it is basically a noop
+	 */
+	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
+		btrfs_queue_worker(&fs_info->endio_workers,
+				   &end_io_wq->work);
+		return;
+	}
+	error = end_io_wq->error;
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	bio_endio(bio, bio->bi_size, error);
 #else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     async_submit_work);
+	bio_endio(bio, error);
 #endif
-	struct async_submit_bio *async;
-	struct list_head *next;
-
-	while(1) {
-		spin_lock(&fs_info->async_submit_work_lock);
-		if (list_empty(&fs_info->async_submit_work_list)) {
-			spin_unlock(&fs_info->async_submit_work_lock);
-			return;
-		}
-		next = fs_info->async_submit_work_list.next;
-		list_del(next);
-		atomic_dec(&fs_info->nr_async_submits);
-		spin_unlock(&fs_info->async_submit_work_lock);
-
-		async = list_entry(next, struct async_submit_bio, list);
-		async->submit_bio_hook(async->inode, async->rw, async->bio,
-				       async->mirror_num);
-		kfree(async);
-	}
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
@@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	end_io_workqueue = create_workqueue("btrfs-end-io");
-	BUG_ON(!end_io_workqueue);
-	async_submit_workqueue = create_workqueue("btrfs-async-submit");
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
-	INIT_LIST_HEAD(&fs_info->end_io_work_list);
-	INIT_LIST_HEAD(&fs_info->async_submit_work_list);
 	spin_lock_init(&fs_info->hash_lock);
-	spin_lock_init(&fs_info->end_io_work_lock);
-	spin_lock_init(&fs_info->async_submit_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->do_barriers = 1;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work,
-		  fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
+	/* we need to start all the end_io workers up front because the
+	 * queue work function gets called at interrupt time.  The endio
+	 * workers don't normally start IO, so some number of them <= the
+	 * number of cpus is fine.  They handle checksumming after a read.
+	 *
+	 * The other worker threads do start IO, so the max is larger than
+	 * the number of CPUs.  FIXME, tune this for huge machines
+	 */
+	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
+	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());
+	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
+
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
 	if (ret) {
@@ -1375,6 +1343,8 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
-	flush_workqueue(async_submit_workqueue);
-	flush_workqueue(end_io_workqueue);
-
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
-	flush_workqueue(async_submit_workqueue);
-	destroy_workqueue(async_submit_workqueue);
-
-	flush_workqueue(end_io_workqueue);
-	destroy_workqueue(end_io_workqueue);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0f14697bece..7daef8d3700 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -359,7 +359,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	kfree(sums);
 
-	return btrfs_map_bio(root, rw, bio, mirror_num);
+	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
@@ -383,7 +383,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				   inode, rw, bio, mirror_num,
 				   __btrfs_submit_bio_hook);
 mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num);
+	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb455015..c57458ce633 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while(pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_submits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4df6b1608f9..48a44f7a938 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,6 +20,7 @@
 #define __BTRFS_VOLUMES_
 
 #include <linux/bio.h>
+#include "async-thread.h"
 
 struct buffer_head;
 struct btrfs_device {
@@ -27,6 +28,9 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+	struct bio *pending_bios;
+	struct bio *pending_bio_tail;
+	int running_pending;
 	u64 generation;
 
 	int barriers;
@@ -36,8 +40,6 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
-	u64 total_ios;
-
 	char *name;
 
 	/* the internal btrfs device id */
@@ -63,6 +65,8 @@ struct btrfs_device {
 
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_UUID_SIZE];
+
+	struct btrfs_work work;
 };
 
 struct btrfs_fs_devices {
@@ -117,7 +121,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num);
+		  int mirror_num, int async_submit);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
-- 
cgit v1.2.3


From 15ada040d7cd68d7853938a92b116292cc16a2f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 16:51:38 -0400
Subject: Btrfs: Fix mount -o max_inline=0

max_inline=0 used to force the max_inline size to one sector instead.  Now
it properly disables inline data items, while still being able to read
any that happen to exist on disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 346932e546b..f3274befd46 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -184,8 +184,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				info->max_inline = btrfs_parse_size(num);
 				kfree(num);
 
-				info->max_inline = max_t(u64,
-					info->max_inline, root->sectorsize);
+				if (info->max_inline) {
+					info->max_inline = max_t(u64,
+						info->max_inline,
+						root->sectorsize);
+				}
 				printk(KERN_INFO "btrfs: max_inline at %llu\n",
 					info->max_inline);
 			}
-- 
cgit v1.2.3


From d05e5a4dad985acc938e62d9752c813647dedd9d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 17:09:48 -0400
Subject: Btrfs: Add backport for the kthread work on kernels older than 2.6.20

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2911b67bd6f..8b9e2cf5060 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,10 +16,17 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/version.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/freezer.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
+
 #include "async-thread.h"
 
 /*
-- 
cgit v1.2.3


From 35d8ba66294ff2a53c17337a1aa1ff6739492f41 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 20:21:24 -0400
Subject: Btrfs: Worker thread optimizations

This changes the worker thread pool to maintain a list of idle threads,
avoiding a complex search for a good thread to wake up.

Threads have two states:

idle - we try to reuse the last thread used in hopes of improving the batching
ratios

busy - each time a new work item is added to a busy task, the task is
rotated to the end of the line.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 100 +++++++++++++++++++++++++++++++++---------------
 fs/btrfs/async-thread.h |   7 ++--
 2 files changed, 73 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8b9e2cf5060..8d4cc4679d5 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -34,6 +34,9 @@
  * One of these is allocated per thread.
  */
 struct btrfs_worker_thread {
+	/* pool we belong to */
+	struct btrfs_workers *workers;
+
 	/* list of struct btrfs_work that are waiting for service */
 	struct list_head pending;
 
@@ -51,8 +54,44 @@ struct btrfs_worker_thread {
 
 	/* set to non-zero when this thread is already awake and kicking */
 	int working;
+
+	/* are we currently idle */
+	int idle;
 };
 
+/*
+ * helper function to move a thread onto the idle list after it
+ * has finished some requests.
+ */
+static void check_idle_worker(struct btrfs_worker_thread *worker)
+{
+	if (!worker->idle && atomic_read(&worker->num_pending) <
+	    worker->workers->idle_thresh / 2) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 1;
+		list_move(&worker->worker_list, &worker->workers->idle_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
+/*
+ * helper function to move a thread off the idle list after new
+ * pending work is added.
+ */
+static void check_busy_worker(struct btrfs_worker_thread *worker)
+{
+	if (worker->idle && atomic_read(&worker->num_pending) >=
+	    worker->workers->idle_thresh) {
+		unsigned long flags;
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 0;
+		list_move_tail(&worker->worker_list,
+			       &worker->workers->worker_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+}
+
 /*
  * main loop for servicing work items
  */
@@ -76,6 +115,7 @@ static int worker_loop(void *arg)
 
 			atomic_dec(&worker->num_pending);
 			spin_lock_irq(&worker->lock);
+			check_idle_worker(worker);
 		}
 		worker->working = 0;
 		if (freezing(current)) {
@@ -98,6 +138,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct list_head *cur;
 	struct btrfs_worker_thread *worker;
 
+	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while(!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
@@ -116,9 +157,10 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 {
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
-	workers->last = NULL;
+	INIT_LIST_HEAD(&workers->idle_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
+	workers->idle_thresh = 64;
 }
 
 /*
@@ -143,14 +185,14 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
 		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			ret = PTR_ERR(worker->task);
 			goto fail;
 		}
 
 		spin_lock_irq(&workers->lock);
-		list_add_tail(&worker->worker_list, &workers->worker_list);
-		workers->last = worker;
+		list_add_tail(&worker->worker_list, &workers->idle_list);
 		workers->num_workers++;
 		spin_unlock_irq(&workers->lock);
 	}
@@ -169,42 +211,30 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	struct list_head *next;
-	struct list_head *start;
 	int enforce_min = workers->num_workers < workers->max_workers;
 
-	/* start with the last thread if it isn't busy */
-	worker = workers->last;
-	if (atomic_read(&worker->num_pending) < 64)
-		goto done;
-
-	next = worker->worker_list.next;
-	start = &worker->worker_list;
-
 	/*
-	 * check all the workers for someone that is bored.  FIXME, do
-	 * something smart here
+	 * if we find an idle thread, don't move it to the end of the
+	 * idle list.  This improves the chance that the next submission
+	 * will reuse the same thread, and maybe catch it while it is still
+	 * working
 	 */
-	while(next != start) {
-		if (next == &workers->worker_list) {
-			next = workers->worker_list.next;
-			continue;
-		}
+	if (!list_empty(&workers->idle_list)) {
+		next = workers->idle_list.next;
 		worker = list_entry(next, struct btrfs_worker_thread,
 				    worker_list);
-		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
-			goto done;
-		next = next->next;
+		return worker;
 	}
+	if (enforce_min || list_empty(&workers->worker_list))
+		return NULL;
+
 	/*
-	 * nobody was bored, if we're already at the max thread count,
-	 * use the last thread
+	 * if we pick a busy task, move the task to the end of the list.
+	 * hopefully this will keep things somewhat evenly balanced
 	 */
-	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) {
-		return workers->last;
-	}
-	return NULL;
-done:
-	workers->last = worker;
+	next = workers->worker_list.next;
+	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+	list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
@@ -221,11 +251,17 @@ again:
 	if (!worker) {
 		spin_lock_irqsave(&workers->lock, flags);
 		if (workers->num_workers >= workers->max_workers) {
+			struct list_head *fallback = NULL;
 			/*
 			 * we have failed to find any workers, just
 			 * return the force one
 			 */
-			worker = list_entry(workers->worker_list.next,
+			if (!list_empty(&workers->worker_list))
+				fallback = workers->worker_list.next;
+			if (!list_empty(&workers->idle_list))
+				fallback = workers->idle_list.next;
+			BUG_ON(!fallback);
+			worker = list_entry(fallback,
 				  struct btrfs_worker_thread, worker_list);
 			spin_unlock_irqrestore(&workers->lock, flags);
 		} else {
@@ -254,6 +290,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
+	check_busy_worker(worker);
 	spin_unlock_irqrestore(&worker->lock, flags);
 out:
 	return 0;
@@ -276,6 +313,7 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
+	check_busy_worker(worker);
 	list_add_tail(&work->list, &worker->pending);
 
 	/*
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 52fc9da0f9e..3436ff89759 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -60,11 +60,12 @@ struct btrfs_workers {
 	/* max number of workers allowed.  changed by btrfs_start_workers */
 	int max_workers;
 
+	/* busy at this many pending requests; idle again below half of it */
+	int idle_thresh;
+
 	/* list with all the work threads */
 	struct list_head worker_list;
-
-	/* the last worker thread to have something queued */
-	struct btrfs_worker_thread *last;
+	struct list_head idle_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
-- 
cgit v1.2.3


From 4543df7ecc8ae4928c1e51d6e7dc188d650abee4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 21:47:56 -0400
Subject: Btrfs: Add a mount option to control worker thread pool size

mount -o thread_pool=<n> changes the default, which is
min(num_cpus + 2, 8).  Larger thread pools would make more sense on
very large disk arrays.

This mount option controls the max size of each thread pool.  There
are multiple thread pools, so the total worker count will be larger
than the mount option.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 30 +++++++++++++++---------------
 fs/btrfs/super.c   | 13 ++++++++++++-
 3 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c91a510c96..7ae4666103c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -532,6 +532,7 @@ struct btrfs_fs_info {
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	int thread_pool_size;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98ff4fbcb38..c6a710a668c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1117,6 +1117,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					      GFP_NOFS);
 	int ret;
 	int err = -EINVAL;
+
 	struct btrfs_super_block *disk_super;
 
 	if (!extent_root || !tree_root || !fs_info) {
@@ -1148,6 +1149,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
@@ -1195,19 +1197,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
-	/* we need to start all the end_io workers up front because the
-	 * queue work function gets called at interrupt time.  The endio
-	 * workers don't normally start IO, so some number of them <= the
-	 * number of cpus is fine.  They handle checksumming after a read.
-	 *
-	 * The other worker threads do start IO, so the max is larger than
-	 * the number of CPUs.  FIXME, tune this for huge machines
-	 */
-	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
-	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());
-	btrfs_start_workers(&fs_info->workers, 1);
-	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
-
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
 	if (ret) {
@@ -1238,6 +1227,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (err)
 		goto fail_sb_buffer;
 
+	/*
+	 * we need to start all the end_io workers up front because the
+	 * queue work function gets called at interrupt time, and so it
+	 * cannot dynamically grow.
+	 */
+	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+
+
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
@@ -1341,10 +1341,10 @@ fail_sys_array:
 	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
-fail_iput:
-	iput(fs_info->btree_inode);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+fail_iput:
+	iput(fs_info->btree_inode);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3274befd46..196d0e280b1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -67,7 +67,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -80,6 +80,7 @@ static match_table_t tokens = {
 	{Opt_max_extent, "max_extent=%s"},
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
+	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_ssd, "ssd"},
 	{Opt_err, NULL}
 };
@@ -118,6 +119,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	struct btrfs_fs_info *info = root->fs_info;
 	substring_t args[MAX_OPT_ARGS];
 	char *p, *num;
+	int intarg;
 
 	if (!options)
 		return 0;
@@ -166,6 +168,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: turning off barriers\n");
 			btrfs_set_opt(info->mount_opt, NOBARRIER);
 			break;
+		case Opt_thread_pool:
+			intarg = 0;
+			match_int(&args[0], &intarg);
+			if (intarg) {
+				info->thread_pool_size = intarg;
+				printk(KERN_INFO "btrfs: thread pool %d\n",
+				       info->thread_pool_size);
+			}
+			break;
 		case Opt_max_extent:
 			num = match_strdup(&args[0]);
 			if (num) {
-- 
cgit v1.2.3


From 9d2423c5c3fbb0f110ac0b6cdc5a8e4d64729483 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jun 2008 21:52:17 -0400
Subject: Btrfs: kerneldoc comments for extent_map.c

Add kerneldoc comments for all exported functions.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 61 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 49 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ba46f7911d9..f5a04eb9a2a 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -31,6 +31,14 @@ void extent_map_exit(void)
 		kmem_cache_destroy(extent_map_cache);
 }
 
+/**
+ * extent_map_tree_init - initialize extent map tree
+ * @tree:		tree to initialize
+ * @mask:		flags for memory allocations during tree operations
+ *
+ * Initialize the extent tree @tree.  Should be called for each new inode
+ * or other user of the extent_map interface.
+ */
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
@@ -39,6 +47,14 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 }
 EXPORT_SYMBOL(extent_map_tree_init);
 
+/**
+ * alloc_extent_map - allocate new extent map structure
+ * @mask:	memory allocation flags
+ *
+ * Allocate a new extent_map structure.  The new structure is
+ * returned with a reference count of one and needs to be
+ * freed using free_extent_map()
+ */
 struct extent_map *alloc_extent_map(gfp_t mask)
 {
 	struct extent_map *em;
@@ -52,6 +68,13 @@ struct extent_map *alloc_extent_map(gfp_t mask)
 }
 EXPORT_SYMBOL(alloc_extent_map);
 
+/**
+ * free_extent_map - drop reference count of an extent_map
+ * @em:		extent map being released
+ *
+ * Drops the reference count on @em by one and frees the structure
+ * if the reference count hits zero.
+ */
 void free_extent_map(struct extent_map *em)
 {
 	if (!em)
@@ -166,10 +189,15 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
-/*
- * add_extent_mapping tries a simple forward/backward merge with existing
- * mappings.  The extent_map struct passed in will be inserted into
- * the tree directly (no copies made, just a reference taken).
+/**
+ * add_extent_mapping - add new extent map to the extent tree
+ * @tree:	tree to insert new map in
+ * @em:		map to insert
+ *
+ * Insert @em into @tree or perform a simple forward/backward merge with
+ * existing mappings.  The extent_map struct passed in will be inserted
+ * into the tree directly, with an additional reference taken, or a
+ * reference dropped if the merge attempt was successful.
  */
 int add_extent_mapping(struct extent_map_tree *tree,
 		       struct extent_map *em)
@@ -220,11 +248,16 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-/*
- * lookup_extent_mapping returns the first extent_map struct in the
- * tree that intersects the [start, len] range.  There may
- * be additional objects in the tree that intersect, so check the object
- * returned carefully to make sure you don't need additional lookups.
+/**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
  */
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len)
@@ -273,9 +306,13 @@ out:
 }
 EXPORT_SYMBOL(lookup_extent_mapping);
 
-/*
- * removes an extent_map struct from the tree.  No reference counts are
- * dropped, and no checks are done to  see if the range is in use
+/**
+ * remove_extent_mapping - removes an extent_map from the extent tree
+ * @tree:	extent tree to remove from
+ * @em:		extent map being removed
+ *
+ * Removes @em from @tree.  No reference counts are dropped, and no checks
+ * are done to see if the range is in use
  */
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
-- 
cgit v1.2.3


From f46b5a66b3316ef2f8febfe4c56e2d555e2c3979 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jun 2008 21:53:53 -0400
Subject: Btrfs: split out ioctl.c

Split the ioctl handling out of inode.c into a file of its own.
Also fix up checkpatch.pl warnings for the moved code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/ctree.h  |   9 +-
 fs/btrfs/inode.c  | 733 +-------------------------------------------------
 fs/btrfs/ioctl.c  | 781 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 796 insertions(+), 729 deletions(-)
 create mode 100644 fs/btrfs/ioctl.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9dcfc2fe333..7ed6b39e42d 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o
+	   extent_io.o volumes.o async-thread.o ioctl.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7ae4666103c..181c81d2989 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1553,6 +1553,10 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_create_subvol_root(struct btrfs_root *new_root,
+		struct btrfs_trans_handle *trans, u64 new_dirid,
+		struct btrfs_block_group_cache *block_group);
+
 void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 				  int namelen);
 
@@ -1585,7 +1589,6 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
-long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
@@ -1598,6 +1601,10 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode);
+
+/* ioctl.c */
+long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
 /* file.c */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7daef8d3700..0c79346fd2c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2667,114 +2667,17 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 	}
 }
 
-static int noinline create_subvol(struct btrfs_root *root, char *name,
-				  int namelen)
+int btrfs_create_subvol_root(struct btrfs_root *new_root,
+		struct btrfs_trans_handle *trans, u64 new_dirid,
+		struct btrfs_block_group_cache *block_group)
 {
-	struct btrfs_trans_handle *trans;
-	struct btrfs_key key;
-	struct btrfs_root_item root_item;
-	struct btrfs_inode_item *inode_item;
-	struct extent_buffer *leaf;
-	struct btrfs_root *new_root = root;
 	struct inode *inode;
-	struct inode *dir;
 	int ret;
-	int err;
-	u64 objectid;
-	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
-	unsigned long nr = 1;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_check_free_space(root, 1, 0);
-	if (ret)
-		goto fail_commit;
-
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-
-	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-				       0, &objectid);
-	if (ret)
-		goto fail;
-
-	leaf = __btrfs_alloc_free_block(trans, root, root->leafsize,
-					objectid, trans->transid, 0, 0,
-					0, 0);
-	if (IS_ERR(leaf))
-		return PTR_ERR(leaf);
-
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_owner(leaf, objectid);
-
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
-			    BTRFS_FSID_SIZE);
-	btrfs_mark_buffer_dirty(leaf);
-
-	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nblocks = cpu_to_le64(1);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
-
-	btrfs_set_root_bytenr(&root_item, leaf->start);
-	btrfs_set_root_level(&root_item, 0);
-	btrfs_set_root_refs(&root_item, 1);
-	btrfs_set_root_used(&root_item, 0);
-
-	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
-	root_item.drop_level = 0;
-
-	free_extent_buffer(leaf);
-	leaf = NULL;
-
-	btrfs_set_root_dirid(&root_item, new_dirid);
-
-	key.objectid = objectid;
-	key.offset = 1;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
-				&root_item);
-	if (ret)
-		goto fail;
-
-	/*
-	 * insert the directory item
-	 */
-	key.offset = (u64)-1;
-	dir = root->fs_info->sb->s_root->d_inode;
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    name, namelen, dir->i_ino, &key,
-				    BTRFS_FT_DIR);
-	if (ret)
-		goto fail;
-
-	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
-			     name, namelen, objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
-	if (ret)
-		goto fail;
-
-	ret = btrfs_commit_transaction(trans, root);
-	if (ret)
-		goto fail_commit;
-
-	new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
-	BUG_ON(!new_root);
-
-	trans = btrfs_start_transaction(new_root, 1);
-	BUG_ON(!trans);
 
 	inode = btrfs_new_inode(trans, new_root, "..", 2, new_dirid,
-				new_dirid,
-				BTRFS_I(dir)->block_group, S_IFDIR | 0700);
+				new_dirid, block_group, S_IFDIR | 0700);
 	if (IS_ERR(inode))
-		goto fail;
+		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 	new_root->inode = inode;
@@ -2783,67 +2686,8 @@ static int noinline create_subvol(struct btrfs_root *root, char *name,
 				     new_dirid);
 	inode->i_nlink = 1;
 	inode->i_size = 0;
-	ret = btrfs_update_inode(trans, new_root, inode);
-	if (ret)
-		goto fail;
-
-	/* Invalidate existing dcache entry for new subvolume. */
-	btrfs_invalidate_dcache_root(root, name, namelen);
 
-fail:
-	nr = trans->blocks_used;
-	err = btrfs_commit_transaction(trans, new_root);
-	if (err && !ret)
-		ret = err;
-fail_commit:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
-	return ret;
-}
-
-static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
-{
-	struct btrfs_pending_snapshot *pending_snapshot;
-	struct btrfs_trans_handle *trans;
-	int ret;
-	int err;
-	unsigned long nr = 0;
-
-	if (!root->ref_cows)
-		return -EINVAL;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_check_free_space(root, 1, 0);
-	if (ret)
-		goto fail_unlock;
-
-	pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		goto fail_unlock;
-	}
-	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-	if (!pending_snapshot->name) {
-		ret = -ENOMEM;
-		kfree(pending_snapshot);
-		goto fail_unlock;
-	}
-	memcpy(pending_snapshot->name, name, namelen);
-	pending_snapshot->name[namelen] = '\0';
-	trans = btrfs_start_transaction(root, 1);
-	BUG_ON(!trans);
-	pending_snapshot->root = root;
-	list_add(&pending_snapshot->list,
-		 &trans->transaction->pending_snapshots);
-	ret = btrfs_update_inode(trans, root, root->inode);
-	err = btrfs_commit_transaction(trans, root);
-
-fail_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
-	return ret;
+	return btrfs_update_inode(trans, new_root, inode);
 }
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
@@ -2861,571 +2705,6 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 #endif
 }
 
-int btrfs_defrag_file(struct file *file) {
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct page *page;
-	unsigned long last_index;
-	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
-	unsigned long total_read = 0;
-	u64 page_start;
-	u64 page_end;
-	unsigned long i;
-	int ret;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	ret = btrfs_check_free_space(root, inode->i_size, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	if (ret)
-		return -ENOSPC;
-
-	mutex_lock(&inode->i_mutex);
-	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	for (i = 0; i <= last_index; i++) {
-		if (total_read % ra_pages == 0) {
-			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
-				       min(last_index, i + ra_pages - 1));
-		}
-		total_read++;
-		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
-			goto out_unlock;
-		if (!PageUptodate(page)) {
-			btrfs_readpage(NULL, page);
-			lock_page(page);
-			if (!PageUptodate(page)) {
-				unlock_page(page);
-				page_cache_release(page);
-				goto out_unlock;
-			}
-		}
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(page);
-#else
-		cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
-		wait_on_page_writeback(page);
-		set_page_extent_mapped(page);
-
-		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-		page_end = page_start + PAGE_CACHE_SIZE - 1;
-
-		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-		set_extent_delalloc(io_tree, page_start,
-				    page_end, GFP_NOFS);
-
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-		set_page_dirty(page);
-		unlock_page(page);
-		page_cache_release(page);
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
-	}
-
-out_unlock:
-	mutex_unlock(&inode->i_mutex);
-	return 0;
-}
-
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
-{
-	u64 new_size;
-	u64 old_size;
-	u64 devid = 1;
-	struct btrfs_ioctl_vol_args *vol_args;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_device *device = NULL;
-	char *sizestr;
-	char *devstr = NULL;
-	int ret = 0;
-	int namelen;
-	int mod = 0;
-
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
-	namelen = strlen(vol_args->name);
-	if (namelen > BTRFS_VOL_NAME_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	sizestr = vol_args->name;
-	devstr = strchr(sizestr, ':');
-	if (devstr) {
-		char *end;
-		sizestr = devstr + 1;
-		*devstr = '\0';
-		devstr = vol_args->name;
-		devid = simple_strtoull(devstr, &end, 10);
-printk("resizing devid %Lu\n", devid);
-	}
-	device = btrfs_find_device(root, devid, NULL);
-	if (!device) {
-		printk("resizer unable to find device %Lu\n", devid);
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-	if (!strcmp(sizestr, "max"))
-		new_size = device->bdev->bd_inode->i_size;
-	else {
-		if (sizestr[0] == '-') {
-			mod = -1;
-			sizestr++;
-		} else if (sizestr[0] == '+') {
-			mod = 1;
-			sizestr++;
-		}
-		new_size = btrfs_parse_size(sizestr);
-		if (new_size == 0) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
-	}
-
-	old_size = device->total_bytes;
-
-	if (mod < 0) {
-		if (new_size > old_size) {
-			ret = -EINVAL;
-			goto out_unlock;
-		}
-		new_size = old_size - new_size;
-	} else if (mod > 0) {
-		new_size = old_size + new_size;
-	}
-
-	if (new_size < 256 * 1024 * 1024) {
-		ret = -EINVAL;
-		goto out_unlock;
-	}
-	if (new_size > device->bdev->bd_inode->i_size) {
-		ret = -EFBIG;
-		goto out_unlock;
-	}
-
-	do_div(new_size, root->sectorsize);
-	new_size *= root->sectorsize;
-
-printk("new size for %s is %llu\n", device->name, (unsigned long long)new_size);
-
-	if (new_size > old_size) {
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_grow_device(trans, device, new_size);
-		btrfs_commit_transaction(trans, root);
-	} else {
-		ret = btrfs_shrink_device(device, new_size);
-	}
-
-out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
-out:
-	kfree(vol_args);
-	return ret;
-}
-
-static int noinline btrfs_ioctl_snap_create(struct btrfs_root *root,
-					    void __user *arg)
-{
-	struct btrfs_ioctl_vol_args *vol_args;
-	struct btrfs_dir_item *di;
-	struct btrfs_path *path;
-	u64 root_dirid;
-	int namelen;
-	int ret;
-
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	namelen = strlen(vol_args->name);
-	if (namelen > BTRFS_VOL_NAME_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
-	if (strchr(vol_args->name, '/')) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-	mutex_lock(&root->fs_info->fs_mutex);
-	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
-			    path, root_dirid,
-			    vol_args->name, namelen, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
-	btrfs_free_path(path);
-
-	if (di && !IS_ERR(di)) {
-		ret = -EEXIST;
-		goto out;
-	}
-
-	if (IS_ERR(di)) {
-		ret = PTR_ERR(di);
-		goto out;
-	}
-
-	if (root == root->fs_info->tree_root)
-		ret = create_subvol(root, vol_args->name, namelen);
-	else
-		ret = create_snapshot(root, vol_args->name, namelen);
-out:
-	kfree(vol_args);
-	return ret;
-}
-
-static int btrfs_ioctl_defrag(struct file *file)
-{
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFDIR:
-		mutex_lock(&root->fs_info->fs_mutex);
-		btrfs_defrag_root(root, 0);
-		btrfs_defrag_root(root->fs_info->extent_root, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
-		break;
-	case S_IFREG:
-		btrfs_defrag_file(file);
-		break;
-	}
-
-	return 0;
-}
-
-long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
-{
-	struct btrfs_ioctl_vol_args *vol_args;
-	int ret;
-
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
-	ret = btrfs_init_new_device(root, vol_args->name);
-
-out:
-	kfree(vol_args);
-	return ret;
-}
-
-long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
-{
-	struct btrfs_ioctl_vol_args *vol_args;
-	int ret;
-
-	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
-
-	if (!vol_args)
-		return -ENOMEM;
-
-	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
-		ret = -EFAULT;
-		goto out;
-	}
-	ret = btrfs_rm_device(root, vol_args->name);
-
-out:
-	kfree(vol_args);
-	return ret;
-}
-
-int dup_item_to_inode(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       struct btrfs_path *path,
-		       struct extent_buffer *leaf,
-		       int slot,
-		       struct btrfs_key *key,
-		       u64 destino)
-{
-	char *dup;
-	int len = btrfs_item_size_nr(leaf, slot);
-	struct btrfs_key ckey = *key;
-	int ret = 0;
-
-	dup = kmalloc(len, GFP_NOFS);
-	if (!dup)
-		return -ENOMEM;
-
-	read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len);
-	btrfs_release_path(root, path);
-
-	ckey.objectid = destino;
-	ret = btrfs_insert_item(trans, root, &ckey, dup, len);
-	kfree(dup);
-	return ret;
-}
-
-long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
-{
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct file *src_file;
-	struct inode *src;
-	struct btrfs_trans_handle *trans;
-	int ret;
-	u64 pos;
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	struct extent_buffer *leaf;
-	u32 nritems;
-	int slot;
-
-	src_file = fget(src_fd);
-	if (!src_file)
-		return -EBADF;
-	src = src_file->f_dentry->d_inode;
-
-	ret = -EXDEV;
-	if (src->i_sb != inode->i_sb)
-		goto out_fput;
-
-	if (inode < src) {
-		mutex_lock(&inode->i_mutex);
-		mutex_lock(&src->i_mutex);
-	} else {
-		mutex_lock(&src->i_mutex);
-		mutex_lock(&inode->i_mutex);
-	}
-
-	ret = -ENOTEMPTY;
-	if (inode->i_size)
-		goto out_unlock;
-
-	/* do any pending delalloc/csum calc on src, one way or
-	   another, and lock file content */
-	while (1) {
-		filemap_write_and_wait(src->i_mapping);
-		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-		if (BTRFS_I(src)->delalloc_bytes == 0)
-			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-	}
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = btrfs_start_transaction(root, 0);
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	key.offset = 0;
-	key.type = BTRFS_EXTENT_DATA_KEY;
-	key.objectid = src->i_ino;
-	pos = 0;
-	path->reada = 2;
-
-	while (1) {
-		/*
-		 * note the key will change type as we walk through the
-		 * tree.
-		 */
-		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out;
-
-		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
-				goto out;
-			if (ret > 0)
-				break;
-		}
-		leaf = path->nodes[0];
-		slot = path->slots[0];
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		nritems = btrfs_header_nritems(leaf);
-
-		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
-		    key.objectid != src->i_ino)
-			break;
-
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
-			struct btrfs_file_extent_item *extent;
-			int found_type;
-			pos = key.offset;
-			extent = btrfs_item_ptr(leaf, slot,
-						struct btrfs_file_extent_item);
-			found_type = btrfs_file_extent_type(leaf, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
-				u64 len = btrfs_file_extent_num_bytes(leaf,
-								      extent);
-				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
-								       extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
-								 extent);
-				u64 off = btrfs_file_extent_offset(leaf,
-								   extent);
-				btrfs_insert_file_extent(trans, root,
-							 inode->i_ino, pos,
-							 ds, dl, len, off);
-				/* ds == 0 means there's a hole */
-				if (ds != 0) {
-					btrfs_inc_extent_ref(trans, root,
-						     ds, dl,
-						     root->root_key.objectid,
-						     trans->transid,
-						     inode->i_ino, pos);
-				}
-				pos = key.offset + len;
-			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				ret = dup_item_to_inode(trans, root, path,
-							leaf, slot, &key,
-							inode->i_ino);
-				if (ret)
-					goto out;
-				pos = key.offset + btrfs_item_size_nr(leaf,
-								      slot);
-			}
-		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			ret = dup_item_to_inode(trans, root, path, leaf,
-						slot, &key, inode->i_ino);
-
-			if (ret)
-				goto out;
-		}
-		key.offset++;
-		btrfs_release_path(root, path);
-	}
-
-	ret = 0;
-out:
-	btrfs_free_path(path);
-
-	inode->i_blocks = src->i_blocks;
-	i_size_write(inode, src->i_size);
-	btrfs_update_inode(trans, root, inode);
-
-	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
-
-out_unlock:
-	mutex_unlock(&src->i_mutex);
-	mutex_unlock(&inode->i_mutex);
-out_fput:
-	fput(src_file);
-	return ret;
-}
-
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks.  They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-long btrfs_ioctl_trans_start(struct file *file)
-{
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-	int ret = 0;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	if (file->private_data) {
-		ret = -EINPROGRESS;
-		goto out;
-	}
-	trans = btrfs_start_transaction(root, 0);
-	if (trans)
-		file->private_data = trans;
-	else
-		ret = -ENOMEM;
-	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
-out:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
-/*
- * there are many ways the trans_start and trans_end ioctls can lead
- * to deadlocks.  They should only be used by applications that
- * basically own the machine, and have a very in depth understanding
- * of all the possible deadlocks and enospc problems.
- */
-long btrfs_ioctl_trans_end(struct file *file)
-{
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-	int ret = 0;
-
-	mutex_lock(&root->fs_info->fs_mutex);
-	trans = file->private_data;
-	if (!trans) {
-		ret = -EINVAL;
-		goto out;
-	}
-	btrfs_end_transaction(trans, root);
-	file->private_data = 0;
-out:
-	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
-}
-
-long btrfs_ioctl(struct file *file, unsigned int
-		cmd, unsigned long arg)
-{
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
-
-	switch (cmd) {
-	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(root, (void __user *)arg);
-	case BTRFS_IOC_DEFRAG:
-		return btrfs_ioctl_defrag(file);
-	case BTRFS_IOC_RESIZE:
-		return btrfs_ioctl_resize(root, (void __user *)arg);
-	case BTRFS_IOC_ADD_DEV:
-		return btrfs_ioctl_add_dev(root, (void __user *)arg);
-	case BTRFS_IOC_RM_DEV:
-		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
-	case BTRFS_IOC_BALANCE:
-		return btrfs_balance(root->fs_info->dev_root);
-	case BTRFS_IOC_CLONE:
-		return btrfs_ioctl_clone(file, arg);
-	case BTRFS_IOC_TRANS_START:
-		return btrfs_ioctl_trans_start(file);
-	case BTRFS_IOC_TRANS_END:
-		return btrfs_ioctl_trans_end(file);
-	case BTRFS_IOC_SYNC:
-		btrfs_sync_fs(file->f_dentry->d_sb, 1);
-		return 0;
-	}
-
-	return -ENOTTY;
-}
-
-/*
- * Called inside transaction, so use GFP_NOFS
- */
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
 	struct btrfs_inode *ei;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
new file mode 100644
index 00000000000..da8de6cfdb5
--- /dev/null
+++ b/fs/btrfs/ioctl.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/statfs.h>
+#include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include <linux/xattr.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "ioctl.h"
+#include "print-tree.h"
+#include "volumes.h"
+
+
+
+static noinline int create_subvol(struct btrfs_root *root, char *name,
+				  int namelen)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	struct btrfs_root *new_root = root;
+	struct inode *dir;
+	int ret;
+	int err;
+	u64 objectid;
+	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+	unsigned long nr = 1;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 0);
+	if (ret)
+		goto fail_commit;
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
+				       0, &objectid);
+	if (ret)
+		goto fail;
+
+	leaf = __btrfs_alloc_free_block(trans, root, root->leafsize,
+					objectid, trans->transid, 0, 0,
+					0, 0);
+	if (IS_ERR(leaf))
+		return PTR_ERR(leaf);
+
+	btrfs_set_header_nritems(leaf, 0);
+	btrfs_set_header_level(leaf, 0);
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_owner(leaf, objectid);
+
+	write_extent_buffer(leaf, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_level(&root_item, 0);
+	btrfs_set_root_refs(&root_item, 1);
+	btrfs_set_root_used(&root_item, 0);
+
+	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+	root_item.drop_level = 0;
+
+	free_extent_buffer(leaf);
+	leaf = NULL;
+
+	btrfs_set_root_dirid(&root_item, new_dirid);
+
+	key.objectid = objectid;
+	key.offset = 1;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+				&root_item);
+	if (ret)
+		goto fail;
+
+	/*
+	 * insert the directory item
+	 */
+	key.offset = (u64)-1;
+	dir = root->fs_info->sb->s_root->d_inode;
+	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+				    name, namelen, dir->i_ino, &key,
+				    BTRFS_FT_DIR);
+	if (ret)
+		goto fail;
+
+	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
+			     name, namelen, objectid,
+			     root->fs_info->sb->s_root->d_inode->i_ino);
+	if (ret)
+		goto fail;
+
+	ret = btrfs_commit_transaction(trans, root);
+	if (ret)
+		goto fail_commit;
+
+	new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
+	BUG_ON(!new_root);
+
+	trans = btrfs_start_transaction(new_root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_create_subvol_root(new_root, trans, new_dirid,
+				       BTRFS_I(dir)->block_group);
+	if (ret)
+		goto fail;
+
+	/* Invalidate existing dcache entry for new subvolume. */
+	btrfs_invalidate_dcache_root(root, name, namelen);
+
+fail:
+	nr = trans->blocks_used;
+	err = btrfs_commit_transaction(trans, new_root);
+	if (err && !ret)
+		ret = err;
+fail_commit:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
+	return ret;
+}
+
+static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+{
+	struct btrfs_pending_snapshot *pending_snapshot;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+	unsigned long nr = 0;
+
+	if (!root->ref_cows)
+		return -EINVAL;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, 1, 0);
+	if (ret)
+		goto fail_unlock;
+
+	pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
+	if (!pending_snapshot) {
+		ret = -ENOMEM;
+		goto fail_unlock;
+	}
+	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+	if (!pending_snapshot->name) {
+		ret = -ENOMEM;
+		kfree(pending_snapshot);
+		goto fail_unlock;
+	}
+	memcpy(pending_snapshot->name, name, namelen);
+	pending_snapshot->name[namelen] = '\0';
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+	pending_snapshot->root = root;
+	list_add(&pending_snapshot->list,
+		 &trans->transaction->pending_snapshots);
+	ret = btrfs_update_inode(trans, root, root->inode);
+	err = btrfs_commit_transaction(trans, root);
+
+fail_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_btree_balance_dirty(root, nr);
+	btrfs_throttle(root);
+	return ret;
+}
+
+int btrfs_defrag_file(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct page *page;
+	unsigned long last_index;
+	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
+	unsigned long total_read = 0;
+	u64 page_start;
+	u64 page_end;
+	unsigned long i;
+	int ret;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	if (ret)
+		return -ENOSPC;
+
+	mutex_lock(&inode->i_mutex);
+	last_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	for (i = 0; i <= last_index; i++) {
+		if (total_read % ra_pages == 0) {
+			btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
+				       min(last_index, i + ra_pages - 1));
+		}
+		total_read++;
+		page = grab_cache_page(inode->i_mapping, i);
+		if (!page)
+			goto out_unlock;
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto out_unlock;
+			}
+		}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(page);
+#else
+		cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+		wait_on_page_writeback(page);
+		set_page_extent_mapped(page);
+
+		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		set_extent_delalloc(io_tree, page_start,
+				    page_end, GFP_NOFS);
+
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		set_page_dirty(page);
+		unlock_page(page);
+		page_cache_release(page);
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
+	}
+
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+/*
+ * Called inside transaction, so use GFP_NOFS
+ */
+
+static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+{
+	u64 new_size;
+	u64 old_size;
+	u64 devid = 1;
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device = NULL;
+	char *sizestr;
+	char *devstr = NULL;
+	int ret = 0;
+	int namelen;
+	int mod = 0;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	namelen = strlen(vol_args->name);
+	if (namelen > BTRFS_VOL_NAME_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	sizestr = vol_args->name;
+	devstr = strchr(sizestr, ':');
+	if (devstr) {
+		char *end;
+		sizestr = devstr + 1;
+		*devstr = '\0';
+		devstr = vol_args->name;
+		devid = simple_strtoull(devstr, &end, 10);
+		printk(KERN_INFO "resizing devid %llu\n", devid);
+	}
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		printk(KERN_INFO "resizer unable to find device %llu\n", devid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (!strcmp(sizestr, "max"))
+		new_size = device->bdev->bd_inode->i_size;
+	else {
+		if (sizestr[0] == '-') {
+			mod = -1;
+			sizestr++;
+		} else if (sizestr[0] == '+') {
+			mod = 1;
+			sizestr++;
+		}
+		new_size = btrfs_parse_size(sizestr);
+		if (new_size == 0) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+	}
+
+	old_size = device->total_bytes;
+
+	if (mod < 0) {
+		if (new_size > old_size) {
+			ret = -EINVAL;
+			goto out_unlock;
+		}
+		new_size = old_size - new_size;
+	} else if (mod > 0) {
+		new_size = old_size + new_size;
+	}
+
+	if (new_size < 256 * 1024 * 1024) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	if (new_size > device->bdev->bd_inode->i_size) {
+		ret = -EFBIG;
+		goto out_unlock;
+	}
+
+	do_div(new_size, root->sectorsize);
+	new_size *= root->sectorsize;
+
+	printk(KERN_INFO "new size for %s is %llu\n",
+		device->name, (unsigned long long)new_size);
+
+	if (new_size > old_size) {
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_grow_device(trans, device, new_size);
+		btrfs_commit_transaction(trans, root);
+	} else {
+		ret = btrfs_shrink_device(device, new_size);
+	}
+
+out_unlock:
+	mutex_unlock(&root->fs_info->fs_mutex);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
+static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
+					    void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	u64 root_dirid;
+	int namelen;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	namelen = strlen(vol_args->name);
+	if (namelen > BTRFS_VOL_NAME_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (strchr(vol_args->name, '/')) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
+	mutex_lock(&root->fs_info->fs_mutex);
+	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
+			    path, root_dirid,
+			    vol_args->name, namelen, 0);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	btrfs_free_path(path);
+
+	if (di && !IS_ERR(di)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
+	}
+
+	if (root == root->fs_info->tree_root)
+		ret = create_subvol(root, vol_args->name, namelen);
+	else
+		ret = create_snapshot(root, vol_args->name, namelen);
+out:
+	kfree(vol_args);
+	return ret;
+}
+
+static int btrfs_ioctl_defrag(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		mutex_lock(&root->fs_info->fs_mutex);
+		btrfs_defrag_root(root, 0);
+		btrfs_defrag_root(root->fs_info->extent_root, 0);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		break;
+	case S_IFREG:
+		btrfs_defrag_file(file);
+		break;
+	}
+
+	return 0;
+}
+
+long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = btrfs_init_new_device(root, vol_args->name);
+
+out:
+	kfree(vol_args);
+	return ret;
+}
+
+long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = btrfs_rm_device(root, vol_args->name);
+
+out:
+	kfree(vol_args);
+	return ret;
+}
+
+int dup_item_to_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       struct extent_buffer *leaf,
+		       int slot,
+		       struct btrfs_key *key,
+		       u64 destino)
+{
+	char *dup;
+	int len = btrfs_item_size_nr(leaf, slot);
+	struct btrfs_key ckey = *key;
+	int ret = 0;
+
+	dup = kmalloc(len, GFP_NOFS);
+	if (!dup)
+		return -ENOMEM;
+
+	read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len);
+	btrfs_release_path(root, path);
+
+	ckey.objectid = destino;
+	ret = btrfs_insert_item(trans, root, &ckey, dup, len);
+	kfree(dup);
+	return ret;
+}
+
+long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	u64 pos;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int slot;
+
+	src_file = fget(src_fd);
+	if (!src_file)
+		return -EBADF;
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (inode < src) {
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&src->i_mutex);
+	} else {
+		mutex_lock(&src->i_mutex);
+		mutex_lock(&inode->i_mutex);
+	}
+
+	ret = -ENOTEMPTY;
+	if (inode->i_size)
+		goto out_unlock;
+
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		filemap_write_and_wait(src->i_mapping);
+		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		if (BTRFS_I(src)->delalloc_bytes == 0)
+			break;
+		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	}
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 0);
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	key.offset = 0;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.objectid = src->i_ino;
+	pos = 0;
+	path->reada = 2;
+
+	while (1) {
+		/*
+		 * note the key will change type as we walk through the
+		 * tree.
+		 */
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		nritems = btrfs_header_nritems(leaf);
+
+		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		    key.objectid != src->i_ino)
+			break;
+
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int found_type;
+			pos = key.offset;
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			found_type = btrfs_file_extent_type(leaf, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 len = btrfs_file_extent_num_bytes(leaf,
+								      extent);
+				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
+								       extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
+				u64 off = btrfs_file_extent_offset(leaf,
+								   extent);
+				btrfs_insert_file_extent(trans, root,
+							 inode->i_ino, pos,
+							 ds, dl, len, off);
+				/* ds == 0 means there's a hole */
+				if (ds != 0) {
+					btrfs_inc_extent_ref(trans, root,
+						     ds, dl,
+						     root->root_key.objectid,
+						     trans->transid,
+						     inode->i_ino, pos);
+				}
+				pos = key.offset + len;
+			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+				ret = dup_item_to_inode(trans, root, path,
+							leaf, slot, &key,
+							inode->i_ino);
+				if (ret)
+					goto out;
+				pos = key.offset + btrfs_item_size_nr(leaf,
+								      slot);
+			}
+		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			ret = dup_item_to_inode(trans, root, path, leaf,
+						slot, &key, inode->i_ino);
+
+			if (ret)
+				goto out;
+		}
+		key.offset++;
+		btrfs_release_path(root, path);
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	inode->i_blocks = src->i_blocks;
+	i_size_write(inode, src->i_size);
+	btrfs_update_inode(trans, root, inode);
+
+	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+
+out_unlock:
+	mutex_unlock(&src->i_mutex);
+	mutex_unlock(&inode->i_mutex);
+out_fput:
+	fput(src_file);
+	return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_start(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (file->private_data) {
+		ret = -EINPROGRESS;
+		goto out;
+	}
+	trans = btrfs_start_transaction(root, 0);
+	if (trans)
+		file->private_data = trans;
+	else
+		ret = -ENOMEM;
+	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+/*
+ * there are many ways the trans_start and trans_end ioctls can lead
+ * to deadlocks.  They should only be used by applications that
+ * basically own the machine, and have a very in depth understanding
+ * of all the possible deadlocks and enospc problems.
+ */
+long btrfs_ioctl_trans_end(struct file *file)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = file->private_data;
+	if (!trans) {
+		ret = -EINVAL;
+		goto out;
+	}
+	btrfs_end_transaction(trans, root);
+	file->private_data = 0;
+out:
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
+long btrfs_ioctl(struct file *file, unsigned int
+		cmd, unsigned long arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+	switch (cmd) {
+	case BTRFS_IOC_SNAP_CREATE:
+		return btrfs_ioctl_snap_create(root, (void __user *)arg);
+	case BTRFS_IOC_DEFRAG:
+		return btrfs_ioctl_defrag(file);
+	case BTRFS_IOC_RESIZE:
+		return btrfs_ioctl_resize(root, (void __user *)arg);
+	case BTRFS_IOC_ADD_DEV:
+		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_RM_DEV:
+		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
+	case BTRFS_IOC_BALANCE:
+		return btrfs_balance(root->fs_info->dev_root);
+	case BTRFS_IOC_CLONE:
+		return btrfs_ioctl_clone(file, arg);
+	case BTRFS_IOC_TRANS_START:
+		return btrfs_ioctl_trans_start(file);
+	case BTRFS_IOC_TRANS_END:
+		return btrfs_ioctl_trans_end(file);
+	case BTRFS_IOC_SYNC:
+		btrfs_sync_fs(file->f_dentry->d_sb, 1);
+		return 0;
+	}
+
+	return -ENOTTY;
+}
-- 
cgit v1.2.3


From df5b5520b21d8a2554ede65c09b288833c77144d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jun 2008 21:53:58 -0400
Subject: BTRFS_IOC_TRANS_START should be privileged

As mentioned in the comment next to it, btrfs_ioctl_trans_start can
seriously damage filesystems and thus should be limited to privileged
users.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index da8de6cfdb5..6fb45580275 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -706,6 +706,9 @@ long btrfs_ioctl_trans_start(struct file *file)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	mutex_lock(&root->fs_info->fs_mutex);
 	if (file->private_data) {
 		ret = -EINPROGRESS;
-- 
cgit v1.2.3


From 1cc127b5d1b71453091859301de4a7dd6ee96fa8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Jun 2008 14:46:17 -0400
Subject: Btrfs: Add a thread pool just for submit_bio

If, on the work queue, a bio submission is queued behind a lock holder
that is waiting for that bio, it is possible to deadlock.  Move the
bios into their own pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 4 ++++
 fs/btrfs/disk-io.c | 4 ++++
 fs/btrfs/volumes.c | 3 ++-
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 181c81d2989..dcea9d706d9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -529,9 +529,13 @@ struct btrfs_fs_info {
 	 * can run with FS locks held, and the writers may be waiting for
 	 * those locks.  We don't want ordering in the pending list to cause
 	 * deadlocks, and so the two are serviced separately.
+	 *
+	 * A third pool does submit_bio to avoid deadlocking with the other
+	 * two
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers submit_workers;
 	int thread_pool_size;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6a710a668c..e5c758e306d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1233,8 +1233,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * cannot dynamically grow.
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 
 
@@ -1343,6 +1345,7 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
@@ -1597,6 +1600,7 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->submit_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c57458ce633..ba396857102 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2112,7 +2112,8 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	spin_unlock(&device->io_lock);
 
 	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+		btrfs_queue_worker(&root->fs_info->submit_workers,
+				   &device->work);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 925baeddc5b0764a53f2214a1253251bab0e0324 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Start btree concurrency work.

The allocation trees and the chunk trees are serialized via their own
dedicated mutexes.  This means allocation location is still not very
fine grained.

The main FS btree is protected by locks on each block in the btree.  Locks
are taken top / down, and as processing finishes on a given level of the
tree, the lock is released after locking the lower level.

The end result of a search is now a path where only the lowest level
is locked.  Releasing or freeing the path drops any locks held.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/ctree.c       | 386 ++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/ctree.h       |  23 ++-
 fs/btrfs/disk-io.c     |  13 +-
 fs/btrfs/extent-tree.c | 252 +++++++++++++++++++++-----------
 fs/btrfs/extent_io.c   |   8 -
 fs/btrfs/inode.c       |   8 +-
 fs/btrfs/ioctl.c       |   8 +-
 fs/btrfs/locking.c     |  50 +++++++
 fs/btrfs/locking.h     |  26 ++++
 fs/btrfs/transaction.c |  15 +-
 fs/btrfs/tree-defrag.c |   2 +
 12 files changed, 579 insertions(+), 214 deletions(-)
 create mode 100644 fs/btrfs/locking.c
 create mode 100644 fs/btrfs/locking.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7ed6b39e42d..0e2dcc718de 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1d404bde3cf..75625c68fdd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -21,6 +21,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "print-tree.h"
+#include "locking.h"
 
 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, int level);
@@ -64,12 +65,47 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
-			break;
+			continue;
+		if (p->locks[i]) {
+			btrfs_tree_unlock(p->nodes[i]);
+			p->locks[i] = 0;
+		}
 		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
 }
 
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+	spin_lock(&root->node_lock);
+	eb = root->node;
+	extent_buffer_get(eb);
+	spin_unlock(&root->node_lock);
+	return eb;
+}
+
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while(1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_lock(eb);
+
+		spin_lock(&root->node_lock);
+		if (eb == root->node) {
+			spin_unlock(&root->node_lock);
+			break;
+		}
+		spin_unlock(&root->node_lock);
+
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	if (root->track_dirty && list_empty(&root->dirty_list)) {
@@ -111,7 +147,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = __btrfs_alloc_free_block(trans, new_root, buf->len,
+	cow = btrfs_alloc_free_block(trans, new_root, buf->len,
 				       new_root_objectid,
 				       trans->transid, first_key.objectid,
 				       level, buf->start, 0);
@@ -151,8 +187,14 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	int different_trans = 0;
 	int level;
+	int unlock_orig = 0;
 	struct btrfs_key first_key;
 
+	if (*cow_ret == buf)
+		unlock_orig = 1;
+
+	WARN_ON(!btrfs_tree_locked(buf));
+
 	if (root->ref_cows) {
 		root_gen = trans->transid;
 	} else {
@@ -172,7 +214,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = __btrfs_alloc_free_block(trans, root, buf->len,
+	cow = btrfs_alloc_free_block(trans, root, buf->len,
 				     root->root_key.objectid,
 				     root_gen, first_key.objectid, level,
 				     search_start, empty_size);
@@ -196,9 +238,14 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	if (buf == root->node) {
+		WARN_ON(parent && parent != buf);
 		root_gen = btrfs_header_generation(buf);
+
+		spin_lock(&root->node_lock);
 		root->node = cow;
 		extent_buffer_get(cow);
+		spin_unlock(&root->node_lock);
+
 		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->start,
 					  buf->len, root->root_key.objectid,
@@ -219,6 +266,8 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 				  btrfs_header_owner(parent), root_gen,
 				  0, 0, 1);
 	}
+	if (unlock_orig)
+		btrfs_tree_unlock(buf);
 	free_extent_buffer(buf);
 	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
@@ -316,6 +365,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int progress_passed = 0;
 	struct btrfs_disk_key disk_key;
 
+	/* FIXME this code needs locking */
+	return 0;
+
 	parent_level = btrfs_header_level(parent);
 	if (cache_only && parent_level != 1)
 		return 0;
@@ -729,6 +781,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mid = path->nodes[level];
+	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -749,14 +802,21 @@ static int balance_level(struct btrfs_trans_handle *trans,
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
+		btrfs_tree_lock(child);
 		BUG_ON(!child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
 		BUG_ON(ret);
 
+		spin_lock(&root->node_lock);
 		root->node = child;
+		spin_unlock(&root->node_lock);
+
 		add_root_to_dirty_list(root);
+		btrfs_tree_unlock(child);
+		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
+		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
@@ -775,6 +835,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
+		btrfs_tree_lock(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left);
 		if (wret) {
@@ -784,6 +845,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	}
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
+		btrfs_tree_lock(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right);
 		if (wret) {
@@ -815,6 +877,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
+			btrfs_tree_unlock(right);
 			free_extent_buffer(right);
 			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
@@ -862,7 +925,9 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		u64 root_gen = btrfs_header_generation(parent);
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
+
 		clean_tree_block(trans, root, mid);
+		btrfs_tree_unlock(mid);
 		free_extent_buffer(mid);
 		mid = NULL;
 		wret = del_ptr(trans, root, path, level + 1, pslot);
@@ -885,11 +950,14 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	if (left) {
 		if (btrfs_header_nritems(left) > orig_slot) {
 			extent_buffer_get(left);
+			/* left was locked after cow */
 			path->nodes[level] = left;
 			path->slots[level + 1] -= 1;
 			path->slots[level] = orig_slot;
-			if (mid)
+			if (mid) {
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
+			}
 		} else {
 			orig_slot -= btrfs_header_nritems(left);
 			path->slots[level] = orig_slot;
@@ -901,10 +969,15 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
 		BUG();
 enospc:
-	if (right)
+	if (right) {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
-	if (left)
+	}
+	if (left) {
+		if (path->nodes[level] != left)
+			btrfs_tree_unlock(left);
 		free_extent_buffer(left);
+	}
 	return ret;
 }
 
@@ -942,6 +1015,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	/* first, try to make some room in the middle buffer */
 	if (left) {
 		u32 left_nr;
+
+		btrfs_tree_lock(left);
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -967,24 +1042,28 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				path->nodes[level] = left;
 				path->slots[level + 1] -= 1;
 				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
 			} else {
 				orig_slot -=
 					btrfs_header_nritems(left);
 				path->slots[level] = orig_slot;
+				btrfs_tree_unlock(left);
 				free_extent_buffer(left);
 			}
 			return 0;
 		}
+		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 	}
-	right= read_node_slot(root, parent, pslot + 1);
+	right = read_node_slot(root, parent, pslot + 1);
 
 	/*
 	 * then try to empty the right most buffer into the middle
 	 */
 	if (right) {
 		u32 right_nr;
+		btrfs_tree_lock(right);
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1013,12 +1092,15 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 				path->slots[level + 1] += 1;
 				path->slots[level] = orig_slot -
 					btrfs_header_nritems(mid);
+				btrfs_tree_unlock(mid);
 				free_extent_buffer(mid);
 			} else {
+				btrfs_tree_unlock(right);
 				free_extent_buffer(right);
 			}
 			return 0;
 		}
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
 	}
 	return 1;
@@ -1050,6 +1132,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 		return;
 
 	node = path->nodes[level];
+	WARN_ON(!path->skip_locking && !btrfs_tree_locked(node));
+
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = btrfs_level_size(root, level - 1);
 	eb = btrfs_find_tree_block(root, search, blocksize);
@@ -1098,6 +1182,39 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 			highest_read = search;
 	}
 }
+
+/* unlock path levels above 'level' whose locks are no longer needed */
+static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
+{
+	int i;
+	int skip_level = level;
+	struct extent_buffer *t;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		t = path->nodes[i];
+		/* stop at the first level with no node or no lock recorded */
+		if (!t || !path->locks[i])
+			break;
+		/* slot 0: keep this level and the one above it locked */
+		if (path->slots[i] == 0) {
+			skip_level = i + 1;
+			continue;
+		}
+		if (path->keep_locks) {
+			u32 nritems = btrfs_header_nritems(t);
+			/* likewise for the last slot; guard the 0 - 1 wrap */
+			if (nritems < 1 || path->slots[i] >= nritems - 1) {
+				skip_level = i + 1;
+				continue;
+			}
+		}
+		if (i >= lowest_unlock && i > skip_level) {
+			btrfs_tree_unlock(t);
+			path->locks[i] = 0;
+		}
+	}
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -1120,15 +1237,27 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 	int level;
 	int should_reada = p->reada;
+	int lowest_unlock = 1;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
+	// WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
+	WARN_ON(root == root->fs_info->extent_root &&
+		!mutex_is_locked(&root->fs_info->alloc_mutex));
+	WARN_ON(root == root->fs_info->chunk_root &&
+		!mutex_is_locked(&root->fs_info->chunk_mutex));
+	WARN_ON(root == root->fs_info->dev_root &&
+		!mutex_is_locked(&root->fs_info->chunk_mutex));
+	if (ins_len < 0)
+		lowest_unlock = 2;
 again:
-	b = root->node;
-	extent_buffer_get(b);
+	if (!p->skip_locking)
+		b = btrfs_lock_root_node(root);
+	else
+		b = btrfs_root_node(root);
+
 	while (b) {
 		level = btrfs_header_level(b);
 		if (cow) {
@@ -1147,9 +1276,12 @@ again:
 			WARN_ON(1);
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
+		if (!p->skip_locking)
+			p->locks[level] = 1;
 		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
+
 		ret = bin_search(b, key, level, &slot);
 		if (level != 0) {
 			if (ret && slot > 0)
@@ -1177,14 +1309,19 @@ again:
 				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
 			/* this is only true while dropping a snapshot */
-			if (level == lowest_level)
+			if (level == lowest_level) {
+				unlock_up(p, level, lowest_unlock);
 				break;
+			}
 
 			if (should_reada)
 				reada_for_search(root, p, level, slot,
 						 key->objectid);
 
 			b = read_node_slot(root, b, slot);
+			if (!p->skip_locking)
+				btrfs_tree_lock(b);
+			unlock_up(p, level, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1195,6 +1332,7 @@ again:
 				if (sret)
 					return sret;
 			}
+			unlock_up(p, level, lowest_unlock);
 			return ret;
 		}
 	}
@@ -1225,6 +1363,13 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
 			break;
 		t = path->nodes[i];
 		btrfs_set_node_key(t, key, tslot);
+		if (!btrfs_tree_locked(path->nodes[i])) {
+			int ii;
+printk("fixup without lock on level %d\n", btrfs_header_level(path->nodes[i]));
+			for (ii = 0; ii < BTRFS_MAX_LEVEL; ii++) {
+printk("level %d slot %d\n", ii, path->slots[ii]);
+			}
+		}
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
@@ -1370,6 +1515,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
+	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
 
 	BUG_ON(path->nodes[level]);
@@ -1386,12 +1532,13 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = __btrfs_alloc_free_block(trans, root, root->nodesize,
+	c = btrfs_alloc_free_block(trans, root, root->nodesize,
 				   root->root_key.objectid,
 				   root_gen, lower_key.objectid, level,
 				   root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
+
 	memset_extent_buffer(c, 0, 0, root->nodesize);
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -1416,23 +1563,31 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 
 	btrfs_mark_buffer_dirty(c);
 
-	/* the super has an extra ref to root->node */
-	free_extent_buffer(root->node);
+	spin_lock(&root->node_lock);
+	old = root->node;
 	root->node = c;
+	spin_unlock(&root->node_lock);
+
+	/* the super has an extra ref to root->node */
+	free_extent_buffer(old);
+
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
+	path->locks[level] = 1;
 	path->slots[level] = 0;
 
 	if (root->ref_cows && lower_gen != trans->transid) {
 		struct btrfs_path *back_path = btrfs_alloc_path();
 		int ret;
+		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = btrfs_insert_extent_backref(trans,
 						  root->fs_info->extent_root,
 						  path, lower->start,
 						  root->root_key.objectid,
 						  trans->transid, 0, 0);
 		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_free_path(back_path);
 	}
 	return 0;
@@ -1521,7 +1676,7 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 		root_gen = 0;
 
 	btrfs_node_key(c, &disk_key, 0);
-	split = __btrfs_alloc_free_block(trans, root, root->nodesize,
+	split = btrfs_alloc_free_block(trans, root, root->nodesize,
 					 root->root_key.objectid,
 					 root_gen,
 					 btrfs_disk_key_objectid(&disk_key),
@@ -1564,10 +1719,12 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
+		btrfs_tree_unlock(c);
 		free_extent_buffer(c);
 		path->nodes[level] = split;
 		path->slots[level + 1] += 1;
 	} else {
+		btrfs_tree_unlock(split);
 		free_extent_buffer(split);
 	}
 	return ret;
@@ -1648,30 +1805,24 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 
 	right = read_node_slot(root, upper, slot + 1);
+	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (free_space < data_size + sizeof(struct btrfs_item))
+		goto out_unlock;
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, right, upper,
 			      slot + 1, &right);
-	if (ret) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (ret)
+		goto out_unlock;
+
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (free_space < data_size + sizeof(struct btrfs_item))
+		goto out_unlock;
 
 	left_nritems = btrfs_header_nritems(left);
-	if (left_nritems == 0) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (left_nritems == 0)
+		goto out_unlock;
 
 	if (empty)
 		nr = 0;
@@ -1707,10 +1858,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		left->map_token = NULL;
 	}
 
-	if (push_items == 0) {
-		free_extent_buffer(right);
-		return 1;
-	}
+	if (push_items == 0)
+		goto out_unlock;
 
 	if (!empty && push_items == left_nritems)
 		WARN_ON(1);
@@ -1778,14 +1927,24 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
+		if (btrfs_header_nritems(path->nodes[0]) == 0)
+			clean_tree_block(trans, root, path->nodes[0]);
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
 		path->slots[1] += 1;
 	} else {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
 	}
 	return 0;
+
+out_unlock:
+	btrfs_tree_unlock(right);
+	free_extent_buffer(right);
+	return 1;
 }
+
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
@@ -1823,10 +1982,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
+	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	/* cow and double check */
@@ -1834,14 +1994,14 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			      path->nodes[1], slot - 1, &left);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size + sizeof(struct btrfs_item)) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	if (empty)
@@ -1876,8 +2036,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	if (push_items == 0) {
-		free_extent_buffer(left);
-		return 1;
+		ret = 1;
+		goto out;
 	}
 	if (!empty && push_items == btrfs_header_nritems(right))
 		WARN_ON(1);
@@ -1975,15 +2135,23 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
+		if (btrfs_header_nritems(path->nodes[0]) == 0)
+			clean_tree_block(trans, root, path->nodes[0]);
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
 		path->slots[1] -= 1;
 	} else {
+		btrfs_tree_unlock(left);
 		free_extent_buffer(left);
 		path->slots[0] -= push_items;
 	}
 	BUG_ON(path->slots[0] < 0);
 	return ret;
+out:
+	btrfs_tree_unlock(left);
+	free_extent_buffer(left);
+	return ret;
 }
 
 /*
@@ -2052,7 +2220,7 @@ again:
 
 	btrfs_item_key(l, &disk_key, 0);
 
-	right = __btrfs_alloc_free_block(trans, root, root->leafsize,
+	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					 root->root_key.objectid,
 					 root_gen, disk_key.objectid, 0,
 					 l->start, 0);
@@ -2085,6 +2253,8 @@ again:
 						  path->slots[1] + 1, 1);
 				if (wret)
 					ret = wret;
+
+				btrfs_tree_unlock(path->nodes[0]);
 				free_extent_buffer(path->nodes[0]);
 				path->nodes[0] = right;
 				path->slots[0] = 0;
@@ -2111,6 +2281,7 @@ again:
 						  path->slots[1], 1);
 				if (wret)
 					ret = wret;
+				btrfs_tree_unlock(path->nodes[0]);
 				free_extent_buffer(path->nodes[0]);
 				path->nodes[0] = right;
 				path->slots[0] = 0;
@@ -2184,12 +2355,15 @@ again:
 	BUG_ON(path->slots[0] != slot);
 
 	if (mid <= slot) {
+		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
-	} else
+	} else {
+		btrfs_tree_unlock(right);
 		free_extent_buffer(right);
+	}
 
 	BUG_ON(path->slots[0] < 0);
 
@@ -2418,10 +2592,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		total_data += data_size[i];
 	}
 
-	/* create a root if there isn't one */
-	if (!root->node)
-		BUG();
-
 	total_size = total_data + (nr - 1) * sizeof(struct btrfs_item);
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0) {
@@ -2516,7 +2686,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-
 out:
 	return ret;
 }
@@ -2655,7 +2824,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			u64 root_gen = btrfs_header_generation(path->nodes[1]);
-			clean_tree_block(trans, root, leaf);
 			wret = del_ptr(trans, root, path, 1, path->slots[1]);
 			if (wret)
 				ret = wret;
@@ -2706,8 +2874,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				root_gen = btrfs_header_generation(
 							   path->nodes[1]);
 
-				clean_tree_block(trans, root, leaf);
-
 				wret = del_ptr(trans, root, path, 1, slot);
 				if (wret)
 					ret = wret;
@@ -2720,7 +2886,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				if (wret)
 					ret = wret;
 			} else {
-				btrfs_mark_buffer_dirty(leaf);
+				/* if we're still in the path, make sure
+				 * we're dirty.  Otherwise, one of the
+				 * push_leaf functions must have already
+				 * dirtied this buffer
+				 */
+				if (path->nodes[0] == leaf)
+					btrfs_mark_buffer_dirty(leaf);
 				free_extent_buffer(leaf);
 			}
 		} else {
@@ -2731,56 +2903,40 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 }
 
 /*
- * walk up the tree as far as required to find the previous leaf.
+ * search the tree again to find a leaf with lesser keys
  * returns 0 if it found something or 1 if there are no lesser leaves.
  * returns < 0 on io errors.
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
-	int slot;
-	int level = 1;
-	struct extent_buffer *c;
-	struct extent_buffer *next = NULL;
+	struct btrfs_key key;
+	struct btrfs_disk_key found_key;
+	int ret;
 
-	while(level < BTRFS_MAX_LEVEL) {
-		if (!path->nodes[level])
-			return 1;
+	/* start from the first key of the leaf the path points at */
+	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
 
-		slot = path->slots[level];
-		c = path->nodes[level];
-		if (slot == 0) {
-			level++;
-			if (level == BTRFS_MAX_LEVEL)
-				return 1;
-			continue;
-		}
-		slot--;
-
-		if (next)
-			free_extent_buffer(next);
+	/* the smallest possible key has no predecessor */
+	if (!key.objectid && !key.type && !key.offset)
+		return 1;
+	/* subtract one with borrow; assumes unsigned fields (TODO confirm) */
+	if (!key.offset && !key.type)
+		key.objectid--;
+	if (!key.offset)
+		key.type--;		/* 0 wraps to the largest type */
+	key.offset--;			/* 0 wraps to the largest offset */
 
-		next = read_node_slot(root, c, slot);
-		break;
-	}
-	path->slots[level] = slot;
-	while(1) {
-		level--;
-		c = path->nodes[level];
-		free_extent_buffer(c);
-		slot = btrfs_header_nritems(next);
-		if (slot != 0)
-			slot--;
-		path->nodes[level] = next;
-		path->slots[level] = slot;
-		if (!level)
-			break;
-		next = read_node_slot(root, next, slot);
-	}
-	return 0;
+	btrfs_release_path(root, path);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	btrfs_item_key(path->nodes[0], &found_key, 0);
+	/* a leaf whose first key is <= the search key holds lesser keys */
+	return comp_keys(&found_key, &key) <= 0 ? 0 : 1;
 }
 
 /*
- * walk up the tree as far as required to find the next leaf.
+ * search the tree again to find a leaf with greater keys
  * returns 0 if it found something or 1 if there are no greater leaves.
  * returns < 0 on io errors.
  */
@@ -2790,6 +2946,28 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int level = 1;
 	struct extent_buffer *c;
 	struct extent_buffer *next = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int ret;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (nritems == 0) {
+		return 1;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
+
+	path->keep_locks = 1;
+	btrfs_release_path(root, path);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	path->keep_locks = 0;
+
+	if (ret < 0)
+		return ret;
+
+	if (path->slots[0] < nritems - 1) {
+		goto done;
+	}
 
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
@@ -2799,33 +2977,45 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL)
+			if (level == BTRFS_MAX_LEVEL) {
 				return 1;
+			}
 			continue;
 		}
 
-		if (next)
+		if (next) {
+			btrfs_tree_unlock(next);
 			free_extent_buffer(next);
+		}
 
-		if (path->reada)
+		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
+		if (!path->skip_locking)
+			btrfs_tree_lock(next);
 		break;
 	}
 	path->slots[level] = slot;
 	while(1) {
 		level--;
 		c = path->nodes[level];
+		if (path->locks[level])
+			btrfs_tree_unlock(c);
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
+		path->locks[level] = 1;
 		if (!level)
 			break;
-		if (path->reada)
-			reada_for_search(root, path, level, 0, 0);
+		if (level == 1 && path->locks[1] && path->reada)
+			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
+		if (!path->skip_locking)
+			btrfs_tree_lock(next);
 	}
+done:
+	unlock_up(path, 0, 1);
 	return 0;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dcea9d706d9..50891b39f36 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -330,8 +330,13 @@ struct btrfs_node {
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
+	/* if there is real range locking, this locks field will change */
+	int locks[BTRFS_MAX_LEVEL];	/* nonzero: nodes[i] was locked via this path */
 	int reada;
+	/* keep some upper locks as we walk down */
+	int keep_locks;
 	int lowest_level;
+	int skip_locking;	/* when set, btrfs_search_slot takes no tree locks */
 };
 
 /*
@@ -515,6 +520,8 @@ struct btrfs_fs_info {
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex fs_mutex;
+	struct mutex alloc_mutex;
+	struct mutex chunk_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -576,6 +583,10 @@ struct btrfs_fs_info {
  */
 struct btrfs_root {
 	struct extent_buffer *node;
+
+	/* the node lock is held while changing the node pointer */
+	spinlock_t node_lock;
+
 	struct extent_buffer *commit_root;
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -1353,13 +1364,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
 						 int data, int owner);
-int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner_objectid);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root, u32 size,
-					    u64 root_objectid,
-					    u64 hint, u64 empty_size);
-struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize,
 					     u64 root_objectid,
@@ -1368,8 +1373,6 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
-int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 new_size);
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
@@ -1409,6 +1412,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+
+struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e5c758e306d..fe40bdd984f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -32,6 +32,7 @@
 #include "volumes.h"
 #include "print-tree.h"
 #include "async-thread.h"
+#include "locking.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -681,9 +682,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
-	    root->fs_info->running_transaction->transid)
+	    root->fs_info->running_transaction->transid) {
+		WARN_ON(!btrfs_tree_locked(buf));
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
+	}
 	return 0;
 }
 
@@ -720,6 +723,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->in_sysfs = 0;
 
 	INIT_LIST_HEAD(&root->dirty_list);
+	spin_lock_init(&root->node_lock);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1196,6 +1200,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
+	mutex_init(&fs_info->alloc_mutex);
+	mutex_init(&fs_info->chunk_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1274,7 +1280,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	mutex_lock(&fs_info->fs_mutex);
 
+	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
 		       sb->s_id);
@@ -1296,7 +1304,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
 		 BTRFS_UUID_SIZE);
 
+	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	BUG_ON(ret);
 
 	btrfs_close_extra_devices(fs_devices);
@@ -1654,6 +1664,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
+	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
 			(unsigned long long)buf->start,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41a63462d3e..7e40c516fe6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,6 +25,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "locking.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -36,7 +37,28 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
+static struct btrfs_block_group_cache *
+__btrfs_find_block_group(struct btrfs_root *root,
+			 struct btrfs_block_group_cache *hint,
+			 u64 search_start, int data, int owner);
 
+/* take alloc_mutex unless root is the extent, chunk or dev tree root */
+void maybe_lock_mutex(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	if (root != info->extent_root && root != info->chunk_root &&
+	    root != info->dev_root)
+		mutex_lock(&info->alloc_mutex);
+}
+
+/* drop alloc_mutex unless root is the extent, chunk or dev tree root */
+void maybe_unlock_mutex(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	if (root != info->extent_root && root != info->chunk_root &&
+	    root != info->dev_root)
+		mutex_unlock(&info->alloc_mutex);
+}
 
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
@@ -66,6 +88,7 @@ static int cache_block_group(struct btrfs_root *root,
 		return -ENOMEM;
 
 	path->reada = 2;
+	path->skip_locking = 1;
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
@@ -290,7 +313,7 @@ no_cache:
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	}
 	cache_miss = 0;
-	cache = btrfs_find_block_group(root, cache, last, data, 0);
+	cache = __btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
@@ -318,10 +341,10 @@ static int block_group_state_bits(u64 flags)
 	return bits;
 }
 
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner)
+static struct btrfs_block_group_cache *
+__btrfs_find_block_group(struct btrfs_root *root,
+			 struct btrfs_block_group_cache *hint,
+			 u64 search_start, int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
 	struct extent_io_tree *block_group_cache;
@@ -411,6 +434,18 @@ found:
 	return found_group;
 }
 
+/* locked entry point: __btrfs_find_block_group() under alloc_mutex */
+struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
+		struct btrfs_block_group_cache *hint, u64 search_start,
+		int data, int owner)
+{
+	struct btrfs_block_group_cache *grp;
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	grp = __btrfs_find_block_group(root, hint, search_start, data, owner);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	return grp;
+}
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 			   u64 owner, u64 owner_offset)
 {
@@ -646,7 +681,7 @@ out:
 	return ret;
 }
 
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes,
 				u64 root_objectid, u64 ref_generation,
@@ -696,6 +731,22 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/* locked entry point: __btrfs_inc_extent_ref() under alloc_mutex */
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner, u64 owner_offset)
+{
+	int err;
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	err = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+				     root_objectid, ref_generation, owner,
+				     owner_offset);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	return err;
+}
+
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
@@ -760,6 +811,10 @@ u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
 	struct btrfs_extent_ref *ref_item;
 	int level = -1;
 
+	/* FIXME, needs locking */
+	BUG();
+
+	mutex_lock(&root->fs_info->alloc_mutex);
 	path = btrfs_alloc_path();
 again:
 	if (level == -1)
@@ -854,33 +909,9 @@ again:
 
 out:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return total_count;
 }
-int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 owner_objectid)
-{
-	u64 generation;
-	u64 key_objectid;
-	u64 level;
-	u32 nritems;
-	struct btrfs_disk_key disk_key;
-
-	level = btrfs_header_level(root->node);
-	generation = trans->transid;
-	nritems = btrfs_header_nritems(root->node);
-	if (nritems > 0) {
-		if (level == 0)
-			btrfs_item_key(root->node, &disk_key, 0);
-		else
-			btrfs_node_key(root->node, &disk_key, 0);
-		key_objectid = btrfs_disk_key_objectid(&disk_key);
-	} else {
-		key_objectid = 0;
-	}
-	return btrfs_inc_extent_ref(trans, root, root->node->start,
-				    root->node->len, owner_objectid,
-				    generation, level, key_objectid);
-}
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf)
@@ -897,6 +928,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (!root->ref_cows)
 		return 0;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 	for (i = 0; i < nritems; i++) {
@@ -913,7 +945,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (disk_bytenr == 0)
 				continue;
-			ret = btrfs_inc_extent_ref(trans, root, disk_bytenr,
+			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
 				    root->root_key.objectid, trans->transid,
 				    key.objectid, key.offset);
@@ -924,7 +956,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			btrfs_node_key_to_cpu(buf, &key, i);
-			ret = btrfs_inc_extent_ref(trans, root, bytenr,
+			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
 					   btrfs_level_size(root, level - 1),
 					   root->root_key.objectid,
 					   trans->transid,
@@ -935,6 +967,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		}
 	}
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 fail:
 	WARN_ON(1);
@@ -965,6 +998,7 @@ fail:
 		}
 	}
 #endif
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1019,6 +1053,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(block_group_cache, last,
 					    &start, &end, BLOCK_GROUP_DIRTY);
@@ -1045,6 +1080,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				  BLOCK_GROUP_DIRTY, GFP_NOFS);
 	}
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
@@ -1162,26 +1198,28 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		space_info->force_alloc = 0;
 	}
 	if (space_info->full)
-		return 0;
+		goto out;
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
-		return 0;
+		goto out;
 
+	mutex_lock(&extent_root->fs_info->chunk_mutex);
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
 	if (ret == -ENOSPC) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		return 0;
+		goto out;
 	}
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-
+	mutex_unlock(&extent_root->fs_info->chunk_mutex);
+out:
 	return 0;
 }
 
@@ -1318,6 +1356,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	struct extent_io_tree *free_space_cache;
 	free_space_cache = &root->fs_info->free_space_cache;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -1327,6 +1366,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 	}
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -1363,18 +1403,24 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 				  GFP_NOFS);
 		eb = read_tree_block(extent_root, ins.objectid, ins.offset,
 				     trans->transid);
+		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
 		if (level == 0) {
 			btrfs_item_key(eb, &first, 0);
 		} else {
 			btrfs_node_key(eb, &first, 0);
 		}
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+		/*
+		 * the first key is just a hint, so the race we've created
+		 * against reading it is fine
+		 */
 		err = btrfs_insert_extent_backref(trans, extent_root, path,
 					  start, extent_root->root_key.objectid,
 					  0, level,
 					  btrfs_disk_key_objectid(&first));
 		BUG_ON(err);
-		free_extent_buffer(eb);
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1384,12 +1430,14 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			  int pending)
 {
 	int err = 0;
-	struct extent_buffer *buf;
 
 	if (!pending) {
+#if 0
+		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (btrfs_buffer_uptodate(buf, 0)) {
+			if (!btrfs_try_tree_lock(buf) &&
+			    btrfs_buffer_uptodate(buf, 0)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
@@ -1398,12 +1446,15 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 				    !btrfs_header_flag(buf,
 					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
+					btrfs_tree_unlock(buf);
 					free_extent_buffer(buf);
 					return 1;
 				}
+				btrfs_tree_unlock(buf);
 			}
 			free_extent_buffer(buf);
 		}
+#endif
 		update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
@@ -1586,10 +1637,11 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 /*
  * remove an extent from the root, returns 0 on success
  */
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, u64 bytenr,
+			       u64 num_bytes, u64 root_objectid,
+			       u64 ref_generation, u64 owner_objectid,
+			       u64 owner_offset, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
@@ -1610,6 +1662,22 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret ? ret : pending_ret;
 }
 
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, u64 bytenr,
+		      u64 num_bytes, u64 root_objectid,
+		      u64 ref_generation, u64 owner_objectid,
+		      u64 owner_offset, int pin)
+{
+	int ret;
+	/* locked entry point: wrap __btrfs_free_extent() in maybe_lock_mutex() */
+	maybe_lock_mutex(root);
+	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes,
+				  root_objectid, ref_generation,
+				  owner_objectid, owner_offset, pin);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
 	u64 mask = ((u64)root->stripesize - 1);
@@ -1679,12 +1747,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		block_group = btrfs_lookup_first_block_group(info, hint_byte);
 		if (!block_group)
 			hint_byte = search_start;
-		block_group = btrfs_find_block_group(root, block_group,
+		block_group = __btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
 		if (last_ptr && *last_ptr == 0 && block_group)
 			hint_byte = block_group->key.objectid;
 	} else {
-		block_group = btrfs_find_block_group(root,
+		block_group = __btrfs_find_block_group(root,
 						     trans->block_group,
 						     search_start, data, 1);
 	}
@@ -1806,7 +1874,7 @@ enospc:
 	}
 	block_group = btrfs_lookup_first_block_group(info, search_start);
 	cond_resched();
-	block_group = btrfs_find_block_group(root, block_group,
+	block_group = __btrfs_find_block_group(root, block_group,
 					     search_start, data, 0);
 	goto check_failed;
 
@@ -1843,6 +1911,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	maybe_lock_mutex(root);
+
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
 			        info->data_alloc_profile;
@@ -1892,9 +1962,10 @@ again:
 	if (ret) {
 		printk("allocation failed flags %Lu\n", data);
 	}
-	BUG_ON(ret);
-	if (ret)
-		return ret;
+	if (ret) {
+		BUG();
+		goto out;
+	}
 
 	/* block accounting for super block */
 	super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -1953,11 +2024,11 @@ again:
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
 
-	if (ret) {
-		return ret;
-	}
+	if (ret)
+		goto out;
 	if (pending_ret) {
-		return pending_ret;
+		ret = pending_ret;
+		goto out;
 	}
 
 update_block:
@@ -1967,36 +2038,15 @@ update_block:
 		       ins->objectid, ins->offset);
 		BUG();
 	}
-	return 0;
+out:
+	maybe_unlock_mutex(root);
+	return ret;
 }
-
 /*
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize,
-					     u64 root_objectid, u64 hint,
-					     u64 empty_size)
-{
-	u64 ref_generation;
-
-	if (root->ref_cows)
-		ref_generation = trans->transid;
-	else
-		ref_generation = 0;
-
-
-	return __btrfs_alloc_free_block(trans, root, blocksize, root_objectid,
-					ref_generation, 0, 0, hint, empty_size);
-}
-
-/*
- * helper function to allocate a block for a given tree
- * returns the tree buffer or NULL.
- */
-struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize,
 					     u64 root_objectid,
@@ -2026,6 +2076,7 @@ struct extent_buffer *__btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 	btrfs_set_buffer_uptodate(buf);
 
@@ -2076,7 +2127,7 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 		if (disk_bytenr == 0)
 			continue;
-		ret = btrfs_free_extent(trans, root, disk_bytenr,
+		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
@@ -2151,6 +2202,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	int ret;
 	u32 refs;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = lookup_extent_ref(trans, root,
@@ -2182,6 +2235,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
+
 		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
@@ -2189,7 +2243,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			root_owner = btrfs_header_owner(parent);
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
-			ret = btrfs_free_extent(trans, root, bytenr,
+			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, root_owner,
 						root_gen, 0, 0, 1);
 			BUG_ON(ret);
@@ -2201,9 +2255,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			reada_walk_down(root, cur, path->slots[*level]);
 
 			mutex_unlock(&root->fs_info->fs_mutex);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			mutex_lock(&root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
 			ret = lookup_extent_ref(trans, root, bytenr,
@@ -2216,7 +2272,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 				path->slots[*level]++;
 				free_extent_buffer(next);
-				ret = btrfs_free_extent(trans, root, bytenr,
+				ret = __btrfs_free_extent(trans, root, bytenr,
 							blocksize,
 							root_owner,
 							root_gen, 0, 0, 1);
@@ -2244,13 +2300,14 @@ out:
 	}
 
 	root_gen = btrfs_header_generation(parent);
-	ret = btrfs_free_extent(trans, root, path->nodes[*level]->start,
+	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
 				path->nodes[*level]->len,
 				root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2350,6 +2407,12 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_node_key(node, &found_key, path->slots[level]);
 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
 			       sizeof(found_key)));
+		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+			if (path->nodes[i] && path->locks[i]) {
+				path->locks[i] = 0;
+				btrfs_tree_unlock(path->nodes[i]);
+			}
+		}
 	}
 	while(1) {
 		wret = walk_down_tree(trans, root, path, &level);
@@ -2383,6 +2446,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	u64 end;
 	u64 ptr;
 	int ret;
+
+	mutex_lock(&info->alloc_mutex);
 	while(1) {
 		ret = find_first_extent_bit(&info->block_group_cache, 0,
 					    &start, &end, (unsigned int)-1);
@@ -2402,6 +2467,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		clear_extent_dirty(&info->free_space_cache, start,
 				   end, GFP_NOFS);
 	}
+	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
 
@@ -2678,6 +2744,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 
 		eb = read_tree_block(found_root, extent_key->objectid,
 				     extent_key->offset, 0);
+		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
 
 		if (level == 0)
@@ -2685,6 +2752,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		else
 			btrfs_node_key_to_cpu(eb, &found_key, 0);
 
+		btrfs_tree_unlock(eb);
 		free_extent_buffer(eb);
 
 		ret = find_root_for_ref(extent_root, path, &found_key,
@@ -2888,6 +2956,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	int ret;
 	int progress;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
 						      shrink_start);
 	BUG_ON(!shrink_block_group);
@@ -3044,20 +3113,22 @@ next:
 			   (unsigned int)-1, GFP_NOFS);
 out:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
 			   struct btrfs_key *key)
 {
-	int ret;
+	int ret = 0;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 	int slot;
 
 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 	if (ret < 0)
-		return ret;
+		goto out;
+
 	while(1) {
 		slot = path->slots[0];
 		leaf = path->nodes[0];
@@ -3066,18 +3137,20 @@ int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
 			if (ret == 0)
 				continue;
 			if (ret < 0)
-				goto error;
+				goto out;
 			break;
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
 		if (found_key.objectid >= key->objectid &&
-		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
-			return 0;
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
 		path->slots[0]++;
 	}
 	ret = -ENOENT;
-error:
+out:
 	return ret;
 }
 
@@ -3103,6 +3176,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
+	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
@@ -3158,6 +3232,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -3205,5 +3280,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = del_pending_extents(trans, extent_root);
 	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
+
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 17c508a941e..bd15cdcaba9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2889,7 +2889,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		lock_page(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
 		else
@@ -2907,7 +2906,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			end  = start + PAGE_CACHE_SIZE - 1;
 			if (test_range_bit(tree, start, end,
 					   EXTENT_DIRTY, 0)) {
-				unlock_page(page);
 				continue;
 			}
 		}
@@ -2919,7 +2917,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 						PAGECACHE_TAG_DIRTY);
 		}
 		read_unlock_irq(&page->mapping->tree_lock);
-		unlock_page(page);
 	}
 	return 0;
 }
@@ -2948,17 +2945,12 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		 * on us if the page isn't already dirty.
 		 */
 		if (i == 0) {
-			lock_page(page);
 			set_page_extent_head(page, eb->len);
 		} else if (PagePrivate(page) &&
 			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
 			set_page_extent_mapped(page);
-			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
 	}
 	return set_extent_dirty(tree, eb->start,
 				eb->start + eb->len - 1, GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c79346fd2c..61bd8953a68 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -115,6 +115,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	mutex_unlock(&root->fs_info->fs_mutex);
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
@@ -159,6 +160,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	btrfs_add_ordered_inode(inode);
 	btrfs_update_inode(trans, root, inode);
 out:
+	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
@@ -349,10 +351,12 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
+	mutex_unlock(&root->fs_info->fs_mutex);
 
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -807,6 +811,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		goto err;
 	}
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	btrfs_release_path(root, path);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
 	ret = btrfs_del_inode_ref(trans, root, name, name_len,
@@ -881,8 +886,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
 		return -ENOTEMPTY;
+	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6fb45580275..3fbf74e93db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "locking.h"
 
 
@@ -75,9 +76,9 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	if (ret)
 		goto fail;
 
-	leaf = __btrfs_alloc_free_block(trans, root, root->leafsize,
-					objectid, trans->transid, 0, 0,
-					0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      objectid, trans->transid, 0, 0,
+				      0, 0);
 	if (IS_ERR(leaf))
 		return PTR_ERR(leaf);
 
@@ -108,6 +109,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
 	root_item.drop_level = 0;
 
+	btrfs_tree_unlock(leaf);
 	free_extent_buffer(leaf);
 	leaf = NULL;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
new file mode 100644
index 00000000000..80813a307b4
--- /dev/null
+++ b/fs/btrfs/locking.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <linux/bug.h>
+#include "ctree.h"
+#include "extent_io.h"
+#include "locking.h"
+/* lock a tree block: takes the page lock on eb->first_page; returns 0 */
+int btrfs_tree_lock(struct extent_buffer *eb)
+{
+	lock_page(eb->first_page);
+	return 0;
+}
+/* trylock: returns TestSetPageLocked()'s result, so 0 means we got the lock */
+int btrfs_try_tree_lock(struct extent_buffer *eb)
+{
+	return TestSetPageLocked(eb->first_page);
+}
+/* unlock eb->first_page, warning if it was not locked; always returns 0 */
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+	WARN_ON(!PageLocked(eb->first_page));
+	unlock_page(eb->first_page);
+	return 0;
+}
+/* nonzero when eb->first_page currently has its page lock held */
+int btrfs_tree_locked(struct extent_buffer *eb)
+{
+	return PageLocked(eb->first_page);
+}
+
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
new file mode 100644
index 00000000000..2dab96d8280
--- /dev/null
+++ b/fs/btrfs/locking.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_LOCKING_
+#define __BTRFS_LOCKING_
+/* extent buffer lock helpers; each takes the buffer to lock, test or unlock */
+int btrfs_tree_lock(struct extent_buffer *eb);
+int btrfs_tree_unlock(struct extent_buffer *eb);
+int btrfs_tree_locked(struct extent_buffer *eb);
+int btrfs_try_tree_lock(struct extent_buffer *eb);
+#endif /* __BTRFS_LOCKING_ */
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index f04684f7fea..1ed433a7149 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -23,6 +23,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "locking.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
@@ -96,8 +97,7 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_DEFRAG_TAG);
-			root->commit_root = root->node;
-			extent_buffer_get(root->node);
+			root->commit_root = btrfs_root_node(root);
 		} else {
 			WARN_ON(1);
 		}
@@ -559,6 +559,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct btrfs_root *root = pending->root;
 	struct extent_buffer *tmp;
+	struct extent_buffer *old;
 	int ret;
 	int namelen;
 	u64 objectid;
@@ -578,16 +579,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	extent_buffer_get(root->node);
-	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-	free_extent_buffer(tmp);
+	old = btrfs_lock_root_node(root);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old);
 
-	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+	btrfs_copy_root(trans, root, old, &tmp, objectid);
+	btrfs_tree_unlock(old);
+	free_extent_buffer(old);
 
 	btrfs_set_root_bytenr(new_root_item, tmp->start);
 	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
+	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 155961c7b4d..fab851d8538 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -181,6 +181,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (root->fs_info->extent_root == root)
 		is_extent = 1;
 
+	goto out;
+
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
 
-- 
cgit v1.2.3


From a213501153fd66e2359e091b1612841305ba6551 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Replace the big fs_mutex with a collection of other locks

Extent allocations are still protected by a large alloc_mutex.
Objectid allocations are covered by an objectid mutex.
Other btree operations are protected by a lock on individual btree nodes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 40 +++++++++++++++++++++-----
 fs/btrfs/ctree.h       |  5 ++--
 fs/btrfs/disk-io.c     | 15 +++++-----
 fs/btrfs/extent-tree.c | 18 +++++-------
 fs/btrfs/file.c        |  7 +----
 fs/btrfs/inode-map.c   |  8 ++++++
 fs/btrfs/inode.c       | 76 ++++----------------------------------------------
 fs/btrfs/ioctl.c       | 24 ++++------------
 fs/btrfs/super.c       |  2 --
 fs/btrfs/transaction.c | 42 +++++++++++-----------------
 fs/btrfs/volumes.c     | 19 +++++++++----
 fs/btrfs/xattr.c       | 10 +------
 12 files changed, 101 insertions(+), 165 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 75625c68fdd..dff4da082d0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -63,6 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
+	int skip = p->skip_locking;
+	int keep = p->keep_locks;
+
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
 			continue;
@@ -73,6 +76,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
+	p->skip_locking = skip;
+	p->keep_locks = keep;
 }
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -1202,13 +1207,19 @@ static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
 			u32 nritems;
 			t = path->nodes[i];
 			nritems = btrfs_header_nritems(t);
-			if (path->slots[i] >= nritems - 1) {
+			if (nritems < 2 || path->slots[i] >= nritems - 2) {
+if (path->keep_locks) {
+//printk("path %p skip level now %d\n", path, skip_level);
+}
 				skip_level = i + 1;
 				continue;
 			}
 		}
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+if (path->keep_locks) {
+//printk("path %p unlocking level %d slot %d nritems %d skip_level %d\n", path, i, path->slots[i], btrfs_header_nritems(t), skip_level);
+}
 			btrfs_tree_unlock(t);
 			path->locks[i] = 0;
 		}
@@ -1243,7 +1254,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
-	// WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 	WARN_ON(root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
 	WARN_ON(root == root->fs_info->chunk_root &&
@@ -1321,7 +1331,7 @@ again:
 			b = read_node_slot(root, b, slot);
 			if (!p->skip_locking)
 				btrfs_tree_lock(b);
-			unlock_up(p, level, lowest_unlock);
+			unlock_up(p, level + 1, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1804,6 +1814,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
+	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
@@ -1981,6 +1993,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
+	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
@@ -2957,15 +2971,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 
-	path->keep_locks = 1;
 	btrfs_release_path(root, path);
+	path->keep_locks = 1;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
 		return ret;
 
-	if (path->slots[0] < nritems - 1) {
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (nritems > 0 && path->slots[0] < nritems - 1) {
 		goto done;
 	}
 
@@ -2992,8 +3007,20 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
-		if (!path->skip_locking)
+		if (!path->skip_locking) {
+			if (!btrfs_tree_locked(c)) {
+				int i;
+				WARN_ON(1);
+printk("path %p no lock on level %d\n", path, level);
+for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+	/* levels this path does not reach are NULL; don't dereference them */
+	if (!path->nodes[i])
+		continue;
+printk("path %p level %d slot %d nritems %d\n", path, i, path->slots[i], btrfs_header_nritems(path->nodes[i]));
+}
+			}
 			btrfs_tree_lock(next);
+		}
 		break;
 	}
 	path->slots[level] = slot;
@@ -3011,8 +3035,10 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
-		if (!path->skip_locking)
+		if (!path->skip_locking) {
+			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
 			btrfs_tree_lock(next);
+		}
 	}
 done:
 	unlock_up(path, 0, 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50891b39f36..692b8ea42de 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -519,9 +519,9 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
-	struct mutex fs_mutex;
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
+	struct mutex drop_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -554,7 +554,7 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
-	unsigned long throttles;
+	atomic_t throttles;
 
 	u64 total_pinned;
 	struct list_head dirty_cowonly_roots;
@@ -594,6 +594,7 @@ struct btrfs_root {
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
+	struct mutex objectid_mutex;
 	u64 objectid;
 	u64 last_trans;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fe40bdd984f..f638803549e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -724,6 +724,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	spin_lock_init(&root->node_lock);
+	mutex_init(&root->objectid_mutex);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1146,6 +1147,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->throttles, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1199,7 +1201,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
-	mutex_init(&fs_info->fs_mutex);
+	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 
@@ -1278,8 +1280,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	mutex_lock(&fs_info->fs_mutex);
-
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
@@ -1342,7 +1342,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
-	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
 fail_extent_root:
@@ -1350,7 +1349,6 @@ fail_extent_root:
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
 fail_sys_array:
-	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
@@ -1562,8 +1560,9 @@ int close_ctree(struct btrfs_root *root)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	fs_info->closing = 1;
+	smp_mb();
+
 	btrfs_transaction_flush_work(root);
-	mutex_lock(&fs_info->fs_mutex);
 	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
@@ -1574,7 +1573,6 @@ int close_ctree(struct btrfs_root *root)
 	BUG_ON(ret);
 
 	write_ctree_super(NULL, root);
-	mutex_unlock(&fs_info->fs_mutex);
 
 	btrfs_transaction_flush_work(root);
 
@@ -1679,7 +1677,8 @@ void btrfs_throttle(struct btrfs_root *root)
 	struct backing_dev_info *bdi;
 
 	bdi = &root->fs_info->bdi;
-	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
+	if (atomic_read(&root->fs_info->throttles) &&
+	    bdi_write_congested(bdi)) {
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
 #else
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7e40c516fe6..890b9e9d8e2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1577,9 +1577,11 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		/* block accounting for super block */
+		spin_lock_irq(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - num_bytes);
+		spin_unlock_irq(&info->delalloc_lock);
 
 		/* block accounting for root item */
 		root_used = btrfs_root_used(&root->root_item);
@@ -1968,8 +1970,10 @@ again:
 	}
 
 	/* block accounting for super block */
+	spin_lock_irq(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+	spin_unlock_irq(&info->delalloc_lock);
 
 	/* block accounting for root item */
 	root_used = btrfs_root_used(&root->root_item);
@@ -2172,12 +2176,12 @@ static void noinline reada_walk_down(struct btrfs_root *root,
 				continue;
 			}
 		}
-		mutex_unlock(&root->fs_info->fs_mutex);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		ret = readahead_tree_block(root, bytenr, blocksize,
 					   btrfs_node_ptr_generation(node, i));
 		last = bytenr + blocksize;
 		cond_resched();
-		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->alloc_mutex);
 		if (ret)
 			break;
 	}
@@ -2254,11 +2258,9 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
 
-			mutex_unlock(&root->fs_info->fs_mutex);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
-			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
@@ -2381,6 +2383,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int orig_level;
 	struct btrfs_root_item *root_item = &root->root_item;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
@@ -2710,7 +2713,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		    *last_file_root == ref_root)
 			goto out;
 
-		mutex_unlock(&extent_root->fs_info->fs_mutex);
 		inode = btrfs_iget_locked(extent_root->fs_info->sb,
 					  ref_objectid, found_root);
 		if (inode->i_state & I_NEW) {
@@ -2727,7 +2729,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		 * the latest version of the tree root
 		 */
 		if (is_bad_inode(inode)) {
-			mutex_lock(&extent_root->fs_info->fs_mutex);
 			goto out;
 		}
 		*last_file_objectid = inode->i_ino;
@@ -2736,7 +2737,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
 		iput(inode);
-		mutex_lock(&extent_root->fs_info->fs_mutex);
 	} else {
 		struct btrfs_trans_handle *trans;
 		struct extent_buffer *eb;
@@ -3033,9 +3033,7 @@ next:
 
 		if (progress && need_resched()) {
 			memcpy(&key, &found_key, sizeof(key));
-			mutex_unlock(&root->fs_info->fs_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->fs_mutex);
 			btrfs_release_path(root, path);
 			btrfs_search_slot(NULL, root, &key, path, 0, 0);
 			progress = 0;
@@ -3068,9 +3066,7 @@ next:
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		btrfs_clean_old_snapshots(tree_root);
-		mutex_lock(&root->fs_info->fs_mutex);
 
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 73c6d085bd9..18bbe108a0e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -252,7 +252,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
@@ -341,7 +340,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
@@ -905,9 +903,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(pages));
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		ret = btrfs_check_free_space(root, write_bytes, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		if (ret)
 			goto out;
 
@@ -998,9 +994,9 @@ static int btrfs_sync_file(struct file *file,
 	 * check the transaction that last modified this inode
 	 * and see if its already been committed
 	 */
-	mutex_lock(&root->fs_info->fs_mutex);
 	if (!BTRFS_I(inode)->last_trans)
 		goto out;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
@@ -1023,7 +1019,6 @@ static int btrfs_sync_file(struct file *file,
 	}
 	ret = btrfs_commit_transaction(trans, root);
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret > 0 ? EIO : ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index a0925eabdaa..298346ae148 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -69,6 +69,12 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
+	mutex_lock(&root->objectid_mutex);
+	if (root->last_inode_alloc) {
+		*objectid = ++root->last_inode_alloc;
+		mutex_unlock(&root->objectid_mutex);
+		return 0;
+	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	search_start = root->last_inode_alloc;
@@ -124,9 +130,11 @@ found:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
+	mutex_unlock(&root->objectid_mutex);
 	return 0;
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	mutex_unlock(&root->objectid_mutex);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61bd8953a68..b2251e27ac8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,12 +79,15 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
-	u64 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	u64 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
+	u64 total;
+	u64 used;
 	u64 thresh;
 	unsigned long flags;
 	int ret = 0;
 
+	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
 	if (for_del)
 		thresh = total * 90;
 	else
@@ -92,7 +95,6 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 
 	do_div(thresh, 100);
 
-	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
 		ret = -ENOSPC;
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
@@ -115,7 +117,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
@@ -160,7 +161,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	btrfs_add_ordered_inode(inode);
 	btrfs_update_inode(trans, root, inode);
 out:
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
@@ -269,14 +269,13 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
-	mutex_lock(&root->fs_info->fs_mutex);
+
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, start, end);
 	else
 		ret = cow_file_range(inode, start, end);
 
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -349,17 +348,13 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	kfree(sums);
 
@@ -404,7 +399,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
 	if (IS_ERR(item)) {
@@ -422,7 +416,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 out:
 	if (path)
 		btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -616,7 +609,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -662,8 +654,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 	btrfs_free_path(path);
 	inode_item = NULL;
 
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
@@ -691,9 +681,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	return;
 
 make_bad:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	make_bad_inode(inode);
 }
 
@@ -758,7 +746,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -849,7 +836,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	unsigned long nr = 0;
 
 	root = BTRFS_I(dir)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
 
 	ret = btrfs_check_free_space(root, 1, 1);
 	if (ret)
@@ -871,7 +857,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -890,7 +875,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 1);
 	if (ret)
 		goto fail;
@@ -907,7 +891,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 
@@ -1129,7 +1112,6 @@ error:
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
 	}
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	inode->i_sb->s_dirt = 1;
 	return ret;
@@ -1234,9 +1216,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_size <= hole_start)
 			goto out;
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		err = btrfs_check_free_space(root, 1, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		if (err)
 			goto fail;
 
@@ -1245,7 +1225,6 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_drop_extents(trans, root, inode,
@@ -1262,7 +1241,6 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		if (err)
 			return err;
@@ -1286,7 +1264,6 @@ void btrfs_delete_inode(struct inode *inode)
 	}
 
 	inode->i_size = 0;
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -1298,7 +1275,6 @@ void btrfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return;
@@ -1306,7 +1282,6 @@ void btrfs_delete_inode(struct inode *inode)
 no_delete_lock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 no_delete:
@@ -1402,7 +1377,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 
 	*sub_root = btrfs_read_fs_root(root->fs_info, location,
 					dentry->d_name.name,
@@ -1416,7 +1390,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	location->offset = 0;
 
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return 0;
 }
 
@@ -1482,9 +1455,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_inode_by_name(dir, dentry, &location);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (ret < 0)
 		return ERR_PTR(ret);
@@ -1559,7 +1530,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		filp->f_pos = 1;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	path = btrfs_alloc_path();
 	path->reada = 2;
@@ -1668,9 +1638,7 @@ read_dir_items:
 nopos:
 	ret = 0;
 err:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -1681,11 +1649,9 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	int ret = 0;
 
 	if (wait) {
-		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
 	}
 	return ret;
 }
@@ -1701,12 +1667,10 @@ void btrfs_dirty_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -1874,7 +1838,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -1912,8 +1875,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -1934,7 +1895,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	unsigned long nr = 0;
 	u64 objectid;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -1980,8 +1940,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -2009,7 +1967,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 #else
 	inc_nlink(inode);
 #endif
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -2032,8 +1989,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -2053,7 +2008,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 objectid = 0;
 	unsigned long nr = 1;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto out_unlock;
@@ -2106,7 +2060,6 @@ out_fail:
 	btrfs_end_transaction(trans, root);
 
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -2199,7 +2152,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 
 again:
 	spin_lock(&em_tree->lock);
@@ -2402,7 +2354,6 @@ out:
 		if (!err)
 			err = ret;
 	}
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (err) {
 		free_extent_map(em);
 		WARN_ON(1);
@@ -2584,9 +2535,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	int ret;
 	u64 page_start;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret)
 		goto out;
 
@@ -2631,7 +2580,6 @@ static void btrfs_truncate(struct inode *inode)
 
 	btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -2643,7 +2591,6 @@ static void btrfs_truncate(struct inode *inode)
 
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 }
@@ -2827,7 +2774,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
-	struct btrfs_path *path;
 	int ret;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -2835,7 +2781,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		return -ENOTEMPTY;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto out_unlock;
@@ -2843,11 +2788,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, new_dir);
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out_fail;
-	}
 
 	old_dentry->d_inode->i_nlink++;
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
@@ -2869,10 +2809,8 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		goto out_fail;
 
 out_fail:
-	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -2898,7 +2836,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto out_fail;
@@ -2979,7 +2916,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 out_fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3fbf74e93db..6002eb64daf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -63,7 +63,6 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	unsigned long nr = 1;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto fail_commit;
@@ -164,7 +163,6 @@ fail:
 	if (err && !ret)
 		ret = err;
 fail_commit:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -181,7 +179,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto fail_unlock;
@@ -208,7 +205,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	err = btrfs_commit_transaction(trans, root);
 
 fail_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -228,9 +224,7 @@ int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, inode->i_size, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret)
 		return -ENOSPC;
 
@@ -315,7 +309,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -385,7 +380,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	}
 
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 out:
 	kfree(vol_args);
 	return ret;
@@ -428,11 +424,9 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 	}
 
 	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-	mutex_lock(&root->fs_info->fs_mutex);
 	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
 			    path, root_dirid,
 			    vol_args->name, namelen, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_free_path(path);
 
 	if (di && !IS_ERR(di)) {
@@ -445,10 +439,12 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 		goto out;
 	}
 
+	mutex_lock(&root->fs_info->drop_mutex);
 	if (root == root->fs_info->tree_root)
 		ret = create_subvol(root, vol_args->name, namelen);
 	else
 		ret = create_snapshot(root, vol_args->name, namelen);
+	mutex_unlock(&root->fs_info->drop_mutex);
 out:
 	kfree(vol_args);
 	return ret;
@@ -461,10 +457,8 @@ static int btrfs_ioctl_defrag(struct file *file)
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
-		mutex_lock(&root->fs_info->fs_mutex);
 		btrfs_defrag_root(root, 0);
 		btrfs_defrag_root(root->fs_info->extent_root, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		break;
 	case S_IFREG:
 		btrfs_defrag_file(file);
@@ -588,7 +582,6 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 0);
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -685,7 +678,6 @@ out:
 	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 out_unlock:
 	mutex_unlock(&src->i_mutex);
@@ -711,7 +703,6 @@ long btrfs_ioctl_trans_start(struct file *file)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	if (file->private_data) {
 		ret = -EINPROGRESS;
 		goto out;
@@ -723,7 +714,6 @@ long btrfs_ioctl_trans_start(struct file *file)
 		ret = -ENOMEM;
 	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -740,7 +730,6 @@ long btrfs_ioctl_trans_end(struct file *file)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = file->private_data;
 	if (!trans) {
 		ret = -EINVAL;
@@ -749,7 +738,6 @@ long btrfs_ioctl_trans_end(struct file *file)
 	btrfs_end_transaction(trans, root);
 	file->private_data = 0;
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 196d0e280b1..b61ded7a20c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -366,12 +366,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 	btrfs_clean_old_snapshots(root);
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1ed433a7149..5a1ee0665ae 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -370,6 +370,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
 
+	smp_mb();
 	if (root->defrag_running)
 		return 0;
 	trans = btrfs_start_transaction(root, 1);
@@ -378,16 +379,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		mutex_unlock(&info->fs_mutex);
 		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
-		mutex_lock(&info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		if (ret != -EAGAIN)
 			break;
 	}
 	root->defrag_running = 0;
+	smp_mb();
 	radix_tree_tag_clear(&info->fs_roots_radix,
 		     (unsigned long)root->root_key.objectid,
 		     BTRFS_ROOT_DEFRAG_TAG);
@@ -435,14 +435,14 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	while(!list_empty(list)) {
 		struct btrfs_root *root;
 
-		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
 		num_bytes = btrfs_root_used(&dirty->root->root_item);
 		root = dirty->latest_root;
-		root->fs_info->throttles++;
+		atomic_inc(&root->fs_info->throttles);
 
+		mutex_lock(&root->fs_info->drop_mutex);
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
@@ -459,14 +459,16 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			nr = trans->blocks_used;
 			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
-			mutex_unlock(&tree_root->fs_info->fs_mutex);
+
+			mutex_unlock(&root->fs_info->drop_mutex);
 			btrfs_btree_balance_dirty(tree_root, nr);
 			cond_resched();
-			mutex_lock(&tree_root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->drop_mutex);
 		}
 		BUG_ON(ret);
-		root->fs_info->throttles--;
+		atomic_dec(&root->fs_info->throttles);
 
+		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
@@ -474,11 +476,15 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
+		mutex_unlock(&root->fs_info->alloc_mutex);
+
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
 			BUG();
 			break;
 		}
+		mutex_unlock(&root->fs_info->drop_mutex);
+
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
@@ -486,7 +492,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
-		mutex_unlock(&tree_root->fs_info->fs_mutex);
 
 		btrfs_btree_balance_dirty(tree_root, nr);
 		cond_resched();
@@ -503,7 +508,7 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	u64 objectid = 0;
 	int ret;
 
-	root->fs_info->throttles++;
+	atomic_inc(&root->fs_info->throttles);
 	while(1) {
 		ret = btrfs_find_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
@@ -512,7 +517,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 			break;
 
 		mutex_unlock(&root->fs_info->trans_mutex);
-		mutex_unlock(&root->fs_info->fs_mutex);
 
 		if (S_ISREG(inode->i_mode)) {
 			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
@@ -521,7 +525,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		}
 		iput(inode);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
 	while(1) {
@@ -533,7 +536,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		if (!ret)
 			break;
 		mutex_unlock(&root->fs_info->trans_mutex);
-		mutex_unlock(&root->fs_info->fs_mutex);
 
 		if (S_ISREG(inode->i_mode)) {
 			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
@@ -543,10 +545,9 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		atomic_dec(&inode->i_count);
 		iput(inode);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
-	root->fs_info->throttles--;
+	atomic_dec(&root->fs_info->throttles);
 	return 0;
 }
 
@@ -661,7 +662,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
 
@@ -669,7 +669,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		put_transaction(cur_trans);
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
 
@@ -687,12 +686,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 					struct btrfs_transaction, list);
 		if (!prev_trans->commit_done) {
 			prev_trans->use_count++;
-			mutex_unlock(&root->fs_info->fs_mutex);
 			mutex_unlock(&root->fs_info->trans_mutex);
 
 			wait_for_commit(root, prev_trans);
 
-			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->trans_mutex);
 			put_transaction(prev_trans);
 		}
@@ -709,12 +706,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		else
 			timeout = 1;
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
 
 		schedule_timeout(timeout);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
 		ret = btrfs_write_ordered_inodes(trans, root);
@@ -755,12 +750,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_copy_pinned(root, pinned_copy);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 
@@ -781,9 +774,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
-		mutex_unlock(&root->fs_info->fs_mutex);
 		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
-		mutex_lock(&root->fs_info->fs_mutex);
 	}
 	return ret;
 }
@@ -823,7 +814,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	unsigned long delay = HZ * 30;
 	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	smp_mb();
 	if (root->fs_info->closing)
 		goto out;
 
@@ -844,7 +835,6 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_clean_old_snapshots(root);
 	btrfs_transaction_queue_work(root, delay);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ba396857102..869864ddcc2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -866,7 +866,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
@@ -984,7 +985,8 @@ error_close:
 		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1003,7 +1005,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!bdev) {
 		return -EIO;
 	}
-	mutex_lock(&root->fs_info->fs_mutex);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
@@ -1057,7 +1062,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	return ret;
 
 out_close_bdev:
@@ -1297,9 +1304,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
+	BUG(); /* FIXME, needs locking */
+
 	dev_root = dev_root->fs_info->dev_root;
 
-	mutex_lock(&dev_root->fs_info->fs_mutex);
 	/* step one make some room on all the devices */
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1368,7 +1376,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f63488dc2f1..3e9eb91c3c8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -153,7 +153,6 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 		return -ENOMEM;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
 				strlen(name), 0);
@@ -181,7 +180,6 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 	ret = btrfs_dir_data_len(leaf, di);
 
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	kfree(name);
 	btrfs_free_path(path);
 	return ret;
@@ -210,7 +208,6 @@ int btrfs_xattr_set(struct inode *inode, int name_index,
 		return -ENOMEM;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -260,7 +257,6 @@ out:
 	}
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	kfree(name);
 	btrfs_free_path(path);
 
@@ -297,8 +293,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		return -ENOMEM;
 	path->reada = 2;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-
 	/* search for our xattrs */
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
@@ -379,15 +373,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	ret = total_size;
 
 err:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_free_path(path);
 
 	return ret;
 }
 
 /*
- * delete all the xattrs associated with the inode.  fs_mutex should be
- * held when we come into here
+ * delete all the xattrs associated with the inode.
  */
 int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode)
-- 
cgit v1.2.3


From 051e1b9f748ae673b7325d3fc049bb838606cffa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Drop locks in btrfs_search_slot when reading a tree block.

One lock per btree block can make for significant congestion if everyone
has to wait for IO at the high levels of the btree.  This drops
locks held by a path when doing reads during a tree search.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 74 +++++++++++++++++++++++++-------------------------
 fs/btrfs/ctree.h       |  1 -
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c |  1 -
 4 files changed, 38 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dff4da082d0..1b756fae279 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -63,7 +63,6 @@ void btrfs_free_path(struct btrfs_path *p)
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
-	int skip = p->skip_locking;
 	int keep = p->keep_locks;
 
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -76,7 +75,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
-	p->skip_locking = skip;
 	p->keep_locks = keep;
 }
 
@@ -1137,7 +1135,6 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 		return;
 
 	node = path->nodes[level];
-	WARN_ON(!path->skip_locking && !btrfs_tree_locked(node));
 
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = btrfs_level_size(root, level - 1);
@@ -1192,6 +1189,7 @@ static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
 {
 	int i;
 	int skip_level = level;
+	int no_skips = 0;
 	struct extent_buffer *t;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1199,27 +1197,24 @@ static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
 			break;
 		if (!path->locks[i])
 			break;
-		if (path->slots[i] == 0) {
+		if (!no_skips && path->slots[i] == 0) {
 			skip_level = i + 1;
 			continue;
 		}
-		if (path->keep_locks) {
+		if (!no_skips && path->keep_locks) {
 			u32 nritems;
 			t = path->nodes[i];
 			nritems = btrfs_header_nritems(t);
-			if (nritems < 2 || path->slots[i] >= nritems - 2) {
-if (path->keep_locks) {
-//printk("path %p skip level now %d\n", path, skip_level);
-}
+			if (nritems < 1 || path->slots[i] >= nritems - 1) {
 				skip_level = i + 1;
 				continue;
 			}
 		}
+		if (skip_level < i && i >= lowest_unlock)
+			no_skips = 1;
+
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-if (path->keep_locks) {
-//printk("path %p unlocking level %d slot %d nritems %d skip_level %d\n", path, i, path->slots[i], btrfs_header_nritems(t), skip_level);
-}
 			btrfs_tree_unlock(t);
 			path->locks[i] = 0;
 		}
@@ -1244,6 +1239,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      ins_len, int cow)
 {
 	struct extent_buffer *b;
+	struct extent_buffer *tmp;
 	int slot;
 	int ret;
 	int level;
@@ -1263,10 +1259,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ins_len < 0)
 		lowest_unlock = 2;
 again:
-	if (!p->skip_locking)
-		b = btrfs_lock_root_node(root);
-	else
-		b = btrfs_root_node(root);
+	b = btrfs_lock_root_node(root);
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1286,8 +1279,7 @@ again:
 			WARN_ON(1);
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
+		p->locks[level] = 1;
 		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
@@ -1328,10 +1320,29 @@ again:
 				reada_for_search(root, p, level, slot,
 						 key->objectid);
 
-			b = read_node_slot(root, b, slot);
-			if (!p->skip_locking)
-				btrfs_tree_lock(b);
-			unlock_up(p, level + 1, lowest_unlock);
+			tmp = btrfs_find_tree_block(root,
+					  btrfs_node_blockptr(b, slot),
+					  btrfs_level_size(root, level - 1));
+			if (tmp && btrfs_buffer_uptodate(tmp,
+				   btrfs_node_ptr_generation(b, slot))) {
+				b = tmp;
+			} else {
+				/*
+				 * reduce lock contention at high levels
+				 * of the btree by dropping locks before
+				 * we read.
+				 */
+				if (level > 1) {
+					btrfs_release_path(NULL, p);
+					if (tmp)
+						free_extent_buffer(tmp);
+					goto again;
+				} else {
+					b = read_node_slot(root, b, slot);
+				}
+			}
+			btrfs_tree_lock(b);
+			unlock_up(p, level, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -3007,17 +3018,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
-		if (!path->skip_locking) {
-			if (!btrfs_tree_locked(c)) {
-				int i;
-				WARN_ON(1);
-printk("path %p no lock on level %d\n", path, level);
-for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-printk("path %p level %d slot %d nritems %d\n", path, i, path->slots[i], btrfs_header_nritems(path->nodes[i]));
-}
-			}
-			btrfs_tree_lock(next);
-		}
+		WARN_ON(!btrfs_tree_locked(c));
+		btrfs_tree_lock(next);
 		break;
 	}
 	path->slots[level] = slot;
@@ -3035,10 +3037,8 @@ printk("path %p level %d slot %d nritems %d\n", path, i, path->slots[i], btrfs_h
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
-		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
-			btrfs_tree_lock(next);
-		}
+		WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+		btrfs_tree_lock(next);
 	}
 done:
 	unlock_up(path, 0, 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 692b8ea42de..9ea12d42741 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -336,7 +336,6 @@ struct btrfs_path {
 	/* keep some upper locks as we walk down */
 	int keep_locks;
 	int lowest_level;
-	int skip_locking;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f638803549e..ffc363d2fb2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1684,6 +1684,7 @@ void btrfs_throttle(struct btrfs_root *root)
 #else
 		blk_congestion_wait(WRITE, HZ/20);
 #endif
+
 	}
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 890b9e9d8e2..0905653dd3f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -88,7 +88,6 @@ static int cache_block_group(struct btrfs_root *root,
 		return -ENOMEM;
 
 	path->reada = 2;
-	path->skip_locking = 1;
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
-- 
cgit v1.2.3


From 594a24eb0e7fa8413f8b443863be4b7c72bfde9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Fix btrfs_del_ordered_inode to allow forcing the drop during unlinks

This allows us to delete an unlinked inode with dirty pages from the list
instead of forcing commit to write these out before deleting the inode.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c        | 18 +++++++++++++-----
 fs/btrfs/file.c         |  2 +-
 fs/btrfs/inode.c        |  4 +---
 fs/btrfs/ordered-data.c |  6 +++---
 fs/btrfs/ordered-data.h |  2 +-
 5 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 1b756fae279..9601241e552 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1245,7 +1245,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int level;
 	int should_reada = p->reada;
 	int lowest_unlock = 1;
+	int blocksize;
 	u8 lowest_level = 0;
+	u64 blocknr;
+	u64 gen;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
@@ -1320,11 +1323,12 @@ again:
 				reada_for_search(root, p, level, slot,
 						 key->objectid);
 
-			tmp = btrfs_find_tree_block(root,
-					  btrfs_node_blockptr(b, slot),
-					  btrfs_level_size(root, level - 1));
-			if (tmp && btrfs_buffer_uptodate(tmp,
-				   btrfs_node_ptr_generation(b, slot))) {
+			blocknr = btrfs_node_blockptr(b, slot);
+			gen = btrfs_node_ptr_generation(b, slot);
+			blocksize = btrfs_level_size(root, level - 1);
+
+			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
+			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
 				b = tmp;
 			} else {
 				/*
@@ -1334,6 +1338,10 @@ again:
 				 */
 				if (level > 1) {
 					btrfs_release_path(NULL, p);
+					if (tmp)
+						free_extent_buffer(tmp);
+					tmp = read_tree_block(root, blocknr,
+							 blocksize, gen);
 					if (tmp)
 						free_extent_buffer(tmp);
 					goto again;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 18bbe108a0e..b7f8f92daf8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -976,7 +976,7 @@ out_nolock:
 
 int btrfs_release_file(struct inode * inode, struct file * filp)
 {
-	btrfs_del_ordered_inode(inode);
+	btrfs_del_ordered_inode(inode, 0);
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2251e27ac8..cf27b598462 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -852,7 +852,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 		 * we don't need to worry about
 		 * data=ordered
 		 */
-		btrfs_del_ordered_inode(inode);
+		btrfs_del_ordered_inode(inode, 1);
 	}
 
 	btrfs_end_transaction(trans, root);
@@ -1276,14 +1276,12 @@ void btrfs_delete_inode(struct inode *inode)
 
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return;
 
 no_delete_lock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 no_delete:
 	clear_inode(inode);
 }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 8dd8180183f..5e4c0d95ce4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -254,7 +254,7 @@ static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 	return;
 }
 
-void btrfs_del_ordered_inode(struct inode *inode)
+void btrfs_del_ordered_inode(struct inode *inode, int force)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 root_objectid = root->root_key.objectid;
@@ -263,8 +263,8 @@ void btrfs_del_ordered_inode(struct inode *inode)
 		return;
 	}
 
-	if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
+	if (!force && (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		return;
 
 	spin_lock(&root->fs_info->new_trans_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c515c4b3999..4fa78736423 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -38,6 +38,6 @@ int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
 				       u64 *root_objectid, u64 *objectid,
 				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode);
+void btrfs_del_ordered_inode(struct inode *inode, int force);
 int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
 #endif
-- 
cgit v1.2.3


From 168fd7d271d9d8e81ff0b03eb08c36d82670c8a9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Fix btrfs_next_leaf to check for new items after dropping locks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9601241e552..b8f7aecf68d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2999,7 +2999,14 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		return ret;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * by releasing the path above we dropped all our locks.  A balance
+	 * could have added more items next to the key that used to be
+	 * at the very end of the block.  So, check again here and
+	 * advance the path if there are now more items available.
+	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
+		path->slots[0]++;
 		goto done;
 	}
 
-- 
cgit v1.2.3


From 5cd57b2cbbb06a350df2698314e4e6a80805fc2f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Add a skip_locking parameter to struct path, and make various
 funcs honor it

Allocations may need to read in block groups from the extent allocation tree,
which will require a tree search and take locks on the extent allocation
tree.  But, those locks might already be held in other places, leading
to deadlocks.

Since the alloc_mutex serializes everything right now, it is safe to
skip the btree locking while caching block groups.  A better fix will be
to either create a recursive lock or find a way to back off existing
locks while caching block groups.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 32 ++++++++++++++++++--------------
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c |  6 ++++++
 3 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b8f7aecf68d..fd68601b60b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -64,6 +64,7 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 	int keep = p->keep_locks;
+	int skip = p->skip_locking;
 
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
@@ -76,6 +77,7 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	}
 	memset(p, 0, sizeof(*p));
 	p->keep_locks = keep;
+	p->skip_locking = skip;
 }
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -1262,7 +1264,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ins_len < 0)
 		lowest_unlock = 2;
 again:
-	b = btrfs_lock_root_node(root);
+	if (p->skip_locking)
+		b = btrfs_root_node(root);
+	else
+		b = btrfs_lock_root_node(root);
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1282,7 +1287,8 @@ again:
 			WARN_ON(1);
 		level = btrfs_header_level(b);
 		p->nodes[level] = b;
-		p->locks[level] = 1;
+		if (!p->skip_locking)
+			p->locks[level] = 1;
 		ret = check_block(root, p, level);
 		if (ret)
 			return -1;
@@ -1349,7 +1355,8 @@ again:
 					b = read_node_slot(root, b, slot);
 				}
 			}
-			btrfs_tree_lock(b);
+			if (!p->skip_locking)
+				btrfs_tree_lock(b);
 			unlock_up(p, level, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
@@ -1392,13 +1399,6 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
 			break;
 		t = path->nodes[i];
 		btrfs_set_node_key(t, key, tslot);
-		if (!btrfs_tree_locked(path->nodes[i])) {
-			int ii;
-printk("fixup without lock on level %d\n", btrfs_header_level(path->nodes[i]));
-			for (ii = 0; ii < BTRFS_MAX_LEVEL; ii++) {
-printk("level %d slot %d\n", ii, path->slots[ii]);
-			}
-		}
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
@@ -3033,8 +3033,10 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
-		WARN_ON(!btrfs_tree_locked(c));
-		btrfs_tree_lock(next);
+		if (!path->skip_locking) {
+			WARN_ON(!btrfs_tree_locked(c));
+			btrfs_tree_lock(next);
+		}
 		break;
 	}
 	path->slots[level] = slot;
@@ -3052,8 +3054,10 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
-		WARN_ON(!btrfs_tree_locked(path->nodes[level]));
-		btrfs_tree_lock(next);
+		if (!path->skip_locking) {
+			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+			btrfs_tree_lock(next);
+		}
 	}
 done:
 	unlock_up(path, 0, 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ea12d42741..e9bbb53eda6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -335,6 +335,7 @@ struct btrfs_path {
 	int reada;
 	/* keep some upper locks as we walk down */
 	int keep_locks;
+	int skip_locking;
 	int lowest_level;
 };
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0905653dd3f..544fc3f2fe6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -88,6 +88,12 @@ static int cache_block_group(struct btrfs_root *root,
 		return -ENOMEM;
 
 	path->reada = 2;
+	/*
+	 * we get into deadlocks with paths held by callers of this function.
+	 * since the alloc_mutex is protecting things right now, just
+	 * skip the locking here
+	 */
+	path->skip_locking = 1;
 	first_free = block_group->key.objectid;
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
-- 
cgit v1.2.3


From 333db94cdde9e6dfdedab9290d04d812f83e0922 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Fix snapshot deletion to release the alloc_mutex much more
 often.

This lowers the impact of snapshot deletion on the rest of the FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  2 +-
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 28 ++++++++++++++++++++--------
 3 files changed, 23 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fd68601b60b..5edbcc09b3c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1255,7 +1255,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(root == root->fs_info->extent_root &&
+	WARN_ON(cow && root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
 	WARN_ON(root == root->fs_info->chunk_root &&
 		!mutex_is_locked(&root->fs_info->chunk_mutex));
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ffc363d2fb2..3cc480b8381 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1674,6 +1674,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 void btrfs_throttle(struct btrfs_root *root)
 {
+#if 0
 	struct backing_dev_info *bdi;
 
 	bdi = &root->fs_info->bdi;
@@ -1686,6 +1687,7 @@ void btrfs_throttle(struct btrfs_root *root)
 #endif
 
 	}
+#endif
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 544fc3f2fe6..6274f30031d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1223,8 +1223,8 @@ printk("space info full %Lu\n", flags);
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
+	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return 0;
 }
 
@@ -2181,17 +2181,29 @@ static void noinline reada_walk_down(struct btrfs_root *root,
 				continue;
 			}
 		}
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		ret = readahead_tree_block(root, bytenr, blocksize,
 					   btrfs_node_ptr_generation(node, i));
 		last = bytenr + blocksize;
 		cond_resched();
-		mutex_lock(&root->fs_info->alloc_mutex);
 		if (ret)
 			break;
 	}
 }
 
+/*
+ * we want to avoid as much random IO as we can with the alloc mutex
+ * held, so drop the lock and do the lookup, then do it again with the
+ * lock held.
+ */
+int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
+			      u32 *refs)
+{
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	lookup_extent_ref(NULL, root, start, len, refs);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	return lookup_extent_ref(NULL, root, start, len, refs);
+}
+
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -2215,8 +2227,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-	ret = lookup_extent_ref(trans, root,
-				path->nodes[*level]->start,
+	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
 				path->nodes[*level]->len, &refs);
 	BUG_ON(ret);
 	if (refs > 1)
@@ -2245,7 +2256,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
-		ret = lookup_extent_ref(trans, root, bytenr, blocksize, &refs);
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
 		if (refs != 1) {
 			parent = path->nodes[*level];
@@ -2261,15 +2272,16 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
+			mutex_unlock(&root->fs_info->alloc_mutex);
+
 			reada_walk_down(root, cur, path->slots[*level]);
 
-			mutex_unlock(&root->fs_info->alloc_mutex);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
-			ret = lookup_extent_ref(trans, root, bytenr,
+			ret = drop_snap_lookup_refcount(root, bytenr,
 						blocksize, &refs);
 			BUG_ON(ret);
 			if (refs != 1) {
-- 
cgit v1.2.3


From 89ce8a63d0c761fbb02089850605360f389477d8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Add btrfs_end_transaction_throttle to force writers to wait for
 pending commits

The existing throttle mechanism was often not sufficient to prevent
new writers from coming in and making a given transaction run forever.
This adds an explicit wait at the end of most operations so they will
allow the current transaction to close.

There is no wait inside file_write, inode updates, or cow filling, all of
which have different deadlock possibilities.

This is a temporary measure until better asynchronous commit support is
added.  This code leads to stalls as it waits for data=ordered
writeback, and it really needs to be fixed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 18 --------------
 fs/btrfs/disk-io.h     |  1 -
 fs/btrfs/file.c        |  1 -
 fs/btrfs/inode.c       | 24 ++++++------------
 fs/btrfs/ioctl.c       |  2 --
 fs/btrfs/transaction.c | 66 ++++++++++++++++++++++++++++++++++----------------
 fs/btrfs/transaction.h |  2 ++
 7 files changed, 55 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3cc480b8381..52569b57692 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1672,24 +1672,6 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
 }
 
-void btrfs_throttle(struct btrfs_root *root)
-{
-#if 0
-	struct backing_dev_info *bdi;
-
-	bdi = &root->fs_info->bdi;
-	if (atomic_read(&root->fs_info->throttles) &&
-	    bdi_write_congested(bdi)) {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
-		congestion_wait(WRITE, HZ/20);
-#else
-		blk_congestion_wait(WRITE, HZ/20);
-#endif
-
-	}
-#endif
-}
-
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 {
 	/*
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2bc64fefe6e..deff6b4815a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -70,7 +70,6 @@ int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
-void btrfs_throttle(struct btrfs_root *root);
 int btrfs_open_device(struct btrfs_device *dev);
 int btrfs_verify_block_csum(struct btrfs_root *root,
 			    struct extent_buffer *buf);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b7f8f92daf8..ece221cba90 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -934,7 +934,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
 		if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
-		btrfs_throttle(root);
 		cond_resched();
 	}
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cf27b598462..bbba3350d02 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -855,10 +855,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 		btrfs_del_ordered_inode(inode, 1);
 	}
 
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return ret;
 }
 
@@ -889,10 +888,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	}
 
 	nr = trans->blocks_used;
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 
 	if (ret && !err)
 		err = ret;
@@ -1871,14 +1869,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return err;
 }
 
@@ -1936,14 +1933,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return err;
 }
 
@@ -1985,14 +1981,13 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return err;
 }
 
@@ -2055,13 +2050,12 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 out_fail:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 
 out_unlock:
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return err;
 }
 
@@ -2587,10 +2581,9 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_update_inode(trans, root, inode);
 	nr = trans->blocks_used;
 
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_end_transaction_throttle(trans, root);
 	BUG_ON(ret);
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 }
 
 /*
@@ -2912,14 +2905,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 out_fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return err;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6002eb64daf..026039a2ac5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -164,7 +164,6 @@ fail:
 		ret = err;
 fail_commit:
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return ret;
 }
 
@@ -206,7 +205,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 
 fail_unlock:
 	btrfs_btree_balance_dirty(root, nr);
-	btrfs_throttle(root);
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5a1ee0665ae..69ed5f85a38 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -130,8 +130,27 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	return h;
 }
 
-int btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root)
+static noinline int wait_for_commit(struct btrfs_root *root,
+				    struct btrfs_transaction *commit)
+{
+	DEFINE_WAIT(wait);
+	mutex_lock(&root->fs_info->trans_mutex);
+	while(!commit->commit_done) {
+		prepare_to_wait(&commit->commit_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		if (commit->commit_done)
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		schedule();
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	mutex_unlock(&root->fs_info->trans_mutex);
+	finish_wait(&commit->commit_wait, &wait);
+	return 0;
+}
+
+static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, int throttle)
 {
 	struct btrfs_transaction *cur_trans;
 
@@ -140,8 +159,18 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	WARN_ON(cur_trans != trans->transaction);
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
+
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
+
+	if (cur_trans->in_commit && throttle) {
+		int ret;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		ret = wait_for_commit(root, cur_trans);
+		BUG_ON(ret);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+
 	put_transaction(cur_trans);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	memset(trans, 0, sizeof(*trans));
@@ -149,6 +178,18 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_end_transaction(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 0);
+}
+
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root)
+{
+	return __btrfs_end_transaction(trans, root, 1);
+}
+
 
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
@@ -240,25 +281,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int wait_for_commit(struct btrfs_root *root,
-				    struct btrfs_transaction *commit)
-{
-	DEFINE_WAIT(wait);
-	mutex_lock(&root->fs_info->trans_mutex);
-	while(!commit->commit_done) {
-		prepare_to_wait(&commit->commit_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (commit->commit_done)
-			break;
-		mutex_unlock(&root->fs_info->trans_mutex);
-		schedule();
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	mutex_unlock(&root->fs_info->trans_mutex);
-	finish_wait(&commit->commit_wait, &wait);
-	return 0;
-}
-
 struct dirty_root {
 	struct list_head list;
 	struct btrfs_root *root;
@@ -680,6 +702,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	trans->transaction->in_commit = 1;
+printk("trans %Lu in commit\n", trans->transid);
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
@@ -760,6 +783,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	kfree(pinned_copy);
 
 	cur_trans->commit_done = 1;
+printk("trans %Lu done in commit\n", cur_trans->transid);
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c3172ddb332..52559b51b18 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -101,4 +101,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
+int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From a74a4b97b61beede185b4b3ad359d7d378b0d312 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Replace the transaction work queue with kthreads

This creates one kthread for commits and one kthread for
deleting old snapshots.  All the work queues are removed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   5 ++-
 fs/btrfs/ctree.h       |  13 ++----
 fs/btrfs/disk-io.c     | 116 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/extent-tree.c |  10 ++---
 fs/btrfs/super.c       |  16 +++----
 fs/btrfs/transaction.c |  72 +-----------------------------
 fs/btrfs/transaction.h |  10 -----
 fs/btrfs/volumes.c     |  12 +++--
 8 files changed, 136 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5edbcc09b3c..40f0e0cb804 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1352,6 +1352,8 @@ again:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					if (tmp)
+						free_extent_buffer(tmp);
 					b = read_node_slot(root, b, slot);
 				}
 			}
@@ -3048,7 +3050,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
-		path->locks[level] = 1;
+		if (!path->skip_locking)
+			path->locks[level] = 1;
 		if (!level)
 			break;
 		if (level == 1 && path->locks[1] && path->reada)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e9bbb53eda6..244fe86bcc5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,7 +23,6 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
-#include <linux/workqueue.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <asm/kmap_types.h>
@@ -519,15 +518,14 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
+	struct mutex transaction_kthread_mutex;
+	struct mutex cleaner_mutex;
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
-	struct list_head end_io_work_list;
-	struct work_struct end_io_work;
-	spinlock_t end_io_work_lock;
 	atomic_t nr_async_submits;
 
 	/*
@@ -543,13 +541,10 @@ struct btrfs_fs_info {
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers submit_workers;
+	struct task_struct *transaction_kthread;
+	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct work_struct trans_work;
-#else
-	struct delayed_work trans_work;
-#endif
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 52569b57692..31ca9f89388 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
@@ -24,6 +25,12 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1100,6 +1107,87 @@ static void end_workqueue_fn(struct btrfs_work *work)
 #endif
 }
 
+static int cleaner_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->cleaner_mutex);
+printk("cleaner awake\n");
+		btrfs_clean_old_snapshots(root);
+printk("cleaner done\n");
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			smp_mb();
+			if (root->fs_info->closing)
+				break;
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+static int transaction_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_transaction *cur;
+	unsigned long now;
+	unsigned long delay;
+	int ret;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		delay = HZ * 30;
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		cur = root->fs_info->running_transaction;
+		if (!cur) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			goto sleep;
+		}
+		now = get_seconds();
+		if (now < cur->start_time || now - cur->start_time < 30) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			delay = HZ * 5;
+			goto sleep;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		btrfs_defrag_dirty_roots(root->fs_info);
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_commit_transaction(trans, root);
+sleep:
+		wake_up_process(root->fs_info->cleaner_kthread);
+		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			if (root->fs_info->closing)
+				break;
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(delay);
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1189,11 +1277,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
-#else
-	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
-#endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1204,6 +1287,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
+	mutex_init(&fs_info->transaction_kthread_mutex);
+	mutex_init(&fs_info->cleaner_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1247,7 +1332,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 
-
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
@@ -1341,9 +1425,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+					       "btrfs-cleaner");
+	if (!fs_info->cleaner_kthread)
+		goto fail_extent_root;
+
+	fs_info->transaction_kthread = kthread_run(transaction_kthread,
+						   tree_root,
+						   "btrfs-transaction");
+	if (!fs_info->transaction_kthread)
+		goto fail_trans_kthread;
+
 
 	return tree_root;
 
+fail_trans_kthread:
+	kthread_stop(fs_info->cleaner_kthread);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1562,8 +1659,11 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	btrfs_transaction_flush_work(root);
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	btrfs_defrag_dirty_roots(root->fs_info);
+	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -1574,8 +1674,6 @@ int close_ctree(struct btrfs_root *root)
 
 	write_ctree_super(NULL, root);
 
-	btrfs_transaction_flush_work(root);
-
 	if (fs_info->delalloc_bytes) {
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6274f30031d..89cc4f61186 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1216,15 +1216,16 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	if (ret == -ENOSPC) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		goto out;
+		goto out_unlock;
 	}
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-out:
+out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
+out:
 	return 0;
 }
 
@@ -2274,7 +2275,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			free_extent_buffer(next);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
-			reada_walk_down(root, cur, path->slots[*level]);
+			if (path->slots[*level] == 0)
+				reada_walk_down(root, cur, path->slots[*level]);
 
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
@@ -2446,8 +2448,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		ret = -EAGAIN;
-		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b61ded7a20c..726d6871fa1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -340,7 +340,6 @@ static int btrfs_fill_super(struct super_block * sb,
 		goto fail_close;
 
 	sb->s_root = root_dentry;
-	btrfs_transaction_queue_work(tree_root, HZ * 30);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 	save_mount_options(sb, data);
@@ -416,9 +415,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto error_free_subvol_name;
 
 	bdev = fs_devices->latest_bdev;
-	btrfs_lock_volumes();
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
-	btrfs_unlock_volumes();
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -530,13 +527,15 @@ out:
 static void btrfs_write_super_lockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	btrfs_transaction_flush_work(root);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+	mutex_lock(&root->fs_info->cleaner_mutex);
 }
 
 static void btrfs_unlockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	btrfs_transaction_queue_work(root, HZ * 30);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 }
 
 static struct super_operations btrfs_super_ops = {
@@ -589,10 +588,9 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		return err;
 
-	btrfs_init_transaction_sys();
 	err = btrfs_init_cachep();
 	if (err)
-		goto free_transaction_sys;
+		goto free_sysfs;
 
 	err = extent_io_init();
 	if (err)
@@ -618,15 +616,13 @@ free_extent_io:
 	extent_io_exit();
 free_cachep:
 	btrfs_destroy_cachep();
-free_transaction_sys:
-	btrfs_exit_transaction_sys();
+free_sysfs:
 	btrfs_exit_sysfs();
 	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
 {
-	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
 	extent_map_exit();
 	extent_io_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 69ed5f85a38..0c53ff775b9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -29,8 +29,6 @@ static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
-static struct workqueue_struct *trans_wq;
-
 #define BTRFS_ROOT_TRANS_TAG 0
 #define BTRFS_ROOT_DEFRAG_TAG 1
 
@@ -807,81 +805,15 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
 	struct list_head dirty_roots;
 	INIT_LIST_HEAD(&dirty_roots);
-
+again:
 	mutex_lock(&root->fs_info->trans_mutex);
 	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (!list_empty(&dirty_roots)) {
 		drop_dirty_roots(root, &dirty_roots);
+		goto again;
 	}
 	return 0;
 }
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-void btrfs_transaction_cleaner(void *p)
-#else
-void btrfs_transaction_cleaner(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     trans_work.work);
-
-#endif
-	struct btrfs_root *root = fs_info->tree_root;
-	struct btrfs_transaction *cur;
-	struct btrfs_trans_handle *trans;
-	unsigned long now;
-	unsigned long delay = HZ * 30;
-	int ret;
-
-	smp_mb();
-	if (root->fs_info->closing)
-		goto out;
-
-	mutex_lock(&root->fs_info->trans_mutex);
-	cur = root->fs_info->running_transaction;
-	if (!cur) {
-		mutex_unlock(&root->fs_info->trans_mutex);
-		goto out;
-	}
-	now = get_seconds();
-	if (now < cur->start_time || now - cur->start_time < 30) {
-		mutex_unlock(&root->fs_info->trans_mutex);
-		delay = HZ * 5;
-		goto out;
-	}
-	mutex_unlock(&root->fs_info->trans_mutex);
-	btrfs_defrag_dirty_roots(root->fs_info);
-	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_commit_transaction(trans, root);
-out:
-	btrfs_clean_old_snapshots(root);
-	btrfs_transaction_queue_work(root, delay);
-}
-
-void btrfs_transaction_queue_work(struct btrfs_root *root, int delay)
-{
-	if (!root->fs_info->closing)
-		queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
-}
-
-void btrfs_transaction_flush_work(struct btrfs_root *root)
-{
-	cancel_delayed_work(&root->fs_info->trans_work);
-	flush_workqueue(trans_wq);
-}
-
-void __init btrfs_init_transaction_sys(void)
-{
-	trans_wq = create_workqueue("btrfs-transaction");
-}
-
-void btrfs_exit_transaction_sys(void)
-{
-	destroy_workqueue(trans_wq);
-}
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 52559b51b18..e1e5a06b65f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -82,16 +82,6 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-void btrfs_transaction_cleaner(void *p);
-#else
-void btrfs_transaction_cleaner(struct work_struct *work);
-#endif
-
-void btrfs_transaction_flush_work(struct btrfs_root *root);
-void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
-void btrfs_init_transaction_sys(void);
-void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
 			struct list_head *dead_list);
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 869864ddcc2..4e7cee27aab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -271,13 +271,17 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-			if (device->bdev) {
-				close_bdev_excl(device->bdev);
-				fs_devices->open_devices--;
-			}
+			struct block_device *bdev;
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
+			if (device->bdev) {
+				bdev = device->bdev;
+				fs_devices->open_devices--;
+				mutex_unlock(&uuid_mutex);
+				close_bdev_excl(bdev);
+				mutex_lock(&uuid_mutex);
+			}
 			kfree(device->name);
 			kfree(device);
 			goto again;
-- 
cgit v1.2.3


From e7a84565bcdb239caad29ccbe559ef978090ac7e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Add btree locking to the tree defragmentation code

The online btree defragger is simplified and rewritten to use
standard btree searches instead of a walk up / down mechanism.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  46 ++++++++--
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c |   7 ++
 fs/btrfs/tree-defrag.c | 239 ++++++++++---------------------------------------
 4 files changed, 93 insertions(+), 201 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 40f0e0cb804..7f4cc2b88d0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -354,7 +354,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
-	struct extent_buffer *tmp;
 	u64 blocknr;
 	u64 gen;
 	u64 search_start = *last_ret;
@@ -370,9 +369,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	int progress_passed = 0;
 	struct btrfs_disk_key disk_key;
 
-	/* FIXME this code needs locking */
-	return 0;
-
 	parent_level = btrfs_header_level(parent);
 	if (cache_only && parent_level != 1)
 		return 0;
@@ -454,20 +450,23 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (search_start == 0)
 			search_start = last_block;
 
+		btrfs_tree_lock(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
-					&tmp, search_start,
+					&cur, search_start,
 					min(16 * blocksize,
 					    (end_slot - i) * blocksize));
 		if (err) {
+			btrfs_tree_unlock(cur);
 			free_extent_buffer(cur);
 			break;
 		}
-		search_start = tmp->start;
-		last_block = tmp->start;
+		search_start = cur->start;
+		last_block = cur->start;
 		*last_ret = search_start;
 		if (parent_level == 1)
-			btrfs_clear_buffer_defrag(tmp);
-		free_extent_buffer(tmp);
+			btrfs_clear_buffer_defrag(cur);
+		btrfs_tree_unlock(cur);
+		free_extent_buffer(cur);
 	}
 	if (parent->map_token) {
 		unmap_extent_buffer(parent, parent->map_token,
@@ -2970,6 +2969,35 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return 1;
 }
 
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *key, int lowest_level)
+{
+	int level = lowest_level;
+	int slot;
+	struct extent_buffer *c;
+
+	while(level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			return 1;
+
+		slot = path->slots[level] + 1;
+		c = path->nodes[level];
+		if (slot >= btrfs_header_nritems(c)) {
+			level++;
+			if (level == BTRFS_MAX_LEVEL) {
+				return 1;
+			}
+			continue;
+		}
+		if (level == 0)
+			btrfs_item_key_to_cpu(c, key, slot);
+		else
+			btrfs_node_key_to_cpu(c, key, slot);
+		return 0;
+	}
+	return 1;
+}
+
 /*
  * search the tree again to find a leaf with greater keys
  * returns 0 if it found something or 1 if there are no greater leaves.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 244fe86bcc5..ca8e6f15859 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1411,6 +1411,8 @@ int btrfs_previous_item(struct btrfs_root *root,
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
+			struct btrfs_key *key, int lowest_level);
 
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 89cc4f61186..a9b3a25a45b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2201,6 +2201,7 @@ int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 {
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	lookup_extent_ref(NULL, root, start, len, refs);
+	cond_resched();
 	mutex_lock(&root->fs_info->alloc_mutex);
 	return lookup_extent_ref(NULL, root, start, len, refs);
 }
@@ -2280,6 +2281,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
+			cond_resched();
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
@@ -2329,6 +2331,7 @@ out:
 	*level += 1;
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->alloc_mutex);
+	cond_resched();
 	return 0;
 }
 
@@ -2448,6 +2451,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
+		if (trans->transaction->in_commit) {
+			ret = -EAGAIN;
+			break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index fab851d8538..1677e4edaf6 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -21,167 +21,26 @@
 #include "disk-io.h"
 #include "print-tree.h"
 #include "transaction.h"
-
-static void reada_defrag(struct btrfs_root *root,
-			 struct extent_buffer *node)
-{
-	int i;
-	u32 nritems;
-	u64 bytenr;
-	u64 gen;
-	u32 blocksize;
-	int ret;
-
-	blocksize = btrfs_level_size(root, btrfs_header_level(node) - 1);
-	nritems = btrfs_header_nritems(node);
-	for (i = 0; i < nritems; i++) {
-		bytenr = btrfs_node_blockptr(node, i);
-		gen = btrfs_node_ptr_generation(node, i);
-		ret = readahead_tree_block(root, bytenr, blocksize, gen);
-		if (ret)
-			break;
-	}
-}
-
-static int defrag_walk_down(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path, int *level,
-			    int cache_only, u64 *last_ret)
-{
-	struct extent_buffer *next;
-	struct extent_buffer *cur;
-	u64 bytenr;
-	u64 ptr_gen;
-	int ret = 0;
-	int is_extent = 0;
-
-	WARN_ON(*level < 0);
-	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
-	if (root->fs_info->extent_root == root)
-		is_extent = 1;
-
-	if (*level == 1 && cache_only && path->nodes[1] &&
-	    !btrfs_buffer_defrag(path->nodes[1])) {
-		goto out;
-	}
-	while(*level > 0) {
-		WARN_ON(*level < 0);
-		WARN_ON(*level >= BTRFS_MAX_LEVEL);
-		cur = path->nodes[*level];
-
-		if (!cache_only && *level > 1 && path->slots[*level] == 0)
-			reada_defrag(root, cur);
-
-		if (btrfs_header_level(cur) != *level)
-			WARN_ON(1);
-
-		if (path->slots[*level] >=
-		    btrfs_header_nritems(cur))
-			break;
-
-		if (*level == 1) {
-			WARN_ON(btrfs_header_generation(path->nodes[*level]) !=
-							trans->transid);
-			ret = btrfs_realloc_node(trans, root,
-						 path->nodes[*level],
-						 path->slots[*level],
-						 cache_only, last_ret,
-						 &root->defrag_progress);
-			if (is_extent)
-				btrfs_extent_post_op(trans, root);
-
-			break;
-		}
-		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
-		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
-
-		if (cache_only) {
-			next = btrfs_find_tree_block(root, bytenr,
-					   btrfs_level_size(root, *level - 1));
-			if (!next || !btrfs_buffer_uptodate(next, ptr_gen) ||
-			    !btrfs_buffer_defrag(next)) {
-				free_extent_buffer(next);
-				path->slots[*level]++;
-				continue;
-			}
-		} else {
-			next = read_tree_block(root, bytenr,
-				       btrfs_level_size(root, *level - 1),
-				       ptr_gen);
-		}
-		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
-				      path->slots[*level], &next);
-		BUG_ON(ret);
-		if (is_extent)
-			btrfs_extent_post_op(trans, root);
-
-		WARN_ON(*level <= 0);
-		if (path->nodes[*level-1])
-			free_extent_buffer(path->nodes[*level-1]);
-		path->nodes[*level-1] = next;
-		*level = btrfs_header_level(next);
-		path->slots[*level] = 0;
-	}
-	WARN_ON(*level < 0);
-	WARN_ON(*level >= BTRFS_MAX_LEVEL);
-
-	btrfs_clear_buffer_defrag(path->nodes[*level]);
-out:
-	free_extent_buffer(path->nodes[*level]);
-	path->nodes[*level] = NULL;
-	*level += 1;
-	WARN_ON(ret && ret != -EAGAIN);
-	return ret;
-}
-
-static int defrag_walk_up(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path, int *level,
-			  int cache_only)
-{
-	int i;
-	int slot;
-	struct extent_buffer *node;
-
-	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
-		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
-			path->slots[i]++;
-			*level = i;
-			node = path->nodes[i];
-			WARN_ON(i == 0);
-			btrfs_node_key_to_cpu(node, &root->defrag_progress,
-					      path->slots[i]);
-			root->defrag_level = i;
-			return 0;
-		} else {
-			btrfs_clear_buffer_defrag(path->nodes[*level]);
-			free_extent_buffer(path->nodes[*level]);
-			path->nodes[*level] = NULL;
-			*level = i + 1;
-		}
-	}
-	return 1;
-}
+#include "locking.h"
 
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
 	struct btrfs_path *path = NULL;
-	struct extent_buffer *tmp;
+	struct btrfs_key key;
 	int ret = 0;
 	int wret;
 	int level;
 	int orig_level;
 	int i;
 	int is_extent = 0;
+	int next_key_ret = 0;
 	u64 last_ret = 0;
 
-	if (root->fs_info->extent_root == root)
+	if (root->fs_info->extent_root == root) {
+		mutex_lock(&root->fs_info->alloc_mutex);
 		is_extent = 1;
-
-	goto out;
+	}
 
 	if (root->ref_cows == 0 && !is_extent)
 		goto out;
@@ -200,67 +59,63 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	if (root->defrag_progress.objectid == 0) {
+		struct extent_buffer *root_node;
 		u32 nritems;
 
-		nritems = btrfs_header_nritems(root->node);
+		root_node = btrfs_lock_root_node(root);
+		nritems = btrfs_header_nritems(root_node);
 		root->defrag_max.objectid = 0;
 		/* from above we know this is not a leaf */
-		btrfs_node_key_to_cpu(root->node, &root->defrag_max,
+		btrfs_node_key_to_cpu(root_node, &root->defrag_max,
 				      nritems - 1);
-		extent_buffer_get(root->node);
-		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
-		BUG_ON(ret);
-		path->nodes[level] = root->node;
-		path->slots[level] = 0;
-		if (is_extent)
-			btrfs_extent_post_op(trans, root);
+		btrfs_tree_unlock(root_node);
+		free_extent_buffer(root_node);
+		memset(&key, 0, sizeof(key));
 	} else {
-		level = root->defrag_level;
-		path->lowest_level = level;
-		wret = btrfs_search_slot(trans, root, &root->defrag_progress,
-					 path, 0, 1);
-
-		if (is_extent)
-			btrfs_extent_post_op(trans, root);
-
-		if (wret < 0) {
-			ret = wret;
-			goto out;
-		}
-
-		while(level > 0 && !path->nodes[level])
-			level--;
-
-		if (!path->nodes[level]) {
-			ret = 0;
-			goto out;
-		}
+		memcpy(&key, &root->defrag_progress, sizeof(key));
 	}
 
-	while(1) {
-		wret = defrag_walk_down(trans, root, path, &level, cache_only,
-					&last_ret);
-		if (wret > 0)
-			break;
-		if (wret < 0)
-			ret = wret;
+	path->lowest_level = 1;
+	path->keep_locks = 1;
+	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
-		wret = defrag_walk_up(trans, root, path, &level, cache_only);
-		if (wret > 0)
-			break;
-		if (wret < 0)
-			ret = wret;
-		else
-			ret = -EAGAIN;
-		break;
+	if (wret < 0) {
+		ret = wret;
+		goto out;
+	}
+	if (!path->nodes[1]) {
+		ret = 0;
+		goto out;
+	}
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1);
+	ret = btrfs_realloc_node(trans, root,
+				 path->nodes[1], 0,
+				 cache_only, &last_ret,
+				 &root->defrag_progress);
+	WARN_ON(ret && ret != -EAGAIN);
+	if (next_key_ret == 0) {
+		memcpy(&root->defrag_progress, &key, sizeof(key));
+		ret = -EAGAIN;
 	}
-	for (i = 0; i <= orig_level; i++) {
+
+	for (i = 1; i < BTRFS_MAX_LEVEL; i++) {
+		if (path->locks[i]) {
+			btrfs_tree_unlock(path->nodes[i]);
+			path->locks[i] = 0;
+		}
 		if (path->nodes[i]) {
 			free_extent_buffer(path->nodes[i]);
 			path->nodes[i] = NULL;
 		}
 	}
+	if (is_extent)
+		btrfs_extent_post_op(trans, root);
+
 out:
+	if (is_extent)
+		mutex_unlock(&root->fs_info->alloc_mutex);
+
 	if (path)
 		btrfs_free_path(path);
 	if (ret == -EAGAIN) {
-- 
cgit v1.2.3


From 079899c2384023cd8efcd3806680b4f1d2abbd54 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Change find_extent_buffer to use TestSetPageLocked

This makes it possible for callers to check for extent_buffers in cache
without deadlocking against any btree locks held.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 --
 fs/btrfs/extent_io.c   | 7 ++++++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a9b3a25a45b..dc3c03c6612 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1438,7 +1438,6 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 	int err = 0;
 
 	if (!pending) {
-#if 0
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
@@ -1460,7 +1459,6 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
-#endif
 		update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bd15cdcaba9..40a5f53cb04 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2808,10 +2808,15 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 		goto lru_add;
 
 	for (i = 0; i < num_pages; i++, index++) {
-		p = find_lock_page(mapping, index);
+		p = find_get_page(mapping, index);
 		if (!p) {
 			goto fail;
 		}
+		if (TestSetPageLocked(p)) {
+			page_cache_release(p);
+			goto fail;
+		}
+
 		set_page_extent_mapped(p);
 		mark_page_accessed(p);
 
-- 
cgit v1.2.3


From 1b1e2135dc1e4efbcf25ac9ac9979316d4e1193e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Add a per-inode csum mutex to avoid races creating csum items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  | 1 +
 fs/btrfs/file.c         | 6 +++---
 fs/btrfs/inode.c        | 6 ++++++
 fs/btrfs/ordered-data.c | 5 +++--
 fs/btrfs/transaction.c  | 2 +-
 fs/btrfs/tree-defrag.c  | 7 +++++++
 6 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5ba83894c8b..40b4e0c9cd0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -30,6 +30,7 @@ struct btrfs_inode {
 	struct extent_map_tree extent_tree;
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
+	struct mutex csum_mutex;
 	struct inode vfs_inode;
 	atomic_t ordered_writeback;
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ece221cba90..8037792f878 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -267,13 +267,13 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	/* FIXME...EIEIO, ENOSPC and more */
 	/* insert any holes we need to create */
-	if (isize < end_pos) {
+	if (isize < start_pos) {
 		u64 last_pos_in_file;
 		u64 hole_size;
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (end_pos - last_pos_in_file + mask) & ~mask;
-		if (last_pos_in_file < end_pos) {
+		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
+		if (last_pos_in_file < start_pos) {
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bbba3350d02..d39433dfb2c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -351,7 +351,9 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
+	mutex_lock(&BTRFS_I(inode)->csum_mutex);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
+	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
@@ -1400,6 +1402,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
@@ -1701,6 +1704,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
@@ -1924,6 +1928,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
@@ -2862,6 +2867,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5e4c0d95ce4..254da822566 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -152,12 +152,13 @@ int btrfs_add_ordered_inode(struct inode *inode)
 			   inode->i_ino, &entry->rb_node);
 
 	BTRFS_I(inode)->ordered_trans = transid;
+	if (!node)
+		igrab(inode);
 
 	write_unlock(&tree->lock);
+
 	if (node)
 		kfree(entry);
-	else
-		igrab(inode);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0c53ff775b9..8e909cb97c6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -477,7 +477,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			if (err)
 				ret = err;
 			nr = trans->blocks_used;
-			ret = btrfs_end_transaction(trans, tree_root);
+			ret = btrfs_end_transaction_throttle(trans, tree_root);
 			BUG_ON(ret);
 
 			mutex_unlock(&root->fs_info->drop_mutex);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 1677e4edaf6..b17693f61fb 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -38,8 +38,15 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	u64 last_ret = 0;
 
 	if (root->fs_info->extent_root == root) {
+		/*
+		 * there's recursion here right now in the tree locking,
+		 * we can't defrag the extent root without deadlock
+		 */
+		goto out;
+#if 0
 		mutex_lock(&root->fs_info->alloc_mutex);
 		is_extent = 1;
+#endif
 	}
 
 	if (root->ref_cows == 0 && !is_extent)
-- 
cgit v1.2.3


From 3f157a2fd2ad731e1ed9964fecdc5f459f04a4a4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Online btree defragmentation fixes

The btree defragger wasn't making forward progress because the new key wasn't
being saved by the btrfs_search_forward function.

This also disables the automatic btree defrag because it wasn't scaling well
to huge filesystems.  The auto-defrag needs to be done differently.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 170 ++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/ctree.h       |   7 +-
 fs/btrfs/disk-io.c     |  61 +-----------------
 fs/btrfs/disk-io.h     |   6 --
 fs/btrfs/extent-tree.c |   2 -
 fs/btrfs/super.c       |   1 -
 fs/btrfs/transaction.c |  35 +---------
 fs/btrfs/transaction.h |   1 -
 fs/btrfs/tree-defrag.c |  36 ++++++-----
 9 files changed, 190 insertions(+), 129 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7f4cc2b88d0..0cb80f32a9c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -63,10 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
-	int keep = p->keep_locks;
-	int skip = p->skip_locking;
 
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		p->slots[i] = 0;
 		if (!p->nodes[i])
 			continue;
 		if (p->locks[i]) {
@@ -74,10 +73,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 			p->locks[i] = 0;
 		}
 		free_extent_buffer(p->nodes[i]);
+		p->nodes[i] = NULL;
 	}
-	memset(p, 0, sizeof(*p));
-	p->keep_locks = keep;
-	p->skip_locking = skip;
 }
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -463,8 +460,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		search_start = cur->start;
 		last_block = cur->start;
 		*last_ret = search_start;
-		if (parent_level == 1)
-			btrfs_clear_buffer_defrag(cur);
 		btrfs_tree_unlock(cur);
 		free_extent_buffer(cur);
 	}
@@ -2969,8 +2964,138 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	return 1;
 }
 
+/*
+ * A helper function to walk down the tree starting at min_key, and looking
+ * for nodes or leaves that are either in cache or have a minimum
+ * transaction id.  This is used by the btree defrag code, but could
+ * also be used to search for blocks that have changed since a given
+ * transaction id.
+ *
+ * This does not cow, but it does stuff the starting key it finds back
+ * into min_key, so you can call btrfs_search_slot with cow=1 on the
+ * key and get a writable path.
+ *
+ * This does lock as it descends, and path->keep_locks should be set
+ * to 1 by the caller.
+ *
+ * This honors path->lowest_level to prevent descent past a given level
+ * of the tree.
+ *
+ * returns zero if something useful was found, < 0 on error and 1 if there
+ * was nothing in the tree that matched the search criteria.
+ */
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_path *path, int cache_only,
+			 u64 min_trans)
+{
+	struct extent_buffer *cur;
+	struct btrfs_key found_key;
+	int slot;
+	u32 nritems;
+	int level;
+	int ret = 1;
+
+again:
+	cur = btrfs_lock_root_node(root);
+	level = btrfs_header_level(cur);
+	path->nodes[level] = cur;
+	path->locks[level] = 1;
+
+	if (btrfs_header_generation(cur) < min_trans) {
+		ret = 1;
+		goto out;
+	}
+	while(1) {
+		nritems = btrfs_header_nritems(cur);
+		level = btrfs_header_level(cur);
+		bin_search(cur, min_key, level, &slot);
+
+		/* at level = 0, we're done, setup the path and exit */
+		if (level == 0) {
+			ret = 0;
+			path->slots[level] = slot;
+			btrfs_item_key_to_cpu(cur, &found_key, slot);
+			goto out;
+		}
+		/*
+		 * check this node pointer against the cache_only and
+		 * min_trans parameters.  If it isn't in cache or is too
+		 * old, skip to the next one.
+		 */
+		while(slot < nritems) {
+			u64 blockptr;
+			u64 gen;
+			struct extent_buffer *tmp;
+			blockptr = btrfs_node_blockptr(cur, slot);
+			gen = btrfs_node_ptr_generation(cur, slot);
+			if (gen < min_trans) {
+				slot++;
+				continue;
+			}
+			if (!cache_only)
+				break;
+
+			tmp = btrfs_find_tree_block(root, blockptr,
+					    btrfs_level_size(root, level - 1));
+
+			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+				free_extent_buffer(tmp);
+				break;
+			}
+			if (tmp)
+				free_extent_buffer(tmp);
+			slot++;
+		}
+		/*
+		 * we didn't find a candidate key in this node, walk forward
+		 * and find another one
+		 */
+		if (slot >= nritems) {
+			ret = btrfs_find_next_key(root, path, min_key, level,
+						  cache_only, min_trans);
+			if (ret == 0) {
+				btrfs_release_path(root, path);
+				goto again;
+			} else {
+				goto out;
+			}
+		}
+		/* save our key for returning back */
+		btrfs_node_key_to_cpu(cur, &found_key, slot);
+		path->slots[level] = slot;
+		if (level == path->lowest_level) {
+			ret = 0;
+			unlock_up(path, level, 1);
+			goto out;
+		}
+		cur = read_node_slot(root, cur, slot);
+
+		btrfs_tree_lock(cur);
+		path->locks[level - 1] = 1;
+		path->nodes[level - 1] = cur;
+		unlock_up(path, level, 1);
+	}
+out:
+	if (ret == 0)
+		memcpy(min_key, &found_key, sizeof(found_key));
+	return ret;
+}
+
+/*
+ * this is similar to btrfs_next_leaf, but does not try to preserve
+ * and fixup the path.  It looks for and returns the next key in the
+ * tree based on the current path and the cache_only and min_trans
+ * parameters.
+ *
+ * 0 is returned if another key is found, < 0 if there are any errors
+ * and 1 is returned if there are no higher keys in the tree
+ *
+ * path->keep_locks should be set to 1 on the search made before
+ * calling this function.
+ */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int lowest_level)
+			struct btrfs_key *key, int lowest_level,
+			int cache_only, u64 min_trans)
 {
 	int level = lowest_level;
 	int slot;
@@ -2982,6 +3107,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 
 		slot = path->slots[level] + 1;
 		c = path->nodes[level];
+next:
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
 			if (level == BTRFS_MAX_LEVEL) {
@@ -2991,8 +3117,28 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 		}
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
-		else
+		else {
+			u64 blockptr = btrfs_node_blockptr(c, slot);
+			u64 gen = btrfs_node_ptr_generation(c, slot);
+
+			if (cache_only) {
+				struct extent_buffer *cur;
+				cur = btrfs_find_tree_block(root, blockptr,
+					    btrfs_level_size(root, level - 1));
+				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
+					slot++;
+					if (cur)
+						free_extent_buffer(cur);
+					goto next;
+				}
+				free_extent_buffer(cur);
+			}
+			if (gen < min_trans) {
+				slot++;
+				goto next;
+			}
 			btrfs_node_key_to_cpu(c, key, slot);
+		}
 		return 0;
 	}
 	return 1;
@@ -3095,6 +3241,12 @@ done:
 	return 0;
 }
 
+/*
+ * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
+ * searching until it gets past min_objectid or finds an item of 'type'
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ca8e6f15859..a28796482b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -609,6 +609,7 @@ struct btrfs_root {
 	u64 last_inode_alloc;
 	int ref_cows;
 	int track_dirty;
+	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
 	struct btrfs_key defrag_max;
 	int defrag_running;
@@ -1412,7 +1413,11 @@ int btrfs_previous_item(struct btrfs_root *root,
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int lowest_level);
+			struct btrfs_key *key, int lowest_level,
+			int cache_only, u64 min_trans);
+int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_path *path, int cache_only,
+			 u64 min_trans);
 
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 31ca9f89388..4cdc0b6a267 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -295,7 +295,6 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
-	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
 		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
@@ -355,7 +354,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
-	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
 		ret = -EIO;
@@ -736,6 +734,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
+	root->defrag_trans_start = fs_info->generation;
 	init_completion(&root->kobj_unregister);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
@@ -1168,7 +1167,6 @@ static int transaction_kthread(void *arg)
 			goto sleep;
 		}
 		mutex_unlock(&root->fs_info->trans_mutex);
-		btrfs_defrag_dirty_roots(root->fs_info);
 		trans = btrfs_start_transaction(root, 1);
 		ret = btrfs_commit_transaction(trans, root);
 sleep:
@@ -1434,12 +1432,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						   tree_root,
 						   "btrfs-transaction");
 	if (!fs_info->transaction_kthread)
-		goto fail_trans_kthread;
+		goto fail_cleaner;
 
 
 	return tree_root;
 
-fail_trans_kthread:
+fail_cleaner:
 	kthread_stop(fs_info->cleaner_kthread);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
@@ -1662,7 +1660,6 @@ int close_ctree(struct btrfs_root *root)
 	kthread_stop(root->fs_info->transaction_kthread);
 	kthread_stop(root->fs_info->cleaner_kthread);
 
-	btrfs_defrag_dirty_roots(root->fs_info);
 	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
@@ -1794,58 +1791,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	return;
 }
 
-void btrfs_set_buffer_defrag(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
-			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
-}
-
-void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
-			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
-			GFP_NOFS);
-}
-
-int btrfs_buffer_defrag(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
-		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
-}
-
-int btrfs_buffer_defrag_done(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
-		     buf->start, buf->start + buf->len - 1,
-		     EXTENT_DEFRAG_DONE, 0);
-}
-
-int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
-		     buf->start, buf->start + buf->len - 1,
-		     EXTENT_DEFRAG_DONE, GFP_NOFS);
-}
-
-int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
-{
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
-		     buf->start, buf->start + buf->len - 1,
-		     EXTENT_DEFRAG, GFP_NOFS);
-}
-
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index deff6b4815a..353c3c50c95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -61,12 +61,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf);
-void btrfs_set_buffer_defrag(struct extent_buffer *buf);
-void btrfs_set_buffer_defrag_done(struct extent_buffer *buf);
-int btrfs_buffer_defrag(struct extent_buffer *buf);
-int btrfs_buffer_defrag_done(struct extent_buffer *buf);
-int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
-int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc3c03c6612..5e0857ffbc3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2095,8 +2095,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
-	if (!btrfs_test_opt(root, SSD))
-		btrfs_set_buffer_defrag(buf);
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 726d6871fa1..5e28cf5c2e8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -365,7 +365,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 	btrfs_clean_old_snapshots(root);
-	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8e909cb97c6..98f422d9ab0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -30,7 +30,6 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
-#define BTRFS_ROOT_DEFRAG_TAG 1
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
@@ -92,9 +91,6 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_TRANS_TAG);
-			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-				   (unsigned long)root->root_key.objectid,
-				   BTRFS_ROOT_DEFRAG_TAG);
 			root->commit_root = btrfs_root_node(root);
 		} else {
 			WARN_ON(1);
@@ -403,44 +399,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		cond_resched();
 
 		trans = btrfs_start_transaction(root, 1);
-		if (ret != -EAGAIN)
+		if (root->fs_info->closing || ret != -EAGAIN)
 			break;
 	}
 	root->defrag_running = 0;
 	smp_mb();
-	radix_tree_tag_clear(&info->fs_roots_radix,
-		     (unsigned long)root->root_key.objectid,
-		     BTRFS_ROOT_DEFRAG_TAG);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
 
-int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info)
-{
-	struct btrfs_root *gang[1];
-	struct btrfs_root *root;
-	int i;
-	int ret;
-	int err = 0;
-	u64 last = 0;
-
-	while(1) {
-		ret = radix_tree_gang_lookup_tag(&info->fs_roots_radix,
-						 (void **)gang, last,
-						 ARRAY_SIZE(gang),
-						 BTRFS_ROOT_DEFRAG_TAG);
-		if (ret == 0)
-			break;
-		for (i = 0; i < ret; i++) {
-			root = gang[i];
-			last = root->root_key.objectid + 1;
-			btrfs_defrag_root(root, 1);
-		}
-	}
-	btrfs_defrag_root(info->extent_root, 1);
-	return err;
-}
-
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 				     struct list_head *list)
 {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e1e5a06b65f..9ccd5a5b170 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -84,7 +84,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
 			struct list_head *dead_list);
-int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b17693f61fb..cc2650b0695 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -32,10 +32,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	int wret;
 	int level;
 	int orig_level;
-	int i;
 	int is_extent = 0;
 	int next_key_ret = 0;
 	u64 last_ret = 0;
+	u64 min_trans = 0;
+
+	if (cache_only)
+		goto out;
 
 	if (root->fs_info->extent_root == root) {
 		/*
@@ -43,10 +46,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		 * we can't defrag the extent root without deadlock
 		 */
 		goto out;
-#if 0
-		mutex_lock(&root->fs_info->alloc_mutex);
-		is_extent = 1;
-#endif
 	}
 
 	if (root->ref_cows == 0 && !is_extent)
@@ -84,6 +83,17 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 
 	path->lowest_level = 1;
 	path->keep_locks = 1;
+	if (cache_only)
+		min_trans = root->defrag_trans_start;
+
+	ret = btrfs_search_forward(root, &key, path, cache_only, min_trans);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+	btrfs_release_path(root, path);
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -95,7 +105,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+					   min_trans);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 cache_only, &last_ret,
@@ -106,19 +117,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = -EAGAIN;
 	}
 
-	for (i = 1; i < BTRFS_MAX_LEVEL; i++) {
-		if (path->locks[i]) {
-			btrfs_tree_unlock(path->nodes[i]);
-			path->locks[i] = 0;
-		}
-		if (path->nodes[i]) {
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
-	}
+	btrfs_release_path(root, path);
 	if (is_extent)
 		btrfs_extent_post_op(trans, root);
-
 out:
 	if (is_extent)
 		mutex_unlock(&root->fs_info->alloc_mutex);
@@ -138,6 +139,7 @@ done:
 	if (ret != -EAGAIN) {
 		memset(&root->defrag_progress, 0,
 		       sizeof(root->defrag_progress));
+		root->defrag_trans_start = trans->transid;
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From f9efa9c784aa3b801feb367f72c6867d26fb348e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:14:04 -0400
Subject: Btrfs: Reduce contention on the root node

This calls unlock_up sooner in btrfs_search_slot in order to decrease the
amount of work done with the higher level tree locks held.

Also, it changes btrfs_tree_lock to spin for a bit against the page lock
before scheduling.  This makes a big difference in context switch rate under
highly contended workloads.

Longer term, a better locking structure is needed than the page lock.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 17 +++++++++++------
 fs/btrfs/locking.c | 10 ++++++++++
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cb80f32a9c..c6759fc1004 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1313,16 +1313,13 @@ again:
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
+			unlock_up(p, level, lowest_unlock);
+
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level) {
-				unlock_up(p, level, lowest_unlock);
 				break;
 			}
 
-			if (should_reada)
-				reada_for_search(root, p, level, slot,
-						 key->objectid);
-
 			blocknr = btrfs_node_blockptr(b, slot);
 			gen = btrfs_node_ptr_generation(b, slot);
 			blocksize = btrfs_level_size(root, level - 1);
@@ -1340,6 +1337,11 @@ again:
 					btrfs_release_path(NULL, p);
 					if (tmp)
 						free_extent_buffer(tmp);
+					if (should_reada)
+						reada_for_search(root, p,
+								 level, slot,
+								 key->objectid);
+
 					tmp = read_tree_block(root, blocknr,
 							 blocksize, gen);
 					if (tmp)
@@ -1348,12 +1350,15 @@ again:
 				} else {
 					if (tmp)
 						free_extent_buffer(tmp);
+					if (should_reada)
+						reada_for_search(root, p,
+								 level, slot,
+								 key->objectid);
 					b = read_node_slot(root, b, slot);
 				}
 			}
 			if (!p->skip_locking)
 				btrfs_tree_lock(b);
-			unlock_up(p, level, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 80813a307b4..058a506a0dd 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -27,6 +27,16 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
+	int i;
+
+	if (!TestSetPageLocked(eb->first_page))
+		return 0;
+	for (i = 0; i < 512; i++) {
+		cpu_relax();
+		if (!TestSetPageLocked(eb->first_page))
+			return 0;
+	}
+	cpu_relax();
 	lock_page(eb->first_page);
 	return 0;
 }
-- 
cgit v1.2.3


From a7a16fd772620605c76e8ac8bdbc8ccc9e3df1a0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 26 Jun 2008 10:34:20 -0400
Subject: Btrfs: Fix deadlock while searching for dead roots on mount

btrfs_find_dead_roots called btrfs_read_fs_root_no_radix, which
means we end up calling btrfs_search_slot with a path already held.

The fix is to remember the key inside btrfs_find_dead_roots and drop
the path.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/root-tree.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 8bf21ba0a43..a5c0e98b5ae 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -154,6 +154,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
 	struct btrfs_path *path;
 	int ret;
 	u32 nritems;
@@ -166,6 +167,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
+again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -196,7 +199,11 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 		if (btrfs_disk_root_refs(leaf, ri) != 0)
 			goto next;
 
-		dead_root = btrfs_read_fs_root_no_radix(root->fs_info, &key);
+		memcpy(&found_key, &key, sizeof(key));
+		key.offset++;
+		btrfs_release_path(root, path);
+		dead_root = btrfs_read_fs_root_no_radix(root->fs_info,
+							&found_key);
 		if (IS_ERR(dead_root)) {
 			ret = PTR_ERR(dead_root);
 			goto err;
@@ -206,6 +213,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 					  &root->fs_info->dead_roots);
 		if (ret)
 			goto err;
+		goto again;
 next:
 		slot++;
 		path->slots[0]++;
-- 
cgit v1.2.3


From 7d9eb12c8739e7dc80c78c6b3596f912ecd8f941 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jul 2008 14:19:17 -0400
Subject: Btrfs: Add locking around volume management (device
 add/remove/balance)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  4 ---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 73 +++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/ioctl.c       |  6 ++---
 fs/btrfs/volumes.c     | 58 +++++++++++++++++++++++++++++----------
 6 files changed, 103 insertions(+), 40 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c6759fc1004..bbf9bf37406 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1251,10 +1251,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(cow && root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
-	WARN_ON(root == root->fs_info->chunk_root &&
-		!mutex_is_locked(&root->fs_info->chunk_mutex));
-	WARN_ON(root == root->fs_info->dev_root &&
-		!mutex_is_locked(&root->fs_info->chunk_mutex));
 	if (ins_len < 0)
 		lowest_unlock = 2;
 again:
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a28796482b4..f3783dbd9b6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -523,6 +523,7 @@ struct btrfs_fs_info {
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
+	struct mutex volume_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4cdc0b6a267..8f4c40033e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1287,6 +1287,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
+	mutex_init(&fs_info->volume_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e0857ffbc3..8ebfa6be079 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -245,6 +245,7 @@ static int noinline find_search_start(struct btrfs_root *root,
 	u64 search_start = *start_ret;
 	int wrapped = 0;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -1242,6 +1243,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
@@ -1297,6 +1299,7 @@ static int update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1391,6 +1394,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	int level;
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	path = btrfs_alloc_path();
@@ -1437,6 +1441,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 {
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (!pending) {
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
@@ -1490,6 +1495,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -1619,6 +1625,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *pinned_extents;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	pending_del = &extent_root->fs_info->pending_del;
 	pinned_extents = &extent_root->fs_info->pinned_extents;
 
@@ -2428,6 +2435,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_node_key(node, &found_key, path->slots[level]);
 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
 			       sizeof(found_key)));
+		/*
+		 * unlock our path, this is safe because only this
+		 * function is allowed to delete this snapshot
+		 */
 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 			if (path->nodes[i] && path->locks[i]) {
 				path->locks[i] = 0;
@@ -2611,7 +2622,6 @@ static int find_root_for_ref(struct btrfs_root *root,
 	u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
 	u64 found_bytenr;
 	int ret;
-	int i;
 
 	root_location.offset = (u64)-1;
 	root_location.type = BTRFS_ROOT_ITEM_KEY;
@@ -2635,12 +2645,6 @@ static int find_root_for_ref(struct btrfs_root *root,
 				found_bytenr = path->nodes[level]->start;
 		}
 
-		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-			if (!path->nodes[i])
-				break;
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
 		btrfs_release_path(cur_root, path);
 
 		if (found_bytenr == bytenr) {
@@ -2689,6 +2693,8 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	int ret;
 	int level;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			     struct btrfs_extent_ref);
 	ref_root = btrfs_ref_root(path->nodes[0], ref);
@@ -2707,6 +2713,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
 						&root_location);
 	BUG_ON(!found_root);
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
 	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 		found_key.objectid = ref_objectid;
@@ -2748,9 +2755,9 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		/* this can happen if the reference is not against
 		 * the latest version of the tree root
 		 */
-		if (is_bad_inode(inode)) {
+		if (is_bad_inode(inode))
 			goto out;
-		}
+
 		*last_file_objectid = inode->i_ino;
 		*last_file_root = found_root->root_key.objectid;
 		*last_file_offset = ref_offset;
@@ -2760,7 +2767,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	} else {
 		struct btrfs_trans_handle *trans;
 		struct extent_buffer *eb;
-		int i;
+		int needs_lock = 0;
 
 		eb = read_tree_block(found_root, extent_key->objectid,
 				     extent_key->offset, 0);
@@ -2782,26 +2789,40 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		if (ret)
 			goto out;
 
+		/*
+		 * right here almost anything could happen to our key,
+		 * but that's ok.  The cow below will either relocate it
+		 * or someone else will have relocated it.  Either way,
+		 * it is in a different spot than it was before and
+		 * we're happy.
+		 */
+
 		trans = btrfs_start_transaction(found_root, 1);
 
+		if (found_root == extent_root->fs_info->extent_root ||
+		    found_root == extent_root->fs_info->chunk_root ||
+		    found_root == extent_root->fs_info->dev_root) {
+			needs_lock = 1;
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
+
 		path->lowest_level = level;
 		path->reada = 2;
 		ret = btrfs_search_slot(trans, found_root, &found_key, path,
 					0, 1);
 		path->lowest_level = 0;
-		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-			if (!path->nodes[i])
-				break;
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
 		btrfs_release_path(found_root, path);
+
 		if (found_root == found_root->fs_info->extent_root)
 			btrfs_extent_post_op(trans, found_root);
+		if (needs_lock)
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+
 		btrfs_end_transaction(trans, found_root);
-	}
 
+	}
 out:
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2943,7 +2964,10 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
 
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		mutex_lock(&root->fs_info->alloc_mutex);
+
 		new_alloc_flags = update_block_group_flags(root,
 						   shrink_block_group->flags);
 		if (new_alloc_flags != shrink_block_group->flags) {
@@ -2954,7 +2978,10 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		}
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
+		mutex_lock(&root->fs_info->alloc_mutex);
 	}
 	return 0;
 }
@@ -3031,9 +3058,9 @@ again:
 		if (ret < 0)
 			goto out;
 
+next:
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-next:
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
@@ -3083,6 +3110,7 @@ next:
 		printk("btrfs relocate found %llu last extent was %llu\n",
 		       (unsigned long long)total_found,
 		       (unsigned long long)found_key.objectid);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
@@ -3090,6 +3118,7 @@ next:
 
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
+		mutex_lock(&root->fs_info->alloc_mutex);
 		goto again;
 	}
 
@@ -3097,7 +3126,10 @@ next:
 	 * we've freed all the extents, now remove the block
 	 * group item from the tree
 	 */
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
+	mutex_lock(&root->fs_info->alloc_mutex);
 	memcpy(&key, &shrink_block_group->key, sizeof(key));
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3119,8 +3151,12 @@ next:
 	kfree(shrink_block_group);
 
 	btrfs_del_item(trans, root, path);
+	btrfs_release_path(root, path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_commit_transaction(trans, root);
 
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	/* the code to unpin extents might set a few bits in the free
 	 * space cache for this range again
 	 */
@@ -3263,6 +3299,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct extent_io_tree *block_group_cache;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 026039a2ac5..83f17a5cbd6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -307,8 +307,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -378,8 +377,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	}
 
 out_unlock:
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->volume_mutex);
 out:
 	kfree(vol_args);
 	return ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4e7cee27aab..5e6ee7a6f73 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -56,6 +56,18 @@ void btrfs_unlock_volumes(void)
 	mutex_unlock(&uuid_mutex);
 }
 
+static void lock_chunks(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -822,6 +834,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
+	lock_chunks(root);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -856,6 +869,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
+	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -870,9 +884,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
 		root->fs_info->avail_system_alloc_bits |
@@ -988,9 +1001,8 @@ error_close:
 	if (bdev)
 		close_bdev_excl(bdev);
 out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1010,10 +1022,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1065,9 +1077,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 out:
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->volume_mutex);
 
 	return ret;
 
@@ -1122,7 +1134,7 @@ out:
 	return ret;
 }
 
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
 	struct btrfs_super_block *super_copy =
@@ -1134,6 +1146,16 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	return btrfs_update_device(trans, device);
 }
 
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	int ret;
+	lock_chunks(device->dev_root);
+	ret = __btrfs_grow_device(trans, device, new_size);
+	unlock_chunks(device->dev_root);
+	return ret;
+}
+
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    u64 chunk_tree, u64 chunk_objectid,
@@ -1234,6 +1256,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
+	lock_chunks(root);
+
 	/*
 	 * step two, delete the device extents and the
 	 * chunk tree entries
@@ -1278,6 +1302,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	/* once for us */
 	free_extent_map(em);
 
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
@@ -1308,8 +1333,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
-	BUG(); /* FIXME, needs locking */
-
+	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
@@ -1355,13 +1379,14 @@ int btrfs_balance(struct btrfs_root *dev_root)
 
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
-		if (ret) {
+		if (ret)
 			break;
-		}
+
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
 		if (found_key.objectid != key.objectid)
 			break;
+
 		chunk = btrfs_item_ptr(path->nodes[0],
 				       path->slots[0],
 				       struct btrfs_chunk);
@@ -1370,16 +1395,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (key.offset == 0)
 			break;
 
+		btrfs_release_path(chunk_root, path);
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		BUG_ON(ret);
-		btrfs_release_path(chunk_root, path);
 	}
 	ret = 0;
 error:
 	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -1419,14 +1445,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
 	path->reada = 2;
 
+	lock_chunks(root);
+
 	device->total_bytes = new_size;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
+		unlock_chunks(root);
 		btrfs_end_transaction(trans, root);
 		goto done;
 	}
 	WARN_ON(diff > old_total);
 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 
 	key.objectid = device->devid;
-- 
cgit v1.2.3


From 77a41afb7d0dd0f27b6f2f1a5bc701929c7034de Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jul 2008 14:32:12 -0400
Subject: Btrfs: Drop some verbose printks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 2 --
 fs/btrfs/transaction.c | 2 --
 2 files changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8f4c40033e9..b01b3f4f92a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1117,9 +1117,7 @@ static int cleaner_kthread(void *arg)
 
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->cleaner_mutex);
-printk("cleaner awake\n");
 		btrfs_clean_old_snapshots(root);
-printk("cleaner done\n");
 		mutex_unlock(&root->fs_info->cleaner_mutex);
 
 		if (freezing(current)) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 98f422d9ab0..a8a3cb03de5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -667,7 +667,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	trans->transaction->in_commit = 1;
-printk("trans %Lu in commit\n", trans->transid);
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
@@ -748,7 +747,6 @@ printk("trans %Lu in commit\n", trans->transid);
 	kfree(pinned_copy);
 
 	cur_trans->commit_done = 1;
-printk("trans %Lu done in commit\n", cur_trans->transid);
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
 	put_transaction(cur_trans);
-- 
cgit v1.2.3


From e6dcd2dc9c489108648e2ed543315dd134d50a9a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:53:50 -0400
Subject: Btrfs: New data=ordered implementation

The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated and inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, the previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |   4 +-
 fs/btrfs/ctree.h        |  19 +-
 fs/btrfs/disk-io.c      |  13 +-
 fs/btrfs/extent-tree.c  | 132 +++++++++-----
 fs/btrfs/extent_io.c    |  52 +++++-
 fs/btrfs/extent_io.h    |  14 +-
 fs/btrfs/extent_map.c   |   5 +-
 fs/btrfs/file-item.c    |  62 ++++---
 fs/btrfs/file.c         |  67 ++++---
 fs/btrfs/inode.c        | 447 ++++++++++++++++++++++++++++++-----------------
 fs/btrfs/ordered-data.c | 455 ++++++++++++++++++++++++++++++------------------
 fs/btrfs/ordered-data.h |  71 ++++++--
 fs/btrfs/transaction.c  |  67 +------
 fs/btrfs/transaction.h  |   4 -
 14 files changed, 910 insertions(+), 502 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 40b4e0c9cd0..8d03687510e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,6 +21,7 @@
 
 #include "extent_map.h"
 #include "extent_io.h"
+#include "ordered-data.h"
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -32,9 +33,8 @@ struct btrfs_inode {
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
 	struct inode vfs_inode;
-	atomic_t ordered_writeback;
+	struct btrfs_ordered_inode_tree ordered_tree;
 
-	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f3783dbd9b6..ceebc052ddc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
+#include <linux/wait.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
 #include "extent_io.h"
@@ -37,6 +38,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
@@ -510,6 +512,7 @@ struct btrfs_fs_info {
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
+	wait_queue_head_t transaction_throttle;
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
@@ -541,6 +544,7 @@ struct btrfs_fs_info {
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
@@ -1384,6 +1388,17 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -1556,9 +1571,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums);
+			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret);
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b01b3f4f92a..4a5ebafb935 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,11 @@ static int end_workqueue_bio(struct bio *bio,
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	if (bio->bi_rw & (1 << BIO_RW))
+		btrfs_queue_worker(&fs_info->endio_write_workers,
+				   &end_io_wq->work);
+	else
+		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -1286,6 +1290,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	init_waitqueue_head(&fs_info->transaction_throttle);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1325,9 +1330,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_write_workers,
+			   fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_write_workers,
+			    fs_info->thread_pool_size);
 
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
@@ -1447,6 +1456,7 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1702,6 +1712,7 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8ebfa6be079..343d1101c31 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1895,36 +1895,17 @@ error:
 	return ret;
 }
 
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_alloc_size,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data)
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
 {
 	int ret;
-	int pending_ret;
-	u64 super_used;
-	u64 root_used;
 	u64 search_start = 0;
 	u64 alloc_profile;
-	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_root *extent_root = info->extent_root;
-	struct btrfs_extent_item *extent_item;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_path *path;
-	struct btrfs_key keys[2];
-
-	maybe_lock_mutex(root);
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -1974,11 +1955,48 @@ again:
 	}
 	if (ret) {
 		printk("allocation failed flags %Lu\n", data);
-	}
-	if (ret) {
 		BUG();
-		goto out;
 	}
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+	return 0;
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+				     empty_size, hint_byte, search_end, ins,
+				     data);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, u64 owner_offset,
+					 struct btrfs_key *ins)
+{
+	int ret;
+	int pending_ret;
+	u64 super_used;
+	u64 root_used;
+	u64 num_bytes = ins->offset;
+	u32 sizes[2];
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_path *path;
+	struct btrfs_key keys[2];
 
 	/* block accounting for super block */
 	spin_lock_irq(&info->delalloc_lock);
@@ -1990,10 +2008,6 @@ again:
 	root_used = btrfs_root_used(&root->root_item);
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
-
 	if (root == extent_root) {
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
@@ -2001,10 +2015,6 @@ again:
 		goto update_block;
 	}
 
-	WARN_ON(trans->alloc_exclude_nr);
-	trans->alloc_exclude_start = ins->objectid;
-	trans->alloc_exclude_nr = ins->offset;
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
 					 owner, owner_offset);
@@ -2054,6 +2064,51 @@ update_block:
 		BUG();
 	}
 out:
+	return ret;
+}
+
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 min_alloc_size,
+		       u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 owner_offset,
+		       u64 empty_size, u64 hint_byte,
+		       u64 search_end, struct btrfs_key *ins, u64 data)
+{
+	int ret;
+
+	maybe_lock_mutex(root);
+
+	ret = __btrfs_reserve_extent(trans, root, num_bytes,
+				     min_alloc_size, empty_size, hint_byte,
+				     search_end, ins, data);
+	BUG_ON(ret);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	BUG_ON(ret);
+
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2288,8 +2343,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
-			ret = drop_snap_lookup_refcount(root, bytenr,
-						blocksize, &refs);
+			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+						&refs);
 			BUG_ON(ret);
 			if (refs != 1) {
 				parent = path->nodes[*level];
@@ -2584,7 +2639,6 @@ out_unlock:
 	kfree(ra);
 	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 	if (trans) {
-		btrfs_add_ordered_inode(inode);
 		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40a5f53cb04..3f82a6e9ca4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -793,6 +793,13 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_ordered);
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
@@ -812,8 +819,8 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-			      mask);
+			      EXTENT_DELALLOC | EXTENT_DIRTY,
+			      0, NULL, mask);
 }
 EXPORT_SYMBOL(set_extent_delalloc);
 
@@ -825,6 +832,13 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(clear_extent_dirty);
 
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_ordered);
+
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
@@ -1395,10 +1409,9 @@ static int end_bio_extent_writepage(struct bio *bio,
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
-
 		if (tree->ops && tree->ops->writepage_end_io_hook) {
 			ret = tree->ops->writepage_end_io_hook(page, start,
-						       end, state);
+						       end, state, uptodate);
 			if (ret)
 				uptodate = 0;
 		}
@@ -1868,9 +1881,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			unlock_extent(tree, cur, end, GFP_NOFS);
 			break;
 		}
-
 		extent_offset = cur - em->start;
+		if (extent_map_end(em) <= cur) {
+printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
+		}
 		BUG_ON(extent_map_end(em) <= cur);
+		if (end < cur) {
+printk("2bad mapping end %Lu cur %Lu\n", end, cur);
+		}
 		BUG_ON(end < cur);
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
@@ -1976,6 +1994,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
+	u64 unlock_start;
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
@@ -1988,7 +2007,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 nr_delalloc;
 	u64 delalloc_end;
 
-
 	WARN_ON(!PageLocked(page));
 	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2030,6 +2048,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		delalloc_start = delalloc_end + 1;
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_start = start;
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
@@ -2038,6 +2057,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_extent(tree, start, page_end, GFP_NOFS);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		unlock_start = page_end + 1;
 		goto done;
 	}
 
@@ -2047,6 +2071,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 page_end, NULL, 1);
+			unlock_start = page_end + 1;
 			break;
 		}
 		em = epd->get_extent(inode, page, page_offset, cur,
@@ -2071,8 +2100,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
+
+			unlock_extent(tree, unlock_start, cur + iosize -1,
+				      GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 cur + iosize - 1,
+							 NULL, 1);
 			cur = cur + iosize;
 			page_offset += iosize;
+			unlock_start = cur;
 			continue;
 		}
 
@@ -2119,7 +2156,8 @@ done:
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	unlock_extent(tree, start, page_end, GFP_NOFS);
+	if (unlock_start <= page_end)
+		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f1960dafaa1..2268a799589 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,6 +13,8 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_ORDERED (1 << 9)
+#define EXTENT_ORDERED_METADATA (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
@@ -42,7 +44,7 @@ struct extent_io_ops {
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
-				      struct extent_state *state);
+				      struct extent_state *state, int uptodate);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
@@ -131,6 +133,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int filled);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -141,8 +145,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -209,6 +219,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len);
 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
 				    struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f5a04eb9a2a..81123277c2b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -206,10 +206,11 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
-		merge = rb_entry(rb, struct extent_map, rb_node);
 		ret = -EEXIST;
+		free_extent_map(merge);
 		goto out;
 	}
 	atomic_inc(&em->refs);
@@ -268,6 +269,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
+	BUG_ON(spin_trylock(&tree->lock));
 	em = tree->last;
 	if (em && end > em->start && start < extent_map_end(em))
 		goto found;
@@ -318,6 +320,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
 	int ret = 0;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	if (tree->last == em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f537eb43c2c..345caf8ff51 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -135,26 +135,37 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret)
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret)
 {
-	u32 *sums;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
 
-	sums = kmalloc(bio->bi_vcnt * BTRFS_CRC32_SIZE, GFP_NOFS);
+	WARN_ON(bio->bi_vcnt <= 0);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
-	*sums_ret = (char *)sums;
+	*sums_ret = sums;
+	sector_sum = &sums->sums;
+	sums->file_offset = page_offset(bvec->bv_page);
+	sums->len = bio->bi_size;
+	INIT_LIST_HEAD(&sums->list);
 
 	while(bio_index < bio->bi_vcnt) {
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
-		*sums = ~(u32)0;
-		*sums = btrfs_csum_data(root, data + bvec->bv_offset,
-					*sums, bvec->bv_len);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root,
+						  data + bvec->bv_offset,
+						  sector_sum->sum,
+						  bvec->bv_len);
 		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(*sums, (char *)sums);
-		sums++;
+		btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(bvec->bv_page) +
+			bvec->bv_offset;
+		sector_sum++;
 		bio_index++;
 		bvec++;
 	}
@@ -163,7 +174,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root,
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums)
+			   struct btrfs_ordered_sum *sums)
 {
 	u64 objectid = inode->i_ino;
 	u64 offset;
@@ -171,17 +182,16 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	u64 next_offset;
+	u64 total_bytes = 0;
 	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
-	u32 *sums32 = (u32 *)sums;
+	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	int bio_index = 0;
-	struct bio_vec *bvec = bio->bi_io_vec;
 	char *eb_map;
 	char *eb_token;
 	unsigned long map_len;
@@ -189,10 +199,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	sector_sum = &sums->sums;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	offset = sector_sum->offset;
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
@@ -303,7 +314,7 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-next_bvec:
+next_sector:
 
 	if (!eb_token ||
 	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
@@ -321,21 +332,20 @@ next_bvec:
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       sums32, BTRFS_CRC32_SIZE);
+		       &sector_sum->sum, BTRFS_CRC32_SIZE);
 	} else {
-		write_extent_buffer(leaf, sums32, (unsigned long)item,
-				    BTRFS_CRC32_SIZE);
+		write_extent_buffer(leaf, &sector_sum->sum,
+				    (unsigned long)item, BTRFS_CRC32_SIZE);
 	}
-	bio_index++;
-	bvec++;
-	sums32++;
-	if (bio_index < bio->bi_vcnt) {
+	total_bytes += root->sectorsize;
+	sector_sum++;
+	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  BTRFS_CRC32_SIZE);
 		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    page_offset(bvec->bv_page)) {
-			offset = page_offset(bvec->bv_page);
-			goto next_bvec;
+		    sector_sum->offset) {
+			    offset = sector_sum->offset;
+			goto next_sector;
 		}
 	}
 	if (eb_token) {
@@ -343,7 +353,7 @@ next_bvec:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	if (bio_index < bio->bi_vcnt) {
+	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
 		goto again;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8037792f878..12e765f7e0d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,7 +34,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 #include "compat.h"
@@ -273,7 +272,9 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		if (last_pos_in_file < start_pos) {
+		if (hole_size > 0) {
+			btrfs_wait_ordered_range(inode, last_pos_in_file,
+						 last_pos_in_file + hole_size);
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
@@ -303,19 +304,17 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	    inline_size > root->fs_info->max_inline ||
 	    (inline_size & (root->sectorsize -1)) == 0 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		u64 last_end;
-
+		/* check for reserved extents on each page, we don't want
+		 * to reset the delalloc bit on things that already have
+		 * extents reserved.
+		 */
+		set_extent_delalloc(io_tree, start_pos,
+				    end_of_last_block, GFP_NOFS);
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
 			set_page_dirty(p);
 		}
-		last_end = (u64)(pages[num_pages -1]->index) <<
-				PAGE_CACHE_SHIFT;
-		last_end += PAGE_CACHE_SIZE - 1;
-		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
-				 GFP_NOFS);
-		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -350,10 +349,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *tmp;
 	u64 len = end - start + 1;
+	u64 next_start;
 	int ret;
 	int testend = 1;
 
+	WARN_ON(end < start);
 	if (end == (u64)-1) {
 		len = (u64)-1;
 		testend = 0;
@@ -370,6 +372,8 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			spin_unlock(&em_tree->lock);
 			break;
 		}
+		tmp = rb_entry(&em->rb_node, struct extent_map, rb_node);
+		next_start = tmp->start;
 		remove_extent_mapping(em_tree, em);
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
@@ -778,37 +782,58 @@ static int prepare_pages(struct btrfs_root *root, struct file *file,
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
 	u64 start_pos;
+	u64 last_pos;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
+	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
-
+again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
 			err = -ENOMEM;
 			BUG_ON(1);
 		}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(pages[i]);
-#else
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(pages[i]);
-		set_page_extent_mapped(pages[i]);
-		WARN_ON(!PageLocked(pages[i]));
 	}
 	if (start_pos < inode->i_size) {
-		u64 last_pos;
-		last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > start_pos &&
+		    ordered->file_offset < last_pos) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent(&BTRFS_I(inode)->io_tree,
+				      start_pos, last_pos - 1, GFP_NOFS);
+			for (i = 0; i < num_pages; i++) {
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
+			}
+			btrfs_wait_ordered_range(inode, start_pos,
+						 last_pos - start_pos);
+			goto again;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
 				  GFP_NOFS);
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      start_pos, last_pos - 1, GFP_NOFS);
 	}
+	for (i = 0; i < num_pages; i++) {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(pages[i]);
+#else
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+#endif
+		set_page_extent_mapped(pages[i]);
+		WARN_ON(!PageLocked(pages[i]));
+	}
 	return 0;
 }
 
@@ -969,13 +994,11 @@ out_nolock:
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
-	btrfs_ordered_throttle(root, inode);
 	return num_written ? num_written : err;
 }
 
 int btrfs_release_file(struct inode * inode, struct file * filp)
 {
-	btrfs_del_ordered_inode(inode, 0);
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d39433dfb2c..c5a62f0b959 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "ordered-data.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -109,10 +110,11 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_start = start;
 	u64 orig_num_bytes;
 	struct btrfs_key ins;
-	int ret;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -120,33 +122,44 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_bytes, start, &alloc_hint);
 	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
 
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
-		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
-					 root->sectorsize,
-					 root->root_key.objectid,
-					 trans->transid,
-					 inode->i_ino, start, 0,
-					 alloc_hint, (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+					   root->sectorsize, 0, 0,
+					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
 			goto out;
 		}
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = start;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ins.offset - 1);
+		}
+
 		cur_alloc_size = ins.offset;
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       start, ins.objectid, ins.offset,
-					       ins.offset, 0);
-		inode->i_blocks += ins.offset >> 9;
-		btrfs_check_file(root, inode);
+		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+					       ins.offset);
+		BUG_ON(ret);
 		if (num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
 			       cur_alloc_size);
@@ -156,10 +169,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	btrfs_drop_extent_cache(inode, orig_start,
-				orig_start + orig_num_bytes - 1);
-	btrfs_add_ordered_inode(inode);
-	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -341,25 +350,15 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	int ret = 0;
-	char *sums = NULL;
+	struct btrfs_ordered_sum *sums;
 
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	btrfs_set_trans_block_group(trans, inode);
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_add_ordered_sum(inode, sums);
 	BUG_ON(ret);
 
-	kfree(sums);
-
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
@@ -369,14 +368,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	if (!(rw & (1 << BIO_RW))) {
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-		BUG_ON(ret);
-		goto mapit;
-	}
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM)) {
+	if (!(rw & (1 << BIO_RW))) {
 		goto mapit;
 	}
 
@@ -387,6 +382,96 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+			     struct inode *inode, u64 file_offset,
+			     struct list_head *list)
+{
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	btrfs_set_trans_block_group(trans, inode);
+	while(!list_empty(list)) {
+		cur = list->next;
+		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+		mutex_lock(&BTRFS_I(inode)->csum_mutex);
+		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
+				       inode, sum);
+		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return 0;
+}
+
+int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered_extent;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 alloc_hint = 0;
+	struct list_head list;
+	struct btrfs_key ins;
+	int ret;
+
+	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	if (!ret) {
+		return 0;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	BUG_ON(!ordered_extent);
+
+	lock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+
+	INIT_LIST_HEAD(&list);
+
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode,
+				 ordered_extent->file_offset,
+				 ordered_extent->file_offset +
+				 ordered_extent->len,
+				 ordered_extent->file_offset, &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       ordered_extent->file_offset,
+				       ordered_extent->start,
+				       ordered_extent->len,
+				       ordered_extent->len, 0);
+	BUG_ON(ret);
+	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
+				ordered_extent->file_offset +
+				ordered_extent->len - 1);
+	inode->i_blocks += ordered_extent->len >> 9;
+	unlock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+	add_pending_csums(trans, inode, ordered_extent->file_offset,
+			  &ordered_extent->list);
+
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+	/* once for us */
+	btrfs_put_ordered_extent(ordered_extent);
+	/* once for the tree */
+	btrfs_put_ordered_extent(ordered_extent);
+
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -409,7 +494,8 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
 		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino, start);
+		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+		       start);
 		goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -833,7 +919,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
-	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -849,14 +934,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
 
-	if (inode->i_nlink == 0) {
-		/* if the inode isn't linked anywhere,
-		 * we don't need to worry about
-		 * data=ordered
-		 */
-		btrfs_del_ordered_inode(inode, 1);
-	}
-
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
@@ -931,6 +1008,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1117,34 +1195,6 @@ error:
 	return ret;
 }
 
-static int btrfs_cow_one_page(struct inode *inode, struct page *page,
-			      size_t zero_start)
-{
-	char *kaddr;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	int ret = 0;
-
-	WARN_ON(!PageLocked(page));
-	set_page_extent_mapped(page);
-
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
-
-	if (zero_start != PAGE_CACHE_SIZE) {
-		kaddr = kmap(page);
-		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
-		flush_dcache_page(page);
-		kunmap(page);
-	}
-	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-	return ret;
-}
-
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -1153,12 +1203,16 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
 	int ret = 0;
 	u64 page_start;
+	u64 page_end;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -1168,6 +1222,10 @@ again:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -1181,10 +1239,32 @@ again:
 			goto out;
 		}
 	}
-
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	wait_on_page_writeback(page);
-	ret = btrfs_cow_one_page(inode, page, offset);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
+	if (offset != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	unlock_page(page);
 	page_cache_release(page);
@@ -1222,8 +1302,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
+		btrfs_wait_ordered_range(inode, hole_start, hole_size);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
@@ -1258,6 +1339,7 @@ void btrfs_delete_inode(struct inode *inode)
 	unsigned long nr;
 	int ret;
 
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
@@ -1403,7 +1485,6 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1705,7 +1786,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1930,7 +2010,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2066,64 +2145,18 @@ out_unlock:
 
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
-				struct extent_map *em)
+				struct extent_map *em,
+				u64 map_start, u64 map_len)
 {
 	u64 start_diff;
-	u64 new_end;
-	int ret = 0;
-	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
-
-	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
-		goto invalid;
-
-	if (!real_blocks && em->block_start != existing->block_start)
-		goto invalid;
-
-	new_end = max(existing->start + existing->len, em->start + em->len);
-
-	if (existing->start >= em->start) {
-		if (em->start + em->len < existing->start)
-			goto invalid;
 
-		start_diff = existing->start - em->start;
-		if (real_blocks && em->block_start + start_diff !=
-		    existing->block_start)
-			goto invalid;
-
-		em->len = new_end - em->start;
-
-		remove_extent_mapping(em_tree, existing);
-		/* free for the tree */
-		free_extent_map(existing);
-		ret = add_extent_mapping(em_tree, em);
-
-	} else if (em->start > existing->start) {
-
-		if (existing->start + existing->len < em->start)
-			goto invalid;
-
-		start_diff = em->start - existing->start;
-		if (real_blocks && existing->block_start + start_diff !=
-		    em->block_start)
-			goto invalid;
-
-		remove_extent_mapping(em_tree, existing);
-		em->block_start = existing->block_start;
-		em->start = existing->start;
-		em->len = new_end - existing->start;
-		free_extent_map(existing);
-
-		ret = add_extent_mapping(em_tree, em);
-	} else {
-		goto invalid;
-	}
-	return ret;
-
-invalid:
-	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
-	       existing->start, existing->len, existing->block_start,
-	       em->start, em->len, em->block_start);
-	return -EIO;
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+	start_diff = map_start - em->start;
+	em->start = map_start;
+	em->len = map_len;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+		em->block_start += start_diff;
+	return add_extent_mapping(em_tree, em);
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2170,10 +2203,9 @@ again:
 		err = -ENOMEM;
 		goto out;
 	}
-
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -2314,6 +2346,9 @@ insert:
 	 */
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
+
+		ret = 0;
+
 		existing = lookup_extent_mapping(em_tree, start, len);
 		if (existing && (existing->start > start ||
 		    existing->start + existing->len <= start)) {
@@ -2325,7 +2360,8 @@ insert:
 							 em->len);
 			if (existing) {
 				err = merge_extent_mapping(em_tree, existing,
-							   em);
+							   em, start,
+							   root->sectorsize);
 				free_extent_map(existing);
 				if (err) {
 					free_extent_map(em);
@@ -2341,6 +2377,7 @@ insert:
 		} else {
 			free_extent_map(em);
 			em = existing;
+			err = 0;
 		}
 	}
 	spin_unlock(&em_tree->lock);
@@ -2348,8 +2385,9 @@ out:
 	btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err)
+		if (!err) {
 			err = ret;
+		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -2474,8 +2512,7 @@ btrfs_readpages(struct file *file, struct address_space *mapping,
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
-
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -2493,15 +2530,54 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	return ret;
 }
 
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					      page_offset(page));
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		return 0;
+	}
+	return __btrfs_releasepage(page, gfp_flags);
+}
+
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	wait_on_page_writeback(page);
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btrfs_releasepage(page, GFP_NOFS);
+	if (offset) {
+		btrfs_releasepage(page, GFP_NOFS);
+		return;
+	}
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					   page_offset(page));
+	if (ordered) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+		btrfs_writepage_end_io_hook(page, page_start,
+					    page_end, NULL, 1);
+		btrfs_put_ordered_extent(ordered);
+		lock_extent(tree, page_start, page_end, GFP_NOFS);
+	}
+	clear_extent_bit(tree, page_start, page_end,
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+		 EXTENT_ORDERED,
+		 1, 1, GFP_NOFS);
+	__btrfs_releasepage(page, GFP_NOFS);
+
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+		invalidate_extent_lru(tree, page_offset(page),
+				      PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2527,35 +2603,63 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long end;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
+	unsigned long zero_start;
 	loff_t size;
 	int ret;
 	u64 page_start;
+	u64 page_end;
 
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
 	if (ret)
 		goto out;
 
 	ret = -EINVAL;
-
+again:
 	lock_page(page);
-	wait_on_page_writeback(page);
 	size = i_size_read(inode);
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if ((page->mapping != inode->i_mapping) ||
-	    (page_start > size)) {
+	    (page_start >= size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
+	wait_on_page_writeback(page);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
 
 	/* page is wholly or partially inside EOF */
 	if (page_start + PAGE_CACHE_SIZE > size)
-		end = size & ~PAGE_CACHE_MASK;
+		zero_start = size & ~PAGE_CACHE_MASK;
 	else
-		end = PAGE_CACHE_SIZE;
+		zero_start = PAGE_CACHE_SIZE;
 
-	ret = btrfs_cow_one_page(inode, page, end);
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
 	unlock_page(page);
@@ -2662,15 +2766,28 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
-	ei->ordered_trans = 0;
+	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
+	struct btrfs_ordered_extent *ordered;
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (!ordered)
+			break;
+		else {
+			printk("found ordered extent %Lu %Lu\n",
+			       ordered->file_offset, ordered->len);
+			btrfs_remove_ordered_extent(inode, ordered);
+			btrfs_put_ordered_extent(ordered);
+			btrfs_put_ordered_extent(ordered);
+		}
+	}
 	btrfs_drop_extent_cache(inode, 0, (u64)-1);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
@@ -2869,7 +2986,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2921,6 +3037,20 @@ out_fail:
 	return err;
 }
 
+static int btrfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			    EXTENT_DELALLOC, 0)) {
+printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
+WARN_ON(1);
+	}
+	return __set_page_dirty_nobuffers(page);
+}
+
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
 {
@@ -2967,6 +3097,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
@@ -2982,7 +3113,7 @@ static struct address_space_operations btrfs_aops = {
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.set_page_dirty	= btrfs_set_page_dirty,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 254da822566..6513270f054 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -22,48 +22,30 @@
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "extent_io.h"
 
-struct tree_entry {
-	u64 root_objectid;
-	u64 objectid;
-	struct inode *inode;
-	struct rb_node rb_node;
-};
 
-/*
- * returns > 0 if entry passed (root, objectid) is > entry,
- * < 0 if (root, objectid) < entry and zero if they are equal
- */
-static int comp_entry(struct tree_entry *entry, u64 root_objectid,
-		      u64 objectid)
+static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
-	if (root_objectid < entry->root_objectid)
-		return -1;
-	if (root_objectid > entry->root_objectid)
-		return 1;
-	if (objectid < entry->objectid)
-		return -1;
-	if (objectid > entry->objectid)
-		return 1;
-	return 0;
+	if (entry->file_offset + entry->len < entry->file_offset)
+		return (u64)-1;
+	return entry->file_offset + entry->len;
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
-				   u64 objectid, struct rb_node *node)
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
-	struct tree_entry *entry;
-	int comp;
+	struct btrfs_ordered_extent *entry;
 
 	while(*p) {
 		parent = *p;
-		entry = rb_entry(parent, struct tree_entry, rb_node);
+		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
-		comp = comp_entry(entry, root_objectid, objectid);
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			p = &(*p)->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			p = &(*p)->rb_right;
 		else
 			return parent;
@@ -74,24 +56,23 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
 	return NULL;
 }
 
-static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
-				     u64 objectid, struct rb_node **prev_ret)
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+				     struct rb_node **prev_ret)
 {
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
-	struct tree_entry *entry;
-	struct tree_entry *prev_entry = NULL;
-	int comp;
+	struct rb_node *test;
+	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *prev_entry = NULL;
 
 	while(n) {
-		entry = rb_entry(n, struct tree_entry, rb_node);
+		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
-		comp = comp_entry(entry, root_objectid, objectid);
 
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			n = n->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			n = n->rb_right;
 		else
 			return n;
@@ -99,195 +80,329 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
-		prev = rb_next(prev);
-		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	while(prev && file_offset >= entry_end(prev_entry)) {
+		test = rb_next(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		if (file_offset < entry_end(prev_entry))
+			break;
+
+		prev = test;
+	}
+	if (prev)
+		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+				      rb_node);
+	while(prev && file_offset < entry_end(prev_entry)) {
+		test = rb_prev(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		prev = test;
 	}
 	*prev_ret = prev;
 	return NULL;
 }
 
-static inline struct rb_node *tree_search(struct rb_root *root,
-					  u64 root_objectid, u64 objectid)
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
+{
+	if (file_offset < entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+					  u64 file_offset)
 {
+	struct rb_root *root = &tree->tree;
 	struct rb_node *prev;
 	struct rb_node *ret;
-	ret = __tree_search(root, root_objectid, objectid, &prev);
+	struct btrfs_ordered_extent *entry;
+
+	if (tree->last) {
+		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+				 rb_node);
+		if (offset_in_entry(entry, file_offset))
+			return tree->last;
+	}
+	ret = __tree_search(root, file_offset, &prev);
 	if (!ret)
-		return prev;
+		ret = prev;
+	if (ret)
+		tree->last = ret;
 	return ret;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode)
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
-	u64 transid = root->fs_info->running_transaction->transid;
-	struct tree_entry *entry;
-	struct rb_node *node;
 	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	if (transid <= BTRFS_I(inode)->ordered_trans)
-		return 0;
-
-	tree = &root->fs_info->running_transaction->ordered_inode_tree;
-
-	read_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
-	read_unlock(&tree->lock);
-	if (node) {
-		return 0;
-	}
-
-	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	entry = kzalloc(sizeof(*entry), GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
-	write_lock(&tree->lock);
-	entry->objectid = inode->i_ino;
-	entry->root_objectid = root_objectid;
+	mutex_lock(&tree->mutex);
+	entry->file_offset = file_offset;
+	entry->start = start;
+	entry->len = len;
 	entry->inode = inode;
+	/* one ref for the tree */
+	atomic_set(&entry->refs, 1);
+	init_waitqueue_head(&entry->wait);
+	INIT_LIST_HEAD(&entry->list);
 
-	node = tree_insert(&tree->tree, root_objectid,
-			   inode->i_ino, &entry->rb_node);
-
-	BTRFS_I(inode)->ordered_trans = transid;
-	if (!node)
-		igrab(inode);
-
-	write_unlock(&tree->lock);
+	node = tree_insert(&tree->tree, file_offset,
+			   &entry->rb_node);
+	if (node) {
+		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		atomic_inc(&entry->refs);
+	}
+	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+			   entry_end(entry) - 1, GFP_NOFS);
 
-	if (node)
-		kfree(entry);
+	set_bit(BTRFS_ORDERED_START, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	BUG_ON(node);
 	return 0;
 }
 
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				   u64 *root_objectid, u64 *objectid,
-				   struct inode **inode)
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, sum->file_offset);
 	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+search_fail:
+printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
+		node = rb_first(&tree->tree);
+		while(node) {
+			entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+			printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
+			node = rb_next(node);
+		}
+		BUG();
 	}
-	entry = rb_entry(node, struct tree_entry, rb_node);
+	BUG_ON(!node);
 
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
-	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, sum->file_offset)) {
+		goto search_fail;
 	}
 
-	*root_objectid = entry->root_objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
-	*objectid = entry->objectid;
-	write_unlock(&tree->lock);
-	return 1;
+	list_add_tail(&sum->list, &entry->list);
+	mutex_unlock(&tree->mutex);
+	return 0;
 }
 
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode)
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   u64 file_offset, u64 io_size)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	struct btrfs_ordered_extent *entry;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int ret;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+			     GFP_NOFS);
+	node = tree_search(tree, file_offset);
 	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+		ret = 1;
+		goto out;
 	}
 
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset)) {
+		ret = 1;
+		goto out;
 	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
+
+	ret = test_range_bit(io_tree, entry->file_offset,
+			     entry->file_offset + entry->len - 1,
+			     EXTENT_ORDERED, 0);
+	if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
+printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
 	}
+	if (ret == 0)
+		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+out:
+	mutex_unlock(&tree->mutex);
+	return ret == 0;
+}
 
-	*root_objectid = entry->root_objectid;
-	*objectid = entry->objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
-	rb_erase(node, &tree->tree);
-	write_unlock(&tree->lock);
-	kfree(entry);
-	return 1;
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+	if (atomic_dec_and_test(&entry->refs))
+		kfree(entry);
+	return 0;
 }
 
-static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				     struct inode *inode,
-				     u64 root_objectid, u64 objectid)
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
-	struct rb_node *prev;
 
-	write_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
-	if (!node) {
-		write_unlock(&tree->lock);
-		return;
-	}
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
-	BTRFS_I(inode)->ordered_trans = 0;
-	write_unlock(&tree->lock);
-	atomic_dec(&inode->i_count);
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	kfree(entry);
-	return;
+	tree->last = NULL;
+	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	wake_up(&entry->wait);
+	return 0;
 }
 
-void btrfs_del_ordered_inode(struct inode *inode, int force)
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	wait_event(entry->wait,
+		   test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
+}
 
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
+static void btrfs_start_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry, int wait)
+{
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
 
-	if (!force && (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
-		return;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	if (wait)
+		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						 &entry->flags));
+}
 
-	spin_lock(&root->fs_info->new_trans_lock);
-	if (root->fs_info->running_transaction) {
-		struct btrfs_ordered_inode_tree *tree;
-		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
-						inode->i_ino);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+	u64 end;
+	struct btrfs_ordered_extent *ordered;
+	int found;
+	int should_wait = 0;
+
+again:
+	if (start + len < start)
+		end = (u64)-1;
+	else
+		end = start + len - 1;
+	found = 0;
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, end);
+		if (!ordered) {
+			break;
+		}
+		if (ordered->file_offset >= start + len) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered->file_offset + ordered->len < start) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		btrfs_start_ordered_extent(inode, ordered, should_wait);
+		found++;
+		end = ordered->file_offset;
+		btrfs_put_ordered_extent(ordered);
+		if (end == 0)
+			break;
+		end--;
+	}
+	if (should_wait && found) {
+		should_wait = 0;
+		goto again;
 	}
-	spin_unlock(&root->fs_info->new_trans_lock);
 }
 
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len)
 {
-	struct btrfs_transaction *cur = root->fs_info->running_transaction;
-	while(cur == root->fs_info->running_transaction &&
-	      atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
-		congestion_wait(WRITE, HZ/20);
-#else
-		blk_congestion_wait(WRITE, HZ/20);
-#endif
-	}
+	WARN_ON(1);
 	return 0;
+#if 0
+	int ret;
+	struct btrfs_ordered_inode_tree *tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
+	ret = 0;
+out:
+	mutex_unlock(&tree->mutex);
+	return ret;
+#endif
+}
+
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset))
+		entry = NULL;
+	if (entry)
+		atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
+}
+
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4fa78736423..33292c5fe90 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -20,24 +20,73 @@
 #define __BTRFS_ORDERED_DATA__
 
 struct btrfs_ordered_inode_tree {
-	rwlock_t lock;
+	struct mutex mutex;
 	struct rb_root tree;
+	struct rb_node *last;
 };
 
+struct btrfs_sector_sum {
+	u64 offset;
+	u32 sum;
+};
+
+struct btrfs_ordered_sum {
+	u64 file_offset;
+	u64 len;
+	struct list_head list;
+	struct btrfs_sector_sum sums;
+};
+
+/* bits for the flags field */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+#define BTRFS_ORDERED_START 2 /* set once the extent is added to the tree */
+
+struct btrfs_ordered_extent {
+	u64 file_offset;
+	u64 start;
+	u64 len;
+	unsigned long flags;
+	atomic_t refs;
+	struct list_head list;
+	struct inode *inode;
+	wait_queue_head_t wait;
+	struct rb_node rb_node;
+};
+
+
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+{
+	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+		root->sectorsize;
+	return sizeof(struct btrfs_ordered_sum) +
+		num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	rwlock_init(&t->lock);
+	mutex_init(&t->mutex);
 	t->tree.rb_node = NULL;
+	t->last = NULL;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode);
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode, int force);
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				       u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len);
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset);
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a8a3cb03de5..86a5acc19ce 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,7 +67,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 		cur_trans->start_time = get_seconds();
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -158,10 +157,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		wake_up(&cur_trans->writer_wait);
 
 	if (cur_trans->in_commit && throttle) {
-		int ret;
+		DEFINE_WAIT(wait);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		ret = wait_for_commit(root, cur_trans);
-		BUG_ON(ret);
+		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
+				TASK_UNINTERRUPTIBLE);
+		schedule();
+		finish_wait(&root->fs_info->transaction_throttle, &wait);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
 
@@ -486,58 +487,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
-{
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct inode *inode;
-	u64 root_objectid = 0;
-	u64 objectid = 0;
-	int ret;
-
-	atomic_inc(&root->fs_info->throttles);
-	while(1) {
-		ret = btrfs_find_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_fdatawrite(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	while(1) {
-		root_objectid = 0;
-		objectid = 0;
-		ret = btrfs_find_del_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_write_and_wait(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		atomic_dec(&inode->i_count);
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	atomic_dec(&root->fs_info->throttles);
-	return 0;
-}
-
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -666,6 +615,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
+printk("commit trans %Lu\n", trans->transid);
 	trans->transaction->in_commit = 1;
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -699,8 +649,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
-		ret = btrfs_write_ordered_inodes(trans, root);
-
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
@@ -736,6 +684,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_copy_pinned(root, pinned_copy);
 
+	wake_up(&root->fs_info->transaction_throttle);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
@@ -758,6 +708,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
+printk("done commit trans %Lu\n", trans->transid);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9ccd5a5b170..910350cd4cf 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,7 +19,6 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -31,7 +30,6 @@ struct btrfs_transaction {
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
-	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
@@ -88,8 +86,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From 247e743cbe6e655768c3679f84821e03c1577902 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:53:51 -0400
Subject: Btrfs: Use async helpers to deal with pages that have been improperly
 dirtied

Higher layers sometimes call set_page_dirty without asking the filesystem
to help.  This causes many problems for the data=ordered and COW code.
This commit detects pages that haven't been properly set up for IO and
kicks off an async helper to deal with them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  6 ++++
 fs/btrfs/disk-io.c   |  4 +++
 fs/btrfs/extent_io.c | 10 ++++++
 fs/btrfs/extent_io.h |  1 +
 fs/btrfs/file.c      |  1 +
 fs/btrfs/inode.c     | 93 +++++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 106 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ceebc052ddc..4ddc8a8f82c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -546,6 +546,12 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
+	/*
+	 * fixup workers take dirty pages that didn't properly go through
+	 * the cow mechanism and make them safe to write.  This happens
+	 * on the sys_munmap function call path.
+	 */
+	struct btrfs_workers fixup_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4a5ebafb935..66466d125c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1329,11 +1329,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->fixup_workers, 1);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers,
 			   fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
+	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
@@ -1454,6 +1456,7 @@ fail_tree_root:
 fail_sys_array:
 fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
+	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1710,6 +1713,7 @@ int close_ctree(struct btrfs_root *root)
 
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
+	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3f82a6e9ca4..feff16cb9b4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2050,6 +2050,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
+	if (tree->ops && tree->ops->writepage_start_hook) {
+		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		if (ret == -EAGAIN) {
+			unlock_extent(tree, start, page_end, GFP_NOFS);
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+	}
+
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
 		printk("found delalloc bits after lock_extent\n");
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2268a799589..23affd27af5 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -30,6 +30,7 @@ typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 12e765f7e0d..20928639d17 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -313,6 +313,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
+			ClearPageChecked(p);
 			set_page_dirty(p);
 		}
 	} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c5a62f0b959..47a008c1930 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -403,6 +403,87 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+struct btrfs_writepage_fixup {
+	struct page *page;	/* page to fix up; a page ref is held while queued */
+	struct btrfs_work work;	/* work item queued on fs_info->fixup_workers */
+};
+
+/* async half of btrfs_writepage_start_hook; see it for why this is required */
+void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+{
+	struct btrfs_writepage_fixup *fixup;
+	struct btrfs_ordered_extent *ordered;
+	struct page *page;
+	struct inode *inode;
+	u64 page_start;
+	u64 page_end;
+
+	fixup = container_of(work, struct btrfs_writepage_fixup, work);
+	page = fixup->page;	/* NOTE(review): fixup is never kfree()d here; confirm owner */
+
+	lock_page(page);
+	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
+		ClearPageChecked(page);	/* unmapped, clean or not flagged: skip */
+		goto out_page;
+	}
+
+	inode = page->mapping->host;
+	page_start = page_offset(page);
+	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;	/* inclusive end */
+
+	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered)	/* NOTE(review): no btrfs_put_ordered_extent() here; confirm */
+		goto out;	/* page stays Checked and delalloc is not set */
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			    GFP_NOFS);
+	ClearPageChecked(page);	/* start hook may queue a new fixup from now on */
+out:
+	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+out_page:
+	unlock_page(page);
+	page_cache_release(page);	/* pairs with page_cache_get() in the start hook */
+}
+
+/*
+ * Some paths in the higher layers of the kernel set the page dirty bit
+ * directly, without asking the filesystem.  That breaks COW and the
+ * data=ordered rules, so such pages must be fixed up before writeback.
+ *
+ * Returns 0 when test_range_bit() reports EXTENT_ORDERED in the range.
+ * Otherwise returns -EAGAIN, on which __extent_writepage redirties the
+ * page and backs off; unless the page is already Checked, a fixup work
+ * item is queued first.  The worker sets the delalloc bit when it finds
+ * no ordered extent at the start of the page.
+ */
+int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_writepage_fixup *fixup;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+			     EXTENT_ORDERED, 0);
+	if (ret)
+		return 0;
+
+	if (PageChecked(page))	/* flag is set below when a fixup is queued */
+		return -EAGAIN;
+
+	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+	if (!fixup)
+		return -EAGAIN;	/* nothing queued; page is still not Checked */
+printk("queueing worker to fixup page %lu %Lu\n", inode->i_ino, page_offset(page));
+	SetPageChecked(page);
+	page_cache_get(page);	/* released by btrfs_writepage_fixup_worker */
+	fixup->work.func = btrfs_writepage_fixup_worker;
+	fixup->page = page;
+	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+	return -EAGAIN;
+}
+
 int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
@@ -1263,6 +1344,7 @@ again:
 		flush_dcache_page(page);
 		kunmap(page);
 	}
+	ClearPageChecked(page);
 	set_page_dirty(page);
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
@@ -2658,6 +2740,7 @@ again:
 		flush_dcache_page(page);
 		kunmap(page);
 	}
+	ClearPageChecked(page);
 	set_page_dirty(page);
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
@@ -3039,15 +3122,6 @@ out_fail:
 
 static int btrfs_set_page_dirty(struct page *page)
 {
-	struct inode *inode = page->mapping->host;
-	u64 page_start = page_offset(page);
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-
-	if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			    EXTENT_DELALLOC, 0)) {
-printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
-WARN_ON(1);
-	}
 	return __set_page_dirty_nobuffers(page);
 }
 
@@ -3098,6 +3172,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
+	.writepage_start_hook = btrfs_writepage_start_hook,
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
-- 
cgit v1.2.3


From dbe674a99c8af088faa4c95eddaeb271a3140ab6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:05 -0400
Subject: Btrfs: Update on disk i_size only after pending ordered extents are
 done

This changes the ordered data code to update the on-disk i_size only after
the extent is on disk.  The on-disk i_size is tracked in the in-memory btrfs
inode structure (disk_i_size) and is advanced as ordered extents finish.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  |  9 +++++
 fs/btrfs/file.c         |  2 +-
 fs/btrfs/inode.c        | 28 ++++++++++------
 fs/btrfs/ordered-data.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ordered-data.h |  2 ++
 5 files changed, 119 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 8d03687510e..81c0444f37b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,11 +40,20 @@ struct btrfs_inode {
 	 */
 	u64 last_trans;
 	u64 delalloc_bytes;
+	u64 disk_i_size;
 	u32 flags;
 };
+
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
 	return container_of(inode, struct btrfs_inode, vfs_inode);
 }
 
+static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+{
+	inode->i_size = size;			/* in-memory VFS inode size */
+	BTRFS_I(inode)->disk_i_size = size;	/* value fill_inode_item() stores */
+}
+
+
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 20928639d17..3e4e5c227c0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -338,7 +338,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
-	err = btrfs_end_transaction(trans, root);
+	err = btrfs_end_transaction_throttle(trans, root);
 out_unlock:
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 47a008c1930..baf46017d0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -542,6 +542,7 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
+	btrfs_ordered_update_i_size(inode, ordered_extent);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
@@ -792,7 +793,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
 	inode->i_uid = btrfs_inode_uid(leaf, inode_item);
 	inode->i_gid = btrfs_inode_gid(leaf, inode_item);
-	inode->i_size = btrfs_inode_size(leaf, inode_item);
+	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
 
 	tspec = btrfs_inode_atime(inode_item);
 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
@@ -860,7 +861,7 @@ static void fill_inode_item(struct extent_buffer *leaf,
 {
 	btrfs_set_inode_uid(leaf, item, inode->i_uid);
 	btrfs_set_inode_gid(leaf, item, inode->i_gid);
-	btrfs_set_inode_size(leaf, item, inode->i_size);
+	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
 	btrfs_set_inode_mode(leaf, item, inode->i_mode);
 	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
 
@@ -982,7 +983,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 err:
 	btrfs_free_path(path);
 	if (!ret) {
-		dir->i_size -= name_len * 2;
+		btrfs_i_size_write(dir, dir->i_size - name_len * 2);
 		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 		btrfs_update_inode(trans, root, dir);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
@@ -1044,7 +1045,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
 	if (!err) {
-		inode->i_size = 0;
+		btrfs_i_size_write(inode, 0);
 	}
 
 	nr = trans->blocks_used;
@@ -1089,7 +1090,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
-	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1427,7 +1427,7 @@ void btrfs_delete_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -1561,6 +1561,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	BTRFS_I(inode)->disk_i_size = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1869,6 +1870,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
 
 	if (mode & S_IFDIR)
@@ -1964,7 +1966,8 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 					     dentry->d_parent->d_inode->i_ino);
 		}
 		parent_inode = dentry->d_parent->d_inode;
-		parent_inode->i_size += dentry->d_name.len * 2;
+		btrfs_i_size_write(parent_inode, parent_inode->i_size +
+				   dentry->d_name.len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 		ret = btrfs_update_inode(trans, root,
 					 dentry->d_parent->d_inode);
@@ -2092,6 +2095,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2199,7 +2203,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode->i_fop = &btrfs_dir_file_operations;
 	btrfs_set_trans_block_group(trans, inode);
 
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		goto out_fail;
@@ -2756,6 +2760,7 @@ static void btrfs_truncate(struct inode *inode)
 	int ret;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
+	u64 mask = root->sectorsize - 1;
 
 	if (!S_ISREG(inode->i_mode))
 		return;
@@ -2766,6 +2771,8 @@ static void btrfs_truncate(struct inode *inode)
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+	btrfs_i_size_write(inode, inode->i_size);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode,
@@ -2821,7 +2828,7 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	ret = btrfs_insert_inode_ref(trans, new_root, "..", 2, new_dirid,
 				     new_dirid);
 	inode->i_nlink = 1;
-	inode->i_size = 0;
+	btrfs_i_size_write(inode, 0);
 
 	return btrfs_update_inode(trans, new_root, inode);
 }
@@ -3069,6 +3076,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
+		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -3103,7 +3111,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-	inode->i_size = name_len - 1;
+	btrfs_i_size_write(inode, name_len - 1);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
 		drop_inode = 1;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6513270f054..d86a953ae51 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -406,3 +406,92 @@ out:
 	mutex_unlock(&tree->mutex);
 	return entry;
 }
+
+int btrfs_ordered_update_i_size(struct inode *inode,
+				struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 disk_i_size;
+	u64 new_i_size;
+	u64 i_size_test;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *test;
+
+	mutex_lock(&tree->mutex);
+	disk_i_size = BTRFS_I(inode)->disk_i_size;
+
+	/*
+	 * if the disk i_size is already at the inode->i_size, or
+	 * this ordered extent is inside the disk i_size, we're done
+	 */
+	if (disk_i_size >= inode->i_size ||
+	    ordered->file_offset + ordered->len <= disk_i_size) {
+		goto out;
+	}
+
+	/*
+	 * we can't update the disk_i_size if there are delalloc bytes
+	 * between disk_i_size and this ordered extent
+	 */
+	if (test_range_bit(io_tree, disk_i_size,
+			   ordered->file_offset + ordered->len - 1,
+			   EXTENT_DELALLOC, 0)) {
+		goto out;
+	}
+	/*
+	 * walk backward from this ordered extent to disk_i_size.
+	 * if we find an ordered extent then we can't update disk i_size
+	 * yet
+	 */
+	while(1) {
+		node = rb_prev(&ordered->rb_node);
+		if (!node)
+			break;
+		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (test->file_offset + test->len <= disk_i_size)
+			break;	/* predecessor ends at or below disk_i_size */
+		if (test->file_offset >= inode->i_size)
+			break;	/* predecessor starts at or past i_size */
+		if (test->file_offset >= disk_i_size)
+			goto out;	/* earlier extent past disk_i_size is pending */
+	}	/* NOTE(review): rb_prev() restarts from ordered, so test never changes */
+	new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+
+	/*
+	 * at this point, we know we can safely update i_size to at least
+	 * the offset from this ordered extent.  But, we need to
+	 * walk forward and see if ios from higher up in the file have
+	 * finished.
+	 */
+	node = rb_next(&ordered->rb_node);
+	i_size_test = 0;	/* 0 fails the i_size_test > entry_end() check below */
+	if (node) {
+		/*
+		 * do we have an area where IO might have finished
+		 * between our ordered extent and the next one.
+		 */
+		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		if (test->file_offset > entry_end(ordered)) {
+			i_size_test = test->file_offset - 1;
+		}
+	} else {
+		i_size_test = i_size_read(inode);	/* no later ordered extent */
+	}
+
+	/*
+	 * i_size_test is the end of a region after this ordered
+	 * extent where there are no ordered extents.  As long as there
+	 * are no delalloc bytes in this area, it is safe to update
+	 * disk_i_size to the end of the region.
+	 */
+	if (i_size_test > entry_end(ordered) &&
+	    !test_range_bit(io_tree, entry_end(ordered), i_size_test,
+			   EXTENT_DELALLOC, 0)) {
+		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+	}
+	BTRFS_I(inode)->disk_i_size = new_i_size;
+out:
+	mutex_unlock(&tree->mutex);
+	return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 33292c5fe90..40e9126ad95 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,4 +89,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_add_ordered_pending(struct inode *inode,
 			      struct btrfs_ordered_extent *ordered,
 			      u64 start, u64 len);
+int btrfs_ordered_update_i_size(struct inode *inode,
+				struct btrfs_ordered_extent *ordered);
 #endif
-- 
cgit v1.2.3


From f9295749388f82c8d2f485e99c72cd7c7876a99b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:14 -0400
Subject: btrfs_start_transaction: wait for commits in progress to finish

btrfs_commit_transaction has to loop waiting for any writers in the
transaction to finish before it can proceed.  btrfs_start_transaction
should be polite and not join a transaction that is in the process
of being finished off.

There are a few places that can't wait, basically the ones doing IO that
might be needed to finish the transaction.  For them, btrfs_join_transaction
is added.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/file.c        |  2 +-
 fs/btrfs/inode.c       | 10 +++++-----
 fs/btrfs/transaction.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/transaction.h |  3 +++
 6 files changed, 51 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ddc8a8f82c..acbce542d29 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -513,6 +513,7 @@ struct btrfs_fs_info {
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
+	wait_queue_head_t transaction_wait;
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 66466d125c0..99bb385c298 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1291,6 +1291,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	init_waitqueue_head(&fs_info->transaction_throttle);
+	init_waitqueue_head(&fs_info->transaction_wait);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e4e5c227c0..d6505892cd5 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -251,7 +251,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
 		goto out_unlock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index baf46017d0d..0a687326c0b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -116,7 +116,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -502,7 +502,7 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 	}
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 
 	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
@@ -1812,7 +1812,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	int ret = 0;
 
 	if (wait) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
 	}
@@ -1830,7 +1830,7 @@ void btrfs_dirty_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
@@ -2395,7 +2395,7 @@ again:
 				free_extent_map(em);
 				em = NULL;
 				btrfs_release_path(root, path);
-				trans = btrfs_start_transaction(root, 1);
+				trans = btrfs_join_transaction(root, 1);
 				goto again;
 			}
 			write_extent_buffer(leaf, map + pg_offset, ptr,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 86a5acc19ce..05823904ecb 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -62,6 +62,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 		init_waitqueue_head(&cur_trans->writer_wait);
 		init_waitqueue_head(&cur_trans->commit_wait);
 		cur_trans->in_commit = 0;
+		cur_trans->blocked = 0;
 		cur_trans->use_count = 1;
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
@@ -99,14 +100,36 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 	return 0;
 }
 
-struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-						   int num_blocks)
+struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+					     int num_blocks, int join)
 {
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	struct btrfs_transaction *cur_trans;
 	int ret;
 
 	mutex_lock(&root->fs_info->trans_mutex);
+	cur_trans = root->fs_info->running_transaction;
+	if (cur_trans && cur_trans->blocked && !join) {
+		DEFINE_WAIT(wait);
+		cur_trans->use_count++;
+		while(1) {
+			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (cur_trans->blocked) {
+				mutex_unlock(&root->fs_info->trans_mutex);
+				schedule();
+				mutex_lock(&root->fs_info->trans_mutex);
+				finish_wait(&root->fs_info->transaction_wait,
+					    &wait);
+			} else {
+				finish_wait(&root->fs_info->transaction_wait,
+					    &wait);
+				break;
+			}
+		}
+		put_transaction(cur_trans);
+	}
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
@@ -123,6 +146,17 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 	return h;
 }
 
+struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
+						   int num_blocks)
+{
+	return start_transaction(root, num_blocks, 0);	/* join=0: wait while blocked */
+}
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+						   int num_blocks)
+{
+	return start_transaction(root, num_blocks, 1);	/* join=1: skip the blocked wait */
+}
+
 static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
@@ -156,7 +190,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 
-	if (cur_trans->in_commit && throttle) {
+	if (0 && cur_trans->in_commit && throttle) {
 		DEFINE_WAIT(wait);
 		mutex_unlock(&root->fs_info->trans_mutex);
 		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
@@ -617,6 +651,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 printk("commit trans %Lu\n", trans->transid);
 	trans->transaction->in_commit = 1;
+	trans->transaction->blocked = 1;
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
@@ -684,7 +719,9 @@ printk("commit trans %Lu\n", trans->transid);
 
 	btrfs_copy_pinned(root, pinned_copy);
 
+	trans->transaction->blocked = 0;
 	wake_up(&root->fs_info->transaction_throttle);
+	wake_up(&root->fs_info->transaction_wait);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 910350cd4cf..11fbdeceb26 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,6 +27,7 @@ struct btrfs_transaction {
 	int in_commit;
 	int use_count;
 	int commit_done;
+	int blocked;
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
@@ -75,6 +76,8 @@ int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+						   int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From ba1da2f442ec91a1534afa893f9bef7e33056ace Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:15 -0400
Subject: Btrfs: Don't pin pages in ram until the entire ordered extent is on
 disk.

Checksum items are not inserted until the entire ordered extent is on disk,
but individual pages might be clean and available for reclaim long before
the whole extent is on disk.

In order to allow those pages to be freed, we need to be able to search
the list of ordered extents to find the checksum that is going to be inserted
in the tree.  This way if the page needs to be read back in before
the checksums are in the btree, we'll be able to verify the checksum on
the page.

This commit adds the ability to search the pending ordered extents for
a given offset in the file, and changes btrfs_releasepage to allow
ordered pages to be freed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c         |  2 +-
 fs/btrfs/inode.c        | 37 +++++++++++++++++++++----------------
 fs/btrfs/ordered-data.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ordered-data.h |  1 +
 4 files changed, 69 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d6505892cd5..3e4e5c227c0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -251,7 +251,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	trans = btrfs_join_transaction(root, 1);
+	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
 		goto out_unlock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0a687326c0b..293355c92a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
-static int add_pending_csums(struct btrfs_trans_handle *trans,
+static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
 {
@@ -390,15 +390,12 @@ static int add_pending_csums(struct btrfs_trans_handle *trans,
 	struct btrfs_ordered_sum *sum;
 
 	btrfs_set_trans_block_group(trans, inode);
-	while(!list_empty(list)) {
-		cur = list->next;
+	list_for_each(cur, list) {
 		sum = list_entry(cur, struct btrfs_ordered_sum, list);
 		mutex_lock(&BTRFS_I(inode)->csum_mutex);
 		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
 				       inode, sum);
 		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-		list_del(&sum->list);
-		kfree(sum);
 	}
 	return 0;
 }
@@ -498,9 +495,8 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	int ret;
 
 	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
-	if (!ret) {
+	if (!ret)
 		return 0;
-	}
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -571,6 +567,18 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
 	if (IS_ERR(item)) {
+		/*
+		 * It is possible there is an ordered extent that has
+		 * not yet finished for this range in the file.  If so,
+		 * that extent will have a csum cached, and it will insert
+		 * the sum after all the blocks in the extent are fully
+		 * on disk.  So, look for an ordered extent and use the
+		 * sum if found.
+		 */
+		ret = btrfs_find_ordered_sum(inode, start, &csum);
+		if (ret == 0)
+			goto found;
+
 		ret = PTR_ERR(item);
 		/* a csum that isn't present is a preallocated region. */
 		if (ret == -ENOENT || ret == -EFBIG)
@@ -582,6 +590,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
 			   BTRFS_CRC32_SIZE);
+found:
 	set_state_private(io_tree, start, csum);
 out:
 	if (path)
@@ -888,7 +897,7 @@ static void fill_inode_item(struct extent_buffer *leaf,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
-int btrfs_update_inode(struct btrfs_trans_handle *trans,
+int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
 {
@@ -1567,6 +1576,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	return 0;
 }
@@ -1868,6 +1878,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
@@ -2097,6 +2108,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2618,14 +2630,6 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 
 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
-	struct btrfs_ordered_extent *ordered;
-
-	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
-					      page_offset(page));
-	if (ordered) {
-		btrfs_put_ordered_extent(ordered);
-		return 0;
-	}
 	return __btrfs_releasepage(page, gfp_flags);
 }
 
@@ -3078,6 +3082,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d86a953ae51..b739e3abebb 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -245,8 +245,18 @@ out:
 
 int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 {
-	if (atomic_dec_and_test(&entry->refs))
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	if (atomic_dec_and_test(&entry->refs)) {
+		while(!list_empty(&entry->list)) {
+			cur = entry->list.next;
+			sum = list_entry(cur, struct btrfs_ordered_sum, list);
+			list_del(&sum->list);
+			kfree(sum);
+		}
 		kfree(entry);
+	}
 	return 0;
 }
 
@@ -444,8 +454,9 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * if we find an ordered extent then we can't update disk i_size
 	 * yet
 	 */
+	node = &ordered->rb_node;
 	while(1) {
-		node = rb_prev(&ordered->rb_node);
+		node = rb_prev(node);
 		if (!node)
 			break;
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
@@ -495,3 +506,36 @@ out:
 	mutex_unlock(&tree->mutex);
 	return 0;
 }
+
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
+{
+	struct btrfs_ordered_sum *ordered_sum;
+	struct btrfs_sector_sum *sector_sums;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+	struct list_head *cur;
+	int ret = 1;
+	int index;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
+	if (!ordered)
+		return 1;
+
+	mutex_lock(&tree->mutex);
+	list_for_each_prev(cur, &ordered->list) {
+		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
+		if (offset >= ordered_sum->file_offset &&
+		    offset < ordered_sum->file_offset + ordered_sum->len) {
+			index = (offset - ordered_sum->file_offset) /
+				BTRFS_I(inode)->root->sectorsize;;
+			sector_sums = &ordered_sum->sums;
+			*sum = sector_sums[index].sum;
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	mutex_unlock(&tree->mutex);
+	return ret;
+}
+
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 40e9126ad95..33f0d9e91b1 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -91,4 +91,5 @@ int btrfs_add_ordered_pending(struct inode *inode,
 			      u64 start, u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
 #endif
-- 
cgit v1.2.3


From ee6e6504e147a59a9f4d582662c105e9d72ae638 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:40 -0400
Subject: Add a per-inode lock around btrfs_drop_extents

btrfs_drop_extents is always called with a range lock held on the inode.
But, it may operate on extents outside that range as it drops and splits
them.

This patch adds a per-inode mutex that is held while calling
btrfs_drop_extents and while inserting new extents into the tree.  It
prevents races between two processes working on adjacent ranges in the tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/file.c        |  8 ++++++++
 fs/btrfs/inode.c       | 10 ++++++++++
 4 files changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81c0444f37b..3bf40591742 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -32,6 +32,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_tree;
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
+	struct mutex extent_mutex;
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 343d1101c31..4036c62b667 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1671,6 +1671,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
 			    ref_generation, owner_objectid, owner_offset,
 			    pin, pin == 0);
+
+	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
 	return ret ? ret : pending_ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3e4e5c227c0..40ad1b2958c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -242,6 +242,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
 	u64 inline_size;
+	int did_inline = 0;
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -275,6 +276,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		if (hole_size > 0) {
 			btrfs_wait_ordered_range(inode, last_pos_in_file,
 						 last_pos_in_file + hole_size);
+			mutex_lock(&BTRFS_I(inode)->extent_mutex);
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
@@ -289,6 +291,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size -1);
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			btrfs_check_file(root, inode);
 		}
 		if (err)
@@ -321,6 +324,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		/* step one, delete the existing extents in this range */
 		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		err = btrfs_drop_extents(trans, root, inode, start_pos,
 					 aligned_end, aligned_end, &hint_byte);
 		if (err)
@@ -332,9 +336,13 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 					   inline_size, pages, 0, num_pages);
 		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+		did_inline = 1;
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
+		if (did_inline)
+			BTRFS_I(inode)->disk_i_size = end_pos;
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 293355c92a4..0d1e2ba47a9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -516,6 +516,8 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 					  trans->transid, inode->i_ino,
 					  ordered_extent->file_offset, &ins);
 	BUG_ON(ret);
+
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
@@ -531,6 +533,8 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+
 	inode->i_blocks += ordered_extent->len >> 9;
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
@@ -1399,6 +1403,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		err = btrfs_drop_extents(trans, root, inode,
 					 hole_start, block_end, hole_start,
 					 &alloc_hint);
@@ -1412,6 +1417,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						(u64)-1);
 			btrfs_check_file(root, inode);
 		}
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 		btrfs_end_transaction(trans, root);
 		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		if (err)
@@ -1578,6 +1584,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
+	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	return 0;
 }
 
@@ -1880,6 +1887,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
+	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
@@ -2105,6 +2113,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
+		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
@@ -3079,6 +3088,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
+		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-- 
cgit v1.2.3


From 0bd40a718444b28793283b70286f9e29d464a189 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:43 -0400
Subject: btrfs_next_leaf: do readahead when skip_locking is turned on

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index bbf9bf37406..cdc713062b0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3206,7 +3206,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
-		if (level == 1 && path->locks[1] && path->reada)
+		if (level == 1 && (path->locks[1] || path->skip_locking) &&
+		    path->reada)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
-- 
cgit v1.2.3


From 54641bd17db9fbfc13c7b1d4ee0dd2713bf3e076 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 12:54:48 -0400
Subject: Btrfs: Force caching of metadata block groups on mount to avoid
 deadlock

This is a temporary change to avoid deadlocks until the extent tree locking
is fixed up.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4036c62b667..be2aef1cb7a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3333,6 +3333,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		set_state_private(block_group_cache, found_key.objectid,
 				  (unsigned long)cache);
 
+		/* hack for now */
+		if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
+			cache_block_group(root->fs_info->extent_root,
+					  cache);
+		}
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
-- 
cgit v1.2.3


From eb84ae039e10f1f80443d846ba1350122bbdc753 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Jul 2008 13:53:27 -0400
Subject: Btrfs: Cleanup and comment ordered-data.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c        |  12 ++++-
 fs/btrfs/ordered-data.c | 130 ++++++++++++++++++++++++++----------------------
 fs/btrfs/ordered-data.h |  49 +++++++++++++++---
 3 files changed, 121 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0d1e2ba47a9..f37e09e724f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1343,7 +1343,7 @@ again:
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
-		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
@@ -2660,6 +2660,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
 					   page_offset(page));
 	if (ordered) {
+		/*
+		 * IO on this page will never be started, so we need
+		 * to account for any ordered extents now
+		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
@@ -2732,11 +2736,15 @@ again:
 	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 	set_page_extent_mapped(page);
 
+	/*
+	 * we can't set the delalloc bits if there are pending ordered
+	 * extents.  Drop our locks and wait for them to finish
+	 */
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
-		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b739e3abebb..230fd3ca6b2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -136,6 +136,19 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 	return ret;
 }
 
+/* allocate and add a new ordered_extent into the per-inode tree.
+ * file_offset is the logical offset in the file
+ *
+ * start is the disk block number of an extent already reserved in the
+ * extent allocation tree
+ *
+ * len is the length of the extent
+ *
+ * This also sets the EXTENT_ORDERED bit on the range in the inode.
+ *
+ * The tree is given a single reference on the ordered extent that was
+ * inserted.
+ */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len)
 {
@@ -152,7 +165,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
-	entry->inode = inode;
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
 	init_waitqueue_head(&entry->wait);
@@ -167,12 +179,15 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
-	set_bit(BTRFS_ORDERED_START, &entry->flags);
 	mutex_unlock(&tree->mutex);
 	BUG_ON(node);
 	return 0;
 }
 
+/*
+ * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
+ * when an ordered extent is finished.
+ */
 int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
 {
 	struct btrfs_ordered_inode_tree *tree;
@@ -182,29 +197,25 @@ int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
 	tree = &BTRFS_I(inode)->ordered_tree;
 	mutex_lock(&tree->mutex);
 	node = tree_search(tree, sum->file_offset);
-	if (!node) {
-search_fail:
-printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
-		node = rb_first(&tree->tree);
-		while(node) {
-			entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-			printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
-			node = rb_next(node);
-		}
-		BUG();
-	}
 	BUG_ON(!node);
 
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-	if (!offset_in_entry(entry, sum->file_offset)) {
-		goto search_fail;
-	}
+	BUG_ON(!offset_in_entry(entry, sum->file_offset));
 
 	list_add_tail(&sum->list, &entry->list);
 	mutex_unlock(&tree->mutex);
 	return 0;
 }
 
+/*
+ * this is used to account for finished IO across a given range
+ * of the file.  The IO should not span ordered extents.  If
+ * a given ordered_extent is completely done, 1 is returned, otherwise
+ * 0.
+ *
+ * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
+ * to make sure this function only returns 1 once for a given ordered extent.
+ */
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				   u64 file_offset, u64 io_size)
 {
@@ -233,9 +244,6 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 	ret = test_range_bit(io_tree, entry->file_offset,
 			     entry->file_offset + entry->len - 1,
 			     EXTENT_ORDERED, 0);
-	if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
-printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
-	}
 	if (ret == 0)
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 out:
@@ -243,6 +251,10 @@ out:
 	return ret == 0;
 }
 
+/*
+ * used to drop a reference on an ordered extent.  This will free
+ * the extent if the last reference is dropped
+ */
 int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 {
 	struct list_head *cur;
@@ -260,6 +272,10 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	return 0;
 }
 
+/*
+ * remove an ordered extent from the tree.  No references are dropped
+ * but, anyone waiting on this extent is woken up.
+ */
 int btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry)
 {
@@ -277,27 +293,25 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	return 0;
 }
 
-void btrfs_wait_ordered_extent(struct inode *inode,
-			       struct btrfs_ordered_extent *entry)
-{
-	u64 start = entry->file_offset;
-	u64 end = start + entry->len - 1;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
-#else
-	do_sync_mapping_range(inode->i_mapping, start, end,
-			      SYNC_FILE_RANGE_WRITE);
-#endif
-	wait_event(entry->wait,
-		   test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
-}
-
-static void btrfs_start_ordered_extent(struct inode *inode,
-			       struct btrfs_ordered_extent *entry, int wait)
+/*
+ * Used to start IO or wait for a given ordered extent to finish.
+ *
+ * If wait is one, this effectively waits on page writeback for all the pages
+ * in the extent, and it waits on the io completion code to insert
+ * metadata into the btree corresponding to the extent
+ */
+void btrfs_start_ordered_extent(struct inode *inode,
+				       struct btrfs_ordered_extent *entry,
+				       int wait)
 {
 	u64 start = entry->file_offset;
 	u64 end = start + entry->len - 1;
 
+	/*
+	 * pages in the range can be dirty, clean or writeback.  We
+	 * start IO on any dirty ones so the wait doesn't stall waiting
+	 * for pdflush to find them
+	 */
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
 #else
@@ -309,6 +323,9 @@ static void btrfs_start_ordered_extent(struct inode *inode,
 						 &entry->flags));
 }
 
+/*
+ * Used to wait on ordered extents across a large range of bytes.
+ */
 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
@@ -349,31 +366,11 @@ again:
 	}
 }
 
-int btrfs_add_ordered_pending(struct inode *inode,
-			      struct btrfs_ordered_extent *ordered,
-			      u64 start, u64 len)
-{
-	WARN_ON(1);
-	return 0;
-#if 0
-	int ret;
-	struct btrfs_ordered_inode_tree *tree;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-
-	tree = &BTRFS_I(inode)->ordered_tree;
-	mutex_lock(&tree->mutex);
-	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
-		ret = -EAGAIN;
-		goto out;
-	}
-	set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
-	ret = 0;
-out:
-	mutex_unlock(&tree->mutex);
-	return ret;
-#endif
-}
 
+/*
+ * find an ordered extent corresponding to file_offset.  return NULL if
+ * nothing is found, otherwise take a reference on the extent and return it
+ */
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset)
 {
@@ -397,6 +394,10 @@ out:
 	return entry;
 }
 
+/*
+ * lookup and return any extent before 'file_offset'.  NULL is returned
+ * if none is found
+ */
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
 {
@@ -417,6 +418,10 @@ out:
 	return entry;
 }
 
+/*
+ * After an extent is done, call this to conditionally update the on disk
+ * i_size.  i_size is updated to cover any fully written part of the file.
+ */
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered)
 {
@@ -507,6 +512,11 @@ out:
 	return 0;
 }
 
+/*
+ * search the ordered extents for one corresponding to 'offset' and
+ * try to find a checksum.  This is used because we allow pages to
+ * be reclaimed before their checksum is actually put into the btree
+ */
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 {
 	struct btrfs_ordered_sum *ordered_sum;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 33f0d9e91b1..98f491d1022 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -19,12 +19,19 @@
 #ifndef __BTRFS_ORDERED_DATA__
 #define __BTRFS_ORDERED_DATA__
 
+/* one of these per inode */
 struct btrfs_ordered_inode_tree {
 	struct mutex mutex;
 	struct rb_root tree;
 	struct rb_node *last;
 };
 
+/*
+ * these are used to collect checksums computed just before bio submission.
+ * They are attached via a list into the ordered extent, and
+ * checksum items are inserted into the tree after all the blocks in
+ * the ordered extent are on disk
+ */
 struct btrfs_sector_sum {
 	u64 offset;
 	u32 sum;
@@ -34,27 +41,56 @@ struct btrfs_ordered_sum {
 	u64 file_offset;
 	u64 len;
 	struct list_head list;
+	/* last field is a variable length array of btrfs_sector_sums */
 	struct btrfs_sector_sum sums;
 };
 
-/* bits for the flags field */
+/*
+ * bits for the flags field:
+ *
+ * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written.
+ * It is used to make sure metadata is inserted into the tree only once
+ * per extent.
+ *
+ * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the
+ * rbtree, just before waking any waiters.  It is used to indicate the
+ * IO is done and any metadata is inserted into the tree.
+ */
 #define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+
 #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
-#define BTRFS_ORDERED_START 2 /* set when tree setup */
 
 struct btrfs_ordered_extent {
+	/* logical offset in the file */
 	u64 file_offset;
+
+	/* disk byte number */
 	u64 start;
+
+	/* length of the extent in bytes */
 	u64 len;
+
+	/* flags (described above) */
 	unsigned long flags;
+
+	/* reference count */
 	atomic_t refs;
+
+	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
-	struct inode *inode;
+
+	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
+
+	/* our friendly rbtree entry */
 	struct rb_node rb_node;
 };
 
 
+/*
+ * calculates the total size you need to allocate for an ordered sum
+ * structure spanning 'bytes' in the file
+ */
 static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
@@ -81,14 +117,11 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
-void btrfs_wait_ordered_extent(struct inode *inode,
-			       struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry, int wait);
 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
-int btrfs_add_ordered_pending(struct inode *inode,
-			      struct btrfs_ordered_extent *ordered,
-			      u64 start, u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
-- 
cgit v1.2.3


From 3edf7d33f4edb1e4a9bb0a4c0a84d95fb4d22a09 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jul 2008 06:17:13 -0400
Subject: Btrfs: Handle data checksumming on bios that span multiple ordered
 extents

Data checksumming is done right before the bio is sent down the IO stack,
which means a single bio might span more than one ordered extent.  In
this case, the checksumming data is split between two ordered extents.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  4 ++--
 fs/btrfs/file-item.c    | 43 +++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/inode.c        |  6 +-----
 fs/btrfs/ordered-data.c | 36 ++++++++++++++++++------------------
 fs/btrfs/ordered-data.h | 11 +++++++++--
 5 files changed, 69 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index acbce542d29..96ab2797c09 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1579,8 +1579,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
 			   struct btrfs_ordered_sum *sums);
-int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, struct btrfs_ordered_sum **sums_ret);
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+		       struct bio *bio);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 345caf8ff51..e02f1e5acb0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,26 +134,53 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, struct btrfs_ordered_sum **sums_ret)
+int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+		       struct bio *bio)
 {
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
+	unsigned long total_bytes = 0;
+	unsigned long this_sum_bytes = 0;
+	u64 offset;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
-	*sums_ret = sums;
+
 	sector_sum = &sums->sums;
-	sums->file_offset = page_offset(bvec->bv_page);
+	sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
+	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+	BUG_ON(!ordered);
 
 	while(bio_index < bio->bi_vcnt) {
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (offset >= ordered->file_offset + ordered->len) {
+			unsigned long bytes_left;
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			bytes_left = bio->bi_size - total_bytes;
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
+				       GFP_NOFS);
+			BUG_ON(!sums);
+			sector_sum = &sums->sums;
+			sums->len = bytes_left;
+			sums->file_offset = offset;
+			ordered = btrfs_lookup_ordered_extent(inode,
+						      sums->file_offset);
+			BUG_ON(!ordered);
+		}
+
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
 		sector_sum->sum = ~(u32)0;
 		sector_sum->sum = btrfs_csum_data(root,
@@ -165,10 +192,18 @@ int btrfs_csum_one_bio(struct btrfs_root *root,
 				 (char *)&sector_sum->sum);
 		sector_sum->offset = page_offset(bvec->bv_page) +
 			bvec->bv_offset;
+
 		sector_sum++;
 		bio_index++;
+		total_bytes += bvec->bv_len;
+		this_sum_bytes += bvec->bv_len;
 		bvec++;
 	}
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	if (total_bytes != bio->bi_size) {
+printk("warning, total bytes %lu bio size %u\n", total_bytes, bio->bi_size);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f37e09e724f..4d729d90d2b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -351,12 +351,8 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
-	struct btrfs_ordered_sum *sums;
 
-	ret = btrfs_csum_one_bio(root, bio, &sums);
-	BUG_ON(ret);
-
-	ret = btrfs_add_ordered_sum(inode, sums);
+	ret = btrfs_csum_one_bio(root, inode, bio);
 	BUG_ON(ret);
 
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 230fd3ca6b2..1ddb7bceea9 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -186,22 +186,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
 /*
  * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
- * when an ordered extent is finished.
+ * when an ordered extent is finished.  If a bio's checksums span more than
+ * one ordered extent, the caller adds a separate ordered sum to each extent.
  */
-int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
+int btrfs_add_ordered_sum(struct inode *inode,
+			  struct btrfs_ordered_extent *entry,
+			  struct btrfs_ordered_sum *sum)
 {
 	struct btrfs_ordered_inode_tree *tree;
-	struct rb_node *node;
-	struct btrfs_ordered_extent *entry;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
 	mutex_lock(&tree->mutex);
-	node = tree_search(tree, sum->file_offset);
-	BUG_ON(!node);
-
-	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-	BUG_ON(!offset_in_entry(entry, sum->file_offset));
-
 	list_add_tail(&sum->list, &entry->list);
 	mutex_unlock(&tree->mutex);
 	return 0;
@@ -524,8 +519,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	struct list_head *cur;
+	unsigned long num_sectors;
+	unsigned long i;
+	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
 	int ret = 1;
-	int index;
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	if (!ordered)
@@ -534,14 +531,17 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	mutex_lock(&tree->mutex);
 	list_for_each_prev(cur, &ordered->list) {
 		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		if (offset >= ordered_sum->file_offset &&
-		    offset < ordered_sum->file_offset + ordered_sum->len) {
-			index = (offset - ordered_sum->file_offset) /
-				BTRFS_I(inode)->root->sectorsize;;
+		if (offset >= ordered_sum->file_offset) {
+			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = &ordered_sum->sums;
-			*sum = sector_sums[index].sum;
-			ret = 0;
-			goto out;
+			for (i = 0; i < num_sectors; i++) {
+				if (sector_sums[i].offset == offset) {
+printk("find ordered sum inode %lu offset %Lu\n", inode->i_ino, offset);
+					*sum = sector_sums[i].sum;
+					ret = 0;
+					goto out;
+				}
+			}
 		}
 	}
 out:
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 98f491d1022..1794efd13ca 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,7 +39,11 @@ struct btrfs_sector_sum {
 
 struct btrfs_ordered_sum {
 	u64 file_offset;
-	u64 len;
+	/*
+	 * this is the length in bytes covered by the sums array below.
+	 * But, the sums array may not be contiguous in the file.
+	 */
+	unsigned long len;
 	struct list_head list;
 	/* last field is a variable length array of btrfs_sector_sums */
 	struct btrfs_sector_sum sums;
@@ -95,6 +99,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
 		root->sectorsize;
+	num_sectors++;
 	return sizeof(struct btrfs_ordered_sum) +
 		num_sectors * sizeof(struct btrfs_sector_sum);
 }
@@ -114,7 +119,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len);
-int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+int btrfs_add_ordered_sum(struct inode *inode,
+			  struct btrfs_ordered_extent *entry,
+			  struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
-- 
cgit v1.2.3


From 211f90e68b679d27fe23c5505f86d6ce62c98bae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jul 2008 11:56:15 -0400
Subject: Btrfs: Don't allow releasepage to succeed if EXTENT_ORDERED is set

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c |  3 ++-
 fs/btrfs/inode.c     | 14 +++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index feff16cb9b4..3153b4fdd0b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2542,7 +2542,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	int ret = 1;
 
-	if (test_range_bit(tree, start, end, EXTENT_IOBITS, 0))
+	if (test_range_bit(tree, start, end,
+			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
 		ret = 0;
 	else {
 		if ((mask & GFP_NOFS) == GFP_NOFS)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4d729d90d2b..8803abc89bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -477,10 +477,8 @@ printk("queueing worker to fixup page %lu %Lu\n", inode->i_ino, page_offset(page
 	return -EAGAIN;
 }
 
-int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
-				struct extent_state *state, int uptodate)
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
-	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
@@ -550,6 +548,12 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return 0;
 }
 
+int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	return btrfs_finish_ordered_io(page->mapping->host, start, end);
+}
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -2663,8 +2667,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
-		btrfs_writepage_end_io_hook(page, page_start,
-					    page_end, NULL, 1);
+		btrfs_finish_ordered_io(page->mapping->host,
+					page_start, page_end);
 		btrfs_put_ordered_extent(ordered);
 		lock_extent(tree, page_start, page_end, GFP_NOFS);
 	}
-- 
cgit v1.2.3


From 7f3c74fb831fa19bafe087e817c0a5ff3883f1ea Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jul 2008 12:01:11 -0400
Subject: Btrfs: Keep extent mappings in ram until pending ordered extents are
 done

It was possible for stale mappings from disk to be used instead of the
new pending ordered extent.  This adds a flag to the extent map struct
to keep it pinned until the pending ordered extent is actually on disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c  | 27 +++++++++++++++------------
 fs/btrfs/extent_map.c |  4 ++++
 fs/btrfs/extent_map.h |  3 +++
 fs/btrfs/file-item.c  |  5 +----
 fs/btrfs/file.c       | 14 ++++++++++----
 fs/btrfs/inode.c      | 15 +++++++++++++++
 6 files changed, 48 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3153b4fdd0b..d4a63ae7ed1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2000,7 +2000,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	struct block_device *bdev;
 	int ret;
 	int nr = 0;
-	size_t page_offset = 0;
+	size_t pg_offset = 0;
 	size_t blocksize;
 	loff_t i_size = i_size_read(inode);
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -2008,9 +2008,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 
 	WARN_ON(!PageLocked(page));
-	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
+	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
-	   (page->index == end_index && !page_offset)) {
+	   (page->index == end_index && !pg_offset)) {
 		page->mapping->a_ops->invalidatepage(page, 0);
 		unlock_page(page);
 		return 0;
@@ -2020,12 +2020,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		char *userpage;
 
 		userpage = kmap_atomic(page, KM_USER0);
-		memset(userpage + page_offset, 0,
-		       PAGE_CACHE_SIZE - page_offset);
+		memset(userpage + pg_offset, 0,
+		       PAGE_CACHE_SIZE - pg_offset);
 		kunmap_atomic(userpage, KM_USER0);
 		flush_dcache_page(page);
 	}
-	page_offset = 0;
+	pg_offset = 0;
 
 	set_page_extent_mapped(page);
 
@@ -2088,7 +2088,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_start = page_end + 1;
 			break;
 		}
-		em = epd->get_extent(inode, page, page_offset, cur,
+		em = epd->get_extent(inode, page, pg_offset, cur,
 				     end - cur + 1, 1);
 		if (IS_ERR(em) || !em) {
 			SetPageError(page);
@@ -2113,12 +2113,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			unlock_extent(tree, unlock_start, cur + iosize -1,
 				      GFP_NOFS);
+
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
 			cur = cur + iosize;
-			page_offset += iosize;
+			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
@@ -2127,7 +2128,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 				   EXTENT_DIRTY, 0)) {
 			cur = cur + iosize;
-			page_offset += iosize;
+			pg_offset += iosize;
 			continue;
 		}
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
@@ -2141,6 +2142,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			SetPageError(page);
 		} else {
 			unsigned long max_nr = end_index + 1;
+
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
 				printk("warning page %lu not writeback, "
@@ -2150,14 +2152,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			}
 
 			ret = submit_extent_page(WRITE, tree, page, sector,
-						 iosize, page_offset, bdev,
+						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
 						 end_bio_extent_writepage, 0);
 			if (ret)
 				SetPageError(page);
 		}
 		cur = cur + iosize;
-		page_offset += iosize;
+		pg_offset += iosize;
 		nr++;
 	}
 done:
@@ -2579,7 +2581,8 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 				spin_unlock(&map->lock);
 				break;
 			}
-			if (em->start != start) {
+			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+			    em->start != start) {
 				spin_unlock(&map->lock);
 				free_extent_map(em);
 				break;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 81123277c2b..71b1ac15535 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -173,6 +173,9 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 
 static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 {
+	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -320,6 +323,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 {
 	int ret = 0;
 
+	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	BUG_ON(spin_trylock(&tree->lock));
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 56314217cfc..a3978ec2784 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -8,6 +8,9 @@
 #define EXTENT_MAP_INLINE (u64)-2
 #define EXTENT_MAP_DELALLOC (u64)-1
 
+/* bits for the flags field */
+#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+
 struct extent_map {
 	struct rb_node rb_node;
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index e02f1e5acb0..d9c69e16d36 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -192,7 +192,6 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 				 (char *)&sector_sum->sum);
 		sector_sum->offset = page_offset(bvec->bv_page) +
 			bvec->bv_offset;
-
 		sector_sum++;
 		bio_index++;
 		total_bytes += bvec->bv_len;
@@ -201,9 +200,6 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	}
 	btrfs_add_ordered_sum(inode, ordered, sums);
 	btrfs_put_ordered_extent(ordered);
-	if (total_bytes != bio->bi_size) {
-printk("warning, total bytes %lu bio size %u\n", total_bytes, bio->bi_size);
-	}
 	return 0;
 }
 
@@ -372,6 +368,7 @@ next_sector:
 		write_extent_buffer(leaf, &sector_sum->sum,
 				    (unsigned long)item, BTRFS_CRC32_SIZE);
 	}
+
 	total_bytes += root->sectorsize;
 	sector_sum++;
 	if (total_bytes < sums->len) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 40ad1b2958c..eccdb9562ba 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -358,9 +358,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_map *tmp;
 	u64 len = end - start + 1;
-	u64 next_start;
 	int ret;
 	int testend = 1;
 
@@ -381,8 +379,16 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			spin_unlock(&em_tree->lock);
 			break;
 		}
-		tmp = rb_entry(&em->rb_node, struct extent_map, rb_node);
-		next_start = tmp->start;
+		if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+			start = em->start + em->len;
+			free_extent_map(em);
+			spin_unlock(&em_tree->lock);
+			if (start < end) {
+				len = end - start + 1;
+				continue;
+			}
+			break;
+		}
 		remove_extent_mapping(em_tree, em);
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8803abc89bb..08dbe738b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -144,6 +144,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		em->len = ins.offset;
 		em->block_start = ins.objectid;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -483,6 +484,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	u64 alloc_hint = 0;
 	struct list_head list;
 	struct btrfs_key ins;
@@ -524,6 +527,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				       ordered_extent->len,
 				       ordered_extent->len, 0);
 	BUG_ON(ret);
+
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, ordered_extent->file_offset,
+			       ordered_extent->len);
+	if (em) {
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		free_extent_map(em);
+	}
+	spin_unlock(&em_tree->lock);
+
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
@@ -538,6 +552,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	btrfs_ordered_update_i_size(inode, ordered_extent);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
+
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
-- 
cgit v1.2.3


From e5a2217ef6ff088d08a27208929a6f9c635d672c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Jul 2008 20:42:20 -0400
Subject: Fix btrfs_wait_ordered_extent_range to properly wait

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c    |  3 ++-
 fs/btrfs/inode.c        | 26 ++++++++++++++++++++------
 fs/btrfs/ordered-data.c | 45 +++++++++++++++++++++++++++------------------
 3 files changed, 49 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d9c69e16d36..45127e4797c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -161,7 +161,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 
 	while(bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		if (offset >= ordered->file_offset + ordered->len) {
+		if (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 08dbe738b51..50ee4befac8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -128,7 +128,9 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		goto out;
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
@@ -144,6 +146,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 		em->len = ins.offset;
 		em->block_start = ins.objectid;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		while(1) {
 			spin_lock(&em_tree->lock);
@@ -156,6 +159,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 			btrfs_drop_extent_cache(inode, start,
 						start + ins.offset - 1);
 		}
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -487,6 +491,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_map *em;
 	u64 alloc_hint = 0;
+	u64 clear_start;
+	u64 clear_end;
 	struct list_head list;
 	struct btrfs_key ins;
 	int ret;
@@ -509,12 +515,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ins.objectid = ordered_extent->start;
 	ins.offset = ordered_extent->len;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
+
 	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
 					  trans->transid, inode->i_ino,
 					  ordered_extent->file_offset, &ins);
 	BUG_ON(ret);
 
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
+
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
@@ -528,13 +536,19 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				       ordered_extent->len, 0);
 	BUG_ON(ret);
 
-
 	spin_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, ordered_extent->file_offset,
-			       ordered_extent->len);
-	if (em) {
-		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-		free_extent_map(em);
+	clear_start = ordered_extent->file_offset;
+	clear_end = ordered_extent->file_offset + ordered_extent->len;
+	while(clear_start < clear_end) {
+		em = lookup_extent_mapping(em_tree, clear_start,
+					   clear_end - clear_start);
+		if (em) {
+			clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+			clear_start = em->start + em->len;
+			free_extent_map(em);
+		} else {
+			break;
+		}
 	}
 	spin_unlock(&em_tree->lock);
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1ddb7bceea9..c2b4a9c4ddb 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -324,22 +324,37 @@ void btrfs_start_ordered_extent(struct inode *inode,
 void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
+	u64 orig_end;
+	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
-	int found;
-	int should_wait = 0;
-
-again:
-	if (start + len < start)
-		end = (u64)-1;
-	else
-		end = start + len - 1;
-	found = 0;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+
+	if (start + len < start) {
+		wait_end = (inode->i_size + mask) & ~mask;
+		orig_end = (u64)-1;
+	} else {
+		orig_end = start + len - 1;
+		wait_end = orig_end;
+	}
+
+	/* start IO across the range first to instantiate any delalloc
+	 * extents
+	 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, wait_end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, wait_end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	end = orig_end;
+	wait_on_extent_writeback(&BTRFS_I(inode)->io_tree, start, orig_end);
+
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered) {
 			break;
 		}
-		if (ordered->file_offset >= start + len) {
+		if (ordered->file_offset > orig_end) {
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
@@ -347,21 +362,15 @@ again:
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
-		btrfs_start_ordered_extent(inode, ordered, should_wait);
-		found++;
+		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
 		btrfs_put_ordered_extent(ordered);
-		if (end == 0)
+		if (end == 0 || end == start)
 			break;
 		end--;
 	}
-	if (should_wait && found) {
-		should_wait = 0;
-		goto again;
-	}
 }
 
-
 /*
  * find an ordered extent corresponding to file_offset.  return NULL if
  * nothing is found, otherwise take a reference on the extent and return it
-- 
cgit v1.2.3


From 4a09675279674041862d2210635b0cc1f60be28e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Jul 2008 10:29:44 -0400
Subject: Btrfs: Data ordered fixes

* In btrfs_delete_inode, wait for ordered extents after calling
truncate_inode_pages.  This is much faster, and more correct

* Properly clear out the PageChecked bit everywhere we redirty the page.

* Change the writepage fixup handler to lock the page range and check to
see if an ordered extent had been inserted since the improperly dirtied
page was discovered

* Wait for ordered extents outside the transaction.  This isn't required
for locking rules but does improve transaction latencies

* Reduce contention on the alloc_mutex by dropping it while incrementing
refs on a node/leaf and while dropping refs on a leaf.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c  | 18 +++++++++++++++---
 fs/btrfs/file.c         |  1 +
 fs/btrfs/inode.c        | 23 ++++++++++++++++++-----
 fs/btrfs/ordered-data.c | 11 +++++++++--
 4 files changed, 43 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be2aef1cb7a..ccd49322f79 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -934,7 +934,6 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	if (!root->ref_cows)
 		return 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 	for (i = 0; i < nritems; i++) {
@@ -951,29 +950,36 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (disk_bytenr == 0)
 				continue;
+
+			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
 				    root->root_key.objectid, trans->transid,
 				    key.objectid, key.offset);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			if (ret) {
 				faili = i;
+				WARN_ON(1);
 				goto fail;
 			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			btrfs_node_key_to_cpu(buf, &key, i);
+
+			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
 					   btrfs_level_size(root, level - 1),
 					   root->root_key.objectid,
 					   trans->transid,
 					   level - 1, key.objectid);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			if (ret) {
 				faili = i;
+				WARN_ON(1);
 				goto fail;
 			}
 		}
 	}
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
 fail:
 	WARN_ON(1);
@@ -1004,7 +1010,6 @@ fail:
 		}
 	}
 #endif
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -2180,6 +2185,8 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 
@@ -2197,12 +2204,17 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 		if (disk_bytenr == 0)
 			continue;
+
+		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 	}
+
+	mutex_lock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eccdb9562ba..591a30208ac 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -75,6 +75,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 	for (i = 0; i < num_pages; i++) {
 		if (!pages[i])
 			break;
+		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
 		mark_page_accessed(pages[i]);
 		page_cache_release(pages[i]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 50ee4befac8..8fb6dc25e7a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -418,7 +418,7 @@ void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 
 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
 	page = fixup->page;
-
+again:
 	lock_page(page);
 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
 		ClearPageChecked(page);
@@ -430,9 +430,21 @@ void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
 	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
-	ordered = btrfs_lookup_ordered_extent(inode, page_start);
-	if (ordered)
+
+	/* already ordered? We're done */
+	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     EXTENT_ORDERED, 0)) {
 		goto out;
+	}
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
+			      page_end, GFP_NOFS);
+		unlock_page(page);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		goto again;
+	}
 
 	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			    GFP_NOFS);
@@ -1465,11 +1477,11 @@ void btrfs_delete_inode(struct inode *inode)
 	unsigned long nr;
 	int ret;
 
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
 	}
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_i_size_write(inode, 0);
 	trans = btrfs_start_transaction(root, 1);
@@ -2707,6 +2719,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 1, 1, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
+	ClearPageChecked(page);
 	if (PagePrivate(page)) {
 		invalidate_extent_lru(tree, page_offset(page),
 				      PAGE_CACHE_SIZE);
@@ -2818,10 +2831,10 @@ static void btrfs_truncate(struct inode *inode)
 		return;
 
 	btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
-	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_i_size_write(inode, inode->i_size);
 
 	/* FIXME, add redo link to tree so we don't leak on crash */
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c2b4a9c4ddb..0d87795fdd8 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -336,7 +336,7 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 		orig_end = start + len - 1;
 		wait_end = orig_end;
 	}
-
+again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
@@ -369,6 +369,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 			break;
 		end--;
 	}
+	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+		printk("inode %lu still ordered or delalloc after wait "
+		       "%llu %llu\n", inode->i_ino,
+		       (unsigned long long)start,
+		       (unsigned long long)orig_end);
+		goto again;
+	}
 }
 
 /*
@@ -545,7 +553,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 			sector_sums = &ordered_sum->sums;
 			for (i = 0; i < num_sectors; i++) {
 				if (sector_sums[i].offset == offset) {
-printk("find ordered sum inode %lu offset %Lu\n", inode->i_ino, offset);
 					*sum = sector_sums[i].sum;
 					ret = 0;
 					goto out;
-- 
cgit v1.2.3


From 6af118ce51b52ceda357c671550c79628b9c4a65 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 11:18:07 -0400
Subject: Btrfs: Index extent buffers in an rbtree

Before, extent buffers were a temporary object, meant to map a number of pages
at once and collect operations on them.

But, a few extra fields have crept in, and they are also the best place to
store a per-tree block lock field as well.  This commit puts the extent
buffers into an rbtree, and ensures a single extent buffer for each
tree block.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  26 ++---
 fs/btrfs/extent_io.c | 309 +++++++++++++++++++--------------------------------
 fs/btrfs/extent_io.h |  11 +-
 fs/btrfs/inode.c     |   3 -
 4 files changed, 129 insertions(+), 220 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 99bb385c298..86e84a8579e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -381,7 +381,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
-	release_extent_buffer_tail_pages(eb);
 err:
 	free_extent_buffer(eb);
 out:
@@ -563,21 +562,21 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	struct extent_map_tree *map;
 	int ret;
 
-	if (page_count(page) > 3) {
-		/* once for page->private, once for the caller, once
-		 * once for the page cache
-		 */
-		return 0;
-	}
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
+
 	ret = try_release_extent_state(map, tree, page, gfp_flags);
+	if (!ret) {
+		return 0;
+	}
+
+	ret = try_release_extent_buffer(tree, page);
 	if (ret == 1) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
 	}
+
 	return ret;
 }
 
@@ -588,7 +587,8 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+		printk("warning page private not zero on page %Lu\n",
+		       page_offset(page));
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -1456,7 +1456,6 @@ fail_tree_root:
 	free_extent_buffer(tree_root->node);
 fail_sys_array:
 fail_sb_buffer:
-	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
@@ -1705,13 +1704,6 @@ int close_ctree(struct btrfs_root *root)
 
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 
-	extent_io_tree_empty_lru(&fs_info->free_space_cache);
-	extent_io_tree_empty_lru(&fs_info->block_group_cache);
-	extent_io_tree_empty_lru(&fs_info->pinned_extents);
-	extent_io_tree_empty_lru(&fs_info->pending_del);
-	extent_io_tree_empty_lru(&fs_info->extent_ins);
-	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
-
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
 	btrfs_stop_workers(&fs_info->fixup_workers);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d4a63ae7ed1..32bb4ed3723 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -91,29 +91,16 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask)
 {
 	tree->state.rb_node = NULL;
+	tree->buffer.rb_node = NULL;
 	tree->ops = NULL;
 	tree->dirty_bytes = 0;
 	spin_lock_init(&tree->lock);
-	spin_lock_init(&tree->lru_lock);
+	spin_lock_init(&tree->buffer_lock);
 	tree->mapping = mapping;
-	INIT_LIST_HEAD(&tree->buffer_lru);
-	tree->lru_size = 0;
 	tree->last = NULL;
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
-void extent_io_tree_empty_lru(struct extent_io_tree *tree)
-{
-	struct extent_buffer *eb;
-	while(!list_empty(&tree->buffer_lru)) {
-		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
-				lru);
-		list_del_init(&eb->lru);
-		free_extent_buffer(eb);
-	}
-}
-EXPORT_SYMBOL(extent_io_tree_empty_lru);
-
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
@@ -245,6 +232,50 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	return ret;
 }
 
+static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
+					  u64 offset, struct rb_node *node)
+{
+	struct rb_root *root = &tree->buffer;
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct extent_buffer *eb;
+
+	while(*p) {
+		parent = *p;
+		eb = rb_entry(parent, struct extent_buffer, rb_node);
+
+		if (offset < eb->start)
+			p = &(*p)->rb_left;
+		else if (offset > eb->start)
+			p = &(*p)->rb_right;
+		else
+			return eb;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
+					   u64 offset)
+{
+	struct rb_root *root = &tree->buffer;
+	struct rb_node * n = root->rb_node;
+	struct extent_buffer *eb;
+
+	while(n) {
+		eb = rb_entry(n, struct extent_buffer, rb_node);
+		if (offset < eb->start)
+			n = n->rb_left;
+		else if (offset > eb->start)
+			n = n->rb_right;
+		else
+			return eb;
+	}
+	return NULL;
+}
+
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -1817,9 +1848,8 @@ void set_page_extent_mapped(struct page *page)
 {
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		WARN_ON(!page->mapping->a_ops->invalidatepage);
-		set_page_private(page, EXTENT_PAGE_PRIVATE);
 		page_cache_get(page);
+		set_page_private(page, EXTENT_PAGE_PRIVATE);
 	}
 }
 
@@ -2627,51 +2657,6 @@ out:
 	return sector;
 }
 
-static int add_lru(struct extent_io_tree *tree, struct extent_buffer *eb)
-{
-	if (list_empty(&eb->lru)) {
-		extent_buffer_get(eb);
-		list_add(&eb->lru, &tree->buffer_lru);
-		tree->lru_size++;
-		if (tree->lru_size >= BUFFER_LRU_MAX) {
-			struct extent_buffer *rm;
-			rm = list_entry(tree->buffer_lru.prev,
-					struct extent_buffer, lru);
-			tree->lru_size--;
-			list_del_init(&rm->lru);
-			free_extent_buffer(rm);
-		}
-	} else
-		list_move(&eb->lru, &tree->buffer_lru);
-	return 0;
-}
-static struct extent_buffer *find_lru(struct extent_io_tree *tree,
-				      u64 start, unsigned long len)
-{
-	struct list_head *lru = &tree->buffer_lru;
-	struct list_head *cur = lru->next;
-	struct extent_buffer *eb;
-
-	if (list_empty(lru))
-		return NULL;
-
-	do {
-		eb = list_entry(cur, struct extent_buffer, lru);
-		if (eb->start == start && eb->len == len) {
-			extent_buffer_get(eb);
-			return eb;
-		}
-		cur = cur->next;
-	} while (cur != lru);
-	return NULL;
-}
-
-static inline unsigned long num_extent_pages(u64 start, u64 len)
-{
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
-}
-
 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
@@ -2688,44 +2673,10 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-int release_extent_buffer_tail_pages(struct extent_buffer *eb)
-{
-	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
-	struct page *page;
-	unsigned long i;
-
-	if (num_pages == 1)
-		return 0;
-	for (i = 1; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		page_cache_release(page);
-	}
-	return 0;
-}
-
-
-int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
-			  unsigned long len)
+static inline unsigned long num_extent_pages(u64 start, u64 len)
 {
-	struct list_head *lru = &tree->buffer_lru;
-	struct list_head *cur = lru->next;
-	struct extent_buffer *eb;
-	int found = 0;
-
-	spin_lock(&tree->lru_lock);
-	if (list_empty(lru))
-		goto out;
-
-	do {
-		eb = list_entry(cur, struct extent_buffer, lru);
-		if (eb->start <= start && eb->start + eb->len > start) {
-			eb->flags &= ~EXTENT_UPTODATE;
-		}
-		cur = cur->next;
-	} while (cur != lru);
-out:
-	spin_unlock(&tree->lru_lock);
-	return found;
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
 }
 
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
@@ -2736,15 +2687,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	struct extent_buffer *eb = NULL;
 	unsigned long flags;
 
-	spin_lock(&tree->lru_lock);
-	eb = find_lru(tree, start, len);
-	spin_unlock(&tree->lru_lock);
-	if (eb) {
-		return eb;
-	}
-
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
-	INIT_LIST_HEAD(&eb->lru);
 	eb->start = start;
 	eb->len = len;
 	spin_lock_irqsave(&leak_lock, flags);
@@ -2773,17 +2716,24 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	unsigned long i;
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
+	struct extent_buffer *exists = NULL;
 	struct page *p;
 	struct address_space *mapping = tree->mapping;
 	int uptodate = 1;
 
+	spin_lock(&tree->buffer_lock);
+	eb = buffer_search(tree, start);
+	if (eb) {
+		atomic_inc(&eb->refs);
+		spin_unlock(&tree->buffer_lock);
+		return eb;
+	}
+	spin_unlock(&tree->buffer_lock);
+
 	eb = __alloc_extent_buffer(tree, start, len, mask);
 	if (!eb)
 		return NULL;
 
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
 	if (page0) {
 		eb->first_page = page0;
 		i = 1;
@@ -2800,7 +2750,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
 		if (!p) {
 			WARN_ON(1);
-			goto fail;
+			goto free_eb;
 		}
 		set_page_extent_mapped(p);
 		mark_page_accessed(p);
@@ -2818,25 +2768,28 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		eb->flags |= EXTENT_UPTODATE;
 	eb->flags |= EXTENT_BUFFER_FILLED;
 
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
+	spin_lock(&tree->buffer_lock);
+	exists = buffer_tree_insert(tree, start, &eb->rb_node);
+	if (exists) {
+		/* add one reference for the caller */
+		atomic_inc(&exists->refs);
+		spin_unlock(&tree->buffer_lock);
+		goto free_eb;
+	}
+	spin_unlock(&tree->buffer_lock);
+
+	/* add one reference for the tree */
+	atomic_inc(&eb->refs);
 	return eb;
 
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
+free_eb:
 	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
+		return exists;
+	for (index = 1; index < i; index++)
 		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
+	page_cache_release(extent_buffer_page(eb, 0));
 	__free_extent_buffer(eb);
-	return NULL;
+	return exists;
 }
 EXPORT_SYMBOL(alloc_extent_buffer);
 
@@ -2844,89 +2797,27 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					 u64 start, unsigned long len,
 					  gfp_t mask)
 {
-	unsigned long num_pages = num_extent_pages(start, len);
-	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	struct extent_buffer *eb;
-	struct page *p;
-	struct address_space *mapping = tree->mapping;
-	int uptodate = 1;
 
-	eb = __alloc_extent_buffer(tree, start, len, mask);
-	if (!eb)
-		return NULL;
-
-	if (eb->flags & EXTENT_BUFFER_FILLED)
-		goto lru_add;
-
-	for (i = 0; i < num_pages; i++, index++) {
-		p = find_get_page(mapping, index);
-		if (!p) {
-			goto fail;
-		}
-		if (TestSetPageLocked(p)) {
-			page_cache_release(p);
-			goto fail;
-		}
-
-		set_page_extent_mapped(p);
-		mark_page_accessed(p);
-
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
-
-		if (!PageUptodate(p))
-			uptodate = 0;
-		unlock_page(p);
-	}
-	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
+	spin_lock(&tree->buffer_lock);
+	eb = buffer_search(tree, start);
+	if (eb)
+		atomic_inc(&eb->refs);
+	spin_unlock(&tree->buffer_lock);
 
-lru_add:
-	spin_lock(&tree->lru_lock);
-	add_lru(tree, eb);
-	spin_unlock(&tree->lru_lock);
 	return eb;
-fail:
-	spin_lock(&tree->lru_lock);
-	list_del_init(&eb->lru);
-	spin_unlock(&tree->lru_lock);
-	if (!atomic_dec_and_test(&eb->refs))
-		return NULL;
-	for (index = 1; index < i; index++) {
-		page_cache_release(extent_buffer_page(eb, index));
-	}
-	if (i > 0)
-		page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
-	return NULL;
 }
 EXPORT_SYMBOL(find_extent_buffer);
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
-	unsigned long i;
-	unsigned long num_pages;
-
 	if (!eb)
 		return;
 
 	if (!atomic_dec_and_test(&eb->refs))
 		return;
 
-	WARN_ON(!list_empty(&eb->lru));
-	num_pages = num_extent_pages(eb->start, eb->len);
-
-	for (i = 1; i < num_pages; i++) {
-		page_cache_release(extent_buffer_page(eb, i));
-	}
-	page_cache_release(extent_buffer_page(eb, 0));
-	__free_extent_buffer(eb);
+	WARN_ON(1);
 }
 EXPORT_SYMBOL(free_extent_buffer);
 
@@ -3583,3 +3474,35 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	}
 }
 EXPORT_SYMBOL(memmove_extent_buffer);
+
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+{
+	u64 start = page_offset(page);
+	struct extent_buffer *eb;
+	int ret = 1;
+	unsigned long i;
+	unsigned long num_pages;
+
+	spin_lock(&tree->buffer_lock);
+	eb = buffer_search(tree, start);
+	if (!eb)
+		goto out;
+
+	if (atomic_read(&eb->refs) > 1) {
+		ret = 0;
+		goto out;
+	}
+	/* at this point we can safely release the extent buffer */
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = extent_buffer_page(eb, i);
+		page_cache_release(page);
+	}
+	rb_erase(&eb->rb_node, &tree->buffer);
+	__free_extent_buffer(eb);
+out:
+	spin_unlock(&tree->buffer_lock);
+	return ret;
+}
+EXPORT_SYMBOL(try_release_extent_buffer);
+
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 23affd27af5..dd367617d78 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -54,13 +54,12 @@ struct extent_io_ops {
 
 struct extent_io_tree {
 	struct rb_root state;
+	struct rb_root buffer;
 	struct address_space *mapping;
 	u64 dirty_bytes;
 	spinlock_t lock;
+	spinlock_t buffer_lock;
 	struct extent_io_ops *ops;
-	spinlock_t lru_lock;
-	struct list_head buffer_lru;
-	int lru_size;
 	struct extent_state *last;
 };
 
@@ -87,10 +86,10 @@ struct extent_buffer {
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
-	struct list_head lru;
 	atomic_t refs;
 	int flags;
 	struct list_head leak_list;
+	struct rb_node rb_node;
 };
 
 struct extent_map_tree;
@@ -112,10 +111,10 @@ typedef struct extent_map *(get_extent_t)(struct inode *inode,
 
 void extent_io_tree_init(struct extent_io_tree *tree,
 			  struct address_space *mapping, gfp_t mask);
-void extent_io_tree_empty_lru(struct extent_io_tree *tree);
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
+int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
 int try_release_extent_state(struct extent_map_tree *map,
 			     struct extent_io_tree *tree, struct page *page,
 			     gfp_t mask);
@@ -241,8 +240,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long *map_start,
 		      unsigned long *map_len, int km);
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
-int invalidate_extent_lru(struct extent_io_tree *tree, u64 start,
-			  unsigned long len);
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8fb6dc25e7a..60852ada658 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2670,7 +2670,6 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
 	if (ret == 1) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2721,8 +2720,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 
 	ClearPageChecked(page);
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page),
-				      PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
-- 
cgit v1.2.3


From a61e6f29dc7c9d56a776a518eed92bbc61848263 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 11:18:08 -0400
Subject: Btrfs: Use a mutex in the extent buffer for tree block locking

This replaces the use of the page cache lock bit for locking, which wasn't
suitable for block size < page size and couldn't be used recursively.

The mutexes alone don't fix either problem, but they are the first step.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  7 +------
 fs/btrfs/extent_io.c   |  9 +++++++++
 fs/btrfs/extent_io.h   |  1 +
 fs/btrfs/locking.c     | 13 ++++++-------
 4 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ccd49322f79..c51cd11de20 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1451,7 +1451,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (!btrfs_try_tree_lock(buf) &&
+			if (btrfs_try_tree_lock(buf) &&
 			    btrfs_buffer_uptodate(buf, 0)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
@@ -3345,11 +3345,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		set_state_private(block_group_cache, found_key.objectid,
 				  (unsigned long)cache);
 
-		/* hack for now */
-		if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
-			cache_block_group(root->fs_info->extent_root,
-					  cache);
-		}
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 32bb4ed3723..7380449cb5b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2690,6 +2690,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
+	mutex_init(&eb->mutex);
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -2837,6 +2838,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
+		lock_page(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
 		else
@@ -2854,6 +2856,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			end  = start + PAGE_CACHE_SIZE - 1;
 			if (test_range_bit(tree, start, end,
 					   EXTENT_DIRTY, 0)) {
+				unlock_page(page);
 				continue;
 			}
 		}
@@ -2865,6 +2868,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 						PAGECACHE_TAG_DIRTY);
 		}
 		read_unlock_irq(&page->mapping->tree_lock);
+		unlock_page(page);
 	}
 	return 0;
 }
@@ -2893,12 +2897,17 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		 * on us if the page isn't already dirty.
 		 */
 		if (i == 0) {
+			lock_page(page);
 			set_page_extent_head(page, eb->len);
 		} else if (PagePrivate(page) &&
 			   page->private != EXTENT_PAGE_PRIVATE) {
+			lock_page(page);
 			set_page_extent_mapped(page);
+			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+		if (i == 0)
+			unlock_page(page);
 	}
 	return set_extent_dirty(tree, eb->start,
 				eb->start + eb->len - 1, GFP_NOFS);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index dd367617d78..6c03e6a1993 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -90,6 +90,7 @@ struct extent_buffer {
 	int flags;
 	struct list_head leak_list;
 	struct rb_node rb_node;
+	struct mutex mutex;
 };
 
 struct extent_map_tree;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 058a506a0dd..01a9ac2be3f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -29,32 +29,31 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
 
-	if (!TestSetPageLocked(eb->first_page))
+	if (mutex_trylock(&eb->mutex))
 		return 0;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (!TestSetPageLocked(eb->first_page))
+		if (mutex_trylock(&eb->mutex))
 			return 0;
 	}
 	cpu_relax();
-	lock_page(eb->first_page);
+	mutex_lock(&eb->mutex);
 	return 0;
 }
 
 int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	return TestSetPageLocked(eb->first_page);
+	return mutex_trylock(&eb->mutex);
 }
 
 int btrfs_tree_unlock(struct extent_buffer *eb)
 {
-	WARN_ON(!PageLocked(eb->first_page));
-	unlock_page(eb->first_page);
+	mutex_unlock(&eb->mutex);
 	return 0;
 }
 
 int btrfs_tree_locked(struct extent_buffer *eb)
 {
-	return PageLocked(eb->first_page);
+	return mutex_is_locked(&eb->mutex);
 }
 
-- 
cgit v1.2.3


From f421950f86bf96a11fef932e167ab2e70d4c43a0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 11:18:09 -0400
Subject: Btrfs: Fix some data=ordered related data corruptions

Stress testing was showing data checksum errors, most of which were caused
by a lookup bug in the extent_map tree.  The tree was caching the last
pointer returned, and searches would check the last pointer first.

But search callers also expect the search to return the very first
matching extent in the range, which wasn't always true when the cached
last pointer was used.

For now, the code to cache the last return value is just removed.  The
caching is easy to fix, but I think lookups are rare enough that it isn't
required anymore.

This commit also replaces do_sync_mapping_range with a local copy of the
related functions.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |   2 +
 fs/btrfs/extent_io.c    |  20 ---------
 fs/btrfs/extent_io.h    |   1 -
 fs/btrfs/extent_map.c   |   9 ----
 fs/btrfs/extent_map.h   |   1 -
 fs/btrfs/file.c         |  15 +++----
 fs/btrfs/inode.c        |  54 +++++++++++++----------
 fs/btrfs/ordered-data.c | 115 ++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/ordered-data.h |   4 ++
 fs/btrfs/transaction.c  |   2 -
 10 files changed, 140 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96ab2797c09..f8fccdac305 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1590,6 +1590,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7380449cb5b..9965993748d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -97,7 +97,6 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 	spin_lock_init(&tree->lock);
 	spin_lock_init(&tree->buffer_lock);
 	tree->mapping = mapping;
-	tree->last = NULL;
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
@@ -173,12 +172,6 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
-	if (tree->last) {
-		struct extent_state *state;
-		state = tree->last;
-		if (state->start <= offset && offset <= state->end)
-			return &tree->last->rb_node;
-	}
 	while(n) {
 		entry = rb_entry(n, struct tree_entry, rb_node);
 		prev = n;
@@ -189,7 +182,6 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 		else if (offset > entry->end)
 			n = n->rb_right;
 		else {
-			tree->last = rb_entry(n, struct extent_state, rb_node);
 			return n;
 		}
 	}
@@ -223,10 +215,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 
 	ret = __etree_search(tree, offset, &prev, NULL);
 	if (!ret) {
-		if (prev) {
-			tree->last = rb_entry(prev, struct extent_state,
-					      rb_node);
-		}
 		return prev;
 	}
 	return ret;
@@ -301,8 +289,6 @@ static int merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			state->start = other->start;
 			other->tree = NULL;
-			if (tree->last == other)
-				tree->last = state;
 			rb_erase(&other->rb_node, &tree->state);
 			free_extent_state(other);
 		}
@@ -314,8 +300,6 @@ static int merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			other->start = state->start;
 			state->tree = NULL;
-			if (tree->last == state)
-				tree->last = other;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
 		}
@@ -378,7 +362,6 @@ static int insert_state(struct extent_io_tree *tree,
 		return -EEXIST;
 	}
 	state->tree = tree;
-	tree->last = state;
 	merge_state(tree, state);
 	return 0;
 }
@@ -444,9 +427,6 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	if (delete || state->state == 0) {
 		if (state->tree) {
 			clear_state_cb(tree, state, state->state);
-			if (tree->last == state) {
-				tree->last = extent_state_next(state);
-			}
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
 			free_extent_state(state);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6c03e6a1993..315cfceae31 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -60,7 +60,6 @@ struct extent_io_tree {
 	spinlock_t lock;
 	spinlock_t buffer_lock;
 	struct extent_io_ops *ops;
-	struct extent_state *last;
 };
 
 struct extent_state {
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 71b1ac15535..8a502ee2f23 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -42,7 +42,6 @@ void extent_map_exit(void)
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
 {
 	tree->map.rb_node = NULL;
-	tree->last = NULL;
 	spin_lock_init(&tree->lock);
 }
 EXPORT_SYMBOL(extent_map_tree_init);
@@ -239,7 +238,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		merge->in_tree = 0;
 		free_extent_map(merge);
 	}
-	tree->last = em;
 out:
 	return ret;
 }
@@ -273,10 +271,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	u64 end = range_end(start, len);
 
 	BUG_ON(spin_trylock(&tree->lock));
-	em = tree->last;
-	if (em && end > em->start && start < extent_map_end(em))
-		goto found;
-
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
@@ -305,7 +299,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 
 found:
 	atomic_inc(&em->refs);
-	tree->last = em;
 out:
 	return em;
 }
@@ -327,8 +320,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	BUG_ON(spin_trylock(&tree->lock));
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
-	if (tree->last == em)
-		tree->last = NULL;
 	return ret;
 }
 EXPORT_SYMBOL(remove_extent_mapping);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index a3978ec2784..26ac6fe0b26 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,6 @@ struct extent_map {
 
 struct extent_map_tree {
 	struct rb_root map;
-	struct extent_map *last;
 	spinlock_t lock;
 };
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 591a30208ac..e5ffb66ad32 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -381,14 +381,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			break;
 		}
 		if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-			start = em->start + em->len;
-			free_extent_map(em);
-			spin_unlock(&em_tree->lock);
-			if (start < end) {
-				len = end - start + 1;
-				continue;
-			}
-			break;
+			printk(KERN_CRIT "inode %lu trying to drop pinned "
+			       "extent start %llu end %llu, em [%llu %llu]\n",
+			       inode->i_ino,
+			       (unsigned long long)start,
+			       (unsigned long long)end,
+			       (unsigned long long)em->start,
+			       (unsigned long long)em->len);
 		}
 		remove_extent_mapping(em_tree, em);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 60852ada658..3da12a4d913 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -485,7 +485,7 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
 	if (!fixup)
 		return -EAGAIN;
-printk("queueing worker to fixup page %lu %Lu\n", inode->i_ino, page_offset(page));
+
 	SetPageChecked(page);
 	page_cache_get(page);
 	fixup->work.func = btrfs_writepage_fixup_worker;
@@ -502,11 +502,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_map *em;
+	struct extent_map *em_orig;
 	u64 alloc_hint = 0;
 	u64 clear_start;
 	u64 clear_end;
 	struct list_head list;
 	struct btrfs_key ins;
+	struct rb_node *rb;
 	int ret;
 
 	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
@@ -535,6 +537,22 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
+	spin_lock(&em_tree->lock);
+	clear_start = ordered_extent->file_offset;
+	clear_end = ordered_extent->file_offset + ordered_extent->len;
+	em = lookup_extent_mapping(em_tree, clear_start,
+				   ordered_extent->len);
+	em_orig = em;
+	while(em && clear_start < extent_map_end(em) && clear_end > em->start) {
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		rb = rb_next(&em->rb_node);
+		if (!rb)
+			break;
+		em = rb_entry(rb, struct extent_map, rb_node);
+	}
+	free_extent_map(em_orig);
+	spin_unlock(&em_tree->lock);
+
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
@@ -548,22 +566,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				       ordered_extent->len, 0);
 	BUG_ON(ret);
 
-	spin_lock(&em_tree->lock);
-	clear_start = ordered_extent->file_offset;
-	clear_end = ordered_extent->file_offset + ordered_extent->len;
-	while(clear_start < clear_end) {
-		em = lookup_extent_mapping(em_tree, clear_start,
-					   clear_end - clear_start);
-		if (em) {
-			clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-			clear_start = em->start + em->len;
-			free_extent_map(em);
-		} else {
-			break;
-		}
-	}
-	spin_unlock(&em_tree->lock);
-
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
@@ -2318,7 +2320,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	u64 extent_end = 0;
 	u64 objectid = inode->i_ino;
 	u32 found_type;
-	struct btrfs_path *path;
+	struct btrfs_path *path = NULL;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_file_extent_item *item;
 	struct extent_buffer *leaf;
@@ -2328,9 +2330,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
 again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
@@ -2354,6 +2353,12 @@ again:
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
+
+	if (!path) {
+		path = btrfs_alloc_path();
+		BUG_ON(!path);
+	}
+
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -2530,7 +2535,8 @@ insert:
 	}
 	spin_unlock(&em_tree->lock);
 out:
-	btrfs_free_path(path);
+	if (path)
+		btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
 		if (!err) {
@@ -2643,8 +2649,8 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-static int btrfs_writepages(struct address_space *mapping,
-			    struct writeback_control *wbc)
+int btrfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(mapping->host)->io_tree;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0d87795fdd8..830dbaea685 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -19,6 +19,8 @@
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
@@ -307,12 +309,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
-#else
-	do_sync_mapping_range(inode->i_mapping, start, end,
-			      SYNC_FILE_RANGE_WRITE);
-#endif
+	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
 	if (wait)
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -327,28 +324,26 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 	u64 orig_end;
 	u64 wait_end;
 	struct btrfs_ordered_extent *ordered;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 
 	if (start + len < start) {
-		wait_end = (inode->i_size + mask) & ~mask;
-		orig_end = (u64)-1;
+		orig_end = INT_LIMIT(loff_t);
 	} else {
 		orig_end = start + len - 1;
-		wait_end = orig_end;
+		if (orig_end > INT_LIMIT(loff_t))
+			orig_end = INT_LIMIT(loff_t);
 	}
+	wait_end = orig_end;
 again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-	do_sync_file_range(file, start, wait_end, SYNC_FILE_RANGE_WRITE);
-#else
-	do_sync_mapping_range(inode->i_mapping, start, wait_end,
-			      SYNC_FILE_RANGE_WRITE);
-#endif
-	end = orig_end;
-	wait_on_extent_writeback(&BTRFS_I(inode)->io_tree, start, orig_end);
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
+
+	btrfs_wait_on_page_writeback_range(inode->i_mapping,
+					   start >> PAGE_CACHE_SHIFT,
+					   orig_end >> PAGE_CACHE_SHIFT);
 
+	end = orig_end;
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered) {
@@ -565,3 +560,87 @@ out:
 	return ret;
 }
 
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * @mapping:	address space structure to write
+ * @start:	offset in bytes where the range starts
+ * @end:	offset in bytes where the range ends (inclusive)
+ * @sync_mode:	enable synchronous operation
+ *
+ * Start writeback against all of a mapping's dirty pages that lie
+ * within the byte offsets <start, end> inclusive.
+ *
+ * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
+ * opposed to a regular memory cleansing writeback.  The difference between
+ * these two operations is that if a dirty page/buffer is encountered, it must
+ * be waited upon, and not just skipped over.
+ */
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+			   loff_t end, int sync_mode)
+{
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = mapping->nrpages * 2,
+		.range_start = start,
+		.range_end = end,
+		.for_writepages = 1,
+	};
+	return btrfs_writepages(mapping, &wbc);
+}
+
+/**
+ * taken from mm/filemap.c because it isn't exported
+ *
+ * wait_on_page_writeback_range - wait for writeback to complete
+ * @mapping:	target address_space
+ * @start:	beginning page index
+ * @end:	ending page index
+ *
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
+ */
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+				       pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	int nr_pages;
+	int ret = 0;
+	pgoff_t index;
+
+	if (end < start)
+		return 0;
+
+	pagevec_init(&pvec, 0);
+	index = start;
+	while ((index <= end) &&
+			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+		unsigned i;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/* until radix tree lookup accepts end_index */
+			if (page->index > end)
+				continue;
+
+			wait_on_page_writeback(page);
+			if (PageError(page))
+				ret = -EIO;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+
+	/* Check for outstanding write errors */
+	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
+		ret = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &mapping->flags))
+		ret = -EIO;
+
+	return ret;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1794efd13ca..8e8e3c0404f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -132,4 +132,8 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
+				       pgoff_t start, pgoff_t end);
+int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
+			   loff_t end, int sync_mode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 05823904ecb..38c75a0256c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -649,7 +649,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-printk("commit trans %Lu\n", trans->transid);
 	trans->transaction->in_commit = 1;
 	trans->transaction->blocked = 1;
 	cur_trans = trans->transaction;
@@ -745,7 +744,6 @@ printk("commit trans %Lu\n", trans->transid);
 		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
-printk("done commit trans %Lu\n", trans->transid);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
-- 
cgit v1.2.3


From 6dddcbeb28b34620ad033f1e8d9f6960bafdd7d2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 11:18:09 -0400
Subject: Btrfs: Use mutex_lock_nested for tree locking

Lockdep has the notion of locking subclasses so that you can identify
locks you expect to be taken after other locks of the same class.  This
changes the per-extent buffer btree locking routines to use a subclass based
on the level in the tree.

Unfortunately, lockdep can only handle 8 total subclasses, and the btrfs
max level is also 8.  So when lockdep is on, use a lower max level.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 6 +++++-
 fs/btrfs/locking.c | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f8fccdac305..eeb5afa6e9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,7 +42,11 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
-#define BTRFS_MAX_LEVEL 8
+#ifdef CONFIG_LOCKDEP
+# define BTRFS_MAX_LEVEL 7
+#else
+# define BTRFS_MAX_LEVEL 8
+#endif
 
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01a9ac2be3f..27a02376ab1 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -37,7 +37,7 @@ int btrfs_tree_lock(struct extent_buffer *eb)
 			return 0;
 	}
 	cpu_relax();
-	mutex_lock(&eb->mutex);
+	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
-- 
cgit v1.2.3


From e34a5b4f77b8448cf2863ad0cbac35e2c2a86a0a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 12:08:37 -0400
Subject: Btrfs: Add some conditional schedules near the alloc_mutex

This helps prevent stalls, especially while the snapshot cleaner is
running hard.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c51cd11de20..72fa28236e5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -937,6 +937,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 	for (i = 0; i < nritems; i++) {
+		cond_resched();
 		if (level == 0) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
@@ -2189,6 +2190,7 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
+		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-- 
cgit v1.2.3


From c286ac48ed7aaf53586f575af6053ae2a0f8554a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 23:06:41 -0400
Subject: Btrfs: alloc_mutex latency reduction

This releases the alloc_mutex in a few places that hold it across overly
long operations.  btrfs_lookup_block_group is changed so that it doesn't
need the mutex at all.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/extent-tree.c | 100 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 81 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eeb5afa6e9b..90504ba7f83 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -483,6 +483,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	struct btrfs_space_info *space_info;
+	spinlock_t lock;
 	u64 pinned;
 	u64 flags;
 	int cached;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72fa28236e5..febc6295c7a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -319,7 +319,7 @@ no_cache:
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	}
 	cache_miss = 0;
-	cache = __btrfs_find_block_group(root, cache, last, data, 0);
+	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
@@ -379,19 +379,25 @@ __btrfs_find_block_group(struct btrfs_root *root,
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_first_block_group(info, search_start);
 		if (shint && block_group_bits(shint, data) && !shint->ro) {
+			spin_lock(&shint->lock);
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
+				spin_unlock(&shint->lock);
 				return shint;
 			}
+			spin_unlock(&shint->lock);
 		}
 	}
 	if (hint && !hint->ro && block_group_bits(hint, data)) {
+		spin_lock(&hint->lock);
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
+			spin_unlock(&hint->lock);
 			return hint;
 		}
+		spin_unlock(&hint->lock);
 		last = hint->key.objectid + hint->key.offset;
 	} else {
 		if (hint)
@@ -413,6 +419,7 @@ again:
 		}
 
 		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
+		spin_lock(&cache->lock);
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
@@ -420,9 +427,11 @@ again:
 			free_check = div_factor(cache->key.offset, factor);
 			if (used + cache->pinned < free_check) {
 				found_group = cache;
+				spin_unlock(&cache->lock);
 				goto found;
 			}
 		}
+		spin_unlock(&cache->lock);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -447,9 +456,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 {
 
 	struct btrfs_block_group_cache *ret;
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
@@ -1262,21 +1269,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		set_extent_bits(&info->block_group_cache, start, end,
 				BLOCK_GROUP_DIRTY, GFP_NOFS);
 
+		spin_lock(&cache->lock);
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
 			old_val += num_bytes;
 			cache->space_info->bytes_used += num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			spin_unlock(&cache->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->space_info->bytes_used -= num_bytes;
+			btrfs_set_block_group_used(&cache->item, old_val);
+			spin_unlock(&cache->lock);
 			if (mark_free) {
 				set_extent_dirty(&info->free_space_cache,
 						 bytenr, bytenr + num_bytes - 1,
 						 GFP_NOFS);
 			}
 		}
-		btrfs_set_block_group_used(&cache->item, old_val);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -1325,14 +1336,18 @@ static int update_pinned_extents(struct btrfs_root *root,
 		}
 		if (pin) {
 			if (cache) {
+				spin_lock(&cache->lock);
 				cache->pinned += len;
 				cache->space_info->bytes_pinned += len;
+				spin_unlock(&cache->lock);
 			}
 			fs_info->total_pinned += len;
 		} else {
 			if (cache) {
+				spin_lock(&cache->lock);
 				cache->pinned -= len;
 				cache->space_info->bytes_pinned -= len;
+				spin_unlock(&cache->lock);
 			}
 			fs_info->total_pinned -= len;
 		}
@@ -1380,6 +1395,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
+		if (need_resched()) {
+			mutex_unlock(&root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&root->fs_info->alloc_mutex);
+		}
 	}
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	return 0;
@@ -1417,8 +1437,16 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					&extent_item, sizeof(extent_item));
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
-		eb = read_tree_block(extent_root, ins.objectid, ins.offset,
-				     trans->transid);
+
+		eb = btrfs_find_tree_block(extent_root, ins.objectid,
+					   ins.offset);
+
+		if (!btrfs_buffer_uptodate(eb, trans->transid)) {
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+			btrfs_read_buffer(eb, trans->transid);
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
+
 		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
 		if (level == 0) {
@@ -1437,6 +1465,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					  0, level,
 					  btrfs_disk_key_objectid(&first));
 		BUG_ON(err);
+		if (need_resched()) {
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1640,15 +1673,28 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 					    EXTENT_LOCKED);
 		if (ret)
 			break;
-		update_pinned_extents(extent_root, start, end + 1 - start, 1);
 		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
-		ret = __free_extent(trans, extent_root,
-				     start, end + 1 - start,
-				     extent_root->root_key.objectid,
-				     0, 0, 0, 0, 0);
+		if (!test_range_bit(&extent_root->fs_info->extent_ins,
+				    start, end, EXTENT_LOCKED, 0)) {
+			update_pinned_extents(extent_root, start,
+					      end + 1 - start, 1);
+			ret = __free_extent(trans, extent_root,
+					     start, end + 1 - start,
+					     extent_root->root_key.objectid,
+					     0, 0, 0, 0, 0);
+		} else {
+			clear_extent_bits(&extent_root->fs_info->extent_ins,
+					  start, end, EXTENT_LOCKED, GFP_NOFS);
+		}
 		if (ret)
 			err = ret;
+
+		if (need_resched()) {
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
 	}
 	return err;
 }
@@ -1768,12 +1814,12 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		block_group = btrfs_lookup_first_block_group(info, hint_byte);
 		if (!block_group)
 			hint_byte = search_start;
-		block_group = __btrfs_find_block_group(root, block_group,
+		block_group = btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
 		if (last_ptr && *last_ptr == 0 && block_group)
 			hint_byte = block_group->key.objectid;
 	} else {
-		block_group = __btrfs_find_block_group(root,
+		block_group = btrfs_find_block_group(root,
 						     trans->block_group,
 						     search_start, data, 1);
 	}
@@ -1895,7 +1941,7 @@ enospc:
 	}
 	block_group = btrfs_lookup_first_block_group(info, search_start);
 	cond_resched();
-	block_group = __btrfs_find_block_group(root, block_group,
+	block_group = btrfs_find_block_group(root, block_group,
 					     search_start, data, 0);
 	goto check_failed;
 
@@ -3032,11 +3078,14 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	u64 new_alloc_flags;
 	u64 calc;
 
+	spin_lock(&shrink_block_group->lock);
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
-
+		spin_unlock(&shrink_block_group->lock);
 		mutex_unlock(&root->fs_info->alloc_mutex);
+
 		trans = btrfs_start_transaction(root, 1);
 		mutex_lock(&root->fs_info->alloc_mutex);
+		spin_lock(&shrink_block_group->lock);
 
 		new_alloc_flags = update_block_group_flags(root,
 						   shrink_block_group->flags);
@@ -3046,13 +3095,16 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		} else {
 			calc = shrink_block_group->key.offset;
 		}
+		spin_unlock(&shrink_block_group->lock);
+
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
 
 		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
 		mutex_lock(&root->fs_info->alloc_mutex);
-	}
+	} else
+		spin_unlock(&shrink_block_group->lock);
 	return 0;
 }
 
@@ -3199,6 +3251,7 @@ next:
 	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
+
 	mutex_lock(&root->fs_info->alloc_mutex);
 	memcpy(&key, &shrink_block_group->key, sizeof(key));
 
@@ -3316,6 +3369,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		}
 
+		spin_lock_init(&cache->lock);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -3343,10 +3397,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		/* use EXTENT_LOCKED to prevent merging */
 		set_extent_bits(block_group_cache, found_key.objectid,
 				found_key.objectid + found_key.offset - 1,
-				bit | EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_LOCKED, GFP_NOFS);
 		set_state_private(block_group_cache, found_key.objectid,
 				  (unsigned long)cache);
-
+		set_extent_bits(block_group_cache, found_key.objectid,
+				found_key.objectid + found_key.offset - 1,
+				bit | EXTENT_LOCKED, GFP_NOFS);
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
@@ -3377,6 +3433,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
+	spin_lock_init(&cache->lock);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
@@ -3391,10 +3448,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	bit = block_group_state_bits(type);
 	set_extent_bits(block_group_cache, chunk_offset,
 			chunk_offset + size - 1,
-			bit | EXTENT_LOCKED, GFP_NOFS);
-
+			EXTENT_LOCKED, GFP_NOFS);
 	set_state_private(block_group_cache, chunk_offset,
 			  (unsigned long)cache);
+	set_extent_bits(block_group_cache, chunk_offset,
+			chunk_offset + size - 1,
+			bit | EXTENT_LOCKED, GFP_NOFS);
+
 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
 				sizeof(cache->item));
 	BUG_ON(ret);
-- 
cgit v1.2.3


From ed98b56a6393c5e150fd5095b9eb7fd7d3cfb041 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Jul 2008 23:06:42 -0400
Subject: Btrfs: Take the csum mutex while reading checksums

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c    | 8 +++++---
 fs/btrfs/inode.c        | 2 ++
 fs/btrfs/ordered-data.c | 2 +-
 fs/btrfs/ordered-data.h | 2 +-
 fs/btrfs/transaction.c  | 3 +++
 5 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45127e4797c..afe42d00b5a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -152,7 +152,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	if (!sums)
 		return -ENOMEM;
 
-	sector_sum = &sums->sums;
+	sector_sum = sums->sums;
 	sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
@@ -174,7 +174,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
 				       GFP_NOFS);
 			BUG_ON(!sums);
-			sector_sum = &sums->sums;
+			sector_sum = sums->sums;
 			sums->len = bytes_left;
 			sums->file_offset = offset;
 			ordered = btrfs_lookup_ordered_extent(inode,
@@ -193,12 +193,14 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 				 (char *)&sector_sum->sum);
 		sector_sum->offset = page_offset(bvec->bv_page) +
 			bvec->bv_offset;
+
 		sector_sum++;
 		bio_index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
 		bvec++;
 	}
+	this_sum_bytes = 0;
 	btrfs_add_ordered_sum(inode, ordered, sums);
 	btrfs_put_ordered_extent(ordered);
 	return 0;
@@ -231,7 +233,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	sector_sum = &sums->sums;
+	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3da12a4d913..28e667052ec 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -612,6 +612,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		return 0;
 
 	path = btrfs_alloc_path();
+	mutex_lock(&BTRFS_I(inode)->csum_mutex);
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
 	if (IS_ERR(item)) {
 		/*
@@ -640,6 +641,7 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 found:
 	set_state_private(io_tree, start, csum);
 out:
+	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	if (path)
 		btrfs_free_path(path);
 	return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 830dbaea685..b695f5b29c4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -545,7 +545,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
 		if (offset >= ordered_sum->file_offset) {
 			num_sectors = ordered_sum->len / sectorsize;
-			sector_sums = &ordered_sum->sums;
+			sector_sums = ordered_sum->sums;
 			for (i = 0; i < num_sectors; i++) {
 				if (sector_sums[i].offset == offset) {
 					*sum = sector_sums[i].sum;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8e8e3c0404f..36e63f1f79b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -46,7 +46,7 @@ struct btrfs_ordered_sum {
 	unsigned long len;
 	struct list_head list;
 	/* last field is a variable length array of btrfs_sector_sums */
-	struct btrfs_sector_sum sums;
+	struct btrfs_sector_sum sums[];
 };
 
 /*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 38c75a0256c..0f756e0175c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -382,6 +382,9 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			memcpy(dirty->root, root, sizeof(*root));
 			dirty->root->node = root->commit_root;
 			dirty->latest_root = root;
+			spin_lock_init(&dirty->root->node_lock);
+			mutex_init(&dirty->root->objectid_mutex);
+
 			root->commit_root = NULL;
 
 			root->root_key.offset = root->fs_info->generation;
-- 
cgit v1.2.3


From 9ba4611a3a7902c6bad70c5c205de5161fcfc17b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 Jul 2008 09:26:26 -0400
Subject: Btrfs: Fix 32 bit compiles by using an unsigned long byte count in
 the ordered extent

The ordered extents have to fit in memory, so an unsigned long is sufficient.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 36e63f1f79b..199cb0b4f1d 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -95,7 +95,8 @@ struct btrfs_ordered_extent {
  * calculates the total size you need to allocate for an ordered sum
  * structure spanning 'bytes' in the file
  */
-static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
+					 unsigned long bytes)
 {
 	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
 		root->sectorsize;
-- 
cgit v1.2.3


From 89642229a582a5c2b6d2ed8ec16986387d9a9047 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jul 2008 09:41:53 -0400
Subject: Btrfs: Search data ordered extents first for checksums on read

Checksum items are not inserted into the tree until all of the io from a
given extent is complete.  This means one dirty page from an extent may
be written, freed, and then read again before the entire extent is on disk
and the checksum item is inserted.

The checksums themselves are stored in the ordered extent so they can
be inserted in bulk when IO is complete.  On read, if a checksum item isn't
found, the ordered extents were being searched for a checksum record.

This all worked most of the time, but the checksum insertion code tries
to reduce the number of tree operations by pre-inserting checksum items
based on i_size and a few other factors.  This means the read code might
find a checksum item that hasn't yet really been filled in.

This commit changes things to check the ordered extents first and only
dive into the btree if nothing was found.  This removes the need for
extra locking and is more reliable.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      |  9 ++++++++-
 fs/btrfs/extent_io.c    |  8 ++++----
 fs/btrfs/inode.c        | 33 ++++++++++++++++++---------------
 fs/btrfs/ordered-data.c |  1 +
 4 files changed, 31 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 86e84a8579e..7ce3f83c5dd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1011,9 +1011,16 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
 	spin_unlock(&em_tree->lock);
-	if (!em)
+	if (!em) {
+		__unplug_io_fn(bdi, page);
 		return;
+	}
 
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+		free_extent_map(em);
+		__unplug_io_fn(bdi, page);
+		return;
+	}
 	offset = offset - em->start;
 	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
 			  em->block_start + offset, page);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9965993748d..e3547a992d5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1949,18 +1949,18 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 							  cur + iosize - 1);
 		}
 		if (!ret) {
-			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-			nr -= page->index;
+			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset,
-					 bdev, bio, nr,
+					 bdev, bio, pnr,
 					 end_bio_extent_readpage, mirror_num);
+			nr++;
 		}
 		if (ret)
 			SetPageError(page);
 		cur = cur + iosize;
 		page_offset += iosize;
-		nr++;
 	}
 	if (!nr) {
 		if (!PageError(page))
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 28e667052ec..0e90315ea80 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -611,22 +611,25 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
 
+	/*
+	 * It is possible there is an ordered extent that has
+	 * not yet finished for this range in the file.  If so,
+	 * that extent will have a csum cached, and it will insert
+	 * the sum after all the blocks in the extent are fully
+	 * on disk.  So, look for an ordered extent and use the
+	 * sum if found.  We have to do this before looking in the
+	 * btree because csum items are pre-inserted based on
+	 * the file size.  btrfs_lookup_csum might find an item
+	 * that still hasn't been fully filled.
+	 */
+	ret = btrfs_find_ordered_sum(inode, start, &csum);
+	if (ret == 0)
+		goto found;
+
+	ret = 0;
 	path = btrfs_alloc_path();
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
 	if (IS_ERR(item)) {
-		/*
-		 * It is possible there is an ordered extent that has
-		 * not yet finished for this range in the file.  If so,
-		 * that extent will have a csum cached, and it will insert
-		 * the sum after all the blocks in the extent are fully
-		 * on disk.  So, look for an ordered extent and use the
-		 * sum if found.
-		 */
-		ret = btrfs_find_ordered_sum(inode, start, &csum);
-		if (ret == 0)
-			goto found;
-
 		ret = PTR_ERR(item);
 		/* a csum that isn't present is a preallocated region. */
 		if (ret == -ENOENT || ret == -EFBIG)
@@ -641,7 +644,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 found:
 	set_state_private(io_tree, start, csum);
 out:
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	if (path)
 		btrfs_free_path(path);
 	return ret;
@@ -1375,7 +1377,7 @@ again:
 		}
 		if (!PageUptodate(page)) {
 			ret = -EIO;
-			goto out;
+			goto out_unlock;
 		}
 	}
 	wait_on_page_writeback(page);
@@ -1406,6 +1408,7 @@ again:
 	set_page_dirty(page);
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
+out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
 out:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b695f5b29c4..e42fd233e04 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -557,6 +557,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	}
 out:
 	mutex_unlock(&tree->mutex);
+	btrfs_put_ordered_extent(ordered);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4881ee5a2e995c6a8999b56de70aa3834369d8ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jul 2008 09:51:08 -0400
Subject: Btrfs: Fix some build problems on 2.6.18 based enterprise kernels

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 8 ++++++++
 fs/btrfs/locking.c | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 90504ba7f83..8ecac2e77a4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1595,6 +1595,14 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+
+/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+#ifdef ClearPageFsMisc
+#define ClearPageChecked ClearPageFsMisc
+#define SetPageChecked SetPageFsMisc
+#define PageChecked PageFsMisc
+#endif
+
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 27a02376ab1..d617c29787f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -20,7 +20,7 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/page-flags.h>
-#include <linux/bug.h>
+#include <asm/bug.h>
 #include "ctree.h"
 #include "extent_io.h"
 #include "locking.h"
-- 
cgit v1.2.3


From b3c3da71ed88c06bc18a4d8919c6e4af3b933566 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 Jul 2008 12:12:13 -0400
Subject: Btrfs: Add version strings on module load

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile   |  6 +++++-
 fs/btrfs/super.c    |  3 +++
 fs/btrfs/version.sh | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/version.sh

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 0e2dcc718de..f8a38b0da7f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -14,8 +14,12 @@ else
 # Normal Makefile
 
 KERNELDIR := /lib/modules/`uname -r`/build
-all:
+all: version
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+
+version:
+	bash version.sh
+
 modules_install:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
 clean:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5e28cf5c2e8..4cb6aac5122 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -45,6 +45,7 @@
 #include "print-tree.h"
 #include "xattr.h"
 #include "volumes.h"
+#include "version.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -605,6 +606,8 @@ static int __init init_btrfs_fs(void)
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
+
+	printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
 	return 0;
 
 unregister_ioctl:
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
new file mode 100644
index 00000000000..fd9b53d3986
--- /dev/null
+++ b/fs/btrfs/version.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# determine-version -- report a useful version for releases
+#
+# Copyright 2008, Aron Griffis <agriffis@n01se.net>
+# Copyright 2008, Oracle
+# Released under the GNU GPLv2
+
+# Fallback version, used when hg or an hg checkout is not available.
+v="v0.15"
+# Only ask hg when it is installed and we are inside a repository.
+if which hg > /dev/null 2>&1 && hg root > /dev/null 2>&1; then
+	# first vX.Y style entry printed by "hg tags"
+	last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
+	# now check if the repo has commits since then...
+	if [[ $(hg id -t) == $last || \
+	    $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
+	then
+	    # check if it's dirty
+	    if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
+		v=$last+
+	    else
+		v=$last
+	    fi
+	else
+	    # includes dirty flag
+	    v=$last+$(hg id -i)
+	fi
+fi
+# Build a candidate header; version.h is only replaced when it differs.
+echo "#ifndef __BUILD_VERSION" > .build-version.h
+echo "#define __BUILD_VERSION" >> .build-version.h
+echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
+echo "#endif" >> .build-version.h
+
+diff -q version.h .build-version.h >& /dev/null
+
+if [ $? == 0 ]; then
+    rm .build-version.h
+    exit 0
+fi
+
+mv .build-version.h version.h
-- 
cgit v1.2.3


From 64f26f745084872b916cd1bef6054e21b15c5784 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 24 Jul 2008 10:09:43 -0400
Subject: Btrfs: Use assert_spin_locked instead of spin_trylock

On UP systems spin_trylock always succeeds, so BUG_ON(spin_trylock())
fires even when the lock is held.  Use assert_spin_locked instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 8a502ee2f23..954b047639a 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -208,7 +208,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
-	BUG_ON(spin_trylock(&tree->lock));
+	assert_spin_locked(&tree->lock);
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
 		ret = -EEXIST;
@@ -270,7 +270,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
-	BUG_ON(spin_trylock(&tree->lock));
+	assert_spin_locked(&tree->lock);
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
 	if (!rb_node && prev) {
 		em = rb_entry(prev, struct extent_map, rb_node);
@@ -317,7 +317,7 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
 	int ret = 0;
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
-	BUG_ON(spin_trylock(&tree->lock));
+	assert_spin_locked(&tree->lock);
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	return ret;
-- 
cgit v1.2.3


From 3eaa2885276fd6dac7b076a793932428b7168e74 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jul 2008 11:57:52 -0400
Subject: Btrfs: Fix the defragmentation code and the block relocation code for
 data=ordered

Before setting an extent to delalloc, the code needs to wait for
pending ordered extents.

Also, the relocation code needs to wait for ordered IO before scanning
the block group again.  This is because the extents are not removed
until the IO for the new extents is finished.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  7 +++++++
 fs/btrfs/disk-io.c      |  3 +++
 fs/btrfs/extent-tree.c  | 39 ++++++++++++++++++++++++----------
 fs/btrfs/ioctl.c        | 21 ++++++++++++-------
 fs/btrfs/ordered-data.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/ordered-data.h |  7 +++++++
 6 files changed, 113 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8ecac2e77a4..6675e916ebc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -538,6 +538,13 @@ struct btrfs_fs_info {
 	struct list_head dead_roots;
 	atomic_t nr_async_submits;
 
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+	struct list_head ordered_extents;
+
 	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads.  This is because readers
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7ce3f83c5dd..ec01062eb41 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1252,6 +1252,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_nlink = 1;
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
+	INIT_LIST_HEAD(&fs_info->ordered_extents);
+	spin_lock_init(&fs_info->ordered_extent_lock);
+
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index febc6295c7a..f92b297e7da 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2640,6 +2640,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	struct file_ra_state *ra;
 	unsigned long total_read = 0;
 	unsigned long ra_pages;
+	struct btrfs_ordered_extent *ordered;
 	struct btrfs_trans_handle *trans;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
@@ -2658,9 +2659,9 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				       calc_ra(i, last_index, ra_pages));
 		}
 		total_read++;
-		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+again:
+		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
 			goto truncate_racing;
-
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page) {
 			goto out_unlock;
@@ -2674,18 +2675,24 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				goto out_unlock;
 			}
 		}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(page);
-#else
-		cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(page);
-		set_page_extent_mapped(page);
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
-
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 		set_page_dirty(page);
@@ -2694,10 +2701,18 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 		unlock_page(page);
 		page_cache_release(page);
 	}
-	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-					   total_read);
 
 out_unlock:
+	/* we have to start the IO in order to get the ordered extents
+	 * instantiated.  This allows the relocation code to wait
+	 * for all the ordered extents to hit the disk.
+	 *
+	 * Otherwise, it would constantly loop over the same extents
+	 * because the old ones don't get deleted until the IO is
+	 * started
+	 */
+	btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
+			       WB_SYNC_NONE);
 	kfree(ra);
 	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 	if (trans) {
@@ -3238,6 +3253,8 @@ next:
 
 		btrfs_clean_old_snapshots(tree_root);
 
+		btrfs_wait_ordered_extents(tree_root);
+
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 		mutex_lock(&root->fs_info->alloc_mutex);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 83f17a5cbd6..a61f2e7e2db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -213,6 +213,7 @@ int btrfs_defrag_file(struct file *file)
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
 	struct page *page;
 	unsigned long last_index;
 	unsigned long ra_pages = root->fs_info->bdi.ra_pages;
@@ -234,6 +235,7 @@ int btrfs_defrag_file(struct file *file)
 				       min(last_index, i + ra_pages - 1));
 		}
 		total_read++;
+again:
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page)
 			goto out_unlock;
@@ -247,18 +249,23 @@ int btrfs_defrag_file(struct file *file)
 			}
 		}
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(page);
-#else
-		cancel_dirty_page(page, PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(page);
-		set_page_extent_mapped(page);
 
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
-
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e42fd233e04..676e4bd65c5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -167,20 +167,28 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	entry->inode = inode;
+
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
 	init_waitqueue_head(&entry->wait);
 	INIT_LIST_HEAD(&entry->list);
+	INIT_LIST_HEAD(&entry->root_extent_list);
 
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
 	if (node) {
-		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		atomic_inc(&entry->refs);
+		printk("warning dup entry from add_ordered_extent\n");
+		BUG();
 	}
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
+	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	list_add_tail(&entry->root_extent_list,
+		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
+	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
 	mutex_unlock(&tree->mutex);
 	BUG_ON(node);
 	return 0;
@@ -285,11 +293,55 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	rb_erase(node, &tree->tree);
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+
+	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	list_del_init(&entry->root_extent_list);
+	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+
 	mutex_unlock(&tree->mutex);
 	wake_up(&entry->wait);
 	return 0;
 }
 
+int btrfs_wait_ordered_extents(struct btrfs_root *root)
+{
+	struct list_head splice;
+	struct list_head *cur;
+	struct btrfs_ordered_extent *ordered;
+	struct inode *inode;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&root->fs_info->ordered_extent_lock);
+	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	while(!list_empty(&splice)) {
+		cur = splice.next;
+		ordered = list_entry(cur, struct btrfs_ordered_extent,
+				     root_extent_list);
+		list_del_init(&ordered->root_extent_list);
+		atomic_inc(&ordered->refs);
+		inode = ordered->inode;
+
+		/*
+		 * the inode can't go away until all the pages are gone
+		 * and the pages won't go away while there is still
+		 * an ordered extent and the ordered extent won't go
+		 * away until it is off this list.  So, we can safely
+		 * increment i_count here and call iput later
+		 */
+		atomic_inc(&inode->i_count);
+		spin_unlock(&root->fs_info->ordered_extent_lock);
+
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+		iput(inode);
+
+		spin_lock(&root->fs_info->ordered_extent_lock);
+	}
+	spin_unlock(&root->fs_info->ordered_extent_lock);
+	return 0;
+}
+
 /*
  * Used to start IO or wait for a given ordered extent to finish.
  *
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 199cb0b4f1d..5efe6b63c74 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -80,6 +80,9 @@ struct btrfs_ordered_extent {
 	/* reference count */
 	atomic_t refs;
 
+	/* the inode we belong to */
+	struct inode *inode;
+
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
@@ -88,6 +91,9 @@ struct btrfs_ordered_extent {
 
 	/* our friendly rbtree entry */
 	struct rb_node rb_node;
+
+	/* a per root list of all the pending ordered extents */
+	struct list_head root_extent_list;
 };
 
 
@@ -137,4 +143,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From aec7477b3b0e8ec93f6d274f25ba40b0665134d4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:12:38 -0400
Subject: Btrfs: Implement new dir index format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  15 ++++++
 fs/btrfs/ctree.h       |   8 +--
 fs/btrfs/dir-item.c    |   4 +-
 fs/btrfs/inode-item.c  |  10 +++-
 fs/btrfs/inode.c       | 143 +++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/ioctl.c       |   4 +-
 fs/btrfs/transaction.c |   4 +-
 7 files changed, 155 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3bf40591742..b7bd60e4fdd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -43,6 +43,21 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
+
+	/*
+	 * if this is a directory then index_cnt is the counter for the index
+	 * number for new files that are created
+	 */
+	u64 index_cnt;
+
+	/*
+	 * index holds the directory index for this inode on creation, so
+	 * add_link can do what it's supposed to.  This isn't populated when the
+	 * inode is read because there isn't really a reason to know this unless
+	 * we are creating the directory index or deleting it, and deletion
+	 * reads the index off of the inode reference at unlink time.
+	 */
+	u64 index;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6675e916ebc..beb05b1de54 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -372,6 +372,7 @@ struct btrfs_dev_extent {
 } __attribute__ ((__packed__));
 
 struct btrfs_inode_ref {
+	__le64 index;
 	__le16 name_len;
 	/* name goes here */
 } __attribute__ ((__packed__));
@@ -902,6 +903,7 @@ BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
+BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
@@ -1528,7 +1530,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
-			  struct btrfs_key *location, u8 type);
+			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     struct btrfs_path *path, u64 dir,
@@ -1566,11 +1568,11 @@ int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid);
+			   u64 inode_objectid, u64 ref_objectid, u64 index);
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid);
+			   u64 inode_objectid, u64 ref_objectid, u64 *index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 7a73dc59dc4..eb4dd3d75cf 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -110,7 +110,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
-			  struct btrfs_key *location, u8 type)
+			  struct btrfs_key *location, u8 type, u64 index)
 {
 	int ret = 0;
 	int ret2 = 0;
@@ -156,7 +156,7 @@ second_insert:
 	btrfs_release_path(root, path);
 
 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
-	key.offset = location->objectid;
+	key.offset = index;
 	dir_item = insert_with_overflow(trans, root, path, &key, data_size,
 					name, name_len);
 	if (IS_ERR(dir_item)) {
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index cba30b6cc6f..d93451c66ba 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -52,7 +52,7 @@ int find_name_in_backref(struct btrfs_path *path, const char * name,
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid)
+			   u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -86,6 +86,10 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 	}
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+
+	if (index)
+		*index = btrfs_inode_ref_index(leaf, ref);
+
 	if (del_len == item_size) {
 		ret = btrfs_del_item(trans, root, path);
 		goto out;
@@ -106,7 +110,7 @@ out:
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid)
+			   u64 inode_objectid, u64 ref_objectid, u64 index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -138,6 +142,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
 		ptr = (unsigned long)(ref + 1);
 		ret = 0;
 	} else if (ret < 0) {
@@ -146,6 +151,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+		btrfs_set_inode_ref_index(path->nodes[0], ref, index);
 		ptr = (unsigned long)(ref + 1);
 	}
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0e90315ea80..8d371d6fe55 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -872,6 +872,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
+	BTRFS_I(inode)->index_cnt = (u64)-1;
+
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
 	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
 						       alloc_group_block);
@@ -993,6 +995,7 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
 	struct btrfs_key key;
+	u64 index;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -1017,8 +1020,19 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 		goto err;
 	btrfs_release_path(root, path);
 
+	ret = btrfs_del_inode_ref(trans, root, name, name_len,
+				  dentry->d_inode->i_ino,
+				  dentry->d_parent->d_inode->i_ino, &index);
+	if (ret) {
+		printk("failed to delete reference to %.*s, "
+		       "inode %lu parent %lu\n", name_len, name,
+		       dentry->d_inode->i_ino,
+		       dentry->d_parent->d_inode->i_ino);
+		goto err;
+	}
+
 	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
-					 key.objectid, name, name_len, -1);
+					 index, name, name_len, -1);
 	if (IS_ERR(di)) {
 		ret = PTR_ERR(di);
 		goto err;
@@ -1031,15 +1045,6 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root, path);
 
 	dentry->d_inode->i_ctime = dir->i_ctime;
-	ret = btrfs_del_inode_ref(trans, root, name, name_len,
-				  dentry->d_inode->i_ino,
-				  dentry->d_parent->d_inode->i_ino);
-	if (ret) {
-		printk("failed to delete reference to %.*s, "
-		       "inode %lu parent %lu\n", name_len, name,
-		       dentry->d_inode->i_ino,
-		       dentry->d_parent->d_inode->i_ino);
-	}
 err:
 	btrfs_free_path(path);
 	if (!ret) {
@@ -1625,6 +1630,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	BTRFS_I(inode)->root = args->root;
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
+	BTRFS_I(inode)->index_cnt = (u64)-1;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1901,8 +1907,77 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 }
 
+static int btrfs_set_inode_index_count(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key key, found_key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = inode->i_ino;
+	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	/* FIXME: we should be able to handle this */
+	if (ret == 0)
+		goto out;
+	ret = 0;
+
+	/*
+	 * MAGIC NUMBER EXPLANATION:
+	 * since we search a directory based on f_pos we have to start at 2
+	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
+	 * else has to start at 2
+	 */
+	if (path->slots[0] == 0) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+	if (found_key.objectid != inode->i_ino ||
+	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+		BTRFS_I(inode)->index_cnt = 2;
+		goto out;
+	}
+
+	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
+{
+	int ret = 0;
+
+	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
+		ret = btrfs_set_inode_index_count(dir);
+		if (ret)
+			return ret;
+	}
+
+	BTRFS_I(inode)->index = BTRFS_I(dir)->index_cnt;
+	BTRFS_I(dir)->index_cnt++;
+
+	return ret;
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
+				     struct inode *dir,
 				     const char *name, int name_len,
 				     u64 ref_objectid,
 				     u64 objectid,
@@ -1928,6 +2003,20 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
+	if (dir) {
+		ret = btrfs_set_inode_index(dir, inode);
+		if (ret)
+			return ERR_PTR(ret);
+	} else {
+		BTRFS_I(inode)->index = 0;
+	}
+	/*
+	 * index_cnt is ignored for everything but a dir,
+	 * btrfs_set_inode_index_count has an explanation for the magic
+	 * number
+	 */
+	BTRFS_I(inode)->index_cnt = 2;
+
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1984,6 +2073,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+	btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->index);
 	ptr = (unsigned long)(ref + 1);
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
 
@@ -1998,6 +2088,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	insert_inode_hash(inode);
 	return inode;
 fail:
+	if (dir)
+		BTRFS_I(dir)->index_cnt--;
 	btrfs_free_path(path);
 	return ERR_PTR(ret);
 }
@@ -2014,7 +2106,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
-	struct inode *parent_inode;
+	struct inode *parent_inode = dentry->d_parent->d_inode;
 
 	key.objectid = inode->i_ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -2023,16 +2115,17 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
-				    &key, btrfs_inode_type(inode));
+				    &key, btrfs_inode_type(inode),
+				    BTRFS_I(inode)->index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
 					     dentry->d_name.name,
 					     dentry->d_name.len,
 					     inode->i_ino,
-					     dentry->d_parent->d_inode->i_ino);
+					     parent_inode->i_ino,
+					     BTRFS_I(inode)->index);
 		}
-		parent_inode = dentry->d_parent->d_inode;
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   dentry->d_name.len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -2083,7 +2176,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, mode);
@@ -2138,7 +2231,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
 				objectid, BTRFS_I(dir)->block_group, mode);
@@ -2203,10 +2296,15 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
+	err = btrfs_set_inode_index(dir, inode);
+	if (err)
+		goto fail;
+
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
+
 	err = btrfs_add_nondir(trans, dentry, inode, 1);
 
 	if (err)
@@ -2258,7 +2356,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFDIR | mode);
@@ -2886,9 +2984,8 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_block_group_cache *block_group)
 {
 	struct inode *inode;
-	int ret;
 
-	inode = btrfs_new_inode(trans, new_root, "..", 2, new_dirid,
+	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
 				new_dirid, block_group, S_IFDIR | 0700);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -2896,8 +2993,6 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	inode->i_fop = &btrfs_dir_file_operations;
 	new_root->inode = inode;
 
-	ret = btrfs_insert_inode_ref(trans, new_root, "..", 2, new_dirid,
-				     new_dirid);
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
 
@@ -3078,6 +3173,10 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		if (ret)
 			goto out_fail;
 	}
+	ret = btrfs_set_inode_index(new_dir, old_inode);
+	if (ret)
+		goto out_fail;
+
 	ret = btrfs_add_link(trans, new_dentry, old_inode, 1);
 	if (ret)
 		goto out_fail;
@@ -3123,7 +3222,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	inode = btrfs_new_inode(trans, root, dentry->d_name.name,
+	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
 				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a61f2e7e2db..faf081302d0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -129,13 +129,13 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	dir = root->fs_info->sb->s_root->d_inode;
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    name, namelen, dir->i_ino, &key,
-				    BTRFS_FT_DIR);
+				    BTRFS_FT_DIR, 0);
 	if (ret)
 		goto fail;
 
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     name, namelen, objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
+			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
 	if (ret)
 		goto fail;
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0f756e0175c..768b0d223e6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -577,14 +577,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
 				    pending->name, namelen,
 				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, BTRFS_FT_DIR);
+				    &key, BTRFS_FT_DIR, 0);
 
 	if (ret)
 		goto fail;
 
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     pending->name, strlen(pending->name), objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino);
+			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
 
 	/* Invalidate existing dcache entry for new snapshot. */
 	btrfs_invalidate_dcache_root(root, pending->name, namelen);
-- 
cgit v1.2.3


From 6099afe88fe64b2f47c43a8a71c13be3a416bbf7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:16:03 -0400
Subject: Btrfs: Remove unused xattr code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  3 +--
 fs/btrfs/xattr.c | 51 ---------------------------------------------------
 2 files changed, 1 insertion(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index beb05b1de54..53d315bdd16 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1689,8 +1689,7 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 
 /* xattr.c */
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, struct inode *inode);
+
 /* super.c */
 u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3e9eb91c3c8..6730b595884 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -378,57 +378,6 @@ err:
 	return ret;
 }
 
-/*
- * delete all the xattrs associated with the inode.
- */
-int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, struct inode *inode)
-{
-	struct btrfs_path *path;
-	struct btrfs_key key, found_key;
-	struct btrfs_item *item;
-	struct extent_buffer *leaf;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	path->reada = -1;
-	key.objectid = inode->i_ino;
-	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
-	key.offset = (u64)-1;
-
-	while(1) {
-		/* look for our next xattr */
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0)
-			goto out;
-		BUG_ON(ret == 0);
-
-		if (path->slots[0] == 0)
-			break;
-
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		item = btrfs_item_nr(leaf, path->slots[0]);
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		if (found_key.objectid != key.objectid)
-			break;
-		if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY)
-			break;
-
-		ret = btrfs_del_item(trans, root, path);
-		BUG_ON(ret);
-		btrfs_release_path(root, path);
-	}
-	ret = 0;
-out:
-	btrfs_free_path(path);
-
-	return ret;
-}
-
 /*
  * Handler functions
  */
-- 
cgit v1.2.3


From 33268eaf0b3db5e2bd12c0ada81a8e8f87a46d68 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:16:36 -0400
Subject: Btrfs: Add ACL support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c         | 296 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/btrfs_inode.h |   3 +
 fs/btrfs/ctree.h       |   7 ++
 fs/btrfs/inode.c       |  42 ++++++-
 fs/btrfs/super.c       |   9 +-
 fs/btrfs/xattr.c       |  26 +++--
 6 files changed, 335 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index ff0fcc72848..b192659b4f5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -20,76 +20,302 @@
 #include <linux/string.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
 #include <linux/sched.h>
+
 #include "ctree.h"
+#include "btrfs_inode.h"
 #include "xattr.h"
-#ifndef is_owner_or_cap
-#define is_owner_or_cap(inode)	\
-	((current->fsuid == (inode)->i_uid) || capable(CAP_FOWNER))
-#endif
+
+static void btrfs_update_cached_acl(struct inode *inode,
+				    struct posix_acl **p_acl,
+				    struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(*p_acl);
+	*p_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+{
+	int size, name_index;
+	char *value = NULL;
+	struct posix_acl *acl = NULL, **p_acl;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	spin_lock(&inode->i_lock);
+	if (*p_acl != BTRFS_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*p_acl);
+	spin_unlock(&inode->i_lock);
+
+	if (acl)
+		return acl;
+
+
+	size = btrfs_xattr_get(inode, name_index, "", NULL, 0);
+	if (size > 0) {
+		value = kzalloc(size, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		size = btrfs_xattr_get(inode, name_index, "", value, size);
+		if (size > 0) {
+			acl = posix_acl_from_xattr(value, size);
+			btrfs_update_cached_acl(inode, p_acl, acl);
+		}
+		kfree(value);
+	} else if (size == -ENOENT) {
+		acl = NULL;
+		btrfs_update_cached_acl(inode, p_acl, acl);
+	}
+
+	return acl;
+}
+
+static int btrfs_xattr_get_acl(struct inode *inode, int type,
+			       void *value, size_t size)
+{
+	struct posix_acl *acl;
+	int ret = 0;
+
+	acl = btrfs_get_acl(inode, type);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, value, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+/*
+ * Needs to be called with fs_mutex held
+ */
+static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int ret, name_index = 0, size = 0;
+	struct posix_acl **p_acl;
+	char *value = NULL;
+	mode_t mode;
+
+	if (acl) {
+		ret = posix_acl_valid(acl);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+	}
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		mode = inode->i_mode;
+		ret = posix_acl_equiv_mode(acl, &mode);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+		inode->i_mode = mode;
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		p_acl = &BTRFS_I(inode)->i_acl;
+		break;
+	case ACL_TYPE_DEFAULT:
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EINVAL : 0;
+		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		p_acl = &BTRFS_I(inode)->i_default_acl;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		size = posix_acl_xattr_size(acl->a_count);
+		value = kmalloc(size, GFP_NOFS);
+		if (!value) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		ret = posix_acl_to_xattr(acl, value, size);
+		if (ret < 0)
+			goto out;
+	}
+
+	ret = btrfs_xattr_set(inode, name_index, "", value, size, 0);
+
+out:
+	if (value)
+		kfree(value);
+
+	if (!ret)
+		btrfs_update_cached_acl(inode, p_acl, acl);
+
+	return ret;
+}
 
 static int btrfs_xattr_set_acl(struct inode *inode, int type,
 			       const void *value, size_t size)
 {
 	int ret = 0;
-	struct posix_acl *acl;
+	struct posix_acl *acl = NULL;
 
-	if (!is_owner_or_cap(inode))
-		return -EPERM;
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
 			value = NULL;
 			size = 0;
 		} else if (IS_ERR(acl)) {
-			ret = PTR_ERR(acl);
-		} else {
-			ret = posix_acl_valid(acl);
-			posix_acl_release(acl);
+			return PTR_ERR(acl);
 		}
-		if (ret)
-			return ret;
 	}
-	return btrfs_xattr_set(inode, type, "", value, size, 0);
-}
 
-static int btrfs_xattr_get_acl(struct inode *inode, int type,
-			       void *value, size_t size)
-{
-	return btrfs_xattr_get(inode, type, "", value, size);
+	ret = btrfs_set_acl(inode, acl, type);
+
+	posix_acl_release(acl);
+
+	return ret;
 }
+
+
 static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
 				      void *value, size_t size)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
-				   value, size);
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
+
 static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
 				      const void *value, size_t size, int flags)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
-				   value, size);
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
+
 static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 				       void *value, size_t size)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_get_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-				   value, size);
+	return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
+
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
 				       const void *value, size_t size, int flags)
 {
-	if (*name != '\0')
-	       return -EINVAL;
-	return btrfs_xattr_set_acl(inode, BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-				   value, size);
+	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int error = -EAGAIN;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		error = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+	}
+
+	return error;
+}
+
+/*
+ * btrfs_init_acl is already generally called under fs_mutex, so the locking
+ * stuff has been fixed to work with that.  If the locking stuff changes, we
+ * need to re-evaluate the acl locking stuff.
+ */
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	/* this happens with subvols */
+	if (!dir)
+		return 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (IS_POSIXACL(dir)) {
+			acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+
+	if (IS_POSIXACL(dir) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+			if (ret)
+				goto failed;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto failed;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				/* we need an acl */
+				ret = btrfs_set_acl(inode, clone,
+						    ACL_TYPE_ACCESS);
+			}
+		}
+	}
+failed:
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+	struct posix_acl *acl, *clone;
+	int ret = 0;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!IS_POSIXACL(inode))
+		return 0;
+
+	acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+
+	posix_acl_release(clone);
+
+	return ret;
 }
+
 struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.list	= btrfs_xattr_generic_list,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b7bd60e4fdd..9f2a4ef944a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -36,6 +36,9 @@ struct btrfs_inode {
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
+	struct posix_acl *i_acl;
+	struct posix_acl *i_default_acl;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53d315bdd16..f87d7263f2d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -42,6 +42,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
+#define BTRFS_ACL_NOT_CACHED    ((void *)-1)
+
 #ifdef CONFIG_LOCKDEP
 # define BTRFS_MAX_LEVEL 7
 #else
@@ -1694,4 +1696,9 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+
+/* acl.c */
+int btrfs_check_acl(struct inode *inode, int mask);
+int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_acl_chmod(struct inode *inode);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d371d6fe55..2d8853543a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1478,6 +1479,9 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 out:
 	err = inode_setattr(inode, attr);
+
+	if (!err && ((attr->ia_valid & ATTR_MODE)))
+		err = btrfs_acl_chmod(inode);
 fail:
 	return err;
 }
@@ -2184,6 +2188,12 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -2239,6 +2249,12 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -2366,6 +2382,11 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	}
 
 	drop_on_err = 1;
+
+	err = btrfs_init_acl(inode, dir);
+	if (err)
+		goto out_fail;
+
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 	btrfs_set_trans_block_group(trans, inode);
@@ -3023,6 +3044,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	ei->last_trans = 0;
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
+	ei->i_acl = BTRFS_ACL_NOT_CACHED;
+	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
 	return &ei->vfs_inode;
 }
 
@@ -3032,6 +3055,13 @@ void btrfs_destroy_inode(struct inode *inode)
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	if (BTRFS_I(inode)->i_acl &&
+	    BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(BTRFS_I(inode)->i_acl);
+	if (BTRFS_I(inode)->i_default_acl &&
+	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
+		posix_acl_release(BTRFS_I(inode)->i_default_acl);
+
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
@@ -3230,6 +3260,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
+	err = btrfs_init_acl(inode, dir);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+
 	btrfs_set_trans_block_group(trans, inode);
 	err = btrfs_add_nondir(trans, dentry, inode, 0);
 	if (err)
@@ -3310,7 +3346,7 @@ static int btrfs_permission(struct inode *inode, int mask,
 {
 	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-	return generic_permission(inode, mask, NULL);
+	return generic_permission(inode, mask, btrfs_check_acl);
 }
 
 static struct inode_operations btrfs_dir_inode_operations = {
@@ -3392,6 +3428,10 @@ static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
+	.setxattr	= generic_setxattr,
+	.getxattr	= generic_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= generic_removexattr,
 };
 static struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4cb6aac5122..a6a418b6894 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -68,7 +68,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,7 +83,8 @@ static match_table_t tokens = {
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_ssd, "ssd"},
-	{Opt_err, NULL}
+	{Opt_noacl, "noacl"},
+	{Opt_err, NULL},
 };
 
 u64 btrfs_parse_size(char *str)
@@ -215,6 +216,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 					info->alloc_start);
 			}
 			break;
+		case Opt_noacl:
+			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
+			break;
 		default:
 			break;
 		}
@@ -301,6 +305,7 @@ static int btrfs_fill_super(struct super_block * sb,
 	sb->s_op = &btrfs_super_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
+	sb->s_flags |= MS_POSIXACL;
 
 	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6730b595884..121c9550314 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -26,25 +26,27 @@
 #include "transaction.h"
 #include "xattr.h"
 #include "disk-io.h"
+
 static struct xattr_handler *btrfs_xattr_handler_map[] = {
 	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-//	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
-//	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
+	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
 #endif
 	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
 	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
-//	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
+	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
 };
+
 struct xattr_handler *btrfs_xattr_handlers[] = {
 	&btrfs_xattr_user_handler,
 #ifdef CONFIG_FS_POSIX_ACL
-//	&btrfs_xattr_acl_access_handler,
-//	&btrfs_xattr_acl_default_handler,
+	&btrfs_xattr_acl_access_handler,
+	&btrfs_xattr_acl_default_handler,
 #endif
 	&btrfs_xattr_trusted_handler,
 	&btrfs_xattr_security_handler,
-//	&btrfs_xattr_system_handler,
+	&btrfs_xattr_system_handler,
 	NULL,
 };
 
@@ -237,10 +239,14 @@ int btrfs_xattr_set(struct inode *inode, int name_index,
 			mod = 1;
 			goto out;
 		}
-	} else if (flags & XATTR_REPLACE) {
-		/* we couldn't find the attr to replace, so error out */
-		ret = -ENODATA;
-		goto out;
+	} else {
+		btrfs_release_path(root, path);
+
+		if (flags & XATTR_REPLACE) {
+			/* we couldn't find the attr to replace */
+			ret = -ENODATA;
+			goto out;
+		}
 	}
 
 	/* ok we have to create a completely new xattr */
-- 
cgit v1.2.3


From 7b1287662304c3cb05cb38f5e3e2d69f386e8f10 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:17:14 -0400
Subject: Btrfs: Create orphan inode records to prevent lost files after a
 crash

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   2 +-
 fs/btrfs/btrfs_inode.h |   3 +
 fs/btrfs/ctree.c       |   2 +-
 fs/btrfs/ctree.h       |  14 +++
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/inode.c       | 237 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/orphan.c      |  67 ++++++++++++++
 7 files changed, 322 insertions(+), 5 deletions(-)
 create mode 100644 fs/btrfs/orphan.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index f8a38b0da7f..5a0fd7b0e3e 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 9f2a4ef944a..df624fd735c 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -39,6 +39,9 @@ struct btrfs_inode {
 	struct posix_acl *i_acl;
 	struct posix_acl *i_default_acl;
 
+	/* for keeping track of orphaned inodes */
+	struct list_head i_orphan;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cdc713062b0..f2a94999c37 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2622,7 +2622,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		total_data += data_size[i];
 	}
 
-	total_size = total_data + (nr - 1) * sizeof(struct btrfs_item);
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0) {
 		return -EEXIST;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f87d7263f2d..012ad529cb1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -74,6 +74,9 @@ struct btrfs_ordered_sum;
 /* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
+/* orphan objectid for tracking unlinked/truncated files */
+#define BTRFS_ORPHAN_OBJECTID -5ULL
+
 /*
  * All files have objectids higher than this.
  */
@@ -646,6 +649,9 @@ struct btrfs_root {
 
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
+
+	spinlock_t orphan_lock;
+	struct list_head orphan_list;
 };
 
 /*
@@ -657,6 +663,7 @@ struct btrfs_root {
 #define BTRFS_INODE_ITEM_KEY		1
 #define BTRFS_INODE_REF_KEY		2
 #define BTRFS_XATTR_ITEM_KEY		8
+#define BTRFS_ORPHAN_ITEM_KEY		9
 /* reserve 2-15 close to the inode for later flexibility */
 
 /*
@@ -1560,6 +1567,13 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path, u64 dir,
 					  const char *name, u16 name_len,
 					  int mod);
+
+/* orphan.c */
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset);
+
 /* inode-map.c */
 int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *fs_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec01062eb41..d6092396734 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -732,7 +732,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->in_sysfs = 0;
 
 	INIT_LIST_HEAD(&root->dirty_list);
+	INIT_LIST_HEAD(&root->orphan_list);
 	spin_lock_init(&root->node_lock);
+	spin_lock_init(&root->orphan_lock);
 	mutex_init(&root->objectid_mutex);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2d8853543a7..0c9ec8aa304 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -78,6 +78,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
+static void btrfs_truncate(struct inode *inode);
+
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
@@ -826,6 +828,190 @@ zeroit:
 	return -EIO;
 }
 
+/*
+ * This creates an orphan entry for the given inode in case something goes
+ * wrong in the middle of an unlink/truncate.
+ */
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	spin_lock(&root->orphan_lock);
+
+	/* already on the orphan list, we're good */
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+
+	spin_unlock(&root->orphan_lock);
+
+	/*
+	 * insert an orphan item to track this unlinked/truncated file
+	 */
+	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+
+	return ret;
+}
+
+/*
+ * We have done the truncate/delete so we can go ahead and remove the orphan
+ * item for this particular inode.
+ */
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	spin_lock(&root->orphan_lock);
+
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	list_del_init(&BTRFS_I(inode)->i_orphan);
+	if (!trans) {
+		spin_unlock(&root->orphan_lock);
+		return 0;
+	}
+
+	spin_unlock(&root->orphan_lock);
+
+	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+
+	return ret;
+}
+
+/*
+ * this cleans up any orphans that may be left on the list from the last use
+ * of this root.
+ */
+void btrfs_orphan_cleanup(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	struct btrfs_key key, found_key;
+	struct btrfs_trans_handle *trans;
+	struct inode *inode;
+	int ret = 0, nr_unlink = 0, nr_truncate = 0;
+
+	/* don't do orphan cleanup if the fs is readonly. */
+	if (root->inode->i_sb->s_flags & MS_RDONLY)
+		return;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+	path->reada = -1;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, root->inode);
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			printk(KERN_ERR "Error searching slot for orphan: %d"
+			       "\n", ret);
+			break;
+		}
+
+		/*
+		 * ret == 0 means we found the exact key we searched for, which
+		 * is weird but possible.  Only step the path back a slot when
+		 * the key was not found, then check whether that item matches.
+		 */
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		/* pull out the item */
+		leaf = path->nodes[0];
+		item = btrfs_item_nr(leaf, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		/* make sure the item matches what we want */
+		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+			break;
+
+		/* release the path since we're done with it */
+		btrfs_release_path(root, path);
+
+		/*
+		 * this is where we are basically btrfs_lookup, without the
+		 * crossing root thing.  we store the inode number in the
+		 * offset of the orphan item.
+		 */
+		inode = btrfs_iget_locked(root->inode->i_sb,
+					  found_key.offset, root);
+		if (!inode)
+			break;
+
+		if (inode->i_state & I_NEW) {
+			BTRFS_I(inode)->root = root;
+
+			/* have to set the location manually */
+			BTRFS_I(inode)->location.objectid = inode->i_ino;
+			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+			BTRFS_I(inode)->location.offset = 0;
+
+			btrfs_read_locked_inode(inode);
+			unlock_new_inode(inode);
+		}
+
+		/*
+		 * add this inode to the orphan list so btrfs_orphan_del does
+		 * the proper thing when we hit it
+		 */
+		spin_lock(&root->orphan_lock);
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+		spin_unlock(&root->orphan_lock);
+
+		/*
+		 * if this is a bad inode, it means we actually succeeded in
+		 * removing the inode, but not the orphan record, which means
+		 * we need to manually delete the orphan since iput will just
+		 * do a destroy_inode
+		 */
+		if (is_bad_inode(inode)) {
+			btrfs_orphan_del(trans, inode);
+			iput(inode);
+			continue;
+		}
+
+		/* if we have links, this was a truncate, let's do that */
+		if (inode->i_nlink) {
+			nr_truncate++;
+			btrfs_truncate(inode);
+		} else {
+			nr_unlink++;
+		}
+
+		/* this will do delete_inode and everything for us */
+		iput(inode);
+	}
+
+	if (nr_unlink)
+		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+	if (nr_truncate)
+		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+
+	btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+}
+
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -1067,6 +1253,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
+	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -1080,6 +1267,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	btrfs_set_trans_block_group(trans, dir);
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+
+	if (inode->i_nlink == 0)
+		ret = btrfs_orphan_add(trans, inode);
+
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
@@ -1108,12 +1299,17 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
+	err = btrfs_orphan_add(trans, inode);
+	if (err)
+		goto fail_trans;
+
 	/* now the directory is empty */
 	err = btrfs_unlink_trans(trans, root, dir, dentry);
 	if (!err) {
 		btrfs_i_size_write(inode, 0);
 	}
 
+fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 fail:
@@ -1131,6 +1327,9 @@ fail:
  *
  * csum items that cross the new i_size are truncated to the new size
  * as well.
+ *
+ * min_type is the minimum key type to truncate down to.  If set to 0, this
+ * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
@@ -1495,6 +1694,7 @@ void btrfs_delete_inode(struct inode *inode)
 
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
+		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
@@ -1504,8 +1704,12 @@ void btrfs_delete_inode(struct inode *inode)
 
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_in_trans(trans, root, inode, 0);
-	if (ret)
+	if (ret) {
+		btrfs_orphan_del(NULL, inode);
 		goto no_delete_lock;
+	}
+
+	btrfs_orphan_del(trans, inode);
 
 	nr = trans->blocks_used;
 	clear_inode(inode);
@@ -1688,7 +1892,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret;
+	int ret, do_orphan = 0;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1706,6 +1910,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
+
 		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
 					  sub_root);
 		if (!inode)
@@ -1715,6 +1920,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			if (sub_root != root) {
 				igrab(inode);
 				sub_root->inode = inode;
+				do_orphan = 1;
 			}
 			BTRFS_I(inode)->root = sub_root;
 			memcpy(&BTRFS_I(inode)->location, &location,
@@ -1723,6 +1929,10 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			unlock_new_inode(inode);
 		}
 	}
+
+	if (unlikely(do_orphan))
+		btrfs_orphan_cleanup(sub_root);
+
 	return d_splice_alias(inode, dentry);
 }
 
@@ -2964,12 +3174,19 @@ static void btrfs_truncate(struct inode *inode)
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_i_size_write(inode, inode->i_size);
 
+	ret = btrfs_orphan_add(trans, inode);
+	if (ret)
+		goto out;
 	/* FIXME, add redo link to tree so we don't leak on crash */
 	ret = btrfs_truncate_in_trans(trans, root, inode,
 				      BTRFS_EXTENT_DATA_KEY);
 	btrfs_update_inode(trans, root, inode);
-	nr = trans->blocks_used;
 
+	ret = btrfs_orphan_del(trans, inode);
+	BUG_ON(ret);
+
+out:
+	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
 	BUG_ON(ret);
 	btrfs_btree_balance_dirty(root, nr);
@@ -3046,6 +3263,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
+	INIT_LIST_HEAD(&ei->i_orphan);
 	return &ei->vfs_inode;
 }
 
@@ -3062,6 +3280,14 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
+	spin_lock(&BTRFS_I(inode)->root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+		       " list\n", inode->i_ino);
+		dump_stack();
+	}
+	spin_unlock(&BTRFS_I(inode)->root->orphan_lock);
+
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
@@ -3202,6 +3428,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
 		if (ret)
 			goto out_fail;
+		if (new_inode->i_nlink == 0) {
+			ret = btrfs_orphan_add(trans, new_inode);
+			if (ret)
+				goto out_fail;
+		}
 	}
 	ret = btrfs_set_inode_index(new_dir, old_inode);
 	if (ret)
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
new file mode 100644
index 00000000000..3c0d52af4f8
--- /dev/null
+++ b/fs/btrfs/orphan.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret = 0;
+
+	key.objectid = BTRFS_ORPHAN_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = offset;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
-- 
cgit v1.2.3


From 8e8a1e31f2780b7865d40a8c5142a04f2bcdcb86 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 24 Jul 2008 12:17:14 -0400
Subject: Btrfs: Fix a few functions that exit without stopping their
 transaction

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 +++-
 fs/btrfs/ioctl.c       | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f92b297e7da..e6a3ba9dd10 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3275,8 +3275,10 @@ next:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
 		ret = -EIO;
-	if (ret < 0)
+	if (ret < 0) {
+		btrfs_end_transaction(trans, root);
 		goto out;
+	}
 
 	clear_extent_bits(&info->block_group_cache, key.objectid,
 			  key.objectid + key.offset - 1,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index faf081302d0..7d40778a90e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -78,8 +78,10 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      objectid, trans->transid, 0, 0,
 				      0, 0);
-	if (IS_ERR(leaf))
-		return PTR_ERR(leaf);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		goto fail;
+	}
 
 	btrfs_set_header_nritems(leaf, 0);
 	btrfs_set_header_level(leaf, 0);
-- 
cgit v1.2.3


From 45467261edb590fd5a2d676d9686edb82762fb3f Mon Sep 17 00:00:00 2001
From: Balaji Rao <balajirrao@gmail.com>
Date: Thu, 24 Jul 2008 12:17:15 -0400
Subject: Btrfs: Remove unused variable in fixup_tree_root_location

Remove an unused variable 'path' in fixup_tree_root_location.

Signed-off-by: Balaji Rao <balajirrao@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0c9ec8aa304..8bc981c4f5e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1805,7 +1805,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 			     struct btrfs_root **sub_root,
 			     struct dentry *dentry)
 {
-	struct btrfs_path *path;
 	struct btrfs_root_item *ri;
 
 	if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
@@ -1813,9 +1812,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
 		return 0;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
 	*sub_root = btrfs_read_fs_root(root->fs_info, location,
 					dentry->d_name.name,
 					dentry->d_name.len);
@@ -1827,7 +1823,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 	location->offset = 0;
 
-	btrfs_free_path(path);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 974e35a82d022434efa50fe5f805ae14c542ba8c Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 24 Jul 2008 12:18:16 -0400
Subject: Btrfs: Properly release lock in pin_down_bytes

When buffer isn't uptodate, pin_down_bytes may leave the tree locked
after it returns.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e6a3ba9dd10..cdfb4ff4b45 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1485,8 +1485,8 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (btrfs_try_tree_lock(buf) &&
-			    btrfs_buffer_uptodate(buf, 0)) {
+			if (btrfs_buffer_uptodate(buf, 0) &&
+			    btrfs_try_tree_lock(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
-- 
cgit v1.2.3


From 445dceb78f3445b9bcade90e93ca35cae6120172 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 24 Jul 2008 12:19:32 -0400
Subject: Btrfs: Fix .. lookup corner case

Inode ref item can be in the next leaf when we find "path->slots[0] ==
btrfs_header_nritems(...)".

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bc981c4f5e..cf9534b79ab 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1752,21 +1752,18 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	if (namelen == 2 && strcmp(name, "..") == 0) {
 		struct btrfs_key key;
 		struct extent_buffer *leaf;
-		u32 nritems;
 		int slot;
 
 		key.objectid = dir->i_ino;
+		key.offset = (u64)-1;
 		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-		key.offset = 0;
+		if (ret < 0 || path->slots[0] == 0)
+			goto out_err;
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		BUG_ON(ret == 0);
 		ret = 0;
-
 		leaf = path->nodes[0];
-		slot = path->slots[0];
-		nritems = btrfs_header_nritems(leaf);
-		if (slot >= nritems)
-			goto out_err;
+		slot = path->slots[0] - 1;
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid != dir->i_ino ||
@@ -1980,16 +1977,15 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	/* special case for .., just use the back ref */
 	if (filp->f_pos == 1) {
 		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-		key.offset = 0;
+		key.offset = (u64)-1;
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		BUG_ON(ret == 0);
-		leaf = path->nodes[0];
-		slot = path->slots[0];
-		nritems = btrfs_header_nritems(leaf);
-		if (slot >= nritems) {
+		if (ret < 0 || path->slots[0] == 0) {
 			btrfs_release_path(root, path);
 			goto read_dir_items;
 		}
+		BUG_ON(ret == 0);
+		leaf = path->nodes[0];
+		slot = path->slots[0] - 1;
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 		btrfs_release_path(root, path);
 		if (found_key.objectid != key.objectid ||
-- 
cgit v1.2.3


From 9652480bf48500885a30754b4a5c436b5b34456d Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Thu, 24 Jul 2008 12:19:49 -0400
Subject: Fix path slots selection in btrfs_search_forward

We should decrease the found slot by one, as btrfs_search_slot does,
when bin_search returns 1 and the node level is > 0.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f2a94999c37..ab4ac0365c7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2992,6 +2992,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	struct extent_buffer *cur;
 	struct btrfs_key found_key;
 	int slot;
+	int sret;
 	u32 nritems;
 	int level;
 	int ret = 1;
@@ -3009,7 +3010,7 @@ again:
 	while(1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
-		bin_search(cur, min_key, level, &slot);
+		sret = bin_search(cur, min_key, level, &slot);
 
 		/* at level = 0, we're done, setup the path and exit */
 		if (level == 0) {
@@ -3018,6 +3019,8 @@ again:
 			btrfs_item_key_to_cpu(cur, &found_key, slot);
 			goto out;
 		}
+		if (sret && slot > 0)
+			slot--;
 		/*
 		 * check this node pointer against the cache_only and
 		 * min_trans parameters.  If it isn't in cache or is too
-- 
cgit v1.2.3


From 5516e5957f4b99b19fffffa53bf9fbe7cc793249 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Thu, 24 Jul 2008 12:20:14 -0400
Subject: Btrfs: Null terminate strings passed in from userspace

The 'char name[BTRFS_PATH_NAME_MAX]' member of struct btrfs_ioctl_vol_args
is passed directly to strlen() after being copied from user. I haven't
verified this, but in theory a userspace program could pass in an
unterminated string and cause a kernel crash as strlen walks off the end of
the array.

This patch terminates the ->name string in all btrfs ioctl functions which
currently use a 'struct btrfs_ioctl_vol_args'. Since the string is now
properly terminated, its length will never be longer than
BTRFS_PATH_NAME_MAX so that error check has been removed.

By the way, it might be better overall to just have the ioctl pass an
unterminated string + length structure but I didn't bother with that since
it'd change the kernel/user interface.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7d40778a90e..5e627746c4e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -310,11 +310,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		ret = -EFAULT;
 		goto out;
 	}
+
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
-	if (namelen > BTRFS_VOL_NAME_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
@@ -412,11 +410,8 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 		goto out;
 	}
 
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	namelen = strlen(vol_args->name);
-	if (namelen > BTRFS_VOL_NAME_MAX) {
-		ret = -EINVAL;
-		goto out;
-	}
 	if (strchr(vol_args->name, '/')) {
 		ret = -EINVAL;
 		goto out;
@@ -487,6 +482,7 @@ long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 		ret = -EFAULT;
 		goto out;
 	}
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_init_new_device(root, vol_args->name);
 
 out:
@@ -508,6 +504,7 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 		ret = -EFAULT;
 		goto out;
 	}
+	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_rm_device(root, vol_args->name);
 
 out:
-- 
cgit v1.2.3


From 3a115f520f391b4ab14041bdd6eedb370d944fa6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 24 Jul 2008 12:25:50 -0400
Subject: Btrfs: Rev the disk format magic

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 012ad529cb1..07d321552db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,7 +40,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B5RfS_M"
+#define BTRFS_MAGIC "_B6RfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 31153d81284934601d08110ac7698fd9a535e4c0 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 28 Jul 2008 15:32:19 -0400
Subject: Btrfs: Add a leaf reference cache

Much of the IO done while dropping snapshots is done looking up
leaves in the filesystem trees to see if they point to any extents and
to drop the references on any extents found.

This creates a cache so that IO isn't required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |   3 +-
 fs/btrfs/ctree.c       |   4 +-
 fs/btrfs/ctree.h       |   8 +-
 fs/btrfs/disk-io.c     |  14 +++
 fs/btrfs/extent-tree.c | 115 ++++++++++++++++++++++---
 fs/btrfs/ref-cache.c   | 226 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ref-cache.h   |  72 ++++++++++++++++
 fs/btrfs/transaction.c |  67 +++++++++++----
 8 files changed, 476 insertions(+), 33 deletions(-)
 create mode 100644 fs/btrfs/ref-cache.c
 create mode 100644 fs/btrfs/ref-cache.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 5a0fd7b0e3e..a4b38177abd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o
+	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
+	   ref-cache.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ab4ac0365c7..245eb00435d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -165,7 +165,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf);
+	ret = btrfs_inc_ref(trans, new_root, buf, 0);
 	kfree(new_root);
 
 	if (ret)
@@ -232,7 +232,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf);
+		ret = btrfs_inc_ref(trans, root, buf, 1);
 		if (ret)
 			return ret;
 	} else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 07d321552db..34ed23d64eb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -592,6 +592,10 @@ struct btrfs_fs_info {
 	u64 last_alloc;
 	u64 last_data_alloc;
 
+	spinlock_t ref_cache_lock;
+	u64 total_ref_cache_size;
+	u64 running_ref_cache_size;
+
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
@@ -613,6 +617,8 @@ struct btrfs_root {
 	spinlock_t node_lock;
 
 	struct extent_buffer *commit_root;
+	struct btrfs_leaf_ref_tree *ref_tree;
+
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
@@ -1430,7 +1436,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf);
+		  struct extent_buffer *buf, int cache_ref);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 bytenr, u64 num_bytes,
 		      u64 root_objectid, u64 ref_generation,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d6092396734..4f0e1d06c38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -716,6 +716,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->node = NULL;
 	root->inode = NULL;
 	root->commit_root = NULL;
+	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -1165,12 +1166,19 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
+		printk("btrfs: total reference cache size %Lu\n",
+			root->fs_info->total_ref_cache_size);
+
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
 			mutex_unlock(&root->fs_info->trans_mutex);
 			goto sleep;
 		}
+
+		printk("btrfs: running reference cache size %Lu\n",
+			root->fs_info->running_ref_cache_size);
+
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
@@ -1233,6 +1241,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
+	spin_lock_init(&fs_info->ref_cache_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
@@ -1699,6 +1708,11 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
 	}
+	if (fs_info->total_ref_cache_size) {
+		printk("btrfs: at umount reference cache size %Lu\n",
+			fs_info->total_ref_cache_size);
+	}
+	
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cdfb4ff4b45..7b24f151165 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -26,6 +26,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
@@ -927,7 +928,7 @@ out:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf)
+		  struct extent_buffer *buf, int cache_ref)
 {
 	u64 bytenr;
 	u32 nritems;
@@ -937,6 +938,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret;
 	int faili;
+	int nr_file_extents = 0;
 
 	if (!root->ref_cows)
 		return 0;
@@ -959,6 +961,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			if (disk_bytenr == 0)
 				continue;
 
+			if (buf != root->commit_root)
+				nr_file_extents++;
+
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
 				    btrfs_file_extent_disk_num_bytes(buf, fi),
@@ -988,6 +993,53 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		}
 	}
+	/* cache original leaf block's references */
+	if (level == 0 && cache_ref && buf != root->commit_root) {
+		struct btrfs_leaf_ref *ref;
+		struct btrfs_extent_info *info;
+
+		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		if (!ref) {
+			WARN_ON(1);
+			goto out;
+		}
+
+		btrfs_item_key_to_cpu(buf, &ref->key, 0);
+
+		ref->bytenr = buf->start;
+		ref->owner = btrfs_header_owner(buf);
+		ref->generation = btrfs_header_generation(buf);
+		ref->nritems = nr_file_extents;
+		info = ref->extents;
+		
+		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
+			u64 disk_bytenr;
+			btrfs_item_key_to_cpu(buf, &key, i);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, i,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (disk_bytenr == 0)
+				continue;
+
+			info->bytenr = disk_bytenr;
+			info->num_bytes =
+				btrfs_file_extent_disk_num_bytes(buf, fi);
+			info->objectid = key.objectid;
+			info->offset = key.offset;
+			info++;
+		}
+
+		BUG_ON(!root->ref_tree);
+		ret = btrfs_add_leaf_ref(root, ref);
+		WARN_ON(ret);
+		btrfs_free_leaf_ref(ref);
+	}
+out:
 	return 0;
 fail:
 	WARN_ON(1);
@@ -2215,9 +2267,9 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  struct extent_buffer *leaf)
+static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
+				  	   struct btrfs_root *root,
+					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2266,6 +2318,30 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
+				  	 struct btrfs_root *root,
+					 struct btrfs_leaf_ref *ref)
+{ /* Free every extent recorded in the cached leaf ref @ref; always returns 0. */
+	int i;
+	int ret;
+	struct btrfs_extent_info *info = ref->extents;
+
+	mutex_unlock(&root->fs_info->alloc_mutex); /* presumably held by the caller on entry; re-taken before return */
+	for (i = 0; i < ref->nritems; i++) {
+		mutex_lock(&root->fs_info->alloc_mutex); /* held only around each individual free */
+		ret = __btrfs_free_extent(trans, root,
+					info->bytenr, info->num_bytes,
+					ref->owner, ref->generation,
+					info->objectid, info->offset, 0);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret); /* a failed free is treated as fatal */
+		info++;
+	}
+	mutex_lock(&root->fs_info->alloc_mutex);
+
+	return 0;
+}
+
 static void noinline reada_walk_down(struct btrfs_root *root,
 				     struct extent_buffer *node,
 				     int slot)
@@ -2341,6 +2417,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
+	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -2370,7 +2447,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref(trans, root, cur);
+			ret = drop_leaf_ref_no_cache(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2391,6 +2468,21 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			continue;
 		}
+		
+		if (*level == 1) {
+			struct btrfs_key key;
+			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
+			ref = btrfs_lookup_leaf_ref(root, &key);
+			if (ref) {
+				ret = drop_leaf_ref(trans, root, ref);
+				BUG_ON(ret);
+				btrfs_remove_leaf_ref(root, ref);
+				btrfs_free_leaf_ref(ref);
+				*level = 0;
+				break;
+			}
+		}
+
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
@@ -2398,7 +2490,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 			if (path->slots[*level] == 0)
 				reada_walk_down(root, cur, path->slots[*level]);
-
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			cond_resched();
@@ -2435,17 +2526,19 @@ out:
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
 	if (path->nodes[*level] == root->node) {
-		root_owner = root->root_key.objectid;
 		parent = path->nodes[*level];
+		bytenr = path->nodes[*level]->start;
 	} else {
 		parent = path->nodes[*level + 1];
-		root_owner = btrfs_header_owner(parent);
+		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
 	}
 
+	blocksize = btrfs_level_size(root, *level);
+	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
-	ret = __btrfs_free_extent(trans, root, path->nodes[*level]->start,
-				path->nodes[*level]->len,
-				root_owner, root_gen, 0, 0, 1);
+
+	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
new file mode 100644
index 00000000000..95a9faeb9dc
--- /dev/null
+++ b/fs/btrfs/ref-cache.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "ref-cache.h"
+#include "transaction.h"
+
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+{ /* Allocate a leaf ref with room for @nr_extents extents; returns NULL on failure. */
+	struct btrfs_leaf_ref *ref;
+
+	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	if (ref) {
+		memset(ref, 0, sizeof(*ref)); /* clears the fixed part only; extents[] is left uninitialized */
+		atomic_set(&ref->usage, 1); /* initial reference belongs to the caller */
+	}
+	return ref;
+}
+
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+{ /* Drop one reference on @ref and free it when the count reaches zero; NULL is a no-op. */
+	if (!ref)
+		return;
+	WARN_ON(atomic_read(&ref->usage) == 0); /* count already zero: reference imbalance */
+	if (atomic_dec_and_test(&ref->usage)) {
+		BUG_ON(ref->in_tree); /* must be unlinked from the ref tree before the last put */
+		kfree(ref);
+	}
+}
+
+static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{ /* Three-way compare by objectid, then type, then offset; returns 1, -1 or 0. */
+	if (k1->objectid > k2->objectid)
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->type > k2->type)
+		return 1;
+	if (k1->type < k2->type)
+		return -1;
+	if (k1->offset > k2->offset)
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0; /* all three fields are equal */
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+				   struct rb_node *node)
+{ /* Link @node into @root under @key; returns NULL on success or the existing node with an equal key. */
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else if (ret > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent; /* duplicate key: nothing is inserted */
+	}
+	
+	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
+	entry->in_tree = 1; /* marks the ref as linked; cleared again on removal */
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+{ /* Return the node whose key compares equal to @key, or NULL if there is none. */
+	struct rb_node * n = root->rb_node;
+	struct btrfs_leaf_ref *entry;
+	int ret;
+
+	while(n) {
+		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		ret = comp_keys(key, &entry->key);
+		if (ret < 0)
+			n = n->rb_left;
+		else if (ret > 0)
+			n = n->rb_right;
+		else
+			return n; /* exact match */
+	}
+	return NULL;
+}
+
+int btrfs_remove_leaf_refs(struct btrfs_root *root)
+{ /* Unlink every leaf ref from @root's ref tree and drop a reference on each; always returns 0. */
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return 0;
+
+	spin_lock(&tree->lock);
+	while(!btrfs_leaf_ref_tree_empty(tree)) {
+		tree->last = NULL; /* the cached entry must not outlive the refs removed here */
+		rb = rb_first(&tree->root);
+		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+		rb_erase(&ref->rb_node, &tree->root);
+		ref->in_tree = 0;
+
+		spin_unlock(&tree->lock); /* lock is dropped around the put and the reschedule point */
+
+		btrfs_free_leaf_ref(ref);
+
+		cond_resched();
+		spin_lock(&tree->lock);
+	}
+	spin_unlock(&tree->lock);
+	return 0;
+}
+
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key)
+{ /* Find the leaf ref for @key in @root's ref tree; a hit is returned with its usage count raised. */
+	struct rb_node *rb;
+	struct btrfs_leaf_ref *ref = NULL;
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+
+	if (!tree)
+		return NULL;
+
+	spin_lock(&tree->lock);
+	if (tree->last && comp_keys(key, &tree->last->key) == 0) {
+		ref = tree->last; /* hit on the cached entry, no tree walk needed */
+	} else {
+		rb = tree_search(&tree->root, key);
+		if (rb) {
+			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+			tree->last = ref; /* remember the hit for the next lookup */
+		}
+	}
+	if (ref)
+		atomic_inc(&ref->usage); /* caller drops this with btrfs_free_leaf_ref() */
+	spin_unlock(&tree->lock);
+	return ref;
+}
+
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{ /* Insert @ref into @root's ref tree; returns 0, or -EEXIST if its key is already present. */
+	int ret = 0;
+	struct rb_node *rb;
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree; /* dereferenced without a NULL check */
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	spin_lock(&tree->lock);
+	rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+	if (rb) {
+		ret = -EEXIST;
+	} else {
+		spin_lock(&root->fs_info->ref_cache_lock); /* nested inside tree->lock */
+		root->fs_info->total_ref_cache_size += size;
+		if (trans && tree->generation == trans->transid)
+			root->fs_info->running_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
+		tree->last = ref;
+		atomic_inc(&ref->usage); /* extra reference taken for the tree's link to @ref */
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+{ /* Unlink @ref from @root's ref tree, undo its size accounting and drop one reference; returns 0. */
+	size_t size = btrfs_leaf_ref_size(ref->nritems);
+	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_transaction *trans = root->fs_info->running_transaction;
+
+	BUG_ON(!ref->in_tree); /* checked before tree->lock is taken */
+	spin_lock(&tree->lock);
+	
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->total_ref_cache_size -= size;
+	if (trans && tree->generation == trans->transid)
+		root->fs_info->running_ref_cache_size -= size;
+	spin_unlock(&root->fs_info->ref_cache_lock);
+
+	if (tree->last == ref) { /* move the cached entry off the ref being removed */
+		struct rb_node *next = rb_next(&ref->rb_node);
+		if (next) {
+			tree->last = rb_entry(next, struct btrfs_leaf_ref,
+					      rb_node);
+		} else
+			tree->last = NULL;
+	}
+
+	rb_erase(&ref->rb_node, &tree->root);
+	ref->in_tree = 0;
+
+	spin_unlock(&tree->lock);
+
+	btrfs_free_leaf_ref(ref); /* pairs with the atomic_inc() in btrfs_add_leaf_ref() */
+	return 0;
+}
+
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
new file mode 100644
index 00000000000..79ecc47110f
--- /dev/null
+++ b/fs/btrfs/ref-cache.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+struct btrfs_extent_info {
+	u64 bytenr; /* disk byte number of the file extent */
+	u64 num_bytes; /* on-disk size of the file extent */
+	u64 objectid; /* objectid of the file extent item's key */
+	u64 offset; /* offset of the file extent item's key */
+};
+
+struct btrfs_leaf_ref {
+	struct rb_node rb_node; /* links this ref into btrfs_leaf_ref_tree.root */
+	struct btrfs_key key; /* key the rb-tree is ordered and searched by */
+	int in_tree; /* non-zero while linked into a ref tree */
+	atomic_t usage; /* reference count; the ref is freed when it reaches zero */
+
+	u64 bytenr; /* start of the leaf block this ref describes */
+	u64 owner; /* owner taken from the leaf's header */
+	u64 generation; /* generation taken from the leaf's header */
+	int nritems; /* number of entries in extents[] */
+	struct btrfs_extent_info extents[]; /* one entry per cached file extent */
+};
+
+struct btrfs_leaf_ref_tree {
+	struct rb_root root; /* leaf refs ordered by key */
+	struct btrfs_leaf_ref *last; /* one-entry cache checked first by lookups; may be NULL */
+	u64 generation; /* compared with the running transaction's transid for size accounting */
+	spinlock_t lock; /* taken around accesses to root and last */
+};
+
+static inline size_t btrfs_leaf_ref_size(int nr_extents)
+{ /* Bytes needed for a leaf ref header followed by @nr_extents extent entries. */
+	return sizeof(struct btrfs_leaf_ref) + 
+	       sizeof(struct btrfs_extent_info) * nr_extents;
+}
+
+static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
+{ /* Reset @tree to an empty tree with no cached entry, generation 0 and a fresh lock. */
+	tree->root.rb_node = NULL;
+	tree->last = NULL;
+	tree->generation = 0; /* callers set the real generation afterwards if they need one */
+	spin_lock_init(&tree->lock);
+}
+
+static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
+{ /* Non-zero when @tree holds no leaf refs. */
+	return RB_EMPTY_ROOT(&tree->root);
+}
+
+void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
+					     struct btrfs_key *key);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+int btrfs_remove_leaf_refs(struct btrfs_root *root);
+int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 768b0d223e6..543e5ee4033 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -24,6 +24,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
@@ -31,6 +32,13 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
+struct dirty_root {
+	struct list_head list;
+	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
+	struct btrfs_leaf_ref_tree ref_tree; /* leaf ref cache; the owning dirty_root is recovered from it via container_of() */
+};
+
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(transaction->use_count == 0);
@@ -84,6 +92,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 
 static noinline int record_root_in_trans(struct btrfs_root *root)
 {
+	struct dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
 		WARN_ON(root == root->fs_info->extent_root);
@@ -91,7 +100,25 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
 				   (unsigned long)root->root_key.objectid,
 				   BTRFS_ROOT_TRANS_TAG);
+
+			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
+			BUG_ON(!dirty);
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			dirty->latest_root = root;
+			INIT_LIST_HEAD(&dirty->list);
+			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
+			dirty->ref_tree.generation = running_trans_id;
+
 			root->commit_root = btrfs_root_node(root);
+			root->ref_tree = &dirty->ref_tree;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			spin_lock_init(&dirty->root->node_lock);
+			mutex_init(&dirty->root->objectid_mutex);
+			dirty->root->node = root->commit_root;
+			dirty->root->commit_root = NULL;
 		} else {
 			WARN_ON(1);
 		}
@@ -310,12 +337,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-struct dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 int btrfs_add_dead_root(struct btrfs_root *root,
 			struct btrfs_root *latest,
 			struct list_head *dead_list)
@@ -325,8 +346,10 @@ int btrfs_add_dead_root(struct btrfs_root *root,
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
+	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
+	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -354,11 +377,23 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			radix_tree_tag_clear(radix,
 				     (unsigned long)root->root_key.objectid,
 				     BTRFS_ROOT_TRANS_TAG);
+
+			BUG_ON(!root->ref_tree);
+			dirty = container_of(root->ref_tree, struct dirty_root,
+					     ref_tree);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
+
+				BUG_ON(!btrfs_leaf_ref_tree_empty(
+							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
+				root->ref_tree = NULL;
+				
+				kfree(dirty->root);
+				kfree(dirty);
 
 				/* make sure to update the root on disk
 				 * so we get any updates to the block used
@@ -370,23 +405,12 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 						&root->root_item);
 				continue;
 			}
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);
 
 			memset(&root->root_item.drop_progress, 0,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
-
-			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->node = root->commit_root;
-			dirty->latest_root = root;
-			spin_lock_init(&dirty->root->node_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-
 			root->commit_root = NULL;
-
+			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -409,6 +433,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				list_add(&dirty->list, list);
 			} else {
 				WARN_ON(1);
+				free_extent_buffer(dirty->root->node);
 				kfree(dirty->root);
 				kfree(dirty);
 			}
@@ -514,6 +539,8 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		btrfs_remove_leaf_refs(dirty->root);
+
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -698,6 +725,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
 
+	spin_lock(&root->fs_info->ref_cache_lock);
+	root->fs_info->running_ref_cache_size = 0;
+	spin_unlock(&root->fs_info->ref_cache_lock);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
-- 
cgit v1.2.3


From 017e5369eb353559d68a11d4a718faa634533821 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Jul 2008 15:32:51 -0400
Subject: Btrfs: Leaf reference cache update

This changes the reference cache to make a single cache per root
instead of one cache per transaction, and to key by the byte number
of the disk block instead of the keys inside.

This makes it much less likely to have cache misses if a snapshot
or something has an extra reference on a higher node or a leaf while
the first transaction that added the leaf into the cache is dropping.

Some throttling is added to functions that free blocks heavily so they
wait for old transactions to drop.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 11 +++++++-
 fs/btrfs/disk-io.c     |  8 +++---
 fs/btrfs/extent-tree.c | 18 +++++--------
 fs/btrfs/file.c        |  2 +-
 fs/btrfs/inode.c       |  8 +++---
 fs/btrfs/ref-cache.c   | 71 +++++++++++---------------------------------------
 fs/btrfs/ref-cache.h   | 18 ++++++-------
 fs/btrfs/transaction.c | 34 +++++++++---------------
 8 files changed, 61 insertions(+), 109 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 34ed23d64eb..4eca0aa1ce7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -594,7 +594,6 @@ struct btrfs_fs_info {
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
-	u64 running_ref_cache_size;
 
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
@@ -606,10 +605,18 @@ struct btrfs_fs_info {
 	void *bdev_holder;
 };
 
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct btrfs_leaf_ref *last;
+	struct list_head list;
+	spinlock_t lock;
+};
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
+struct dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -618,6 +625,8 @@ struct btrfs_root {
 
 	struct extent_buffer *commit_root;
 	struct btrfs_leaf_ref_tree *ref_tree;
+	struct btrfs_leaf_ref_tree ref_tree_struct;
+	struct dirty_root *dirty_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4f0e1d06c38..eccdf13a95a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -40,6 +40,7 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
+#include "ref-cache.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -737,6 +738,10 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->orphan_lock);
 	mutex_init(&root->objectid_mutex);
+
+	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
+	root->ref_tree = &root->ref_tree_struct;
+
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1176,9 +1181,6 @@ static int transaction_kthread(void *arg)
 			goto sleep;
 		}
 
-		printk("btrfs: running reference cache size %Lu\n",
-			root->fs_info->running_ref_cache_size);
-
 		now = get_seconds();
 		if (now < cur->start_time || now - cur->start_time < 30) {
 			mutex_unlock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7b24f151165..0e294cfaa60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1004,8 +1004,6 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			goto out;
 		}
 
-		btrfs_item_key_to_cpu(buf, &ref->key, 0);
-
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
@@ -2387,19 +2385,15 @@ static void noinline reada_walk_down(struct btrfs_root *root,
 	}
 }
 
-/*
- * we want to avoid as much random IO as we can with the alloc mutex
- * held, so drop the lock and do the lookup, then do it again with the
- * lock held.
- */
 int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
+	int ret;
 	mutex_unlock(&root->fs_info->alloc_mutex);
-	lookup_extent_ref(NULL, root, start, len, refs);
+	ret = lookup_extent_ref(NULL, root, start, len, refs);
 	cond_resched();
 	mutex_lock(&root->fs_info->alloc_mutex);
-	return lookup_extent_ref(NULL, root, start, len, refs);
+	return ret;
 }
 
 /*
@@ -2468,11 +2462,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 			continue;
 		}
-		
+
 		if (*level == 1) {
 			struct btrfs_key key;
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
-			ref = btrfs_lookup_leaf_ref(root, &key);
+			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
 				ret = drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
@@ -2482,7 +2476,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				break;
 			}
 		}
-
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
@@ -2672,6 +2665,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = -EAGAIN;
 			break;
 		}
+		wake_up(&root->fs_info->transaction_throttle);
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e5ffb66ad32..3efec25e34b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -347,7 +347,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
-	err = btrfs_end_transaction_throttle(trans, root);
+	err = btrfs_end_transaction(trans, root);
 out_unlock:
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cf9534b79ab..4f977ea5497 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2482,7 +2482,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -2535,7 +2535,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -2609,7 +2609,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 out_fail:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 
 out_unlock:
 	if (drop_on_err)
@@ -3548,7 +3548,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 out_fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 95a9faeb9dc..ec9587784a3 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -29,6 +29,7 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
 	if (ref) {
 		memset(ref, 0, sizeof(*ref));
 		atomic_set(&ref->usage, 1);
+		INIT_LIST_HEAD(&ref->list);
 	}
 	return ref;
 }
@@ -44,40 +45,21 @@ void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
 	}
 }
 
-static int comp_keys(struct btrfs_key *k1, struct btrfs_key *k2)
-{
-	if (k1->objectid > k2->objectid)
-		return 1;
-	if (k1->objectid < k2->objectid)
-		return -1;
-	if (k1->type > k2->type)
-		return 1;
-	if (k1->type < k2->type)
-		return -1;
-	if (k1->offset > k2->offset)
-		return 1;
-	if (k1->offset < k2->offset)
-		return -1;
-	return 0;
-}
-
-static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
 	struct btrfs_leaf_ref *entry;
-	int ret;
 
 	while(*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
-		ret = comp_keys(key, &entry->key);
-		if (ret < 0)
+		if (bytenr < entry->bytenr)
 			p = &(*p)->rb_left;
-		else if (ret > 0)
+		else if (bytenr > entry->bytenr)
 			p = &(*p)->rb_right;
 		else
 			return parent;
@@ -90,20 +72,18 @@ static struct rb_node *tree_insert(struct rb_root *root, struct btrfs_key *key,
 	return NULL;
 }
 
-static struct rb_node *tree_search(struct rb_root *root, struct btrfs_key *key)
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 {
 	struct rb_node * n = root->rb_node;
 	struct btrfs_leaf_ref *entry;
-	int ret;
 
 	while(n) {
 		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
-		ret = comp_keys(key, &entry->key);
-		if (ret < 0)
+		if (bytenr < entry->bytenr)
 			n = n->rb_left;
-		else if (ret > 0)
+		else if (bytenr > entry->bytenr)
 			n = n->rb_right;
 		else
 			return n;
@@ -122,11 +102,11 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root)
 
 	spin_lock(&tree->lock);
 	while(!btrfs_leaf_ref_tree_empty(tree)) {
-		tree->last = NULL;
 		rb = rb_first(&tree->root);
 		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
 		rb_erase(&ref->rb_node, &tree->root);
 		ref->in_tree = 0;
+		list_del_init(&ref->list);
 
 		spin_unlock(&tree->lock);
 
@@ -140,7 +120,7 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root)
 }
 
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
-					     struct btrfs_key *key)
+					     u64 bytenr)
 {
 	struct rb_node *rb;
 	struct btrfs_leaf_ref *ref = NULL;
@@ -150,15 +130,9 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 		return NULL;
 
 	spin_lock(&tree->lock);
-	if (tree->last && comp_keys(key, &tree->last->key) == 0) {
-		ref = tree->last;
-	} else {
-		rb = tree_search(&tree->root, key);
-		if (rb) {
-			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
-			tree->last = ref;
-		}
-	}
+	rb = tree_search(&tree->root, bytenr);
+	if (rb)
+		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
 	if (ref)
 		atomic_inc(&ref->usage);
 	spin_unlock(&tree->lock);
@@ -171,21 +145,17 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 	struct rb_node *rb;
 	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-	struct btrfs_transaction *trans = root->fs_info->running_transaction;
 
 	spin_lock(&tree->lock);
-	rb = tree_insert(&tree->root, &ref->key, &ref->rb_node);
+	rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
 	if (rb) {
 		ret = -EEXIST;
 	} else {
 		spin_lock(&root->fs_info->ref_cache_lock);
 		root->fs_info->total_ref_cache_size += size;
-		if (trans && tree->generation == trans->transid)
-			root->fs_info->running_ref_cache_size += size;
 		spin_unlock(&root->fs_info->ref_cache_lock);
-
-		tree->last = ref;
 		atomic_inc(&ref->usage);
+		list_add_tail(&ref->list, &tree->list);
 	}
 	spin_unlock(&tree->lock);
 	return ret;
@@ -195,28 +165,17 @@ int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-	struct btrfs_transaction *trans = root->fs_info->running_transaction;
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
 	
 	spin_lock(&root->fs_info->ref_cache_lock);
 	root->fs_info->total_ref_cache_size -= size;
-	if (trans && tree->generation == trans->transid)
-		root->fs_info->running_ref_cache_size -= size;
 	spin_unlock(&root->fs_info->ref_cache_lock);
 
-	if (tree->last == ref) {
-		struct rb_node *next = rb_next(&ref->rb_node);
-		if (next) {
-			tree->last = rb_entry(next, struct btrfs_leaf_ref,
-					      rb_node);
-		} else
-			tree->last = NULL;
-	}
-
 	rb_erase(&ref->rb_node, &tree->root);
 	ref->in_tree = 0;
+	list_del_init(&ref->list);
 
 	spin_unlock(&tree->lock);
 
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 79ecc47110f..823c049f72f 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -15,6 +15,8 @@
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
+#ifndef __REFCACHE__
+#define __REFCACHE__
 
 struct btrfs_extent_info {
 	u64 bytenr;
@@ -25,7 +27,6 @@ struct btrfs_extent_info {
 
 struct btrfs_leaf_ref {
 	struct rb_node rb_node;
-	struct btrfs_key key;
 	int in_tree;
 	atomic_t usage;
 
@@ -33,14 +34,9 @@ struct btrfs_leaf_ref {
 	u64 owner;
 	u64 generation;
 	int nritems;
-	struct btrfs_extent_info extents[];
-};
 
-struct btrfs_leaf_ref_tree {
-	struct rb_root root;
-	struct btrfs_leaf_ref *last;
-	u64 generation;
-	spinlock_t lock;
+	struct list_head list;
+	struct btrfs_extent_info extents[];
 };
 
 static inline size_t btrfs_leaf_ref_size(int nr_extents)
@@ -53,7 +49,7 @@ static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
 {
 	tree->root.rb_node = NULL;
 	tree->last = NULL;
-	tree->generation = 0;
+	INIT_LIST_HEAD(&tree->list);
 	spin_lock_init(&tree->lock);
 }
 
@@ -66,7 +62,9 @@ void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
 struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
 void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
-					     struct btrfs_key *key);
+					     u64 bytenr);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 int btrfs_remove_leaf_refs(struct btrfs_root *root);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
+
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 543e5ee4033..fcef3cae0c9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,7 +36,6 @@ struct dirty_root {
 	struct list_head list;
 	struct btrfs_root *root;
 	struct btrfs_root *latest_root;
-	struct btrfs_leaf_ref_tree ref_tree;
 };
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -108,13 +107,13 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 
 			dirty->latest_root = root;
 			INIT_LIST_HEAD(&dirty->list);
-			btrfs_leaf_ref_tree_init(&dirty->ref_tree);
-			dirty->ref_tree.generation = running_trans_id;
 
 			root->commit_root = btrfs_root_node(root);
-			root->ref_tree = &dirty->ref_tree;
+			root->dirty_root = dirty;
 
 			memcpy(dirty->root, root, sizeof(*root));
+			dirty->root->ref_tree = &root->ref_tree_struct;
+
 			spin_lock_init(&dirty->root->node_lock);
 			mutex_init(&dirty->root->objectid_mutex);
 			dirty->root->node = root->commit_root;
@@ -217,12 +216,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
 
-	if (0 && cur_trans->in_commit && throttle) {
+	if (throttle && atomic_read(&root->fs_info->throttles)) {
 		DEFINE_WAIT(wait);
 		mutex_unlock(&root->fs_info->trans_mutex);
 		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
 				TASK_UNINTERRUPTIBLE);
-		schedule();
+		if (atomic_read(&root->fs_info->throttles))
+			schedule();
 		finish_wait(&root->fs_info->transaction_throttle, &wait);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
@@ -333,6 +333,8 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		update_cowonly_root(trans, root);
+		if (root->fs_info->closing)
+			btrfs_remove_leaf_refs(root);
 	}
 	return 0;
 }
@@ -346,10 +348,8 @@ int btrfs_add_dead_root(struct btrfs_root *root,
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
-	btrfs_leaf_ref_tree_init(&dirty->ref_tree);
 	dirty->root = root;
 	dirty->latest_root = latest;
-	root->ref_tree = NULL;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -379,18 +379,14 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				     BTRFS_ROOT_TRANS_TAG);
 
 			BUG_ON(!root->ref_tree);
-			dirty = container_of(root->ref_tree, struct dirty_root,
-					     ref_tree);
+			dirty = root->dirty_root;
 
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
 
-				BUG_ON(!btrfs_leaf_ref_tree_empty(
-							root->ref_tree));
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
-				root->ref_tree = NULL;
 				
 				kfree(dirty->root);
 				kfree(dirty);
@@ -410,7 +406,6 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
 			root->commit_root = NULL;
-			root->ref_tree = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -485,7 +480,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	while(!list_empty(list)) {
 		struct btrfs_root *root;
 
-		dirty = list_entry(list->next, struct dirty_root, list);
+		dirty = list_entry(list->prev, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
 		num_bytes = btrfs_root_used(&dirty->root->root_item);
@@ -507,7 +502,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			if (err)
 				ret = err;
 			nr = trans->blocks_used;
-			ret = btrfs_end_transaction_throttle(trans, tree_root);
+			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
 
 			mutex_unlock(&root->fs_info->drop_mutex);
@@ -517,6 +512,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		}
 		BUG_ON(ret);
 		atomic_dec(&root->fs_info->throttles);
+		wake_up(&root->fs_info->transaction_throttle);
 
 		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
@@ -539,8 +535,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		btrfs_remove_leaf_refs(dirty->root);
-
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -725,10 +719,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
 
-	spin_lock(&root->fs_info->ref_cache_lock);
-	root->fs_info->running_ref_cache_size = 0;
-	spin_unlock(&root->fs_info->ref_cache_lock);
-
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
-- 
cgit v1.2.3


From 1a3f5d0400d786aec41ede15fec5710ad1a1d18b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jul 2008 16:11:35 -0400
Subject: Btrfs: Fix version.sh when used outside of an hg repo

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/version.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
index fd9b53d3986..4d06497d2ec 100644
--- a/fs/btrfs/version.sh
+++ b/fs/btrfs/version.sh
@@ -6,10 +6,10 @@
 # Copyright 2008, Oracle
 # Released under the GNU GPLv2
  
-v="Btrfs v0.15"
+v="v0.15"
 
 which hg > /dev/null
-if [ $? == 0 ]; then
+if [ $? == 0 ] && [ -d .hg ]; then
 	last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
 	 
 	# now check if the repo has commits since then...
-- 
cgit v1.2.3


From ab78c84de1ce4db1b2a2cef361625ad80abbab3f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Jul 2008 16:15:18 -0400
Subject: Btrfs: Throttle operations if the reference cache gets too large

A large reference cache is directly related to a lot of work pending
for the cleaner thread.  This throttles back new operations based on
the size of the reference cache so the cleaner thread will be able to keep
up.

Overall, this actually makes the FS faster because the cleaner thread will
be more likely to find things in cache.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  7 ++++--
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       | 10 ++++-----
 fs/btrfs/transaction.c | 59 +++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/transaction.h |  1 +
 7 files changed, 58 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4eca0aa1ce7..5517dfc6f71 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -580,6 +580,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	atomic_t throttles;
+	atomic_t throttle_gen;
 
 	u64 total_pinned;
 	struct list_head dirty_cowonly_roots;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eccdf13a95a..27ffa9b7ddc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1171,8 +1171,10 @@ static int transaction_kthread(void *arg)
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		printk("btrfs: total reference cache size %Lu\n",
-			root->fs_info->total_ref_cache_size);
+		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
+			printk("btrfs: total reference cache size %Lu\n",
+				root->fs_info->total_ref_cache_size);
+		}
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		cur = root->fs_info->running_transaction;
@@ -1256,6 +1258,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->throttles, 0);
+	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0e294cfaa60..6290cf41d64 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2650,6 +2650,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 	}
 	while(1) {
+		atomic_inc(&root->fs_info->throttle_gen);
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3efec25e34b..ded5281f846 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -974,6 +974,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
 		if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
+		btrfs_throttle(root);
 		cond_resched();
 	}
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4f977ea5497..7c87f863d6f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2482,7 +2482,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -2535,7 +2535,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		drop_inode = 1;
 
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -2609,7 +2609,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 out_fail:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 
 out_unlock:
 	if (drop_on_err)
@@ -3434,7 +3434,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		goto out_fail;
 
 out_fail:
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 out_unlock:
 	return ret;
 }
@@ -3548,7 +3548,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 
 out_unlock:
 	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction_throttle(trans, root);
 out_fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fcef3cae0c9..b8be6703189 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -202,35 +202,64 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+void btrfs_throttle(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	/* function scope: the retry count must survive "goto harder" */
+	int harder_count = 0;
+
+harder:
+	if (atomic_read(&info->throttles)) {
+		DEFINE_WAIT(wait);
+		int thr = atomic_read(&info->throttle_gen);
+
+		do {
+			prepare_to_wait(&info->transaction_throttle,
+					&wait, TASK_UNINTERRUPTIBLE);
+			if (!atomic_read(&info->throttles)) {
+				finish_wait(&info->transaction_throttle, &wait);
+				break;
+			}
+			schedule();
+			finish_wait(&info->transaction_throttle, &wait);
+		} while (thr == atomic_read(&info->throttle_gen));
+
+		if (harder_count < 5 &&
+		    info->total_ref_cache_size > 5 * 1024 * 1024) {
+			harder_count++;
+			goto harder;
+		}
+
+		if (harder_count < 10 &&
+		    info->total_ref_cache_size > 10 * 1024 * 1024) {
+			harder_count++;
+			goto harder;
+		}
+	}
+}
+
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, int throttle)
 {
 	struct btrfs_transaction *cur_trans;
+	struct btrfs_fs_info *info = root->fs_info;
 
-	mutex_lock(&root->fs_info->trans_mutex);
-	cur_trans = root->fs_info->running_transaction;
+	mutex_lock(&info->trans_mutex);
+	cur_trans = info->running_transaction;
 	WARN_ON(cur_trans != trans->transaction);
 	WARN_ON(cur_trans->num_writers < 1);
 	cur_trans->num_writers--;
 
 	if (waitqueue_active(&cur_trans->writer_wait))
 		wake_up(&cur_trans->writer_wait);
-
-	if (throttle && atomic_read(&root->fs_info->throttles)) {
-		DEFINE_WAIT(wait);
-		mutex_unlock(&root->fs_info->trans_mutex);
-		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
-				TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&root->fs_info->throttles))
-			schedule();
-		finish_wait(&root->fs_info->transaction_throttle, &wait);
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-
 	put_transaction(cur_trans);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	mutex_unlock(&info->trans_mutex);
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+
+	if (throttle)
+		btrfs_throttle(root);
+
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 11fbdeceb26..df2ca2aad1c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -91,4 +91,5 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
+void btrfs_throttle(struct btrfs_root *root);
 #endif
-- 
cgit v1.2.3


From 3bf10418675cb424724b5cb9d7725b234defe1fd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 30 Jul 2008 09:24:37 -0400
Subject: Btrfs: async-thread: fix possible memory leak

When kthread_run() returns failure, this worker hasn't been
added to the list, so btrfs_stop_workers() won't free it.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8d4cc4679d5..5fe6a0d532e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -187,6 +187,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		worker->task = kthread_run(worker_loop, worker, "btrfs");
 		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			ret = PTR_ERR(worker->task);
+			kfree(worker);
 			goto fail;
 		}
-- 
cgit v1.2.3


From f321e4910398cf7922265d269fb17fd26f312571 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 30 Jul 2008 09:26:11 -0400
Subject: Btrfs: Update and fix mount -o nodatacow

To check whether a given file extent is referenced by multiple snapshots, the
checker walks down the fs tree through the dead root and checks all tree blocks
in the path.

We can easily detect whether a given tree block is directly referenced by
another snapshot. We can also detect any indirect reference from another
snapshot by checking the reference's generation. The checker can always detect
multiple references, but can't reliably detect the single-reference case. So
btrfs may do file data COW even if there is only one reference.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   9 +--
 fs/btrfs/extent-tree.c | 202 ++++++++++++++++++++++++++++++-------------------
 fs/btrfs/inode.c       |   6 +-
 fs/btrfs/transaction.c |  16 ++--
 fs/btrfs/transaction.h |   5 ++
 5 files changed, 142 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5517dfc6f71..83422088c62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -617,7 +617,7 @@ struct btrfs_leaf_ref_tree {
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
-struct dirty_root;
+struct btrfs_dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -627,7 +627,7 @@ struct btrfs_root {
 	struct extent_buffer *commit_root;
 	struct btrfs_leaf_ref_tree *ref_tree;
 	struct btrfs_leaf_ref_tree ref_tree_struct;
-	struct dirty_root *dirty_root;
+	struct btrfs_dirty_root *dirty_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -1399,9 +1399,8 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
-u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
-				  struct btrfs_path *count_path,
-				  u64 expected_owner, u64 first_extent);
+int btrfs_cross_ref_exists(struct btrfs_root *root,
+			   struct btrfs_key *key, u64 bytenr);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6290cf41d64..fe1ddbd2bfd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -802,70 +802,57 @@ out:
 	return 0;
 }
 
-u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
-				  struct btrfs_path *count_path,
-				  u64 expected_owner,
-				  u64 first_extent)
+
+static int get_reference_status(struct btrfs_root *root, u64 bytenr,
+				u64 parent_gen, u64 ref_objectid,
+			        u64 *min_generation, u32 *ref_count)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_path *path;
-	u64 bytenr;
-	u64 found_objectid;
-	u64 found_owner;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *ref_item;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
 	u64 root_objectid = root->root_key.objectid;
-	u32 total_count = 0;
-	u32 extent_refs;
-	u32 cur_count;
+	u64 ref_generation;
 	u32 nritems;
 	int ret;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
-	struct btrfs_extent_ref *ref_item;
-	int level = -1;
 
-	/* FIXME, needs locking */
-	BUG();
-
-	mutex_lock(&root->fs_info->alloc_mutex);
-	path = btrfs_alloc_path();
-again:
-	if (level == -1)
-		bytenr = first_extent;
-	else
-		bytenr = count_path->nodes[level]->start;
-
-	cur_count = 0;
 	key.objectid = bytenr;
 	key.offset = 0;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
 
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	path = btrfs_alloc_path();
+	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
 
-	l = path->nodes[0];
-	btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 	if (found_key.objectid != bytenr ||
 	    found_key.type != BTRFS_EXTENT_ITEM_KEY) {
+		ret = 1;
 		goto out;
 	}
 
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-	extent_refs = btrfs_extent_refs(l, item);
+	*ref_count = 0;
+	*min_generation = (u64)-1;
+
 	while (1) {
-		l = path->nodes[0];
-		nritems = btrfs_header_nritems(l);
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
 			if (ret == 0)
 				continue;
 			break;
 		}
-		btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid != bytenr)
 			break;
 
@@ -874,57 +861,120 @@ again:
 			continue;
 		}
 
-		cur_count++;
-		ref_item = btrfs_item_ptr(l, path->slots[0],
+		ref_item = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_extent_ref);
-		found_objectid = btrfs_ref_root(l, ref_item);
-
-		if (found_objectid != root_objectid) {
-			total_count = 2;
-			goto out;
-		}
-		if (level == -1) {
-			found_owner = btrfs_ref_objectid(l, ref_item);
-			if (found_owner != expected_owner) {
-				total_count = 2;
-				goto out;
-			}
-			/*
-			 * nasty.  we don't count a reference held by
-			 * the running transaction.  This allows nodatacow
-			 * to avoid cow most of the time
-			 */
-			if (found_owner >= BTRFS_FIRST_FREE_OBJECTID &&
-			    btrfs_ref_generation(l, ref_item) ==
-			    root->fs_info->generation) {
-				extent_refs--;
-			}
+		ref_generation = btrfs_ref_generation(leaf, ref_item);
+		/*
+		 * For (parent_gen > 0 && parent_gen > ref_gen):
+		 *
+		 * we reach here through the oldest root, therefore 
+		 * all other reference from same snapshot should have 
+		 * a larger generation.
+		 */
+		if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
+		    (parent_gen > 0 && parent_gen > ref_generation) ||
+		    (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+		     ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
+			if (ref_count)
+				*ref_count = 2;
+			break;
 		}
-		total_count = 1;
+
+		*ref_count = 1;
+		if (*min_generation > ref_generation)
+			*min_generation = ref_generation;
+
 		path->slots[0]++;
 	}
-	/*
-	 * if there is more than one reference against a data extent,
-	 * we have to assume the other ref is another snapshot
-	 */
-	if (level == -1 && extent_refs > 1) {
-		total_count = 2;
+	ret = 0;
+out:
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_cross_ref_exists(struct btrfs_root *root,
+			   struct btrfs_key *key, u64 bytenr)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *old_root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	struct btrfs_file_extent_item *item;
+	u64 ref_generation;
+	u64 min_generation;
+	u64 extent_start;
+	u32 ref_count;
+	int level;
+	int ret;
+
+	BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
+	ret = get_reference_status(root, bytenr, 0, key->objectid,
+				   &min_generation, &ref_count);
+	if (ret)
+		return ret;
+
+	if (ref_count != 1)
+		return 1;
+
+	trans = btrfs_start_transaction(root, 0);
+	old_root = root->dirty_root->root;
+	ref_generation = old_root->root_key.offset;
+
+	/* all references are created in running transaction */
+	if (min_generation > ref_generation) {
+		ret = 0;
 		goto out;
 	}
-	if (cur_count == 0) {
-		total_count = 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
 		goto out;
 	}
-	if (level >= 0 && root->node == count_path->nodes[level])
+
+	path->skip_locking = 1;
+	/* if no item found, the extent is referenced by other snapshot */
+	ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
+	if (ret)
 		goto out;
-	level++;
-	btrfs_release_path(root, path);
-	goto again;
 
+	eb = path->nodes[0];
+	item = btrfs_item_ptr(eb, path->slots[0],
+			      struct btrfs_file_extent_item);
+	if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
+	    btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
+		ret = 1;
+		goto out;
+	}
+
+	for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
+		if (level >= 0) {
+			eb = path->nodes[level];
+			if (!eb)
+				continue;
+			extent_start = eb->start;
+		} else 
+			extent_start = bytenr;
+
+		ret = get_reference_status(root, extent_start, ref_generation,
+					   0, &min_generation, &ref_count);
+		if (ret)
+			goto out;
+
+		if (ref_count != 1) {
+			ret = 1;
+			goto out;
+		}
+		if (level >= 0)
+			ref_generation = btrfs_header_generation(eb);
+	}
+	ret = 0;
 out:
-	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
-	return total_count;
+	if (path)
+		btrfs_free_path(path);
+	btrfs_end_transaction(trans, root);
+	return ret;
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7c87f863d6f..3aa82cec6bf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -249,11 +249,8 @@ again:
 		if (bytenr == 0)
 			goto not_found;
 
-		if (btrfs_count_snapshots_in_path(root, path, inode->i_ino,
-						  bytenr) != 1) {
+		if (btrfs_cross_ref_exists(root, &found_key, bytenr))
 			goto not_found;
-		}
-
 		/*
 		 * we may be called by the resizer, make sure we're inside
 		 * the limits of the FS
@@ -277,6 +274,7 @@ loop:
 	goto again;
 
 not_found:
+	btrfs_release_path(root, path);
 	cow_file_range(inode, start, end);
 	start = end + 1;
 	goto loop;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b8be6703189..216f3157162 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,12 +32,6 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
-struct dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(transaction->use_count == 0);
@@ -91,7 +85,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 
 static noinline int record_root_in_trans(struct btrfs_root *root)
 {
-	struct dirty_root *dirty;
+	struct btrfs_dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
 	if (root->ref_cows && root->last_trans < running_trans_id) {
 		WARN_ON(root == root->fs_info->extent_root);
@@ -372,7 +366,7 @@ int btrfs_add_dead_root(struct btrfs_root *root,
 			struct btrfs_root *latest,
 			struct list_head *dead_list)
 {
-	struct dirty_root *dirty;
+	struct btrfs_dirty_root *dirty;
 
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
@@ -387,7 +381,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				    struct radix_tree_root *radix,
 				    struct list_head *list)
 {
-	struct dirty_root *dirty;
+	struct btrfs_dirty_root *dirty;
 	struct btrfs_root *gang[8];
 	struct btrfs_root *root;
 	int i;
@@ -498,7 +492,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 				     struct list_head *list)
 {
-	struct dirty_root *dirty;
+	struct btrfs_dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
 	u64 num_bytes;
@@ -509,7 +503,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	while(!list_empty(list)) {
 		struct btrfs_root *root;
 
-		dirty = list_entry(list->prev, struct dirty_root, list);
+		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
 		list_del_init(&dirty->list);
 
 		num_bytes = btrfs_root_used(&dirty->root->root_item);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index df2ca2aad1c..f5adb23151f 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -52,6 +52,11 @@ struct btrfs_pending_snapshot {
 	struct list_head list;
 };
 
+struct btrfs_dirty_root {
+	struct list_head list;
+	struct btrfs_root *root;
+	struct btrfs_root *latest_root;
+};
 
 static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 					       struct inode *inode)
-- 
cgit v1.2.3


From 33958dc6d38fb4ca7e62273855fcb2db7e616263 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 30 Jul 2008 10:29:12 -0400
Subject: Btrfs: Fix verify_parent_transid

It was incorrectly clearing the up to date flag on the buffer even
when the buffer properly verified.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 2 +-
 fs/btrfs/extent_io.c | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 27ffa9b7ddc..ec1ba8ddb35 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -235,8 +235,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 	       (unsigned long long)parent_transid,
 	       (unsigned long long)btrfs_header_generation(eb));
 	ret = 1;
-out:
 	clear_extent_buffer_uptodate(io_tree, eb);
+out:
 	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
 		      GFP_NOFS);
 	return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e3547a992d5..964ec1622d6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2647,6 +2647,8 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 		return eb->first_page;
 	i += eb->start >> PAGE_CACHE_SHIFT;
 	mapping = eb->first_page->mapping;
+	if (!mapping)
+		return NULL;
 	read_lock_irq(&mapping->tree_lock);
 	p = radix_tree_lookup(&mapping->page_tree, i);
 	read_unlock_irq(&mapping->tree_lock);
@@ -2908,7 +2910,8 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 			      GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		ClearPageUptodate(page);
+		if (page)
+			ClearPageUptodate(page);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From bcc63abbf3e9bf948a1b0129b3e6120ec7d7f698 Mon Sep 17 00:00:00 2001
From: Yan <zheng.yan@oracle.com>
Date: Wed, 30 Jul 2008 16:29:20 -0400
Subject: Btrfs: implement memory reclaim for leaf reference cache

The memory reclaim issue happens when a snapshot exists. In that
case, some cache entries may not be used while an old snapshot is being
dropped, so they will remain in the cache until umount.

The patch adds a field to struct btrfs_leaf_ref to record its creation time.
In addition, the patch links all dead roots of a given snapshot together in
order of creation time. After an old snapshot has been completely dropped, we
check the dead root list and remove all cache entries created before the oldest
dead root in the list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/dir-item.c    |  1 -
 fs/btrfs/disk-io.c     |  5 +++--
 fs/btrfs/extent-tree.c | 18 +++++++++---------
 fs/btrfs/extent_io.c   |  1 -
 fs/btrfs/file-item.c   |  1 -
 fs/btrfs/file.c        |  1 -
 fs/btrfs/inode.c       | 22 +++++++++++-----------
 fs/btrfs/locking.c     |  1 -
 fs/btrfs/print-tree.c  |  1 -
 fs/btrfs/ref-cache.c   | 48 +++++++++++++++++++++++++-----------------------
 fs/btrfs/ref-cache.h   | 11 ++++++-----
 fs/btrfs/transaction.c | 40 ++++++++++++++++++++++++++++++----------
 fs/btrfs/volumes.c     |  1 -
 15 files changed, 86 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 245eb00435d..c4792062dd5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3275,4 +3275,3 @@ int btrfs_previous_item(struct btrfs_root *root,
 	}
 	return 1;
 }
-
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 83422088c62..be16cd49ef6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -666,7 +666,8 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
 
-	spinlock_t orphan_lock;
+	spinlock_t list_lock;
+	struct list_head dead_list;
 	struct list_head orphan_list;
 };
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index eb4dd3d75cf..125094617fe 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -340,4 +340,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec1ba8ddb35..e826730d750 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -735,8 +735,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
+	INIT_LIST_HEAD(&root->dead_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->orphan_lock);
+	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
@@ -1717,7 +1718,7 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at umount reference cache size %Lu\n",
 			fs_info->total_ref_cache_size);
 	}
-	
+
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe1ddbd2bfd..37ca8df30c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -867,8 +867,8 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 		/*
 		 * For (parent_gen > 0 && parent_gen > ref_gen):
 		 *
-		 * we reach here through the oldest root, therefore 
-		 * all other reference from same snapshot should have 
+		 * we reach here through the oldest root, therefore
+		 * all other reference from same snapshot should have
 		 * a larger generation.
 		 */
 		if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
@@ -954,7 +954,7 @@ int btrfs_cross_ref_exists(struct btrfs_root *root,
 			if (!eb)
 				continue;
 			extent_start = eb->start;
-		} else 
+		} else
 			extent_start = bytenr;
 
 		ret = get_reference_status(root, extent_start, ref_generation,
@@ -1048,7 +1048,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
-		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
 		if (!ref) {
 			WARN_ON(1);
 			goto out;
@@ -1059,7 +1059,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ref->generation = btrfs_header_generation(buf);
 		ref->nritems = nr_file_extents;
 		info = ref->extents;
-		
+
 		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
@@ -1085,7 +1085,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		BUG_ON(!root->ref_tree);
 		ret = btrfs_add_leaf_ref(root, ref);
 		WARN_ON(ret);
-		btrfs_free_leaf_ref(ref);
+		btrfs_free_leaf_ref(root, ref);
 	}
 out:
 	return 0;
@@ -2316,7 +2316,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 }
 
 static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-				  	   struct btrfs_root *root,
+					   struct btrfs_root *root,
 					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
@@ -2367,7 +2367,7 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 }
 
 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  	 struct btrfs_root *root,
+					 struct btrfs_root *root,
 					 struct btrfs_leaf_ref *ref)
 {
 	int i;
@@ -2521,7 +2521,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				ret = drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(ref);
+				btrfs_free_leaf_ref(root, ref);
 				*level = 0;
 				break;
 			}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 964ec1622d6..5368e3b6eb9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3497,4 +3497,3 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(try_release_extent_buffer);
-
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index afe42d00b5a..2311061f070 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -422,4 +422,3 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	return ret;
 }
-
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ded5281f846..412ab4a2638 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1095,4 +1095,3 @@ struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
-
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3aa82cec6bf..7af8be076ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -835,17 +835,17 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	/* already on the orphan list, we're good */
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	/*
 	 * insert an orphan item to track this unlinked/truncated file
@@ -864,20 +864,20 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_del_init(&BTRFS_I(inode)->i_orphan);
 	if (!trans) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
 
@@ -973,9 +973,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->orphan_lock);
+		spin_lock(&root->list_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 
 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -3269,13 +3269,13 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_lock(&BTRFS_I(inode)->root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d617c29787f..d43e14c7471 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,4 +56,3 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 {
 	return mutex_is_locked(&eb->mutex);
 }
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 14d86372030..f1374d597a1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -198,4 +198,3 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		free_extent_buffer(next);
 	}
 }
-
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index ec9587784a3..272b9890c98 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,12 +21,18 @@
 #include "ref-cache.h"
 #include "transaction.h"
 
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents)
 {
 	struct btrfs_leaf_ref *ref;
+	size_t size = btrfs_leaf_ref_size(nr_extents);
 
-	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	ref = kmalloc(size, GFP_NOFS);
 	if (ref) {
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
 		memset(ref, 0, sizeof(*ref));
 		atomic_set(&ref->usage, 1);
 		INIT_LIST_HEAD(&ref->list);
@@ -34,14 +40,20 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
 	return ref;
 }
 
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	if (!ref)
 		return;
 	WARN_ON(atomic_read(&ref->usage) == 0);
 	if (atomic_dec_and_test(&ref->usage)) {
+		size_t size = btrfs_leaf_ref_size(ref->nritems);
+
 		BUG_ON(ref->in_tree);
 		kfree(ref);
+
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size -= size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
 	}
 }
 
@@ -64,7 +76,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 		else
 			return parent;
 	}
-	
+
 	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
 	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
@@ -91,9 +103,8 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 	return NULL;
 }
 
-int btrfs_remove_leaf_refs(struct btrfs_root *root)
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen)
 {
-	struct rb_node *rb;
 	struct btrfs_leaf_ref *ref = NULL;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
@@ -101,17 +112,18 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root)
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!btrfs_leaf_ref_tree_empty(tree)) {
-		rb = rb_first(&tree->root);
-		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+	while(!list_empty(&tree->list)) {
+		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+		BUG_ON(!ref->in_tree);
+		if (ref->root_gen > max_root_gen)
+			break;
+
 		rb_erase(&ref->rb_node, &tree->root);
 		ref->in_tree = 0;
 		list_del_init(&ref->list);
 
 		spin_unlock(&tree->lock);
-
-		btrfs_free_leaf_ref(ref);
-
+		btrfs_free_leaf_ref(root, ref);
 		cond_resched();
 		spin_lock(&tree->lock);
 	}
@@ -143,7 +155,6 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	int ret = 0;
 	struct rb_node *rb;
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	spin_lock(&tree->lock);
@@ -151,9 +162,6 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 	if (rb) {
 		ret = -EEXIST;
 	} else {
-		spin_lock(&root->fs_info->ref_cache_lock);
-		root->fs_info->total_ref_cache_size += size;
-		spin_unlock(&root->fs_info->ref_cache_lock);
 		atomic_inc(&ref->usage);
 		list_add_tail(&ref->list, &tree->list);
 	}
@@ -163,15 +171,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
-	
-	spin_lock(&root->fs_info->ref_cache_lock);
-	root->fs_info->total_ref_cache_size -= size;
-	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	rb_erase(&ref->rb_node, &tree->root);
 	ref->in_tree = 0;
@@ -179,7 +182,6 @@ int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 
 	spin_unlock(&tree->lock);
 
-	btrfs_free_leaf_ref(ref);
+	btrfs_free_leaf_ref(root, ref);
 	return 0;
 }
-
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 823c049f72f..c361b321c0c 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -30,6 +30,7 @@ struct btrfs_leaf_ref {
 	int in_tree;
 	atomic_t usage;
 
+	u64 root_gen;
 	u64 bytenr;
 	u64 owner;
 	u64 generation;
@@ -41,14 +42,13 @@ struct btrfs_leaf_ref {
 
 static inline size_t btrfs_leaf_ref_size(int nr_extents)
 {
-	return sizeof(struct btrfs_leaf_ref) + 
+	return sizeof(struct btrfs_leaf_ref) +
 	       sizeof(struct btrfs_extent_info) * nr_extents;
 }
 
 static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
 {
 	tree->root.rb_node = NULL;
-	tree->last = NULL;
 	INIT_LIST_HEAD(&tree->list);
 	spin_lock_init(&tree->lock);
 }
@@ -59,12 +59,13 @@ static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
 }
 
 void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-int btrfs_remove_leaf_refs(struct btrfs_root *root);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 216f3157162..52c5524896a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -98,20 +98,24 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 			BUG_ON(!dirty);
 			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
 			BUG_ON(!dirty->root);
-
 			dirty->latest_root = root;
 			INIT_LIST_HEAD(&dirty->list);
 
 			root->commit_root = btrfs_root_node(root);
-			root->dirty_root = dirty;
 
 			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->ref_tree = &root->ref_tree_struct;
-
 			spin_lock_init(&dirty->root->node_lock);
+			spin_lock_init(&dirty->root->list_lock);
 			mutex_init(&dirty->root->objectid_mutex);
+			INIT_LIST_HEAD(&dirty->root->dead_list);
 			dirty->root->node = root->commit_root;
 			dirty->root->commit_root = NULL;
+
+			spin_lock(&root->list_lock);
+			list_add(&dirty->root->dead_list, &root->dead_list);
+			spin_unlock(&root->list_lock);
+
+			root->dirty_root = dirty;
 		} else {
 			WARN_ON(1);
 		}
@@ -356,8 +360,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		update_cowonly_root(trans, root);
-		if (root->fs_info->closing)
-			btrfs_remove_leaf_refs(root);
 	}
 	return 0;
 }
@@ -410,7 +412,11 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
-				
+
+				spin_lock(&root->list_lock);
+				list_del_init(&dirty->root->dead_list);
+				spin_unlock(&root->list_lock);
+
 				kfree(dirty->root);
 				kfree(dirty);
 
@@ -497,6 +503,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	unsigned long nr;
 	u64 num_bytes;
 	u64 bytes_used;
+	u64 max_useless;
 	int ret = 0;
 	int err;
 
@@ -554,10 +561,25 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		}
 		mutex_unlock(&root->fs_info->drop_mutex);
 
+		spin_lock(&root->list_lock);
+		list_del_init(&dirty->root->dead_list);
+		if (!list_empty(&root->dead_list)) {
+			struct btrfs_root *oldest;
+			oldest = list_entry(root->dead_list.prev,
+					    struct btrfs_root, dead_list);
+			max_useless = oldest->root_key.offset - 1;
+		} else {
+			max_useless = root->root_key.offset - 1;
+		}
+		spin_unlock(&root->list_lock);
+
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		ret = btrfs_remove_leaf_refs(root, max_useless);
+		BUG_ON(ret);
+
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -785,10 +807,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
+	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 	if (root->fs_info->closing)
 		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
-	else
-		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -814,4 +835,3 @@ again:
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e6ee7a6f73..18db4cbe279 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2527,4 +2527,3 @@ again:
 error:
 	return ret;
 }
-
-- 
cgit v1.2.3


From 0ee0fda06b943d7ef65f3cec50ab9d427a2baf97 Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Wed, 30 Jul 2008 16:54:26 -0400
Subject: Btrfs: Add compatibility for kernels >= 2.6.27-rc1

Add a couple of #if's to follow API changes.

Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 23 +++++++++++++++++++++++
 fs/btrfs/file.c      |  4 ++++
 fs/btrfs/inode.c     | 12 ++++++++++--
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5368e3b6eb9..1cf4baba434 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2649,9 +2649,24 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	mapping = eb->first_page->mapping;
 	if (!mapping)
 		return NULL;
+
+	/*
+	 * extent_buffer_page is only called after pinning the page
+	 * by increasing the reference count.  So we know the page must
+	 * be in the radix tree.
+	 */
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+	rcu_read_lock();
+#else
 	read_lock_irq(&mapping->tree_lock);
+#endif
 	p = radix_tree_lookup(&mapping->page_tree, i);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+	rcu_read_unlock();
+#else
 	read_unlock_irq(&mapping->tree_lock);
+#endif
 	return p;
 }
 
@@ -2843,13 +2858,21 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			}
 		}
 		clear_page_dirty_for_io(page);
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+		spin_lock_irq(&page->mapping->tree_lock);
+#else
 		read_lock_irq(&page->mapping->tree_lock);
+#endif
 		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&page->mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+		spin_unlock_irq(&page->mapping->tree_lock);
+#else
 		read_unlock_irq(&page->mapping->tree_lock);
+#endif
 		unlock_page(page);
 	}
 	return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 412ab4a2638..388ac397c2c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -886,7 +886,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 #ifdef REMOVE_SUID_PATH
 	err = remove_suid(&file->f_path);
 #else
+# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+	err = file_remove_suid(file);
+# else
 	err = remove_suid(fdentry(file));
+# endif
 #endif
 	if (err)
 		goto out_nolock;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7af8be076ee..c4afa9d78da 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3293,7 +3293,9 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+static void init_once(void *foo)
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void init_once(struct kmem_cache * cachep, void *foo)
 #else
 static void init_once(void * foo, struct kmem_cache * cachep,
@@ -3321,7 +3323,9 @@ void btrfs_destroy_cachep(void)
 
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 				       unsigned long extra_flags,
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+				       void (*ctor)(void *)
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 				       void (*ctor)(struct kmem_cache *, void *)
 #else
 				       void (*ctor)(void *, struct kmem_cache *,
@@ -3561,8 +3565,12 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+static int btrfs_permission(struct inode *inode, int mask)
+#else
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
+#endif
 {
 	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
-- 
cgit v1.2.3


From 47ac14fa0f5306c6b97203f4f086bf1fa21dfddb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 09:46:18 -0400
Subject: Btrfs: Add missing hunk from Yan Zheng's cache reclaim patch

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 37ca8df30c3..4765248000f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1054,6 +1054,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			goto out;
 		}
 
+		ref->root_gen = root->root_key.offset;
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
-- 
cgit v1.2.3


From 37d1aeee3990385e9bb436c50c2f7e120a668df6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 10:48:37 -0400
Subject: Btrfs: Throttle tuning

This avoids waiting for transactions with pages locked by breaking out
the code to wait for the current transaction to close into a function
called by btrfs_throttle.

It also lowers the limits for where we start throttling.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  3 +++
 fs/btrfs/file.c        |  2 +-
 fs/btrfs/transaction.c | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4765248000f..b003b4364dd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2526,6 +2526,9 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
+			if (printk_ratelimit())
+				printk("leaf ref miss for bytenr %llu\n",
+				       (unsigned long long)bytenr);
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 388ac397c2c..d3f2fe0b7c6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -253,7 +253,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
 		goto out_unlock;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 52c5524896a..66af5140c8c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -124,17 +124,12 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 	return 0;
 }
 
-struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-					     int num_blocks, int join)
+static void wait_current_trans(struct btrfs_root *root)
 {
-	struct btrfs_trans_handle *h =
-		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	struct btrfs_transaction *cur_trans;
-	int ret;
 
-	mutex_lock(&root->fs_info->trans_mutex);
 	cur_trans = root->fs_info->running_transaction;
-	if (cur_trans && cur_trans->blocked && !join) {
+	if (cur_trans && cur_trans->blocked) {
 		DEFINE_WAIT(wait);
 		cur_trans->use_count++;
 		while(1) {
@@ -154,6 +149,18 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 		}
 		put_transaction(cur_trans);
 	}
+}
+
+struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+					     int num_blocks, int join)
+{
+	struct btrfs_trans_handle *h =
+		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+	int ret;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (!join)
+		wait_current_trans(root);
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
@@ -200,7 +207,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
-void btrfs_throttle(struct btrfs_root *root)
+static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 
@@ -223,19 +230,28 @@ harder:
 		} while (thr == atomic_read(&info->throttle_gen));
 
 		if (harder_count < 5 &&
-		    info->total_ref_cache_size > 5 * 1024 * 1024) {
+		    info->total_ref_cache_size > 1 * 1024 * 1024) {
 			harder_count++;
 			goto harder;
 		}
 
 		if (harder_count < 10 &&
-		    info->total_ref_cache_size > 10 * 1024 * 1024) {
+		    info->total_ref_cache_size > 5 * 1024 * 1024) {
 			harder_count++;
 			goto harder;
 		}
 	}
 }
 
+void btrfs_throttle(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->trans_mutex);
+	wait_current_trans(root);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	throttle_on_drops(root);
+}
+
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, int throttle)
 {
@@ -256,7 +272,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (throttle)
-		btrfs_throttle(root);
+		throttle_on_drops(root);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 61b4944018449003ac5f9757f4d125dce519cf51 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 15:42:53 -0400
Subject: Btrfs: Fix streaming read performance with checksumming on

Large streaming reads make for large bios, which means each entry on the
async work queue lists represents a large amount of data.  IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.

The end result was that a streaming read would result in a single CPU
running at 100% instead of balancing the work off to other CPUs.

This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of a per-page basis.  The per-page
lookups resulted in many extra btree lookups on large streaming reads.
Doing the checksum lookup right before bio submit allows us to reuse
searches while processing adjacent offsets.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |  2 +-
 fs/btrfs/ctree.h        |  2 ++
 fs/btrfs/disk-io.c      | 15 ++++++++++
 fs/btrfs/file-item.c    | 77 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/inode.c        | 57 +++---------------------------------
 5 files changed, 99 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5fe6a0d532e..bc2980c433e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -160,7 +160,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 	INIT_LIST_HEAD(&workers->idle_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
-	workers->idle_thresh = 64;
+	workers->idle_thresh = 32;
 }
 
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be16cd49ef6..d788ab0dcd9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1613,6 +1613,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       struct btrfs_key *location, int mod);
 
 /* file-item.c */
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 objectid, u64 pos, u64 disk_offset,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e826730d750..d2d1cc87e8a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1357,10 +1357,25 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+
+	/* a higher idle thresh on the submit workers makes it much more
+	 * likely that bios will be sent down in a sane order to the
+	 * devices
+	 */
+	fs_info->submit_workers.idle_thresh = 64;
+
 	btrfs_init_workers(&fs_info->fixup_workers, 1);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers,
 			   fs_info->thread_pool_size);
+
+	/*
+	 * endios are largely parallel and should have a very
+	 * low idle thresh
+	 */
+	fs_info->endio_workers.idle_thresh = 4;
+	fs_info->endio_write_workers.idle_thresh = 4;
+
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 2311061f070..a5ff19b34b2 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,6 +134,83 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio)
+{
+	u32 sum;
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int bio_index = 0;
+	u64 offset;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u32 diff;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+	path = btrfs_alloc_path();
+	path->reada = 2;
+
+	WARN_ON(bio->bi_vcnt <= 0);
+
+	while(bio_index < bio->bi_vcnt) {
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		ret = btrfs_find_ordered_sum(inode, offset, &sum);
+		if (ret == 0)
+			goto found;
+
+		if (!item || offset < item_start_offset ||
+		    offset >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(root, path);
+			item = btrfs_lookup_csum(NULL, root, path,
+						 inode->i_ino, offset, 0);
+			if (IS_ERR(item)) {
+				ret = PTR_ERR(item);
+				if (ret == -ENOENT || ret == -EFBIG)
+					ret = 0;
+				sum = 0;
+				printk("no csum found for inode %lu start "
+				       "%llu\n", inode->i_ino,
+				       (unsigned long long)offset);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / BTRFS_CRC32_SIZE) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = offset - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * BTRFS_CRC32_SIZE;
+
+		read_extent_buffer(path->nodes[0], &sum,
+				   (unsigned long)item + diff,
+				   BTRFS_CRC32_SIZE);
+found:
+		set_state_private(io_tree, offset, sum);
+		bio_index++;
+		bvec++;
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c4afa9d78da..31d52c51acc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -374,6 +374,10 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	BUG_ON(ret);
 
 	if (!(rw & (1 << BIO_RW))) {
+		if (!btrfs_test_opt(root, NODATASUM) &&
+		    !btrfs_test_flag(inode, NODATASUM)) {
+			btrfs_lookup_bio_sums(root, inode, bio);
+		}
 		goto mapit;
 	}
 
@@ -598,58 +602,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
-{
-	int ret = 0;
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
-
-	/*
-	 * It is possible there is an ordered extent that has
-	 * not yet finished for this range in the file.  If so,
-	 * that extent will have a csum cached, and it will insert
-	 * the sum after all the blocks in the extent are fully
-	 * on disk.  So, look for an ordered extent and use the
-	 * sum if found.  We have to do this before looking in the
-	 * btree because csum items are pre-inserted based on
-	 * the file size.  btrfs_lookup_csum might find an item
-	 * that still hasn't been fully filled.
-	 */
-	ret = btrfs_find_ordered_sum(inode, start, &csum);
-	if (ret == 0)
-		goto found;
-
-	ret = 0;
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-		       start);
-		goto out;
-	}
-	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
-			   BTRFS_CRC32_SIZE);
-found:
-	set_state_private(io_tree, start, csum);
-out:
-	if (path)
-		btrfs_free_path(path);
-	return ret;
-}
-
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -3613,7 +3565,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
-	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-- 
cgit v1.2.3


From 3ce7e67a069b919be774a341b82fc20978b7f69d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 15:42:54 -0400
Subject: Btrfs: Drop some debugging around the extent_map pinned flag

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  | 10 +---------
 fs/btrfs/inode.c | 22 ----------------------
 2 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3f2fe0b7c6..c78f184ee5c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -380,15 +380,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			spin_unlock(&em_tree->lock);
 			break;
 		}
-		if (test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-			printk(KERN_CRIT "inode %lu trying to drop pinned "
-			       "extent start %llu end %llu, em [%llu %llu]\n",
-			       inode->i_ino,
-			       (unsigned long long)start,
-			       (unsigned long long)end,
-			       (unsigned long long)em->start,
-			       (unsigned long long)em->len);
-		}
+		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 31d52c51acc..76365ad9bfb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -505,15 +505,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct extent_map *em;
-	struct extent_map *em_orig;
 	u64 alloc_hint = 0;
-	u64 clear_start;
-	u64 clear_end;
 	struct list_head list;
 	struct btrfs_key ins;
-	struct rb_node *rb;
 	int ret;
 
 	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
@@ -542,22 +536,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
-	spin_lock(&em_tree->lock);
-	clear_start = ordered_extent->file_offset;
-	clear_end = ordered_extent->file_offset + ordered_extent->len;
-	em = lookup_extent_mapping(em_tree, clear_start,
-				   ordered_extent->len);
-	em_orig = em;
-	while(em && clear_start < extent_map_end(em) && clear_end > em->start) {
-		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
-		rb = rb_next(&em->rb_node);
-		if (!rb)
-			break;
-		em = rb_entry(rb, struct extent_map, rb_node);
-	}
-	free_extent_map(em_orig);
-	spin_unlock(&em_tree->lock);
-
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
-- 
cgit v1.2.3


From 492bb6deee3416ad792dcd8584ebd95c463af1a6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 16:29:02 -0400
Subject: Btrfs: Hold a reference on bios during submit_bio, add some extra bio
 checks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18db4cbe279..0de51e37e78 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,11 @@ loop:
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
 		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+
+		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
+		bio_put(cur);
 		num_run++;
 
 		/*
@@ -188,10 +192,11 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && num_run && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi)) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
+
 			old_head = device->pending_bios;
 			device->pending_bios = pending;
 			if (device->pending_bio_tail)
@@ -2125,7 +2130,9 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
+		bio_get(bio);
 		submit_bio(rw, bio);
+		bio_put(bio);
 		return 0;
 	}
 
@@ -2136,6 +2143,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	 * on a queue for later
 	 */
 	atomic_inc(&root->fs_info->nr_async_submits);
+	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
 
-- 
cgit v1.2.3


From f87f057b49ee52cf5c627ab27a706e3252767c9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Aug 2008 11:27:23 -0400
Subject: Btrfs: Improve and cleanup locking done by walk_down_tree

While dropping snapshots, walk_down_tree does most of the work of checking
reference counts and limiting tree traversal to just the blocks that
we are freeing.

It dropped and held the allocation mutex in strange and confusing ways.
This commit changes it to only hold the mutex while actually freeing a block.

The rest of the checks around reference counts should be safe without the lock
because we only allow one process in btrfs_drop_snapshot at a time.  Other
processes dropping reference counts should not drop it to 1 because
their tree roots already have an extra ref on the block.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 85 +++++++++++++++++++++++++++++++++-----------------
 fs/btrfs/file.c        | 13 +++++---
 fs/btrfs/ioctl.c       |  6 ++++
 3 files changed, 70 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b003b4364dd..58bceeeda9c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2333,8 +2333,6 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
-
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
@@ -2362,8 +2360,6 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 	}
-
-	mutex_lock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2375,7 +2371,6 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_extent_info *info = ref->extents;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	for (i = 0; i < ref->nritems; i++) {
 		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root,
@@ -2386,7 +2381,6 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 		BUG_ON(ret);
 		info++;
 	}
-	mutex_lock(&root->fs_info->alloc_mutex);
 
 	return 0;
 }
@@ -2440,10 +2434,39 @@ int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
 	int ret;
-	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	ret = lookup_extent_ref(NULL, root, start, len, refs);
+	BUG_ON(ret);
+
+#if 0 // some debugging code in case we see problems here
+	/* if the refs count is one, it won't get increased again.  But
+	 * if the ref count is > 1, someone may be decreasing it at
+	 * the same time we are.
+	 */
+	if (*refs != 1) {
+		struct extent_buffer *eb = NULL;
+		eb = btrfs_find_create_tree_block(root, start, len);
+		if (eb)
+			btrfs_tree_lock(eb);
+
+		mutex_lock(&root->fs_info->alloc_mutex);
+		ret = lookup_extent_ref(NULL, root, start, len, refs);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+
+		if (eb) {
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		if (*refs == 1) {
+			printk("block %llu went down to one during drop_snap\n",
+			       (unsigned long long)start);
+		}
+
+	}
+#endif
+
 	cond_resched();
-	mutex_lock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -2467,8 +2490,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	int ret;
 	u32 refs;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
@@ -2507,13 +2528,21 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			root_owner = btrfs_header_owner(parent);
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
+
+			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, root_owner,
 						root_gen, 0, 0, 1);
 			BUG_ON(ret);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			continue;
 		}
-
+		/*
+		 * at this point, we have a single ref, and since the
+		 * only place referencing this extent is a dead root
+		 * the reference count should never go higher.
+		 * So, we don't need to check it again
+		 */
 		if (*level == 1) {
 			struct btrfs_key key;
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
@@ -2533,33 +2562,23 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 
 			if (path->slots[*level] == 0)
 				reada_walk_down(root, cur, path->slots[*level]);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
-
-			/* we've dropped the lock, double check */
+#if 0
+			/*
+			 * this is a debugging check and can go away
+			 * the ref should never go all the way down to 1
+			 * at this point
+			 */
 			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
 						&refs);
 			BUG_ON(ret);
-			if (refs != 1) {
-				parent = path->nodes[*level];
-				root_owner = btrfs_header_owner(parent);
-				root_gen = btrfs_header_generation(parent);
-
-				path->slots[*level]++;
-				free_extent_buffer(next);
-				ret = __btrfs_free_extent(trans, root, bytenr,
-							blocksize,
-							root_owner,
-							root_gen, 0, 0, 1);
-				BUG_ON(ret);
-				continue;
-			}
+			WARN_ON(refs != 1);
+#endif
 		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
@@ -2584,6 +2603,8 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+
+	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
@@ -2591,6 +2612,7 @@ out:
 	*level += 1;
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	cond_resched();
 	return 0;
 }
@@ -2834,6 +2856,11 @@ again:
 		}
 		set_page_extent_mapped(page);
 
+		/*
+		 * make sure page_mkwrite is called for this page if userland
+		 * wants to change it from mmap
+		 */
+		clear_page_dirty_for_io(page);
 
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c78f184ee5c..8915f2dc1bc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -338,6 +338,13 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+
+		/*
+		 * an ugly way to do all the prop accounting around
+		 * the page bits and mapping tags
+		 */
+		set_page_writeback(pages[0]);
+		end_page_writeback(pages[0]);
 		did_inline = 1;
 	}
 	if (end_pos > isize) {
@@ -833,11 +840,7 @@ again:
 			      start_pos, last_pos - 1, GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(pages[i]);
-#else
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-#endif
+		clear_page_dirty_for_io(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5e627746c4e..224da287b3e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -268,6 +268,12 @@ again:
 		}
 		set_page_extent_mapped(page);
 
+		/*
+		 * this makes sure page_mkwrite is called on the
+		 * page if it is dirtied again later
+		 */
+		clear_page_dirty_for_io(page);
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
-- 
cgit v1.2.3


From 18e35e0ab337ec99c7e03e9ae917745a352c0bb1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Aug 2008 13:11:41 -0400
Subject: Btrfs: Throttle less often waiting for snapshots to delete

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  7 +++++--
 fs/btrfs/transaction.c | 14 --------------
 2 files changed, 5 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 58bceeeda9c..74bcd48a9c4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2535,6 +2535,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 						root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->alloc_mutex);
+
+			atomic_inc(&root->fs_info->throttle_gen);
+			wake_up(&root->fs_info->transaction_throttle);
+
 			continue;
 		}
 		/*
@@ -2603,7 +2607,6 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
-
 	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  root_owner, root_gen, 0, 0, 1);
@@ -2726,7 +2729,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 	}
 	while(1) {
-		atomic_inc(&root->fs_info->throttle_gen);
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -2742,6 +2744,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = -EAGAIN;
 			break;
 		}
+		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
 	}
 	for (i = 0; i <= orig_level; i++) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66af5140c8c..a6877949930 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -211,11 +211,9 @@ static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 
-harder:
 	if (atomic_read(&info->throttles)) {
 		DEFINE_WAIT(wait);
 		int thr;
-		int harder_count = 0;
 		thr = atomic_read(&info->throttle_gen);
 
 		do {
@@ -228,18 +226,6 @@ harder:
 			schedule();
 			finish_wait(&info->transaction_throttle, &wait);
 		} while (thr == atomic_read(&info->throttle_gen));
-
-		if (harder_count < 5 &&
-		    info->total_ref_cache_size > 1 * 1024 * 1024) {
-			harder_count++;
-			goto harder;
-		}
-
-		if (harder_count < 10 &&
-		    info->total_ref_cache_size > 5 * 1024 * 1024) {
-			harder_count++;
-			goto harder;
-		}
 	}
 }
 
-- 
cgit v1.2.3


From 65b51a009e29e64c0951f21ea17fdc66bbb0fbd7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 1 Aug 2008 15:11:20 -0400
Subject: btrfs_search_slot: reduce lock contention by cowing in two stages

A btree block cow has two parts, the first is to allocate a destination
block and the second is to copy the old block over.

The first part needs locks in the extent allocation tree, and may need to
do IO.  This changeset splits that into a separate function that can be
called without any tree locks held.

btrfs_search_slot is changed to drop its path and start over if it has
to COW a contended block.  This often means that many writers will
pre-alloc a new destination for the same contended block, but they
cache their prealloc for later use on lower levels in the tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 147 ++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/ctree.h       |   6 +-
 fs/btrfs/extent-tree.c |  49 ++++++++++-------
 fs/btrfs/locking.c     |  16 ++++++
 fs/btrfs/locking.h     |   1 +
 fs/btrfs/transaction.c |   2 +-
 6 files changed, 173 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4792062dd5..7114faafa9d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -181,7 +181,8 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
 			     struct extent_buffer **cow_ret,
-			     u64 search_start, u64 empty_size)
+			     u64 search_start, u64 empty_size,
+			     u64 prealloc_dest)
 {
 	u64 root_gen;
 	struct extent_buffer *cow;
@@ -216,10 +217,27 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	} else {
 		first_key.objectid = 0;
 	}
-	cow = btrfs_alloc_free_block(trans, root, buf->len,
-				     root->root_key.objectid,
-				     root_gen, first_key.objectid, level,
-				     search_start, empty_size);
+	if (prealloc_dest) {
+		struct btrfs_key ins;
+
+		ins.objectid = prealloc_dest;
+		ins.offset = buf->len;
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		ret = btrfs_alloc_reserved_extent(trans, root,
+						  root->root_key.objectid,
+						  root_gen, level,
+						  first_key.objectid,
+						  &ins);
+		BUG_ON(ret);
+		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
+					    buf->len);
+	} else {
+		cow = btrfs_alloc_free_block(trans, root, buf->len,
+					     root->root_key.objectid,
+					     root_gen, first_key.objectid,
+					     level, search_start, empty_size);
+	}
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -279,7 +297,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret)
+		    struct extent_buffer **cow_ret, u64 prealloc_dest)
 {
 	u64 search_start;
 	u64 header_trans;
@@ -302,12 +320,14 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
 		spin_unlock(&root->fs_info->hash_lock);
+		WARN_ON(prealloc_dest);
 		return 0;
 	}
 	spin_unlock(&root->fs_info->hash_lock);
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
-				 parent_slot, cow_ret, search_start, 0);
+				 parent_slot, cow_ret, search_start, 0,
+				 prealloc_dest);
 	return ret;
 }
 
@@ -451,7 +471,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
-					    (end_slot - i) * blocksize));
+					    (end_slot - i) * blocksize), 0);
 		if (err) {
 			btrfs_tree_unlock(cur);
 			free_extent_buffer(cur);
@@ -803,7 +823,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 		child = read_node_slot(root, mid, 0);
 		btrfs_tree_lock(child);
 		BUG_ON(!child);
-		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
 
 		spin_lock(&root->node_lock);
@@ -836,7 +856,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	if (left) {
 		btrfs_tree_lock(left);
 		wret = btrfs_cow_block(trans, root, left,
-				       parent, pslot - 1, &left);
+				       parent, pslot - 1, &left, 0);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -846,7 +866,7 @@ static int balance_level(struct btrfs_trans_handle *trans,
 	if (right) {
 		btrfs_tree_lock(right);
 		wret = btrfs_cow_block(trans, root, right,
-				       parent, pslot + 1, &right);
+				       parent, pslot + 1, &right, 0);
 		if (wret) {
 			ret = wret;
 			goto enospc;
@@ -1021,7 +1041,7 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 			wret = 1;
 		} else {
 			ret = btrfs_cow_block(trans, root, left, parent,
-					      pslot - 1, &left);
+					      pslot - 1, &left, 0);
 			if (ret)
 				wret = 1;
 			else {
@@ -1069,7 +1089,7 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		} else {
 			ret = btrfs_cow_block(trans, root, right,
 					      parent, pslot + 1,
-					      &right);
+					      &right, 0);
 			if (ret)
 				wret = 1;
 			else {
@@ -1245,6 +1265,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	u8 lowest_level = 0;
 	u64 blocknr;
 	u64 gen;
+	struct btrfs_key prealloc_block;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
@@ -1253,6 +1274,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (ins_len < 0)
 		lowest_unlock = 2;
+
+	prealloc_block.objectid = 0;
+
 again:
 	if (p->skip_locking)
 		b = btrfs_root_node(root);
@@ -1261,27 +1285,82 @@ again:
 
 	while (b) {
 		level = btrfs_header_level(b);
+
+		/*
+		 * setup the path here so we can release it under lock
+		 * contention with the cow code
+		 */
+		p->nodes[level] = b;
+		if (!p->skip_locking)
+			p->locks[level] = 1;
+
 		if (cow) {
 			int wret;
+
+			/* is a cow on this block not required */
+			spin_lock(&root->fs_info->hash_lock);
+			if (btrfs_header_generation(b) == trans->transid &&
+			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
+				spin_unlock(&root->fs_info->hash_lock);
+				goto cow_done;
+			}
+			spin_unlock(&root->fs_info->hash_lock);
+
+			/* ok, we have to cow, is our old prealloc the right
+			 * size?
+			 */
+			if (prealloc_block.objectid &&
+			    prealloc_block.offset != b->len) {
+				btrfs_free_reserved_extent(root,
+					   prealloc_block.objectid,
+					   prealloc_block.offset);
+				prealloc_block.objectid = 0;
+			}
+
+			/*
+			 * for higher level blocks, try not to allocate blocks
+			 * with the block and the parent locks held.
+			 */
+			if (level > 1 && !prealloc_block.objectid &&
+			    btrfs_path_lock_waiting(p, level)) {
+				u32 size = b->len;
+				u64 hint = b->start;
+
+				btrfs_release_path(root, p);
+				ret = btrfs_reserve_extent(trans, root,
+							   size, size, 0,
+							   hint, (u64)-1,
+							   &prealloc_block, 0);
+				BUG_ON(ret);
+				goto again;
+			}
+
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
-					       &b);
+					       &b, prealloc_block.objectid);
+			prealloc_block.objectid = 0;
 			if (wret) {
 				free_extent_buffer(b);
-				return wret;
+				ret = wret;
+				goto done;
 			}
 		}
+cow_done:
 		BUG_ON(!cow && ins_len);
 		if (level != btrfs_header_level(b))
 			WARN_ON(1);
 		level = btrfs_header_level(b);
+
 		p->nodes[level] = b;
 		if (!p->skip_locking)
 			p->locks[level] = 1;
+
 		ret = check_block(root, p, level);
-		if (ret)
-			return -1;
+		if (ret) {
+			ret = -1;
+			goto done;
+		}
 
 		ret = bin_search(b, key, level, &slot);
 		if (level != 0) {
@@ -1292,15 +1371,19 @@ again:
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
-				if (sret)
-					return sret;
+				if (sret) {
+					ret = sret;
+					goto done;
+				}
 				b = p->nodes[level];
 				slot = p->slots[level];
 			} else if (ins_len < 0) {
 				int sret = balance_level(trans, root, p,
 							 level);
-				if (sret)
-					return sret;
+				if (sret) {
+					ret = sret;
+					goto done;
+				}
 				b = p->nodes[level];
 				if (!b) {
 					btrfs_release_path(NULL, p);
@@ -1362,14 +1445,24 @@ again:
 				int sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
 				BUG_ON(sret > 0);
-				if (sret)
-					return sret;
+				if (sret) {
+					ret = sret;
+					goto done;
+				}
 			}
 			unlock_up(p, level, lowest_unlock);
-			return ret;
+			goto done;
 		}
 	}
-	return 1;
+	ret = 1;
+done:
+	if (prealloc_block.objectid) {
+		btrfs_free_reserved_extent(root,
+			   prealloc_block.objectid,
+			   prealloc_block.offset);
+	}
+
+	return ret;
 }
 
 /*
@@ -1840,7 +1933,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, right, upper,
-			      slot + 1, &right);
+			      slot + 1, &right, 0);
 	if (ret)
 		goto out_unlock;
 
@@ -2021,7 +2114,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* cow and double check */
 	ret = btrfs_cow_block(trans, root, left,
-			      path->nodes[1], slot - 1, &left);
+			      path->nodes[1], slot - 1, &left, 0);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
 		ret = 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d788ab0dcd9..9b025960bbd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1421,6 +1421,9 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u64 bytenr, u32 blocksize);
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
@@ -1451,6 +1454,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, u64 bytenr, u64 num_bytes,
 		      u64 root_objectid, u64 ref_generation,
 		      u64 owner_objectid, u64 owner_offset, int pin);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_io_tree *unpin);
@@ -1484,7 +1488,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
-		    struct extent_buffer **cow_ret);
+		    struct extent_buffer **cow_ret, u64 prealloc_dest);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 74bcd48a9c4..98a1c0faeda 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2118,6 +2118,15 @@ again:
 	return 0;
 }
 
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+{
+	maybe_lock_mutex(root);
+	set_extent_dirty(&root->fs_info->free_space_cache,
+			 start, start + len - 1, GFP_NOFS);
+	maybe_unlock_mutex(root);
+	return 0;
+}
+
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -2267,6 +2276,26 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	maybe_unlock_mutex(root);
 	return ret;
 }
+
+struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    u64 bytenr, u32 blocksize)
+{
+	struct extent_buffer *buf;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_tree_lock(buf);
+	clean_tree_block(trans, root, buf);
+	btrfs_set_buffer_uptodate(buf);
+	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+			 buf->start + buf->len - 1, GFP_NOFS);
+	trans->blocks_used++;
+	return buf;
+}
+
 /*
  * helper function to allocate a block for a given tree
  * returns the tree buffer or NULL.
@@ -2293,26 +2322,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
 	}
-	buf = btrfs_find_create_tree_block(root, ins.objectid, blocksize);
-	if (!buf) {
-		btrfs_free_extent(trans, root, ins.objectid, blocksize,
-				  root->root_key.objectid, ref_generation,
-				  0, 0, 0);
-		return ERR_PTR(-ENOMEM);
-	}
-	btrfs_set_header_generation(buf, trans->transid);
-	btrfs_tree_lock(buf);
-	clean_tree_block(trans, root, buf);
-	btrfs_set_buffer_uptodate(buf);
-
-	if (PageDirty(buf->first_page)) {
-		printk("page %lu dirty\n", buf->first_page->index);
-		WARN_ON(1);
-	}
 
-	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
-			 buf->start + buf->len - 1, GFP_NOFS);
-	trans->blocks_used++;
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
 	return buf;
 }
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d43e14c7471..0cc314c10d6 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,3 +56,19 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 {
 	return mutex_is_locked(&eb->mutex);
 }
+
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
+{
+	int i;
+	struct extent_buffer *eb;
+	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
+		eb = path->nodes[i];
+		if (!eb)
+			break;
+		smp_mb();
+		if (!list_empty(&eb->mutex.wait_list))
+			return 1;
+	}
+	return 0;
+}
+
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 2dab96d8280..bc1faef1251 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -23,4 +23,5 @@ int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
 int btrfs_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_lock(struct extent_buffer *eb);
+int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a6877949930..9d84daf1000 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -622,7 +622,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
 
 	btrfs_copy_root(trans, root, old, &tmp, objectid);
 	btrfs_tree_unlock(old);
-- 
cgit v1.2.3


From 2dd3e67b1eaec8504da7e12b8afee77323a49f38 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 08:20:15 -0400
Subject: Btrfs: More throttle tuning

* Make walk_down_tree wake up throttled tasks more often
* Make walk_down_tree call cond_resched during long loops
* As the size of the ref cache grows, wait longer in throttle
* Get rid of the reada code in walk_down_tree, the leaves don't get
  read anymore, thanks to the ref cache.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 13 ++---------
 fs/btrfs/extent-tree.c | 59 ++++++++++----------------------------------------
 fs/btrfs/transaction.c | 15 +++++++++++++
 3 files changed, 29 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d2d1cc87e8a..da9dda4338a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -188,13 +188,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
-		int from_this_trans = 0;
-
-		if (root->fs_info->running_transaction &&
-		    btrfs_header_generation(buf) ==
-		    root->fs_info->running_transaction->transid)
-			from_this_trans = 1;
-
 		/* FIXME, this is not good */
 		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
 			u32 val;
@@ -203,11 +196,9 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 
 			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
 			printk("btrfs: %s checksum verify failed on %llu "
-			       "wanted %X found %X from_this_trans %d "
-			       "level %d\n",
+			       "wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
-			       buf->start, val, found, from_this_trans,
-			       btrfs_header_level(buf));
+			       buf->start, val, found, btrfs_header_level(buf));
 			return 1;
 		}
 	} else {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 98a1c0faeda..1aeb695078b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2369,6 +2369,11 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 				leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+
 		BUG_ON(ret);
 	}
 	return 0;
@@ -2389,6 +2394,11 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 					ref->owner, ref->generation,
 					info->objectid, info->offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+
 		BUG_ON(ret);
 		info++;
 	}
@@ -2396,51 +2406,6 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static void noinline reada_walk_down(struct btrfs_root *root,
-				     struct extent_buffer *node,
-				     int slot)
-{
-	u64 bytenr;
-	u64 last = 0;
-	u32 nritems;
-	u32 refs;
-	u32 blocksize;
-	int ret;
-	int i;
-	int level;
-	int skipped = 0;
-
-	nritems = btrfs_header_nritems(node);
-	level = btrfs_header_level(node);
-	if (level)
-		return;
-
-	for (i = slot; i < nritems && skipped < 32; i++) {
-		bytenr = btrfs_node_blockptr(node, i);
-		if (last && ((bytenr > last && bytenr - last > 32 * 1024) ||
-			     (last > bytenr && last - bytenr > 32 * 1024))) {
-			skipped++;
-			continue;
-		}
-		blocksize = btrfs_level_size(root, level - 1);
-		if (i != slot) {
-			ret = lookup_extent_ref(NULL, root, bytenr,
-						blocksize, &refs);
-			BUG_ON(ret);
-			if (refs != 1) {
-				skipped++;
-				continue;
-			}
-		}
-		ret = readahead_tree_block(root, bytenr, blocksize,
-					   btrfs_node_ptr_generation(node, i));
-		last = bytenr + blocksize;
-		cond_resched();
-		if (ret)
-			break;
-	}
-}
-
 int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
@@ -2549,6 +2514,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 			atomic_inc(&root->fs_info->throttle_gen);
 			wake_up(&root->fs_info->transaction_throttle);
+			cond_resched();
 
 			continue;
 		}
@@ -2578,8 +2544,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
 
-			if (path->slots[*level] == 0)
-				reada_walk_down(root, cur, path->slots[*level]);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			cond_resched();
@@ -2601,6 +2565,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		path->nodes[*level-1] = next;
 		*level = btrfs_header_level(next);
 		path->slots[*level] = 0;
+		cond_resched();
 	}
 out:
 	WARN_ON(*level < 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9d84daf1000..cf73342e821 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -210,7 +210,9 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
+	int harder_count = 0;
 
+harder:
 	if (atomic_read(&info->throttles)) {
 		DEFINE_WAIT(wait);
 		int thr;
@@ -226,6 +228,19 @@ static void throttle_on_drops(struct btrfs_root *root)
 			schedule();
 			finish_wait(&info->transaction_throttle, &wait);
 		} while (thr == atomic_read(&info->throttle_gen));
+		harder_count++;
+
+		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
+		    harder_count < 2)
+			goto harder;
+
+		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
+		    harder_count < 10)
+			goto harder;
+
+		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
+		    harder_count < 20)
+			goto harder;
 	}
 }
 
-- 
cgit v1.2.3


From 6dab81574346c831ded96ae3ab0e8f9ca72c37ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 08:35:53 -0400
Subject: Btrfs: Hold csum mutex while reading in sums during readpages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 2 +-
 fs/btrfs/inode.c     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a5ff19b34b2..70af24aa582 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -150,7 +150,6 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 	path = btrfs_alloc_path();
-	path->reada = 2;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
@@ -177,6 +176,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				printk("no csum found for inode %lu start "
 				       "%llu\n", inode->i_ino,
 				       (unsigned long long)offset);
+				item = NULL;
 				goto found;
 			}
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 76365ad9bfb..640648c66b2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -376,7 +376,9 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	if (!(rw & (1 << BIO_RW))) {
 		if (!btrfs_test_opt(root, NODATASUM) &&
 		    !btrfs_test_flag(inode, NODATASUM)) {
+			mutex_lock(&BTRFS_I(inode)->csum_mutex);
 			btrfs_lookup_bio_sums(root, inode, bio);
+			mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 		}
 		goto mapit;
 	}
-- 
cgit v1.2.3


From 3117a77370b6cb902191568e4e647cdcba083d0a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 11:10:20 -0400
Subject: Btrfs: Add support for HW assisted crc32c

Intel doesn't yet ship hardware to the public with this enabled, but when they
do, they will be ready.  Original code from:

Austin Zhang <austin_zhang@linux.intel.com>

It is currently disabled, but edit crc32c.h to turn it on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/crc32c.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index a93255b4ee2..bf6c12e8573 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -1,7 +1,95 @@
+#ifndef __BTRFS_CRC32C__
+#define __BTRFS_CRC32C__
 #include <asm/byteorder.h>
 #include <linux/crc32c.h>
 #include <linux/version.h>
 
+/* #define CONFIG_BTRFS_HW_SUM 1 */
+
+#ifdef CONFIG_BTRFS_HW_SUM
+#ifdef CONFIG_X86
+/*
+ * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
+ * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
+ * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
+ * http://www.intel.com/products/processor/manuals/
+ * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
+ * Volume 2A: Instruction Set Reference, A-M
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+
+#define X86_FEATURE_XMM4_2     (4*32+20) /* Streaming SIMD Extensions-4.2 */
+#define cpu_has_xmm4_2         boot_cpu_has(X86_FEATURE_XMM4_2)
+
+#ifdef CONFIG_X86_64
+#define REX_PRE	"0x48, "
+#define SCALE_F	8
+#else
+#define REX_PRE
+#define SCALE_F	4
+#endif
+
+static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
+				   size_t length)
+{
+	while (length--) {
+		__asm__ __volatile__(
+			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
+			:"=S"(crc)
+			:"0"(crc), "c"(*data)
+		);
+		data++;
+	}
+
+	return crc;
+}
+
+static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
+				     size_t len)
+{
+	unsigned int iquotient = len / SCALE_F;
+	unsigned int iremainder = len % SCALE_F;
+#ifdef CONFIG_X86_64
+	u64 *ptmp = (u64 *)p;
+#else
+	u32 *ptmp = (u32 *)p;
+#endif
+
+	while (iquotient--) {
+		__asm__ __volatile__(
+			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
+			:"=S"(crc)
+			:"0"(crc), "c"(*ptmp)
+		);
+		ptmp++;
+	}
+
+	if (iremainder)
+		crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
+					      iremainder);
+
+	return crc;
+}
+#endif /* CONFIG_BTRFS_HW_SUM */
+
+static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
+				 size_t len)
+{
+#ifdef CONFIG_BTRFS_HW_SUM
+	if (cpu_has_xmm4_2)
+		return btrfs_crc32c_le_hw(crc, address, len);
+#endif
+	return crc32c_le(crc, address, len);
+}
+
+#else
+
+#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
+
+#endif /* CONFIG_X86 */
+
 /**
  * implementation of crc32c_le() changed in linux-2.6.23,
  * has of v0.13 btrfs-progs is using the latest version.
@@ -10,8 +98,11 @@
  */
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
 #define btrfs_crc32c(seed, data, length) \
-	__cpu_to_le32( crc32c( __le32_to_cpu(seed), data, length) )
+	__cpu_to_le32( __btrfs_crc32c( __le32_to_cpu(seed), \
+		                      (unsigned char const *)data, length) )
 #else
 #define btrfs_crc32c(seed, data, length) \
-	crc32c(seed, data, length)
+	__btrfs_crc32c(seed, (unsigned char const *)data, length)
+#endif
 #endif
+
-- 
cgit v1.2.3


From 9ca9ee09c176a814189063c8b88f75c8f8e4ad19 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 4 Aug 2008 10:41:27 -0400
Subject: Btrfs: fix ioctl-initiated transactions vs wait_current_trans()

Commit 597:466b27332893 (btrfs_start_transaction: wait for commits in
progress) breaks the transaction start/stop ioctls by making
btrfs_start_transaction conditionally wait for the next transaction to
start.  If an application is artificially holding a transaction open,
things deadlock.

This workaround maintains a count of open ioctl-initiated transactions in
fs_info, and avoids wait_current_trans() if any are currently open (in
start_transaction() and btrfs_throttle()).  The start transaction ioctl
uses a new btrfs_start_ioctl_transaction() that _does_ call
wait_current_trans(), effectively pushing the join/wait decision to the
outer ioctl-initiated transaction.

This more or less neuters btrfs_throttle() when ioctl-initiated
transactions are in use, but that seems like a pretty fundamental
consequence of wrapping lots of write()'s in a transaction.  Btrfs has no
way to tell if the application considers a given operation as part of its
transaction.

Obviously, if the transaction start/stop ioctls aren't being used, there
is no effect on current behavior.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 ctree.h       |    1 +
 ioctl.c       |   12 +++++++++++-
 transaction.c |   18 +++++++++++++-----
 transaction.h |    2 ++
 4 files changed, 27 insertions(+), 6 deletions(-)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/ioctl.c       | 12 +++++++++++-
 fs/btrfs/transaction.c | 18 +++++++++++++-----
 fs/btrfs/transaction.h |  2 ++
 4 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9b025960bbd..62499dd761b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -518,6 +518,7 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
 	u64 max_inline;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 224da287b3e..0b63c3c77cf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -715,7 +715,12 @@ long btrfs_ioctl_trans_start(struct file *file)
 		ret = -EINPROGRESS;
 		goto out;
 	}
-	trans = btrfs_start_transaction(root, 0);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans++;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
+	trans = btrfs_start_ioctl_transaction(root, 0);
 	if (trans)
 		file->private_data = trans;
 	else
@@ -745,6 +750,11 @@ long btrfs_ioctl_trans_end(struct file *file)
 	}
 	btrfs_end_transaction(trans, root);
 	file->private_data = 0;
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans--;
+	mutex_unlock(&root->fs_info->trans_mutex);
+
 out:
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cf73342e821..a2c821e3c3a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -152,14 +152,14 @@ static void wait_current_trans(struct btrfs_root *root)
 }
 
 struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-					     int num_blocks, int join)
+					     int num_blocks, int wait)
 {
 	struct btrfs_trans_handle *h =
 		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
 	int ret;
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	if (!join)
+	if ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)
 		wait_current_trans(root);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -180,14 +180,21 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
-	return start_transaction(root, num_blocks, 0);
+	return start_transaction(root, num_blocks, 1);
 }
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						   int num_blocks)
 {
-	return start_transaction(root, num_blocks, 1);
+	return start_transaction(root, num_blocks, 0);
 }
 
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+							 int num_blocks)
+{
+	return start_transaction(r, num_blocks, 2);
+}
+
+
 static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
@@ -247,7 +254,8 @@ harder:
 void btrfs_throttle(struct btrfs_root *root)
 {
 	mutex_lock(&root->fs_info->trans_mutex);
-	wait_current_trans(root);
+	if (!root->fs_info->open_ioctl_trans)
+		wait_current_trans(root);
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	throttle_on_drops(root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f5adb23151f..2c73caeebb2 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -83,6 +83,8 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_blocks);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
 						   int num_blocks);
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+						   int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From d7a029a89ef370e74b63f18b81498d90d1ee3cc1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 23:17:26 -0400
Subject: Btrfs: Don't corrupt ram in shrink_extent_tree, leak it instead

Far from the perfect fix, but these structs are small.  TODO for the
next release.  The block group cache structs are referenced in many
different places, and it isn't safe to just free them while resizing.

A real fix will be a larger change to the allocator so that it doesn't
have to carry around the block group cache structs to find good places
to search for free blocks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1aeb695078b..dbde1288124 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3437,8 +3437,10 @@ next:
 			   key.objectid, key.objectid + key.offset - 1,
 			   (unsigned int)-1, GFP_NOFS);
 
+	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
 	kfree(shrink_block_group);
+	*/
 
 	btrfs_del_item(trans, root, path);
 	btrfs_release_path(root, path);
-- 
cgit v1.2.3


From ea8c281947950fac5f78818b767821d696c9512a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 23:17:27 -0400
Subject: Btrfs: Maintain a list of inodes that are delalloc and a way to wait
 on them

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 12 +++++++++---
 fs/btrfs/file.c        |  3 +--
 fs/btrfs/inode.c       | 47 +++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/ioctl.c       |  4 ++--
 7 files changed, 60 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index df624fd735c..534ac4153dd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -42,6 +42,8 @@ struct btrfs_inode {
 	/* for keeping track of orphaned inodes */
 	struct list_head i_orphan;
 
+	struct list_head delalloc_inodes;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 62499dd761b..116aee21bf7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -551,6 +551,7 @@ struct btrfs_fs_info {
 	 */
 	spinlock_t ordered_extent_lock;
 	struct list_head ordered_extents;
+	struct list_head delalloc_inodes;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -637,6 +638,7 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+
 	u64 objectid;
 	u64 last_trans;
 
@@ -1651,6 +1653,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 #define PageChecked PageFsMisc
 #endif
 
+int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index da9dda4338a..76543683f3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1234,6 +1234,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
+	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
 	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dbde1288124..33cb2ac4cb2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1230,7 +1230,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
 		found->full = 0;
-		WARN_ON(found->total_bytes < found->bytes_used);
 		*space_info = found;
 		return 0;
 	}
@@ -2841,8 +2840,7 @@ again:
 		 */
 		clear_page_dirty_for_io(page);
 
-		set_extent_delalloc(io_tree, page_start,
-				    page_end, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
 		set_page_dirty(page);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3319,6 +3317,13 @@ again:
 	key.type = 0;
 	cur_byte = key.objectid;
 
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
+	btrfs_start_delalloc_inodes(root);
+	btrfs_wait_ordered_extents(tree_root);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -3401,6 +3406,7 @@ next:
 
 		btrfs_clean_old_snapshots(tree_root);
 
+		btrfs_start_delalloc_inodes(root);
 		btrfs_wait_ordered_extents(tree_root);
 
 		trans = btrfs_start_transaction(tree_root, 1);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8915f2dc1bc..eb8e4556fa7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -312,8 +312,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * to reset the delalloc bit on things that already have
 		 * extents reserved.
 		 */
-		set_extent_delalloc(io_tree, start_pos,
-				    end_of_last_block, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 640648c66b2..8a405a5fa6a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -303,6 +303,10 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
+		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+				      &root->fs_info->delalloc_inodes);
+		}
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
@@ -325,6 +329,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
+		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
+		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		}
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 	}
 	return 0;
@@ -408,6 +416,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+{
+	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
+				   GFP_NOFS);
+}
+
 struct btrfs_writepage_fixup {
 	struct page *page;
 	struct btrfs_work work;
@@ -453,8 +467,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			    GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ClearPageChecked(page);
 out:
 	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
@@ -1530,8 +1543,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
 		kaddr = kmap(page);
@@ -1766,6 +1778,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 			     inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
@@ -2158,6 +2171,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
@@ -2400,6 +2414,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
@@ -3049,8 +3064,7 @@ again:
 		goto again;
 	}
 
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
+	btrfs_set_extent_delalloc(inode, page_start, page_end);
 	ret = 0;
 
 	/* page is wholly or partially inside EOF */
@@ -3373,6 +3387,26 @@ out_unlock:
 	return ret;
 }
 
+int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+{
+	struct list_head *head = &root->fs_info->delalloc_inodes;
+	struct btrfs_inode *binode;
+	unsigned long flags;
+
+	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	while(!list_empty(head)) {
+		binode = list_entry(head->next, struct btrfs_inode,
+				    delalloc_inodes);
+		atomic_inc(&binode->vfs_inode.i_count);
+		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+		filemap_write_and_wait(binode->vfs_inode.i_mapping);
+		iput(&binode->vfs_inode);
+		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	}
+	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+	return 0;
+}
+
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 			 const char *symname)
 {
@@ -3436,6 +3470,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 				     inode->i_mapping, GFP_NOFS);
 		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 				     inode->i_mapping, GFP_NOFS);
+		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0b63c3c77cf..e1046a54b1c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -274,8 +274,7 @@ again:
 		 */
 		clear_page_dirty_for_io(page);
 
-		set_extent_delalloc(io_tree, page_start,
-				    page_end, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		set_page_dirty(page);
@@ -784,6 +783,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_TRANS_END:
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
+		btrfs_start_delalloc_inodes(root);
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
 	}
-- 
cgit v1.2.3


From 3de9d6b649b4cc60687be92e71cef36d7d4e8f2f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 23:17:27 -0400
Subject: btrfs_lookup_bio_sums seems broken, go back to the readpage_io_hook
 for now

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c |  4 +++-
 fs/btrfs/inode.c     | 59 ++++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 70af24aa582..51aba8cee7c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,6 +134,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#if 0 /* broken */
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio)
 {
@@ -200,7 +201,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		diff = diff * BTRFS_CRC32_SIZE;
 
 		read_extent_buffer(path->nodes[0], &sum,
-				   (unsigned long)item + diff,
+				   ((unsigned long)item) + diff,
 				   BTRFS_CRC32_SIZE);
 found:
 		set_state_private(io_tree, offset, sum);
@@ -210,6 +211,7 @@ found:
 	btrfs_free_path(path);
 	return 0;
 }
+#endif
 
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8a405a5fa6a..99121a55ffb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,12 +382,6 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	BUG_ON(ret);
 
 	if (!(rw & (1 << BIO_RW))) {
-		if (!btrfs_test_opt(root, NODATASUM) &&
-		    !btrfs_test_flag(inode, NODATASUM)) {
-			mutex_lock(&BTRFS_I(inode)->csum_mutex);
-			btrfs_lookup_bio_sums(root, inode, bio);
-			mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-		}
 		goto mapit;
 	}
 
@@ -595,6 +589,58 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
+int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
+{
+	int ret = 0;
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_csum_item *item;
+	struct btrfs_path *path = NULL;
+	u32 csum;
+
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	/*
+	 * It is possible there is an ordered extent that has
+	 * not yet finished for this range in the file.  If so,
+	 * that extent will have a csum cached, and it will insert
+	 * the sum after all the blocks in the extent are fully
+	 * on disk.  So, look for an ordered extent and use the
+	 * sum if found.  We have to do this before looking in the
+	 * btree because csum items are pre-inserted based on
+	 * the file size.  btrfs_lookup_csum might find an item
+	 * that still hasn't been fully filled.
+	 */
+	ret = btrfs_find_ordered_sum(inode, start, &csum);
+	if (ret == 0)
+		goto found;
+
+	ret = 0;
+	path = btrfs_alloc_path();
+	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
+	if (IS_ERR(item)) {
+		ret = PTR_ERR(item);
+		/* a csum that isn't present is a preallocated region. */
+		if (ret == -ENOENT || ret == -EFBIG)
+			ret = 0;
+		csum = 0;
+		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+		       start);
+		goto out;
+	}
+	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
+			   BTRFS_CRC32_SIZE);
+found:
+	set_state_private(io_tree, start, csum);
+out:
+	if (path)
+		btrfs_free_path(path);
+	return ret;
+}
+
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -3580,6 +3626,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
+	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-- 
cgit v1.2.3


From b48652c101cce7a54379a49cc0cf854cec2c94e2 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 4 Aug 2008 23:23:47 -0400
Subject: Btrfs: Various small fixes.

This trivial patch contains two locking fixes and an off-by-one fix.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ordered-data.c | 4 ++--
 fs/btrfs/root-tree.c    | 3 +--
 fs/btrfs/super.c        | 2 ++
 fs/btrfs/transaction.c  | 9 +++++----
 fs/btrfs/transaction.h  | 3 +--
 5 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 676e4bd65c5..db200e6baf7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -548,7 +548,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 		 */
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 		if (test->file_offset > entry_end(ordered)) {
-			i_size_test = test->file_offset - 1;
+			i_size_test = test->file_offset;
 		}
 	} else {
 		i_size_test = i_size_read(inode);
@@ -561,7 +561,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * disk_i_size to the end of the region.
 	 */
 	if (i_size_test > entry_end(ordered) &&
-	    !test_range_bit(io_tree, entry_end(ordered), i_size_test,
+	    !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
 			   EXTENT_DELALLOC, 0)) {
 		new_i_size = min_t(u64, i_size_test, i_size_read(inode));
 	}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index a5c0e98b5ae..36726696e58 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -209,8 +209,7 @@ again:
 			goto err;
 		}
 
-		ret = btrfs_add_dead_root(dead_root, latest,
-					  &root->fs_info->dead_roots);
+		ret = btrfs_add_dead_root(dead_root, latest);
 		if (ret)
 			goto err;
 		goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a6a418b6894..eb4b357d05e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -449,7 +449,9 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		s->s_flags |= MS_ACTIVE;
 	}
 
+	mutex_lock(&s->s_root->d_inode->i_mutex);
 	root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+	mutex_unlock(&s->s_root->d_inode->i_mutex);
 	if (IS_ERR(root)) {
 		up_write(&s->s_umount);
 		deactivate_super(s);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a2c821e3c3a..ebf5362da1d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -389,9 +389,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_add_dead_root(struct btrfs_root *root,
-			struct btrfs_root *latest,
-			struct list_head *dead_list)
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 {
 	struct btrfs_dirty_root *dirty;
 
@@ -400,7 +398,10 @@ int btrfs_add_dead_root(struct btrfs_root *root,
 		return -ENOMEM;
 	dirty->root = root;
 	dirty->latest_root = latest;
-	list_add(&dirty->list, dead_list);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+	list_add(&dirty->list, &latest->fs_info->dead_roots);
+	mutex_unlock(&root->fs_info->trans_mutex);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2c73caeebb2..598baa31241 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -90,8 +90,7 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
-int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
-			struct list_head *dead_list);
+int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From ae01a0abf343aefe923ace5c1a8c634adfbe29c5 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 4 Aug 2008 23:23:47 -0400
Subject: Btrfs: Update clone file ioctl

This patch updates the file clone ioctl for the tree locking and new
data ordered code.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 150 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 70 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e1046a54b1c..3932c7cd0fa 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -517,32 +517,6 @@ out:
 	return ret;
 }
 
-int dup_item_to_inode(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       struct btrfs_path *path,
-		       struct extent_buffer *leaf,
-		       int slot,
-		       struct btrfs_key *key,
-		       u64 destino)
-{
-	char *dup;
-	int len = btrfs_item_size_nr(leaf, slot);
-	struct btrfs_key ckey = *key;
-	int ret = 0;
-
-	dup = kmalloc(len, GFP_NOFS);
-	if (!dup)
-		return -ENOMEM;
-
-	read_extent_buffer(leaf, dup, btrfs_item_ptr_offset(leaf, slot), len);
-	btrfs_release_path(root, path);
-
-	ckey.objectid = destino;
-	ret = btrfs_insert_item(trans, root, &ckey, dup, len);
-	kfree(dup);
-	return ret;
-}
-
 long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 {
 	struct inode *inode = fdentry(file)->d_inode;
@@ -550,22 +524,41 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	struct file *src_file;
 	struct inode *src;
 	struct btrfs_trans_handle *trans;
-	int ret;
-	u64 pos;
+	struct btrfs_ordered_extent *ordered;
 	struct btrfs_path *path;
-	struct btrfs_key key;
 	struct extent_buffer *leaf;
+	char *buf;
+	struct btrfs_key key;
+	struct btrfs_key new_key;
+	u32 size;
 	u32 nritems;
 	int slot;
+	int ret;
 
 	src_file = fget(src_fd);
 	if (!src_file)
 		return -EBADF;
 	src = src_file->f_dentry->d_inode;
 
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+		goto out_fput;
+
 	ret = -EXDEV;
-	if (src->i_sb != inode->i_sb)
+	if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
+		goto out_fput;
+
+	ret = -ENOMEM;
+	buf = vmalloc(btrfs_level_size(root, 0));
+	if (!buf)
+		goto out_fput;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		vfree(buf);
 		goto out_fput;
+	}
+	path->reada = 2;
 
 	if (inode < src) {
 		mutex_lock(&inode->i_mutex);
@@ -582,24 +575,22 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
-		filemap_write_and_wait(src->i_mapping);
 		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-		if (BTRFS_I(src)->delalloc_bytes == 0)
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
 			break;
 		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		btrfs_wait_ordered_range(src, 0, (u64)-1);
 	}
 
-	trans = btrfs_start_transaction(root, 0);
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	key.offset = 0;
-	key.type = BTRFS_EXTENT_DATA_KEY;
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
 	key.objectid = src->i_ino;
-	pos = 0;
-	path->reada = 2;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = 0;
 
 	while (1) {
 		/*
@@ -610,18 +601,19 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		if (ret < 0)
 			goto out;
 
-		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto out;
 			if (ret > 0)
 				break;
+			nritems = btrfs_header_nritems(path->nodes[0]);
 		}
 		leaf = path->nodes[0];
 		slot = path->slots[0];
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		nritems = btrfs_header_nritems(leaf);
 
+		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
 		    key.objectid != src->i_ino)
 			break;
@@ -629,66 +621,64 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			struct btrfs_file_extent_item *extent;
 			int found_type;
-			pos = key.offset;
+
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
-				u64 len = btrfs_file_extent_num_bytes(leaf,
-								      extent);
 				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
 								       extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
 								 extent);
-				u64 off = btrfs_file_extent_offset(leaf,
-								   extent);
-				btrfs_insert_file_extent(trans, root,
-							 inode->i_ino, pos,
-							 ds, dl, len, off);
 				/* ds == 0 means there's a hole */
 				if (ds != 0) {
-					btrfs_inc_extent_ref(trans, root,
+					ret = btrfs_inc_extent_ref(trans, root,
 						     ds, dl,
 						     root->root_key.objectid,
 						     trans->transid,
-						     inode->i_ino, pos);
+						     inode->i_ino, key.offset);
+					if (ret)
+						goto out;
 				}
-				pos = key.offset + len;
-			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				ret = dup_item_to_inode(trans, root, path,
-							leaf, slot, &key,
-							inode->i_ino);
-				if (ret)
-					goto out;
-				pos = key.offset + btrfs_item_size_nr(leaf,
-								      slot);
 			}
-		} else if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			ret = dup_item_to_inode(trans, root, path, leaf,
-						slot, &key, inode->i_ino);
+		}
 
-			if (ret)
-				goto out;
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
+		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			size = btrfs_item_size_nr(leaf, slot);
+			read_extent_buffer(leaf, buf,
+					   btrfs_item_ptr_offset(leaf, slot),
+					   size);
+			btrfs_release_path(root, path);
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.objectid = inode->i_ino;
+			ret = btrfs_insert_item(trans, root, &new_key,
+						buf, size);
+			BUG_ON(ret);
+		} else {
+			btrfs_release_path(root, path);
 		}
 		key.offset++;
-		btrfs_release_path(root, path);
 	}
-
 	ret = 0;
 out:
-	btrfs_free_path(path);
-
-	inode->i_blocks = src->i_blocks;
-	i_size_write(inode, src->i_size);
-	btrfs_update_inode(trans, root, inode);
-
-	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-
+	btrfs_release_path(root, path);
+	if (ret == 0) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_blocks = src->i_blocks;
+		btrfs_i_size_write(inode, src->i_size);
+		BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+		ret = btrfs_update_inode(trans, root, inode);
+	}
 	btrfs_end_transaction(trans, root);
-
+	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	if (ret)
+		vmtruncate(inode, 0);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
 	mutex_unlock(&inode->i_mutex);
+	vfree(buf);
+	btrfs_free_path(path);
 out_fput:
 	fput(src_file);
 	return ret;
-- 
cgit v1.2.3


From 7d2b4daa67379960477568abda62b8ba9ee3a8aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 10:13:57 -0400
Subject: Btrfs: Fix the multi-bio code to save the original bio for completion

The multi-bio code is responsible for duplicating blocks in raid1 and
single spindle duplication.  It has counters to make sure all of
the locations for a given extent are properly written before io completion
is returned to the higher layers.

But, it didn't always complete the same bio it was given, sometimes a
clone was completed instead.  This led to problems with the async
work queues because they saved a pointer to the bio in a struct off
bi_private.

The fix is to remember the original bio and only complete that one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 ++++++++++-
 fs/btrfs/volumes.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0de51e37e78..09311b3066d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2070,6 +2070,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
+	int is_orig_bio = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -2078,7 +2079,14 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		atomic_inc(&multi->error);
 
+	if (bio == multi->orig_bio)
+		is_orig_bio = 1;
+
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
+		if (!is_orig_bio) {
+			bio_put(bio);
+			bio = multi->orig_bio;
+		}
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 		/* only send an error to the higher layers if it is
@@ -2101,7 +2109,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #else
 		bio_endio(bio, err);
 #endif
-	} else {
+	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -2196,6 +2204,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	}
 	multi->end_io = first_bio->bi_end_io;
 	multi->private = first_bio->bi_private;
+	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
 	while(dev_nr < total_devs) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 48a44f7a938..c50e50580b5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -95,6 +95,7 @@ struct btrfs_bio_stripe {
 struct btrfs_multi_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
+	struct bio *orig_bio;
 	void *private;
 	atomic_t error;
 	int max_errors;
-- 
cgit v1.2.3


From 00e4e6b33a0f78aab4b788d6d31c884fd8bf88da Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 11:18:09 -0400
Subject: Get rid of BTRFS_I(inode)->index and use local vars instead

rename and link don't always have a lock on the source inode, and
our use of a per-inode index variable was racy.  This changes things to
store the index in a local variable instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  9 --------
 fs/btrfs/inode.c       | 59 +++++++++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 534ac4153dd..111f90524ae 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -57,15 +57,6 @@ struct btrfs_inode {
 	 * number for new files that are created
 	 */
 	u64 index_cnt;
-
-	/*
-	 * index holds the directory index for this inode on creation, so
-	 * add_link can do what its supposed to.  This isn't populated when the
-	 * inode is read because there isn't really a reason to know this unless
-	 * we are creating the directory index or deleting it, and deletion
-	 * reads the index off of the inode reference at unlink time.
-	 */
-	u64 index;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 99121a55ffb..4d8ffc01931 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2153,7 +2153,8 @@ out:
 	return ret;
 }
 
-static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
+static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
+				 u64 *index)
 {
 	int ret = 0;
 
@@ -2163,7 +2164,7 @@ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode)
 			return ret;
 	}
 
-	BTRFS_I(inode)->index = BTRFS_I(dir)->index_cnt;
+	*index = BTRFS_I(dir)->index_cnt;
 	BTRFS_I(dir)->index_cnt++;
 
 	return ret;
@@ -2176,7 +2177,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     u64 ref_objectid,
 				     u64 objectid,
 				     struct btrfs_block_group_cache *group,
-				     int mode)
+				     int mode, u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
@@ -2198,11 +2199,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	if (dir) {
-		ret = btrfs_set_inode_index(dir, inode);
+		ret = btrfs_set_inode_index(dir, inode, index);
 		if (ret)
 			return ERR_PTR(ret);
-	} else {
-		BTRFS_I(inode)->index = 0;
 	}
 	/*
 	 * index_cnt is ignored for everything but a dir,
@@ -2268,7 +2267,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
-	btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->index);
+	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
 	ptr = (unsigned long)(ref + 1);
 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
 
@@ -2296,7 +2295,7 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 
 static int btrfs_add_link(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
-			    int add_backref)
+			    int add_backref, u64 index)
 {
 	int ret;
 	struct btrfs_key key;
@@ -2311,7 +2310,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 				    dentry->d_name.name, dentry->d_name.len,
 				    dentry->d_parent->d_inode->i_ino,
 				    &key, btrfs_inode_type(inode),
-				    BTRFS_I(inode)->index);
+				    index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
@@ -2319,7 +2318,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 					     dentry->d_name.len,
 					     inode->i_ino,
 					     parent_inode->i_ino,
-					     BTRFS_I(inode)->index);
+					     index);
 		}
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
 				   dentry->d_name.len * 2);
@@ -2332,9 +2331,9 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
 
 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
-			    int backref)
+			    int backref, u64 index)
 {
-	int err = btrfs_add_link(trans, dentry, inode, backref);
+	int err = btrfs_add_link(trans, dentry, inode, backref, index);
 	if (!err) {
 		d_instantiate(dentry, inode);
 		return 0;
@@ -2354,6 +2353,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	int drop_inode = 0;
 	u64 objectid;
 	unsigned long nr = 0;
+	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
@@ -2374,7 +2374,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, mode);
+				BTRFS_I(dir)->block_group, mode, &index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -2386,7 +2386,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -2419,6 +2419,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	int drop_inode = 0;
 	unsigned long nr = 0;
 	u64 objectid;
+	u64 index = 0;
 
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
@@ -2435,7 +2436,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino,
-				objectid, BTRFS_I(dir)->block_group, mode);
+				objectid, BTRFS_I(dir)->block_group, mode,
+				&index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -2447,7 +2449,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
@@ -2489,6 +2491,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
+	u64 index;
 	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
@@ -2504,7 +2507,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
-	err = btrfs_set_inode_index(dir, inode);
+	err = btrfs_set_inode_index(dir, inode, &index);
 	if (err)
 		goto fail;
 
@@ -2513,7 +2516,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	btrfs_set_trans_block_group(trans, dir);
 	atomic_inc(&inode->i_count);
 
-	err = btrfs_add_nondir(trans, dentry, inode, 1);
+	err = btrfs_add_nondir(trans, dentry, inode, 1, index);
 
 	if (err)
 		drop_inode = 1;
@@ -2544,6 +2547,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	int err = 0;
 	int drop_on_err = 0;
 	u64 objectid = 0;
+	u64 index = 0;
 	unsigned long nr = 1;
 
 	err = btrfs_check_free_space(root, 1, 0);
@@ -2567,7 +2571,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFDIR | mode);
+				BTRFS_I(dir)->block_group, S_IFDIR | mode,
+				&index);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto out_fail;
@@ -2588,7 +2593,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
-	err = btrfs_add_link(trans, dentry, inode, 0);
+	err = btrfs_add_link(trans, dentry, inode, 0, index);
 	if (err)
 		goto out_fail;
 
@@ -3203,9 +3208,10 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_block_group_cache *block_group)
 {
 	struct inode *inode;
+	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, block_group, S_IFDIR | 0700);
+				new_dirid, block_group, S_IFDIR | 0700, &index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
@@ -3384,6 +3390,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
+	u64 index = 0;
 	int ret;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -3419,11 +3426,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 				goto out_fail;
 		}
 	}
-	ret = btrfs_set_inode_index(new_dir, old_inode);
+	ret = btrfs_set_inode_index(new_dir, old_inode, &index);
 	if (ret)
 		goto out_fail;
 
-	ret = btrfs_add_link(trans, new_dentry, old_inode, 1);
+	ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index);
 	if (ret)
 		goto out_fail;
 
@@ -3464,6 +3471,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
+	u64 index = 0 ;
 	int name_len;
 	int datasize;
 	unsigned long ptr;
@@ -3491,7 +3499,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
 				dentry->d_name.len,
 				dentry->d_parent->d_inode->i_ino, objectid,
-				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO);
+				BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+				&index);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
 		goto out_unlock;
@@ -3503,7 +3512,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	btrfs_set_trans_block_group(trans, inode);
-	err = btrfs_add_nondir(trans, dentry, inode, 0);
+	err = btrfs_add_nondir(trans, dentry, inode, 0, index);
 	if (err)
 		drop_inode = 1;
 	else {
-- 
cgit v1.2.3


From 7ea394f1192bee1af67ea4762c88ef4b7b0487a8 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 5 Aug 2008 13:05:02 -0400
Subject: Btrfs: Fix nodatacow for the new data=ordered mode

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h        |  3 ++-
 fs/btrfs/extent-tree.c  | 11 +++++----
 fs/btrfs/inode.c        | 60 +++++++++++++++++++++++++++++++------------------
 fs/btrfs/ioctl.c        |  1 +
 fs/btrfs/ordered-data.c | 16 ++++++++++---
 fs/btrfs/ordered-data.h |  6 +++--
 fs/btrfs/transaction.c  | 11 +++++++++
 7 files changed, 74 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 116aee21bf7..f90e5a7ac16 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1403,7 +1403,8 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
-int btrfs_cross_ref_exists(struct btrfs_root *root,
+int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
 			   struct btrfs_key *key, u64 bytenr);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33cb2ac4cb2..fff219ed61d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -893,10 +893,10 @@ out:
 	return ret;
 }
 
-int btrfs_cross_ref_exists(struct btrfs_root *root,
+int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
 			   struct btrfs_key *key, u64 bytenr)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *old_root;
 	struct btrfs_path *path = NULL;
 	struct extent_buffer *eb;
@@ -908,6 +908,7 @@ int btrfs_cross_ref_exists(struct btrfs_root *root,
 	int level;
 	int ret;
 
+	BUG_ON(trans == NULL);
 	BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
 	ret = get_reference_status(root, bytenr, 0, key->objectid,
 				   &min_generation, &ref_count);
@@ -917,7 +918,6 @@ int btrfs_cross_ref_exists(struct btrfs_root *root,
 	if (ref_count != 1)
 		return 1;
 
-	trans = btrfs_start_transaction(root, 0);
 	old_root = root->dirty_root->root;
 	ref_generation = old_root->root_key.offset;
 
@@ -973,7 +973,6 @@ int btrfs_cross_ref_exists(struct btrfs_root *root,
 out:
 	if (path)
 		btrfs_free_path(path);
-	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
@@ -3320,7 +3319,7 @@ again:
 	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	btrfs_start_delalloc_inodes(root);
-	btrfs_wait_ordered_extents(tree_root);
+	btrfs_wait_ordered_extents(tree_root, 0);
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 
@@ -3407,7 +3406,7 @@ next:
 		btrfs_clean_old_snapshots(tree_root);
 
 		btrfs_start_delalloc_inodes(root);
-		btrfs_wait_ordered_extents(tree_root);
+		btrfs_wait_ordered_extents(tree_root, 0);
 
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4d8ffc01931..c33053ba381 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -166,7 +166,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ins.offset);
+					       ins.offset, 0);
 		BUG_ON(ret);
 		if (num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
@@ -187,31 +187,32 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 	u64 extent_start;
 	u64 extent_end;
 	u64 bytenr;
-	u64 cow_end;
 	u64 loops = 0;
 	u64 total_fs_bytes;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
 	int found_type;
 	struct btrfs_path *path;
 	struct btrfs_file_extent_item *item;
 	int ret;
-	int err;
+	int err = 0;
 	struct btrfs_key found_key;
 
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
 again:
 	ret = btrfs_lookup_file_extent(NULL, root, path,
 				       inode->i_ino, start, 0);
 	if (ret < 0) {
-		btrfs_free_path(path);
-		return ret;
+		err = ret;
+		goto out;
 	}
 
-	cow_end = end;
 	if (ret != 0) {
 		if (path->slots[0] == 0)
 			goto not_found;
@@ -244,12 +245,11 @@ again:
 		if (start < extent_start || start >= extent_end)
 			goto not_found;
 
-		cow_end = min(end, extent_end - 1);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0)
 			goto not_found;
 
-		if (btrfs_cross_ref_exists(root, &found_key, bytenr))
+		if (btrfs_cross_ref_exists(trans, root, &found_key, bytenr))
 			goto not_found;
 		/*
 		 * we may be called by the resizer, make sure we're inside
@@ -260,24 +260,32 @@ again:
 		if (!block_group || block_group->ro)
 			goto not_found;
 
+		bytenr += btrfs_file_extent_offset(leaf, item);
+		extent_num_bytes = min(end + 1, extent_end) - start;
+		ret = btrfs_add_ordered_extent(inode, start, bytenr,
+						extent_num_bytes, 1);
+		if (ret) {
+			err = ret;
+			goto out;
+		}
+
+		btrfs_release_path(root, path);
 		start = extent_end;
+		if (start <= end) {
+			loops++;
+			goto again;
+		}
 	} else {
-		goto not_found;
-	}
-loop:
-	if (start > end) {
+not_found:
+		btrfs_end_transaction(trans, root);
 		btrfs_free_path(path);
-		return 0;
+		return cow_file_range(inode, start, end);
 	}
-	btrfs_release_path(root, path);
-	loops++;
-	goto again;
-
-not_found:
-	btrfs_release_path(root, path);
-	cow_file_range(inode, start, end);
-	start = end + 1;
-	goto loop;
+out:
+	WARN_ON(err);
+	btrfs_end_transaction(trans, root);
+	btrfs_free_path(path);
+	return err;
 }
 
 static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
@@ -385,6 +393,11 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		goto mapit;
 	}
 
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM)) {
+		goto mapit;
+	}
+
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   __btrfs_submit_bio_hook);
@@ -527,6 +540,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
+	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
+		goto nocow;
 
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
@@ -567,6 +582,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
+nocow:
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3932c7cd0fa..59b64c738fd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -36,6 +36,7 @@
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
 #include <linux/xattr.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index db200e6baf7..da6d43eb41d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -152,7 +152,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len)
+			     u64 start, u64 len, int nocow)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -168,6 +168,8 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->start = start;
 	entry->len = len;
 	entry->inode = inode;
+	if (nocow)
+		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
 
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
@@ -303,10 +305,11 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	return 0;
 }
 
-int btrfs_wait_ordered_extents(struct btrfs_root *root)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 {
 	struct list_head splice;
 	struct list_head *cur;
+	struct list_head *tmp;
 	struct btrfs_ordered_extent *ordered;
 	struct inode *inode;
 
@@ -314,10 +317,16 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root)
 
 	spin_lock(&root->fs_info->ordered_extent_lock);
 	list_splice_init(&root->fs_info->ordered_extents, &splice);
-	while(!list_empty(&splice)) {
+	list_for_each_safe(cur, tmp, &splice) {
 		cur = splice.next;
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
+		if (nocow_only &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+			cond_resched_lock(&root->fs_info->ordered_extent_lock);
+			continue;
+		}
+
 		list_del_init(&ordered->root_extent_list);
 		atomic_inc(&ordered->refs);
 		inode = ordered->inode;
@@ -338,6 +347,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root)
 
 		spin_lock(&root->fs_info->ordered_extent_lock);
 	}
+	list_splice_init(&splice, &root->fs_info->ordered_extents);
 	spin_unlock(&root->fs_info->ordered_extent_lock);
 	return 0;
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 5efe6b63c74..fd45519f30a 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -64,6 +64,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
 
+#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -125,7 +127,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len);
+			     u64 start, u64 len, int nocow);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
@@ -143,5 +145,5 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
 			   loff_t end, int sync_mode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index ebf5362da1d..9d3d08e9f8d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -438,6 +438,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
+				root->dirty_root = NULL;
 
 				spin_lock(&root->list_lock);
 				list_del_init(&dirty->root->dead_list);
@@ -461,6 +462,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			       sizeof(struct btrfs_disk_key));
 			root->root_item.drop_level = 0;
 			root->commit_root = NULL;
+			root->dirty_root = NULL;
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_bytenr(&root->root_item,
 					      root->node->start);
@@ -762,7 +764,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	}
 
 	do {
+		int snap_pending = 0;
 		joined = cur_trans->num_joined;
+		if (!list_empty(&trans->transaction->pending_snapshots))
+			snap_pending = 1;
+
 		WARN_ON(cur_trans != trans->transaction);
 		prepare_to_wait(&cur_trans->writer_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
@@ -774,6 +780,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 		mutex_unlock(&root->fs_info->trans_mutex);
 
+		if (snap_pending) {
+			ret = btrfs_wait_ordered_extents(root, 1);
+			BUG_ON(ret);
+		}
+
 		schedule_timeout(timeout);
 
 		mutex_lock(&root->fs_info->trans_mutex);
-- 
cgit v1.2.3


From 4ca8b41e3fe76cc5774fbcc6cba64259d69e0f96 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 13:30:48 -0400
Subject: Btrfs: Avoid calling into the FS for the final iput on fake root
 inodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 fs/btrfs/inode.c   | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 76543683f3b..56c54a41dbb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1739,6 +1739,7 @@ int close_ctree(struct btrfs_root *root)
 		free_extent_buffer(root->fs_info->dev_root->node);
 
 	btrfs_free_block_groups(root->fs_info);
+	fs_info->closing = 2;
 	del_fs_roots(fs_info);
 
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c33053ba381..a26d365111e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2092,6 +2092,9 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
+	if (root->fs_info->closing > 1)
+		return 0;
+
 	if (wait) {
 		trans = btrfs_join_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
-- 
cgit v1.2.3


From 5707e3b6f77e37bb8ecc7ba9d74c8bfc38011b6d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 4 Aug 2008 15:45:30 -0400
Subject: Btrfs: Update version.sh to v0.16

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/version.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
index 4d06497d2ec..0f57f24404d 100644
--- a/fs/btrfs/version.sh
+++ b/fs/btrfs/version.sh
@@ -6,7 +6,7 @@
 # Copyright 2008, Oracle
 # Released under the GNU GPLv2
  
-v="v0.15"
+v="v0.16"
 
 which hg > /dev/null
 if [ -d .hg ] && [ $? == 0 ]; then
-- 
cgit v1.2.3


From 5036f53868ae943704ae69a192d21225dc914c35 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 7 Aug 2008 11:19:42 -0400
Subject: Btrfs: fix RHEL test for ClearPageFsMisc

Newer RHEL5 kernels define both ClearPageFsMisc and
ClearPageChecked, so test for both before redefining.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f90e5a7ac16..c88f1e16ce2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1648,7 +1648,7 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 /* inode.c */
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
-#ifdef ClearPageFsMisc
+#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
 #define ClearPageChecked ClearPageFsMisc
 #define SetPageChecked SetPageFsMisc
 #define PageChecked PageFsMisc
-- 
cgit v1.2.3


From 2db04966ae9a3eeb57f28df9aac4e77d5b287cb1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Thu, 7 Aug 2008 11:19:43 -0400
Subject: Btrfs: Change TestSetPageLocked() to trylock_page()

Add backwards compatibility in compat.h

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 compat.h    |    3 +++
 extent_io.c |    3 ++-
 2 files changed, 5 insertions(+), 1 deletions(-)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h    | 3 +++
 fs/btrfs/extent_io.c | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index d39a768e65f..b3349a66999 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -1,6 +1,9 @@
 #ifndef _COMPAT_H_
 #define _COMPAT_H_
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,26)
+#define trylock_page(page) (!TestSetPageLocked(page))
+#endif
 
 /*
  * Even if AppArmor isn't enabled, it still has different prototypes.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 1cf4baba434..f46f88620c7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -14,6 +14,7 @@
 #include <linux/pagevec.h>
 #include "extent_io.h"
 #include "extent_map.h"
+#include "compat.h"
 
 /* temporary define until extent_map moves out of btrfs */
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
@@ -3055,7 +3056,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (!wait) {
-			if (TestSetPageLocked(page))
+			if (!trylock_page(page))
 				goto unlock_exit;
 		} else {
 			lock_page(page);
-- 
cgit v1.2.3


From db69e0ebae944690de89851315404f483e6464e0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:14 -0400
Subject: Btrfs: Init address_space->writeback_index properly

The writeback_index field is used by write_cache_pages to pick up where
writeback on a given inode left off.  But, it is never set to a sane
value, so writeback can often start at a random offset in the file.

Kernels 2.6.28 and higher will have this fixed, but for everyone else,
we also fill in the value in btrfs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a26d365111e..33b990878d8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1833,6 +1833,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	inode->i_mapping->writeback_index = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->index_cnt = (u64)-1;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
@@ -2239,6 +2240,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	BTRFS_I(inode)->delalloc_bytes = 0;
+	inode->i_mapping->writeback_index = 0;
 	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -2486,6 +2488,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
+		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
@@ -3549,6 +3552,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		mutex_init(&BTRFS_I(inode)->extent_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
 		BTRFS_I(inode)->disk_i_size = 0;
+		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
-- 
cgit v1.2.3


From b720d209524fccdd33a499351f2ca880b8c0ffb4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:14 -0400
Subject: Btrfs: Limit the number of async bio submission kthreads to the
 number of devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 56c54a41dbb..99bd9f9b9ee 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1348,7 +1348,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * cannot dynamically grow.
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
-	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->submit_workers,
+			   min_t(u64, fs_devices->num_devices,
+			   fs_info->thread_pool_size));
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
-- 
cgit v1.2.3


From 0986fe9eac24fd186927c3b87af51d62f8ab92cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:15 -0400
Subject: Btrfs: Count async bios separately from async checksum work items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 25 ++++++++++++++++++++++---
 fs/btrfs/volumes.c |  6 +++---
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c88f1e16ce2..bb4a8d2200d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -544,6 +544,7 @@ struct btrfs_fs_info {
 	struct list_head hashers;
 	struct list_head dead_roots;
 	atomic_t nr_async_submits;
+	atomic_t nr_async_bios;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 99bd9f9b9ee..9902d29abd0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,6 +429,21 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static int congested_async(struct btrfs_fs_info *info, int iodone)
+{
+	int limit = 256 * info->fs_devices->open_devices;
+
+	if (iodone)
+		limit = (limit * 3) / 2;
+	if (atomic_read(&info->nr_async_submits) > limit)
+		return 1;
+
+	limit = 8192 * info->fs_devices->open_devices;
+	if (iodone)
+		limit = (limit * 3) / 2;
+	return atomic_read(&info->nr_async_bios) > limit;
+}
+
 static void run_one_async_submit(struct btrfs_work *work)
 {
 	struct btrfs_fs_info *fs_info;
@@ -437,6 +452,11 @@ static void run_one_async_submit(struct btrfs_work *work)
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	atomic_dec(&fs_info->nr_async_submits);
+
+	if ((async->bio->bi_rw & (1 << BIO_RW)) &&
+	    !congested_async(fs_info, 1)) {
+		clear_bdi_congested(&fs_info->bdi, WRITE);
+	}
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
 			       async->mirror_num);
 	kfree(async);
@@ -938,15 +958,13 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
-	int limit = 256 * info->fs_devices->open_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 
 	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    atomic_read(&info->nr_async_submits) > limit) {
+	    congested_async(info, 0))
 		return 1;
-	}
 
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1250,6 +1268,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
 	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 09311b3066d..23a5b0aba00 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -179,7 +179,7 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2145,12 +2145,12 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	}
 
 	/*
-	 * nr_async_sumbits allows us to reliably return congestion to the
+	 * nr_async_bios allows us to reliably return congestion to the
 	 * higher layers.  Otherwise, the async bio makes it appear we have
 	 * made progress against dirty pages when we've really just put it
 	 * on a queue for later
 	 */
-	atomic_inc(&root->fs_info->nr_async_submits);
+	atomic_inc(&root->fs_info->nr_async_bios);
 	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
-- 
cgit v1.2.3


From 777e6bd706ee40897545463871de5b456fbc46dc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:15 -0400
Subject: Btrfs: Transaction commit: don't use filemap_fdatawait

After writing out all the remaining btree blocks in the transaction,
the commit code would use filemap_fdatawait to make sure it was all
on disk.  This means it would wait for blocks written by other procs
as well.

The new code walks the list of blocks for this transaction again
and waits only for those required by this transaction.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 11 ++++-------
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/transaction.c | 36 +++++++++++++++++++++++++++++++-----
 3 files changed, 36 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9902d29abd0..9601b13c7d7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,7 +429,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static int congested_async(struct btrfs_fs_info *info, int iodone)
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 {
 	int limit = 256 * info->fs_devices->open_devices;
 
@@ -438,9 +438,6 @@ static int congested_async(struct btrfs_fs_info *info, int iodone)
 	if (atomic_read(&info->nr_async_submits) > limit)
 		return 1;
 
-	limit = 8192 * info->fs_devices->open_devices;
-	if (iodone)
-		limit = (limit * 3) / 2;
 	return atomic_read(&info->nr_async_bios) > limit;
 }
 
@@ -454,7 +451,7 @@ static void run_one_async_submit(struct btrfs_work *work)
 	atomic_dec(&fs_info->nr_async_submits);
 
 	if ((async->bio->bi_rw & (1 << BIO_RW)) &&
-	    !congested_async(fs_info, 1)) {
+	    !btrfs_congested_async(fs_info, 1)) {
 		clear_bdi_congested(&fs_info->bdi, WRITE);
 	}
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
@@ -963,7 +960,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	struct backing_dev_info *bdi;
 
 	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    congested_async(info, 0))
+	    btrfs_congested_async(info, 0))
 		return 1;
 
 	list_for_each(cur, &info->fs_devices->devices) {
@@ -1844,7 +1841,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 16 * 1024 * 1024;
+	unsigned long thresh = 2 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current_is_pdflush())
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 353c3c50c95..e904a69347a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,4 +72,5 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook);
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9d3d08e9f8d..6bcb0876f9b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -303,12 +303,12 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
 	int ret;
-	int err;
+	int err = 0;
 	int werr = 0;
 	struct extent_io_tree *dirty_pages;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	u64 start;
+	u64 start = 0;
 	u64 end;
 	unsigned long index;
 
@@ -317,12 +317,15 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	}
 	dirty_pages = &trans->transaction->dirty_pages;
 	while(1) {
-		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
 		while(start <= end) {
+			if (btrfs_congested_async(root->fs_info, 0))
+				congestion_wait(WRITE, HZ/10);
+			cond_resched();
+
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 			page = find_lock_page(btree_inode->i_mapping, index);
@@ -343,7 +346,30 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 			page_cache_release(page);
 		}
 	}
-	err = filemap_fdatawait(btree_inode->i_mapping);
+	while(1) {
+		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+		while(start <= end) {
+			index = start >> PAGE_CACHE_SHIFT;
+			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+			page = find_get_page(btree_inode->i_mapping, index);
+			if (!page)
+				continue;
+			if (PageDirty(page)) {
+				lock_page(page);
+				err = write_one_page(page, 0);
+				if (err)
+					werr = err;
+			}
+			wait_on_page_writeback(page);
+			page_cache_release(page);
+			cond_resched();
+		}
+	}
 	if (err)
 		werr = err;
 	return werr;
-- 
cgit v1.2.3


From 5443be45f5cb57d02fd895a0bcaf7e7d9890b1df Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:16 -0400
Subject: Btrfs: Give all the worker threads descriptive names

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |  7 +++++--
 fs/btrfs/async-thread.h |  5 ++++-
 fs/btrfs/disk-io.c      | 22 +++++++++++++++-------
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index bc2980c433e..5f2f5a8c228 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -153,7 +153,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 {
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
@@ -161,6 +161,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, int max)
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
+	workers->name = name;
 }
 
 /*
@@ -184,7 +185,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
-		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		worker->task = kthread_run(worker_loop, worker,
+					   "btrfs-%s-%d", workers->name,
+					   workers->num_workers + i);
 		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			kfree(worker);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3436ff89759..43e44d115dd 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -69,11 +69,14 @@ struct btrfs_workers {
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
+
+	/* extra name for this worker */
+	char *name;
 };
 
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9601b13c7d7..1bf210dadef 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -492,11 +492,11 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	/*
 	 * when we're called for a write, we're already in the async
-	 * submission context.  Just jump ingo btrfs_map_bio
+	 * submission context.  Just jump into btrfs_map_bio
 	 */
 	if (rw & (1 << BIO_RW)) {
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-				     mirror_num, 0);
+				     mirror_num, 1);
 	}
 
 	/*
@@ -528,6 +528,12 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+	if (current->flags & PF_MEMALLOC) {
+		redirty_page_for_writepage(wbc, page);
+		unlock_page(page);
+		return 0;
+	}
 	return extent_write_full_page(tree, page, btree_get_extent, wbc);
 }
 
@@ -1363,8 +1369,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * queue work function gets called at interrupt time, and so it
 	 * cannot dynamically grow.
 	 */
-	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
-	btrfs_init_workers(&fs_info->submit_workers,
+	btrfs_init_workers(&fs_info->workers, "worker",
+			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
 
@@ -1374,9 +1381,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	fs_info->submit_workers.idle_thresh = 64;
 
-	btrfs_init_workers(&fs_info->fixup_workers, 1);
-	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
-	btrfs_init_workers(&fs_info->endio_write_workers,
+	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+	btrfs_init_workers(&fs_info->endio_workers, "endio",
+			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size);
 
 	/*
-- 
cgit v1.2.3


From 4854ddd0ed0a687fc2d7c45a529c406232e31e7b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:17 -0400
Subject: Btrfs: Wait for kernel threads to make progress during async
 submission

Before this change, btrfs would use a bdi congestion function to make
sure there weren't too many pending async checksum work items.

This change makes the process creating async work items wait instead,
leading to fewer congestion returns from the bdi.  This improves
pdflush background_writeout scanning.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |  7 ++++++-
 fs/btrfs/ctree.h        |  1 +
 fs/btrfs/disk-io.c      | 45 ++++++++++++++++++++++++++-------------------
 3 files changed, 33 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5f2f5a8c228..958cd8b5f0d 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,7 @@ struct btrfs_worker_thread {
 
 	/* number of things on the pending list */
 	atomic_t num_pending;
+	unsigned long sequence;
 
 	/* protects the pending list. */
 	spinlock_t lock;
@@ -197,6 +198,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 
 		spin_lock_irq(&workers->lock);
 		list_add_tail(&worker->worker_list, &workers->idle_list);
+		worker->idle = 1;
 		workers->num_workers++;
 		spin_unlock_irq(&workers->lock);
 	}
@@ -238,7 +240,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
-	list_move_tail(next, &workers->worker_list);
+	atomic_inc(&worker->num_pending);
+	worker->sequence++;
+	if (worker->sequence % 4 == 0)
+		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bb4a8d2200d..04021335939 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -526,6 +526,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
+	wait_queue_head_t async_submit_wait;
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf210dadef..1aed1f4616b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,31 +429,36 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+static unsigned long async_submit_limit(struct btrfs_fs_info *info)
 {
-	int limit = 256 * info->fs_devices->open_devices;
-
-	if (iodone)
-		limit = (limit * 3) / 2;
-	if (atomic_read(&info->nr_async_submits) > limit)
-		return 1;
+	unsigned long limit = min_t(unsigned long,
+				    info->workers.max_workers,
+				    info->fs_devices->open_devices);
+	return 256 * limit;
+}
 
-	return atomic_read(&info->nr_async_bios) > limit;
+int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
+{
+	return atomic_read(&info->nr_async_bios) > async_submit_limit(info);
 }
 
 static void run_one_async_submit(struct btrfs_work *work)
 {
 	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
+	int limit;
 
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
+
+	limit = async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
 	atomic_dec(&fs_info->nr_async_submits);
 
-	if ((async->bio->bi_rw & (1 << BIO_RW)) &&
-	    !btrfs_congested_async(fs_info, 1)) {
-		clear_bdi_congested(&fs_info->bdi, WRITE);
-	}
+	if (atomic_read(&fs_info->nr_async_submits) < limit)
+		wake_up(&fs_info->async_submit_wait);
+
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
 			       async->mirror_num);
 	kfree(async);
@@ -464,6 +469,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
+	int limit = async_submit_limit(fs_info);
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
@@ -478,6 +484,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->work.flags = 0;
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
+
+	wait_event_timeout(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) < limit),
+			   HZ/10);
 	return 0;
 }
 
@@ -545,16 +555,11 @@ static int btree_writepages(struct address_space *mapping,
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
-		unsigned long thresh = 96 * 1024 * 1024;
+		unsigned long thresh = 8 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
-		if (current_is_pdflush()) {
-			thresh = 96 * 1024 * 1024;
-		} else {
-			thresh = 8 * 1024 * 1024;
-		}
 		num_dirty = count_range_bits(tree, &start, (u64)-1,
 					     thresh, EXTENT_DIRTY);
 		if (num_dirty < thresh) {
@@ -1333,6 +1338,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->volume_mutex);
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
+	init_waitqueue_head(&fs_info->async_submit_wait);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1380,6 +1386,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * devices
 	 */
 	fs_info->submit_workers.idle_thresh = 64;
+	fs_info->workers.idle_thresh = 32;
 
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
@@ -1849,7 +1856,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 2 * 1024 * 1024;
+	unsigned long thresh = 12 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current_is_pdflush())
-- 
cgit v1.2.3


From 53863232ef961778aa414b700ed88a48e8e871e6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:18 -0400
Subject: Btrfs: Lower contention on the csum mutex

This takes the csum mutex deeper in the call chain and releases it
more often.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |  3 ++-
 fs/btrfs/disk-io.c      |  9 ++++++++-
 fs/btrfs/file-item.c    | 16 ++++++++++++----
 fs/btrfs/inode.c        |  2 --
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 958cd8b5f0d..2ee30174019 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,7 @@ struct btrfs_worker_thread {
 
 	/* number of things on the pending list */
 	atomic_t num_pending;
+
 	unsigned long sequence;
 
 	/* protects the pending list. */
@@ -242,7 +243,7 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
 	atomic_inc(&worker->num_pending);
 	worker->sequence++;
-	if (worker->sequence % 4 == 0)
+	if (worker->sequence % workers->idle_thresh == 0)
 		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1aed1f4616b..92e14dd9bdd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1386,7 +1386,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * devices
 	 */
 	fs_info->submit_workers.idle_thresh = 64;
-	fs_info->workers.idle_thresh = 32;
+
+	/* fs_info->workers is responsible for checksumming file data
+	 * blocks and metadata.  Using a larger idle thresh allows each
+	 * worker thread to operate on things in roughly the order they
+	 * were sent by the writeback daemons, improving overall locality
+	 * of the IO going down the pipe.
+	 */
+	fs_info->workers.idle_thresh = 128;
 
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 51aba8cee7c..9454e0a07c8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -321,6 +321,7 @@ again:
 	file_key.offset = offset;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
 
+	mutex_lock(&BTRFS_I(inode)->csum_mutex);
 	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
 	if (!IS_ERR(item)) {
 		leaf = path->nodes[0];
@@ -367,7 +368,7 @@ again:
 	ret = btrfs_search_slot(trans, root, &file_key, path,
 				BTRFS_CRC32_SIZE, 1);
 	if (ret < 0)
-		goto fail;
+		goto fail_unlock;
 	if (ret == 0) {
 		BUG();
 	}
@@ -411,10 +412,10 @@ insert:
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
 	if (ret < 0)
-		goto fail;
+		goto fail_unlock;
 	if (ret != 0) {
 		WARN_ON(1);
-		goto fail;
+		goto fail_unlock;
 	}
 csum:
 	leaf = path->nodes[0];
@@ -427,6 +428,8 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
+	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+	cond_resched();
 next_sector:
 
 	if (!eb_token ||
@@ -467,13 +470,18 @@ next_sector:
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
+	cond_resched();
 	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
 		goto again;
 	}
-fail:
+out:
 	btrfs_free_path(path);
 	return ret;
+
+fail_unlock:
+	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+	goto out;
 }
 
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 33b990878d8..65107894a5b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -415,10 +415,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	list_for_each(cur, list) {
 		sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		mutex_lock(&BTRFS_I(inode)->csum_mutex);
 		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
 				       inode, sum);
-		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 902b22f341efa00be802418a0a8c57bddcd269a6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 20 Aug 2008 08:51:49 -0400
Subject: Btrfs: Remove broken optimisations in end_bio functions.

These ended up freeing objects while they were still using them. Under
guidance from Chris, just rip out the 'clever' bits and do things the
simple way.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 159 +++++++--------------------------------------------
 1 file changed, 21 insertions(+), 138 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f46f88620c7..83ba0c32872 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -15,6 +15,8 @@
 #include "extent_io.h"
 #include "extent_map.h"
 #include "compat.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
 
 /* temporary define until extent_map moves out of btrfs */
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
@@ -1394,15 +1396,11 @@ static int end_bio_extent_writepage(struct bio *bio,
 {
 	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_state *state = bio->bi_private;
-	struct extent_io_tree *tree = state->tree;
-	struct rb_node *node;
+	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	u64 cur;
 	int whole_page;
 	int ret;
-	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -1410,6 +1408,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 #endif
 	do {
 		struct page *page = bvec->bv_page;
+		tree = &BTRFS_I(page->mapping->host)->io_tree;
+
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
 			 bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
@@ -1423,7 +1423,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 			prefetchw(&bvec->bv_page->flags);
 		if (tree->ops && tree->ops->writepage_end_io_hook) {
 			ret = tree->ops->writepage_end_io_hook(page, start,
-						       end, state, uptodate);
+						       end, NULL, uptodate);
 			if (ret)
 				uptodate = 0;
 		}
@@ -1431,9 +1431,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 		if (!uptodate && tree->ops &&
 		    tree->ops->writepage_io_failed_hook) {
 			ret = tree->ops->writepage_io_failed_hook(bio, page,
-							 start, end, state);
+							 start, end, NULL);
 			if (ret == 0) {
-				state = NULL;
 				uptodate = (err == 0);
 				continue;
 			}
@@ -1445,68 +1444,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 			SetPageError(page);
 		}
 
-		/*
-		 * bios can get merged in funny ways, and so we need to
-		 * be careful with the state variable.  We know the
-		 * state won't be merged with others because it has
-		 * WRITEBACK set, but we can't be sure each biovec is
-		 * sequential in the file.  So, if our cached state
-		 * doesn't match the expected end, search the tree
-		 * for the correct one.
-		 */
-
-		spin_lock_irqsave(&tree->lock, flags);
-		if (!state || state->end != end) {
-			state = NULL;
-			node = __etree_search(tree, start, NULL, NULL);
-			if (node) {
-				state = rb_entry(node, struct extent_state,
-						 rb_node);
-				if (state->end != end ||
-				    !(state->state & EXTENT_WRITEBACK))
-					state = NULL;
-			}
-			if (!state) {
-				spin_unlock_irqrestore(&tree->lock, flags);
-				clear_extent_writeback(tree, start,
-						       end, GFP_ATOMIC);
-				goto next_io;
-			}
-		}
-		cur = end;
-		while(1) {
-			struct extent_state *clear = state;
-			cur = state->start;
-			node = rb_prev(&state->rb_node);
-			if (node) {
-				state = rb_entry(node,
-						 struct extent_state,
-						 rb_node);
-			} else {
-				state = NULL;
-			}
-
-			clear_state_bit(tree, clear, EXTENT_WRITEBACK,
-					1, 0);
-			if (cur == start)
-				break;
-			if (cur < start) {
-				WARN_ON(1);
-				break;
-			}
-			if (!node)
-				break;
-		}
-		/* before releasing the lock, make sure the next state
-		 * variable has the expected bits set and corresponds
-		 * to the correct offsets in the file
-		 */
-		if (state && (state->end + 1 != start ||
-		    !(state->state & EXTENT_WRITEBACK))) {
-			state = NULL;
-		}
-		spin_unlock_irqrestore(&tree->lock, flags);
-next_io:
+		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
 
 		if (whole_page)
 			end_page_writeback(page);
@@ -1539,13 +1477,9 @@ static int end_bio_extent_readpage(struct bio *bio,
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_state *state = bio->bi_private;
-	struct extent_io_tree *tree = state->tree;
-	struct rb_node *node;
+	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	u64 cur;
-	unsigned long flags;
 	int whole_page;
 	int ret;
 
@@ -1556,6 +1490,8 @@ static int end_bio_extent_readpage(struct bio *bio,
 
 	do {
 		struct page *page = bvec->bv_page;
+		tree = &BTRFS_I(page->mapping->host)->io_tree;
+
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
 			bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
@@ -1570,80 +1506,26 @@ static int end_bio_extent_readpage(struct bio *bio,
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state);
+							      NULL);
 			if (ret)
 				uptodate = 0;
 		}
 		if (!uptodate && tree->ops &&
 		    tree->ops->readpage_io_failed_hook) {
 			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, state);
+							 start, end, NULL);
 			if (ret == 0) {
-				state = NULL;
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				continue;
 			}
 		}
 
-		spin_lock_irqsave(&tree->lock, flags);
-		if (!state || state->end != end) {
-			state = NULL;
-			node = __etree_search(tree, start, NULL, NULL);
-			if (node) {
-				state = rb_entry(node, struct extent_state,
-						 rb_node);
-				if (state->end != end ||
-				    !(state->state & EXTENT_LOCKED))
-					state = NULL;
-			}
-			if (!state) {
-				spin_unlock_irqrestore(&tree->lock, flags);
-				if (uptodate)
-					set_extent_uptodate(tree, start, end,
-							    GFP_ATOMIC);
-				unlock_extent(tree, start, end, GFP_ATOMIC);
-				goto next_io;
-			}
-		}
+		if (uptodate)
+			set_extent_uptodate(tree, start, end,
+					    GFP_ATOMIC);
+		unlock_extent(tree, start, end, GFP_ATOMIC);
 
-		cur = end;
-		while(1) {
-			struct extent_state *clear = state;
-			cur = state->start;
-			node = rb_prev(&state->rb_node);
-			if (node) {
-				state = rb_entry(node,
-					 struct extent_state,
-					 rb_node);
-			} else {
-				state = NULL;
-			}
-			if (uptodate) {
-				set_state_cb(tree, clear, EXTENT_UPTODATE);
-				clear->state |= EXTENT_UPTODATE;
-			}
-			clear_state_bit(tree, clear, EXTENT_LOCKED,
-					1, 0);
-			if (cur == start)
-				break;
-			if (cur < start) {
-				WARN_ON(1);
-				break;
-			}
-			if (!node)
-				break;
-		}
-		/* before releasing the lock, make sure the next state
-		 * variable has the expected bits set and corresponds
-		 * to the correct offsets in the file
-		 */
-		if (state && (state->end + 1 != start ||
-		    !(state->state & EXTENT_LOCKED))) {
-			state = NULL;
-		}
-		spin_unlock_irqrestore(&tree->lock, flags);
-next_io:
 		if (whole_page) {
 			if (uptodate) {
 				SetPageUptodate(page);
@@ -1683,8 +1565,7 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct extent_state *state = bio->bi_private;
-	struct extent_io_tree *tree = state->tree;
+	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
 
@@ -1695,6 +1576,8 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 
 	do {
 		struct page *page = bvec->bv_page;
+		tree = &BTRFS_I(page->mapping->host)->io_tree;
+
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
 			bvec->bv_offset;
 		end = start + bvec->bv_len - 1;
@@ -1765,7 +1648,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 	BUG_ON(state->end != end);
 	spin_unlock_irq(&tree->lock);
 
-	bio->bi_private = state;
+	bio->bi_private = NULL;
 
 	bio_get(bio);
 
-- 
cgit v1.2.3


From 7c2fe32a238eb12422beca5cbd5194a594baa559 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Aug 2008 08:51:50 -0400
Subject: Btrfs: Fix add_extent_mapping to check for duplicates across the
 whole range

add_extent_mapping was allowing the insertion of overlapping extents.
This never used to happen because it only inserted the extents from disk
and those were never overlapping.

But, with the data=ordered code, the disk and memory representations of the
file are not the same.  add_extent_mapping needs to ensure a new extent
does not overlap before it inserts.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_map.c |  7 +++++++
 fs/btrfs/inode.c      | 21 +++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 954b047639a..78ced11d18c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -207,7 +207,14 @@ int add_extent_mapping(struct extent_map_tree *tree,
 	int ret = 0;
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
+	struct extent_map *exist;
 
+	exist = lookup_extent_mapping(tree, em->start, em->len);
+	if (exist) {
+		free_extent_map(exist);
+		ret = -EEXIST;
+		goto out;
+	}
 	assert_spin_locked(&tree->lock);
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65107894a5b..6c778043207 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -641,8 +641,9 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
 		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-		       start);
+		if (printk_ratelimit())
+			printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+			       start);
 		goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -1653,8 +1654,20 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
 		hole_size = block_end - hole_start;
-		btrfs_wait_ordered_range(inode, hole_start, hole_size);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		while(1) {
+			struct btrfs_ordered_extent *ordered;
+			btrfs_wait_ordered_range(inode, hole_start, hole_size);
+
+			lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+			ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+			if (ordered) {
+				unlock_extent(io_tree, hole_start,
+					      block_end - 1, GFP_NOFS);
+				btrfs_put_ordered_extent(ordered);
+			} else {
+				break;
+			}
+		}
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
-- 
cgit v1.2.3


From 4d1b5fb4d7075f862848dbff8873e22382abd482 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Aug 2008 09:44:52 -0400
Subject: Btrfs: Lookup readpage checksums on bio submission again

This optimization had been removed because I thought it was triggering
csum errors.  The real cause of the errors was elsewhere, and so
this optimization is back.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c |  4 ++--
 fs/btrfs/inode.c     | 62 ++++------------------------------------------------
 2 files changed, 6 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9454e0a07c8..f861bd50692 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -134,7 +134,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-#if 0 /* broken */
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio)
 {
@@ -151,6 +150,8 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 	path = btrfs_alloc_path();
+	if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+		path->reada = 2;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
@@ -211,7 +212,6 @@ found:
 	btrfs_free_path(path);
 	return 0;
 }
-#endif
 
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6c778043207..38d7a81763c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -389,15 +389,15 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	if (!(rw & (1 << BIO_RW))) {
-		goto mapit;
-	}
-
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM)) {
 		goto mapit;
 	}
 
+	if (!(rw & (1 << BIO_RW))) {
+		btrfs_lookup_bio_sums(root, inode, bio);
+		goto mapit;
+	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   __btrfs_submit_bio_hook);
@@ -603,59 +603,6 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
-{
-	int ret = 0;
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
-
-	/*
-	 * It is possible there is an ordered extent that has
-	 * not yet finished for this range in the file.  If so,
-	 * that extent will have a csum cached, and it will insert
-	 * the sum after all the blocks in the extent are fully
-	 * on disk.  So, look for an ordered extent and use the
-	 * sum if found.  We have to do this before looking in the
-	 * btree because csum items are pre-inserted based on
-	 * the file size.  btrfs_lookup_csum might find an item
-	 * that still hasn't been fully filled.
-	 */
-	ret = btrfs_find_ordered_sum(inode, start, &csum);
-	if (ret == 0)
-		goto found;
-
-	ret = 0;
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		csum = 0;
-		if (printk_ratelimit())
-			printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-			       start);
-		goto out;
-	}
-	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
-			   BTRFS_CRC32_SIZE);
-found:
-	set_state_private(io_tree, start, csum);
-out:
-	if (path)
-		btrfs_free_path(path);
-	return ret;
-}
-
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -3669,7 +3616,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
-	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-- 
cgit v1.2.3


From b64a2851ba25b3410a87d3d1b751155612105c8e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Aug 2008 13:39:41 -0400
Subject: Btrfs: Wait for async bio submissions to make some progress at queue
 time

Before, the btrfs bdi congestion function was used to test for too many
async bios.  This keeps that check to throttle pdflush, but also
adds a check while queuing bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 16 +++++++++-------
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/transaction.c |  2 --
 fs/btrfs/volumes.c     | 18 +++++++++++++++++-
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92e14dd9bdd..bbba14b629d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,7 +429,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static unsigned long async_submit_limit(struct btrfs_fs_info *info)
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 {
 	unsigned long limit = min_t(unsigned long,
 				    info->workers.max_workers,
@@ -439,7 +439,8 @@ static unsigned long async_submit_limit(struct btrfs_fs_info *info)
 
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 {
-	return atomic_read(&info->nr_async_bios) > async_submit_limit(info);
+	return atomic_read(&info->nr_async_bios) >
+		btrfs_async_submit_limit(info);
 }
 
 static void run_one_async_submit(struct btrfs_work *work)
@@ -451,12 +452,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 
-	limit = async_submit_limit(fs_info);
+	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
 	atomic_dec(&fs_info->nr_async_submits);
 
-	if (atomic_read(&fs_info->nr_async_submits) < limit)
+	if (atomic_read(&fs_info->nr_async_submits) < limit &&
+	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
@@ -469,7 +471,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
-	int limit = async_submit_limit(fs_info);
+	int limit = btrfs_async_submit_limit(fs_info);
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
@@ -1863,10 +1865,10 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 12 * 1024 * 1024;
+	unsigned long thresh = 96 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
-	if (current_is_pdflush())
+	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
 		return;
 
 	num_dirty = count_range_bits(tree, &start, (u64)-1,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e904a69347a..2562a273ae1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -73,4 +73,5 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6bcb0876f9b..eff3ad72991 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -322,8 +322,6 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 		if (ret)
 			break;
 		while(start <= end) {
-			if (btrfs_congested_async(root->fs_info, 0))
-				congestion_wait(WRITE, HZ/10);
 			cond_resched();
 
 			index = start >> PAGE_CACHE_SHIFT;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23a5b0aba00..2652660e607 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,12 +138,18 @@ int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
+	struct btrfs_fs_info *fs_info;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run = 0;
+	unsigned long limit;
 
 	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+	fs_info = device->dev_root->fs_info;
+	limit = btrfs_async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
 loop:
 	spin_lock(&device->io_lock);
 
@@ -179,7 +185,11 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
+		atomic_dec(&fs_info->nr_async_bios);
+
+		if (atomic_read(&fs_info->nr_async_bios) < limit &&
+		    waitqueue_active(&fs_info->async_submit_wait))
+			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2135,6 +2145,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2171,6 +2182,11 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
+
+	limit = btrfs_async_submit_limit(root->fs_info);
+	wait_event_timeout(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
+			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 1a54ef8c11a0eaef59ff418089f109c27f8bd48d Mon Sep 17 00:00:00 2001
From: Balaji Rao <balajirrao@gmail.com>
Date: Mon, 21 Jul 2008 02:01:04 +0530
Subject: Introduce btrfs_iget helper

Date: Mon, 21 Jul 2008 02:01:04 +0530
This patch introduces a btrfs_iget helper to be used in NFS support.

Signed-off-by: Balaji Rao <balajirrao@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 39 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 04021335939..cecf29e0363 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1700,6 +1700,8 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
 			    u64 root_objectid);
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+			 struct btrfs_root *root, int *is_new);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 38d7a81763c..266ca5514b3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1840,6 +1840,33 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 	return inode;
 }
 
+/* Get an inode object given its location and corresponding root.
+ * Returns in *is_new if the inode was read from disk
+ */
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+			 struct btrfs_root *root, int *is_new)
+{
+	struct inode *inode;
+
+	inode = btrfs_iget_locked(s, location->objectid, root);
+	if (!inode)
+		return ERR_PTR(-EACCES);
+
+	if (inode->i_state & I_NEW) {
+		BTRFS_I(inode)->root = root;
+		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+		if (is_new)
+			*is_new = 1;
+	} else {
+		if (is_new)
+			*is_new = 0;
+	}
+
+	return inode;
+}
+
 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
@@ -1848,7 +1875,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret, do_orphan = 0;
+	int ret, new, do_orphan = 0;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1866,23 +1893,15 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
-
-		inode = btrfs_iget_locked(dir->i_sb, location.objectid,
-					  sub_root);
-		if (!inode)
-			return ERR_PTR(-EACCES);
-		if (inode->i_state & I_NEW) {
-			/* the inode and parent dir are two different roots */
-			if (sub_root != root) {
-				igrab(inode);
-				sub_root->inode = inode;
-				do_orphan = 1;
-			}
-			BTRFS_I(inode)->root = sub_root;
-			memcpy(&BTRFS_I(inode)->location, &location,
-			       sizeof(location));
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+
+		/* the inode and parent dir are two different roots */
+		if (new && root != sub_root) {
+			igrab(inode);
+			sub_root->inode = inode;
+			do_orphan = 1;
 		}
 	}
 
-- 
cgit v1.2.3


From be6e8dc0ba84029997075a1ec77b4ddb863cbe15 Mon Sep 17 00:00:00 2001
From: Balaji Rao <balajirrao@gmail.com>
Date: Mon, 21 Jul 2008 02:01:56 +0530
Subject: NFS support for btrfs - v3

Date: Mon, 21 Jul 2008 02:01:56 +0530
Here's an implementation of NFS support for btrfs. It relies on the
fixes which are going into 2.6.28 for the NFS readdir/lookup deadlock.

This uses the btrfs_iget helper introduced previously.

[dwmw2: Tidy up a little, switch to d_obtain_alias() w/compat routine,
	change fh_type, store parent's root object ID where needed,
	fix some get_parent() and fh_to_dentry() bugs]

Signed-off-by: Balaji Rao <balajirrao@gmail.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/compat.h |  17 +++++
 fs/btrfs/export.c | 208 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/export.h |  19 +++++
 fs/btrfs/super.c  |   2 +
 5 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/export.c
 create mode 100644 fs/btrfs/export.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a4b38177abd..75f8818cbfe 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o
+	   ref-cache.o export.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index b3349a66999..d45fb37887b 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -5,6 +5,23 @@
 #define trylock_page(page) (!TestSetPageLocked(page))
 #endif
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
+static inline struct dentry *d_obtain_alias(struct inode *inode)
+{
+	struct dentry *d;
+
+	if (!inode)
+		return NULL;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+
+	d = d_alloc_anon(inode);
+	if (!d)
+		iput(inode);
+	return d;
+}
+#endif
+
 /*
  * Even if AppArmor isn't enabled, it still has different prototypes.
  * Add more distro/version pairs here to declare which has AppArmor applied.
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
new file mode 100644
index 00000000000..797b4cbc378
--- /dev/null
+++ b/fs/btrfs/export.c
@@ -0,0 +1,208 @@
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "btrfs_inode.h"
+#include "print-tree.h"
+#include "export.h"
+#include "compat.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+#define FILEID_BTRFS_WITHOUT_PARENT		0x4d
+#define FILEID_BTRFS_WITH_PARENT 		0x4e
+#define FILEID_BTRFS_WITH_PARENT_ROOT 		0x4f
+#endif
+
+#define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
+#define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
+
+static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+			   int connectable)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *)fh;
+	struct inode *inode = dentry->d_inode;
+	int len = *max_len;
+	int type;
+
+	if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
+	    (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+		return 255;
+
+	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
+	type = FILEID_BTRFS_WITHOUT_PARENT;
+
+	fid->objectid = BTRFS_I(inode)->location.objectid;
+	fid->root_objectid = BTRFS_I(inode)->root->objectid;
+	fid->gen = inode->i_generation;
+
+	if (connectable && !S_ISDIR(inode->i_mode)) {
+		struct inode *parent;
+		u64 parent_root_id;
+
+		spin_lock(&dentry->d_lock);
+
+		parent = dentry->d_parent->d_inode;
+		fid->parent_objectid = BTRFS_I(parent)->location.objectid;
+		fid->parent_gen = parent->i_generation;
+		parent_root_id = BTRFS_I(parent)->root->objectid;
+
+		spin_unlock(&dentry->d_lock);
+
+		if (parent_root_id != fid->root_objectid) {
+			fid->parent_root_objectid = parent_root_id;
+			len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
+			type = FILEID_BTRFS_WITH_PARENT_ROOT;
+		} else {
+			len = BTRFS_FID_SIZE_CONNECTABLE;
+			type = FILEID_BTRFS_WITH_PARENT;
+		}
+	}
+
+	*max_len = len;
+	return type;
+}
+
+static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+				       u64 root_objectid, u32 generation)
+{
+	struct btrfs_root *root;
+	struct inode *inode;
+	struct dentry *result;
+	struct btrfs_key key;
+
+	key.objectid = objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid);
+	inode = btrfs_iget(sb, &key, root, NULL);
+	if (IS_ERR(inode))
+		return (void *)inode;
+
+	if (generation != inode->i_generation) {
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+
+	result = d_obtain_alias(inode);
+	if (!result)
+		return ERR_PTR(-ENOMEM);
+
+	return result;
+}
+
+static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+	u64 objectid, root_objectid;
+	u32 generation;
+
+	if (fh_type == FILEID_BTRFS_WITH_PARENT) {
+		if (fh_len !=  BTRFS_FID_SIZE_CONNECTABLE)
+			return NULL;
+		root_objectid = fid->root_objectid;
+	} else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
+		if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+			return NULL;
+		root_objectid = fid->parent_root_objectid;
+	} else
+		return NULL;
+
+	objectid = fid->parent_objectid;
+	generation = fid->parent_gen;
+
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+}
+
+static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
+					 int fh_len, int fh_type)
+{
+	struct btrfs_fid *fid = (struct btrfs_fid *) fh;
+	u64 objectid, root_objectid;
+	u32 generation;
+
+	if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
+	     fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+	    (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
+	     fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+	    (fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
+	     fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+		return NULL;
+
+	objectid = fid->objectid;
+	root_objectid = fid->root_objectid;
+	generation = fid->gen;
+
+	return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+}
+
+static struct dentry *btrfs_get_parent(struct dentry *child)
+{
+	struct inode *dir = child->d_inode;
+	struct inode *inode;
+	struct dentry *parent;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int slot;
+	u64 objectid;
+	int ret;
+
+	path = btrfs_alloc_path();
+
+	key.objectid = dir->i_ino;
+	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+	key.offset = 0;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	BUG_ON(ret == 0);
+	ret = 0;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	nritems = btrfs_header_nritems(leaf);
+	if (slot >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret) {
+			btrfs_free_path(path);
+			goto out;
+		}
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+	}
+
+	btrfs_free_path(path);
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
+		goto out;
+
+	objectid = key.offset;
+
+	/* Build a new key for the inode item */
+	key.objectid = objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+
+	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+
+	parent = d_obtain_alias(inode);
+	if (!parent)
+		parent = ERR_PTR(-ENOMEM);
+
+	return parent;
+
+out:
+	btrfs_free_path(path);
+	return ERR_PTR(-EINVAL);
+}
+
+const struct export_operations btrfs_export_ops = {
+	.encode_fh	= btrfs_encode_fh,
+	.fh_to_dentry	= btrfs_fh_to_dentry,
+	.fh_to_parent	= btrfs_fh_to_parent,
+	.get_parent	= btrfs_get_parent,
+};
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
new file mode 100644
index 00000000000..074348a9584
--- /dev/null
+++ b/fs/btrfs/export.h
@@ -0,0 +1,19 @@
+#ifndef BTRFS_EXPORT_H
+#define BTRFS_EXPORT_H
+
+#include <linux/exportfs.h>
+
+extern const struct export_operations btrfs_export_ops;
+
+struct btrfs_fid {
+	u64 objectid;
+	u64 root_objectid;
+	u32 gen;
+
+	u64 parent_objectid;
+	u32 parent_gen;
+
+	u64 parent_root_objectid;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index eb4b357d05e..e830e0ed409 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -46,6 +46,7 @@
 #include "xattr.h"
 #include "volumes.h"
 #include "version.h"
+#include "export.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -303,6 +304,7 @@ static int btrfs_fill_super(struct super_block * sb,
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_magic = BTRFS_SUPER_MAGIC;
 	sb->s_op = &btrfs_super_ops;
+	sb->s_export_op = &btrfs_export_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_POSIXACL;
-- 
cgit v1.2.3


From cbdf5a2442330102c08f5a2ad3058e29e90a43a9 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 6 Aug 2008 19:42:33 +0100
Subject: Btrfs: Implement our own copy of the nfsd readdir hack, for older
 kernels

Date: Wed, 6 Aug 2008 19:42:33 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 102 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 266ca5514b3..0da1ae4ee40 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1915,7 +1915,8 @@ static unsigned char btrfs_filetype_table[] = {
 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
 };
 
-static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+static int btrfs_real_readdir(struct file *filp, void *dirent,
+			      filldir_t filldir)
 {
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2064,6 +2065,101 @@ err:
 	return ret;
 }
 
+/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd
+   will call the file system's ->lookup() method from within its
+   filldir callback, which in turn was called from the file system's
+   ->readdir() method. And will deadlock for many file systems. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+
+struct nfshack_dirent {
+	u64		ino;
+	loff_t		offset;
+	int		namlen;
+	unsigned int	d_type;
+	char		name[];
+};
+
+struct nfshack_readdir {
+	char		*dirent;
+	size_t		used;
+};
+
+
+
+static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
+			      loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct nfshack_readdir *buf = __buf;
+	struct nfshack_dirent *de = (void *)(buf->dirent + buf->used);
+	unsigned int reclen;
+
+	reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
+	if (buf->used + reclen > PAGE_SIZE)
+		return -EINVAL;
+
+	de->namlen = namlen;
+	de->offset = offset;
+	de->ino = ino;
+	de->d_type = d_type;
+	memcpy(de->name, name, namlen);
+	buf->used += reclen;
+
+	return 0;
+}
+
+static int btrfs_nfshack_readdir(struct file *file, void *dirent,
+				 filldir_t filldir)
+{
+	struct nfshack_readdir buf;
+	struct nfshack_dirent *de;
+	int err;
+	int size;
+	loff_t offset;
+
+	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf.dirent)
+		return -ENOMEM;
+
+	offset = file->f_pos;
+
+	while (1) {
+		unsigned int reclen;
+
+		buf.used = 0;
+
+		err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
+		if (err)
+			break;
+
+		size = buf.used;
+
+		if (!size)
+			break;
+
+		de = (struct nfshack_dirent *)buf.dirent;
+		while (size > 0) {
+			offset = de->offset;
+
+			if (filldir(dirent, de->name, de->namlen, de->offset,
+				    de->ino, de->d_type))
+				goto done;
+			offset = file->f_pos;
+
+			reclen = ALIGN(sizeof(*de) + de->namlen,
+				       sizeof(u64));
+			size -= reclen;
+			de = (struct nfshack_dirent *)((char *)de + reclen);
+		}
+	}
+
+ done:
+	free_page((unsigned long)buf.dirent);
+	file->f_pos = offset;
+
+	return err;
+}
+#endif
+
 int btrfs_write_inode(struct inode *inode, int wait)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3623,7 +3719,11 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-	.readdir	= btrfs_readdir,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+	.readdir	= btrfs_nfshack_readdir,
+#else /* NFSd readdir/lookup deadlock is fixed */
+	.readdir	= btrfs_real_readdir,
+#endif
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
-- 
cgit v1.2.3


From 21af804c07141c035085f99798efaabbc7836a97 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 12 Aug 2008 14:13:26 +0100
Subject: Btrfs: Discard sector data in __free_extent()

Date: Tue, 12 Aug 2008 14:13:26 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fff219ed61d..e63b3b4bed7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -1716,6 +1717,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
+#ifdef BIO_RW_DISCARD
+		u64 map_length = num_bytes;
+		struct btrfs_multi_bio *multi = NULL;
+#endif
 
 		if (pin) {
 			ret = pin_down_bytes(root, bytenr, num_bytes, 0);
@@ -1743,6 +1748,26 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
+
+#ifdef BIO_RW_DISCARD
+		/* Tell the block device(s) that the sectors can be discarded */
+		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+				      bytenr, &map_length, &multi, 0);
+		if (!ret) {
+			struct btrfs_bio_stripe *stripe = multi->stripes;
+			int i;
+
+			if (map_length > num_bytes)
+				map_length = num_bytes;
+
+			for (i = 0; i < multi->num_stripes; i++, stripe++) {
+				blkdev_issue_discard(stripe->dev->bdev,
+						     stripe->physical >> 9,
+						     map_length >> 9);
+			}
+			kfree(multi);
+		}
+#endif
 	}
 	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root);
-- 
cgit v1.2.3


From 5ecc7e5d1d58731b3a6a69303d69aca7fb0fe1f5 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 17 Aug 2008 15:14:48 +0100
Subject: Btrfs: Remove special cases for "." and ".."

Date: Sun, 17 Aug 2008 15:14:48 +0100
We never get asked by the VFS to look up either of them, and we can
handle the readdir() case a lot more simply, too.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 52 ++--------------------------------------------------
 1 file changed, 2 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0da1ae4ee40..4520a0e86e7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1702,42 +1702,9 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	int ret = 0;
 
-	if (namelen == 1 && strcmp(name, ".") == 0) {
-		location->objectid = dir->i_ino;
-		location->type = BTRFS_INODE_ITEM_KEY;
-		location->offset = 0;
-		return 0;
-	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	if (namelen == 2 && strcmp(name, "..") == 0) {
-		struct btrfs_key key;
-		struct extent_buffer *leaf;
-		int slot;
-
-		key.objectid = dir->i_ino;
-		key.offset = (u64)-1;
-		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-		if (ret < 0 || path->slots[0] == 0)
-			goto out_err;
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		BUG_ON(ret == 0);
-		ret = 0;
-		leaf = path->nodes[0];
-		slot = path->slots[0] - 1;
-
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (key.objectid != dir->i_ino ||
-		    key.type != BTRFS_INODE_REF_KEY) {
-			goto out_err;
-		}
-		location->objectid = key.offset;
-		location->type = BTRFS_INODE_ITEM_KEY;
-		location->offset = 0;
-		goto out;
-	}
-
 	di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
 				    namelen, 0);
 	if (IS_ERR(di))
@@ -1960,29 +1927,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 	/* special case for .., just use the back ref */
 	if (filp->f_pos == 1) {
-		btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-		key.offset = (u64)-1;
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0 || path->slots[0] == 0) {
-			btrfs_release_path(root, path);
-			goto read_dir_items;
-		}
-		BUG_ON(ret == 0);
-		leaf = path->nodes[0];
-		slot = path->slots[0] - 1;
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		btrfs_release_path(root, path);
-		if (found_key.objectid != key.objectid ||
-		    found_key.type != BTRFS_INODE_REF_KEY)
-			goto read_dir_items;
+		u64 pino = parent_ino(filp->f_path.dentry);
 		over = filldir(dirent, "..", 2,
-			       2, found_key.offset, DT_DIR);
+			       2, pino, DT_DIR);
 		if (over)
 			goto nopos;
 		filp->f_pos = 2;
 	}
 
-read_dir_items:
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
 
-- 
cgit v1.2.3


From 49593bfa575b7e3fda073b6d1033ee273bdaf97c Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 17 Aug 2008 17:08:36 +0100
Subject: Minor cleanup of btrfs_real_readdir()

Date: Sun, 17 Aug 2008 17:08:36 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4520a0e86e7..c7b7095634d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1920,34 +1920,34 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			return 0;
 		filp->f_pos = 1;
 	}
-
-	key.objectid = inode->i_ino;
-	path = btrfs_alloc_path();
-	path->reada = 2;
-
 	/* special case for .., just use the back ref */
 	if (filp->f_pos == 1) {
 		u64 pino = parent_ino(filp->f_path.dentry);
 		over = filldir(dirent, "..", 2,
 			       2, pino, DT_DIR);
 		if (over)
-			goto nopos;
+			return 0;
 		filp->f_pos = 2;
 	}
 
+	path = btrfs_alloc_path();
+	path->reada = 2;
+
 	btrfs_set_key_type(&key, key_type);
 	key.offset = filp->f_pos;
+	key.objectid = inode->i_ino;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
 	advance = 0;
-	while(1) {
+
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
 		if (advance || slot >= nritems) {
-			if (slot >= nritems -1) {
+			if (slot >= nritems - 1) {
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
@@ -1971,19 +1971,23 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			continue;
 
 		filp->f_pos = found_key.offset;
-		advance = 1;
+
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		di_cur = 0;
 		di_total = btrfs_item_size(leaf, item);
-		while(di_cur < di_total) {
+
+		while (di_cur < di_total) {
 			struct btrfs_key location;
 
 			name_len = btrfs_dir_name_len(leaf, di);
-			if (name_len < 32) {
+			if (name_len <= sizeof(tmp_name)) {
 				name_ptr = tmp_name;
 			} else {
 				name_ptr = kmalloc(name_len, GFP_NOFS);
-				BUG_ON(!name_ptr);
+				if (!name_ptr) {
+					ret = -ENOMEM;
+					goto err;
+				}
 			}
 			read_extent_buffer(leaf, name_ptr,
 					   (unsigned long)(di + 1), name_len);
@@ -1991,8 +1995,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
 			over = filldir(dirent, name_ptr, name_len,
-				       found_key.offset,
-				       location.objectid,
+				       found_key.offset, location.objectid,
 				       d_type);
 
 			if (name_ptr != tmp_name)
@@ -2000,12 +2003,15 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 			if (over)
 				goto nopos;
+
 			di_len = btrfs_dir_name_len(leaf, di) +
-				btrfs_dir_data_len(leaf, di) +sizeof(*di);
+				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
 			di_cur += di_len;
 			di = (struct btrfs_dir_item *)((char *)di + di_len);
 		}
 	}
+
+	/* Reached end of directory/root. Bump pos past the last item. */
 	if (key_type == BTRFS_DIR_INDEX_KEY)
 		filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
 	else
-- 
cgit v1.2.3


From f2322b1c652add8bcd64b10843d76b0211ab1fc6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Sun, 17 Aug 2008 17:12:56 +0100
Subject: Btrfs: Optimise NFS readdir hack slightly; don't call readdir() again
 when done

Date: Sun, 17 Aug 2008 17:12:56 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c7b7095634d..10f26f44532 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2040,6 +2040,7 @@ struct nfshack_dirent {
 struct nfshack_readdir {
 	char		*dirent;
 	size_t		used;
+	int		full;
 };
 
 
@@ -2052,8 +2053,10 @@ static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
 	unsigned int reclen;
 
 	reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > PAGE_SIZE)
+	if (buf->used + reclen > PAGE_SIZE) {
+		buf->full = 1;
 		return -EINVAL;
+	}
 
 	de->namlen = namlen;
 	de->offset = offset;
@@ -2080,11 +2083,11 @@ static int btrfs_nfshack_readdir(struct file *file, void *dirent,
 
 	offset = file->f_pos;
 
-	while (1) {
+	do {
 		unsigned int reclen;
 
 		buf.used = 0;
-
+		buf.full = 0;
 		err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
 		if (err)
 			break;
@@ -2108,7 +2111,7 @@ static int btrfs_nfshack_readdir(struct file *file, void *dirent,
 			size -= reclen;
 			de = (struct nfshack_dirent *)((char *)de + reclen);
 		}
-	}
+	} while (buf.full);
 
  done:
 	free_page((unsigned long)buf.dirent);
-- 
cgit v1.2.3


From 9d03632e26e1a0a9e4a632cf426a7c0566768a7d Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Mon, 18 Aug 2008 12:01:52 +0100
Subject: Fill f_fsid field in btrfs_statfs()

Date: Mon, 18 Aug 2008 12:01:52 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e830e0ed409..6446ab73f35 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -489,6 +489,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 	int bits = dentry->d_sb->s_blocksize_bits;
+	__be32 *fsid = (__be32 *)root->fs_info->fsid;
 
 	buf->f_namelen = BTRFS_NAME_LEN;
 	buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
@@ -497,6 +498,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
+	/* We treat it as constant endianness (it doesn't matter _which_)
+	   because we want the fsid to come out the same whether mounted 
+	   on a big-endian or little-endian host */
+	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
+	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 32d48fa1af1fe066a6a4798e6f5a50ac6a3ce4a3 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Mon, 18 Aug 2008 13:10:20 +0100
Subject: Mask root object ID into f_fsid in btrfs_statfs()

Date: Mon, 18 Aug 2008 13:10:20 +0100
This means that subvolumes get a different fsid, and NFS exporting them
works properly.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6446ab73f35..55f4d00fda3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -503,6 +503,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	   on a big-endian or little-endian host */
 	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
 	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
+	/* Mask in the root object ID too, to disambiguate subvols */
+	buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32;
+	buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid;
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 87acb4ef9b2991e1c453b78d71bce2ef994ef1ff Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Mon, 18 Aug 2008 22:50:22 +0100
Subject: Simplify btrfs_get_parent(), fix use-after-free bug

Date: Mon, 18 Aug 2008 22:50:22 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 797b4cbc378..a913b9befe6 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -147,7 +147,6 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	u32 nritems;
 	int slot;
 	u64 objectid;
 	int ret;
@@ -156,27 +155,24 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.objectid = dir->i_ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
-	key.offset = 0;
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	BUG_ON(ret == 0);
-	ret = 0;
+	key.offset = (u64)-1;
 
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	leaf = path->nodes[0];
 	slot = path->slots[0];
-	nritems = btrfs_header_nritems(leaf);
-	if (slot >= nritems) {
-		ret = btrfs_next_leaf(root, path);
-		if (ret) {
-			btrfs_free_path(path);
-			goto out;
-		}
-		leaf = path->nodes[0];
-		slot = path->slots[0];
+	if (ret < 0 || slot == 0) {
+		btrfs_free_path(path);
+		goto out;
 	}
+	/* btrfs_search_slot() returns the slot where we'd want to insert
+	   an INODE_REF_KEY for parent inode #0xFFFFFFFFFFFFFFFF. The _real_
+	   one, telling us what the parent inode _actually_ is, will be in
+	   the slot _before_ the one that btrfs_search_slot() returns. */
+	slot--;
 
+	btrfs_item_key_to_cpu(leaf, &key, slot);
 	btrfs_free_path(path);
 
-	btrfs_item_key_to_cpu(leaf, &key, slot);
 	if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
 		goto out;
 
-- 
cgit v1.2.3


From 76fcef19c40328499a2f6d59d76b72fd03d2cc82 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 19 Aug 2008 16:49:35 +0100
Subject: Btrfs: Reinstate '-osubvol=.' option to mount entire tree

Date: Tue, 19 Aug 2008 16:49:35 +0100
This disappeared when I removed the special case for '.' in btrfs_lookup()

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 55f4d00fda3..f7b3eac7ac6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -451,21 +451,25 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	mutex_lock(&s->s_root->d_inode->i_mutex);
-	root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
-	mutex_unlock(&s->s_root->d_inode->i_mutex);
-	if (IS_ERR(root)) {
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		error = PTR_ERR(root);
-		goto error;
-	}
-	if (!root->d_inode) {
-		dput(root);
-		up_write(&s->s_umount);
-		deactivate_super(s);
-		error = -ENXIO;
-		goto error;
+	if (!strcmp(subvol_name, "."))
+		root = dget(s->s_root);
+	else {
+		mutex_lock(&s->s_root->d_inode->i_mutex);
+		root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+		mutex_unlock(&s->s_root->d_inode->i_mutex);
+		if (IS_ERR(root)) {
+			up_write(&s->s_umount);
+			deactivate_super(s);
+			error = PTR_ERR(root);
+			goto error;
+		}
+		if (!root->d_inode) {
+			dput(root);
+			up_write(&s->s_umount);
+			deactivate_super(s);
+			error = -ENXIO;
+			goto error;
+		}
 	}
 
 	mnt->mnt_sb = s;
-- 
cgit v1.2.3


From 615f996fb8185a0bc02812ebd72cb77ded5645f1 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 19 Aug 2008 19:21:57 +0100
Subject: Switch btrfs_name_hash() to crc32c

Date: Tue, 19 Aug 2008 19:21:57 +0100
Using a 64-bit hash as the readdir cookie is just asking for trouble.
And gets it, when we try to export the file system by NFS.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/hash.c   | 112 ------------------------------------------------------
 fs/btrfs/hash.h   |   7 +++-
 3 files changed, 7 insertions(+), 114 deletions(-)
 delete mode 100644 fs/btrfs/hash.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 75f8818cbfe..8213bba1de9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -3,7 +3,7 @@ ifneq ($(KERNELRELEASE),)
 
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
-	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
+	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
deleted file mode 100644
index 21037cfca9b..00000000000
--- a/fs/btrfs/hash.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-/*
- *  Original copy from:
- *  linux/fs/ext3/hash.c
- *
- * Copyright (C) 2002 by Theodore Ts'o
- *
- * This file is released under the GPL v2.
- *
- * This file may be redistributed under the terms of the GNU Public
- * License.
- */
-
-#include <linux/types.h>
-#include "hash.h"
-#define DELTA 0x9E3779B9
-
-static void TEA_transform(__u32 buf[2], __u32 const in[])
-{
-	__u32	sum = 0;
-	__u32	b0 = buf[0], b1 = buf[1];
-	__u32	a = in[0], b = in[1], c = in[2], d = in[3];
-	int	n = 16;
-
-	do {
-		sum += DELTA;
-		b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
-		b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
-	} while(--n);
-
-	buf[0] += b0;
-	buf[1] += b1;
-}
-
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-{
-	__u32	pad, val;
-	int	i;
-
-	pad = (__u32)len | ((__u32)len << 8);
-	pad |= pad << 16;
-
-	val = pad;
-	if (len > num*4)
-		len = num * 4;
-	for (i=0; i < len; i++) {
-		if ((i % 4) == 0)
-			val = pad;
-		val = msg[i] + (val << 8);
-		if ((i % 4) == 3) {
-			*buf++ = val;
-			val = pad;
-			num--;
-		}
-	}
-	if (--num >= 0)
-		*buf++ = val;
-	while (--num >= 0)
-		*buf++ = pad;
-}
-
-u64 btrfs_name_hash(const char *name, int len)
-{
-	__u32	hash;
-	__u32	minor_hash = 0;
-	const char	*p;
-	__u32		in[8], buf[4];
-	u64		hash_result;
-
-	if (len == 1 && *name == '.') {
-		return 1;
-	} else if (len == 2 && name[0] == '.' && name[1] == '.') {
-		return 2;
-	}
-
-	/* Initialize the default seed for the hash checksum functions */
-	buf[0] = 0x67452301;
-	buf[1] = 0xefcdab89;
-	buf[2] = 0x98badcfe;
-	buf[3] = 0x10325476;
-
-	p = name;
-	while (len > 0) {
-		str2hashbuf(p, len, in, 4);
-		TEA_transform(buf, in);
-		len -= 16;
-		p += 16;
-	}
-	hash = buf[0];
-	minor_hash = buf[1];
-	hash_result = buf[0];
-	hash_result <<= 32;
-	hash_result |= buf[1];
-	return hash_result;
-}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 868ee17ca77..2a020b27676 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -18,5 +18,10 @@
 
 #ifndef __HASH__
 #define __HASH__
-u64 btrfs_name_hash(const char *name, int len);
+
+#include "crc32c.h"
+static inline u64 btrfs_name_hash(const char *name, int len)
+{
+	return btrfs_crc32c((u32)~1, name, len);
+}
 #endif
-- 
cgit v1.2.3


From 2d4d9fbd6efa858dfa009518fca1ab85a73fd848 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 19 Aug 2008 22:20:17 +0100
Subject: Btrfs: Fix NFS exporting of subvol roots.

Date: Tue, 19 Aug 2008 22:20:17 +0100
btrfs_lookup_fs_root() only finds subvol roots which have already been
seen and put into the cache. For btrfs_get_dentry() we actually have to
go to the medium -- so use btrfs_read_fs_root_no_name() instead.

In btrfs_get_parent(), notice when we've hit the root of the
subvolume and return the real root instead.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index a913b9befe6..36cbc6872fd 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -71,11 +71,18 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	struct dentry *result;
 	struct btrfs_key key;
 
+	key.objectid = root_objectid;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	root = btrfs_lookup_fs_root(btrfs_sb(sb)->fs_info, root_objectid);
 	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode))
 		return (void *)inode;
@@ -178,6 +185,10 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	objectid = key.offset;
 
+	/* If we are already at the root of a subvol, return the real root */
+	if (objectid == dir->i_ino)
+		return dget(dir->i_sb->s_root);
+
 	/* Build a new key for the inode item */
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-- 
cgit v1.2.3


From d54a83901055bb0bffca64fa09fce4d897274870 Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Tue, 19 Aug 2008 22:33:04 +0100
Subject: Clean up btrfs_get_parent() a little more, fix a free-after-free bug

Date: Tue, 19 Aug 2008 22:33:04 +0100
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 36cbc6872fd..292b0b24c30 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -165,23 +165,32 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	key.offset = (u64)-1;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		/* Error */
+		btrfs_free_path(path);
+		return ERR_PTR(ret);
+	}
 	leaf = path->nodes[0];
 	slot = path->slots[0];
-	if (ret < 0 || slot == 0) {
-		btrfs_free_path(path);
-		goto out;
+	if (ret) {
+		/* btrfs_search_slot() returns the slot where we'd want to
+		   insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
+		   The _real_ backref, telling us what the parent inode
+		   _actually_ is, will be in the slot _before_ the one
+		   that btrfs_search_slot() returns. */
+		if (!slot) {
+			/* Unless there is _no_ key in the tree before... */
+			btrfs_free_path(path);
+			return ERR_PTR(-EIO);
+		}
+		slot--;
 	}
-	/* btrfs_search_slot() returns the slot where we'd want to insert
-	   an INODE_REF_KEY for parent inode #0xFFFFFFFFFFFFFFFF. The _real_
-	   one, telling us what the parent inode _actually_ is, will be in
-	   the slot _before_ the one that btrfs_search_slot() returns. */
-	slot--;
 
 	btrfs_item_key_to_cpu(leaf, &key, slot);
 	btrfs_free_path(path);
 
 	if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
-		goto out;
+		return ERR_PTR(-EINVAL);
 
 	objectid = key.offset;
 
@@ -201,10 +210,6 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 		parent = ERR_PTR(-ENOMEM);
 
 	return parent;
-
-out:
-	btrfs_free_path(path);
-	return ERR_PTR(-EINVAL);
 }
 
 const struct export_operations btrfs_export_ops = {
-- 
cgit v1.2.3


From f3f9931e3d0836509cfccdf473b34e34543a3272 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 21 Aug 2008 15:49:09 -0400
Subject: Btrfs: Rev the disk format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cecf29e0363..b305ae7e10b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,7 +40,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B6RfS_M"
+#define BTRFS_MAGIC "_B7RfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 9473f16c75606fe6b2e5000525fe9766114505f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Aug 2008 06:15:24 -0400
Subject: Btrfs: Throttle for async bio submits higher up the chain

The current code waits for the count of async bio submits to get below
a given threshold if it is too high right after adding the latest bio
to the work queue.  This isn't optimal because the caller may have
sequential adjacent bios pending that it is waiting to send down the pipe.

This changeset requires the caller to wait on the async bio count,
and changes the async checksumming submits to wait for async bios any
time they self throttle.

The end result is much higher sequential throughput.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 8 +++++++-
 fs/btrfs/volumes.c | 6 ------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bbba14b629d..6a218f792e5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -487,9 +487,15 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 
-	wait_event_timeout(fs_info->async_submit_wait,
+	if (atomic_read(&fs_info->nr_async_submits) > limit) {
+		wait_event_timeout(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) < limit),
 			   HZ/10);
+
+		wait_event_timeout(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_bios) < limit),
+			   HZ/10);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2652660e607..5b1b60839d2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2145,7 +2145,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
-	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2182,11 +2181,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
-
-	limit = btrfs_async_submit_limit(root->fs_info);
-	wait_event_timeout(root->fs_info->async_submit_wait,
-			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
-			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 639cb58675ce9b507eed9c3d6b3335488079b21a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Aug 2008 06:15:25 -0400
Subject: Btrfs: Fix variable init during csum creation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file-item.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f861bd50692..6dbe88b9d7d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -325,6 +325,7 @@ again:
 	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
 	if (!IS_ERR(item)) {
 		leaf = path->nodes[0];
+		ret = 0;
 		goto found;
 	}
 	ret = PTR_ERR(item);
-- 
cgit v1.2.3


From eab922ec8907b8c506e799785e7e2d16eabe50e4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 28 Aug 2008 06:21:15 -0400
Subject: Btrfs: compile when posix acl's are disabled

This patch makes btrfs compile properly when POSIX ACLs are disabled.  I
tested this and it worked with CONFIG_FS_POSIX_ACL both off and on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile |  3 +--
 fs/btrfs/acl.c    | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 8213bba1de9..3a01065d4ef 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,9 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o
+	   ref-cache.o acl.o export.o
 
-btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index b192659b4f5..b95147ef1c7 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,6 +27,8 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
+#ifdef CONFIG_FS_POSIX_ACL
+
 static void btrfs_update_cached_acl(struct inode *inode,
 				    struct posix_acl **p_acl,
 				    struct posix_acl *acl)
@@ -329,3 +331,22 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get	= btrfs_xattr_acl_access_get,
 	.set	= btrfs_xattr_acl_access_set,
 };
+
+#else /* CONFIG_FS_POSIX_ACL */
+
+int btrfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+
+int btrfs_init_acl(struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+
+int btrfs_check_acl(struct inode *inode, int mask)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FS_POSIX_ACL */
-- 
cgit v1.2.3


From eaa47d8612783807ef9703ebc9bf0d0f0455bf62 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Aug 2008 06:21:16 -0400
Subject: btrfs: optimize listxattr

The ->list handler is really not useful at all, because we always call
btrfs_xattr_generic_list anyway.  After this is done
find_btrfs_xattr_handler becomes unused, and it becomes obvious that the
temporary name buffer allocation isn't needed but we can directly copy
into the supplied buffer.

Tested with various getfattr -d calls on varying xattr lists.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   |  2 --
 fs/btrfs/xattr.c | 82 ++++++++------------------------------------------------
 fs/btrfs/xattr.h |  8 ------
 3 files changed, 11 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index b95147ef1c7..2f865311460 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -320,14 +320,12 @@ int btrfs_acl_chmod(struct inode *inode)
 
 struct xattr_handler btrfs_xattr_acl_default_handler = {
 	.prefix = POSIX_ACL_XATTR_DEFAULT,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_acl_default_get,
 	.set	= btrfs_xattr_acl_default_set,
 };
 
 struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.prefix = POSIX_ACL_XATTR_ACCESS,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_acl_access_get,
 	.set	= btrfs_xattr_acl_access_set,
 };
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 121c9550314..fdfece41dd1 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -50,35 +50,6 @@ struct xattr_handler *btrfs_xattr_handlers[] = {
 	NULL,
 };
 
-/*
- * @param name - the xattr name
- * @return - the xattr_handler for the xattr, NULL if its not found
- *
- * use this with listxattr where we don't already know the type of xattr we
- * have
- */
-static struct xattr_handler *find_btrfs_xattr_handler(struct extent_buffer *l,
-						      unsigned long name_ptr,
-						      u16 name_len)
-{
-	struct xattr_handler *handler = NULL;
-	int i = 0;
-
-	for (handler = btrfs_xattr_handlers[i]; handler != NULL; i++,
-	     handler = btrfs_xattr_handlers[i]) {
-		u16 prefix_len = strlen(handler->prefix);
-
-		if (name_len < prefix_len)
-			continue;
-
-		if (memcmp_extent_buffer(l, handler->prefix, name_ptr,
-					 prefix_len) == 0)
-			break;
-	}
-
-	return handler;
-}
-
 /*
  * @param name_index - the index for the xattr handler
  * @return the xattr_handler if we found it, NULL otherwise
@@ -118,19 +89,6 @@ static inline char *get_name(const char *name, int name_index)
 	return ret;
 }
 
-size_t btrfs_xattr_generic_list(struct inode *inode, char *list,
-				size_t list_size, const char *name,
-				size_t name_len)
-{
-	if (list && (name_len+1) <= list_size) {
-		memcpy(list, name, name_len);
-		list[name_len] = '\0';
-	} else
-		return -ERANGE;
-
-	return name_len+1;
-}
-
 ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 			const char *attr_name, void *buffer, size_t size)
 {
@@ -278,11 +236,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct btrfs_item *item;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
-	struct xattr_handler *handler;
 	int ret = 0, slot, advance;
-	size_t total_size = 0, size_left = size, written;
+	size_t total_size = 0, size_left = size;
 	unsigned long name_ptr;
-	char *name;
+	size_t name_len;
 	u32 nritems;
 
 	/*
@@ -344,37 +301,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 
-		total_size += btrfs_dir_name_len(leaf, di)+1;
+		name_len = btrfs_dir_name_len(leaf, di);
+		total_size += name_len + 1;
 
 		/* we are just looking for how big our buffer needs to be */
 		if (!size)
 			continue;
 
-		/* find our handler for this xattr */
-		name_ptr = (unsigned long)(di + 1);
-		handler = find_btrfs_xattr_handler(leaf, name_ptr,
-						   btrfs_dir_name_len(leaf, di));
-		if (!handler) {
-			printk(KERN_ERR "btrfs: unsupported xattr found\n");
-			continue;
-		}
-
-		name = kmalloc(btrfs_dir_name_len(leaf, di), GFP_KERNEL);
-		read_extent_buffer(leaf, name, name_ptr,
-				   btrfs_dir_name_len(leaf, di));
-
-		/* call the list function associated with this xattr */
-		written = handler->list(inode, buffer, size_left, name,
-					btrfs_dir_name_len(leaf, di));
-		kfree(name);
-
-		if (written < 0) {
+		if (!buffer || (name_len + 1) > size_left) {
 			ret = -ERANGE;
 			break;
 		}
 
-		size_left -= written;
-		buffer += written;
+		name_ptr = (unsigned long)(di + 1);
+		read_extent_buffer(leaf, buffer, name_ptr, name_len);
+		buffer[name_len] = '\0';
+
+		size_left -= name_len + 1;
+		buffer += name_len + 1;
 	}
 	ret = total_size;
 
@@ -412,28 +356,24 @@ BTRFS_XATTR_SETGET_FUNCS(trusted, BTRFS_XATTR_INDEX_TRUSTED);
 
 struct xattr_handler btrfs_xattr_security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_security_get,
 	.set	= btrfs_xattr_security_set,
 };
 
 struct xattr_handler btrfs_xattr_system_handler = {
 	.prefix = XATTR_SYSTEM_PREFIX,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_system_get,
 	.set	= btrfs_xattr_system_set,
 };
 
 struct xattr_handler btrfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_user_get,
 	.set	= btrfs_xattr_user_set,
 };
 
 struct xattr_handler btrfs_xattr_trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
-	.list	= btrfs_xattr_generic_list,
 	.get	= btrfs_xattr_trusted_get,
 	.set	= btrfs_xattr_trusted_set,
 };
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b2e47e3f244..825e55bd496 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -47,12 +47,4 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index, const char *name,
 int btrfs_xattr_set(struct inode *inode, int name_index, const char *name,
 			const void *value, size_t size, int flags);
 
-/*
- * the only reason this is public is for acl.c.  There may be a point where
- * acl.c doesn't need it, and if thats the case we need to remove it and make
- * it static in xattr.c
- */
-size_t btrfs_xattr_generic_list(struct inode *inode, char *list,
-				size_t list_size, const char *name,
-				size_t name_len);
 #endif /* __XATTR__ */
-- 
cgit v1.2.3


From 95819c05732c511338b43c115ffbcee978c02888 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Aug 2008 06:21:17 -0400
Subject: Btrfs: optimize btrget/set/removexattr

btrfs actually stores the whole xattr name, including the prefix ondisk,
so using the generic resolver that strips off the prefix is not very
helpful.  Instead do the real ondisk xattrs manually and only use the
generic resolver for synthetic xattrs like ACLs.

(Sorry Josef for guiding you in the wrong direction here initially)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   |  20 +++---
 fs/btrfs/inode.c |  19 +++---
 fs/btrfs/xattr.c | 204 ++++++++++++++++++++-----------------------------------
 fs/btrfs/xattr.h |  31 +++------
 4 files changed, 104 insertions(+), 170 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2f865311460..867eaf1f8ef 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -42,17 +42,18 @@ static void btrfs_update_cached_acl(struct inode *inode,
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
-	int size, name_index;
+	int size;
+	const char *name;
 	char *value = NULL;
 	struct posix_acl *acl = NULL, **p_acl;
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		name = POSIX_ACL_XATTR_ACCESS;
 		p_acl = &BTRFS_I(inode)->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
-		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		name = POSIX_ACL_XATTR_DEFAULT;
 		p_acl = &BTRFS_I(inode)->i_default_acl;
 		break;
 	default:
@@ -68,12 +69,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		return acl;
 
 
-	size = btrfs_xattr_get(inode, name_index, "", NULL, 0);
+	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
 		value = kzalloc(size, GFP_NOFS);
 		if (!value)
 			return ERR_PTR(-ENOMEM);
-		size = btrfs_xattr_get(inode, name_index, "", value, size);
+		size = __btrfs_getxattr(inode, name, value, size);
 		if (size > 0) {
 			acl = posix_acl_from_xattr(value, size);
 			btrfs_update_cached_acl(inode, p_acl, acl);
@@ -110,7 +111,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type,
  */
 static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
-	int ret, name_index = 0, size = 0;
+	int ret, size = 0;
+	const char *name;
 	struct posix_acl **p_acl;
 	char *value = NULL;
 	mode_t mode;
@@ -130,13 +132,13 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 			return ret;
 		ret = 0;
 		inode->i_mode = mode;
-		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS;
+		name = POSIX_ACL_XATTR_ACCESS;
 		p_acl = &BTRFS_I(inode)->i_acl;
 		break;
 	case ACL_TYPE_DEFAULT:
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EINVAL : 0;
-		name_index = BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		name = POSIX_ACL_XATTR_DEFAULT;
 		p_acl = &BTRFS_I(inode)->i_default_acl;
 		break;
 	default:
@@ -156,7 +158,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 			goto out;
 	}
 
-	ret = btrfs_xattr_set(inode, name_index, "", value, size, 0);
+	ret = __btrfs_setxattr(inode, name, value, size, 0);
 
 out:
 	if (value)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 10f26f44532..43d3f2649ca 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,6 +45,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "ordered-data.h"
+#include "xattr.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -3667,10 +3668,10 @@ static struct inode_operations btrfs_dir_inode_operations = {
 	.symlink	= btrfs_symlink,
 	.setattr	= btrfs_setattr,
 	.mknod		= btrfs_mknod,
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
-	.removexattr	= generic_removexattr,
+	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 };
 static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -3728,20 +3729,20 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.truncate	= btrfs_truncate,
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
 	.listxattr      = btrfs_listxattr,
-	.removexattr	= generic_removexattr,
+	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
-	.setxattr	= generic_setxattr,
-	.getxattr	= generic_getxattr,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
-	.removexattr	= generic_removexattr,
+	.removexattr	= btrfs_removexattr,
 };
 static struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index fdfece41dd1..adb4b32a9d5 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -27,91 +27,20 @@
 #include "xattr.h"
 #include "disk-io.h"
 
-static struct xattr_handler *btrfs_xattr_handler_map[] = {
-	[BTRFS_XATTR_INDEX_USER]		= &btrfs_xattr_user_handler,
-#ifdef CONFIG_FS_POSIX_ACL
-	[BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS]	= &btrfs_xattr_acl_access_handler,
-	[BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT]	= &btrfs_xattr_acl_default_handler,
-#endif
-	[BTRFS_XATTR_INDEX_TRUSTED]		= &btrfs_xattr_trusted_handler,
-	[BTRFS_XATTR_INDEX_SECURITY]		= &btrfs_xattr_security_handler,
-	[BTRFS_XATTR_INDEX_SYSTEM]		= &btrfs_xattr_system_handler,
-};
-
-struct xattr_handler *btrfs_xattr_handlers[] = {
-	&btrfs_xattr_user_handler,
-#ifdef CONFIG_FS_POSIX_ACL
-	&btrfs_xattr_acl_access_handler,
-	&btrfs_xattr_acl_default_handler,
-#endif
-	&btrfs_xattr_trusted_handler,
-	&btrfs_xattr_security_handler,
-	&btrfs_xattr_system_handler,
-	NULL,
-};
-
-/*
- * @param name_index - the index for the xattr handler
- * @return the xattr_handler if we found it, NULL otherwise
- *
- * use this if we know the type of the xattr already
- */
-static struct xattr_handler *btrfs_xattr_handler(int name_index)
-{
-	struct xattr_handler *handler = NULL;
-
-	if (name_index >= 0 &&
-	    name_index < ARRAY_SIZE(btrfs_xattr_handler_map))
-		handler = btrfs_xattr_handler_map[name_index];
-
-	return handler;
-}
-
-static inline char *get_name(const char *name, int name_index)
-{
-	char *ret = NULL;
-	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
-	int prefix_len;
-
-	if (!handler)
-		return ret;
-
-	prefix_len = strlen(handler->prefix);
-
-	ret = kmalloc(strlen(name) + prefix_len + 1, GFP_KERNEL);
-	if (!ret)
-		return ret;
-
-	memcpy(ret, handler->prefix, prefix_len);
-	memcpy(ret+prefix_len, name, strlen(name));
-	ret[prefix_len + strlen(name)] = '\0';
-
-	return ret;
-}
 
-ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
-			const char *attr_name, void *buffer, size_t size)
+ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+				void *buffer, size_t size)
 {
 	struct btrfs_dir_item *di;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
 	int ret = 0;
 	unsigned long data_ptr;
-	char *name;
-
-	if (!handler)
-		return -EOPNOTSUPP;
-	name = get_name(attr_name, name_index);
-	if (!name)
-		return -ENOMEM;
 
 	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(name);
+	if (!path)
 		return -ENOMEM;
-	}
 
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
@@ -140,33 +69,22 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 	ret = btrfs_dir_data_len(leaf, di);
 
 out:
-	kfree(name);
 	btrfs_free_path(path);
 	return ret;
 }
 
-int btrfs_xattr_set(struct inode *inode, int name_index,
-		    const char *attr_name, const void *value, size_t size,
-		    int flags)
+int __btrfs_setxattr(struct inode *inode, const char *name,
+			    const void *value, size_t size, int flags)
 {
 	struct btrfs_dir_item *di;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
-	struct xattr_handler *handler = btrfs_xattr_handler(name_index);
-	char *name;
 	int ret = 0, mod = 0;
-	if (!handler)
-		return -EOPNOTSUPP;
-	name = get_name(attr_name, name_index);
-	if (!name)
-		return -ENOMEM;
 
 	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(name);
+	if (!path)
 		return -ENOMEM;
-	}
 
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
@@ -221,9 +139,7 @@ out:
 	}
 
 	btrfs_end_transaction(trans, root);
-	kfree(name);
 	btrfs_free_path(path);
-
 	return ret;
 }
 
@@ -329,51 +245,77 @@ err:
 }
 
 /*
- * Handler functions
+ * List of handlers for synthetic system.* attributes.  All real ondisk
+ * attributes are handled directly.
+ */
+struct xattr_handler *btrfs_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
+	&btrfs_xattr_acl_access_handler,
+	&btrfs_xattr_acl_default_handler,
+#endif
+	NULL,
+};
+
+/*
+ * Check if the attribute is in a supported namespace.
+ *
+ * This is applied after the check for the synthetic attributes in the system
+ * namespace.
  */
-#define BTRFS_XATTR_SETGET_FUNCS(name, index)				\
-static int btrfs_xattr_##name##_get(struct inode *inode,		\
-				    const char *name, void *value,	\
-				    size_t size)			\
-{									\
-	if (*name == '\0')						\
-		return -EINVAL;						\
-	return btrfs_xattr_get(inode, index, name, value, size);	\
-}									\
-static int btrfs_xattr_##name##_set(struct inode *inode,		\
-				    const char *name, const void *value,\
-				    size_t size, int flags)		\
-{									\
-	if (*name == '\0')						\
-		return -EINVAL;						\
-	return btrfs_xattr_set(inode, index, name, value, size, flags);	\
+static bool btrfs_is_valid_xattr(const char *name)
+{
+	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
-BTRFS_XATTR_SETGET_FUNCS(security, BTRFS_XATTR_INDEX_SECURITY);
-BTRFS_XATTR_SETGET_FUNCS(system, BTRFS_XATTR_INDEX_SYSTEM);
-BTRFS_XATTR_SETGET_FUNCS(user, BTRFS_XATTR_INDEX_USER);
-BTRFS_XATTR_SETGET_FUNCS(trusted, BTRFS_XATTR_INDEX_TRUSTED);
+ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+		       void *buffer, size_t size)
+{
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace, use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_getxattr(dentry, name, buffer, size);
 
-struct xattr_handler btrfs_xattr_security_handler = {
-	.prefix = XATTR_SECURITY_PREFIX,
-	.get	= btrfs_xattr_security_get,
-	.set	= btrfs_xattr_security_set,
-};
+	if (!btrfs_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+	return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
+}
 
-struct xattr_handler btrfs_xattr_system_handler = {
-	.prefix = XATTR_SYSTEM_PREFIX,
-	.get	= btrfs_xattr_system_get,
-	.set	= btrfs_xattr_system_set,
-};
+int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		   size_t size, int flags)
+{
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace, use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_setxattr(dentry, name, value, size, flags);
 
-struct xattr_handler btrfs_xattr_user_handler = {
-	.prefix	= XATTR_USER_PREFIX,
-	.get	= btrfs_xattr_user_get,
-	.set	= btrfs_xattr_user_set,
-};
+	if (!btrfs_is_valid_xattr(name))
+		return -EOPNOTSUPP;
 
-struct xattr_handler btrfs_xattr_trusted_handler = {
-	.prefix = XATTR_TRUSTED_PREFIX,
-	.get	= btrfs_xattr_trusted_get,
-	.set	= btrfs_xattr_trusted_set,
-};
+	if (size == 0)
+		value = "";  /* empty EA, do not remove */
+	return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+}
+
+int btrfs_removexattr(struct dentry *dentry, const char *name)
+{
+	/*
+	 * If this is a request for a synthetic attribute in the system.*
+	 * namespace, use the generic infrastructure to resolve a handler
+	 * for it via sb->s_xattr.
+	 */
+	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+		return generic_removexattr(dentry, name);
+
+	if (!btrfs_is_valid_xattr(name))
+		return -EOPNOTSUPP;
+	return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 825e55bd496..5b1d08f8e68 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -20,31 +20,20 @@
 #define __XATTR__
 
 #include <linux/xattr.h>
-#include "ctree.h"
 
-/* Name indexes */
-enum {
-	BTRFS_XATTR_INDEX_USER,
-	BTRFS_XATTR_INDEX_POSIX_ACL_ACCESS,
-	BTRFS_XATTR_INDEX_POSIX_ACL_DEFAULT,
-	BTRFS_XATTR_INDEX_TRUSTED,
-	BTRFS_XATTR_INDEX_SECURITY,
-	BTRFS_XATTR_INDEX_SYSTEM,
-	BTRFS_XATTR_INDEX_END,
-};
-
-extern struct xattr_handler btrfs_xattr_user_handler;
-extern struct xattr_handler btrfs_xattr_trusted_handler;
 extern struct xattr_handler btrfs_xattr_acl_access_handler;
 extern struct xattr_handler btrfs_xattr_acl_default_handler;
-extern struct xattr_handler btrfs_xattr_security_handler;
-extern struct xattr_handler btrfs_xattr_system_handler;
-
 extern struct xattr_handler *btrfs_xattr_handlers[];
 
-ssize_t btrfs_xattr_get(struct inode *inode, int name_index, const char *name,
-			void *buffer, size_t size);
-int btrfs_xattr_set(struct inode *inode, int name_index, const char *name,
-			const void *value, size_t size, int flags);
+extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+		void *buffer, size_t size);
+extern int __btrfs_setxattr(struct inode *inode, const char *name,
+		const void *value, size_t size, int flags);
+
+extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+		void *buffer, size_t size);
+extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+		const void *value, size_t size, int flags);
+extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
 #endif /* __XATTR__ */
-- 
cgit v1.2.3


From a1b32a5932cfac7c38b442582285f3da2a09dfd8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:09:51 -0400
Subject: Btrfs: Add debugging checks to track down corrupted metadata

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  7 ++++++-
 fs/btrfs/extent_io.c | 21 ++++++++++++++-------
 fs/btrfs/file.c      | 15 ++++++++-------
 fs/btrfs/volumes.c   | 40 +++++++++++++++++++++-------------------
 4 files changed, 49 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a218f792e5..8e7a938bfbc 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -250,7 +250,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
-
+printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -348,6 +348,9 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
+		printk("bad tree block start %llu %llu\n",
+		       (unsigned long long)found_start,
+		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -709,6 +712,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
+	} else {
+		WARN_ON(1);
 	}
 	return buf;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 83ba0c32872..7ca89c45d40 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1811,6 +1811,7 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		/* the get_extent function already copied into the page */
 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			check_page_uptodate(tree, page);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
 			cur = cur + iosize;
 			page_offset += iosize;
@@ -2785,21 +2786,20 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		 * properly set.  releasepage may drop page->private
 		 * on us if the page isn't already dirty.
 		 */
+		lock_page(page);
 		if (i == 0) {
-			lock_page(page);
 			set_page_extent_head(page, eb->len);
 		} else if (PagePrivate(page) &&
 			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
 			set_page_extent_mapped(page);
-			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
+		set_extent_dirty(tree, page_offset(page),
+				 page_offset(page) + PAGE_CACHE_SIZE -1,
+				 GFP_NOFS);
+		unlock_page(page);
 	}
-	return set_extent_dirty(tree, eb->start,
-				eb->start + eb->len - 1, GFP_NOFS);
+	return 0;
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
@@ -2952,6 +2952,9 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (all_uptodate) {
 		if (start_i == 0)
 			eb->flags |= EXTENT_UPTODATE;
+		if (ret) {
+			printk("all up to date but ret is %d\n", ret);
+		}
 		goto unlock_exit;
 	}
 
@@ -2968,6 +2971,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 						      mirror_num);
 			if (err) {
 				ret = err;
+				printk("err %d from __extent_read_full_page\n", ret);
 			}
 		} else {
 			unlock_page(page);
@@ -2978,12 +2982,15 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		submit_one_bio(READ, bio, mirror_num);
 
 	if (ret || !wait) {
+		if (ret)
+			printk("ret %d wait %d returning\n", ret, wait);
 		return ret;
 	}
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
 		if (!PageUptodate(page)) {
+			printk("page not uptodate after wait_on_page_locked\n");
 			ret = -EIO;
 		}
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eb8e4556fa7..e9e86fbaa24 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,9 +39,10 @@
 #include "compat.h"
 
 
-static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
-				struct page **prepared_pages,
-				const char __user * buf)
+static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
+					 int write_bytes,
+					 struct page **prepared_pages,
+					 const char __user * buf)
 {
 	long page_fault = 0;
 	int i;
@@ -69,7 +70,7 @@ static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
 	return page_fault ? -EFAULT : 0;
 }
 
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -359,7 +360,7 @@ out_unlock:
 	return err;
 }
 
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
@@ -515,7 +516,7 @@ out:
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
@@ -785,7 +786,7 @@ out:
 /*
  * this gets pages into the page cache and locks them down
  */
-static int prepare_pages(struct btrfs_root *root, struct file *file,
+static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5b1b60839d2..37a8ea23e81 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -94,8 +94,8 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
-					  u8 *uuid)
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
@@ -110,7 +110,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	return NULL;
 }
 
-static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
 	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
@@ -134,7 +134,7 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-int run_scheduled_bios(struct btrfs_device *device)
+static int noinline run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -233,7 +233,7 @@ void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
-static int device_list_add(const char *path,
+static noinline int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 {
@@ -480,10 +480,10 @@ error:
  * called very infrequently and that a given device has a small number
  * of extents
  */
-static int find_free_dev_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_device *device,
-				struct btrfs_path *path,
-				u64 num_bytes, u64 *start)
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_device *device,
+					 struct btrfs_path *path,
+					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -645,7 +645,7 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
 			   u64 chunk_offset,
@@ -693,7 +693,8 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
+static noinline int find_next_chunk(struct btrfs_root *root,
+				    u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -735,8 +736,8 @@ error:
 	return ret;
 }
 
-static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
-			   u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root,
+				    struct btrfs_path *path, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1103,8 +1104,8 @@ out_close_bdev:
 	goto out;
 }
 
-int btrfs_update_device(struct btrfs_trans_handle *trans,
-			struct btrfs_device *device)
+int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+				 struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1544,8 +1545,8 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
-			       int sub_stripes)
+static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		return calc_size;
@@ -2141,8 +2142,9 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
-		 int rw, struct bio *bio)
+static int noinline schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 
-- 
cgit v1.2.3


From e02119d5a7b4396c5a872582fddc8bd6d305a70a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:13:11 -0400
Subject: Btrfs: Add a write ahead tree log to optimize synchronous operations

File syncs and directory syncs are optimized by copying their
items into a special (copy-on-write) log tree.  There is one log tree per
subvolume and the btrfs super block points to a tree of log tree roots.

After a crash, items are copied out of the log tree and back into the
subvolume.  See tree-log.c for all the details.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    3 +-
 fs/btrfs/btrfs_inode.h |    8 +
 fs/btrfs/compat.h      |   15 +
 fs/btrfs/ctree.c       |   73 +-
 fs/btrfs/ctree.h       |   72 +-
 fs/btrfs/dir-item.c    |    3 +
 fs/btrfs/disk-io.c     |  138 ++-
 fs/btrfs/disk-io.h     |    8 +-
 fs/btrfs/extent-tree.c |   93 +-
 fs/btrfs/file.c        |   39 +-
 fs/btrfs/inode.c       |  261 ++---
 fs/btrfs/root-tree.c   |    5 +-
 fs/btrfs/transaction.c |   45 +-
 fs/btrfs/transaction.h |    1 +
 fs/btrfs/tree-defrag.c |    4 +-
 fs/btrfs/tree-log.c    | 2804 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/tree-log.h    |   41 +
 17 files changed, 3408 insertions(+), 205 deletions(-)
 create mode 100644 fs/btrfs/tree-log.c
 create mode 100644 fs/btrfs/tree-log.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3a01065d4ef..b7addbfd8c2 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,8 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o acl.o export.o
-
+	   ref-cache.o export.o tree-log.o acl.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 111f90524ae..fcc8cf27e90 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -33,6 +33,7 @@ struct btrfs_inode {
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
 	struct mutex extent_mutex;
+	struct mutex log_mutex;
 	struct inode vfs_inode;
 	struct btrfs_ordered_inode_tree ordered_tree;
 
@@ -44,10 +45,17 @@ struct btrfs_inode {
 
 	struct list_head delalloc_inodes;
 
+	/* full 64 bit generation number */
+	u64 generation;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+	/*
+	 * transid that last logged this inode
+	 */
+	u64 logged_trans;
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index d45fb37887b..b0ed1887d9b 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -22,6 +22,21 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+static inline void btrfs_drop_nlink(struct inode *inode)
+{
+	inode->i_nlink--;
+}
+
+static inline void btrfs_inc_nlink(struct inode *inode)
+{
+	inode->i_nlink++;
+}
+#else
+# define btrfs_drop_nlink(inode) drop_nlink(inode)
+# define btrfs_inc_nlink(inode)	inc_nlink(inode)
+#endif
+
 /*
  * Even if AppArmor isn't enabled, it still has different prototypes.
  * Add more distro/version pairs here to declare which has AppArmor applied.
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7114faafa9d..579124043d9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -60,7 +60,7 @@ void btrfs_free_path(struct btrfs_path *p)
 	kmem_cache_free(btrfs_path_cachep, p);
 }
 
-void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 
@@ -176,7 +176,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
@@ -294,7 +294,7 @@ int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_cow_block(struct btrfs_trans_handle *trans,
+int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
@@ -677,9 +677,10 @@ static int noinline check_block(struct btrfs_root *root,
  *
  * slot may point to max if the key is bigger than all of the keys
  */
-static int generic_bin_search(struct extent_buffer *eb, unsigned long p,
-			      int item_size, struct btrfs_key *key,
-			      int max, int *slot)
+static noinline int generic_bin_search(struct extent_buffer *eb,
+				       unsigned long p,
+				       int item_size, struct btrfs_key *key,
+				       int max, int *slot)
 {
 	int low = 0;
 	int high = max;
@@ -765,7 +766,7 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
-static struct extent_buffer *read_node_slot(struct btrfs_root *root,
+static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
 	int level = btrfs_header_level(parent);
@@ -781,7 +782,7 @@ static struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		       btrfs_node_ptr_generation(parent, slot));
 }
 
-static int balance_level(struct btrfs_trans_handle *trans,
+static noinline int balance_level(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 struct btrfs_path *path, int level)
 {
@@ -1128,8 +1129,9 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 /*
  * readahead one full node of leaves
  */
-static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
-			     int level, int slot, u64 objectid)
+static noinline void reada_for_search(struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      int level, int slot, u64 objectid)
 {
 	struct extent_buffer *node;
 	struct btrfs_disk_key disk_key;
@@ -1201,7 +1203,8 @@ static void reada_for_search(struct btrfs_root *root, struct btrfs_path *path,
 	}
 }
 
-static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
+static noinline void unlock_up(struct btrfs_path *path, int level,
+			       int lowest_unlock)
 {
 	int i;
 	int skip_level = level;
@@ -1759,8 +1762,9 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
  *
  * returns 0 on success and < 0 on failure
  */
-static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, int level)
+static noinline int split_node(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_path *path, int level)
 {
 	u64 root_gen;
 	struct extent_buffer *c;
@@ -1874,7 +1878,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf)
+int noinline btrfs_leaf_free_space(struct btrfs_root *root,
+				   struct extent_buffer *leaf)
 {
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
@@ -2283,9 +2288,11 @@ out:
  *
  * returns 0 if all went well and < 0 on failure.
  */
-static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_key *ins_key,
-		      struct btrfs_path *path, int data_size, int extend)
+static noinline int split_leaf(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_key *ins_key,
+			       struct btrfs_path *path, int data_size,
+			       int extend)
 {
 	u64 root_gen;
 	struct extent_buffer *l;
@@ -3079,6 +3086,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * was nothing in the tree that matched the search criteria.
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_key *max_key,
 			 struct btrfs_path *path, int cache_only,
 			 u64 min_trans)
 {
@@ -3093,6 +3101,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 again:
 	cur = btrfs_lock_root_node(root);
 	level = btrfs_header_level(cur);
+	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
 	path->locks[level] = 1;
 
@@ -3107,6 +3116,8 @@ again:
 
 		/* at level = 0, we're done, setup the path and exit */
 		if (level == 0) {
+			if (slot >= nritems)
+				goto find_next_key;
 			ret = 0;
 			path->slots[level] = slot;
 			btrfs_item_key_to_cpu(cur, &found_key, slot);
@@ -3123,6 +3134,8 @@ again:
 			u64 blockptr;
 			u64 gen;
 			struct extent_buffer *tmp;
+			struct btrfs_disk_key disk_key;
+
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
 			if (gen < min_trans) {
@@ -3132,6 +3145,14 @@ again:
 			if (!cache_only)
 				break;
 
+			if (max_key) {
+				btrfs_node_key(cur, &disk_key, slot);
+				if (comp_keys(&disk_key, max_key) >= 0) {
+					ret = 1;
+					goto out;
+				}
+			}
+
 			tmp = btrfs_find_tree_block(root, blockptr,
 					    btrfs_level_size(root, level - 1));
 
@@ -3143,14 +3164,16 @@ again:
 				free_extent_buffer(tmp);
 			slot++;
 		}
+find_next_key:
 		/*
 		 * we didn't find a candidate key in this node, walk forward
 		 * and find another one
 		 */
 		if (slot >= nritems) {
-			ret = btrfs_find_next_key(root, path, min_key, level,
+			path->slots[level] = slot;
+			sret = btrfs_find_next_key(root, path, min_key, level,
 						  cache_only, min_trans);
-			if (ret == 0) {
+			if (sret == 0) {
 				btrfs_release_path(root, path);
 				goto again;
 			} else {
@@ -3351,6 +3374,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 {
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	u32 nritems;
 	int ret;
 
 	while(1) {
@@ -3362,9 +3386,20 @@ int btrfs_previous_item(struct btrfs_root *root,
 			path->slots[0]--;
 		}
 		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (nritems == 0)
+			return 1;
+		if (path->slots[0] == nritems)
+			path->slots[0]--;
+
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.type == type)
 			return 0;
+		if (found_key.objectid < min_objectid)
+			break;
+		if (found_key.objectid == min_objectid &&
+		    found_key.type < type)
+			break;
 	}
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b305ae7e10b..6532b60683e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -77,6 +77,10 @@ struct btrfs_ordered_sum;
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
+/* does write ahead logging to speed up fsyncs */
+#define BTRFS_TREE_LOG_OBJECTID -6ULL
+#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
+
 /*
  * All files have objectids higher than this.
  */
@@ -276,6 +280,7 @@ struct btrfs_super_block {
 	__le64 generation;
 	__le64 root;
 	__le64 chunk_root;
+	__le64 log_root;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -287,6 +292,7 @@ struct btrfs_super_block {
 	__le32 sys_chunk_array_size;
 	u8 root_level;
 	u8 chunk_root_level;
+	u8 log_root_level;
 	struct btrfs_dev_item dev_item;
 	char label[BTRFS_LABEL_SIZE];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
@@ -392,7 +398,10 @@ struct btrfs_timespec {
  * make a new item type
  */
 struct btrfs_inode_item {
+	/* nfs style generation number */
 	__le64 generation;
+	/* transid that last touched this inode */
+	__le64 transid;
 	__le64 size;
 	__le64 nblocks;
 	__le64 block_group;
@@ -409,8 +418,13 @@ struct btrfs_inode_item {
 	struct btrfs_timespec otime;
 } __attribute__ ((__packed__));
 
+struct btrfs_dir_log_item {
+	__le64 end;
+} __attribute__ ((__packed__));
+
 struct btrfs_dir_item {
 	struct btrfs_disk_key location;
+	__le64 transid;
 	__le16 data_len;
 	__le16 name_len;
 	u8 type;
@@ -505,6 +519,9 @@ struct btrfs_fs_info {
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
+
+	/* the log root tree is a directory of all the other log roots */
+	struct btrfs_root *log_root_tree;
 	struct radix_tree_root fs_roots_radix;
 
 	struct extent_io_tree free_space_cache;
@@ -518,6 +535,7 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	u64 last_trans_new_blockgroup;
 	u64 open_ioctl_trans;
 	unsigned long mount_opt;
 	u64 max_extent;
@@ -527,6 +545,9 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
 	wait_queue_head_t async_submit_wait;
+
+	wait_queue_head_t tree_log_wait;
+
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
@@ -535,6 +556,7 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
+	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
 	struct mutex alloc_mutex;
@@ -544,8 +566,13 @@ struct btrfs_fs_info {
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
+
 	atomic_t nr_async_submits;
 	atomic_t nr_async_bios;
+	atomic_t tree_log_writers;
+	atomic_t tree_log_commit;
+	unsigned long tree_log_batch;
+	u64 tree_log_transid;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -583,6 +610,7 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
+	int log_root_recovering;
 	atomic_t throttles;
 	atomic_t throttle_gen;
 
@@ -596,6 +624,7 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
+	u64 last_log_alloc;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
@@ -632,6 +661,7 @@ struct btrfs_root {
 	struct btrfs_leaf_ref_tree *ref_tree;
 	struct btrfs_leaf_ref_tree ref_tree_struct;
 	struct btrfs_dirty_root *dirty_root;
+	struct btrfs_root *log_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -640,6 +670,7 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+	struct mutex log_mutex;
 
 	u64 objectid;
 	u64 last_trans;
@@ -692,6 +723,8 @@ struct btrfs_root {
  * dir items are the name -> inode pointers in a directory.  There is one
  * for every name in a directory.
  */
+#define BTRFS_DIR_LOG_ITEM_KEY  14
+#define BTRFS_DIR_LOG_INDEX_KEY 15
 #define BTRFS_DIR_ITEM_KEY	16
 #define BTRFS_DIR_INDEX_KEY	17
 /*
@@ -703,7 +736,8 @@ struct btrfs_root {
  */
 #define BTRFS_CSUM_ITEM_KEY	19
 
-/* reserve 20-31 for other file stuff */
+
+/* reserve 21-31 for other file/dir stuff */
 
 /*
  * root items point to tree roots.  There are typically in the root
@@ -938,6 +972,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
 BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
@@ -1126,10 +1161,13 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb,
 	write_eb_member(eb, item, struct btrfs_item, key, disk_key);
 }
 
+BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
+
 /* struct btrfs_dir_item */
 BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
+BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
 
 static inline void btrfs_dir_item_key(struct extent_buffer *eb,
 				      struct btrfs_dir_item *item,
@@ -1301,7 +1339,11 @@ BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
 			 chunk_root, 64);
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
-			 chunk_root_level, 64);
+			 chunk_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
+			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
+			 log_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -1405,6 +1447,12 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+			u64 start, u64 len);
+int btrfs_update_pinned_extents(struct btrfs_root *root,
+				u64 bytenr, u64 num, int pin);
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key, u64 bytenr);
@@ -1448,6 +1496,11 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1488,9 +1541,9 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
 			int cache_only, u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
+			 struct btrfs_key *max_key,
 			 struct btrfs_path *path, int cache_only,
 			 u64 min_trans);
-
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -1656,6 +1709,18 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 #define PageChecked PageFsMisc
 #endif
 
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len);
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index);
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct inode *inode, u64 new_size,
+			       u32 min_type);
+
 int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
@@ -1715,6 +1780,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* file.c */
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 125094617fe..e4f30090d64 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -96,6 +96,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
 	btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	btrfs_set_dir_data_len(leaf, dir_item, data_len);
 	name_ptr = (unsigned long)(dir_item + 1);
 	data_ptr = (unsigned long)((char *)name_ptr + name_len);
@@ -142,6 +143,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_set_dir_type(leaf, dir_item, type);
 	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	name_ptr = (unsigned long)(dir_item + 1);
 
 	write_extent_buffer(leaf, name, name_ptr, name_len);
@@ -169,6 +171,7 @@ second_insert:
 	btrfs_set_dir_type(leaf, dir_item, type);
 	btrfs_set_dir_data_len(leaf, dir_item, 0);
 	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
 	name_ptr = (unsigned long)(dir_item + 1);
 	write_extent_buffer(leaf, name, name_ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e7a938bfbc..a4373db5967 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
 #include "async-thread.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "tree-log.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -694,6 +695,18 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 }
 
 
+int btrfs_write_tree_block(struct extent_buffer *buf)
+{
+	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
+				      buf->start + buf->len - 1, WB_SYNC_NONE);
+}
+
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+{
+	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
+				  buf->start, buf->start + buf->len -1);
+}
+
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid)
 {
@@ -732,15 +745,6 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return 0;
 }
 
-int wait_on_tree_block_writeback(struct btrfs_root *root,
-				 struct extent_buffer *buf)
-{
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	wait_on_extent_buffer_writeback(&BTRFS_I(btree_inode)->io_tree,
-					buf);
-	return 0;
-}
-
 static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			u32 stripesize, struct btrfs_root *root,
 			struct btrfs_fs_info *fs_info,
@@ -771,6 +775,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
+	mutex_init(&root->log_mutex);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
 	root->ref_tree = &root->ref_tree_struct;
@@ -809,11 +814,95 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
-					       struct btrfs_key *location)
+/*
+ * Free the extent holding the log root tree's root block, drop the
+ * buffer and the in-memory root, and clear fs_info->log_root_tree.
+ * Nothing to do (returns 0) when no log root tree is set up.  The root
+ * block is expected to be an empty leaf; anything else only warns.
+ */
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct extent_buffer *eb;
+	int ret;
+
+	if (!fs_info->log_root_tree)
+		return 0;
+
+	eb = fs_info->log_root_tree->node;
+
+	WARN_ON(btrfs_header_level(eb) != 0);
+	WARN_ON(btrfs_header_nritems(eb) != 0);
+
+	ret = btrfs_free_extent(trans, fs_info->tree_root,
+				eb->start, eb->len,
+				BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1);
+	BUG_ON(ret);
+
+	free_extent_buffer(eb);
+	kfree(fs_info->log_root_tree);
+	fs_info->log_root_tree = NULL;
+	return 0;
+}
+
+/*
+ * Allocate the in-memory log root tree plus an empty leaf for its root
+ * block and install it as fs_info->log_root_tree.
+ *
+ * Returns 0 on success, -ENOMEM when the root cannot be allocated, or the
+ * error encoded in btrfs_alloc_free_block()'s return value when no block
+ * could be allocated (the new root is freed again in that case).
+ */
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	int ret;
+
+	root = kzalloc(sizeof(*root), GFP_NOFS);
+	if (!root)
+		return -ENOMEM;
+
+	__setup_root(tree_root->nodesize, tree_root->leafsize,
+		     tree_root->sectorsize, tree_root->stripesize,
+		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+	root->ref_cows = 0;
+
+	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
+					    BTRFS_TREE_LOG_OBJECTID,
+					    0, 0, 0, 0, 0);
+	if (IS_ERR(root->node)) {
+		/* don't dereference an ERR_PTR or leak the new root */
+		ret = PTR_ERR(root->node);
+		kfree(root);
+		return ret;
+	}
+
+	btrfs_set_header_nritems(root->node, 0);
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_bytenr(root->node, root->node->start);
+	btrfs_set_header_generation(root->node, trans->transid);
+	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
+
+	write_extent_buffer(root->node, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(root->node),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(root->node);
+	btrfs_tree_unlock(root->node);
+	fs_info->log_root_tree = root;
+	return 0;
+}
+
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
+					       struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *l;
 	u64 highest_inode;
@@ -863,11 +931,13 @@ out:
 				     blocksize, 0);
 	BUG_ON(!root->node);
 insert:
-	root->ref_cows = 1;
-	ret = btrfs_find_highest_inode(root, &highest_inode);
-	if (ret == 0) {
-		root->highest_inode = highest_inode;
-		root->last_inode_alloc = highest_inode;
+	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+		root->ref_cows = 1;
+		ret = btrfs_find_highest_inode(root, &highest_inode);
+		if (ret == 0) {
+			root->highest_inode = highest_inode;
+			root->last_inode_alloc = highest_inode;
+		}
 	}
 	return root;
 }
@@ -907,7 +977,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	if (root)
 		return root;
 
-	root = btrfs_read_fs_root_no_radix(fs_info, location);
+	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
@@ -1250,16 +1320,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 blocksize;
 	u32 stripesize;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
-	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
 						GFP_NOFS);
-	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
 						GFP_NOFS);
-	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
+	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
 					      GFP_NOFS);
+	struct btrfs_root *log_tree_root;
+
 	int ret;
 	int err = -EINVAL;
 
@@ -1343,6 +1415,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
+	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
@@ -1352,6 +1425,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
+	init_waitqueue_head(&fs_info->tree_log_wait);
+	atomic_set(&fs_info->tree_log_commit, 0);
+	atomic_set(&fs_info->tree_log_writers, 0);
+	fs_info->tree_log_transid = 0;
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1532,7 +1609,26 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!fs_info->transaction_kthread)
 		goto fail_cleaner;
 
+	if (btrfs_super_log_root(disk_super) != 0) {
+		u32 blocksize;
+		u64 bytenr = btrfs_super_log_root(disk_super);
+
+		blocksize =
+		     btrfs_level_size(tree_root,
+				      btrfs_super_log_root_level(disk_super));
 
+		log_tree_root = kzalloc(sizeof(struct btrfs_root),
+						      GFP_NOFS);
+
+		__setup_root(nodesize, leafsize, sectorsize, stripesize,
+			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+
+		log_tree_root->node = read_tree_block(tree_root, bytenr,
+						      blocksize, 0);
+		ret = btrfs_recover_log_trees(log_tree_root);
+		BUG_ON(ret);
+	}
+	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
 fail_cleaner:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 2562a273ae1..6b6fdc697f3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -45,7 +45,7 @@ struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_fs_info *fs_info,
+struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 					       struct btrfs_key *location);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location);
@@ -74,4 +74,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+int btrfs_write_tree_block(struct extent_buffer *buf);
+int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e63b3b4bed7..646b9148ca2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -496,6 +496,23 @@ static int match_extent_ref(struct extent_buffer *leaf,
 	return ret == 0;
 }
 
+/* helper: search the extent root for the extent item keyed (start, len) */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+			u64 start, u64 len)
+{
+	int ret;
+	struct btrfs_key key;
+
+	maybe_lock_mutex(root);
+	key.objectid = start;
+	key.offset = len;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+				0, 0);
+	maybe_unlock_mutex(root);
+	return ret;	/* btrfs_search_slot() result, passed through as is */
+}
+
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 bytenr,
@@ -1409,7 +1426,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 }
 
 
-static int update_pinned_extents(struct btrfs_root *root,
+int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
 	u64 len;
@@ -1492,7 +1509,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		update_pinned_extents(root, start, end + 1 - start, 0);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 		if (need_resched()) {
@@ -1538,14 +1555,11 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
 
-		eb = btrfs_find_tree_block(extent_root, ins.objectid,
+		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
 					   ins.offset);
 
-		if (!btrfs_buffer_uptodate(eb, trans->transid)) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+		if (!btrfs_buffer_uptodate(eb, trans->transid))
 			btrfs_read_buffer(eb, trans->transid);
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
 
 		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
@@ -1585,13 +1599,20 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
+			/* we can reuse a block if it hasn't been written
+			 * and it is from this transaction.  We can't
+			 * reuse anything from the tree log root because
+			 * it has tiny sub-transactions.
+			 */
 			if (btrfs_buffer_uptodate(buf, 0) &&
 			    btrfs_try_tree_lock(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
 					btrfs_header_generation(buf);
-				if (header_transid == transid &&
+				if (btrfs_header_owner(buf) !=
+				    BTRFS_TREE_LOG_OBJECTID &&
+				    header_transid == transid &&
 				    !btrfs_header_flag(buf,
 					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
@@ -1603,7 +1624,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
-		update_pinned_extents(root, bytenr, num_bytes, 1);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
@@ -1801,7 +1822,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 				  GFP_NOFS);
 		if (!test_range_bit(&extent_root->fs_info->extent_ins,
 				    start, end, EXTENT_LOCKED, 0)) {
-			update_pinned_extents(extent_root, start,
+			btrfs_update_pinned_extents(extent_root, start,
 					      end + 1 - start, 1);
 			ret = __free_extent(trans, extent_root,
 					     start, end + 1 - start,
@@ -1919,6 +1940,17 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->last_data_alloc;
 	}
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		last_ptr = &root->fs_info->last_log_alloc;
+		/*
+		 * seed the log allocation hint only while it is still unset.
+		 * last_ptr was just assigned and is never NULL here, so the
+		 * test must look at the value it points to.
+		 */
+		if (*last_ptr == 0 && root->fs_info->last_alloc) {
+			*last_ptr = root->fs_info->last_alloc + empty_cluster;
+		}
+	}
 
 	if (last_ptr) {
 		if (*last_ptr)
@@ -2268,6 +2295,35 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	maybe_unlock_mutex(root);
 	return ret;
 }
+
+/*
+ * this is used by the tree logging recovery code.  It records the extent
+ * described by ins (objectid = start, offset = length) as allocated and
+ * clears that range from the free space cache before doing so.
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+
+	maybe_lock_mutex(root);
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	cache_block_group(root, block_group);
+
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);	/* range end is inclusive */
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;	/* result of __btrfs_alloc_reserved_extent() */
+}
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -2350,9 +2406,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	return buf;
 }
 
-static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct extent_buffer *leaf)
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2402,9 +2457,9 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
-					 struct btrfs_leaf_ref *ref)
+static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_leaf_ref *ref)
 {
 	int i;
 	int ret;
@@ -2512,7 +2567,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref_no_cache(trans, root, cur);
+			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2552,7 +2607,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
-				ret = drop_leaf_ref(trans, root, ref);
+				ret = cache_drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
 				btrfs_free_leaf_ref(root, ref);
@@ -3628,6 +3683,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
+	root->fs_info->last_trans_new_blockgroup = trans->transid;
+
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e9e86fbaa24..84ecf3ab851 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -36,6 +36,8 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "tree-log.h"
+#include "locking.h"
 #include "compat.h"
 
 
@@ -988,10 +990,27 @@ out_nolock:
 	*ppos = pos;
 
 	if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
-		err = sync_page_range(inode, inode->i_mapping,
-				      start_pos, num_written);
+		struct btrfs_trans_handle *trans;
+
+		err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
+					     start_pos + num_written -1,
+					     WB_SYNC_NONE);
+		if (err < 0)
+			num_written = err;
+
+		err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
+				 start_pos, start_pos + num_written - 1);
 		if (err < 0)
 			num_written = err;
+
+		trans = btrfs_start_transaction(root, 1);
+		ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+		if (ret == 0) {
+			btrfs_sync_log(trans, root);
+			btrfs_end_transaction(trans, root);
+		} else {
+			btrfs_commit_transaction(trans, root);
+		}
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
 		do_sync_file_range(file, start_pos,
@@ -1019,8 +1038,7 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
 	return 0;
 }
 
-static int btrfs_sync_file(struct file *file,
-			   struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1043,6 +1061,8 @@ static int btrfs_sync_file(struct file *file,
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	filemap_fdatawait(inode->i_mapping);
+
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
@@ -1054,7 +1074,20 @@ static int btrfs_sync_file(struct file *file,
 		ret = -ENOMEM;
 		goto out;
 	}
-	ret = btrfs_commit_transaction(trans, root);
+
+	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	if (ret != 0) {
+		/*
+		 * ret > 0 asks for a full commit.  ret < 0 means logging
+		 * failed; fall back to a full commit as well so the
+		 * transaction handle is not leaked (the O_SYNC write path
+		 * handles a failed log the same way).
+		 */
+		ret = btrfs_commit_transaction(trans, root);
+	} else {
+		btrfs_sync_log(trans, root);
+		ret = btrfs_end_transaction(trans, root);
+	}
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43d3f2649ca..65df9d83023 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -46,6 +46,8 @@
 #include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
+#include "compat.h"
+#include "tree-log.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -586,6 +588,7 @@ nocow:
 			  &ordered_extent->list);
 
 	btrfs_ordered_update_i_size(inode, ordered_extent);
+	btrfs_update_inode(trans, root, inode);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
 
 	/* once for us */
@@ -593,7 +596,6 @@ nocow:
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
@@ -1007,7 +1009,8 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 
 	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
-	inode->i_generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
@@ -1056,7 +1059,8 @@ make_bad:
 	make_bad_inode(inode);
 }
 
-static void fill_inode_item(struct extent_buffer *leaf,
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
 			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
@@ -1082,7 +1086,8 @@ static void fill_inode_item(struct extent_buffer *leaf,
 				inode->i_ctime.tv_nsec);
 
 	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
-	btrfs_set_inode_generation(leaf, item, inode->i_generation);
+	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
 	btrfs_set_inode_block_group(leaf, item,
@@ -1112,7 +1117,7 @@ int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
 
-	fill_inode_item(leaf, inode_item, inode);
+	fill_inode_item(trans, leaf, inode_item, inode);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
@@ -1122,14 +1127,12 @@ failed:
 }
 
 
-static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *dir,
-			      struct dentry *dentry)
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct inode *dir, struct inode *inode,
+		       const char *name, int name_len)
 {
 	struct btrfs_path *path;
-	const char *name = dentry->d_name.name;
-	int name_len = dentry->d_name.len;
 	int ret = 0;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
@@ -1160,13 +1163,12 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root, path);
 
 	ret = btrfs_del_inode_ref(trans, root, name, name_len,
-				  dentry->d_inode->i_ino,
-				  dentry->d_parent->d_inode->i_ino, &index);
+				  inode->i_ino,
+				  dir->i_ino, &index);
 	if (ret) {
 		printk("failed to delete reference to %.*s, "
 		       "inode %lu parent %lu\n", name_len, name,
-		       dentry->d_inode->i_ino,
-		       dentry->d_parent->d_inode->i_ino);
+		       inode->i_ino, dir->i_ino);
 		goto err;
 	}
 
@@ -1183,21 +1185,25 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
 	btrfs_release_path(root, path);
 
-	dentry->d_inode->i_ctime = dir->i_ctime;
+	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
+					 inode, dir->i_ino);
+	BUG_ON(ret);
+
+	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
+					   dir, index);
+	BUG_ON(ret);
 err:
 	btrfs_free_path(path);
-	if (!ret) {
-		btrfs_i_size_write(dir, dir->i_size - name_len * 2);
-		dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-		btrfs_update_inode(trans, root, dir);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		dentry->d_inode->i_nlink--;
-#else
-		drop_nlink(dentry->d_inode);
-#endif
-		ret = btrfs_update_inode(trans, root, dentry->d_inode);
-		dir->i_sb->s_dirt = 1;
-	}
+	if (ret)
+		goto out;
+
+	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	btrfs_update_inode(trans, root, dir);
+	btrfs_drop_nlink(inode);
+	ret = btrfs_update_inode(trans, root, inode);
+	dir->i_sb->s_dirt = 1;
+out:
 	return ret;
 }
 
@@ -1218,7 +1224,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
-	ret = btrfs_unlink_trans(trans, root, dir, dentry);
+	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+				 dentry->d_name.name, dentry->d_name.len);
 
 	if (inode->i_nlink == 0)
 		ret = btrfs_orphan_add(trans, inode);
@@ -1256,7 +1263,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		goto fail_trans;
 
 	/* now the directory is empty */
-	err = btrfs_unlink_trans(trans, root, dir, dentry);
+	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
+				 dentry->d_name.name, dentry->d_name.len);
 	if (!err) {
 		btrfs_i_size_write(inode, 0);
 	}
@@ -1283,10 +1291,10 @@ fail:
  * min_type is the minimum key type to truncate down to.  If set to 0, this
  * will kill all the items on this inode, including the INODE_ITEM_KEY.
  */
-static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct inode *inode,
-				   u32 min_type)
+noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct inode *inode,
+					u64 new_size, u32 min_type)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1307,7 +1315,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
-	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
+	if (root->ref_cows)
+		btrfs_drop_extent_cache(inode,
+					new_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1324,7 +1334,13 @@ search_again:
 		goto error;
 	}
 	if (ret > 0) {
-		BUG_ON(path->slots[0] == 0);
+		/* there are no items in the tree for us to truncate, we're
+		 * done
+		 */
+		if (path->slots[0] == 0) {
+			ret = 0;
+			goto error;
+		}
 		path->slots[0]--;
 	}
 
@@ -1358,10 +1374,10 @@ search_again:
 		}
 		if (found_type == BTRFS_CSUM_ITEM_KEY) {
 			ret = btrfs_csum_truncate(trans, root, path,
-						  inode->i_size);
+						  new_size);
 			BUG_ON(ret);
 		}
-		if (item_end < inode->i_size) {
+		if (item_end < new_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
@@ -1378,7 +1394,7 @@ search_again:
 			btrfs_set_key_type(&key, found_type);
 			goto next;
 		}
-		if (found_key.offset >= inode->i_size)
+		if (found_key.offset >= new_size)
 			del_item = 1;
 		else
 			del_item = 0;
@@ -1394,7 +1410,7 @@ search_again:
 			if (!del_item) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
-				extent_num_bytes = inode->i_size -
+				extent_num_bytes = new_size -
 					found_key.offset + root->sectorsize - 1;
 				extent_num_bytes = extent_num_bytes &
 					~((u64)root->sectorsize - 1);
@@ -1402,7 +1418,7 @@ search_again:
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
-				if (extent_start != 0)
+				if (root->ref_cows && extent_start != 0)
 					dec_i_blocks(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
@@ -1413,22 +1429,29 @@ search_again:
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
 					found_extent = 1;
-					dec_i_blocks(inode, num_dec);
+					if (root->ref_cows)
+						dec_i_blocks(inode, num_dec);
+				}
+				if (root->ref_cows) {
+					root_gen =
+						btrfs_header_generation(leaf);
 				}
-				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			if (!del_item) {
-				u32 newsize = inode->i_size - found_key.offset;
-				dec_i_blocks(inode, item_end + 1 -
-					    found_key.offset - newsize);
-				newsize =
-				    btrfs_file_extent_calc_inline_size(newsize);
+				u32 size = new_size - found_key.offset;
+
+				if (root->ref_cows) {
+					dec_i_blocks(inode, item_end + 1 -
+						    found_key.offset - size);
+				}
+				size =
+				    btrfs_file_extent_calc_inline_size(size);
 				ret = btrfs_truncate_item(trans, root, path,
-							  newsize, 1);
+							  size, 1);
 				BUG_ON(ret);
-			} else {
+			} else if (root->ref_cows) {
 				dec_i_blocks(inode, item_end + 1 -
 					     found_key.offset);
 			}
@@ -1666,7 +1689,7 @@ void btrfs_delete_inode(struct inode *inode)
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
-	ret = btrfs_truncate_in_trans(trans, root, inode, 0);
+	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
 	if (ret) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete_lock;
@@ -1753,15 +1776,20 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
-static int btrfs_init_locked_inode(struct inode *inode, void *p)
+static noinline void init_btrfs_i(struct inode *inode)
 {
-	struct btrfs_iget_args *args = p;
-	inode->i_ino = args->ino;
-	BTRFS_I(inode)->root = args->root;
-	BTRFS_I(inode)->delalloc_bytes = 0;
-	inode->i_mapping->writeback_index = 0;
-	BTRFS_I(inode)->disk_i_size = 0;
-	BTRFS_I(inode)->index_cnt = (u64)-1;
+	struct btrfs_inode *bi = BTRFS_I(inode);
+
+	bi->i_acl = NULL;
+	bi->i_default_acl = NULL;
+
+	bi->generation = 0;
+	bi->last_trans = 0;
+	bi->logged_trans = 0;
+	bi->delalloc_bytes = 0;
+	bi->disk_i_size = 0;
+	bi->flags = 0;
+	bi->index_cnt = (u64)-1;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
@@ -1771,6 +1799,15 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
+	mutex_init(&BTRFS_I(inode)->log_mutex);
+}
+
+static int btrfs_init_locked_inode(struct inode *inode, void *p)
+{
+	struct btrfs_iget_args *args = p;
+	inode->i_ino = args->ino;
+	init_btrfs_i(inode);
+	BTRFS_I(inode)->root = args->root;
 	return 0;
 }
 
@@ -2263,21 +2300,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	 * btrfs_get_inode_index_count has an explanation for the magic
 	 * number
 	 */
+	init_btrfs_i(inode);
 	BTRFS_I(inode)->index_cnt = 2;
-
-	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-			     inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-			     inode->i_mapping, GFP_NOFS);
-	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	mutex_init(&BTRFS_I(inode)->extent_mutex);
-	BTRFS_I(inode)->delalloc_bytes = 0;
-	inode->i_mapping->writeback_index = 0;
-	BTRFS_I(inode)->disk_i_size = 0;
 	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->generation = trans->transid;
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -2290,7 +2316,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		new_inode_group = group;
 	}
 	BTRFS_I(inode)->block_group = new_inode_group;
-	BTRFS_I(inode)->flags = 0;
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -2318,7 +2343,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
-	fill_inode_item(path->nodes[0], inode_item, inode);
+	fill_inode_item(trans, path->nodes[0], inode_item, inode);
 
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
 			     struct btrfs_inode_ref);
@@ -2349,38 +2374,34 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 }
 
-static int btrfs_add_link(struct btrfs_trans_handle *trans,
-			    struct dentry *dentry, struct inode *inode,
-			    int add_backref, u64 index)
+int btrfs_add_link(struct btrfs_trans_handle *trans,
+		   struct inode *parent_inode, struct inode *inode,
+		   const char *name, int name_len, int add_backref, u64 index)
 {
 	int ret;
 	struct btrfs_key key;
-	struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
-	struct inode *parent_inode = dentry->d_parent->d_inode;
+	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
 
 	key.objectid = inode->i_ino;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	ret = btrfs_insert_dir_item(trans, root,
-				    dentry->d_name.name, dentry->d_name.len,
-				    dentry->d_parent->d_inode->i_ino,
+	ret = btrfs_insert_dir_item(trans, root, name, name_len,
+				    parent_inode->i_ino,
 				    &key, btrfs_inode_type(inode),
 				    index);
 	if (ret == 0) {
 		if (add_backref) {
 			ret = btrfs_insert_inode_ref(trans, root,
-					     dentry->d_name.name,
-					     dentry->d_name.len,
-					     inode->i_ino,
-					     parent_inode->i_ino,
-					     index);
+						     name, name_len,
+						     inode->i_ino,
+						     parent_inode->i_ino,
+						     index);
 		}
 		btrfs_i_size_write(parent_inode, parent_inode->i_size +
-				   dentry->d_name.len * 2);
+				   name_len * 2);
 		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_update_inode(trans, root,
-					 dentry->d_parent->d_inode);
+		ret = btrfs_update_inode(trans, root, parent_inode);
 	}
 	return ret;
 }
@@ -2389,7 +2410,9 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 			    struct dentry *dentry, struct inode *inode,
 			    int backref, u64 index)
 {
-	int err = btrfs_add_link(trans, dentry, inode, backref, index);
+	int err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+				 inode, dentry->d_name.name,
+				 dentry->d_name.len, backref, index);
 	if (!err) {
 		d_instantiate(dentry, inode);
 		return 0;
@@ -2513,19 +2536,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-				     inode->i_mapping, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-				     inode->i_mapping, GFP_NOFS);
-		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-		mutex_init(&BTRFS_I(inode)->csum_mutex);
-		mutex_init(&BTRFS_I(inode)->extent_mutex);
-		BTRFS_I(inode)->delalloc_bytes = 0;
-		BTRFS_I(inode)->disk_i_size = 0;
-		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -2556,11 +2567,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		return -ENOENT;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	inode->i_nlink++;
-#else
-	inc_nlink(inode);
-#endif
+	btrfs_inc_nlink(inode);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -2650,7 +2657,9 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (err)
 		goto out_fail;
 
-	err = btrfs_add_link(trans, dentry, inode, 0, index);
+	err = btrfs_add_link(trans, dentry->d_parent->d_inode,
+				 inode, dentry->d_name.name,
+				 dentry->d_name.len, 0, index);
 	if (err)
 		goto out_fail;
 
@@ -3221,7 +3230,7 @@ static void btrfs_truncate(struct inode *inode)
 	if (ret)
 		goto out;
 	/* FIXME, add redo link to tree so we don't leak on crash */
-	ret = btrfs_truncate_in_trans(trans, root, inode,
+	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
 				      BTRFS_EXTENT_DATA_KEY);
 	btrfs_update_inode(trans, root, inode);
 
@@ -3304,6 +3313,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->logged_trans = 0;
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	ei->i_acl = BTRFS_ACL_NOT_CACHED;
 	ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
@@ -3463,31 +3473,39 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 
 	btrfs_set_trans_block_group(trans, new_dir);
 
-	old_dentry->d_inode->i_nlink++;
+	btrfs_inc_nlink(old_dentry->d_inode);
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
 
-	ret = btrfs_unlink_trans(trans, root, old_dir, old_dentry);
+	ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
+				 old_dentry->d_name.name,
+				 old_dentry->d_name.len);
 	if (ret)
 		goto out_fail;
 
 	if (new_inode) {
 		new_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_unlink_trans(trans, root, new_dir, new_dentry);
+		ret = btrfs_unlink_inode(trans, root, new_dir,
+					 new_dentry->d_inode,
+					 new_dentry->d_name.name,
+					 new_dentry->d_name.len);
 		if (ret)
 			goto out_fail;
 		if (new_inode->i_nlink == 0) {
-			ret = btrfs_orphan_add(trans, new_inode);
+			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
 			if (ret)
 				goto out_fail;
 		}
+
 	}
 	ret = btrfs_set_inode_index(new_dir, old_inode, &index);
 	if (ret)
 		goto out_fail;
 
-	ret = btrfs_add_link(trans, new_dentry, old_inode, 1, index);
+	ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
+			     old_inode, new_dentry->d_name.name,
+			     new_dentry->d_name.len, 1, index);
 	if (ret)
 		goto out_fail;
 
@@ -3577,19 +3595,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		inode->i_fop = &btrfs_file_operations;
 		inode->i_op = &btrfs_file_inode_operations;
-		extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-				     inode->i_mapping, GFP_NOFS);
-		extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-				     inode->i_mapping, GFP_NOFS);
-		INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-		mutex_init(&BTRFS_I(inode)->csum_mutex);
-		mutex_init(&BTRFS_I(inode)->extent_mutex);
-		BTRFS_I(inode)->delalloc_bytes = 0;
-		BTRFS_I(inode)->disk_i_size = 0;
-		inode->i_mapping->writeback_index = 0;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	}
 	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
@@ -3691,6 +3697,7 @@ static struct file_operations btrfs_dir_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 	.release        = btrfs_release_file,
+	.fsync		= btrfs_sync_file,
 };
 
 static struct extent_io_ops btrfs_extent_io_ops = {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 36726696e58..e3984f902e7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -202,8 +202,9 @@ again:
 		memcpy(&found_key, &key, sizeof(key));
 		key.offset++;
 		btrfs_release_path(root, path);
-		dead_root = btrfs_read_fs_root_no_radix(root->fs_info,
-							&found_key);
+		dead_root =
+			btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						    &found_key);
 		if (IS_ERR(dead_root)) {
 			ret = PTR_ERR(dead_root);
 			goto err;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index eff3ad72991..49c4f5b40ed 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "tree-log.h"
 
 static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
@@ -57,6 +58,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 		root->fs_info->generation++;
 		root->fs_info->last_alloc = 0;
 		root->fs_info->last_data_alloc = 0;
+		root->fs_info->last_log_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
@@ -83,7 +85,7 @@ static noinline int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
-static noinline int record_root_in_trans(struct btrfs_root *root)
+noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
 	struct btrfs_dirty_root *dirty;
 	u64 running_trans_id = root->fs_info->running_transaction->transid;
@@ -151,7 +153,7 @@ static void wait_current_trans(struct btrfs_root *root)
 	}
 }
 
-struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
+static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 					     int num_blocks, int wait)
 {
 	struct btrfs_trans_handle *h =
@@ -164,7 +166,7 @@ struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
-	record_root_in_trans(root);
+	btrfs_record_root_in_trans(root);
 	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
@@ -456,6 +458,8 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			BUG_ON(!root->ref_tree);
 			dirty = root->dirty_root;
 
+			btrfs_free_log(trans, root);
+
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
 					btrfs_root_bytenr(&root->root_item));
@@ -600,7 +604,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
-			record_root_in_trans(root);
+			btrfs_record_root_in_trans(root);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
@@ -745,7 +749,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	int ret;
 
 	INIT_LIST_HEAD(&dirty_fs_roots);
-
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (trans->transaction->in_commit) {
 		cur_trans = trans->transaction;
@@ -821,10 +824,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	WARN_ON(cur_trans != trans->transaction);
 
+	/* btrfs_commit_tree_roots is responsible for getting the
+	 * various roots consistent with each other.  Every pointer
+	 * in the tree of tree roots has to point to the most up to date
+	 * root for every subvolume and other tree.  So, we have to keep
+	 * the tree logging code from jumping in and changing any
+	 * of the trees.
+	 *
+	 * At this point in the commit, there can't be any tree-log
+	 * writers, but a little lower down we drop the trans mutex
+	 * and let new people in.  By holding the tree_log_mutex
+	 * from now until after the super is written, we avoid races
+	 * with the tree-log code.
+	 */
+	mutex_lock(&root->fs_info->tree_log_mutex);
+
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
 
+	/* add_dirty_roots gets rid of all the tree log roots, it is now
+	 * safe to free the root of tree log roots
+	 */
+	btrfs_free_log_root_tree(trans, root->fs_info);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
@@ -843,6 +866,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
+
+	if (!root->fs_info->log_root_recovering) {
+		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+	}
+
 	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
 	       sizeof(root->fs_info->super_copy));
 
@@ -857,6 +886,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
 
+	/*
+	 * the super is written, we can safely allow the tree-loggers
+	 * to go about their business
+	 */
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 598baa31241..cc63650d60d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -98,4 +98,5 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_root *root);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index cc2650b0695..b3bb5bbad76 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -81,12 +81,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		memcpy(&key, &root->defrag_progress, sizeof(key));
 	}
 
-	path->lowest_level = 1;
 	path->keep_locks = 1;
 	if (cache_only)
 		min_trans = root->defrag_trans_start;
 
-	ret = btrfs_search_forward(root, &key, path, cache_only, min_trans);
+	ret = btrfs_search_forward(root, &key, NULL, path,
+				   cache_only, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
new file mode 100644
index 00000000000..d1ce8314b94
--- /dev/null
+++ b/fs/btrfs/tree-log.c
@@ -0,0 +1,2804 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "print-tree.h"
+#include "compat.h"
+
+/* magic values for the inode_only field in btrfs_log_inode:
+ *
+ * LOG_INODE_ALL means to log everything
+ * LOG_INODE_EXISTS means to log just enough to recreate the inode
+ * during log replay
+ */
+#define LOG_INODE_ALL 0
+#define LOG_INODE_EXISTS 1
+
+/*
+ * stages for the tree walking.  The first
+ * stage (0) is to only pin down the blocks we find;
+ * the second stage (1) is to make sure that all the inodes
+ * we find in the log are created in the subvolume.
+ *
+ * The last stage is to deal with directories and links and extents
+ * and all the other fun semantics
+ */
+#define LOG_WALK_PIN_ONLY 0
+#define LOG_WALK_REPLAY_INODES 1
+#define LOG_WALK_REPLAY_ALL 2
+
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only);
+
+/*
+ * tree logging is a special write ahead log used to make sure that
+ * fsyncs and O_SYNCs can happen without doing full tree commits.
+ *
+ * Full tree commits are expensive because they require commonly
+ * modified blocks to be recowed, creating many dirty pages in the
+ * extent tree and a 4x-6x higher write load than ext3.
+ *
+ * Instead of doing a tree commit on every fsync, we use the
+ * key ranges and transaction ids to find items for a given file or directory
+ * that have changed in this transaction.  Those items are copied into
+ * a special tree (one per subvolume root), that tree is written to disk
+ * and then the fsync is considered complete.
+ *
+ * After a crash, items are copied out of the log-tree back into the
+ * subvolume tree.  Any file data extents found are recorded in the extent
+ * allocation tree, and the log-tree freed.
+ *
+ * The log tree is read three times: once to pin down all the extents it is
+ * using in ram, once to create all the inodes logged in the tree,
+ * and once to do all the other items.
+ */
+
+/*
+ * btrfs_add_log_tree adds a new per-subvolume log tree into the
+ * tree of log tree roots and sets root->log_root.  Returns 0 or an error.
+ * Call with a tree log transaction running (see start_log_trans).
+ */
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root)
+{
+	struct btrfs_key key;
+	struct btrfs_root_item root_item;
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *leaf;
+	struct btrfs_root *new_root = root;
+	int ret;
+	u64 objectid = root->root_key.objectid;
+
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      BTRFS_TREE_LOG_OBJECTID,
+				      0, 0, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		return ret;
+	}
+
+	btrfs_set_header_nritems(leaf, 0);
+	btrfs_set_header_level(leaf, 0);
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
+
+	write_extent_buffer(leaf, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+	inode_item = &root_item.inode;
+	memset(inode_item, 0, sizeof(*inode_item));
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_level(&root_item, 0);
+	btrfs_set_root_refs(&root_item, 0);
+	btrfs_set_root_used(&root_item, 0);
+
+	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+	root_item.drop_level = 0;
+
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	leaf = NULL;
+
+	btrfs_set_root_dirid(&root_item, 0);
+
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = objectid;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
+				&root_item);
+	if (ret)
+		goto fail;
+
+	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
+					       &key);
+	/* failures come back as ERR_PTR values, not NULL */
+	BUG_ON(!new_root || IS_ERR(new_root));
+	WARN_ON(root->log_root);
+	root->log_root = new_root;
+
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
+	new_root->ref_cows = 0;
+	new_root->last_trans = trans->transid;
+fail:
+	return ret;
+}
+
+/*
+ * start a sub transaction, creating the log root tree and this root's
+ * log tree on first use (BUG on failure); always returns 0.  Bumps the log
+ * writer count so the people syncing the tree wait for us to finish
+ */
+static int start_log_trans(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root)
+{
+	int ret;
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	if (!root->fs_info->log_root_tree) {
+		ret = btrfs_init_log_root_tree(trans, root->fs_info);
+		BUG_ON(ret);
+	}
+	if (!root->log_root) {
+		ret = btrfs_add_log_tree(trans, root);
+		BUG_ON(ret);
+	}
+	atomic_inc(&root->fs_info->tree_log_writers);
+	root->fs_info->tree_log_batch++;
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	return 0;
+}
+
+/*
+ * returns 0 if this root already has a log tree and we were able to
+ * join it as a writer, or returns -ENOENT if there was no log
+ * transaction in progress
+ */
+static int join_running_log_trans(struct btrfs_root *root)
+{
+	int ret = -ENOENT;
+
+	smp_mb();
+	if (!root->log_root)
+		return -ENOENT;
+
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	if (root->log_root) {	/* recheck now that we hold the mutex */
+		ret = 0;
+		atomic_inc(&root->fs_info->tree_log_writers);
+		root->fs_info->tree_log_batch++;
+	}
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+	return ret;
+}
+
+/*
+ * indicate we're done making changes to the log tree (drops the writer
+ * count) and wake up anyone waiting to do a sync.  Always returns 0
+ */
+static int end_log_trans(struct btrfs_root *root)
+{
+	atomic_dec(&root->fs_info->tree_log_writers);
+	smp_mb();
+	if (waitqueue_active(&root->fs_info->tree_log_wait))
+		wake_up(&root->fs_info->tree_log_wait);
+	return 0;
+}
+
+
+/*
+ * the walk control struct is used to pass state down the chain when
+ * processing the log tree.  The stage field tells us which part
+ * of the log tree processing we are currently doing.  The others
+ * are state fields used for that specific part
+ */
+struct walk_control {
+	/* should we free the extent on disk when done?  This is used
+	 * at transaction commit time while freeing a log tree
+	 */
+	int free;
+
+	/* should we write out the extent buffer?  This is used
+	 * while flushing the log tree to disk during a sync
+	 */
+	int write;
+
+	/* should we wait for the extent buffer io to finish?  Also used
+	 * while flushing the log tree to disk for a sync
+	 */
+	int wait;
+
+	/* pin only walk, we record which extents on disk belong to the
+	 * log trees
+	 */
+	int pin;
+
+	/* what stage of the replay code we're currently in */
+	int stage;
+
+	/* the root we are currently replaying */
+	struct btrfs_root *replay_dest;
+
+	/* the trans handle for the current replay */
+	struct btrfs_trans_handle *trans;
+
+	/* the function that gets used to process blocks we find in the
+	 * tree.  Note the extent_buffer might not be up to date when it is
+	 * passed in, and it must be checked (against gen, as
+	 * process_one_buffer does) or read if you need the data inside it
+	 */
+	int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
+			    struct walk_control *wc, u64 gen);
+};
+
+/*
+ * process_func that pins extents and writes/waits on uptodate buffers
+ */
+static int process_one_buffer(struct btrfs_root *log,
+			      struct extent_buffer *eb,
+			      struct walk_control *wc, u64 gen)
+{
+	if (wc->pin) {
+		mutex_lock(&log->fs_info->alloc_mutex);
+		btrfs_update_pinned_extents(log->fs_info->extent_root,
+					    eb->start, eb->len, 1);
+		mutex_unlock(&log->fs_info->alloc_mutex);
+	}
+
+	if (btrfs_buffer_uptodate(eb, gen)) {
+		if (wc->write)
+			btrfs_write_tree_block(eb);
+		if (wc->wait)
+			btrfs_wait_tree_block_writeback(eb);
+	}
+	return 0;
+}
+
+/*
+ * Item overwrite used by replay and tree logging.  eb, slot and key all refer
+ * to the src data we are copying out.
+ *
+ * root is the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and
+ * will be released on exit).
+ *
+ * If the key is already in the destination tree the existing item is
+ * overwritten.  If the existing item isn't big enough, it is deleted and
+ * re-inserted at the right size.  If it is too large, it is truncated.
+ *
+ * If the key isn't in the destination yet, a new item is inserted.
+ *
+ * Returns 0 on success, or -ENOMEM when the scratch buffers used to compare
+ * the source and destination items cannot be allocated.  Failures of the
+ * tree modification helpers still hit BUG()/BUG_ON().
+ */
+static noinline int overwrite_item(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *eb, int slot,
+				   struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size;
+	u64 saved_i_size = 0;
+	int save_old_i_size = 0;
+	unsigned long src_ptr;
+	unsigned long dst_ptr;
+	int overwrite_root = 0;
+
+	/* directory i_size is only preserved when the target is not a log tree */
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+		overwrite_root = 1;
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	src_ptr = btrfs_item_ptr_offset(eb, slot);
+
+	/* look for the key in the destination tree */
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *src_copy;
+		char *dst_copy;
+		u32 dst_size = btrfs_item_size_nr(path->nodes[0],
+						  path->slots[0]);
+		if (dst_size != item_size)
+			goto insert;
+
+		if (item_size == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+		dst_copy = kmalloc(item_size, GFP_NOFS);
+		src_copy = kmalloc(item_size, GFP_NOFS);
+		if (!dst_copy || !src_copy) {
+			/* kfree(NULL) is a no-op, so both can be freed blindly */
+			kfree(dst_copy);
+			kfree(src_copy);
+			btrfs_release_path(root, path);
+			return -ENOMEM;
+		}
+
+		read_extent_buffer(eb, src_copy, src_ptr, item_size);
+
+		dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
+				   item_size);
+		ret = memcmp(dst_copy, src_copy, item_size);
+
+		kfree(dst_copy);
+		kfree(src_copy);
+		/*
+		 * they have the same contents, just return, this saves
+		 * us from cowing blocks in the destination tree and doing
+		 * extra writes that may not have been done by a previous
+		 * sync
+		 */
+		if (ret == 0) {
+			btrfs_release_path(root, path);
+			return 0;
+		}
+
+	}
+insert:
+	btrfs_release_path(root, path);
+	/* try to insert the key into the destination tree */
+	ret = btrfs_insert_empty_item(trans, root, path,
+				      key, item_size);
+
+	/* make sure any existing item is the correct size */
+	if (ret == -EEXIST) {
+		u32 found_size;
+		found_size = btrfs_item_size_nr(path->nodes[0],
+						path->slots[0]);
+		if (found_size > item_size) {
+			btrfs_truncate_item(trans, root, path, item_size, 1);
+		} else if (found_size < item_size) {
+			/*
+			 * too small: drop the old item and insert a fresh one.
+			 * ret becomes 0 here, so the "existing inode" checks
+			 * below are skipped for a re-inserted item.
+			 */
+			ret = btrfs_del_item(trans, root,
+					     path);
+			BUG_ON(ret);
+
+			btrfs_release_path(root, path);
+			ret = btrfs_insert_empty_item(trans,
+				  root, path, key, item_size);
+			BUG_ON(ret);
+		}
+	} else if (ret) {
+		BUG();
+	}
+	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
+					path->slots[0]);
+
+	/* don't overwrite an existing inode if the generation number
+	 * was logged as zero.  This is done when the tree logging code
+	 * is just logging an inode to make sure it exists after recovery.
+	 *
+	 * Also, don't overwrite i_size on directories during replay.
+	 * log replay inserts and removes directory items based on the
+	 * state of the tree found in the subvolume, and i_size is modified
+	 * as it goes
+	 */
+	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
+		struct btrfs_inode_item *src_item;
+		struct btrfs_inode_item *dst_item;
+
+		src_item = (struct btrfs_inode_item *)src_ptr;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+
+		if (btrfs_inode_generation(eb, src_item) == 0)
+			goto no_copy;
+
+		if (overwrite_root &&
+		    S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
+		    S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+			save_old_i_size = 1;
+			saved_i_size = btrfs_inode_size(path->nodes[0],
+							dst_item);
+		}
+	}
+
+	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
+			   src_ptr, item_size);
+
+	/* put back the directory i_size that the copy just clobbered */
+	if (save_old_i_size) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+	}
+
+	/* make sure the generation is filled in */
+	if (key->type == BTRFS_INODE_ITEM_KEY) {
+		struct btrfs_inode_item *dst_item;
+		dst_item = (struct btrfs_inode_item *)dst_ptr;
+		if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
+			btrfs_set_inode_generation(path->nodes[0], dst_item,
+						   trans->transid);
+		}
+	}
+no_copy:
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+/*
+ * simple helper to read an inode off the disk from a given root
+ * This can only be called for subvolume roots and not for the log
+ *
+ * Returns the inode with a reference held (drop it with iput()), or NULL
+ * if the inode could not be obtained or turned out to be bad.
+ */
+static noinline struct inode *read_one_inode(struct btrfs_root *root,
+					     u64 objectid)
+{
+	struct inode *inode;
+
+	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+	/*
+	 * NOTE(review): btrfs_iget_locked() presumably returns NULL when no
+	 * in-memory inode can be allocated; don't dereference it in that case.
+	 */
+	if (!inode)
+		return NULL;
+
+	/* a freshly created in-memory inode still has to be filled from disk */
+	if (inode->i_state & I_NEW) {
+		BTRFS_I(inode)->root = root;
+		BTRFS_I(inode)->location.objectid = objectid;
+		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+		BTRFS_I(inode)->location.offset = 0;
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+
+	}
+	if (is_bad_inode(inode)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+/*
+ * Replay the file extent item found in 'eb' at 'slot' (with 'key') into
+ * the subvolume 'root'.  path must be released on entry and is released
+ * again before returning.
+ *
+ * Extents referenced from the log tree have not been accounted in the
+ * extent allocation tree yet, so this finishes that job: an extent that
+ * already exists there gets one more reference, anything else is recorded
+ * as a newly allocated logged extent.
+ *
+ * Any extents already in the file that overlap the replayed range are
+ * dropped before the new item is written.
+ *
+ * Returns 0 when there was nothing to do, -EIO when the inode cannot be
+ * read, otherwise the result of the last helper called.
+ */
+static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	struct btrfs_file_extent_item *item;
+	struct inode *inode;
+	u64 mask = root->sectorsize - 1;
+	u64 start = key->offset;
+	u64 extent_end;
+	u64 alloc_hint;
+	int found_type;
+	int is_reg;
+	int ret;
+
+	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+	found_type = btrfs_file_extent_type(eb, item);
+	is_reg = (found_type == BTRFS_FILE_EXTENT_REG);
+
+	/* work out where the replayed range ends */
+	if (is_reg) {
+		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		unsigned long size;
+
+		size = btrfs_file_extent_inline_len(eb,
+						    btrfs_item_nr(eb, slot));
+		/* inline data is rounded up to a full sector */
+		extent_end = (start + size + mask) & ~mask;
+	} else {
+		/* every other extent type is left alone */
+		return 0;
+	}
+
+	inode = read_one_inode(root, key->objectid);
+	if (!inode)
+		return -EIO;
+
+	/*
+	 * Look for this very extent in the file before calling
+	 * btrfs_drop_extents, so an identical extent is not dropped
+	 * only to be inserted again.
+	 */
+	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+				       start, 0);
+	if (ret == 0 && is_reg) {
+		struct extent_buffer *leaf = path->nodes[0];
+		struct btrfs_file_extent_item *existing;
+		struct btrfs_file_extent_item logged_copy;
+		struct btrfs_file_extent_item tree_copy;
+
+		existing = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_file_extent_item);
+
+		read_extent_buffer(eb, &logged_copy, (unsigned long)item,
+				   sizeof(logged_copy));
+		read_extent_buffer(leaf, &tree_copy, (unsigned long)existing,
+				   sizeof(tree_copy));
+
+		/* byte-identical item already present: nothing to replay */
+		if (memcmp(&logged_copy, &tree_copy,
+			   sizeof(logged_copy)) == 0) {
+			btrfs_release_path(root, path);
+			goto out;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* drop any overlapping extents */
+	ret = btrfs_drop_extents(trans, root, inode,
+			 start, extent_end, start, &alloc_hint);
+	BUG_ON(ret);
+
+	if (is_reg) {
+		struct btrfs_key ins;
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		/* insert the extent pointer in the file */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+
+		/*
+		 * An extent the allocation tree already knows about only
+		 * needs one more reference; otherwise it has to be
+		 * recorded there first.
+		 */
+		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
+		btrfs_release_path(root, path);
+		if (ret != 0) {
+			ret = btrfs_alloc_logged_extent(trans, root,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						start, &ins);
+			BUG_ON(ret);
+		} else {
+			ret = btrfs_inc_extent_ref(trans, root,
+				   ins.objectid, ins.offset,
+				   root->root_key.objectid,
+				   trans->transid, key->objectid, start);
+		}
+	} else {
+		/* only inline extents get here; they are simply overwritten */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+	}
+
+	/* btrfs_drop_extents changes i_blocks, update it here */
+	inode->i_blocks += (extent_end - start) >> 9;
+	btrfs_update_inode(trans, root, inode);
+out:
+	iput(inode);
+	return ret;
+}
+
+/*
+ * when cleaning up conflicts between the directory names in the
+ * subvolume, directory names in the log and directory names in the
+ * inode back references, we may have to unlink inodes from directories.
+ *
+ * This is a helper function to do the unlink of a specific directory
+ * item.  'di' must point into path->nodes[0]; the path is released
+ * before the unlink is attempted.
+ *
+ * Returns the result of btrfs_unlink_inode(), or -ENOMEM if the name
+ * buffer cannot be allocated.
+ */
+static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct inode *dir,
+				      struct btrfs_dir_item *di)
+{
+	struct inode *inode;
+	char *name;
+	int name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_key location;
+	int ret;
+
+	leaf = path->nodes[0];
+
+	/* copy everything we need out of the leaf before letting go of it */
+	btrfs_dir_item_key_to_cpu(leaf, di, &location);
+	name_len = btrfs_dir_name_len(leaf, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		btrfs_release_path(root, path);
+		return -ENOMEM;
+	}
+	/* the name bytes are stored directly after the dir item header */
+	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
+	btrfs_release_path(root, path);
+
+	inode = read_one_inode(root, location.objectid);
+	BUG_ON(!inode);
+
+	/* link count is bumped first, right before the unlink */
+	btrfs_inc_nlink(inode);
+	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	kfree(name);
+
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Check whether directory 'dirid' already holds 'name' both at sequence
+ * number 'index' and under its plain name entry, with each of the two
+ * entries pointing at inode 'objectid'.
+ *
+ * Returns 1 only when both lookups succeed and agree on the inode,
+ * 0 otherwise.  The path is released before returning.
+ */
+static noinline int inode_in_dir(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 dirid, u64 objectid, u64 index,
+				 const char *name, int name_len)
+{
+	struct btrfs_key target;
+	struct btrfs_dir_item *entry;
+	int found = 0;
+
+	/* first the index (sequence number) entry ... */
+	entry = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
+					    index, name, name_len, 0);
+	if (!entry || IS_ERR(entry))
+		goto done;
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], entry, &target);
+	if (target.objectid != objectid)
+		goto done;
+
+	btrfs_release_path(root, path);
+
+	/* ... then the entry stored under the name itself */
+	entry = btrfs_lookup_dir_item(NULL, root, path, dirid,
+				      name, name_len, 0);
+	if (!entry || IS_ERR(entry))
+		goto done;
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], entry, &target);
+	if (target.objectid == objectid)
+		found = 1;
+done:
+	btrfs_release_path(root, path);
+	return found;
+}
+
+/*
+ * helper function to check a log tree for a named back reference in
+ * an inode.  This is used to decide if a back reference that is
+ * found in the subvolume conflicts with what we find in the log.
+ *
+ * inode backreferences may have multiple refs in a single item,
+ * during replay we process one reference at a time, and we don't
+ * want to delete valid links to a file from the subvolume if that
+ * link is also in the log.
+ *
+ * Returns 1 if 'name' is one of the refs stored under 'key' in the log,
+ * 0 if it is not (or the key is absent), and -ENOMEM if no path could
+ * be allocated.  Callers that only test for zero therefore treat an
+ * allocation failure as "present" and leave the link alone.
+ */
+static noinline int backref_in_log(struct btrfs_root *log,
+				   struct btrfs_key *key,
+				   char *name, int namelen)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_ref *ref;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	unsigned long name_ptr;
+	int found_name_len;
+	int item_size;
+	int ret;
+	int match = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
+	if (ret != 0)
+		goto out;
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
+	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+	ptr_end = ptr + item_size;
+	/* the item is a packed sequence of (ref header, name bytes) records */
+	while (ptr < ptr_end) {
+		ref = (struct btrfs_inode_ref *)ptr;
+		found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
+		if (found_name_len == namelen) {
+			name_ptr = (unsigned long)(ref + 1);
+			ret = memcmp_extent_buffer(path->nodes[0], name,
+						   name_ptr, namelen);
+			if (ret == 0) {
+				match = 1;
+				goto out;
+			}
+		}
+		ptr = (unsigned long)(ref + 1) + found_name_len;
+	}
+out:
+	btrfs_free_path(path);
+	return match;
+}
+
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ *
+ * key->objectid is the inode the refs belong to and key->offset is the
+ * parent directory.  Every name packed in the item is processed in turn.
+ *
+ * Returns -ENOENT when the parent directory cannot be read, 0 otherwise.
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct inode *dir;
+	int ret;
+	struct btrfs_inode_ref *ref;
+	struct btrfs_dir_item *di;
+	struct inode *inode;
+	char *name;
+	int namelen;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, key->offset);
+	if (!dir)
+		return -ENOENT;
+
+	/* the inode itself is dereferenced below, so it must be readable */
+	inode = read_one_inode(root, key->objectid);
+	BUG_ON(!inode);
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+again:
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	namelen = btrfs_inode_ref_name_len(eb, ref);
+	name = kmalloc(namelen, GFP_NOFS);
+	BUG_ON(!name);
+
+	/* the name bytes follow the ref header directly */
+	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
+
+	/* if we already have a perfect match, we're done */
+	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+			 btrfs_inode_ref_index(eb, ref),
+			 name, namelen)) {
+		goto out;
+	}
+
+	/*
+	 * look for a conflicting back reference in the metadata.
+	 * if we find one we have to unlink that name of the file
+	 * before we add our new link.  Later on, we overwrite any
+	 * existing back reference, and we don't want to create
+	 * dangling pointers in the directory.
+	 */
+conflict_again:
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret == 0) {
+		char *victim_name;
+		int victim_name_len;
+		struct btrfs_inode_ref *victim_ref;
+		unsigned long ptr;
+		unsigned long ptr_end;
+		struct extent_buffer *leaf = path->nodes[0];
+
+		/* are we trying to overwrite a back ref for the root directory
+		 * if so, just jump out, we're done
+		 */
+		if (key->objectid == key->offset) {
+			/* this exit skips the kfree at 'out', so free here */
+			kfree(name);
+			goto out_nowrite;
+		}
+
+		/* check all the names in this back reference to see
+		 * if they are in the log.  if so, we allow them to stay
+		 * otherwise they must be unlinked as a conflict
+		 */
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
+		while (ptr < ptr_end) {
+			victim_ref = (struct btrfs_inode_ref *)ptr;
+			victim_name_len = btrfs_inode_ref_name_len(leaf,
+								   victim_ref);
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			BUG_ON(!victim_name);
+
+			read_extent_buffer(leaf, victim_name,
+					   (unsigned long)(victim_ref + 1),
+					   victim_name_len);
+
+			if (!backref_in_log(log, key, victim_name,
+					    victim_name_len)) {
+				btrfs_inc_nlink(inode);
+				btrfs_release_path(root, path);
+				ret = btrfs_unlink_inode(trans, root, dir,
+							 inode, victim_name,
+							 victim_name_len);
+				kfree(victim_name);
+				btrfs_release_path(root, path);
+				/* start the scan over with a fresh search */
+				goto conflict_again;
+			}
+			kfree(victim_name);
+			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
+		}
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* look for a conflicting sequence number */
+	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+					 btrfs_inode_ref_index(eb, ref),
+					 name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+
+	/* look for a conflicting name */
+	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+				   name, namelen, 0);
+	if (di && !IS_ERR(di)) {
+		ret = drop_one_dir_item(trans, root, path, dir, di);
+		BUG_ON(ret);
+	}
+	btrfs_release_path(root, path);
+
+	/* insert our name */
+	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
+			     btrfs_inode_ref_index(eb, ref));
+	BUG_ON(ret);
+
+	btrfs_update_inode(trans, root, inode);
+
+out:
+	/* advance to the next ref packed in the same log item, if any */
+	ref_ptr = (unsigned long)(ref + 1) + namelen;
+	kfree(name);
+	if (ref_ptr < ref_end)
+		goto again;
+
+	/* finally write the back reference in the inode */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
+	BUG_ON(ret);
+
+out_nowrite:
+	btrfs_release_path(root, path);
+	iput(dir);
+	iput(inode);
+	return 0;
+}
+
+/*
+ * replay one csum item from the log tree into the subvolume 'root'
+ * eb, slot and key all refer to the log tree
+ * path is for temp use by this function and should be released on return
+ *
+ * This copies the checksums out of the log tree and inserts them into
+ * the subvolume.  Any existing checksums for this range in the file
+ * are overwritten, and new items are added where required.
+ *
+ * We keep this simple by reusing the btrfs_ordered_sum code from
+ * the data=ordered mode.  This basically means making a copy
+ * of all the checksums in ram, which we have to do anyway for kmap
+ * rules.
+ *
+ * The copy is then sent down to btrfs_csum_file_blocks, which
+ * does all the hard work of finding existing items in the file
+ * or adding new ones.
+ *
+ * Returns 0 on success, -EIO if the inode cannot be read and -ENOMEM if
+ * the in-memory copy cannot be allocated.
+ */
+static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct extent_buffer *eb, int slot,
+				      struct btrfs_key *key)
+{
+	int ret;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 cur_offset;
+	unsigned long file_bytes;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct inode *inode;
+	unsigned long ptr;
+
+	/* one checksum in the item covers one sector of file data */
+	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
+	inode = read_one_inode(root, key->objectid);
+	if (!inode) {
+		return -EIO;
+	}
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
+	if (!sums) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&sums->list);
+	sums->len = file_bytes;
+	sums->file_offset = key->offset;
+
+	/*
+	 * copy all the sums into the ordered sum struct.  Only whole
+	 * checksums are copied: 'sums' was sized for item_size /
+	 * BTRFS_CRC32_SIZE entries, and a trailing partial checksum would
+	 * otherwise wrap the unsigned counter and run past the allocation.
+	 */
+	sector_sum = sums->sums;
+	cur_offset = key->offset;
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	while (item_size >= BTRFS_CRC32_SIZE) {
+		sector_sum->offset = cur_offset;
+		read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
+		sector_sum++;
+		item_size -= BTRFS_CRC32_SIZE;
+		ptr += BTRFS_CRC32_SIZE;
+		cur_offset += root->sectorsize;
+	}
+
+	/* let btrfs_csum_file_blocks add them into the file */
+	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	BUG_ON(ret);
+	kfree(sums);
+	iput(inode);
+
+	return 0;
+}
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ *
+ * Returns 0, or -ENOMEM if no path could be allocated for the scan.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	u64 nlink = 0;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	int name_len;
+
+	/* start at the highest possible ref key and walk backwards */
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = (u64)-1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) {
+			/* no exact match: step back to the preceding item */
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &key,
+				      path->slots[0]);
+		if (key.objectid != inode->i_ino ||
+		    key.type != BTRFS_INODE_REF_KEY)
+			break;
+		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
+						   path->slots[0]);
+		/* every (ref header, name) record in the item is one link */
+		while (ptr < ptr_end) {
+			struct btrfs_inode_ref *ref;
+
+			ref = (struct btrfs_inode_ref *)ptr;
+			name_len = btrfs_inode_ref_name_len(path->nodes[0],
+							    ref);
+			ptr = (unsigned long)(ref + 1) + name_len;
+			nlink++;
+		}
+
+		if (key.offset == 0)
+			break;
+		key.offset--;
+		btrfs_release_path(root, path);
+	}
+	btrfs_free_path(path);
+	if (nlink != inode->i_nlink) {
+		inode->i_nlink = nlink;
+		btrfs_update_inode(trans, root, inode);
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the fixup orphan items (objectid BTRFS_TREE_LOG_FIXUP_OBJECTID)
+ * from the highest offset downwards.  Each item is deleted and the inode
+ * named by its offset gets its link count recomputed.
+ *
+ * Always returns 0; the path is released on exit.
+ */
+static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
+					    struct btrfs_root *root,
+					    struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct inode *victim;
+	int err;
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	for (;;) {
+		err = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (err < 0)
+			break;
+
+		/* no exact hit: use the item just before the search position */
+		if (err == 1) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (!(key.objectid == BTRFS_TREE_LOG_FIXUP_OBJECTID &&
+		      key.type == BTRFS_ORPHAN_ITEM_KEY))
+			break;
+
+		err = btrfs_del_item(trans, root, path);
+		BUG_ON(err);
+		btrfs_release_path(root, path);
+
+		/* the item offset is the objectid of the inode to fix */
+		victim = read_one_inode(root, key.offset);
+		BUG_ON(!victim);
+
+		err = fixup_inode_link_count(trans, root, victim);
+		BUG_ON(err);
+
+		iput(victim);
+
+		/* continue just below the item we processed, if possible */
+		if (key.offset == 0)
+			break;
+		key.offset -= 1;
+	}
+
+	btrfs_release_path(root, path);
+	return 0;
+}
+
+
+/*
+ * Remember 'objectid' in the fixup directory so its link count gets
+ * re-checked once replay has finished.  When the fixup item is newly
+ * inserted, the inode's link count is bumped so that it cannot go away
+ * before that check happens.
+ *
+ * Returns 0 both for a new entry and for one that already existed; any
+ * other insertion result is a BUG().
+ */
+static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid)
+{
+	struct inode *inode;
+	struct btrfs_key key;
+	int ret = 0;
+
+	inode = read_one_inode(root, objectid);
+	BUG_ON(!inode);
+
+	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
+	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.offset = objectid;
+
+	/* a zero sized item is enough, only the key matters */
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+	btrfs_release_path(root, path);
+
+	switch (ret) {
+	case 0:
+		/* first time we see this inode: pin it with an extra link */
+		btrfs_inc_nlink(inode);
+		btrfs_update_inode(trans, root, inode);
+		break;
+	case -EEXIST:
+		/* already queued for fixup, nothing more to do */
+		ret = 0;
+		break;
+	default:
+		BUG();
+	}
+
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Directory log replay only adds names whose target inode really exists;
+ * an fsync of a directory does not implicitly fsync the new files inside
+ * it.
+ *
+ * Returns -ENOENT when the target inode is missing, -EIO when the
+ * directory inode cannot be read, otherwise the btrfs_add_link() result.
+ */
+static noinline int insert_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    u64 dirid, u64 index,
+				    char *name, int name_len, u8 type,
+				    struct btrfs_key *location)
+{
+	struct inode *dir = NULL;
+	struct inode *inode;
+	int ret;
+
+	inode = read_one_inode(root, location->objectid);
+	if (!inode)
+		return -ENOENT;
+
+	dir = read_one_inode(root, dirid);
+	if (!dir) {
+		ret = -EIO;
+		goto put_inodes;
+	}
+
+	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
+
+	/* FIXME, put inode into FIXUP list */
+
+put_inodes:
+	iput(inode);
+	if (dir)
+		iput(dir);
+	return ret;
+}
+
+/*
+ * take a single entry in a log directory item and replay it into
+ * the subvolume.
+ *
+ * if a conflicting item exists in the subdirectory already,
+ * the inode it points to is unlinked and put into the link count
+ * fix up tree.
+ *
+ * If a name from the log points to a file or directory that does
+ * not exist in the FS, it is skipped.  fsyncs on directories
+ * do not force down inodes inside that directory, just changes to the
+ * names or unlinks in a directory.
+ *
+ * Returns 0, or -ENOMEM if the name buffer cannot be allocated.
+ */
+static noinline int replay_one_name(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *eb,
+				    struct btrfs_dir_item *di,
+				    struct btrfs_key *key)
+{
+	char *name;
+	int name_len;
+	struct btrfs_dir_item *dst_di;
+	struct btrfs_key found_key;
+	struct btrfs_key log_key;
+	struct inode *dir;
+	struct inode *inode;
+	u8 log_type;
+	int ret;
+
+	dir = read_one_inode(root, key->objectid);
+	BUG_ON(!dir);
+
+	name_len = btrfs_dir_name_len(eb, di);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (!name) {
+		iput(dir);
+		return -ENOMEM;
+	}
+	log_type = btrfs_dir_type(eb, di);
+	/* the name bytes follow the dir item header directly */
+	read_extent_buffer(eb, name, (unsigned long)(di + 1),
+		   name_len);
+
+	/* look up the matching entry in the subvolume by the logged key type */
+	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	if (key->type == BTRFS_DIR_ITEM_KEY) {
+		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
+				       name, name_len, 1);
+	}
+	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
+						     key->objectid,
+						     key->offset, name,
+						     name_len, 1);
+	} else {
+		BUG();
+	}
+	if (!dst_di || IS_ERR(dst_di)) {
+		/* we need a sequence number to insert, so we only
+		 * do inserts for the BTRFS_DIR_INDEX_KEY types
+		 */
+		if (key->type != BTRFS_DIR_INDEX_KEY)
+			goto out;
+		goto insert;
+	}
+
+	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
+	/* the existing item matches the logged item */
+	if (found_key.objectid == log_key.objectid &&
+	    found_key.type == log_key.type &&
+	    found_key.offset == log_key.offset &&
+	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		goto out;
+	}
+
+	/*
+	 * don't drop the conflicting directory entry if the inode
+	 * for the new entry doesn't exist
+	 */
+	inode = read_one_inode(root, log_key.objectid);
+	if (!inode)
+		goto out;
+
+	/* the inode was only read to prove it exists */
+	iput(inode);
+	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
+	BUG_ON(ret);
+
+	if (key->type == BTRFS_DIR_INDEX_KEY)
+		goto insert;
+out:
+	btrfs_release_path(root, path);
+	kfree(name);
+	iput(dir);
+	return 0;
+
+insert:
+	btrfs_release_path(root, path);
+	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
+			      name, name_len, log_type, &log_key);
+
+	/* a missing target inode (-ENOENT) is expected and not an error */
+	if (ret && ret != -ENOENT)
+		BUG();
+	goto out;
+}
+
+/*
+ * Replay every name packed into the directory item at 'slot' of 'eb'.
+ * Only BTRFS_DIR_ITEM_KEY items can carry more than one name, but the
+ * same walk is used for the directory index type as well.
+ *
+ * Always returns 0; a failing replay_one_name() trips BUG_ON().
+ */
+static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct extent_buffer *eb, int slot,
+					struct btrfs_key *key)
+{
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	unsigned long cur = btrfs_item_ptr_offset(eb, slot);
+	unsigned long end = cur + item_size;
+	struct btrfs_dir_item *entry;
+	int entry_name_len;
+	int ret;
+
+	/* entries are laid out back to back: header, then the name bytes */
+	for (; cur < end;
+	     cur = (unsigned long)(entry + 1) + entry_name_len) {
+		entry = (struct btrfs_dir_item *)cur;
+		entry_name_len = btrfs_dir_name_len(eb, entry);
+
+		ret = replay_one_name(trans, root, path, eb, entry, key);
+		BUG_ON(ret);
+	}
+	return 0;
+}
+
+/*
+ * directory replay has two parts.  There are the standard directory
+ * items in the log copied from the subvolume, and range items
+ * created in the log while the subvolume was logged.
+ *
+ * The range items tell us which parts of the key space the log
+ * is authoritative for.  During replay, if a key in the subvolume
+ * directory is in a logged range item, but not actually in the log
+ * that means it was deleted from the directory before the fsync
+ * and should be removed.
+ *
+ * This looks for the range item of 'key_type' for 'dirid' that covers
+ * *start_ret, or failing that the next range item after it.  On success
+ * (return 0) *start_ret and *end_ret hold the bounds of the range found.
+ * Returns 1 when there is no further range and < 0 on a tree search
+ * error.  The path is released on exit.
+ */
+static noinline int find_dir_range(struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   u64 dirid, int key_type,
+				   u64 *start_ret, u64 *end_ret)
+{
+	struct btrfs_key key;
+	u64 found_end;
+	struct btrfs_dir_log_item *item;
+	int ret;
+	int nritems;
+
+	/* the previous range already ended at the top of the key space */
+	if (*start_ret == (u64)-1)
+		return 1;
+
+	key.objectid = dirid;
+	key.type = key_type;
+	key.offset = *start_ret;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		/* no exact match: a covering range would be the item before */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+	}
+	if (ret != 0)
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto next;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+
+	if (*start_ret >= key.offset && *start_ret <= found_end) {
+		ret = 0;
+		*start_ret = key.offset;
+		*end_ret = found_end;
+		goto out;
+	}
+	ret = 1;
+next:
+	/*
+	 * check the next slot in the tree to see if it is a valid item.
+	 * Advance first and only then test against nritems, so that stepping
+	 * off the last item of a leaf moves on to the next leaf instead of
+	 * reading a slot that does not exist.
+	 */
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	path->slots[0]++;
+	if (path->slots[0] >= nritems) {
+		ret = btrfs_next_leaf(root, path);
+		if (ret)
+			goto out;
+	}
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != key_type || key.objectid != dirid) {
+		ret = 1;
+		goto out;
+	}
+	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_dir_log_item);
+	found_end = btrfs_dir_log_end(path->nodes[0], item);
+	*start_ret = key.offset;
+	*end_ret = found_end;
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * this looks for a given directory item in the log.  If the directory
+ * item is not in the log, the item is removed and the inode it points
+ * to is unlinked
+ *
+ * 'path' must point at the subvolume directory item for 'dir_key' on
+ * entry.  Every name packed in that item is looked up in 'log', using a
+ * name lookup for BTRFS_DIR_ITEM_KEY and an index lookup for
+ * BTRFS_DIR_INDEX_KEY.
+ *
+ * Returns 0 on success or -ENOMEM if a name buffer cannot be allocated.
+ * Both 'path' and 'log_path' are released on exit.
+ */
+static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_root *log,
+				      struct btrfs_path *path,
+				      struct btrfs_path *log_path,
+				      struct inode *dir,
+				      struct btrfs_key *dir_key)
+{
+	int ret;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	struct btrfs_dir_item *di;
+	struct btrfs_dir_item *log_di;
+	int name_len;
+	unsigned long ptr;
+	unsigned long ptr_end;
+	char *name;
+	struct inode *inode;
+	struct btrfs_key location;
+
+again:
+	/* (re)load the item bounds; we come back here after every unlink */
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	ptr = btrfs_item_ptr_offset(eb, slot);
+	ptr_end = ptr + item_size;
+	while(ptr < ptr_end) {
+		di = (struct btrfs_dir_item *)ptr;
+		name_len = btrfs_dir_name_len(eb, di);
+		name = kmalloc(name_len, GFP_NOFS);
+		if (!name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		/* the name bytes follow the dir item header directly */
+		read_extent_buffer(eb, name, (unsigned long)(di + 1),
+				  name_len);
+		log_di = NULL;
+		if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+			log_di = btrfs_lookup_dir_item(trans, log, log_path,
+						       dir_key->objectid,
+						       name, name_len, 0);
+		} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+			log_di = btrfs_lookup_dir_index_item(trans, log,
+						     log_path,
+						     dir_key->objectid,
+						     dir_key->offset,
+						     name, name_len, 0);
+		}
+		/* the name is not in the log: remove it from the subvolume */
+		if (!log_di || IS_ERR(log_di)) {
+			/* grab the target key before the leaf is released */
+			btrfs_dir_item_key_to_cpu(eb, di, &location);
+			btrfs_release_path(root, path);
+			btrfs_release_path(log, log_path);
+			inode = read_one_inode(root, location.objectid);
+			BUG_ON(!inode);
+
+			/* queue the inode for link count fixup, then unlink */
+			ret = link_to_fixup_dir(trans, root,
+						path, location.objectid);
+			BUG_ON(ret);
+			btrfs_inc_nlink(inode);
+			ret = btrfs_unlink_inode(trans, root, dir, inode,
+						 name, name_len);
+			BUG_ON(ret);
+			kfree(name);
+			iput(inode);
+
+			/* there might still be more names under this key
+			 * check and repeat if required
+			 */
+			ret = btrfs_search_slot(NULL, root, dir_key, path,
+						0, 0);
+			if (ret == 0)
+				goto again;
+			ret = 0;
+			goto out;
+		}
+		/* name is present in the log, keep it and move on */
+		btrfs_release_path(log, log_path);
+		kfree(name);
+
+		ptr = (unsigned long)(di + 1);
+		ptr += name_len;
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, log_path);
+	return ret;
+}
+
+/*
+ * deletion replay happens before we copy any new directory items
+ * out of the log or out of backreferences from inodes.  It
+ * scans the log to find ranges of keys that log is authoritative for,
+ * and then scans the directory to find items in those ranges that are
+ * not present in the log.
+ *
+ * Anything we don't find in the log is unlinked and removed from the
+ * directory.
+ *
+ * The scan is done twice: first for BTRFS_DIR_LOG_ITEM_KEY ranges against
+ * BTRFS_DIR_ITEM_KEY items, then for BTRFS_DIR_LOG_INDEX_KEY ranges
+ * against BTRFS_DIR_INDEX_KEY items.
+ *
+ * Returns 0 on success (including when the directory inode does not
+ * exist), -ENOMEM if the log path cannot be allocated, or a negative
+ * error from searching the subvolume tree.
+ */
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       u64 dirid)
+{
+	u64 range_start;
+	u64 range_end;
+	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
+	int ret = 0;
+	struct btrfs_key dir_key;
+	struct btrfs_key found_key;
+	struct btrfs_path *log_path;
+	struct inode *dir;
+
+	dir_key.objectid = dirid;
+	dir_key.type = BTRFS_DIR_ITEM_KEY;
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	dir = read_one_inode(root, dirid);
+	/* it isn't an error if the inode isn't there, that can happen
+	 * because we replay the deletes before we copy in the inode item
+	 * from the log
+	 */
+	if (!dir) {
+		btrfs_free_path(log_path);
+		return 0;
+	}
+again:
+	/* one full pass per key type, each starting at offset 0 */
+	range_start = 0;
+	range_end = 0;
+	while(1) {
+		/* find the next logged range at or after range_start */
+		ret = find_dir_range(log, path, dirid, key_type,
+				     &range_start, &range_end);
+		if (ret != 0)
+			break;
+
+		dir_key.offset = range_start;
+		/* visit every subvolume dir item that falls inside the range */
+		while(1) {
+			int nritems;
+			ret = btrfs_search_slot(NULL, root, &dir_key, path,
+						0, 0);
+			if (ret < 0)
+				goto out;
+
+			/* search landed past the last item of this leaf */
+			nritems = btrfs_header_nritems(path->nodes[0]);
+			if (path->slots[0] >= nritems) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret)
+					break;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+			/* ran out of items of this type for this directory */
+			if (found_key.objectid != dirid ||
+			    found_key.type != dir_key.type)
+				goto next_type;
+
+			if (found_key.offset > range_end)
+				break;
+
+			ret = check_item_in_log(trans, root, log, path,
+						log_path, dir, &found_key);
+			BUG_ON(ret);
+			/* stop before the + 1 below would wrap around */
+			if (found_key.offset == (u64)-1)
+				break;
+			dir_key.offset = found_key.offset + 1;
+		}
+		btrfs_release_path(root, path);
+		/* same overflow guard for the range cursor */
+		if (range_end == (u64)-1)
+			break;
+		range_start = range_end + 1;
+	}
+
+next_type:
+	ret = 0;
+	/* after the dir item pass, repeat everything for dir index items */
+	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
+		key_type = BTRFS_DIR_LOG_INDEX_KEY;
+		dir_key.type = BTRFS_DIR_INDEX_KEY;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+out:
+	btrfs_release_path(root, path);
+	btrfs_free_path(log_path);
+	iput(dir);
+	return ret;
+}
+
+/*
+ * the process_func used to replay items from the log tree.  This
+ * gets called in two different stages.  The first stage just looks
+ * for inodes and makes sure they are all copied into the subvolume.
+ *
+ * The second stage copies all the other item types from the log into
+ * the subvolume.  The two stage approach is slower, but gets rid of
+ * lots of complexity around inodes referencing other inodes that exist
+ * only in the log (references come from either directory items or inode
+ * back refs).
+ *
+ * 'eb' is the log tree block being processed and 'gen' the generation
+ * passed to btrfs_read_buffer() for it.  Blocks that are not leaves are
+ * ignored.  Items are replayed into wc->replay_dest.
+ *
+ * Always returns 0; replay failures hit BUG_ON.
+ */
+static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
+			     struct walk_control *wc, u64 gen)
+{
+	int nritems;
+	struct btrfs_path *path;
+	struct btrfs_root *root = wc->replay_dest;
+	struct btrfs_key key;
+	int level;
+	int i;
+	int ret;
+
+	btrfs_read_buffer(eb, gen);
+
+	level = btrfs_header_level(eb);
+
+	if (level != 0)
+		return 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		/* inode keys are done during the first stage */
+		if (key.type == BTRFS_INODE_ITEM_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_INODES) {
+			struct inode *inode;
+			struct btrfs_inode_item *inode_item;
+			u32 mode;
+
+			inode_item = btrfs_item_ptr(eb, i,
+					    struct btrfs_inode_item);
+			mode = btrfs_inode_mode(eb, inode_item);
+
+			/* directory deletions are replayed before the
+			 * inode item itself is copied in
+			 */
+			if (S_ISDIR(mode)) {
+				ret = replay_dir_deletes(wc->trans,
+					 root, log, path, key.objectid);
+				BUG_ON(ret);
+			}
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+
+			/* for regular files, truncate away
+			 * extents past the new EOF
+			 */
+			if (S_ISREG(mode)) {
+				inode = read_one_inode(root,
+						       key.objectid);
+				BUG_ON(!inode);
+
+				ret = btrfs_truncate_inode_items(wc->trans,
+					root, inode, inode->i_size,
+					BTRFS_EXTENT_DATA_KEY);
+				BUG_ON(ret);
+				iput(inode);
+			}
+			ret = link_to_fixup_dir(wc->trans, root,
+						path, key.objectid);
+			BUG_ON(ret);
+		}
+
+		/* everything below only runs in the final stage */
+		if (wc->stage < LOG_WALK_REPLAY_ALL)
+			continue;
+
+		/* these keys are simply copied */
+		if (key.type == BTRFS_XATTR_ITEM_KEY) {
+			ret = overwrite_item(wc->trans, root, path,
+					     eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_INODE_REF_KEY) {
+			ret = add_inode_ref(wc->trans, root, log, path,
+					    eb, i, &key);
+			/* -ENOENT from add_inode_ref is tolerated */
+			BUG_ON(ret && ret != -ENOENT);
+		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+			ret = replay_one_extent(wc->trans, root, path,
+						eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+			ret = replay_one_csum(wc->trans, root, path,
+					      eb, i, &key);
+			BUG_ON(ret);
+		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
+			   key.type == BTRFS_DIR_INDEX_KEY) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			BUG_ON(ret);
+		}
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * walk from path->nodes[*level] down towards the leaves, handing each
+ * child block to wc->process_func.  Children of a level 1 node are
+ * processed (and freed when wc->free is set) without being added to the
+ * path.  Once the current node has no more slots to visit (or *level is
+ * already 0) the node itself is processed, optionally freed, dropped
+ * from the path, and *level is moved up by one.
+ *
+ * Always returns 0.
+ */
+static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level,
+				   struct walk_control *wc)
+{
+	struct extent_buffer *node;
+	struct extent_buffer *child;
+	struct extent_buffer *owner_eb;
+	u64 owner;
+	u64 owner_gen;
+	u64 blk_start;
+	u64 blk_gen;
+	u32 blk_size;
+	int err;
+
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	while (*level > 0) {
+		WARN_ON(*level < 0);
+		WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+		node = path->nodes[*level];
+		WARN_ON(btrfs_header_level(node) != *level);
+
+		/* every pointer in this node has been visited */
+		if (path->slots[*level] >= btrfs_header_nritems(node))
+			break;
+
+		blk_start = btrfs_node_blockptr(node, path->slots[*level]);
+		blk_gen = btrfs_node_ptr_generation(node, path->slots[*level]);
+		blk_size = btrfs_level_size(root, *level - 1);
+
+		/* owner and generation come from the node holding the pointer */
+		owner = btrfs_header_owner(node);
+		owner_gen = btrfs_header_generation(node);
+
+		child = btrfs_find_create_tree_block(root, blk_start,
+						     blk_size);
+		wc->process_func(root, child, wc, blk_gen);
+
+		if (*level != 1) {
+			/* interior child: push it onto the path and descend */
+			btrfs_read_buffer(child, blk_gen);
+
+			WARN_ON(*level <= 0);
+			if (path->nodes[*level - 1])
+				free_extent_buffer(path->nodes[*level - 1]);
+			path->nodes[*level - 1] = child;
+			*level = btrfs_header_level(child);
+			path->slots[*level] = 0;
+			cond_resched();
+			continue;
+		}
+
+		/* child of a level 1 node: finish with it right here */
+		path->slots[*level]++;
+		if (wc->free) {
+			btrfs_read_buffer(child, blk_gen);
+
+			btrfs_tree_lock(child);
+			clean_tree_block(trans, root, child);
+			btrfs_wait_tree_block_writeback(child);
+			btrfs_tree_unlock(child);
+
+			err = btrfs_drop_leaf_ref(trans, root, child);
+			BUG_ON(err);
+
+			WARN_ON(owner != BTRFS_TREE_LOG_OBJECTID);
+			err = btrfs_free_extent(trans, root, blk_start,
+						blk_size, owner,
+						owner_gen, 0, 0, 1);
+			BUG_ON(err);
+		}
+		free_extent_buffer(child);
+	}
+	WARN_ON(*level < 0);
+	WARN_ON(*level >= BTRFS_MAX_LEVEL);
+
+	/* now deal with the node we stopped on */
+	node = path->nodes[*level];
+	owner_eb = (node == root->node) ? node : path->nodes[*level + 1];
+	blk_start = node->start;
+
+	blk_size = btrfs_level_size(root, *level);
+	owner = btrfs_header_owner(owner_eb);
+	owner_gen = btrfs_header_generation(owner_eb);
+
+	wc->process_func(root, node, wc, btrfs_header_generation(node));
+
+	if (wc->free) {
+		btrfs_tree_lock(node);
+		clean_tree_block(trans, root, node);
+		btrfs_wait_tree_block_writeback(node);
+		btrfs_tree_unlock(node);
+
+		if (*level == 0) {
+			err = btrfs_drop_leaf_ref(trans, root, node);
+			BUG_ON(err);
+		}
+		WARN_ON(owner != BTRFS_TREE_LOG_OBJECTID);
+		err = btrfs_free_extent(trans, root, blk_start, blk_size,
+					owner, owner_gen, 0, 0, 1);
+		BUG_ON(err);
+	}
+	free_extent_buffer(node);
+	path->nodes[*level] = NULL;
+	(*level)++;
+
+	cond_resched();
+	return 0;
+}
+
+/*
+ * walk back up the path starting at *level.  If a node on the way up
+ * still has slots after the current one, its slot is advanced, *level
+ * is set to that node's level and 0 is returned so the caller can walk
+ * down again.  Nodes with nothing left to visit are passed to
+ * wc->process_func, freed when wc->free is set, and dropped from the
+ * path.
+ *
+ * Returns 1 when the loop runs out of path nodes to climb.
+ */
+static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path, int *level,
+				 struct walk_control *wc)
+{
+	u64 root_owner;
+	u64 root_gen;
+	int i;
+	int slot;
+	int ret;
+
+	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+		slot = path->slots[i];
+		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+			/* more pointers to visit at this level */
+			path->slots[i]++;
+			*level = i;
+			WARN_ON(*level == 0);
+			return 0;
+		}
+
+		/* this node is finished: process it and drop it.  *level
+		 * equals i here, it is only updated at the bottom of the
+		 * loop body
+		 */
+		if (path->nodes[*level] == root->node) {
+			root_owner = root->root_key.objectid;
+			root_gen =
+			   btrfs_header_generation(path->nodes[*level]);
+		} else {
+			struct extent_buffer *node;
+			node = path->nodes[*level + 1];
+			root_owner = btrfs_header_owner(node);
+			root_gen = btrfs_header_generation(node);
+		}
+		wc->process_func(root, path->nodes[*level], wc,
+			 btrfs_header_generation(path->nodes[*level]));
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[*level];
+
+			btrfs_tree_lock(next);
+			clean_tree_block(trans, root, next);
+			btrfs_wait_tree_block_writeback(next);
+			btrfs_tree_unlock(next);
+
+			if (*level == 0) {
+				ret = btrfs_drop_leaf_ref(trans, root,
+							  next);
+				BUG_ON(ret);
+			}
+
+			WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_extent(trans, root,
+					path->nodes[*level]->start,
+					path->nodes[*level]->len,
+					root_owner, root_gen, 0, 0, 1);
+			BUG_ON(ret);
+		}
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		*level = i + 1;
+	}
+	return 1;
+}
+
+/*
+ * walk the log tree 'log', alternating walk_down_log_tree() and
+ * walk_up_log_tree() until one of them returns a positive value, and
+ * let them call wc->process_func on the blocks they visit.
+ *
+ * If the root block is still in the path afterwards it is processed
+ * here.  When wc->free is set, processed blocks are cleaned and their
+ * extents freed, and the reference on log->node is dropped before
+ * returning.
+ *
+ * The path allocation and the free calls are checked with BUG_ON.
+ * Returns the last value stored in ret.
+ */
+static int walk_log_tree(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *log, struct walk_control *wc)
+{
+	int ret = 0;
+	int wret;
+	int level;
+	struct btrfs_path *path;
+	int i;
+	int orig_level;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	level = btrfs_header_level(log->node);
+	orig_level = level;
+	path->nodes[level] = log->node;
+	extent_buffer_get(log->node); /* extra ref for the copy stored in the path */
+	path->slots[level] = 0;
+
+	while(1) {
+		wret = walk_down_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+
+		wret = walk_up_log_tree(trans, log, path, &level, wc);
+		if (wret > 0)
+			break;
+		if (wret < 0)
+			ret = wret;
+	}
+
+	/* was the root node processed? if not, catch it here */
+	if (path->nodes[orig_level]) {
+		wc->process_func(log, path->nodes[orig_level], wc,
+			 btrfs_header_generation(path->nodes[orig_level]));
+		if (wc->free) {
+			struct extent_buffer *next;
+
+			next = path->nodes[orig_level];
+
+			btrfs_tree_lock(next);
+			clean_tree_block(trans, log, next);
+			btrfs_wait_tree_block_writeback(next);
+			btrfs_tree_unlock(next);
+
+			if (orig_level == 0) {
+				ret = btrfs_drop_leaf_ref(trans, log,
+							  next);
+				BUG_ON(ret);
+			}
+			WARN_ON(log->root_key.objectid !=
+				BTRFS_TREE_LOG_OBJECTID);
+			ret = btrfs_free_extent(trans, log,
+						next->start, next->len,
+						log->root_key.objectid,
+						btrfs_header_generation(next),
+						0, 0, 1);
+			BUG_ON(ret);
+		}
+	}
+
+	for (i = 0; i <= orig_level; i++) { /* drop whatever is still held in the path */
+		if (path->nodes[i]) {
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
+		}
+	}
+	btrfs_free_path(path);
+	if (wc->free)
+		free_extent_buffer(log->node);
+	return ret;
+}
+
+/*
+ * sleep until the log commit that is in flight has finished.  The
+ * tree_log_mutex is dropped around the sleep and taken again before the
+ * state is rechecked, so it is expected to be held on entry and is held
+ * again on return.
+ *
+ * The wait ends once tree_log_transid has moved past the value seen on
+ * entry or tree_log_commit has dropped to zero.  Always returns 0.
+ */
+int wait_log_commit(struct btrfs_root *log)
+{
+	struct btrfs_fs_info *fs_info = log->fs_info;
+	u64 start_transid = fs_info->tree_log_transid;
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait(&fs_info->tree_log_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&fs_info->tree_log_mutex);
+		if (atomic_read(&fs_info->tree_log_commit))
+			schedule();
+		finish_wait(&fs_info->tree_log_wait, &wait);
+		mutex_lock(&fs_info->tree_log_mutex);
+
+		if (start_transid != fs_info->tree_log_transid)
+			break;
+		if (!atomic_read(&fs_info->tree_log_commit))
+			break;
+	}
+	return 0;
+}
+
+/*
+ * btrfs_sync_log sends a given tree log down to the disk and
+ * updates the super blocks to record it.  When this call is done,
+ * you know that any inodes previously logged are safely on disk
+ *
+ * Only one task commits at a time: if tree_log_commit is already set
+ * the caller waits in wait_log_commit() and returns.  Otherwise it
+ * sets tree_log_commit, waits until tree_log_writers has drained and
+ * tree_log_batch has stopped changing, walks both the log and the log
+ * root tree with wc.write set and then again with wc.wait also set,
+ * records the log root tree in super_for_commit and calls
+ * write_ctree_super().
+ *
+ * Always returns 0; failures of the tree walks hit BUG_ON.
+ */
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root)
+{
+	int ret;
+	unsigned long batch;
+	struct btrfs_root *log = root->log_root;
+	struct walk_control wc = {
+		.write = 1,
+		.process_func = process_one_buffer
+	};
+
+	mutex_lock(&log->fs_info->tree_log_mutex);
+	if (atomic_read(&log->fs_info->tree_log_commit)) { /* another task is already committing */
+		wait_log_commit(log);
+		goto out;
+	}
+	atomic_set(&log->fs_info->tree_log_commit, 1);
+
+	while(1) {
+		mutex_unlock(&log->fs_info->tree_log_mutex);
+		schedule_timeout_uninterruptible(1);
+		mutex_lock(&log->fs_info->tree_log_mutex);
+		batch = log->fs_info->tree_log_batch;
+
+		while(atomic_read(&log->fs_info->tree_log_writers)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			batch = log->fs_info->tree_log_batch;
+			mutex_unlock(&log->fs_info->tree_log_mutex);
+			if (atomic_read(&log->fs_info->tree_log_writers))
+				schedule();
+			mutex_lock(&log->fs_info->tree_log_mutex);
+			finish_wait(&log->fs_info->tree_log_wait, &wait);
+		}
+		if (batch == log->fs_info->tree_log_batch) /* batch counter unchanged since it was sampled */
+			break;
+	}
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	wc.wait = 1; /* second pass over both trees, now with wait set */
+
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	BUG_ON(ret);
+
+	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+				 log->fs_info->log_root_tree->node->start);
+	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+		       btrfs_header_level(log->fs_info->log_root_tree->node));
+
+	write_ctree_super(trans, log->fs_info->tree_root);
+	log->fs_info->tree_log_transid++;
+	log->fs_info->tree_log_batch = 0;
+	atomic_set(&log->fs_info->tree_log_commit, 0);
+	smp_mb(); /* barrier between clearing tree_log_commit and the waitqueue check */
+	if (waitqueue_active(&log->fs_info->tree_log_wait))
+		wake_up(&log->fs_info->tree_log_wait);
+out:
+	mutex_unlock(&log->fs_info->tree_log_mutex);
+	return 0;
+
+}
+
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ *
+ * The log tree is walked with wc.free set, its root item is deleted
+ * from the tree of log roots, and the in-memory log root is detached
+ * from 'root' and freed.
+ *
+ * Returns 0, including when 'root' has no log tree.  Failures of the
+ * walk or of the root deletion hit BUG_ON.
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_root *log = root->log_root;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	if (!log)
+		return 0;
+
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+			     &log->root_key);
+	BUG_ON(ret);
+
+	/* free through the saved pointer: clearing root->log_root first
+	 * and then passing it to kfree() would free NULL and leak the log
+	 */
+	root->log_root = NULL;
+	kfree(log);
+	return 0;
+}
+
+/*
+ * record the current location of a subvolume's log root in its item in
+ * the tree of log roots.  Nothing is written when the bytenr stored in
+ * the root item already matches log->node.
+ *
+ * Returns 0; a failed btrfs_update_root() hits BUG_ON.
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int err;
+
+	if (btrfs_root_bytenr(&log->root_item) == log->node->start)
+		return 0;
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+
+	err = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	BUG_ON(err);
+	return err;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimization allows us to avoid relogging the entire inode
+ * or the entire directory.
+ *
+ * Both the dir item and the dir index item for 'name' are removed from
+ * the log if they are found there, and the directory size stored in
+ * the logged inode item is reduced by name_len for each one removed.
+ *
+ * Returns 0 when join_running_log_trans() fails (nothing to do) and
+ * after a normal run, or -ENOMEM if the path cannot be allocated.
+ */
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index)
+{
+	struct btrfs_root *log;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	int ret;
+	int bytes_del = 0;
+
+	ret = join_running_log_trans(root);
+	if (ret)
+		return 0;
+
+	/* allocate before taking the log mutex so a failure only has to
+	 * undo the join above
+	 */
+	path = btrfs_alloc_path();
+	if (!path) {
+		end_log_trans(root);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
+
+	log = root->log_root;
+	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+				   name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+	btrfs_release_path(log, path);
+	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+					 index, name, name_len, -1);
+	if (di && !IS_ERR(di)) {
+		ret = btrfs_delete_one_dir_name(trans, log, path, di);
+		bytes_del += name_len;
+		BUG_ON(ret);
+	}
+
+	/* update the directory size in the log to reflect the names
+	 * we have removed
+	 */
+	if (bytes_del) {
+		struct btrfs_key key;
+
+		key.objectid = dir->i_ino;
+		key.offset = 0;
+		key.type = BTRFS_INODE_ITEM_KEY;
+		btrfs_release_path(log, path);
+
+		ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+		if (ret == 0) {
+			struct btrfs_inode_item *item;
+			u64 i_size;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			i_size = btrfs_inode_size(path->nodes[0], item);
+			/* clamp at zero instead of wrapping */
+			if (i_size > bytes_del)
+				i_size -= bytes_del;
+			else
+				i_size = 0;
+			btrfs_set_inode_size(path->nodes[0], item, i_size);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+		} else
+			ret = 0;
+		btrfs_release_path(log, path);
+	}
+
+	btrfs_free_path(path);
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
+	end_log_trans(root);
+
+	return 0;
+}
+
+/*
+ * see comments for btrfs_del_dir_entries_in_log
+ *
+ * removes the back reference (name, dirid) of 'inode' from the log.
+ * Returns 0 if there is no running log transaction to join, if the ref
+ * was deleted, or if btrfs_del_inode_ref() reported -ENOENT; any other
+ * error is passed back.
+ */
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid)
+{
+	struct btrfs_root *log;
+	u64 index;
+	int err;
+
+	if (join_running_log_trans(root))
+		return 0;
+
+	log = root->log_root;
+
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+	err = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+				  dirid, &index);
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	end_log_trans(root);
+
+	/* a ref that was not in the log is not an error */
+	return err == -ENOENT ? 0 : err;
+}
+
+/*
+ * insert a range item for directory 'dirid' into the log.  The item is
+ * keyed at first_offset and stores last_offset as its end; together
+ * they tell us which part of the key space the log should be
+ * considered authoritative for.
+ *
+ * key_type BTRFS_DIR_ITEM_KEY produces a BTRFS_DIR_LOG_ITEM_KEY item,
+ * anything else a BTRFS_DIR_LOG_INDEX_KEY item.  Returns 0; a failed
+ * insertion hits BUG_ON.
+ */
+static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *log,
+				       struct btrfs_path *path,
+				       int key_type, u64 dirid,
+				       u64 first_offset, u64 last_offset)
+{
+	struct btrfs_dir_log_item *range;
+	struct extent_buffer *leaf;
+	struct btrfs_key range_key;
+	int err;
+
+	range_key.objectid = dirid;
+	range_key.offset = first_offset;
+	range_key.type = (key_type == BTRFS_DIR_ITEM_KEY) ?
+			 BTRFS_DIR_LOG_ITEM_KEY : BTRFS_DIR_LOG_INDEX_KEY;
+
+	err = btrfs_insert_empty_item(trans, log, path, &range_key,
+				      sizeof(*range));
+	BUG_ON(err);
+
+	leaf = path->nodes[0];
+	range = btrfs_item_ptr(leaf, path->slots[0],
+			       struct btrfs_dir_log_item);
+	btrfs_set_dir_log_end(leaf, range, last_offset);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/*
+ * log all the items included in the current transaction for a given
+ * directory.  This also creates the range items in the log tree required
+ * to replay anything deleted before the fsync
+ *
+ * Only items of 'key_type' with an offset of at least min_offset are
+ * considered.  'path' is used to search the subvolume and 'dst_path'
+ * is handed to overwrite_item() for the log.
+ *
+ * One call logs a single run of items and inserts one range item
+ * covering [first_offset, last_offset] through insert_dir_log_key().
+ * The end of the range is returned in *last_offset_ret; it is (u64)-1
+ * when nothing of this type is left to log.
+ *
+ * Returns 0, or a negative value if btrfs_search_slot() fails while
+ * looking for the last existing item.  On that error path neither the
+ * range item nor *last_offset_ret is written.
+ */
+static noinline int log_dir_items(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path, int key_type,
+			  u64 min_offset, u64 *last_offset_ret)
+{
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	struct extent_buffer *src;
+	int ret;
+	int i;
+	int nritems;
+	u64 first_offset = min_offset;
+	u64 last_offset = (u64)-1;
+
+	log = root->log_root;
+	max_key.objectid = inode->i_ino;
+	max_key.offset = (u64)-1;
+	max_key.type = key_type;
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = key_type;
+	min_key.offset = min_offset;
+
+	path->keep_locks = 1;
+
+	ret = btrfs_search_forward(root, &min_key, &max_key,
+				   path, 0, trans->transid);
+
+	/*
+	 * we didn't find anything from this transaction, see if there
+	 * is anything at all
+	 */
+	if (ret != 0 || min_key.objectid != inode->i_ino ||
+	    min_key.type != key_type) {
+		min_key.objectid = inode->i_ino;
+		min_key.type = key_type;
+		min_key.offset = (u64)-1;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_release_path(root, path);
+			return ret;
+		}
+		ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+
+		/* if ret == 0 there are items for this type,
+		 * create a range to tell us the last key of this type.
+		 * otherwise, there are no items in this directory after
+		 * *min_offset, and we create a range to indicate that.
+		 */
+		if (ret == 0) {
+			struct btrfs_key tmp;
+			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
+					      path->slots[0]);
+			if (key_type == tmp.type) {
+				first_offset = max(min_offset, tmp.offset) + 1;
+			}
+		}
+		goto done;
+	}
+
+	/* go backward to find any previous key */
+	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+	if (ret == 0) {
+		struct btrfs_key tmp;
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (key_type == tmp.type) {
+			first_offset = tmp.offset;
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp); /* NOTE(review): result is overwritten below without being checked */
+		}
+	}
+	btrfs_release_path(root, path);
+
+	/* find the first key from this transaction again */
+	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+	if (ret != 0) {
+		WARN_ON(1);
+		goto done;
+	}
+
+	/*
+	 * we have a block from this transaction, log every item in it
+	 * from our directory
+	 */
+	while(1) {
+		struct btrfs_key tmp;
+		src = path->nodes[0];
+		nritems = btrfs_header_nritems(src);
+		for (i = path->slots[0]; i < nritems; i++) {
+			btrfs_item_key_to_cpu(src, &min_key, i);
+
+			if (min_key.objectid != inode->i_ino ||
+			    min_key.type != key_type)
+				goto done;
+			ret = overwrite_item(trans, log, dst_path, src, i,
+					     &min_key);
+			BUG_ON(ret);
+		}
+		path->slots[0] = nritems;
+
+		/*
+		 * look ahead to the next item and see if it is also
+		 * from this directory and from this transaction
+		 */
+		ret = btrfs_next_leaf(root, path);
+		if (ret == 1) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
+		if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+			last_offset = (u64)-1;
+			goto done;
+		}
+		if (btrfs_header_generation(path->nodes[0]) != trans->transid) { /* next leaf has a different generation: log its first item as the range end */
+			ret = overwrite_item(trans, log, dst_path,
+					     path->nodes[0], path->slots[0],
+					     &tmp);
+
+			BUG_ON(ret);
+			last_offset = tmp.offset;
+			goto done;
+		}
+	}
+done:
+	*last_offset_ret = last_offset;
+	btrfs_release_path(root, path);
+	btrfs_release_path(log, dst_path);
+
+	/* insert the log range keys to indicate where the log is valid */
+	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+				 first_offset, last_offset);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * logging a directory works much like logging an inode: every item
+ * from the current transaction is found and written to the log.
+ *
+ * During recovery the directory in the subvolume is scanned, and a key
+ * that lies inside a logged range but is missing from the log tree is
+ * taken to mean the dir entry was unlinked during the transaction.
+ *
+ * For that scan to work the log has to include one key smaller than
+ * the smallest key logged by this transaction and one key larger than
+ * the largest.
+ *
+ * The directory is logged in two passes, BTRFS_DIR_ITEM_KEY first and
+ * BTRFS_DIR_INDEX_KEY second.  Returns 0; errors from log_dir_items()
+ * hit BUG_ON.
+ */
+static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct inode *inode,
+			  struct btrfs_path *path,
+			  struct btrfs_path *dst_path)
+{
+	int key_type = BTRFS_DIR_ITEM_KEY;
+	int err;
+
+	for (;;) {
+		u64 next_offset = 0;
+		u64 last_logged = 0;
+
+		/* keep logging ranges until the end of the key space */
+		do {
+			err = log_dir_items(trans, root, inode, path,
+					    dst_path, key_type, next_offset,
+					    &last_logged);
+			BUG_ON(err);
+			next_offset = last_logged + 1;
+		} while (last_logged != (u64)-1);
+
+		if (key_type != BTRFS_DIR_ITEM_KEY)
+			break;
+		key_type = BTRFS_DIR_INDEX_KEY;
+	}
+	return 0;
+}
+
+/*
+ * helper used before relogging an inode: delete the items belonging to
+ * 'objectid' from the log, working backwards from the key
+ * (objectid, max_key_type, (u64)-1).  max_key_type is the highest item
+ * type that gets removed.
+ *
+ * Do not use this for file data extents, it does not free the extents
+ * they point to.
+ *
+ * Always returns 0; a failed delete hits BUG_ON.
+ */
+static int drop_objectid_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  u64 objectid, int max_key_type)
+{
+	struct btrfs_key search_key;
+	struct btrfs_key prev_key;
+	int err;
+
+	search_key.objectid = objectid;
+	search_key.type = max_key_type;
+	search_key.offset = (u64)-1;
+
+	for (;;) {
+		err = btrfs_search_slot(trans, log, &search_key, path, -1, 1);
+
+		/* stop unless the search returned 1 and there is an item
+		 * in front of the returned slot in this leaf
+		 */
+		if (err != 1 || path->slots[0] == 0)
+			break;
+
+		path->slots[0]--;
+		btrfs_item_key_to_cpu(path->nodes[0], &prev_key,
+				      path->slots[0]);
+		if (prev_key.objectid != objectid)
+			break;
+
+		err = btrfs_del_item(trans, log, path);
+		BUG_ON(err);
+		btrfs_release_path(log, path);
+	}
+
+	btrfs_release_path(log, path);
+	return 0;
+}
+
+/* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+ *
+ * Any items from this inode changed by the current transaction are copied
+ * to the log tree.  An extra reference is taken on any extents in this
+ * file, allowing us to avoid a whole pile of corner cases around logging
+ * blocks that have been removed from the tree.
+ *
+ * See LOG_INODE_ALL and related defines for a description of what inode_only
+ * does.
+ *
+ * This handles both files and directories.
+ *
+ * Outline: the items already in the log for this inode are dropped,
+ * then every item that btrfs_search_forward() finds between the inode
+ * item and max_key is copied into the log.  With LOG_INODE_EXISTS the
+ * copied inode item has its size and generation set to zero.  For a
+ * directory logged with LOG_INODE_ALL the directory items are logged by
+ * log_directory_changes().  Finally the log root item is updated under
+ * tree_log_mutex.
+ *
+ * Always returns 0; failures hit BUG_ON.
+ * NOTE(review): neither btrfs_alloc_path() result is checked.
+ */
+static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, struct inode *inode,
+			     int inode_only)
+{
+	struct btrfs_path *path;
+	struct btrfs_path *dst_path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct btrfs_root *log = root->log_root;
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct extent_buffer *src;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	u32 size;
+	int ret;
+
+	log = root->log_root;
+
+	path = btrfs_alloc_path();
+	dst_path = btrfs_alloc_path();
+
+	min_key.objectid = inode->i_ino;
+	min_key.type = BTRFS_INODE_ITEM_KEY;
+	min_key.offset = 0;
+
+	max_key.objectid = inode->i_ino;
+	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+		max_key.type = BTRFS_XATTR_ITEM_KEY;
+	else
+		max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	/*
+	 * if this inode has already been logged and we're in inode_only
+	 * mode, we don't want to delete the things that have already
+	 * been written to the log.
+	 *
+	 * But, if the inode has been through an inode_only log,
+	 * the logged_trans field is not set.  This allows us to catch
+	 * any new names for this inode in the backrefs by logging it
+	 * again
+	 */
+	if (inode_only == LOG_INODE_EXISTS &&
+	    BTRFS_I(inode)->logged_trans == trans->transid) {
+		btrfs_free_path(path);
+		btrfs_free_path(dst_path);
+		goto out;
+	}
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+
+	/*
+	 * a brute force approach to making sure we get the most uptodate
+	 * copies of everything.
+	 */
+	if (S_ISDIR(inode->i_mode)) {
+		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
+
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
+		ret = drop_objectid_items(trans, log, path,
+					  inode->i_ino, max_key_type);
+	} else {
+		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+	}
+	BUG_ON(ret);
+	path->keep_locks = 1;
+
+	while(1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, trans->transid);
+		if (ret != 0)
+			break;
+
+		if (min_key.objectid != inode->i_ino) /* moved on to another inode */
+			break;
+		if (min_key.type > max_key.type) /* past the highest type we log */
+			break;
+
+		src = path->nodes[0];
+		size = btrfs_item_size_nr(src, path->slots[0]);
+		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
+					      size);
+		if (ret)
+			BUG();
+
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
+
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, size);
+
+		if (inode_only == LOG_INODE_EXISTS &&
+		    min_key.type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recover code
+			 * can tell the difference between an logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, path->slots[0],
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   log->root_key.objectid,
+						   0,
+						   inode->i_ino,
+						   min_key.offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+
+		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+
+		if (min_key.offset < (u64)-1) /* step to the next possible key, carrying into type and then objectid */
+			min_key.offset++;
+		else if (min_key.type < (u8)-1)
+			min_key.type++;
+		else if (min_key.objectid < (u64)-1)
+			min_key.objectid++;
+		else
+			break;
+	}
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+		btrfs_release_path(root, path);
+		btrfs_release_path(log, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		BUG_ON(ret);
+	}
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	btrfs_free_path(path);
+	btrfs_free_path(dst_path);
+
+	mutex_lock(&root->fs_info->tree_log_mutex);
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+	mutex_unlock(&root->fs_info->tree_log_mutex);
+out:
+	return 0;
+}
+
+/*
+ * log one inode: wraps __btrfs_log_inode() in start_log_trans() and
+ * end_log_trans() and hands back its return value.
+ */
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only)
+{
+	int err;
+
+	start_log_trans(trans, root);
+
+	err = __btrfs_log_inode(trans, root, inode, inode_only);
+
+	end_log_trans(root);
+	return err;
+}
+
+/*
+ * wrapper around __btrfs_log_inode that also gets newly created parent
+ * directories into the log.  The dentry's own inode is logged with
+ * LOG_INODE_ALL; each parent after it gets the minimal LOG_INODE_EXISTS
+ * logging.  The walk up stops at the first parent that is missing, has
+ * no inode, lives on another super block, or whose generation is not
+ * newer than the last committed transaction.
+ *
+ * Always returns 0; logging failures hit BUG_ON.
+ */
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct dentry *dentry)
+{
+	struct super_block *sb;
+	int mode = LOG_INODE_ALL;
+	int err;
+
+	start_log_trans(trans, root);
+	sb = dentry->d_inode->i_sb;
+
+	do {
+		err = __btrfs_log_inode(trans, root, dentry->d_inode, mode);
+		BUG_ON(err);
+
+		/* everything above the first inode is logged minimally */
+		mode = LOG_INODE_EXISTS;
+		dentry = dentry->d_parent;
+	} while (dentry && dentry->d_inode &&
+		 dentry->d_inode->i_sb == sb &&
+		 BTRFS_I(dentry->d_inode)->generation >
+		 root->fs_info->last_trans_committed);
+
+	end_log_trans(root);
+	return 0;
+}
+
+/*
+ * logging a dentry is not safe once the chunk root has added new
+ * chunks.  Returns 1 without logging anything in that case, and the
+ * caller must then commit the transaction to get its data safely on
+ * disk.  Otherwise the dentry is logged and the result of
+ * btrfs_log_dentry() is returned.
+ */
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry)
+{
+	u64 new_bg_gen = root->fs_info->last_trans_new_blockgroup;
+
+	if (new_bg_gen > root->fs_info->last_trans_committed)
+		return 1;
+
+	return btrfs_log_dentry(trans, root, dentry);
+}
+
+/*
+ * should be called during mount to recover and replay any log trees
+ * from the FS
+ *
+ * The log root tree itself is walked once with wc.pin set.  The log
+ * roots stored in it (objectid BTRFS_TREE_LOG_OBJECTID) are then walked
+ * three times, from the highest key offset down: a pin pass, a pass
+ * with replay_one_buffer in stage LOG_WALK_REPLAY_INODES, and a last
+ * pass in stage LOG_WALK_REPLAY_ALL which is followed by
+ * fixup_inode_link_counts().  The found key's offset is used as the
+ * objectid of the root the log is replayed into.
+ *
+ * The transaction is committed at the end and log_root_tree is freed.
+ * Always returns 0; failures hit BUG_ON.
+ */
+int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_key tmp_key;
+	struct btrfs_root *log;
+	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	struct walk_control wc = {
+		.process_func = process_one_buffer,
+		.stage = 0,
+	};
+
+	fs_info->log_root_recovering = 1;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+	wc.trans = trans;
+	wc.pin = 1;
+
+	walk_log_tree(trans, log_root_tree, &wc);
+
+again:
+	key.objectid = BTRFS_TREE_LOG_OBJECTID;
+	key.offset = (u64)-1;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
+		if (ret < 0)
+			break;
+		if (ret > 0) { /* no exact match: use the item just before the returned slot */
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		btrfs_release_path(log_root_tree, path);
+		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			break;
+
+		log = btrfs_read_fs_root_no_radix(log_root_tree,
+						  &found_key);
+		BUG_ON(!log);
+
+
+		tmp_key.objectid = found_key.offset;
+		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
+		tmp_key.offset = (u64)-1;
+
+		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
+
+		BUG_ON(!wc.replay_dest);
+
+		btrfs_record_root_in_trans(wc.replay_dest);
+		ret = walk_log_tree(trans, log, &wc);
+		BUG_ON(ret);
+
+		if (wc.stage == LOG_WALK_REPLAY_ALL) {
+			ret = fixup_inode_link_counts(trans, wc.replay_dest,
+						      path);
+			BUG_ON(ret);
+		}
+
+		key.offset = found_key.offset - 1; /* continue with the next lower offset */
+		free_extent_buffer(log->node);
+		kfree(log);
+
+		if (found_key.offset == 0)
+			break;
+	}
+	btrfs_release_path(log_root_tree, path);
+
+	/* step one is to pin it all, step two is to replay just inodes */
+	if (wc.pin) {
+		wc.pin = 0;
+		wc.process_func = replay_one_buffer;
+		wc.stage = LOG_WALK_REPLAY_INODES;
+		goto again;
+	}
+	/* step three is to replay everything */
+	if (wc.stage < LOG_WALK_REPLAY_ALL) {
+		wc.stage++;
+		goto again;
+	}
+
+	btrfs_free_path(path);
+
+	free_extent_buffer(log_root_tree->node);
+	log_root_tree->log_root = NULL;
+	fs_info->log_root_recovering = 0;
+
+	/* step 4: commit the transaction, which also unpins the blocks */
+	btrfs_commit_transaction(trans, fs_info->tree_root);
+
+	kfree(log_root_tree);
+	return 0;
+}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
new file mode 100644
index 00000000000..b9409b32ed0
--- /dev/null
+++ b/fs/btrfs/tree-log.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __TREE_LOG_
+#define __TREE_LOG_
+
+int btrfs_sync_log(struct btrfs_trans_handle *trans,
+		   struct btrfs_root *root);
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct dentry *dentry);
+int btrfs_recover_log_trees(struct btrfs_root *tree_root);
+int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct dentry *dentry);
+int btrfs_log_inode(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, struct inode *inode,
+		    int inode_only);
+int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 const char *name, int name_len,
+				 struct inode *dir, u64 index);
+int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       const char *name, int name_len,
+			       struct inode *inode, u64 dirid);
+#endif
-- 
cgit v1.2.3


From 91c0827de247303c6dfc40a8ec9d8305e1d07986 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:43:06 -0400
Subject: Btrfs: Rev the disk format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6532b60683e..f720b582fd5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,7 +40,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B7RfS_M"
+#define BTRFS_MAGIC "_B8RfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 50ec891cc54fb865c4c40c45845e60123e0e66ca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Sep 2008 16:43:20 -0400
Subject: Btrfs: cleanup d_obtain_alias usage

d_obtain_alias is intended as a tailcall that can pass in errors encoded
in the inode pointer if needed, so use it that way instead of
duplicating the error handling.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/export.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 292b0b24c30..2b357a6d240 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -68,7 +68,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 {
 	struct btrfs_root *root;
 	struct inode *inode;
-	struct dentry *result;
 	struct btrfs_key key;
 
 	key.objectid = root_objectid;
@@ -92,11 +91,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		return ERR_PTR(-ESTALE);
 	}
 
-	result = d_obtain_alias(inode);
-	if (!result)
-		return ERR_PTR(-ENOMEM);
-
-	return result;
+	return d_obtain_alias(inode);
 }
 
 static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -148,8 +143,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
 static struct dentry *btrfs_get_parent(struct dentry *child)
 {
 	struct inode *dir = child->d_inode;
-	struct inode *inode;
-	struct dentry *parent;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -203,13 +196,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
-
-	parent = d_obtain_alias(inode);
-	if (!parent)
-		parent = ERR_PTR(-ENOMEM);
-
-	return parent;
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 }
 
 const struct export_operations btrfs_export_ops = {
-- 
cgit v1.2.3


From ad3d81ba8f2880dfa3ee518c779a7ea09a024520 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Sep 2008 16:43:28 -0400
Subject: Btrfs: missing endianness conversion in insert_new_root

Add two missing endianness conversions in this function, found by sparse.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 579124043d9..6cd4e929bd4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1656,8 +1656,8 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 
 	c = btrfs_alloc_free_block(trans, root, root->nodesize,
 				   root->root_key.objectid,
-				   root_gen, lower_key.objectid, level,
-				   root->node->start, 0);
+				   root_gen, le64_to_cpu(lower_key.objectid),
+				   level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -2352,8 +2352,9 @@ again:
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					 root->root_key.objectid,
-					 root_gen, disk_key.objectid, 0,
-					 l->start, 0);
+					 root_gen,
+					 le64_to_cpu(disk_key.objectid),
+					 0, l->start, 0);
 	if (IS_ERR(right)) {
 		BUG_ON(1);
 		return PTR_ERR(right);
-- 
cgit v1.2.3


From a237d2a2bd67eb2b57b87c1e1d3d802d078d4ba7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Sep 2008 16:43:29 -0400
Subject: remove unused function btrfs_ilookup

btrfs_ilookup is unused, which is good because a normal filesystem
should never have to use ilookup anyway.  Remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h |  2 --
 fs/btrfs/inode.c | 13 -------------
 2 files changed, 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f720b582fd5..754c738ec2e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1763,8 +1763,6 @@ void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    u64 root_objectid);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 			 struct btrfs_root *root, int *is_new);
 int btrfs_commit_write(struct file *file, struct page *page,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65df9d83023..b4dbb5c9764 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1818,19 +1818,6 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root);
 }
 
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    u64 root_objectid)
-{
-	struct btrfs_iget_args args;
-	args.ino = objectid;
-	args.root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid);
-
-	if (!args.root)
-		return NULL;
-
-	return ilookup5(s, objectid, btrfs_find_actor, (void *)&args);
-}
-
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
-- 
cgit v1.2.3


From b214107eda845f9a5851ae198f5b972e0dc30c45 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Sep 2008 16:43:31 -0400
Subject: Btrfs: trivial sparse fixes

Fix a bunch of trivial sparse complaints.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 2 --
 fs/btrfs/extent_io.c | 6 ++----
 fs/btrfs/ioctl.c     | 2 +-
 fs/btrfs/sysfs.c     | 8 ++++----
 4 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6cd4e929bd4..ab07557bac0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2749,7 +2749,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	BUG_ON(slot < 0);
 
 	if (slot != nritems) {
-		int i;
 		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
 
 		if (old_data < data_end) {
@@ -2913,7 +2912,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems = btrfs_header_nritems(leaf);
 
 	if (slot + nr != nritems) {
-		int i;
 		int data_end = leaf_data_end(root, leaf);
 
 		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7ca89c45d40..239e7c908ab 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3400,10 +3400,8 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 	}
 	/* at this point we can safely release the extent buffer */
 	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		struct page *page = extent_buffer_page(eb, i);
-		page_cache_release(page);
-	}
+	for (i = 0; i < num_pages; i++)
+		page_cache_release(extent_buffer_page(eb, i));
 	rb_erase(&eb->rb_node, &tree->buffer);
 	__free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 59b64c738fd..f84b5f6991c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -739,7 +739,7 @@ long btrfs_ioctl_trans_end(struct file *file)
 		goto out;
 	}
 	btrfs_end_transaction(trans, root);
-	file->private_data = 0;
+	file->private_data = NULL;
 
 	mutex_lock(&root->fs_info->trans_mutex);
 	root->fs_info->open_ioctl_trans--;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 1cf4f4f8d2c..61af5d8446e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -254,7 +254,7 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
 	wait_for_completion(&fs->kobj_unregister);
 }
 
-int btrfs_init_sysfs()
+int btrfs_init_sysfs(void)
 {
 	btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
 	if (!btrfs_kset)
@@ -262,7 +262,7 @@ int btrfs_init_sysfs()
 	return 0;
 }
 
-void btrfs_exit_sysfs()
+void btrfs_exit_sysfs(void)
 {
 	kset_unregister(btrfs_kset);
 }
@@ -289,12 +289,12 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
 	return;
 }
 
-int btrfs_init_sysfs()
+int btrfs_init_sysfs(void)
 {
 	return 0;
 }
 
-void btrfs_exit_sysfs()
+void btrfs_exit_sysfs(void)
 {
 	return;
 }
-- 
cgit v1.2.3


From 6527cdbe68a27a2ee745f36d001aa32d0f46f370 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 5 Sep 2008 16:43:53 -0400
Subject: Btrfs: Update find free objectid function for orphan cleanup code

Orphan items use BTRFS_ORPHAN_OBJECTID (-5ULL) as key objectid. This
affects the find free objectid functions: the inode objectid can easily
overflow after orphan file cleanup.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  3 ++-
 fs/btrfs/inode-map.c | 15 ++++++++-------
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 754c738ec2e..2ed6918f32e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -82,9 +82,10 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
 
 /*
- * All files have objectids higher than this.
+ * All files have objectids in this range.
  */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_LAST_FREE_OBJECTID -256ULL
 #define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
 
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 298346ae148..cd6171c2da4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -32,7 +32,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	search_key.objectid = (u64)-1;
+	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
+	search_key.type = -1;
 	search_key.offset = (u64)-1;
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
@@ -70,16 +71,17 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	u64 search_start = dirid;
 
 	mutex_lock(&root->objectid_mutex);
-	if (root->last_inode_alloc) {
+	if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
+	    root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
 		*objectid = ++root->last_inode_alloc;
 		mutex_unlock(&root->objectid_mutex);
 		return 0;
 	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	search_start = root->last_inode_alloc;
 	search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID);
 	search_key.objectid = search_start;
+	search_key.type = 0;
 	search_key.offset = 0;
 
 	btrfs_init_path(path);
@@ -88,9 +90,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto error;
 
-	if (path->slots[0] > 0)
-		path->slots[0]--;
-
 	while (1) {
 		l = path->nodes[0];
 		slot = path->slots[0];
@@ -120,13 +119,15 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 				}
 			}
 		}
+		if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
+			break;
 		start_found = 1;
 		last_ino = key.objectid + 1;
 		path->slots[0]++;
 	}
 	// FIXME -ENOSPC
+	BUG_ON(1);
 found:
-	root->last_inode_alloc = *objectid;
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
-- 
cgit v1.2.3


From 325cd4bafeb6cfb44addd6e807a9b74170d1be31 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 5 Sep 2008 16:43:54 -0400
Subject: Btrfs: properly set blocksize when adding new device.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 37a8ea23e81..1546fa6f4f7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1079,6 +1079,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto out_close_bdev;
 
+	set_blocksize(device->bdev, 4096);
+
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
-- 
cgit v1.2.3


From 4bef084857ab8fe71cf49eae349c25e440a49150 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Sep 2008 11:18:08 -0400
Subject: Btrfs: Tree logging fixes

* Pin down data blocks to prevent them from being reallocated like so:

trans 1: allocate file extent
trans 2: free file extent
trans 3: free file extent during old snapshot deletion
trans 3: allocate file extent to new file
trans 3: fsync new file

Before the tree logging code, this was legal because the fsync
would commit the transaction that did the final data extent free
and the transaction that allocated the extent to the new file
at the same time.

With the tree logging code, the tree log subtransaction can commit
before the transaction that freed the extent.  If we crash,
we're left with two different files using the extent.

* Don't wait in start_transaction if log replay is going on.  This
avoids deadlocks from iput while we're cleaning up link counts in the
replay code.

* Don't deadlock in replay_one_name by trying to read an inode off
the disk while holding paths for the directory

* Hold the buffer lock while we mark a buffer as written.  This
closes a race where someone is changing a buffer while we write it.
They are supposed to mark it dirty again after they change it, but
this violates the cow rules.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 33 ++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/extent-tree.c | 20 +++++++++++++++++---
 fs/btrfs/extent_io.c   | 47 ++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/extent_io.h   |  1 +
 fs/btrfs/transaction.c | 16 +++++++++++++---
 fs/btrfs/tree-log.c    | 13 +++++++++----
 7 files changed, 107 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a4373db5967..42bf9916805 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -307,9 +307,7 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
-	spin_lock(&root->fs_info->hash_lock);
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-	spin_unlock(&root->fs_info->hash_lock);
+
 	csum_tree_block(root, eb, 0);
 err:
 	free_extent_buffer(eb);
@@ -1998,7 +1996,36 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return ret;
 }
 
+int btree_lock_page_hook(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_buffer *eb;
+	unsigned long len;
+	u64 bytenr = page_offset(page);
+	/* Hook: mark the page's extent buffer written, then lock the page. */
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	/* Otherwise page->private >> 2 is used as the extent buffer length. */
+	len = page->private >> 2;
+	eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+	if (!eb)
+		goto out;
+	/* Hold the tree lock while flagging the buffer as written. */
+	btrfs_tree_lock(eb);
+	spin_lock(&root->fs_info->hash_lock);
+	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+	spin_unlock(&root->fs_info->hash_lock);
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+out:
+	lock_page(page);
+	return 0;
+}
+
 static struct extent_io_ops btree_extent_io_ops = {
+	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.writepage_io_hook = btree_writepage_io_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 6b6fdc697f3..f84f5058dbb 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -80,4 +80,5 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
+int btree_lock_page_hook(struct page *page);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 646b9148ca2..3181759da1c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1590,13 +1590,17 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 }
 
 static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
-			  int pending)
+			  int is_data, int pending)
 {
 	int err = 0;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (!pending) {
 		struct extent_buffer *buf;
+
+		if (is_data)
+			goto pinit;
+
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
 			/* we can reuse a block if it hasn't been written
@@ -1624,6 +1628,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 			}
 			free_extent_buffer(buf);
 		}
+pinit:
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
@@ -1744,7 +1749,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 
 		if (pin) {
-			ret = pin_down_bytes(root, bytenr, num_bytes, 0);
+			ret = pin_down_bytes(root, bytenr, num_bytes,
+			     owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1862,9 +1868,17 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		ref_generation = 0;
 
 	if (root == extent_root) {
-		pin_down_bytes(root, bytenr, num_bytes, 1);
+		pin_down_bytes(root, bytenr, num_bytes, 0, 1);
 		return 0;
 	}
+	/* if metadata always pin */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		pin = 1;
+
+	/* if data pin when any transaction has committed this */
+	if (ref_generation != trans->transid)
+		pin = 1;
+
 	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
 			    ref_generation, owner_objectid, owner_offset,
 			    pin, pin == 0);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 239e7c908ab..319a0c7a4a5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -29,7 +29,10 @@ static struct kmem_cache *extent_buffer_cache;
 
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
+
+#ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+#endif
 
 #define BUFFER_LRU_MAX 64
 
@@ -106,7 +109,9 @@ EXPORT_SYMBOL(extent_io_tree_init);
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
+#ifdef LEAK_DEBUG
 	unsigned long flags;
+#endif
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state)
@@ -114,10 +119,11 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
+#ifdef LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&state->leak_list, &states);
 	spin_unlock_irqrestore(&leak_lock, flags);
-
+#endif
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	return state;
@@ -129,11 +135,15 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
+#ifdef LEAK_DEBUG
 		unsigned long flags;
+#endif
 		WARN_ON(state->tree);
+#ifdef LEAK_DEBUG
 		spin_lock_irqsave(&leak_lock, flags);
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -2070,13 +2080,13 @@ done:
 }
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-/* Taken directly from 2.6.23 for 2.6.18 back port */
+/* Taken directly from 2.6.23 with a mod for a lockpage hook */
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                 void *data);
+#endif
 
 /**
- * write_cache_pages - walk the list of dirty pages of the given address space
- * and write all of them.
+ * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
  * @writepage: function called for each page
@@ -2090,9 +2100,10 @@ typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-static int write_cache_pages(struct address_space *mapping,
-		      struct writeback_control *wbc, writepage_t writepage,
-		      void *data)
+int extent_write_cache_pages(struct extent_io_tree *tree,
+			     struct address_space *mapping,
+			     struct writeback_control *wbc,
+			     writepage_t writepage, void *data)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
@@ -2138,7 +2149,10 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			lock_page(page);
+			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
+				tree->ops->write_cache_pages_lock_hook(page);
+			else
+				lock_page(page);
 
 			if (unlikely(page->mapping != mapping)) {
 				unlock_page(page);
@@ -2187,9 +2201,12 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+
+	if (wbc->range_cont)
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
-#endif
+EXPORT_SYMBOL(extent_write_cache_pages);
 
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
@@ -2214,7 +2231,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
+	extent_write_cache_pages(tree, mapping, &wbc_writepages,
+				 __extent_writepage, &epd);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0);
 	}
@@ -2235,7 +2253,8 @@ int extent_writepages(struct extent_io_tree *tree,
 		.get_extent = get_extent,
 	};
 
-	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
+	ret = extent_write_cache_pages(tree, mapping, wbc,
+				       __extent_writepage, &epd);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0);
 	}
@@ -2567,15 +2586,19 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
+#ifdef LEAK_DEBUG
 	unsigned long flags;
+#endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
 	mutex_init(&eb->mutex);
+#ifdef LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 	atomic_set(&eb->refs, 1);
 
 	return eb;
@@ -2583,10 +2606,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+#ifdef LEAK_DEBUG
 	unsigned long flags;
 	spin_lock_irqsave(&leak_lock, flags);
 	list_del(&eb->leak_list);
 	spin_unlock_irqrestore(&leak_lock, flags);
+#endif
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 315cfceae31..3cb411a5f4d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -50,6 +50,7 @@ struct extent_io_ops {
 			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
+	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
 struct extent_io_tree {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 49c4f5b40ed..61a377bcb2f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -161,7 +161,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	int ret;
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	if ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)
+	if (!root->fs_info->log_root_recovering &&
+	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
 		wait_current_trans(root);
 	ret = join_transaction(root);
 	BUG_ON(ret);
@@ -328,9 +329,17 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_lock_page(btree_inode->i_mapping, index);
+			page = find_get_page(btree_inode->i_mapping, index);
 			if (!page)
 				continue;
+
+			btree_lock_page_hook(page);
+			if (!page->mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				continue;
+			}
+
 			if (PageWriteback(page)) {
 				if (PageDirty(page))
 					wait_on_page_writeback(page);
@@ -360,7 +369,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 			if (!page)
 				continue;
 			if (PageDirty(page)) {
-				lock_page(page);
+				btree_lock_page_hook(page);
+				wait_on_page_writeback(page);
 				err = write_one_page(page, 0);
 				if (err)
 					werr = err;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d1ce8314b94..13d7ee8e0c5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1176,8 +1176,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	struct btrfs_key found_key;
 	struct btrfs_key log_key;
 	struct inode *dir;
-	struct inode *inode;
 	u8 log_type;
+	int exists;
 	int ret;
 
 	dir = read_one_inode(root, key->objectid);
@@ -1190,6 +1190,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 		   name_len);
 
 	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
+	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
+	if (exists == 0)
+		exists = 1;
+	else
+		exists = 0;
+	btrfs_release_path(root, path);
+
 	if (key->type == BTRFS_DIR_ITEM_KEY) {
 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
 				       name, name_len, 1);
@@ -1224,11 +1231,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	 * don't drop the conflicting directory entry if the inode
 	 * for the new entry doesn't exist
 	 */
-	inode = read_one_inode(root, log_key.objectid);
-	if (!inode)
+	if (!exists)
 		goto out;
 
-	iput(inode);
 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
 	BUG_ON(ret);
 
-- 
cgit v1.2.3


From 8d5bf1cb35ea29795862ff0ea2f4c4d7e22727f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:51:21 -0400
Subject: Btrfs: Update the highest objectid in a root after log replay is done

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c    | 3 ++-
 fs/btrfs/tree-log.c | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b4dbb5c9764..073fdd77c78 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2239,8 +2239,9 @@ static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
 
 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
 		ret = btrfs_set_inode_index_count(dir);
-		if (ret)
+		if (ret) {
 			return ret;
+		}
 	}
 
 	*index = BTRFS_I(dir)->index_cnt;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 13d7ee8e0c5..3f4b139b27e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1027,6 +1027,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		inode->i_nlink = nlink;
 		btrfs_update_inode(trans, root, inode);
 	}
+	BTRFS_I(inode)->index_cnt = (u64)-1;
 
 	return 0;
 }
@@ -2714,6 +2715,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 	struct btrfs_key tmp_key;
 	struct btrfs_root *log;
 	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
+	u64 highest_inode;
 	struct walk_control wc = {
 		.process_func = process_one_buffer,
 		.stage = 0,
@@ -2772,6 +2774,11 @@ again:
 						      path);
 			BUG_ON(ret);
 		}
+		ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
+		if (ret == 0) {
+			wc.replay_dest->highest_inode = highest_inode;
+			wc.replay_dest->last_inode_alloc = highest_inode;
+		}
 
 		key.offset = found_key.offset - 1;
 		free_extent_buffer(log->node);
-- 
cgit v1.2.3


From 98509cfc5a6857bddcfe4b19a9539726655ec9bd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:51:43 -0400
Subject: Btrfs: Fix releasepage to properly keep dirty and writeback pages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 ++++-
 fs/btrfs/inode.c   | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 42bf9916805..ecb74b72026 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -346,7 +346,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
 	found_start = btrfs_header_bytenr(eb);
-	if (found_start != start) {
+	if (0 && found_start != start) {
 		printk("bad tree block start %llu %llu\n",
 		       (unsigned long long)found_start,
 		       (unsigned long long)eb->start);
@@ -592,6 +592,9 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	struct extent_map_tree *map;
 	int ret;
 
+	if (PageWriteback(page) || PageDirty(page))
+	    return 0;
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 073fdd77c78..24b7e97fccb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3060,6 +3060,8 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 
 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
+	if (PageWriteback(page) || PageDirty(page))
+		return 0;
 	return __btrfs_releasepage(page, gfp_flags);
 }
 
-- 
cgit v1.2.3


From 49eb7e46d47ea72a9bd2a5f8cedb04f5159cc277 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:53:12 -0400
Subject: Btrfs: Dir fsync optimizations

Drop i_mutex during the commit

Don't bother doing the fsync at all unless the dir is marked as dirtied
and needing fsync in this transaction.  For directories, this means
that someone has unlinked a file from the dir without fsyncing the
file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  3 +++
 fs/btrfs/file.c        | 19 ++++++++++++++++++-
 fs/btrfs/inode.c       |  5 ++++-
 fs/btrfs/tree-log.c    |  8 ++++----
 4 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fcc8cf27e90..0577fda2168 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -56,6 +56,9 @@ struct btrfs_inode {
 	 * transid that last logged this inode
 	 */
 	u64 logged_trans;
+
+	/* trans that last made a change that should be fully fsync'd */
+	u64 log_dirty_trans;
 	u64 delalloc_bytes;
 	u64 disk_i_size;
 	u32 flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 84ecf3ab851..58b329ddb42 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1061,7 +1061,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	root->fs_info->tree_log_batch++;
 	filemap_fdatawait(inode->i_mapping);
+	root->fs_info->tree_log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
@@ -1076,14 +1078,29 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0)
+	if (ret < 0) {
 		goto out;
+	}
+
+	/* we've logged all the items and now have a consistent
+	 * version of the file in the log.  It is possible that
+	 * someone will come in and modify the file, but that's
+	 * fine because the log is consistent on disk, and we
+	 * have references to all of the file's extents
+	 *
+	 * It is possible that someone will come in and log the
+	 * file again, but that will end up using the synchronization
+	 * inside btrfs_sync_log to keep things safe.
+	 */
+	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
 	} else {
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
+	mutex_lock(&file->f_dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 24b7e97fccb..12c1c0530f3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1187,7 +1187,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir->i_ino);
-	BUG_ON(ret);
+	BUG_ON(ret != 0 && ret != -ENOENT);
+	if (ret != -ENOENT)
+		BTRFS_I(dir)->log_dirty_trans = trans->transid;
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
@@ -1790,6 +1792,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
+	bi->log_dirty_trans = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
 			     inode->i_mapping, GFP_NOFS);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3f4b139b27e..5d49a701bdc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1973,10 +1973,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
 	while(1) {
+		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-		batch = log->fs_info->tree_log_batch;
 
 		while(atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
@@ -2189,8 +2189,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 	end_log_trans(root);
 
-	if (ret == 0 || ret == -ENOENT)
-		return 0;
 	return ret;
 }
 
@@ -2620,9 +2618,11 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		else
 			break;
 	}
-	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
+	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
+		BTRFS_I(inode)->log_dirty_trans = 0;
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
-- 
cgit v1.2.3


From 3a5f1d458ad1610a06e38f0be2fbc6ac215439c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:53:37 -0400
Subject: Btrfs: Optimize btree walking while logging inodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c  |  2 +-
 fs/btrfs/tree-log.c | 25 +++++++++++++++++++------
 2 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ecb74b72026..7c06eb4ecfd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1497,7 +1497,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * low idle thresh
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
-	fs_info->endio_write_workers.idle_thresh = 4;
+	fs_info->endio_write_workers.idle_thresh = 64;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5d49a701bdc..f43ee33ec2d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1982,7 +1982,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
-			batch = log->fs_info->tree_log_batch;
 			mutex_unlock(&log->fs_info->tree_log_mutex);
 			if (atomic_read(&log->fs_info->tree_log_writers))
 				schedule();
@@ -2024,8 +2023,7 @@ out:
 
 }
 
-/*
- * free all the extents used by the tree log.  This should be called
+/* * free all the extents used by the tree log.  This should be called
  * at commit time of the full transaction
  */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2107,6 +2105,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 	int ret;
 	int bytes_del = 0;
 
+	if (BTRFS_I(dir)->logged_trans < trans->transid)
+		return 0;
+
 	ret = join_running_log_trans(root);
 	if (ret)
 		return 0;
@@ -2178,6 +2179,9 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
 	u64 index;
 	int ret;
 
+	if (BTRFS_I(inode)->logged_trans < trans->transid)
+		return 0;
+
 	ret = join_running_log_trans(root);
 	if (ret)
 		return 0;
@@ -2484,6 +2488,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_inode_item *inode_item;
 	u32 size;
 	int ret;
+	int nritems;
 
 	log = root->log_root;
 
@@ -2541,12 +2546,11 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 					   path, 0, trans->transid);
 		if (ret != 0)
 			break;
-
+again:
 		if (min_key.objectid != inode->i_ino)
 			break;
 		if (min_key.type > max_key.type)
 			break;
-
 		src = path->nodes[0];
 		size = btrfs_item_size_nr(src, path->slots[0]);
 		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
@@ -2606,9 +2610,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		}
 
 		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
-		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
 
+		nritems = btrfs_header_nritems(path->nodes[0]);
+		path->slots[0]++;
+		if (path->slots[0] < nritems) {
+			btrfs_item_key_to_cpu(path->nodes[0], &min_key,
+					      path->slots[0]);
+			goto again;
+		}
+		btrfs_release_path(root, path);
+
 		if (min_key.offset < (u64)-1)
 			min_key.offset++;
 		else if (min_key.type < (u8)-1)
@@ -2626,6 +2638,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		BUG_ON(ret);
 	}
+	BTRFS_I(inode)->logged_trans = trans->transid;
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
-- 
cgit v1.2.3


From d00aff00139b40f2e9c60299d76aac29d72e48ba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 15:54:42 -0400
Subject: Btrfs: Optimize tree log block allocations

Since tree log blocks get freed every transaction, they never really
need to be written to disk.  This skips the step where we update
metadata to record they were allocated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     |  5 ++---
 fs/btrfs/extent-tree.c | 19 ++++++++++++++-----
 fs/btrfs/tree-log.c    | 20 +++++++-------------
 3 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7c06eb4ecfd..5edb7f88579 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -829,9 +829,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_level(eb) != 0);
 	WARN_ON(btrfs_header_nritems(eb) != 0);
 
-	ret = btrfs_free_extent(trans, fs_info->tree_root,
-				eb->start, eb->len,
-				BTRFS_TREE_LOG_OBJECTID, 0, 0, 0, 1);
+	ret = btrfs_free_reserved_extent(fs_info->tree_root,
+				eb->start, eb->len);
 	BUG_ON(ret);
 
 	free_extent_buffer(eb);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3181759da1c..c479d71e286 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1872,8 +1872,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 	/* if metadata always pin */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+			/* btrfs_free_reserved_extent */
+			set_extent_dirty(&root->fs_info->free_space_cache,
+				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+			return 0;
+		}
 		pin = 1;
+	}
 
 	/* if data pin when any transaction has committed this */
 	if (ref_generation != trans->transid)
@@ -2361,11 +2368,13 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
 	BUG_ON(ret);
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
-	BUG_ON(ret);
+	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+		ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+						    ref_generation, owner,
+						    owner_offset, ins);
+		BUG_ON(ret);
 
+	}
 	maybe_unlock_mutex(root);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f43ee33ec2d..5f77bee0f84 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1728,9 +1728,8 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_extent(trans, root, bytenr,
-							blocksize, root_owner,
-							root_gen, 0, 0, 1);
+				ret = btrfs_free_reserved_extent(root,
+							 bytenr, blocksize);
 				BUG_ON(ret);
 			}
 			free_extent_buffer(next);
@@ -1775,8 +1774,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-		ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					  root_owner, root_gen, 0, 0, 1);
+		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
 		BUG_ON(ret);
 	}
 	free_extent_buffer(path->nodes[*level]);
@@ -1837,10 +1835,9 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 				}
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_extent(trans, root,
+				ret = btrfs_free_reserved_extent(root,
 						path->nodes[*level]->start,
-						path->nodes[*level]->len,
-						root_owner, root_gen, 0, 0, 1);
+						path->nodes[*level]->len);
 				BUG_ON(ret);
 			}
 			free_extent_buffer(path->nodes[*level]);
@@ -1910,11 +1907,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			}
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
-			ret = btrfs_free_extent(trans, log,
-						next->start, next->len,
-						log->root_key.objectid,
-						btrfs_header_generation(next),
-						0, 0, 1);
+			ret = btrfs_free_reserved_extent(log, next->start,
+							 next->len);
 			BUG_ON(ret);
 		}
 	}
-- 
cgit v1.2.3


From 31ff1cd25d376e8f499d450de177dffadc9e1c56 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 16:17:57 -0400
Subject: Btrfs: Copy into the log tree in big batches

This changes the log tree copy code to use btrfs_insert_items and
to work in larger batches where possible.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 183 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 122 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5f77bee0f84..ae96451bc22 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2452,6 +2452,94 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static noinline int copy_items(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log,
+			       struct btrfs_path *dst_path,
+			       struct extent_buffer *src,
+			       int start_slot, int nr, int inode_only)
+{
+	unsigned long src_offset;
+	unsigned long dst_offset;
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_inode_item *inode_item;
+	int ret;
+	struct btrfs_key *ins_keys;
+	u32 *ins_sizes;
+	char *ins_data;
+	int i;
+
+	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
+			   nr * sizeof(u32), GFP_NOFS);
+	ins_sizes = (u32 *)ins_data;
+	ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
+
+	for (i = 0; i < nr; i++) {
+		ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
+		btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+	}
+	ret = btrfs_insert_empty_items(trans, log, dst_path,
+				       ins_keys, ins_sizes, nr);
+	BUG_ON(ret);
+
+	for (i = 0; i < nr; i++) {
+		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
+						   dst_path->slots[0]);
+
+		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+
+		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+				   src_offset, ins_sizes[i]);
+
+		if (inode_only == LOG_INODE_EXISTS &&
+		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+			inode_item = btrfs_item_ptr(dst_path->nodes[0],
+						    dst_path->slots[0],
+						    struct btrfs_inode_item);
+			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
+
+			/* set the generation to zero so the recover code
+			 * can tell the difference between an logging
+			 * just to say 'this inode exists' and a logging
+			 * to say 'update this inode with these values'
+			 */
+			btrfs_set_inode_generation(dst_path->nodes[0],
+						   inode_item, 0);
+		}
+		/* take a reference on file data extents so that truncates
+		 * or deletes of this inode don't have to relog the inode
+		 * again
+		 */
+		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+			int found_type;
+			extent = btrfs_item_ptr(src, start_slot + i,
+						struct btrfs_file_extent_item);
+
+			found_type = btrfs_file_extent_type(src, extent);
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
+				u64 ds = btrfs_file_extent_disk_bytenr(src,
+								   extent);
+				u64 dl = btrfs_file_extent_disk_num_bytes(src,
+								      extent);
+				/* ds == 0 is a hole */
+				if (ds != 0) {
+					ret = btrfs_inc_extent_ref(trans, log,
+						   ds, dl,
+						   BTRFS_TREE_LOG_OBJECTID,
+						   0, ins_keys[i].objectid,
+						   ins_keys[i].offset);
+					BUG_ON(ret);
+				}
+			}
+		}
+		dst_path->slots[0]++;
+	}
+
+	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
+	btrfs_release_path(log, dst_path);
+	kfree(ins_data);
+	return 0;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2475,14 +2563,12 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key min_key;
 	struct btrfs_key max_key;
 	struct btrfs_root *log = root->log_root;
-	unsigned long src_offset;
-	unsigned long dst_offset;
-	struct extent_buffer *src;
-	struct btrfs_file_extent_item *extent;
-	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *src = NULL;
 	u32 size;
 	int ret;
 	int nritems;
+	int ins_start_slot = 0;
+	int ins_nr;
 
 	log = root->log_root;
 
@@ -2536,75 +2622,35 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	path->keep_locks = 1;
 
 	while(1) {
+		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
 					   path, 0, trans->transid);
 		if (ret != 0)
 			break;
 again:
+		/* note, ins_nr might be > 0 here, cleanup outside the loop */
 		if (min_key.objectid != inode->i_ino)
 			break;
 		if (min_key.type > max_key.type)
 			break;
+
 		src = path->nodes[0];
 		size = btrfs_item_size_nr(src, path->slots[0]);
-		ret = btrfs_insert_empty_item(trans, log, dst_path, &min_key,
-					      size);
-		if (ret)
-			BUG();
-
-		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
-						   dst_path->slots[0]);
-
-		src_offset = btrfs_item_ptr_offset(src, path->slots[0]);
-
-		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-				   src_offset, size);
-
-		if (inode_only == LOG_INODE_EXISTS &&
-		    min_key.type == BTRFS_INODE_ITEM_KEY) {
-			inode_item = btrfs_item_ptr(dst_path->nodes[0],
-						    dst_path->slots[0],
-						    struct btrfs_inode_item);
-			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-			/* set the generation to zero so the recover code
-			 * can tell the difference between an logging
-			 * just to say 'this inode exists' and a logging
-			 * to say 'update this inode with these values'
-			 */
-			btrfs_set_inode_generation(dst_path->nodes[0],
-						   inode_item, 0);
-		}
-		/* take a reference on file data extents so that truncates
-		 * or deletes of this inode don't have to relog the inode
-		 * again
-		 */
-		if (btrfs_key_type(&min_key) == BTRFS_EXTENT_DATA_KEY) {
-			int found_type;
-			extent = btrfs_item_ptr(src, path->slots[0],
-						struct btrfs_file_extent_item);
-
-			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
-				u64 ds = btrfs_file_extent_disk_bytenr(src,
-								   extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-								      extent);
-				/* ds == 0 is a hole */
-				if (ds != 0) {
-					ret = btrfs_inc_extent_ref(trans, log,
-						   ds, dl,
-						   log->root_key.objectid,
-						   0,
-						   inode->i_ino,
-						   min_key.offset);
-					BUG_ON(ret);
-				}
-			}
+		if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+			ins_nr++;
+			goto next_slot;
+		} else if (!ins_nr) {
+			ins_start_slot = path->slots[0];
+			ins_nr = 1;
+			goto next_slot;
 		}
 
-		btrfs_mark_buffer_dirty(dst_path->nodes[0]);
-		btrfs_release_path(log, dst_path);
+		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+				 ins_nr, inode_only);
+		BUG_ON(ret);
+		ins_nr = 1;
+		ins_start_slot = path->slots[0];
+next_slot:
 
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		path->slots[0]++;
@@ -2613,6 +2659,13 @@ again:
 					      path->slots[0]);
 			goto again;
 		}
+		if (ins_nr) {
+			ret = copy_items(trans, log, dst_path, src,
+					 ins_start_slot,
+					 ins_nr, inode_only);
+			BUG_ON(ret);
+			ins_nr = 0;
+		}
 		btrfs_release_path(root, path);
 
 		if (min_key.offset < (u64)-1)
@@ -2624,6 +2677,14 @@ again:
 		else
 			break;
 	}
+	if (ins_nr) {
+		ret = copy_items(trans, log, dst_path, src,
+				 ins_start_slot,
+				 ins_nr, inode_only);
+		BUG_ON(ret);
+		ins_nr = 0;
+	}
+	WARN_ON(ins_nr);
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
 	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
 		btrfs_release_path(root, path);
-- 
cgit v1.2.3


From d0c803c4049c5ca322d4795d8b74f28768603e0e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 16:17:57 -0400
Subject: Btrfs: Record dirty pages tree-log pages in an extent_io tree

This is the same way the transaction code makes sure that all the
other tree blocks are safely on disk.  There's an extent_io tree
for each root, and any blocks allocated to the tree logs are
recorded in that tree.

At tree-log sync, the extent_io tree is walked to flush down the
dirty pages and wait for them.

The main benefit is less time spent walking the tree log and skipping
clean pages, and getting sequential IO down to the drive.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/disk-io.c     | 17 +++++++++++++++--
 fs/btrfs/extent-tree.c |  7 ++++++-
 fs/btrfs/transaction.c | 21 ++++++++++++++-------
 fs/btrfs/transaction.h |  2 ++
 fs/btrfs/tree-log.c    | 29 +++++++++++++++--------------
 6 files changed, 54 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2ed6918f32e..eb65fd80888 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -668,6 +668,8 @@ struct btrfs_root {
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
 	struct inode *inode;
+	struct extent_io_tree dirty_log_pages;
+
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5edb7f88579..57fbf107e59 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -777,6 +777,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	extent_io_tree_init(&root->dirty_log_pages,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
 	root->ref_tree = &root->ref_tree_struct;
@@ -819,11 +821,23 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info)
 {
 	struct extent_buffer *eb;
+	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+	u64 start = 0;
+	u64 end = 0;
 	int ret;
 
-	if (!fs_info->log_root_tree)
+	if (!log_root_tree)
 		return 0;
 
+	while(1) {
+		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log_root_tree->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
 	eb = fs_info->log_root_tree->node;
 
 	WARN_ON(btrfs_header_level(eb) != 0);
@@ -1412,7 +1426,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
 	insert_inode_hash(fs_info->btree_inode);
-	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c479d71e286..c0bb6b9ac4c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2392,8 +2392,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
 	btrfs_set_buffer_uptodate(buf);
-	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		set_extent_dirty(&root->dirty_log_pages, buf->start,
+			 buf->start + buf->len - 1, GFP_NOFS);
+	} else {
+		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
+	}
 	trans->blocks_used++;
 	return buf;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 61a377bcb2f..151b00d5259 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -302,23 +302,18 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 }
 
 
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root)
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages)
 {
 	int ret;
 	int err = 0;
 	int werr = 0;
-	struct extent_io_tree *dirty_pages;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	u64 start = 0;
 	u64 end;
 	unsigned long index;
 
-	if (!trans || !trans->transaction) {
-		return filemap_write_and_wait(btree_inode->i_mapping);
-	}
-	dirty_pages = &trans->transaction->dirty_pages;
 	while(1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
@@ -385,6 +380,18 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root)
+{
+	if (!trans || !trans->transaction) {
+		struct inode *btree_inode;
+		btree_inode = root->fs_info->btree_inode;
+		return filemap_write_and_wait(btree_inode->i_mapping);
+	}
+	return btrfs_write_and_wait_marked_extents(root,
+					   &trans->transaction->dirty_pages);
+}
+
 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index cc63650d60d..eef2cb7d7e7 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -99,4 +99,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ae96451bc22..bfa71080096 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1954,10 +1954,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	unsigned long batch;
 	struct btrfs_root *log = root->log_root;
-	struct walk_control wc = {
-		.write = 1,
-		.process_func = process_one_buffer
-	};
 
 	mutex_lock(&log->fs_info->tree_log_mutex);
 	if (atomic_read(&log->fs_info->tree_log_commit)) {
@@ -1985,18 +1981,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		if (batch == log->fs_info->tree_log_batch)
 			break;
 	}
-	ret = walk_log_tree(trans, log, &wc);
-	BUG_ON(ret);
-
-	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
-	BUG_ON(ret);
-
-	wc.wait = 1;
 
-	ret = walk_log_tree(trans, log, &wc);
+	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
-
-	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
+	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
+			       &root->fs_info->log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
@@ -2025,6 +2014,8 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	int ret;
 	struct btrfs_root *log;
 	struct key;
+	u64 start;
+	u64 end;
 	struct walk_control wc = {
 		.free = 1,
 		.process_func = process_one_buffer
@@ -2037,6 +2028,16 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
+	while(1) {
+		ret = find_first_extent_bit(&log->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
+
 	log = root->log_root;
 	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
 			     &log->root_key);
-- 
cgit v1.2.3


From 9623f9a3894ec95fe140ee5460bb840ac53bc6a0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Sep 2008 17:42:42 -0400
Subject: Btrfs: Disable the dir fsync optimization to skip logging the dir
 sometimes

More testing has turned up a bug, disable this for now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index bfa71080096..56de3fb2d8d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2686,8 +2686,7 @@ next_slot:
 		ins_nr = 0;
 	}
 	WARN_ON(ins_nr);
-	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
-	    BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
+	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
 		btrfs_release_path(root, path);
 		btrfs_release_path(log, dst_path);
 		BTRFS_I(inode)->log_dirty_trans = 0;
-- 
cgit v1.2.3


From 23a07867b78ee0f33b01466e52bb608c336b26ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 12 Sep 2008 08:57:47 -0400
Subject: Btrfs: Fix mismerge in block header checks

I had incorrectly disabled the check for the block number being correct
in the header block.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 57fbf107e59..f6f7821d43a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -346,7 +346,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
 	found_start = btrfs_header_bytenr(eb);
-	if (0 && found_start != start) {
+	if (found_start != start) {
 		printk("bad tree block start %llu %llu\n",
 		       (unsigned long long)found_start,
 		       (unsigned long long)eb->start);
-- 
cgit v1.2.3


From f25956cc582f5954033cca3f9488c0a5d5193541 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 12 Sep 2008 15:32:53 -0400
Subject: Fix leaf overflow check in btrfs_insert_empty_items

It was incorrectly adding an extra sizeof(struct btrfs_item) and causing
false positives (oops)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ab07557bac0..18e84472abb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2737,8 +2737,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
 
-	if (btrfs_leaf_free_space(root, leaf) <
-	    sizeof(struct btrfs_item) + total_size) {
+	if (btrfs_leaf_free_space(root, leaf) < total_size) {
 		btrfs_print_leaf(root, leaf);
 		printk("not enough freespace need %u have %d\n",
 		       total_size, btrfs_leaf_free_space(root, leaf));
-- 
cgit v1.2.3


From ef8bbdfe7e12dc9b4e80756f6d606c4639c65851 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 23 Sep 2008 13:14:11 -0400
Subject: Btrfs: fix cache_block_group error handling

cache_block_group had a few bugs in its error handling code.
This makes sure paths get properly released and the correct return value
goes out.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c0bb6b9ac4c..1c10ffc837c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -66,7 +66,7 @@ static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
 	struct btrfs_path *path;
-	int ret;
+	int ret = 0;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	struct extent_io_tree *free_space_cache;
@@ -102,10 +102,10 @@ static int cache_block_group(struct btrfs_root *root,
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
-		return ret;
+		goto err;
 	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
-		return ret;
+		goto err;
 	if (ret == 0) {
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
@@ -161,9 +161,10 @@ next:
 				 last + hole_size - 1, GFP_NOFS);
 	}
 	block_group->cached = 1;
+	ret = 0;
 err:
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
-- 
cgit v1.2.3


From 0f9dd46cda36b8de3b9f48bc42bd09d20b9c3b52 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 23 Sep 2008 13:14:11 -0400
Subject: Btrfs: free space accounting redo

1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size.  The
reason to do this is that most allocations come with a hint byte for where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing.  If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible.  When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.

2) remove the extent_io_tree that tracked the block group cache from fs_info and
replace it with an rb-tree that tracks the block group cache via offset.  Also
add a per space_info list that tracks the block group cache for the particular
space so we can look up related block groups easily.

3) cleaned up the allocation code to make it a little easier to read and a
little less complicated.  Basically there are 3 steps, first look from our
provided hint.  If we couldn't find from that given hint, start back at our
original search start and look for space from there.  If that fails try to
allocate space if we can and start looking again.  If not we're screwed and need
to start over again.

4) small fixes.  There were some issues in volumes.c where we wouldn't allocate
the rest of the disk.  Fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space.  Generally with data allocations we don't track
where we last allocated from, so every time we did a data allocation we'd search
through every block group that we have looking for free space.  While searching
a block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups.  The alloc_hint has fixed
this slight degradation and made things semi-normal.

There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space.  This only happens with metadata
allocations, and only when we are almost full.  So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall.  I'm
still tracking it down, but until then this seems to be pretty stable and makes a
significant performance gain.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile           |   2 +-
 fs/btrfs/ctree.c            |   3 +-
 fs/btrfs/ctree.h            |  46 ++-
 fs/btrfs/disk-io.c          |   7 +-
 fs/btrfs/extent-tree.c      | 869 +++++++++++++++++++++++---------------------
 fs/btrfs/extent_io.c        |   4 +
 fs/btrfs/free-space-cache.c | 415 +++++++++++++++++++++
 fs/btrfs/inode.c            |   3 +-
 fs/btrfs/volumes.c          |  11 +-
 9 files changed, 925 insertions(+), 435 deletions(-)
 create mode 100644 fs/btrfs/free-space-cache.c

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b7addbfd8c2..eb36ae981bd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 18e84472abb..6f467901246 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2725,9 +2725,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0) {
+	if (ret == 0)
 		return -EEXIST;
-	}
 	if (ret < 0)
 		goto out;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eb65fd80888..730aae3bc18 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -483,7 +483,6 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
 
-
 struct btrfs_block_group_item {
 	__le64 used;
 	__le64 chunk_objectid;
@@ -498,17 +497,40 @@ struct btrfs_space_info {
 	int full;
 	int force_alloc;
 	struct list_head list;
+
+	/* for block groups in our same type */
+	struct list_head block_groups;
+	spinlock_t lock;
+};
+
+struct btrfs_free_space {
+	struct rb_node bytes_index;
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	struct btrfs_space_info *space_info;
 	spinlock_t lock;
 	u64 pinned;
 	u64 flags;
 	int cached;
 	int ro;
+	int dirty;
+
+	struct btrfs_space_info *space_info;
+
+	/* free space cache stuff */
+	struct rb_root free_space_bytes;
+	struct rb_root free_space_offset;
+
+	/* block group cache stuff */
+	struct rb_node cache_node;
+
+	/* for block groups in the same raid type */
+	struct list_head list;
 };
 
 struct btrfs_device;
@@ -525,8 +547,10 @@ struct btrfs_fs_info {
 	struct btrfs_root *log_root_tree;
 	struct radix_tree_root fs_roots_radix;
 
-	struct extent_io_tree free_space_cache;
-	struct extent_io_tree block_group_cache;
+	/* block group cache stuff */
+	spinlock_t block_group_cache_lock;
+	struct rb_root block_group_cache_tree;
+
 	struct extent_io_tree pinned_extents;
 	struct extent_io_tree pending_del;
 	struct extent_io_tree extent_ins;
@@ -1814,4 +1838,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
 int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
+
+/* free-space-cache.c */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+				   *block_group);
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+					       *block_group, u64 offset,
+					       u64 bytes);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f6f7821d43a..535bd0fe1a7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1410,10 +1410,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
-	extent_io_tree_init(&fs_info->free_space_cache,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->block_group_cache,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	spin_lock_init(&fs_info->block_group_cache_lock);
+	fs_info->block_group_cache_tree.rb_node = NULL;
+
 	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&fs_info->pending_del,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1c10ffc837c..813566acc5d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,12 +29,6 @@
 #include "locking.h"
 #include "ref-cache.h"
 
-#define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
-#define BLOCK_GROUP_METADATA EXTENT_UPTODATE
-#define BLOCK_GROUP_SYSTEM   EXTENT_NEW
-
-#define BLOCK_GROUP_DIRTY EXTENT_DIRTY
-
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -62,6 +56,127 @@ void maybe_unlock_mutex(struct btrfs_root *root)
 	}
 }
 
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+	return (cache->flags & bits) == bits;
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+				struct btrfs_block_group_cache *block_group)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct btrfs_block_group_cache *cache;
+
+	spin_lock(&info->block_group_cache_lock);
+	p = &info->block_group_cache_tree.rb_node;
+
+	while (*p) {
+		parent = *p;
+		cache = rb_entry(parent, struct btrfs_block_group_cache,
+				 cache_node);
+		if (block_group->key.objectid < cache->key.objectid) {
+			p = &(*p)->rb_left;
+		} else if (block_group->key.objectid > cache->key.objectid) {
+			p = &(*p)->rb_right;
+		} else {
+			spin_unlock(&info->block_group_cache_lock);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&block_group->cache_node, parent, p);
+	rb_insert_color(&block_group->cache_node,
+			&info->block_group_cache_tree);
+	spin_unlock(&info->block_group_cache_lock);
+
+	return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+			      int contains)
+{
+	struct btrfs_block_group_cache *cache, *ret = NULL;
+	struct rb_node *n;
+	u64 end, start;
+
+	spin_lock(&info->block_group_cache_lock);
+	n = info->block_group_cache_tree.rb_node;
+
+	while (n) {
+		cache = rb_entry(n, struct btrfs_block_group_cache,
+				 cache_node);
+		end = cache->key.objectid + cache->key.offset - 1;
+		start = cache->key.objectid;
+
+		if (bytenr < start) {
+			if (!contains && (!ret || start < ret->key.objectid))
+				ret = cache;
+			n = n->rb_left;
+		} else if (bytenr > start) {
+			if (contains && bytenr <= end) {
+				ret = cache;
+				break;
+			}
+			n = n->rb_right;
+		} else {
+			ret = cache;
+			break;
+		}
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	return ret;
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_fs_info *info, u64 start, u64 end)
+{
+	u64 extent_start, extent_end, size;
+	int ret;
+
+	while (start < end) {
+		ret = find_first_extent_bit(&info->pinned_extents, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		if (extent_start == start) {
+			start = extent_end + 1;
+		} else if (extent_start > start && extent_start < end) {
+			size = extent_start - start;
+			ret = btrfs_add_free_space(block_group, start, size);
+			BUG_ON(ret);
+			start = extent_end + 1;
+		} else {
+			break;
+		}
+	}
+
+	if (start < end) {
+		size = end - start;
+		ret = btrfs_add_free_space(block_group, start, size);
+		BUG_ON(ret);
+	}
+
+	return 0;
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -69,10 +184,8 @@ static int cache_block_group(struct btrfs_root *root,
 	int ret = 0;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct extent_io_tree *free_space_cache;
 	int slot;
 	u64 last = 0;
-	u64 hole_size;
 	u64 first_free;
 	int found = 0;
 
@@ -80,7 +193,6 @@ static int cache_block_group(struct btrfs_root *root,
 		return 0;
 
 	root = root->fs_info->extent_root;
-	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (block_group->cached)
 		return 0;
@@ -96,7 +208,8 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	first_free = block_group->key.objectid;
+	first_free = max_t(u64, block_group->key.objectid,
+			   BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -119,32 +232,28 @@ static int cache_block_group(struct btrfs_root *root,
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto err;
-			if (ret == 0) {
+			if (ret == 0)
 				continue;
-			} else {
+			else
 				break;
-			}
 		}
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (key.objectid < block_group->key.objectid) {
+		if (key.objectid < block_group->key.objectid)
 			goto next;
-		}
+
 		if (key.objectid >= block_group->key.objectid +
-		    block_group->key.offset) {
+		    block_group->key.offset)
 			break;
-		}
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
 			if (!found) {
 				last = first_free;
 				found = 1;
 			}
-			if (key.objectid > last) {
-				hole_size = key.objectid - last;
-				set_extent_dirty(free_space_cache, last,
-						 last + hole_size - 1,
-						 GFP_NOFS);
-			}
+
+			add_new_free_space(block_group, root->fs_info, last,
+					   key.objectid);
+
 			last = key.objectid + key.offset;
 		}
 next:
@@ -153,13 +262,11 @@ next:
 
 	if (!found)
 		last = first_free;
-	if (block_group->key.objectid +
-	    block_group->key.offset > last) {
-		hole_size = block_group->key.objectid +
-			block_group->key.offset - last;
-		set_extent_dirty(free_space_cache, last,
-				 last + hole_size - 1, GFP_NOFS);
-	}
+
+	add_new_free_space(block_group, root->fs_info, last,
+			   block_group->key.objectid +
+			   block_group->key.offset);
+
 	block_group->cached = 1;
 	ret = 0;
 err:
@@ -167,166 +274,79 @@ err:
 	return ret;
 }
 
+/*
+ * return the block group that starts at or after bytenr
+ */
 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 						       btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *block_group = NULL;
-	u64 ptr;
-	u64 start;
-	u64 end;
-	int ret;
+	struct btrfs_block_group_cache *cache;
 
-	bytenr = max_t(u64, bytenr,
-		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	block_group_cache = &info->block_group_cache;
-	ret = find_first_extent_bit(block_group_cache,
-				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret) {
-		return NULL;
-	}
-	ret = get_state_private(block_group_cache, start, &ptr);
-	if (ret)
-		return NULL;
+	cache = block_group_cache_tree_search(info, bytenr, 0);
 
-	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
-	return block_group;
+	return cache;
 }
 
+/*
+ * return the block group that contains teh given bytenr
+ */
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *block_group = NULL;
-	u64 ptr;
-	u64 start;
-	u64 end;
-	int ret;
+	struct btrfs_block_group_cache *cache;
 
-	bytenr = max_t(u64, bytenr,
-		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	block_group_cache = &info->block_group_cache;
-	ret = find_first_extent_bit(block_group_cache,
-				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret) {
-		return NULL;
-	}
-	ret = get_state_private(block_group_cache, start, &ptr);
-	if (ret)
-		return NULL;
+	cache = block_group_cache_tree_search(info, bytenr, 1);
 
-	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
-	if (block_group->key.objectid <= bytenr && bytenr <
-	    block_group->key.objectid + block_group->key.offset)
-		return block_group;
-	return NULL;
+	return cache;
 }
 
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
-{
-	return (cache->flags & bits) == bits;
-}
-
-static int noinline find_search_start(struct btrfs_root *root,
-			      struct btrfs_block_group_cache **cache_ret,
-			      u64 *start_ret, u64 num, int data)
+static int noinline find_free_space(struct btrfs_root *root,
+				    struct btrfs_block_group_cache **cache_ret,
+				    u64 *start_ret, u64 num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
-	struct extent_io_tree *free_space_cache;
-	struct extent_state *state;
+	struct btrfs_free_space *info = NULL;
 	u64 last;
-	u64 start = 0;
-	u64 cache_miss = 0;
 	u64 total_fs_bytes;
 	u64 search_start = *start_ret;
-	int wrapped = 0;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (!cache)
 		goto out;
 
+	last = max(search_start, cache->key.objectid);
+
 again:
 	ret = cache_block_group(root, cache);
-	if (ret) {
+	if (ret)
 		goto out;
-	}
 
-	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data) || cache->ro)
+	if (cache->ro || !block_group_bits(cache, data))
 		goto new_group;
 
-	spin_lock_irq(&free_space_cache->lock);
-	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
-	while(1) {
-		if (!state) {
-			if (!cache_miss)
-				cache_miss = last;
-			spin_unlock_irq(&free_space_cache->lock);
-			goto new_group;
-		}
-
-		start = max(last, state->start);
-		last = state->end + 1;
-		if (last - start < num) {
-			do {
-				state = extent_state_next(state);
-			} while(state && !(state->state & EXTENT_DIRTY));
-			continue;
-		}
-		spin_unlock_irq(&free_space_cache->lock);
-		if (cache->ro) {
-			goto new_group;
-		}
-		if (start + num > cache->key.objectid + cache->key.offset)
-			goto new_group;
-		if (!block_group_bits(cache, data)) {
-			printk("block group bits don't match %Lu %d\n", cache->flags, data);
-		}
-		*start_ret = start;
+	info = btrfs_find_free_space(cache, last, num);
+	if (info) {
+		*start_ret = info->offset;
 		return 0;
 	}
-out:
-	cache = btrfs_lookup_block_group(root->fs_info, search_start);
-	if (!cache) {
-		printk("Unable to find block group for %Lu\n", search_start);
-		WARN_ON(1);
-	}
-	return -ENOSPC;
 
 new_group:
 	last = cache->key.objectid + cache->key.offset;
-wrapped:
+
 	cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	if (!cache || cache->key.objectid >= total_fs_bytes) {
-no_cache:
-		if (!wrapped) {
-			wrapped = 1;
-			last = search_start;
-			goto wrapped;
-		}
+	if (!cache || cache->key.objectid >= total_fs_bytes)
 		goto out;
-	}
-	if (cache_miss && !cache->cached) {
-		cache_block_group(root, cache);
-		last = cache_miss;
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	}
-	cache_miss = 0;
-	cache = btrfs_find_block_group(root, cache, last, data, 0);
-	if (!cache)
-		goto no_cache;
+
 	*cache_ret = cache;
 	goto again;
+
+out:
+	return -ENOSPC;
 }
 
 static u64 div_factor(u64 num, int factor)
@@ -338,16 +358,19 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-static int block_group_state_bits(u64 flags)
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+						  u64 flags)
 {
-	int bits = 0;
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		bits |= BLOCK_GROUP_DATA;
-	if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		bits |= BLOCK_GROUP_METADATA;
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		bits |= BLOCK_GROUP_SYSTEM;
-	return bits;
+	struct list_head *head = &info->space_info;
+	struct list_head *cur;
+	struct btrfs_space_info *found;
+	list_for_each(cur, head) {
+		found = list_entry(cur, struct btrfs_space_info, list);
+		if (found->flags == flags)
+			return found;
+	}
+	return NULL;
+
 }
 
 static struct btrfs_block_group_cache *
@@ -356,28 +379,19 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			 u64 search_start, int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *sinfo;
 	u64 used;
 	u64 last = 0;
-	u64 start;
-	u64 end;
 	u64 free_check;
-	u64 ptr;
-	int bit;
-	int ret;
 	int full_search = 0;
 	int factor = 10;
 	int wrapped = 0;
 
-	block_group_cache = &info->block_group_cache;
-
 	if (data & BTRFS_BLOCK_GROUP_METADATA)
 		factor = 9;
 
-	bit = block_group_state_bits(data);
-
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_first_block_group(info, search_start);
@@ -408,20 +422,30 @@ __btrfs_find_block_group(struct btrfs_root *root,
 		else
 			last = search_start;
 	}
+	sinfo = __find_space_info(root->fs_info, data);
+	if (!sinfo)
+		goto found;
 again:
 	while(1) {
-		ret = find_first_extent_bit(block_group_cache, last,
-					    &start, &end, bit);
-		if (ret)
-			break;
+		struct list_head *l;
 
-		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret) {
-			last = end + 1;
-			continue;
+		cache = NULL;
+
+		spin_lock(&sinfo->lock);
+		list_for_each(l, &sinfo->block_groups) {
+			struct btrfs_block_group_cache *entry;
+			entry = list_entry(l, struct btrfs_block_group_cache,
+					   list);
+			if ((entry->key.objectid >= last) &&
+			    (!cache || (entry->key.objectid <
+					cache->key.objectid)))
+				cache = entry;
 		}
+		spin_unlock(&sinfo->lock);
+
+		if (!cache)
+			break;
 
-		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		spin_lock(&cache->lock);
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
@@ -462,6 +486,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
 	return ret;
 }
+
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 			   u64 owner, u64 owner_offset)
 {
@@ -1175,34 +1200,37 @@ fail:
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *cache;
-	int ret;
+	struct btrfs_block_group_cache *cache, *entry;
+	struct rb_node *n;
 	int err = 0;
 	int werr = 0;
 	struct btrfs_path *path;
 	u64 last = 0;
-	u64 start;
-	u64 end;
-	u64 ptr;
 
-	block_group_cache = &root->fs_info->block_group_cache;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
-		ret = find_first_extent_bit(block_group_cache, last,
-					    &start, &end, BLOCK_GROUP_DIRTY);
-		if (ret)
-			break;
+		cache = NULL;
+		spin_lock(&root->fs_info->block_group_cache_lock);
+		for (n = rb_first(&root->fs_info->block_group_cache_tree);
+		     n; n = rb_next(n)) {
+			entry = rb_entry(n, struct btrfs_block_group_cache,
+					 cache_node);
+			if (entry->dirty) {
+				cache = entry;
+				break;
+			}
+		}
+		spin_unlock(&root->fs_info->block_group_cache_lock);
 
-		last = end + 1;
-		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret)
+		if (!cache)
 			break;
-		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
+
+		last += cache->key.offset;
+
 		err = write_one_cache_group(trans, root,
 					    path, cache);
 		/*
@@ -1214,29 +1242,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			werr = err;
 			continue;
 		}
-		clear_extent_bits(block_group_cache, start, end,
-				  BLOCK_GROUP_DIRTY, GFP_NOFS);
+
+		cache->dirty = 0;
 	}
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
-						  u64 flags)
-{
-	struct list_head *head = &info->space_info;
-	struct list_head *cur;
-	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
-		if (found->flags == flags)
-			return found;
-	}
-	return NULL;
-
-}
-
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
@@ -1256,6 +1269,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		return -ENOMEM;
 
 	list_add(&found->list, &info->space_info);
+	INIT_LIST_HEAD(&found->block_groups);
+	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
@@ -1318,7 +1333,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 thresh;
 	u64 start;
 	u64 num_bytes;
-	int ret;
+	int ret = 0;
 
 	flags = reduce_alloc_profile(extent_root, flags);
 
@@ -1355,10 +1370,11 @@ printk("space info full %Lu\n", flags);
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
+
 out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
-	return 0;
+	return ret;
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
@@ -1371,8 +1387,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 total = num_bytes;
 	u64 old_val;
 	u64 byte_in_group;
-	u64 start;
-	u64 end;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
@@ -1382,12 +1396,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		}
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
-		start = cache->key.objectid;
-		end = start + cache->key.offset - 1;
-		set_extent_bits(&info->block_group_cache, start, end,
-				BLOCK_GROUP_DIRTY, GFP_NOFS);
 
 		spin_lock(&cache->lock);
+		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
@@ -1401,9 +1412,11 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			if (mark_free) {
-				set_extent_dirty(&info->free_space_cache,
-						 bytenr, bytenr + num_bytes - 1,
-						 GFP_NOFS);
+				int ret;
+				ret = btrfs_add_free_space(cache, bytenr,
+							   num_bytes);
+				if (ret)
+					return -1;
 			}
 		}
 		total -= num_bytes;
@@ -1414,16 +1427,13 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 {
-	u64 start;
-	u64 end;
-	int ret;
-	ret = find_first_extent_bit(&root->fs_info->block_group_cache,
-				    search_start, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret)
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+	if (!cache)
 		return 0;
-	return start;
+
+	return cache->key.objectid;
 }
 
 
@@ -1501,8 +1511,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
-	struct extent_io_tree *free_space_cache;
-	free_space_cache = &root->fs_info->free_space_cache;
+	struct btrfs_block_group_cache *cache;
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
@@ -1512,7 +1521,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			break;
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
-		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
+		cache = btrfs_lookup_block_group(root->fs_info, start);
+		if (cache->cached)
+			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
 			mutex_unlock(&root->fs_info->alloc_mutex);
 			cond_resched();
@@ -1875,9 +1886,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+			struct btrfs_block_group_cache *cache;
+
 			/* btrfs_free_reserved_extent */
-			set_extent_dirty(&root->fs_info->free_space_cache,
-				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+			BUG_ON(!cache);
+			btrfs_add_free_space(cache, bytenr, num_bytes);
 			return 0;
 		}
 		pin = 1;
@@ -1942,8 +1956,6 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group;
-	int full_scan = 0;
-	int wrapped = 0;
 	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
@@ -1959,9 +1971,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		empty_cluster = 256 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
 		last_ptr = &root->fs_info->last_data_alloc;
-	}
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		last_ptr = &root->fs_info->last_log_alloc;
 		if (!last_ptr == 0 && root->fs_info->last_alloc) {
@@ -1972,9 +1984,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if (last_ptr) {
 		if (*last_ptr)
 			hint_byte = *last_ptr;
-		else {
+		else
 			empty_size += empty_cluster;
-		}
 	}
 
 	search_start = max(search_start, first_logical_byte(root, 0));
@@ -1983,145 +1994,172 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
-	if (hint_byte) {
-		block_group = btrfs_lookup_first_block_group(info, hint_byte);
-		if (!block_group)
-			hint_byte = search_start;
-		block_group = btrfs_find_block_group(root, block_group,
-						     hint_byte, data, 1);
-		if (last_ptr && *last_ptr == 0 && block_group)
-			hint_byte = block_group->key.objectid;
-	} else {
-		block_group = btrfs_find_block_group(root,
-						     trans->block_group,
-						     search_start, data, 1);
-	}
 	search_start = max(search_start, hint_byte);
-
 	total_needed += empty_size;
 
-check_failed:
-	if (!block_group) {
-		block_group = btrfs_lookup_first_block_group(info,
-							     search_start);
-		if (!block_group)
-			block_group = btrfs_lookup_first_block_group(info,
-						       orig_search_start);
-	}
-	if (full_scan && !chunk_alloc_done) {
-		if (allowed_chunk_alloc) {
-			do_chunk_alloc(trans, root,
-				     num_bytes + 2 * 1024 * 1024, data, 1);
-			allowed_chunk_alloc = 0;
-		} else if (block_group && block_group_bits(block_group, data)) {
-			block_group->space_info->force_alloc = 1;
+new_group:
+	block_group = btrfs_lookup_block_group(info, search_start);
+
+	/*
+	 * Ok this looks a little tricky, but it's really simple.  First if we
+	 * didn't find a block group obviously we want to start over.
+	 * Secondly, if the block group we found does not match the type we
+	 * need, and we have a last_ptr and it's not 0, chances are the last
+	 * allocation we made was at the end of the block group, so let's go
+	 * ahead and skip looking through the rest of the block groups and
+	 * start at the beginning.  This helps with metadata allocations,
+	 * since you are likely to have a bunch of data block groups to search
+	 * through first before you realize that you need to start over, so go
+	 * ahead and start over and save the time.
+	 */
+	if (!block_group || (!block_group_bits(block_group, data) &&
+			     last_ptr && *last_ptr)) {
+		if (search_start != orig_search_start) {
+			if (last_ptr && *last_ptr)
+				*last_ptr = 0;
+			search_start = orig_search_start;
+			goto new_group;
+		} else if (!chunk_alloc_done && allowed_chunk_alloc) {
+			ret = do_chunk_alloc(trans, root,
+					     num_bytes + 2 * 1024 * 1024,
+					     data, 1);
+			if (ret < 0) {
+				struct btrfs_space_info *info;
+
+				info = __find_space_info(root->fs_info, data);
+				goto error;
+			}
+			BUG_ON(ret);
+			chunk_alloc_done = 1;
+			search_start = orig_search_start;
+			goto new_group;
+		} else {
+			ret = -ENOSPC;
+			goto error;
 		}
-		chunk_alloc_done = 1;
-	}
-	ret = find_search_start(root, &block_group, &search_start,
-				total_needed, data);
-	if (ret == -ENOSPC && last_ptr && *last_ptr) {
-		*last_ptr = 0;
-		block_group = btrfs_lookup_first_block_group(info,
-							     orig_search_start);
-		search_start = orig_search_start;
-		ret = find_search_start(root, &block_group, &search_start,
-					total_needed, data);
 	}
-	if (ret == -ENOSPC)
-		goto enospc;
-	if (ret)
-		goto error;
 
-	if (last_ptr && *last_ptr && search_start != *last_ptr) {
-		*last_ptr = 0;
-		if (!empty_size) {
-			empty_size += empty_cluster;
-			total_needed += empty_size;
+	/*
+	 * this is going to search through all of the existing block groups it
+	 * can find, so if we don't find something we need to see if we can
+	 * allocate what we need.
+	 */
+	ret = find_free_space(root, &block_group, &search_start,
+			      total_needed, data);
+	if (ret == -ENOSPC) {
+		/*
+		 * instead of allocating, start at the original search start
+		 * and see if there is something to be found, if not then we
+		 * allocate
+		 */
+		if (search_start != orig_search_start) {
+			if (last_ptr && *last_ptr) {
+				*last_ptr = 0;
+				total_needed += empty_cluster;
+			}
+			search_start = orig_search_start;
+			goto new_group;
 		}
-		block_group = btrfs_lookup_first_block_group(info,
-						       orig_search_start);
-		search_start = orig_search_start;
-		ret = find_search_start(root, &block_group,
-					&search_start, total_needed, data);
-		if (ret == -ENOSPC)
-			goto enospc;
-		if (ret)
+
+		/*
+		 * we've already allocated, we're pretty screwed
+		 */
+		if (chunk_alloc_done) {
 			goto error;
+		} else if (!allowed_chunk_alloc && block_group &&
+			   block_group_bits(block_group, data)) {
+			block_group->space_info->force_alloc = 1;
+			goto error;
+		} else if (!allowed_chunk_alloc) {
+			goto error;
+		}
+
+		ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
+				     data, 1);
+		if (ret < 0)
+			goto error;
+
+		BUG_ON(ret);
+		chunk_alloc_done = 1;
+		if (block_group)
+			search_start = block_group->key.objectid +
+				block_group->key.offset;
+		else
+			search_start = orig_search_start;
+		goto new_group;
 	}
 
+	if (ret)
+		goto error;
+
 	search_start = stripe_align(root, search_start);
 	ins->objectid = search_start;
 	ins->offset = num_bytes;
 
-	if (ins->objectid + num_bytes >= search_end)
-		goto enospc;
+	if (ins->objectid + num_bytes >= search_end) {
+		search_start = orig_search_start;
+		if (chunk_alloc_done) {
+			ret = -ENOSPC;
+			goto error;
+		}
+		goto new_group;
+	}
 
 	if (ins->objectid + num_bytes >
 	    block_group->key.objectid + block_group->key.offset) {
+		if (search_start == orig_search_start && chunk_alloc_done) {
+			ret = -ENOSPC;
+			goto error;
+		}
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
 		goto new_group;
 	}
 
-	if (test_range_bit(&info->extent_ins, ins->objectid,
-			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
-		search_start = ins->objectid + num_bytes;
-		goto new_group;
-	}
-
-	if (test_range_bit(&info->pinned_extents, ins->objectid,
-			   ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
-		search_start = ins->objectid + num_bytes;
-		goto new_group;
-	}
-
 	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
 	}
 
-	if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
-		block_group = btrfs_lookup_block_group(info, ins->objectid);
-		if (block_group)
-			trans->block_group = block_group;
-	}
+	if (!(data & BTRFS_BLOCK_GROUP_DATA))
+		trans->block_group = block_group;
+
 	ins->offset = num_bytes;
 	if (last_ptr) {
 		*last_ptr = ins->objectid + ins->offset;
 		if (*last_ptr ==
-		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
+		    btrfs_super_total_bytes(&root->fs_info->super_copy))
 			*last_ptr = 0;
-		}
-	}
-	return 0;
-
-new_group:
-	if (search_start + num_bytes >= search_end) {
-enospc:
-		search_start = orig_search_start;
-		if (full_scan) {
-			ret = -ENOSPC;
-			goto error;
-		}
-		if (wrapped) {
-			if (!full_scan)
-				total_needed -= empty_size;
-			full_scan = 1;
-		} else
-			wrapped = 1;
 	}
-	block_group = btrfs_lookup_first_block_group(info, search_start);
-	cond_resched();
-	block_group = btrfs_find_block_group(root, block_group,
-					     search_start, data, 0);
-	goto check_failed;
 
+	ret = 0;
 error:
 	return ret;
 }
 
+/* Log the usage of @info and of each block group on its list. */
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+{
+	struct btrfs_block_group_cache *bg;
+
+	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
+	       info->total_bytes - info->bytes_used - info->bytes_pinned,
+	       (info->full) ? "" : "not ");
+
+	/* per-group lines are printed under info->lock and the group lock */
+	spin_lock(&info->lock);
+	list_for_each_entry(bg, &info->block_groups, list) {
+		spin_lock(&bg->lock);
+		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
+		       "%Lu pinned\n",
+		       bg->key.objectid, bg->key.offset,
+		       btrfs_block_group_used(&bg->item), bg->pinned);
+		btrfs_dump_free_space(bg, bytes);
+		spin_unlock(&bg->lock);
+	}
+	spin_unlock(&info->lock);
+}
 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -2133,6 +2171,7 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	u64 search_start = 0;
 	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_block_group_cache *cache;
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -2160,11 +2199,9 @@ again:
 				     BTRFS_BLOCK_GROUP_METADATA |
 				     (info->metadata_alloc_profile &
 				      info->avail_metadata_alloc_bits), 0);
-			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 				     num_bytes + 2 * 1024 * 1024, data, 0);
-		BUG_ON(ret);
 	}
 
 	WARN_ON(num_bytes < root->sectorsize);
@@ -2175,26 +2212,44 @@ again:
 
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
+		num_bytes = num_bytes & ~(root->sectorsize - 1);
 		num_bytes = max(num_bytes, min_alloc_size);
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       num_bytes, data, 1);
 		goto again;
 	}
 	if (ret) {
-		printk("allocation failed flags %Lu\n", data);
+		struct btrfs_space_info *sinfo;
+
+		sinfo = __find_space_info(root->fs_info, data);
+		printk("allocation failed flags %Lu, wanted %Lu\n",
+		       data, num_bytes);
+		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
-	return 0;
+	cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	if (!cache) {
+		printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
+		return -ENOSPC;
+	}
+
+	ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
+
+	return ret;
 }
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
+	struct btrfs_block_group_cache *cache;
+
 	maybe_lock_mutex(root);
-	set_extent_dirty(&root->fs_info->free_space_cache,
-			 start, start + len - 1, GFP_NOFS);
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	if (!cache) {
+		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
+		maybe_unlock_mutex(root);
+		return -ENOSPC;
+	}
+	btrfs_add_free_space(cache, start, len);
 	maybe_unlock_mutex(root);
 	return 0;
 }
@@ -2264,8 +2319,8 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
 				       sizes, 2);
-
 	BUG_ON(ret);
+
 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_extent_item);
 	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
@@ -2336,9 +2391,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
 	cache_block_group(root, block_group);
 
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
+	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
+	BUG_ON(ret);
+
 	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
 					    ref_generation, owner,
 					    owner_offset, ins);
@@ -2843,31 +2898,24 @@ out:
 
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
-	u64 start;
-	u64 end;
-	u64 ptr;
-	int ret;
+	struct btrfs_block_group_cache *block_group;
+	struct rb_node *n;
 
 	mutex_lock(&info->alloc_mutex);
-	while(1) {
-		ret = find_first_extent_bit(&info->block_group_cache, 0,
-					    &start, &end, (unsigned int)-1);
-		if (ret)
-			break;
-		ret = get_state_private(&info->block_group_cache, start, &ptr);
-		if (!ret)
-			kfree((void *)(unsigned long)ptr);
-		clear_extent_bits(&info->block_group_cache, start,
-				  end, (unsigned int)-1, GFP_NOFS);
-	}
-	while(1) {
-		ret = find_first_extent_bit(&info->free_space_cache, 0,
-					    &start, &end, EXTENT_DIRTY);
-		if (ret)
-			break;
-		clear_extent_dirty(&info->free_space_cache, start,
-				   end, GFP_NOFS);
-	}
+	spin_lock(&info->block_group_cache_lock);
+	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+		block_group = rb_entry(n, struct btrfs_block_group_cache,
+				       cache_node);
+
+		btrfs_remove_free_space_cache(block_group);
+		rb_erase(&block_group->cache_node,
+			 &info->block_group_cache_tree);
+		spin_lock(&block_group->space_info->lock);
+		list_del(&block_group->list);
+		spin_unlock(&block_group->space_info->lock);
+		kfree(block_group);
+	}
+	spin_unlock(&info->block_group_cache_lock);
 	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
@@ -3386,7 +3434,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 total_found;
 	u64 shrink_last_byte;
 	struct btrfs_block_group_cache *shrink_block_group;
-	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -3542,15 +3589,17 @@ next:
 		goto out;
 	}
 
-	clear_extent_bits(&info->block_group_cache, key.objectid,
-			  key.objectid + key.offset - 1,
-			  (unsigned int)-1, GFP_NOFS);
-
-
-	clear_extent_bits(&info->free_space_cache,
-			   key.objectid, key.objectid + key.offset - 1,
-			   (unsigned int)-1, GFP_NOFS);
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	rb_erase(&shrink_block_group->cache_node,
+		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
 
+	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
+				      key.offset);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		goto out;
+	}
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
 	kfree(shrink_block_group);
@@ -3566,9 +3615,9 @@ next:
 	/* the code to unpin extents might set a few bits in the free
 	 * space cache for this range again
 	 */
-	clear_extent_bits(&info->free_space_cache,
-			   key.objectid, key.objectid + key.offset - 1,
-			   (unsigned int)-1, GFP_NOFS);
+	/* XXX? */
+	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
+				      key.offset);
 out:
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
@@ -3616,16 +3665,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	int ret;
-	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_space_info *space_info;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 
-	block_group_cache = &info->block_group_cache;
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = 0;
@@ -3653,6 +3699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		}
 
 		spin_lock_init(&cache->lock);
+		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -3661,31 +3708,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
-		bit = 0;
-		if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
-			bit = BLOCK_GROUP_DATA;
-		} else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
-			bit = BLOCK_GROUP_SYSTEM;
-		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
-			bit = BLOCK_GROUP_METADATA;
-		}
-		set_avail_alloc_bits(info, cache->flags);
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
 					&space_info);
 		BUG_ON(ret);
 		cache->space_info = space_info;
+		spin_lock(&space_info->lock);
+		list_add(&cache->list, &space_info->block_groups);
+		spin_unlock(&space_info->lock);
+
+		ret = btrfs_add_block_group_cache(root->fs_info, cache);
+		BUG_ON(ret);
 
-		/* use EXTENT_LOCKED to prevent merging */
-		set_extent_bits(block_group_cache, found_key.objectid,
-				found_key.objectid + found_key.offset - 1,
-				EXTENT_LOCKED, GFP_NOFS);
-		set_state_private(block_group_cache, found_key.objectid,
-				  (unsigned long)cache);
-		set_extent_bits(block_group_cache, found_key.objectid,
-				found_key.objectid + found_key.offset - 1,
-				bit | EXTENT_LOCKED, GFP_NOFS);
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
@@ -3703,22 +3738,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size)
 {
 	int ret;
-	int bit = 0;
 	struct btrfs_root *extent_root;
 	struct btrfs_block_group_cache *cache;
-	struct extent_io_tree *block_group_cache;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
-	block_group_cache = &root->fs_info->block_group_cache;
 
 	root->fs_info->last_trans_new_blockgroup = trans->transid;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
-	BUG_ON(!cache);
+	if (!cache)
+		return -ENOMEM;
+
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
@@ -3729,16 +3764,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
+	spin_lock(&cache->space_info->lock);
+	list_add(&cache->list, &cache->space_info->block_groups);
+	spin_unlock(&cache->space_info->lock);
 
-	bit = block_group_state_bits(type);
-	set_extent_bits(block_group_cache, chunk_offset,
-			chunk_offset + size - 1,
-			EXTENT_LOCKED, GFP_NOFS);
-	set_state_private(block_group_cache, chunk_offset,
-			  (unsigned long)cache);
-	set_extent_bits(block_group_cache, chunk_offset,
-			chunk_offset + size - 1,
-			bit | EXTENT_LOCKED, GFP_NOFS);
+	ret = btrfs_add_block_group_cache(root->fs_info, cache);
+	BUG_ON(ret);
 
 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
 				sizeof(cache->item));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 319a0c7a4a5..8624f3e8803 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2634,6 +2634,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (eb) {
 		atomic_inc(&eb->refs);
 		spin_unlock(&tree->buffer_lock);
+		mark_page_accessed(eb->first_page);
 		return eb;
 	}
 	spin_unlock(&tree->buffer_lock);
@@ -2713,6 +2714,9 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 		atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
 
+	if (eb)
+		mark_page_accessed(eb->first_page);
+
 	return eb;
 }
 EXPORT_SYMBOL(find_extent_buffer);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 00000000000..01c26e8ae55
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+
+/* Link @node into the offset-sorted tree; -EEXIST if @offset is present. */
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+			      struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct btrfs_free_space *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+		if (offset == cur->offset)
+			return -EEXIST;
+		link = (offset < cur->offset) ? &parent->rb_left :
+						&parent->rb_right;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+
+	return 0;
+}
+
+/* Link @node into the size-sorted tree; equal sizes go to the right. */
+static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+			     struct rb_node *node)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct btrfs_free_space *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct btrfs_free_space, bytes_index);
+
+		link = (bytes < cur->bytes) ? &parent->rb_left :
+					      &parent->rb_right;
+	}
+
+	rb_link_node(node, parent, link);
+	rb_insert_color(node, root);
+
+	return 0;
+}
+
+/*
+ * Search the offset-sorted tree along one root-to-leaf path.  If @contains is
+ * set, return the entry whose range covers @offset.  Otherwise return the
+ * lowest entry on that path starting at or after @offset that is at least
+ * @bytes long; entries off the path are never examined.  NULL if none found.
+ */
+static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+						   u64 offset, u64 bytes,
+						   int contains)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_free_space *entry, *ret = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+
+		if (offset < entry->offset) {
+			if (!contains &&
+			    (!ret || entry->offset < ret->offset) &&
+			    (bytes <= entry->bytes))
+				ret = entry;	/* best candidate so far */
+			n = n->rb_left;
+		} else if (offset > entry->offset) {
+			if (contains &&
+			    (entry->offset + entry->bytes - 1) >= offset) {
+				ret = entry;
+				break;
+			}
+			n = n->rb_right;
+		} else {	/* entry starts exactly at @offset */
+			if (bytes > entry->bytes) {
+				n = n->rb_right;
+				continue;
+			}
+			ret = entry;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Return an entry >= @bytes long starting at or after @offset, or NULL.
+ */
+static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
+						  u64 offset, u64 bytes)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_free_space *entry, *ret = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+
+		if (bytes < entry->bytes) {
+			/*
+			 * We prefer to get a hole size as close to the size we
+			 * are asking for so we don't take small slivers out of
+			 * huge holes, but we also want to get as close to the
+			 * offset as possible so we don't have a whole lot of
+			 * fragmentation.
+			 */
+			if (offset <= entry->offset) {
+				if (!ret)
+					ret = entry;
+				else if (entry->bytes < ret->bytes)
+					ret = entry;
+				else if (entry->offset < ret->offset)
+					ret = entry;
+			}
+			n = n->rb_left;
+		} else if (bytes > entry->bytes) {
+			n = n->rb_right;	/* too small, look at bigger ones */
+		} else {
+			/*
+			 * Ok we may have multiple chunks of the wanted size,
+			 * so we don't want to take the first one we find, we
+			 * want to take the one closest to our given offset, so
+			 * keep searching just in case there's a better match.
+			 */
+			n = n->rb_right;
+			if (offset > entry->offset)
+				continue;
+			else if (!ret || entry->offset < ret->offset)
+				ret = entry;
+		}
+	}
+
+	return ret;
+}
+
+/* Detach @info from both per-group indexes; the caller keeps ownership. */
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info)
+{
+	rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+	rb_erase(&info->offset_index, &block_group->free_space_offset);
+}
+
+/*
+ * Index @info by offset and by size.  Returns 0, or -EEXIST when an entry
+ * with the same offset is already linked (nothing is inserted then).
+ */
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+			   struct btrfs_free_space *info)
+{
+	int ret;
+
+	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+				 &info->offset_index);
+	if (ret)
+		return ret;
+
+	return tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+				 &info->bytes_index);
+}
+
+/*
+ * Add [offset, offset + bytes) to the free space of @block_group, merging it
+ * with adjacent free extents.  Returns 0 or -ENOMEM; overlap is a BUG().
+ */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *right_info;
+	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *info = NULL;
+	struct btrfs_free_space *alloc_info;
+	int ret = 0;
+
+	alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+	if (!alloc_info)
+		return -ENOMEM;
+
+	/*
+	 * look for free extents touching either end of the new range; if one
+	 * exists it is unlinked, grown to cover the range and re-linked
+	 */
+	spin_lock(&block_group->lock);
+
+	right_info = tree_search_offset(&block_group->free_space_offset,
+					offset+bytes, 0, 1);
+	left_info = tree_search_offset(&block_group->free_space_offset,
+				       offset-1, 0, 1);
+
+	if (right_info && right_info->offset == offset+bytes) {
+		unlink_free_space(block_group, right_info);
+		info = right_info;
+		info->offset = offset;
+		info->bytes += bytes;
+	} else if (right_info && right_info->offset != offset+bytes) {
+		printk(KERN_ERR "adding space in the middle of an existing "
+		       "free space area. existing: offset=%Lu, bytes=%Lu. "
+		       "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
+		       right_info->bytes, offset, bytes);
+		BUG();
+	}
+
+	if (left_info) {
+		unlink_free_space(block_group, left_info);
+
+		if (unlikely((left_info->offset + left_info->bytes) !=
+			     offset)) {
+			printk(KERN_ERR "free space to the left of new free "
+			       "space isn't quite right. existing: offset=%Lu,"
+			       " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
+			       left_info->offset, left_info->bytes, offset,
+			       bytes);
+			BUG();
+		}
+
+		if (info) {
+			info->offset = left_info->offset;
+			info->bytes += left_info->bytes;
+			kfree(left_info);
+		} else {
+			info = left_info;
+			info->bytes += bytes;
+		}
+	}
+
+	if (info) {
+		ret = link_free_space(block_group, info);
+		goto out;
+	}
+
+	info = alloc_info;
+	alloc_info = NULL;
+	info->offset = offset;
+	info->bytes = bytes;
+
+	ret = link_free_space(block_group, info);
+	if (ret)
+		kfree(info);
+out:
+	spin_unlock(&block_group->lock);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+		if (ret == -EEXIST)
+			BUG();
+	}
+
+	kfree(alloc_info);	/* unused if we merged; NULL is fine */
+
+	return ret;
+}
+
+/*
+ * Take @bytes off the front of the free extent starting at @offset; -EINVAL if
+ * it is too small.  No extent starting at @offset: warn only, still return 0.
+ */
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *fs;
+	int ret = 0;
+
+	spin_lock(&block_group->lock);
+	fs = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	if (!fs || fs->offset != offset) {
+		WARN_ON(1);
+		goto out;
+	}
+
+	if (fs->bytes < bytes) {
+		printk(KERN_ERR "Found free space at %Lu, size %Lu,"
+		       "trying to use %Lu\n",
+		       fs->offset, fs->bytes, bytes);
+		WARN_ON(1);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	unlink_free_space(block_group, fs);
+	if (fs->bytes == bytes) {
+		kfree(fs);
+		goto out;
+	}
+	fs->offset += bytes;
+	fs->bytes -= bytes;
+	ret = link_free_space(block_group, fs);
+	BUG_ON(ret);
+out:
+	spin_unlock(&block_group->lock);
+	return ret;
+}
+
+/* Log how many free extents in @block_group are at least @bytes long. */
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	int count = 0;
+
+	/* takes no lock; dump_space_info() calls it under cache->lock */
+	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (info->bytes >= bytes)
+			count++;
+	}
+	printk(KERN_INFO "%d blocks of free space at or bigger than %Lu "
+	       "bytes\n", count, bytes);
+}
+
+/* Sum the sizes of all free extents in @block_group (takes no lock). */
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+{
+	struct rb_node *node = rb_first(&block_group->free_space_offset);
+	u64 total = 0;
+
+	while (node) {
+		total += rb_entry(node, struct btrfs_free_space,
+				  offset_index)->bytes;
+		node = rb_next(node);
+	}
+
+	return total;
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space *victim;
+	struct rb_node *last;
+
+	spin_lock(&block_group->lock);	/* free every entry, biggest first */
+	while ((last = rb_last(&block_group->free_space_bytes)) != NULL) {
+		victim = rb_entry(last, struct btrfs_free_space, bytes_index);
+		unlink_free_space(block_group, victim);
+		kfree(victim);
+		if (!need_resched())
+			continue;
+		spin_unlock(&block_group->lock); /* reschedule unlocked */
+		cond_resched();
+		spin_lock(&block_group->lock);
+	}
+	spin_unlock(&block_group->lock);
+}
+
+/* Offset-ordered lookup under block_group->lock; released before return. */
+struct btrfs_free_space *
+btrfs_find_free_space_offset(struct btrfs_block_group_cache *block_group,
+			     u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *found;
+
+	spin_lock(&block_group->lock);
+	found = tree_search_offset(&block_group->free_space_offset, offset,
+				   bytes, 0);
+	spin_unlock(&block_group->lock);
+
+	return found;
+}
+
+/* Size-ordered lookup under block_group->lock; released before return. */
+struct btrfs_free_space *
+btrfs_find_free_space_bytes(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *found;
+
+	spin_lock(&block_group->lock);
+	found = tree_search_bytes(&block_group->free_space_bytes, offset,
+				  bytes);
+	spin_unlock(&block_group->lock);
+
+	return found;
+}
+
+/* Try the offset-ordered search first, then fall back to the by-size one. */
+struct btrfs_free_space *
+btrfs_find_free_space(struct btrfs_block_group_cache *block_group, u64 offset,
+		      u64 bytes)
+{
+	struct btrfs_free_space *found;
+
+	spin_lock(&block_group->lock);
+	found = tree_search_offset(&block_group->free_space_offset, offset,
+				   bytes, 0);
+	if (!found)
+		found = tree_search_bytes(&block_group->free_space_bytes,
+					  offset, bytes);
+	spin_unlock(&block_group->lock);
+
+	return found;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 12c1c0530f3..65b4f864b0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -141,7 +141,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, 0,
+					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
@@ -558,7 +558,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 					  trans->transid, inode->i_ino,
 					  ordered_extent->file_offset, &ins);
 	BUG_ON(ret);
-
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
 	ret = btrfs_drop_extents(trans, root, inode,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1546fa6f4f7..b9e5c2d82dd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -64,8 +64,8 @@ static void lock_chunks(struct btrfs_root *root)
 
 static void unlock_chunks(struct btrfs_root *root)
 {
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
@@ -1668,8 +1668,13 @@ again:
 	else
 		min_free = calc_size;
 
-	/* we add 1MB because we never use the first 1MB of the device */
-	min_free += 1024 * 1024;
+	/*
+	 * we add 1MB because we never use the first 1MB of the device, unless
+	 * we've looped, then we are likely allocating the maximum amount of
+	 * space left already
+	 */
+	if (!looped)
+		min_free += 1024 * 1024;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-- 
cgit v1.2.3


From ce3ed71a58094544da1e1dc4723f1f553bb5b2d5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 23 Sep 2008 13:14:12 -0400
Subject: Btrfs: Checksum tree blocks in the background

Tree blocks were using async bio submission, but the sum was still
being done directly during writepage.  This moves the checksumming
into the worker thread.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 535bd0fe1a7..25be96946a2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -315,14 +315,6 @@ out:
 	return 0;
 }
 
-static int btree_writepage_io_hook(struct page *page, u64 start, u64 end)
-{
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-
-	csum_dirty_buffer(root, page);
-	return 0;
-}
-
 int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -501,6 +493,22 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	return 0;
 }
 
+static int btree_csum_one_bio(struct bio *bio)
+{
+	struct bio_vec *bvec = bio->bi_io_vec;
+	int bio_index = 0;
+	struct btrfs_root *root;
+
+	WARN_ON(bio->bi_vcnt <= 0);
+	while(bio_index < bio->bi_vcnt) {
+		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
+		csum_dirty_buffer(root, bvec->bv_page);
+		bio_index++;
+		bvec++;
+	}
+	return 0;
+}
+
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
@@ -515,6 +523,7 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
 	if (rw & (1 << BIO_RW)) {
+		btree_csum_one_bio(bio);
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 1);
 	}
@@ -2040,7 +2049,6 @@ out:
 
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
-	.writepage_io_hook = btree_writepage_io_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
 	/* note we're sharing with inode.c for the merge bio hook */
-- 
cgit v1.2.3


From 1c2308f8e7d8491467e0095af2b01500f1b70819 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 23 Sep 2008 13:14:13 -0400
Subject: Add check for tree-log roots in btrfs_alloc_reserved_extents

Tree log blocks are only reserved, and should not ever get fully
allocated on disk.  This check makes sure they stay out of the
extent tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813566acc5d..558fbe40736 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2365,6 +2365,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_key *ins)
 {
 	int ret;
+
+	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
+		return 0;
 	maybe_lock_mutex(root);
 	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
 					    ref_generation, owner,
-- 
cgit v1.2.3


From 31840ae1a6b433ca0e6a8d341756ff478bbf959e Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Tue, 23 Sep 2008 13:14:14 -0400
Subject: Btrfs: Full back reference support

This patch makes the back reference system explicitly record the
location of the parent node for all types of extents. The location of
the parent node is placed into the offset field of the backref key.
Every time a tree block is balanced, the back references for the
affected lower level extents are updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  228 ++++++----
 fs/btrfs/ctree.h       |   62 ++-
 fs/btrfs/disk-io.c     |    4 +-
 fs/btrfs/extent-tree.c | 1143 +++++++++++++++++++++++++++++-------------------
 fs/btrfs/extent_io.c   |    3 +-
 fs/btrfs/file.c        |  121 +++--
 fs/btrfs/inode.c       |   57 ++-
 fs/btrfs/ioctl.c       |   57 +--
 fs/btrfs/print-tree.c  |    5 +-
 fs/btrfs/tree-log.c    |  108 ++---
 10 files changed, 1066 insertions(+), 722 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6f467901246..50aea8cb653 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -125,7 +125,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int ret = 0;
 	int level;
-	struct btrfs_key first_key;
 	struct btrfs_root *new_root;
 
 	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
@@ -141,18 +140,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	if (nritems) {
-		if (level == 0)
-			btrfs_item_key_to_cpu(buf, &first_key, 0);
-		else
-			btrfs_node_key_to_cpu(buf, &first_key, 0);
-	} else {
-		first_key.objectid = 0;
-	}
-	cow = btrfs_alloc_free_block(trans, new_root, buf->len,
-				       new_root_objectid,
-				       trans->transid, first_key.objectid,
-				       level, buf->start, 0);
+
+	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
+				     new_root_objectid, trans->transid,
+				     level, buf->start, 0);
 	if (IS_ERR(cow)) {
 		kfree(new_root);
 		return PTR_ERR(cow);
@@ -165,7 +156,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf, 0);
+	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
 	kfree(new_root);
 
 	if (ret)
@@ -184,39 +175,31 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     u64 search_start, u64 empty_size,
 			     u64 prealloc_dest)
 {
-	u64 root_gen;
+	u64 parent_start;
 	struct extent_buffer *cow;
 	u32 nritems;
 	int ret = 0;
 	int different_trans = 0;
 	int level;
 	int unlock_orig = 0;
-	struct btrfs_key first_key;
 
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
 	WARN_ON(!btrfs_tree_locked(buf));
 
-	if (root->ref_cows) {
-		root_gen = trans->transid;
-	} else {
-		root_gen = 0;
-	}
+	if (parent)
+		parent_start = parent->start;
+	else
+		parent_start = 0;
+
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	if (nritems) {
-		if (level == 0)
-			btrfs_item_key_to_cpu(buf, &first_key, 0);
-		else
-			btrfs_node_key_to_cpu(buf, &first_key, 0);
-	} else {
-		first_key.objectid = 0;
-	}
+
 	if (prealloc_dest) {
 		struct btrfs_key ins;
 
@@ -224,19 +207,19 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		ins.offset = buf->len;
 		ins.type = BTRFS_EXTENT_ITEM_KEY;
 
-		ret = btrfs_alloc_reserved_extent(trans, root,
+		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
 						  root->root_key.objectid,
-						  root_gen, level,
-						  first_key.objectid,
+						  trans->transid, level, 0,
 						  &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
 					    buf->len);
 	} else {
 		cow = btrfs_alloc_free_block(trans, root, buf->len,
+					     parent_start,
 					     root->root_key.objectid,
-					     root_gen, first_key.objectid,
-					     level, search_start, empty_size);
+					     trans->transid, level,
+					     search_start, empty_size);
 	}
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
@@ -249,17 +232,23 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
+		u32 nr_extents;
 		different_trans = 1;
-		ret = btrfs_inc_ref(trans, root, buf, 1);
+		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
 		if (ret)
 			return ret;
+
+		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
+		WARN_ON(ret);
 	} else {
+		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
+		if (ret)
+			return ret;
 		clean_tree_block(trans, root, buf);
 	}
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
-		root_gen = btrfs_header_generation(buf);
 
 		spin_lock(&root->node_lock);
 		root->node = cow;
@@ -268,13 +257,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 		if (buf != root->commit_root) {
 			btrfs_free_extent(trans, root, buf->start,
-					  buf->len, root->root_key.objectid,
-					  root_gen, 0, 0, 1);
+					  buf->len, buf->start,
+					  root->root_key.objectid,
+					  btrfs_header_generation(buf),
+					  0, 0, 1);
 		}
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
-		root_gen = btrfs_header_generation(parent);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		WARN_ON(trans->transid == 0);
@@ -283,8 +273,8 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_mark_buffer_dirty(parent);
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  btrfs_header_owner(parent), root_gen,
-				  0, 0, 1);
+				  parent_start, btrfs_header_owner(parent),
+				  btrfs_header_generation(parent), 0, 0, 1);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -831,6 +821,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		root->node = child;
 		spin_unlock(&root->node_lock);
 
+		ret = btrfs_update_extent_ref(trans, root, child->start,
+					      mid->start, child->start,
+					      root->root_key.objectid,
+					      trans->transid, level - 1, 0);
+		BUG_ON(ret);
+
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
 		path->locks[level] = 0;
@@ -840,7 +836,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
-					root->root_key.objectid,
+					mid->start, root->root_key.objectid,
 					btrfs_header_generation(mid), 0, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
@@ -905,7 +901,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize,
+						 blocksize, parent->start,
 						 btrfs_header_owner(parent),
 						 generation, 0, 0, 1);
 			if (wret)
@@ -954,6 +950,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret)
 			ret = wret;
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+					 parent->start,
 					 btrfs_header_owner(parent),
 					 root_gen, 0, 0, 1);
 		if (wret)
@@ -1499,6 +1496,41 @@ static int fixup_low_keys(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * update item key.
+ *
+ * This function isn't completely safe. It's the caller's responsibility
+ * to ensure that the new key won't break the key ordering.
+ */
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *new_key)
+{
+	struct btrfs_disk_key disk_key;
+	struct extent_buffer *eb;
+	int slot;
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	if (slot > 0) {
+		btrfs_item_key(eb, &disk_key, slot - 1);
+		if (comp_keys(&disk_key, new_key) >= 0)
+			return -1;
+	}
+	if (slot < btrfs_header_nritems(eb) - 1) {
+		btrfs_item_key(eb, &disk_key, slot + 1);
+		if (comp_keys(&disk_key, new_key) <= 0)
+			return -1;
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, new_key);
+	btrfs_set_item_key(eb, &disk_key, slot);
+	btrfs_mark_buffer_dirty(eb);
+	if (slot == 0)
+		fixup_low_keys(trans, root, path, &disk_key, 1);
+	return 0;
+}
+
 /*
  * try to push data from one node into the next node left in the
  * tree.
@@ -1558,6 +1590,10 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
+
+	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
+	BUG_ON(ret);
+
 	return ret;
 }
 
@@ -1619,6 +1655,10 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
+
+	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
+	BUG_ON(ret);
+
 	return ret;
 }
 
@@ -1633,30 +1673,24 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
-	u64 root_gen;
 	u64 lower_gen;
 	struct extent_buffer *lower;
 	struct extent_buffer *c;
 	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
+	int ret;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
 
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
-
 	lower = path->nodes[level-1];
 	if (level == 1)
 		btrfs_item_key(lower, &lower_key, 0);
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = btrfs_alloc_free_block(trans, root, root->nodesize,
-				   root->root_key.objectid,
-				   root_gen, le64_to_cpu(lower_key.objectid),
+	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
+				   root->root_key.objectid, trans->transid,
 				   level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
@@ -1679,7 +1713,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
 	lower_gen = btrfs_header_generation(lower);
-	WARN_ON(lower_gen == 0);
+	WARN_ON(lower_gen != trans->transid);
 
 	btrfs_set_node_ptr_generation(c, 0, lower_gen);
 
@@ -1690,6 +1724,12 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	root->node = c;
 	spin_unlock(&root->node_lock);
 
+	ret = btrfs_update_extent_ref(trans, root, lower->start,
+				      lower->start, c->start,
+				      root->root_key.objectid,
+				      trans->transid, level - 1, 0);
+	BUG_ON(ret);
+
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(old);
 
@@ -1698,20 +1738,6 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	path->nodes[level] = c;
 	path->locks[level] = 1;
 	path->slots[level] = 0;
-
-	if (root->ref_cows && lower_gen != trans->transid) {
-		struct btrfs_path *back_path = btrfs_alloc_path();
-		int ret;
-		mutex_lock(&root->fs_info->alloc_mutex);
-		ret = btrfs_insert_extent_backref(trans,
-						  root->fs_info->extent_root,
-						  path, lower->start,
-						  root->root_key.objectid,
-						  trans->transid, 0, 0);
-		BUG_ON(ret);
-		mutex_unlock(&root->fs_info->alloc_mutex);
-		btrfs_free_path(back_path);
-	}
 	return 0;
 }
 
@@ -1766,7 +1792,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct btrfs_path *path, int level)
 {
-	u64 root_gen;
 	struct extent_buffer *c;
 	struct extent_buffer *split;
 	struct btrfs_disk_key disk_key;
@@ -1793,17 +1818,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	}
 
 	c_nritems = btrfs_header_nritems(c);
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
 
-	btrfs_node_key(c, &disk_key, 0);
 	split = btrfs_alloc_free_block(trans, root, root->nodesize,
-					 root->root_key.objectid,
-					 root_gen,
-					 btrfs_disk_key_objectid(&disk_key),
-					 level, c->start, 0);
+					path->nodes[level + 1]->start,
+					root->root_key.objectid,
+					trans->transid, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -1840,6 +1859,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (wret)
 		ret = wret;
 
+	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
+	BUG_ON(ret);
+
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		btrfs_tree_unlock(c);
@@ -1955,10 +1977,23 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	else
 		nr = 1;
 
+	if (path->slots[0] >= left_nritems)
+		push_space += data_size + sizeof(*item);
+
 	i = left_nritems - 1;
 	while (i >= nr) {
 		item = btrfs_item_nr(left, i);
 
+		if (!empty && push_items > 0) {
+			if (path->slots[0] > i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, left);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
 
@@ -1973,6 +2008,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		this_item_size = btrfs_item_size(left, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
+
 		push_items++;
 		push_space += this_item_size + sizeof(*item);
 		if (i == 0)
@@ -2046,6 +2082,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_mark_buffer_dirty(left);
 	btrfs_mark_buffer_dirty(right);
 
+	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
+	BUG_ON(ret);
+
 	btrfs_item_key(right, &disk_key, 0);
 	btrfs_set_node_key(upper, &disk_key, slot + 1);
 	btrfs_mark_buffer_dirty(upper);
@@ -2147,6 +2186,16 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 					KM_USER1);
 		}
 
+		if (!empty && push_items > 0) {
+			if (path->slots[0] < i)
+				break;
+			if (path->slots[0] == i) {
+				int space = btrfs_leaf_free_space(root, right);
+				if (space + push_space * 2 > free_space)
+					break;
+			}
+		}
+
 		if (path->slots[0] == i)
 			push_space += data_size + sizeof(*item);
 
@@ -2255,6 +2304,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 
+	ret = btrfs_update_ref(trans, root, right, left,
+			       old_left_nritems, push_items);
+	BUG_ON(ret);
+
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	if (wret)
@@ -2294,7 +2347,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path, int data_size,
 			       int extend)
 {
-	u64 root_gen;
 	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
@@ -2313,11 +2365,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	if (extend)
 		space_needed = data_size;
 
-	if (root->ref_cows)
-		root_gen = trans->transid;
-	else
-		root_gen = 0;
-
 	/* first try to make some room by pushing left and right */
 	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -2348,13 +2395,10 @@ again:
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1)/ 2;
 
-	btrfs_item_key(l, &disk_key, 0);
-
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-					 root->root_key.objectid,
-					 root_gen,
-					 le64_to_cpu(disk_key.objectid),
-					 0, l->start, 0);
+					path->nodes[1]->start,
+					root->root_key.objectid,
+					trans->transid, 0, l->start, 0);
 	if (IS_ERR(right)) {
 		BUG_ON(1);
 		return PTR_ERR(right);
@@ -2485,6 +2529,9 @@ again:
 	btrfs_mark_buffer_dirty(l);
 	BUG_ON(path->slots[0] != slot);
 
+	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+	BUG_ON(ret);
+
 	if (mid <= slot) {
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
@@ -2956,6 +3003,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				ret = wret;
 			wret = btrfs_free_extent(trans, root,
 					 leaf->start, leaf->len,
+					 path->nodes[1]->start,
 					 btrfs_header_owner(path->nodes[1]),
 					 root_gen, 0, 0, 1);
 			if (wret)
@@ -3007,7 +3055,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 				free_extent_buffer(leaf);
 				wret = btrfs_free_extent(trans, root, bytenr,
-					     blocksize,
+					     blocksize, path->nodes[1]->start,
 					     btrfs_header_owner(path->nodes[1]),
 					     root_gen, 0, 0, 1);
 				if (wret)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 730aae3bc18..138c157bbc4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,7 +40,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B8RfS_M"
+#define BTRFS_MAGIC "_B9RfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
@@ -81,6 +81,9 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_LOG_OBJECTID -6ULL
 #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
 
+/* dummy objectid represents multiple objectids */
+#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
+
 /*
  * All files have objectids in this range.
  */
@@ -369,6 +372,7 @@ struct btrfs_extent_ref {
 	__le64 generation;
 	__le64 objectid;
 	__le64 offset;
+	__le32 num_refs;
 } __attribute__ ((__packed__));
 
 /* dev extents record free space on individual devices.  The owner
@@ -1047,9 +1051,6 @@ btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
-/* struct btrfs_extent_item */
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
 		   chunk_tree, 64);
@@ -1070,14 +1071,20 @@ BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
 BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
+BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
 
 BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
 			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref,
+			 offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
+			 num_refs, 32);
 
+/* struct btrfs_extent_item */
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
 			 refs, 32);
 
@@ -1474,8 +1481,7 @@ static inline struct dentry *fdentry(struct file *file) {
 }
 
 /* extent-tree.c */
-int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
-			u64 start, u64 len);
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1495,10 +1501,9 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 int data, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
-					     u32 blocksize,
+					     u32 blocksize, u64 parent,
 					     u64 root_objectid,
 					     u64 ref_generation,
-					     u64 first_objectid,
 					     int level,
 					     u64 hint,
 					     u64 empty_size);
@@ -1508,23 +1513,24 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 struct btrfs_path *path, u64 bytenr,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 parent,
 				 u64 root_objectid, u64 ref_generation,
 				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_bytes,
+		       u64 num_bytes, u64 parent, u64 min_bytes,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins);
@@ -1535,9 +1541,16 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int cache_ref);
-int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, u64 bytenr, u64 num_bytes,
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents);
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents);
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr);
+int btrfs_free_extent(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
 		      u64 owner_objectid, u64 owner_offset, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
@@ -1545,10 +1558,15 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_io_tree *unpin);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset);
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 ref_generation,
+			 u64 owner, u64 owner_offset);
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 orig_parent, u64 parent,
+			    u64 root_objectid, u64 ref_generation,
+			    u64 owner, u64 owner_offset);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
@@ -1561,7 +1579,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-
+int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 25be96946a2..d35ca6a3f51 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -882,8 +882,8 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 	root->ref_cows = 0;
 
 	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
-					    BTRFS_TREE_LOG_OBJECTID,
-					    0, 0, 0, 0, 0);
+					    0, BTRFS_TREE_LOG_OBJECTID,
+					    trans->transid, 0, 0, 0);
 
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 558fbe40736..5258923d621 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,6 +29,21 @@
 #include "locking.h"
 #include "ref-cache.h"
 
+#define PENDING_EXTENT_INSERT 0
+#define PENDING_EXTENT_DELETE 1
+#define PENDING_BACKREF_UPDATE 2
+
+struct pending_extent_op {
+	int type;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 parent;
+	u64 orig_parent;
+	u64 generation;
+	u64 orig_generation;
+	int level;
+};
+
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -487,48 +502,15 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	return ret;
 }
 
-static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
-			   u64 owner, u64 owner_offset)
-{
-	u32 high_crc = ~(u32)0;
-	u32 low_crc = ~(u32)0;
-	__le64 lenum;
-	lenum = cpu_to_le64(root_objectid);
-	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
-	lenum = cpu_to_le64(ref_generation);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-	if (owner >= BTRFS_FIRST_FREE_OBJECTID) {
-		lenum = cpu_to_le64(owner);
-		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-		lenum = cpu_to_le64(owner_offset);
-		low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
-	}
-	return ((u64)high_crc << 32) | (u64)low_crc;
-}
-
-static int match_extent_ref(struct extent_buffer *leaf,
-			    struct btrfs_extent_ref *disk_ref,
-			    struct btrfs_extent_ref *cpu_ref)
-{
-	int ret;
-	int len;
-
-	if (cpu_ref->objectid)
-		len = sizeof(*cpu_ref);
-	else
-		len = 2 * sizeof(u64);
-	ret = memcmp_extent_buffer(leaf, cpu_ref, (unsigned long)disk_ref,
-				   len);
-	return ret == 0;
-}
-
 /* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
-			u64 start, u64 len)
+int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	int ret;
 	struct btrfs_key key;
+	struct btrfs_path *path;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 	maybe_lock_mutex(root);
 	key.objectid = start;
 	key.offset = len;
@@ -536,72 +518,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
 	maybe_unlock_mutex(root);
-	return ret;
-}
-
-static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path, u64 bytenr,
-					  u64 root_objectid,
-					  u64 ref_generation, u64 owner,
-					  u64 owner_offset, int del)
-{
-	u64 hash;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct btrfs_extent_ref ref;
-	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *disk_ref;
-	int ret;
-	int ret2;
-
-	btrfs_set_stack_ref_root(&ref, root_objectid);
-	btrfs_set_stack_ref_generation(&ref, ref_generation);
-	btrfs_set_stack_ref_objectid(&ref, owner);
-	btrfs_set_stack_ref_offset(&ref, owner_offset);
-
-	hash = hash_extent_ref(root_objectid, ref_generation, owner,
-			       owner_offset);
-	key.offset = hash;
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-
-	while (1) {
-		ret = btrfs_search_slot(trans, root, &key, path,
-					del ? -1 : 0, del);
-		if (ret < 0)
-			goto out;
-		leaf = path->nodes[0];
-		if (ret != 0) {
-			u32 nritems = btrfs_header_nritems(leaf);
-			if (path->slots[0] >= nritems) {
-				ret2 = btrfs_next_leaf(root, path);
-				if (ret2)
-					goto out;
-				leaf = path->nodes[0];
-			}
-			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-			if (found_key.objectid != bytenr ||
-			    found_key.type != BTRFS_EXTENT_REF_KEY)
-				goto out;
-			key.offset = found_key.offset;
-			if (del) {
-				btrfs_release_path(root, path);
-				continue;
-			}
-		}
-		disk_ref = btrfs_item_ptr(path->nodes[0],
-					  path->slots[0],
-					  struct btrfs_extent_ref);
-		if (match_extent_ref(path->nodes[0], disk_ref, &ref)) {
-			ret = 0;
-			goto out;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		key.offset = found_key.offset + 1;
-		btrfs_release_path(root, path);
-	}
-out:
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -622,7 +539,7 @@ out:
  * File extents can be referenced by:
  *
  * - multiple snapshots, subvolumes, or different generations in one subvol
- * - different files inside a single subvolume (in theory, not implemented yet)
+ * - different files inside a single subvolume
  * - different offsets inside a file (bookend extents in file.c)
  *
  * The extent ref structure has fields for:
@@ -631,119 +548,284 @@ out:
  * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
  * - offset in the file corresponding to the key holding the reference
+ * - number of references held by the parent node (always 1 for tree blocks)
+ *
+ * A btree leaf may hold multiple references to a file extent. In most
+ * cases, these references are from the same file and the corresponding
+ * offsets inside the file are close together. So the inode objectid and
+ * offset in file are just hints: they indicate where in the btree the
+ * references can be found and when we can stop searching.
  *
  * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file, 1)
  *
  * When a leaf is cow'd new references are added for every file extent found
- * in the leaf.  It looks the same as the create case, but trans->transid
- * will be different when the block is cow'd.
+ * in the leaf.  It looks similar to the create case, but trans->transid will
+ * be different when the block is cow'd.
  *
- *     (root_key.objectid, trans->transid, inode objectid, offset in file)
+ *     (root_key.objectid, trans->transid, inode objectid, offset in file,
+ *      number of references in the leaf)
  *
- * When a file extent is removed either during snapshot deletion or file
- * truncation, the corresponding back reference is found
- * by searching for:
+ * Because inode objectid and offset in file are just hints, they are not
+ * used when backrefs are deleted. When a file extent is removed either
+ * during snapshot deletion or file truncation, we find the corresponding
+ * back reference and check the following fields.
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
- *      inode objectid, offset in file)
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf))
  *
  * Btree extents can be referenced by:
  *
  * - Different subvolumes
  * - Different generations of the same subvolume
  *
- * Storing sufficient information for a full reverse mapping of a btree
- * block would require storing the lowest key of the block in the backref,
- * and it would require updating that lowest key either before write out or
- * every time it changed.  Instead, the objectid of the lowest key is stored
- * along with the level of the tree block.  This provides a hint
- * about where in the btree the block can be found.  Searches through the
- * btree only need to look for a pointer to that block, so they stop one
- * level higher than the level recorded in the backref.
- *
- * Some btrees do not do reference counting on their extents.  These
- * include the extent tree and the tree of tree roots.  Backrefs for these
- * trees always have a generation of zero.
- *
  * When a tree block is created, back references are inserted:
  *
- * (root->root_key.objectid, trans->transid or zero, level, lowest_key_objectid)
+ * (root->root_key.objectid, trans->transid, level, 0, 1)
  *
- * When a tree block is cow'd in a reference counted root,
- * new back references are added for all the blocks it points to.
- * These are of the form (trans->transid will have increased since creation):
+ * When a tree block is cow'd, new back references are added for all the
+ * blocks it points to. If the tree block isn't in a reference counted root,
+ * the old back references are removed. These new back references are of
+ * the form (trans->transid will have increased since creation):
  *
- * (root->root_key.objectid, trans->transid, level, lowest_key_objectid)
+ * (root->root_key.objectid, trans->transid, level, 0, 1)
  *
- * Because the lowest_key_objectid and the level are just hints
- * they are not used when backrefs are deleted.  When a backref is deleted:
+ * When a backref is being deleted, the following fields are checked:
  *
  * if backref was for a tree root:
- *     root_objectid = root->root_key.objectid
+ *     (btrfs_header_owner(itself), btrfs_header_generation(itself))
  * else
- *     root_objectid = btrfs_header_owner(parent)
+ *     (btrfs_header_owner(parent), btrfs_header_generation(parent))
  *
- * (root_objectid, btrfs_header_generation(parent) or zero, 0, 0)
+ * Back Reference Key composition:
  *
- * Back Reference Key hashing:
- *
- * Back references have four fields, each 64 bits long.  Unfortunately,
- * This is hashed into a single 64 bit number and placed into the key offset.
- * The key objectid corresponds to the first byte in the extent, and the
- * key type is set to BTRFS_EXTENT_REF_KEY
+ * The key objectid corresponds to the first byte in the extent, the key
+ * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
+ * byte of the parent extent. If an extent is a tree root, the key offset
+ * is set to the key objectid.
  */
-int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path, u64 bytenr,
-				 u64 root_objectid, u64 ref_generation,
-				 u64 owner, u64 owner_offset)
+
+static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path, u64 bytenr,
+					  u64 parent, u64 ref_root,
+					  u64 ref_generation, int del)
 {
-	u64 hash;
 	struct btrfs_key key;
-	struct btrfs_extent_ref ref;
-	struct btrfs_extent_ref *disk_ref;
+	struct btrfs_extent_ref *ref;
+	struct extent_buffer *leaf;
 	int ret;
 
-	btrfs_set_stack_ref_root(&ref, root_objectid);
-	btrfs_set_stack_ref_generation(&ref, ref_generation);
-	btrfs_set_stack_ref_objectid(&ref, owner);
-	btrfs_set_stack_ref_offset(&ref, owner_offset);
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = parent;
+
+	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	if (btrfs_ref_root(leaf, ref) != ref_root ||
+	    btrfs_ref_generation(leaf, ref) != ref_generation) {
+		ret = -EIO;
+		WARN_ON(1);
+		goto out;
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 ref_root, u64 ref_generation,
+					  u64 owner_objectid, u64 owner_offset)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *ref;
+	u32 num_refs;
+	int ret;
 
-	hash = hash_extent_ref(root_objectid, ref_generation, owner,
-			       owner_offset);
-	key.offset = hash;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = parent;
 
-	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(ref));
-	while (ret == -EEXIST) {
-		disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					  struct btrfs_extent_ref);
-		if (match_extent_ref(path->nodes[0], disk_ref, &ref))
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
+	if (ret == 0) {
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		btrfs_set_ref_root(leaf, ref, ref_root);
+		btrfs_set_ref_generation(leaf, ref, ref_generation);
+		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
+		btrfs_set_ref_offset(leaf, ref, owner_offset);
+		btrfs_set_ref_num_refs(leaf, ref, 1);
+	} else if (ret == -EEXIST) {
+		u64 existing_owner;
+		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
+		leaf = path->nodes[0];
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		if (btrfs_ref_root(leaf, ref) != ref_root ||
+		    btrfs_ref_generation(leaf, ref) != ref_generation) {
+			ret = -EIO;
+			WARN_ON(1);
 			goto out;
-		key.offset++;
-		btrfs_release_path(root, path);
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-					      sizeof(ref));
-	}
-	if (ret)
+		}
+
+		num_refs = btrfs_ref_num_refs(leaf, ref);
+		BUG_ON(num_refs == 0);
+		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+
+		existing_owner = btrfs_ref_objectid(leaf, ref);
+		if (existing_owner == owner_objectid &&
+		    btrfs_ref_offset(leaf, ref) > owner_offset) {
+			btrfs_set_ref_offset(leaf, ref, owner_offset);
+		} else if (existing_owner != owner_objectid &&
+			   existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
+			btrfs_set_ref_objectid(leaf, ref,
+					BTRFS_MULTIPLE_OBJECTIDS);
+			btrfs_set_ref_offset(leaf, ref, 0);
+		}
+		ret = 0;
+	} else {
 		goto out;
-	disk_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				  struct btrfs_extent_ref);
-	write_extent_buffer(path->nodes[0], &ref, (unsigned long)disk_ref,
-			    sizeof(ref));
+	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
 	btrfs_release_path(root, path);
 	return ret;
 }
 
+static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref *ref;
+	u32 num_refs;
+	int ret = 0;
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	num_refs = btrfs_ref_num_refs(leaf, ref);
+	BUG_ON(num_refs == 0);
+	num_refs -= 1;
+	if (num_refs == 0) {
+		ret = btrfs_del_item(trans, root, path);
+	} else {
+		btrfs_set_ref_num_refs(leaf, ref, num_refs);
+		btrfs_mark_buffer_dirty(leaf);
+	}
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, u64 bytenr,
+				     u64 orig_parent, u64 parent,
+				     u64 orig_root, u64 ref_root,
+				     u64 orig_generation, u64 ref_generation,
+				     u64 owner_objectid, u64 owner_offset)
+{
+	int ret;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_path *path;
+
+	if (root == root->fs_info->extent_root) {
+		struct pending_extent_op *extent_op;
+		u64 num_bytes;
+
+		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
+		num_bytes = btrfs_level_size(root, (int)owner_objectid);
+		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
+				bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
+			u64 priv;
+			ret = get_state_private(&root->fs_info->extent_ins,
+						bytenr, &priv);
+			BUG_ON(ret);
+			extent_op = (struct pending_extent_op *)
+							(unsigned long)priv;
+			BUG_ON(extent_op->parent != orig_parent);
+			BUG_ON(extent_op->generation != orig_generation);
+			extent_op->parent = parent;
+			extent_op->generation = ref_generation;
+		} else {
+			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+			BUG_ON(!extent_op);
+
+			extent_op->type = PENDING_BACKREF_UPDATE;
+			extent_op->bytenr = bytenr;
+			extent_op->num_bytes = num_bytes;
+			extent_op->parent = parent;
+			extent_op->orig_parent = orig_parent;
+			extent_op->generation = ref_generation;
+			extent_op->orig_generation = orig_generation;
+			extent_op->level = (int)owner_objectid;
+
+			set_extent_bits(&root->fs_info->extent_ins,
+					bytenr, bytenr + num_bytes - 1,
+					EXTENT_LOCKED, GFP_NOFS);
+			set_state_private(&root->fs_info->extent_ins,
+					  bytenr, (unsigned long)extent_op);
+		}
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = lookup_extent_backref(trans, extent_root, path,
+				    bytenr, orig_parent, orig_root,
+				    orig_generation, 1);
+	if (ret)
+		goto out;
+	ret = remove_extent_backref(trans, extent_root, path);
+	if (ret)
+		goto out;
+	ret = insert_extent_backref(trans, extent_root, path, bytenr,
+				    parent, ref_root, ref_generation,
+				    owner_objectid, owner_offset);
+	BUG_ON(ret);
+	finish_current_insert(trans, extent_root);
+	del_pending_extents(trans, extent_root);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 orig_parent, u64 parent,
+			    u64 ref_root, u64 ref_generation,
+			    u64 owner_objectid, u64 owner_offset)
+{
+	int ret;
+	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+	maybe_lock_mutex(root);
+	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
+					parent, ref_root, ref_root,
+					ref_generation, ref_generation,
+					owner_objectid, owner_offset);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset)
+				  struct btrfs_root *root, u64 bytenr,
+				  u64 orig_parent, u64 parent,
+				  u64 orig_root, u64 ref_root,
+				  u64 orig_generation, u64 ref_generation,
+				  u64 owner_objectid, u64 owner_offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -752,24 +834,28 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *item;
 	u32 refs;
 
-	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	path->reada = 1;
 	key.objectid = bytenr;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_bytes;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
 				0, 1);
 	if (ret < 0)
 		return ret;
-	if (ret != 0) {
-		BUG();
-	}
-	BUG_ON(ret != 0);
+	BUG_ON(ret == 0 || path->slots[0] == 0);
+
+	path->slots[0]--;
 	l = path->nodes[0];
+
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	BUG_ON(key.objectid != bytenr);
+	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(l, item);
 	btrfs_set_extent_refs(l, item, refs + 1);
@@ -778,9 +864,10 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
-	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
-					  path, bytenr, root_objectid,
-					  ref_generation, owner, owner_offset);
+	ret = insert_extent_backref(trans, root->fs_info->extent_root,
+				    path, bytenr, parent,
+				    ref_root, ref_generation,
+				    owner_objectid, owner_offset);
 	BUG_ON(ret);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
@@ -790,18 +877,20 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset)
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 ref_root, u64 ref_generation,
+			 u64 owner_objectid, u64 owner_offset)
 {
 	int ret;
-
-	mutex_lock(&root->fs_info->alloc_mutex);
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-				     root_objectid, ref_generation,
-				     owner, owner_offset);
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
+	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		return 0;
+	maybe_lock_mutex(root);
+	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+				     0, ref_root, 0, ref_generation,
+				     owner_objectid, owner_offset);
+	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -813,9 +902,9 @@ int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int lookup_extent_ref(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u32 *refs)
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -846,7 +935,6 @@ out:
 	return 0;
 }
 
-
 static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 				u64 parent_gen, u64 ref_objectid,
 			        u64 *min_generation, u32 *ref_count)
@@ -863,7 +951,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	int ret;
 
 	key.objectid = bytenr;
-	key.offset = 0;
+	key.offset = (u64)-1;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
@@ -872,7 +960,10 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
+	if (ret < 0 || path->slots[0] == 0)
+		goto out;
 
+	path->slots[0]--;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
@@ -909,7 +1000,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 					  struct btrfs_extent_ref);
 		ref_generation = btrfs_ref_generation(leaf, ref_item);
 		/*
-		 * For (parent_gen > 0 && parent_gen > ref_gen):
+		 * For (parent_gen > 0 && parent_gen > ref_generation):
 		 *
 		 * we reach here through the oldest root, therefore
 		 * all other reference from same snapshot should have
@@ -919,8 +1010,7 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 		    (parent_gen > 0 && parent_gen > ref_generation) ||
 		    (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
 		     ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
-			if (ref_count)
-				*ref_count = 2;
+			*ref_count = 2;
 			break;
 		}
 
@@ -1020,80 +1110,29 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int cache_ref)
+int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		    struct extent_buffer *buf, u32 nr_extents)
 {
-	u64 bytenr;
 	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int level;
-	int ret;
-	int faili;
-	int nr_file_extents = 0;
+	int ret = 0;
 
 	if (!root->ref_cows)
 		return 0;
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		if (level == 0) {
-			u64 disk_bytenr;
-			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-				continue;
-			fi = btrfs_item_ptr(buf, i,
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(buf, fi) ==
-			    BTRFS_FILE_EXTENT_INLINE)
-				continue;
-			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (disk_bytenr == 0)
-				continue;
-
-			if (buf != root->commit_root)
-				nr_file_extents++;
-
-			mutex_lock(&root->fs_info->alloc_mutex);
-			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf, fi),
-				    root->root_key.objectid, trans->transid,
-				    key.objectid, key.offset);
-			mutex_unlock(&root->fs_info->alloc_mutex);
-			if (ret) {
-				faili = i;
-				WARN_ON(1);
-				goto fail;
-			}
-		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
-			btrfs_node_key_to_cpu(buf, &key, i);
 
-			mutex_lock(&root->fs_info->alloc_mutex);
-			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
-					   btrfs_level_size(root, level - 1),
-					   root->root_key.objectid,
-					   trans->transid,
-					   level - 1, key.objectid);
-			mutex_unlock(&root->fs_info->alloc_mutex);
-			if (ret) {
-				faili = i;
-				WARN_ON(1);
-				goto fail;
-			}
-		}
-	}
-	/* cache orignal leaf block's references */
-	if (level == 0 && cache_ref && buf != root->commit_root) {
+	if (level == 0) {
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
-		ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
+		ref = btrfs_alloc_leaf_ref(root, nr_extents);
 		if (!ref) {
-			WARN_ON(1);
+			ret = -ENOMEM;
 			goto out;
 		}
 
@@ -1101,10 +1140,10 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
-		ref->nritems = nr_file_extents;
+		ref->nritems = nr_extents;
 		info = ref->extents;
 
-		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
+		for (i = 0; nr_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
@@ -1132,13 +1171,52 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		btrfs_free_leaf_ref(root, ref);
 	}
 out:
-	return 0;
-fail:
-	WARN_ON(1);
-#if 0
-	for (i =0; i < faili; i++) {
+	return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
+		  u32 *nr_extents)
+{
+	u64 bytenr;
+	u64 ref_root;
+	u64 orig_root;
+	u64 ref_generation;
+	u64 orig_generation;
+	u32 nritems;
+	u32 nr_file_extents = 0;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int level;
+	int ret = 0;
+	int faili = 0;
+	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+
+	ref_root = btrfs_header_owner(buf);
+	ref_generation = btrfs_header_generation(buf);
+	orig_root = btrfs_header_owner(orig_buf);
+	orig_generation = btrfs_header_generation(orig_buf);
+
+	nritems = btrfs_header_nritems(buf);
+	level = btrfs_header_level(buf);
+
+	if (root->ref_cows) {
+		process_func = __btrfs_inc_extent_ref;
+	} else {
+		if (level == 0 &&
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			goto out;
+		if (level != 0 &&
+		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+			goto out;
+		process_func = __btrfs_update_extent_ref;
+	}
+
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
 		if (level == 0) {
-			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
@@ -1147,24 +1225,131 @@ fail:
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
 				continue;
-			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (disk_bytenr == 0)
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
 				continue;
-			err = btrfs_free_extent(trans, root, disk_bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf,
-								      fi), 0);
-			BUG_ON(err);
+
+			nr_file_extents++;
+
+			maybe_lock_mutex(root);
+			ret = process_func(trans, root, bytenr,
+					   orig_buf->start, buf->start,
+					   orig_root, ref_root,
+					   orig_generation, ref_generation,
+					   key.objectid, key.offset);
+			maybe_unlock_mutex(root);
+
+			if (ret) {
+				faili = i;
+				WARN_ON(1);
+				goto fail;
+			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			err = btrfs_free_extent(trans, root, bytenr,
-					btrfs_level_size(root, level - 1), 0);
-			BUG_ON(err);
+			maybe_lock_mutex(root);
+			ret = process_func(trans, root, bytenr,
+					   orig_buf->start, buf->start,
+					   orig_root, ref_root,
+					   orig_generation, ref_generation,
+					   level - 1, 0);
+			maybe_unlock_mutex(root);
+			if (ret) {
+				faili = i;
+				WARN_ON(1);
+				goto fail;
+			}
 		}
 	}
-#endif
+out:
+	if (nr_extents) {
+		if (level == 0)
+			*nr_extents = nr_file_extents;
+		else
+			*nr_extents = nritems;
+	}
+	return 0;
+fail:
+	WARN_ON(1);
 	return ret;
 }
 
+int btrfs_update_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root, struct extent_buffer *orig_buf,
+		     struct extent_buffer *buf, int start_slot, int nr)
+
+{
+	u64 bytenr;
+	u64 ref_root;
+	u64 orig_root;
+	u64 ref_generation;
+	u64 orig_generation;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int i;
+	int ret;
+	int slot;
+	int level;
+
+	BUG_ON(start_slot < 0);
+	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
+
+	ref_root = btrfs_header_owner(buf);
+	ref_generation = btrfs_header_generation(buf);
+	orig_root = btrfs_header_owner(orig_buf);
+	orig_generation = btrfs_header_generation(orig_buf);
+	level = btrfs_header_level(buf);
+
+	if (!root->ref_cows) {
+		if (level == 0 &&
+		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+			return 0;
+		if (level != 0 &&
+		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
+			return 0;
+	}
+
+	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
+		cond_resched();
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, slot);
+			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+				continue;
+			fi = btrfs_item_ptr(buf, slot,
+					    struct btrfs_file_extent_item);
+			if (btrfs_file_extent_type(buf, fi) ==
+			    BTRFS_FILE_EXTENT_INLINE)
+				continue;
+			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+			if (bytenr == 0)
+				continue;
+			maybe_lock_mutex(root);
+			ret = __btrfs_update_extent_ref(trans, root, bytenr,
+					    orig_buf->start, buf->start,
+					    orig_root, ref_root,
+					    orig_generation, ref_generation,
+					    key.objectid, key.offset);
+			maybe_unlock_mutex(root);
+			if (ret)
+				goto fail;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, slot);
+			maybe_lock_mutex(root);
+			ret = __btrfs_update_extent_ref(trans, root, bytenr,
+					    orig_buf->start, buf->start,
+					    orig_root, ref_root,
+					    orig_generation, ref_generation,
+					    level - 1, 0);
+			maybe_unlock_mutex(root);
+			if (ret)
+				goto fail;
+		}
+	}
+	return 0;
+fail:
+	WARN_ON(1);
+	return -1;
+}
+
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
@@ -1539,19 +1724,18 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 {
 	u64 start;
 	u64 end;
+	u64 priv;
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct extent_buffer *eb;
 	struct btrfs_path *path;
-	struct btrfs_key ins;
-	struct btrfs_disk_key first;
+	struct btrfs_extent_ref *ref;
+	struct pending_extent_op *extent_op;
+	struct btrfs_key key;
 	struct btrfs_extent_item extent_item;
 	int ret;
-	int level;
 	int err = 0;
 
 	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
-	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	path = btrfs_alloc_path();
 
 	while(1) {
@@ -1560,37 +1744,54 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 		if (ret)
 			break;
 
-		ins.objectid = start;
-		ins.offset = end + 1 - start;
-		err = btrfs_insert_item(trans, extent_root, &ins,
+		ret = get_state_private(&info->extent_ins, start, &priv);
+		BUG_ON(ret);
+		extent_op = (struct pending_extent_op *)(unsigned long)priv;
+
+		if (extent_op->type == PENDING_EXTENT_INSERT) {
+			key.objectid = start;
+			key.offset = end + 1 - start;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			err = btrfs_insert_item(trans, extent_root, &key,
 					&extent_item, sizeof(extent_item));
-		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
-				  GFP_NOFS);
+			BUG_ON(err);
 
-		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
-					   ins.offset);
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
 
-		if (!btrfs_buffer_uptodate(eb, trans->transid))
-			btrfs_read_buffer(eb, trans->transid);
+			err = insert_extent_backref(trans, extent_root, path,
+						start, extent_op->parent,
+						extent_root->root_key.objectid,
+						extent_op->generation,
+						extent_op->level, 0);
+			BUG_ON(err);
+		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
+			err = lookup_extent_backref(trans, extent_root, path,
+						start, extent_op->orig_parent,
+						extent_root->root_key.objectid,
+						extent_op->orig_generation, 0);
+			BUG_ON(err);
 
-		btrfs_tree_lock(eb);
-		level = btrfs_header_level(eb);
-		if (level == 0) {
-			btrfs_item_key(eb, &first, 0);
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
+
+			key.objectid = start;
+			key.offset = extent_op->parent;
+			key.type = BTRFS_EXTENT_REF_KEY;
+			err = btrfs_set_item_key_safe(trans, extent_root, path,
+						      &key);
+			BUG_ON(err);
+			ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					     struct btrfs_extent_ref);
+			btrfs_set_ref_generation(path->nodes[0], ref,
+						 extent_op->generation);
+			btrfs_mark_buffer_dirty(path->nodes[0]);
+			btrfs_release_path(extent_root, path);
 		} else {
-			btrfs_node_key(eb, &first, 0);
+			BUG_ON(1);
 		}
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
-		/*
-		 * the first key is just a hint, so the race we've created
-		 * against reading it is fine
-		 */
-		err = btrfs_insert_extent_backref(trans, extent_root, path,
-					  start, extent_root->root_key.objectid,
-					  0, level,
-					  btrfs_disk_key_objectid(&first));
-		BUG_ON(err);
+		kfree(extent_op);
+
 		if (need_resched()) {
 			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
@@ -1601,52 +1802,44 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
-			  int is_data, int pending)
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 bytenr, u64 num_bytes, int is_data)
 {
 	int err = 0;
+	struct extent_buffer *buf;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
-	if (!pending) {
-		struct extent_buffer *buf;
-
-		if (is_data)
-			goto pinit;
-
-		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
-		if (buf) {
-			/* we can reuse a block if it hasn't been written
-			 * and it is from this transaction.  We can't
-			 * reuse anything from the tree log root because
-			 * it has tiny sub-transactions.
-			 */
-			if (btrfs_buffer_uptodate(buf, 0) &&
-			    btrfs_try_tree_lock(buf)) {
-				u64 transid =
-				    root->fs_info->running_transaction->transid;
-				u64 header_transid =
-					btrfs_header_generation(buf);
-				if (btrfs_header_owner(buf) !=
-				    BTRFS_TREE_LOG_OBJECTID &&
-				    header_transid == transid &&
-				    !btrfs_header_flag(buf,
-					       BTRFS_HEADER_FLAG_WRITTEN)) {
-					clean_tree_block(NULL, root, buf);
-					btrfs_tree_unlock(buf);
-					free_extent_buffer(buf);
-					return 1;
-				}
-				btrfs_tree_unlock(buf);
-			}
+	if (is_data)
+		goto pinit;
+
+	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
+	if (!buf)
+		goto pinit;
+
+	/* we can reuse a block if it hasn't been written
+	 * and it is from this transaction.  We can't
+	 * reuse anything from the tree log root because
+	 * it has tiny sub-transactions.
+	 */
+	if (btrfs_buffer_uptodate(buf, 0) &&
+	    btrfs_try_tree_lock(buf)) {
+		u64 header_owner = btrfs_header_owner(buf);
+		u64 header_transid = btrfs_header_generation(buf);
+		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+		    header_transid == trans->transid &&
+		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+			clean_tree_block(NULL, root, buf);
+			btrfs_tree_unlock(buf);
 			free_extent_buffer(buf);
+			return 1;
 		}
-pinit:
-		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-	} else {
-		set_extent_bits(&root->fs_info->pending_del,
-				bytenr, bytenr + num_bytes - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+		btrfs_tree_unlock(buf);
 	}
+	free_extent_buffer(buf);
+pinit:
+	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+
 	BUG_ON(err < 0);
 	return 0;
 }
@@ -1654,11 +1847,12 @@ pinit:
 /*
  * remove an extent from the root, returns 0 on success
  */
-static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
-			 *root, u64 bytenr, u64 num_bytes,
+static int __free_extent(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset, int pin,
-			 int mark_free)
+			 u64 owner_objectid, u64 owner_offset,
+			 int pin, int mark_free)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1681,10 +1875,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		return -ENOMEM;
 
 	path->reada = 1;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, root_objectid,
-				    ref_generation,
-				    owner_objectid, owner_offset, 1);
+	ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent,
+				    root_objectid, ref_generation, 1);
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
@@ -1702,8 +1894,15 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 			if (path->slots[0] - extent_slot > 5)
 				break;
 		}
-		if (!found_extent)
-			ret = btrfs_del_item(trans, extent_root, path);
+		if (!found_extent) {
+			ret = remove_extent_backref(trans, extent_root, path);
+			BUG_ON(ret);
+			btrfs_release_path(extent_root, path);
+			ret = btrfs_search_slot(trans, extent_root,
+						&key, path, -1, 1);
+			BUG_ON(ret);
+			extent_slot = path->slots[0];
+		}
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
@@ -1712,14 +1911,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		       root_objectid, ref_generation, owner_objectid,
 		       owner_offset);
 	}
-	if (!found_extent) {
-		btrfs_release_path(extent_root, path);
-		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-		if (ret < 0)
-			return ret;
-		BUG_ON(ret);
-		extent_slot = path->slots[0];
-	}
 
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, extent_slot,
@@ -1732,6 +1923,10 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	btrfs_mark_buffer_dirty(leaf);
 
 	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+		struct btrfs_extent_ref *ref;
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_ref);
+		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
 		/* if the back ref and the extent are next to each other
 		 * they get deleted below in one shot
 		 */
@@ -1739,15 +1934,13 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		num_to_del = 2;
 	} else if (found_extent) {
 		/* otherwise delete the extent back ref */
-		ret = btrfs_del_item(trans, extent_root, path);
+		ret = remove_extent_backref(trans, extent_root, path);
 		BUG_ON(ret);
 		/* if refs are 0, we need to setup the path for deletion */
 		if (refs == 0) {
 			btrfs_release_path(extent_root, path);
 			ret = btrfs_search_slot(trans, extent_root, &key, path,
 						-1, 1);
-			if (ret < 0)
-				return ret;
 			BUG_ON(ret);
 		}
 	}
@@ -1761,8 +1954,8 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 #endif
 
 		if (pin) {
-			ret = pin_down_bytes(root, bytenr, num_bytes,
-			     owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, 0);
+			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1781,9 +1974,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 					   root_used - num_bytes);
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
-		if (ret) {
-			return ret;
-		}
+		BUG_ON(ret);
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
@@ -1822,33 +2013,61 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 {
 	int ret;
 	int err = 0;
+	int mark_free = 0;
 	u64 start;
 	u64 end;
+	u64 priv;
 	struct extent_io_tree *pending_del;
-	struct extent_io_tree *pinned_extents;
+	struct extent_io_tree *extent_ins;
+	struct pending_extent_op *extent_op;
 
 	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
-	pinned_extents = &extent_root->fs_info->pinned_extents;
 
 	while(1) {
 		ret = find_first_extent_bit(pending_del, 0, &start, &end,
 					    EXTENT_LOCKED);
 		if (ret)
 			break;
+
+		ret = get_state_private(pending_del, start, &priv);
+		BUG_ON(ret);
+		extent_op = (struct pending_extent_op *)(unsigned long)priv;
+
 		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
-		if (!test_range_bit(&extent_root->fs_info->extent_ins,
-				    start, end, EXTENT_LOCKED, 0)) {
-			btrfs_update_pinned_extents(extent_root, start,
-					      end + 1 - start, 1);
+
+		ret = pin_down_bytes(trans, extent_root, start,
+				     end + 1 - start, 0);
+		mark_free = ret > 0;
+		if (!test_range_bit(extent_ins, start, end,
+				    EXTENT_LOCKED, 0)) {
+free_extent:
 			ret = __free_extent(trans, extent_root,
-					     start, end + 1 - start,
-					     extent_root->root_key.objectid,
-					     0, 0, 0, 0, 0);
+					    start, end + 1 - start,
+					    extent_op->orig_parent,
+					    extent_root->root_key.objectid,
+					    extent_op->orig_generation,
+					    extent_op->level, 0, 0, mark_free);
+			kfree(extent_op);
 		} else {
-			clear_extent_bits(&extent_root->fs_info->extent_ins,
-					  start, end, EXTENT_LOCKED, GFP_NOFS);
+			kfree(extent_op);
+			ret = get_state_private(extent_ins, start, &priv);
+			BUG_ON(ret);
+			extent_op = (struct pending_extent_op *)
+							(unsigned long)priv;
+
+			clear_extent_bits(extent_ins, start, end,
+					  EXTENT_LOCKED, GFP_NOFS);
+
+			if (extent_op->type == PENDING_BACKREF_UPDATE)
+				goto free_extent;
+
+			ret = update_block_group(trans, extent_root, start,
+						end + 1 - start, 0, mark_free);
+			BUG_ON(ret);
+			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
@@ -1866,21 +2085,36 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
  * remove an extent from the root, returns 0 on success
  */
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root, u64 bytenr,
-			       u64 num_bytes, u64 root_objectid,
-			       u64 ref_generation, u64 owner_objectid,
-			       u64 owner_offset, int pin)
+			       struct btrfs_root *root,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 root_objectid, u64 ref_generation,
+			       u64 owner_objectid, u64 owner_offset, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
 	int ret;
 
 	WARN_ON(num_bytes < root->sectorsize);
-	if (!root->ref_cows)
-		ref_generation = 0;
-
 	if (root == extent_root) {
-		pin_down_bytes(root, bytenr, num_bytes, 0, 1);
+		struct pending_extent_op *extent_op;
+
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+
+		extent_op->type = PENDING_EXTENT_DELETE;
+		extent_op->bytenr = bytenr;
+		extent_op->num_bytes = num_bytes;
+		extent_op->parent = parent;
+		extent_op->orig_parent = parent;
+		extent_op->generation = ref_generation;
+		extent_op->orig_generation = ref_generation;
+		extent_op->level = (int)owner_objectid;
+
+		set_extent_bits(&root->fs_info->pending_del,
+				bytenr, bytenr + num_bytes - 1,
+				EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(&root->fs_info->pending_del,
+				  bytenr, (unsigned long)extent_op);
 		return 0;
 	}
 	/* if metadata always pin */
@@ -1901,9 +2135,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (ref_generation != trans->transid)
 		pin = 1;
 
-	ret = __free_extent(trans, root, bytenr, num_bytes, root_objectid,
-			    ref_generation, owner_objectid, owner_offset,
-			    pin, pin == 0);
+	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+			    root_objectid, ref_generation, owner_objectid,
+			    owner_offset, pin, pin == 0);
 
 	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
@@ -1911,15 +2145,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, u64 bytenr,
-		      u64 num_bytes, u64 root_objectid,
-		      u64 ref_generation, u64 owner_objectid,
-		      u64 owner_offset, int pin)
+		      struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent,
+		      u64 root_objectid, u64 ref_generation,
+		      u64 owner_objectid, u64 owner_offset, int pin)
 {
 	int ret;
 
 	maybe_lock_mutex(root);
-	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes,
+	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
 				  owner_objectid, owner_offset, pin);
 	maybe_unlock_mutex(root);
@@ -2271,7 +2505,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 }
 
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
+					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
 					 u64 owner, u64 owner_offset,
 					 struct btrfs_key *ins)
@@ -2289,6 +2523,9 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	if (parent == 0)
+		parent = ins->objectid;
+
 	/* block accounting for super block */
 	spin_lock_irq(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2300,17 +2537,32 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
 	if (root == extent_root) {
+		struct pending_extent_op *extent_op;
+
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+
+		extent_op->type = PENDING_EXTENT_INSERT;
+		extent_op->bytenr = ins->objectid;
+		extent_op->num_bytes = ins->offset;
+		extent_op->parent = parent;
+		extent_op->orig_parent = 0;
+		extent_op->generation = ref_generation;
+		extent_op->orig_generation = 0;
+		extent_op->level = (int)owner;
+
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
+		set_state_private(&root->fs_info->extent_ins,
+				  ins->objectid, (unsigned long)extent_op);
 		goto update_block;
 	}
 
 	memcpy(&keys[0], ins, sizeof(*ins));
-	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
-					 owner, owner_offset);
 	keys[1].objectid = ins->objectid;
 	keys[1].type = BTRFS_EXTENT_REF_KEY;
+	keys[1].offset = parent;
 	sizes[0] = sizeof(*extent_item);
 	sizes[1] = sizeof(*ref);
 
@@ -2331,6 +2583,7 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
 	btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
+	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 
@@ -2359,7 +2612,7 @@ out:
 }
 
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins)
@@ -2369,9 +2622,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
 	maybe_lock_mutex(root);
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					    root_objectid, ref_generation,
+					    owner, owner_offset, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2382,7 +2635,7 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
  * space cache bits as well
  */
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
 				u64 owner, u64 owner_offset,
 				struct btrfs_key *ins)
@@ -2396,10 +2649,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
 	BUG_ON(ret);
-
-	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-					    ref_generation, owner,
-					    owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					    root_objectid, ref_generation,
+					    owner, owner_offset, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2413,9 +2665,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
  */
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_alloc_size,
+		       u64 num_bytes, u64 parent, u64 min_alloc_size,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
+		       u64 owner_objectid, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
@@ -2428,9 +2680,9 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 				     search_end, ins, data);
 	BUG_ON(ret);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
-						    ref_generation, owner,
-						    owner_offset, ins);
+		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+					root_objectid, ref_generation,
+					owner_objectid, owner_offset, ins);
 		BUG_ON(ret);
 
 	}
@@ -2468,10 +2720,9 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
-					     u32 blocksize,
+					     u32 blocksize, u64 parent,
 					     u64 root_objectid,
 					     u64 ref_generation,
-					     u64 first_objectid,
 					     int level,
 					     u64 hint,
 					     u64 empty_size)
@@ -2480,10 +2731,9 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, blocksize, blocksize,
-				 root_objectid, ref_generation,
-				 level, first_objectid, empty_size, hint,
-				 (u64)-1, &ins, 0);
+	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
+				 root_objectid, ref_generation, level, 0,
+				 empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -2531,15 +2781,14 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
-				leaf_owner, leaf_generation,
+				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, key.offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
+		BUG_ON(ret);
 
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
-
-		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -2554,10 +2803,10 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 
 	for (i = 0; i < ref->nritems; i++) {
 		mutex_lock(&root->fs_info->alloc_mutex);
-		ret = __btrfs_free_extent(trans, root,
-					info->bytenr, info->num_bytes,
-					ref->owner, ref->generation,
-					info->objectid, info->offset, 0);
+		ret = __btrfs_free_extent(trans, root, info->bytenr,
+					  info->num_bytes, ref->bytenr,
+					  ref->owner, ref->generation,
+					  info->objectid, info->offset, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2576,7 +2825,7 @@ int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 {
 	int ret;
 
-	ret = lookup_extent_ref(NULL, root, start, len, refs);
+	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 // some debugging code in case we see problems here
@@ -2672,8 +2921,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
-						blocksize, root_owner,
-						root_gen, 0, 0, 1);
+						blocksize, parent->start,
+						root_owner, root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
@@ -2690,8 +2939,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		 * So, we don't need to check it again
 		 */
 		if (*level == 1) {
-			struct btrfs_key key;
-			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
 				ret = cache_drop_leaf_ref(trans, root, ref);
@@ -2750,12 +2997,13 @@ out:
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
-				  root_owner, root_gen, 0, 0, 1);
+				  parent->start, root_owner, root_gen,
+				  0, 0, 1);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	cond_resched();
 	return 0;
@@ -2792,19 +3040,18 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 			root_item->drop_level = i;
 			return 0;
 		} else {
-			if (path->nodes[*level] == root->node) {
-				root_owner = root->root_key.objectid;
-				root_gen =
-				   btrfs_header_generation(path->nodes[*level]);
-			} else {
-				struct extent_buffer *node;
-				node = path->nodes[*level + 1];
-				root_owner = btrfs_header_owner(node);
-				root_gen = btrfs_header_generation(node);
-			}
+			struct extent_buffer *parent;
+			if (path->nodes[*level] == root->node)
+				parent = path->nodes[*level];
+			else
+				parent = path->nodes[*level + 1];
+
+			root_owner = btrfs_header_owner(parent);
+			root_gen = btrfs_header_generation(parent);
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len,
+						parent->start,
 						root_owner, root_gen, 0, 0, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8624f3e8803..58ad25838a4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2201,9 +2201,10 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
-
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
+#endif
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 58b329ddb42..48a702d41c8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -524,6 +524,9 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
+	u64 leaf_start;
+	u64 root_gen;
+	u64 root_owner;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_path *path;
@@ -562,6 +565,9 @@ next_slot:
 		bookend = 0;
 		found_extent = 0;
 		found_inline = 0;
+		leaf_start = 0;
+		root_gen = 0;
+		root_owner = 0;
 		extent = NULL;
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -628,27 +634,18 @@ next_slot:
 			search_start = extent_end;
 		if (end <= extent_end && start >= key.offset && found_inline) {
 			*hint_byte = EXTENT_MAP_INLINE;
-			continue;
+			goto out;
+		}
+
+		if (found_extent) {
+			read_extent_buffer(leaf, &old, (unsigned long)extent,
+					   sizeof(old));
+			root_gen = btrfs_header_generation(leaf);
+			root_owner = btrfs_header_owner(leaf);
+			leaf_start = leaf->start;
 		}
+
 		if (end < extent_end && end >= key.offset) {
-			if (found_extent) {
-				u64 disk_bytenr =
-				    btrfs_file_extent_disk_bytenr(leaf, extent);
-				u64 disk_num_bytes =
-				    btrfs_file_extent_disk_num_bytes(leaf,
-								      extent);
-				read_extent_buffer(leaf, &old,
-						   (unsigned long)extent,
-						   sizeof(old));
-				if (disk_bytenr != 0) {
-					ret = btrfs_inc_extent_ref(trans, root,
-					         disk_bytenr, disk_num_bytes,
-						 root->root_key.objectid,
-						 trans->transid,
-						 key.objectid, end);
-					BUG_ON(ret);
-				}
-			}
 			bookend = 1;
 			if (found_inline && start <= key.offset)
 				keep = 1;
@@ -687,49 +684,12 @@ next_slot:
 		}
 		/* delete the entire extent */
 		if (!keep) {
-			u64 disk_bytenr = 0;
-			u64 disk_num_bytes = 0;
-			u64 extent_num_bytes = 0;
-			u64 root_gen;
-			u64 root_owner;
-
-			root_gen = btrfs_header_generation(leaf);
-			root_owner = btrfs_header_owner(leaf);
-			if (found_extent) {
-				disk_bytenr =
-				      btrfs_file_extent_disk_bytenr(leaf,
-								     extent);
-				disk_num_bytes =
-				      btrfs_file_extent_disk_num_bytes(leaf,
-								       extent);
-				extent_num_bytes =
-				      btrfs_file_extent_num_bytes(leaf, extent);
-				*hint_byte =
-					btrfs_file_extent_disk_bytenr(leaf,
-								      extent);
-			}
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
 			BUG_ON(ret);
-			btrfs_release_path(root, path);
 			extent = NULL;
-			if (found_extent && disk_bytenr != 0) {
-				dec_i_blocks(inode, extent_num_bytes);
-				ret = btrfs_free_extent(trans, root,
-						disk_bytenr,
-						disk_num_bytes,
-						root_owner,
-						root_gen, inode->i_ino,
-						key.offset, 0);
-			}
-
-			BUG_ON(ret);
-			if (!bookend && search_start >= end) {
-				ret = 0;
-				goto out;
-			}
-			if (!bookend)
-				continue;
+			btrfs_release_path(root, path);
+			/* the extent will be freed later */
 		}
 		if (bookend && found_inline && start <= key.offset) {
 			u32 new_size;
@@ -737,10 +697,13 @@ next_slot:
 						   extent_end - end);
 			dec_i_blocks(inode, (extent_end - key.offset) -
 					(extent_end - end));
-			btrfs_truncate_item(trans, root, path, new_size, 0);
+			ret = btrfs_truncate_item(trans, root, path,
+						  new_size, 0);
+			BUG_ON(ret);
 		}
 		/* create bookend, splitting the extent in two */
 		if (bookend && found_extent) {
+			u64 disk_bytenr;
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
@@ -748,13 +711,9 @@ next_slot:
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
+			BUG_ON(ret);
 
 			leaf = path->nodes[0];
-			if (ret) {
-				btrfs_print_leaf(root, leaf);
-				printk("got %d on inserting %Lu %u %Lu start %Lu end %Lu found %Lu %Lu keep was %d\n", ret , ins.objectid, ins.type, ins.offset, start, end, key.offset, extent_end, keep);
-			}
-			BUG_ON(ret);
 			extent = btrfs_item_ptr(leaf, path->slots[0],
 						struct btrfs_file_extent_item);
 			write_extent_buffer(leaf, &old,
@@ -770,11 +729,43 @@ next_slot:
 						   BTRFS_FILE_EXTENT_REG);
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
-			if (le64_to_cpu(old.disk_bytenr) != 0) {
+
+			disk_bytenr = le64_to_cpu(old.disk_bytenr);
+			if (disk_bytenr != 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						ins.objectid, ins.offset);
+				BUG_ON(ret);
+			}
+			btrfs_release_path(root, path);
+			if (disk_bytenr != 0) {
 				inode->i_blocks +=
 				      btrfs_file_extent_num_bytes(leaf,
 								  extent) >> 9;
 			}
+		}
+
+		if (found_extent && !keep) {
+			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
+
+			if (disk_bytenr != 0) {
+				dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
+				ret = btrfs_free_extent(trans, root,
+						disk_bytenr,
+						le64_to_cpu(old.disk_num_bytes),
+						leaf_start, root_owner,
+						root_gen, key.objectid,
+						key.offset, 0);
+				BUG_ON(ret);
+				*hint_byte = disk_bytenr;
+			}
+		}
+
+		if (search_start >= end) {
 			ret = 0;
 			goto out;
 		}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65b4f864b0d..2e7d82ec5d1 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -528,6 +528,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_file_extent_item *extent_item;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
 	u64 alloc_hint = 0;
 	struct list_head list;
 	struct btrfs_key ins;
@@ -544,20 +547,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
 
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
 
 	INIT_LIST_HEAD(&list);
 
-	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->len;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
-					  trans->transid, inode->i_ino,
-					  ordered_extent->file_offset, &ins);
-	BUG_ON(ret);
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
 	ret = btrfs_drop_extents(trans, root, inode,
@@ -566,18 +564,42 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 				 ordered_extent->len,
 				 ordered_extent->file_offset, &alloc_hint);
 	BUG_ON(ret);
-	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-				       ordered_extent->file_offset,
-				       ordered_extent->start,
-				       ordered_extent->len,
-				       ordered_extent->len, 0);
+
+	ins.objectid = inode->i_ino;
+	ins.offset = ordered_extent->file_offset;
+	ins.type = BTRFS_EXTENT_DATA_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &ins,
+				      sizeof(*extent_item));
 	BUG_ON(ret);
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
+					  ordered_extent->start);
+	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
+					     ordered_extent->len);
+	btrfs_set_file_extent_offset(leaf, extent_item, 0);
+	btrfs_set_file_extent_num_bytes(leaf, extent_item,
+					ordered_extent->len);
+	btrfs_mark_buffer_dirty(leaf);
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
+					  root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	btrfs_release_path(root, path);
+
 	inode->i_blocks += ordered_extent->len >> 9;
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
@@ -596,6 +618,8 @@ nocow:
 	btrfs_put_ordered_extent(ordered_extent);
 
 	btrfs_end_transaction(trans, root);
+	if (path)
+		btrfs_free_path(path);
 	return 0;
 }
 
@@ -1433,10 +1457,7 @@ search_again:
 					if (root->ref_cows)
 						dec_i_blocks(inode, num_dec);
 				}
-				if (root->ref_cows) {
-					root_gen =
-						btrfs_header_generation(leaf);
-				}
+				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1477,7 +1498,7 @@ delete:
 		if (found_extent) {
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
-						root_owner,
+						leaf->start, root_owner,
 						root_gen, inode->i_ino,
 						found_key.offset, 0);
 			BUG_ON(ret);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f84b5f6991c..4c6e0c15754 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -76,9 +76,8 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	if (ret)
 		goto fail;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      objectid, trans->transid, 0, 0,
-				      0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+				      objectid, trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto fail;
@@ -525,13 +524,10 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	struct file *src_file;
 	struct inode *src;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_ordered_extent *ordered;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	char *buf;
 	struct btrfs_key key;
-	struct btrfs_key new_key;
-	u32 size;
 	u32 nritems;
 	int slot;
 	int ret;
@@ -576,6 +572,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
+		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
@@ -619,6 +616,32 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		    key.objectid != src->i_ino)
 			break;
 
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
+		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			u32 size;
+			struct btrfs_key new_key;
+
+			size = btrfs_item_size_nr(leaf, slot);
+			read_extent_buffer(leaf, buf,
+					   btrfs_item_ptr_offset(leaf, slot),
+					   size);
+			btrfs_release_path(root, path);
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.objectid = inode->i_ino;
+			ret = btrfs_insert_empty_item(trans, root, path,
+						      &new_key, size);
+			if (ret)
+				goto out;
+
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			write_extent_buffer(leaf, buf,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    size);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
 			struct btrfs_file_extent_item *extent;
 			int found_type;
@@ -634,31 +657,15 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 				/* ds == 0 means there's a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, root,
-						     ds, dl,
+						     ds, dl, leaf->start,
 						     root->root_key.objectid,
 						     trans->transid,
 						     inode->i_ino, key.offset);
-					if (ret)
-						goto out;
+					BUG_ON(ret);
 				}
 			}
 		}
-
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
-		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			size = btrfs_item_size_nr(leaf, slot);
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
-			btrfs_release_path(root, path);
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = inode->i_ino;
-			ret = btrfs_insert_item(trans, root, &new_key,
-						buf, size);
-			BUG_ON(ret);
-		} else {
-			btrfs_release_path(root, path);
-		}
+		btrfs_release_path(root, path);
 		key.offset++;
 	}
 	ret = 0;
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f1374d597a1..3577badfa5b 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -102,11 +102,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
 			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu offset %llu\n",
+			       "owner %llu offset %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long long)btrfs_ref_offset(l, ref));
+			       (unsigned long long)btrfs_ref_offset(l, ref),
+			       (unsigned long)btrfs_ref_num_refs(l, ref));
 			break;
 
 		case BTRFS_EXTENT_DATA_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 56de3fb2d8d..88bbfd959f1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -89,9 +89,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	int ret;
 	u64 objectid = root->root_key.objectid;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 				      BTRFS_TREE_LOG_OBJECTID,
-				      0, 0, 0, 0, 0);
+				      trans->transid, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		return ret;
@@ -433,6 +433,49 @@ insert:
 						   trans->transid);
 		}
 	}
+
+	if (overwrite_root &&
+	    key->type == BTRFS_EXTENT_DATA_KEY) {
+		int extent_type;
+		struct btrfs_file_extent_item *fi;
+
+		fi = (struct btrfs_file_extent_item *)dst_ptr;
+		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			struct btrfs_key ins;
+			ins.objectid = btrfs_file_extent_disk_bytenr(
+							path->nodes[0], fi);
+			ins.offset = btrfs_file_extent_disk_num_bytes(
+							path->nodes[0], fi);
+			ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+			/*
+			 * is this extent already allocated in the extent
+			 * allocation tree?  If so, just add a reference
+			 */
+			ret = btrfs_lookup_extent(root, ins.objectid,
+						  ins.offset);
+			if (ret == 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						ins.objectid, ins.offset,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid,
+						key->objectid, key->offset);
+			} else {
+				/*
+				 * insert the extent pointer in the extent
+				 * allocation tree
+				 */
+				ret = btrfs_alloc_logged_extent(trans, root,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						key->offset, &ins);
+				BUG_ON(ret);
+			}
+		}
+	}
 no_copy:
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
@@ -551,45 +594,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 start, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
+	/* insert the extent */
+	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		struct btrfs_key ins;
-
-		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
-		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
-		ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-		/* insert the extent pointer in the file */
-		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
 
-		/*
-		 * is this extent already allocated in the extent
-		 * allocation tree?  If so, just add a reference
-		 */
-		ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
-		btrfs_release_path(root, path);
-		if (ret == 0) {
-			ret = btrfs_inc_extent_ref(trans, root,
-				   ins.objectid, ins.offset,
-				   root->root_key.objectid,
-				   trans->transid, key->objectid, start);
-		} else {
-			/*
-			 * insert the extent pointer in the extent
-			 * allocation tree
-			 */
-			ret = btrfs_alloc_logged_extent(trans, root,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						start, &ins);
-			BUG_ON(ret);
-		}
-	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		/* inline extents are easy, we just overwrite them */
-		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
-	}
 	/* btrfs_drop_extents changes i_blocks, update it here */
 	inode->i_blocks += (extent_end - start) >> 9;
 	btrfs_update_inode(trans, root, inode);
@@ -1806,16 +1814,14 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 			WARN_ON(*level == 0);
 			return 0;
 		} else {
-			if (path->nodes[*level] == root->node) {
-				root_owner = root->root_key.objectid;
-				root_gen =
-				   btrfs_header_generation(path->nodes[*level]);
-			} else {
-				struct extent_buffer *node;
-				node = path->nodes[*level + 1];
-				root_owner = btrfs_header_owner(node);
-				root_gen = btrfs_header_generation(node);
-			}
+			struct extent_buffer *parent;
+			if (path->nodes[*level] == root->node)
+				parent = path->nodes[*level];
+			else
+				parent = path->nodes[*level + 1];
+
+			root_owner = btrfs_header_owner(parent);
+			root_gen = btrfs_header_generation(parent);
 			wc->process_func(root, path->nodes[*level], wc,
 				 btrfs_header_generation(path->nodes[*level]));
 			if (wc->free) {
@@ -2525,8 +2531,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
 						   ds, dl,
+						   dst_path->nodes[0]->start,
 						   BTRFS_TREE_LOG_OBJECTID,
-						   0, ins_keys[i].objectid,
+						   trans->transid,
+						   ins_keys[i].objectid,
 						   ins_keys[i].offset);
 					BUG_ON(ret);
 				}
-- 
cgit v1.2.3


From 34353029534a08e41cfb8be647d734b9ce9ebff8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 23 Sep 2008 20:19:49 -0400
Subject: Btrfs: Fix race against disk_i_size updates

The code to update the on disk i_size happens before the
ordered_extent record is removed.  So, it is possible for multiple
ordered_extent completion routines to run at the same time, and to
find each other in the ordered tree.

The end result is they both decide not to update disk_i_size, leaving
it too small.  This temporary fix just puts the updates inside
the extent_mutex.  A real solution would be stronger ordering of
disk_i_size updates against removing the ordered extent from the tree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2e7d82ec5d1..adb169d739c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -608,9 +608,11 @@ nocow:
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_ordered_update_i_size(inode, ordered_extent);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_remove_ordered_extent(inode, ordered_extent);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
-- 
cgit v1.2.3


From b4f6c45dfbf84f47c21f73f6370ad1292b0627fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:05:53 -0400
Subject: Update Btrfs files for in-kernel usage

btrfs had magic to put the changeset id into a printk on module load.
This removes that from the Makefile and hardcodes the printk to print
"Btrfs".

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile  | 7 +------
 fs/btrfs/version.h | 4 ++++
 2 files changed, 5 insertions(+), 6 deletions(-)
 create mode 100644 fs/btrfs/version.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index eb36ae981bd..d5c28557fba 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -13,17 +13,12 @@ else
 # Normal Makefile
 
 KERNELDIR := /lib/modules/`uname -r`/build
-all: version
+all:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
 
-version:
-	bash version.sh
-
 modules_install:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
 clean:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` clean
 
-tester:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` tree-defrag.o transaction.o sysfs.o super.o root-tree.o inode-map.o inode-item.o inode.o file-item.o file.o extent_map.o disk-io.o ctree.o dir-item.o extent-tree.o
 endif
diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h
new file mode 100644
index 00000000000..9bf3946d5ef
--- /dev/null
+++ b/fs/btrfs/version.h
@@ -0,0 +1,4 @@
+#ifndef __BTRFS_VERSION_H
+#define __BTRFS_VERSION_H
+#define BTRFS_BUILD_VERSION "Btrfs"
+#endif
-- 
cgit v1.2.3


From 60582d1e933649f14f46f816bb75049058679e01 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 25 Sep 2008 12:25:16 -0400
Subject: Add Btrfs to fs/Kconfig and fs/Makefile

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/Kconfig  | 17 +++++++++++++++++
 fs/Makefile |  1 +
 2 files changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index abccb5dab9a..18f5a85b47c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -497,6 +497,23 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	help
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
+
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED.  You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
+
 endif # BLOCK
 
 config DNOTIFY
diff --git a/fs/Makefile b/fs/Makefile
index a1482a5eff1..41fcc858c97 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -121,4 +121,5 @@ obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
+obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
-- 
cgit v1.2.3


From 9b49c9b9f93e148815f2544d0c91f43b6d72eea9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:23:25 -0400
Subject: Btrfs: Fix allocation completions in tree log replay

After a crash, the tree log code uses btrfs_alloc_logged_extent to
record allocations of data extents that it finds in the log tree.  These
come in basically random order, which does not fit how
btrfs_remove_free_space() expects to be called.

btrfs_remove_free_space was changed to support recording an extent
allocation in the middle of a region of free space.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 01c26e8ae55..f1d9b6bc23b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -306,11 +306,45 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 
 		ret = link_free_space(block_group, info);
 		BUG_ON(ret);
+	} else if (info && info->offset < offset &&
+		   info->offset + info->bytes >= offset + bytes) {
+		u64 old_start = info->offset;
+		/*
+		 * we're freeing space in the middle of the info,
+		 * this can happen during tree log replay
+		 *
+		 * first unlink the old info and then
+		 * insert it again after the hole we're creating
+		 */
+		unlink_free_space(block_group, info);
+		if (offset + bytes < info->offset + info->bytes) {
+			u64 old_end = info->offset + info->bytes;
+
+			info->offset = offset + bytes;
+			info->bytes = old_end - info->offset;
+			ret = link_free_space(block_group, info);
+			BUG_ON(ret);
+		} else {
+			/* the hole we're creating ends at the end
+			 * of the info struct, just free the info
+			 */
+			kfree(info);
+		}
+
+		/* step two, insert a new info struct to cover anything
+		 * before the hole
+		 */
+		spin_unlock(&block_group->lock);
+		ret = btrfs_add_free_space(block_group, old_start,
+					   offset - old_start);
+		BUG_ON(ret);
+		goto out_nolock;
 	} else {
 		WARN_ON(1);
 	}
 out:
 	spin_unlock(&block_group->lock);
+out_nolock:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2b1f55b0f0d0d1a66470ef4ea2696cd5dd741a12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:48:04 -0400
Subject: Remove Btrfs compat code for older kernels

Btrfs had compatibility code for kernels back to 2.6.18.  This code has
been removed, and will be maintained in a separate backport
git tree from now on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |   6 ---
 fs/btrfs/compat.h       |  40 +-------------
 fs/btrfs/crc32c.h       |   6 ---
 fs/btrfs/ctree.h        |   7 +--
 fs/btrfs/disk-io.c      |  28 ----------
 fs/btrfs/export.c       |   6 ---
 fs/btrfs/extent_io.c    |  66 ++---------------------
 fs/btrfs/file.c         |  24 +--------
 fs/btrfs/inode.c        | 135 +-----------------------------------------------
 fs/btrfs/super.c        |   7 ---
 fs/btrfs/sysfs.c        |  33 ------------
 fs/btrfs/volumes.c      |  20 -------
 12 files changed, 10 insertions(+), 368 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2ee30174019..4e780b279de 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,13 +20,7 @@
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 # include <linux/freezer.h>
-#else
-# include <linux/sched.h>
-#endif
-
 #include "async-thread.h"
 
 /*
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index b0ed1887d9b..cd6598b169d 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -1,9 +1,8 @@
 #ifndef _COMPAT_H_
 #define _COMPAT_H_
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,26)
-#define trylock_page(page) (!TestSetPageLocked(page))
-#endif
+#define btrfs_drop_nlink(inode) drop_nlink(inode)
+#define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
 static inline struct dentry *d_obtain_alias(struct inode *inode)
@@ -22,39 +21,4 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static inline void btrfs_drop_nlink(struct inode *inode)
-{
-	inode->i_nlink--;
-}
-
-static inline void btrfs_inc_nlink(struct inode *inode)
-{
-	inode->i_nlink++;
-}
-#else
-# define btrfs_drop_nlink(inode) drop_nlink(inode)
-# define btrfs_inc_nlink(inode)	inc_nlink(inode)
-#endif
-
-/*
- * Even if AppArmor isn't enabled, it still has different prototypes.
- * Add more distro/version pairs here to declare which has AppArmor applied.
- */
-#if defined(CONFIG_SUSE_KERNEL)
-# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
-# define REMOVE_SUID_PATH 1
-# endif
-#endif
-
-/*
- * catch any other distros that have patched in apparmor.  This isn't
- * 100% reliable because it won't catch people that hand compile their
- * own distro kernels without apparmor compiled in.  But, it is better
- * than nothing.
- */
-#ifdef CONFIG_SECURITY_APPARMOR
-# define REMOVE_SUID_PATH 1
-#endif
-
 #endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index bf6c12e8573..4f0fefed132 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -96,13 +96,7 @@ static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
  * We must workaround older implementations of crc32c_le()
  * found on older kernel versions.
  */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-#define btrfs_crc32c(seed, data, length) \
-	__cpu_to_le32( __btrfs_crc32c( __le32_to_cpu(seed), \
-		                      (unsigned char const *)data, length) )
-#else
 #define btrfs_crc32c(seed, data, length) \
 	__btrfs_crc32c(seed, (unsigned char const *)data, length)
 #endif
-#endif
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 138c157bbc4..3b3c1ca50c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1472,12 +1472,9 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
 	((unsigned long)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset_nr(leaf, slot)))
 
-static inline struct dentry *fdentry(struct file *file) {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	return file->f_dentry;
-#else
+static inline struct dentry *fdentry(struct file *file)
+{
 	return file->f_path.dentry;
-#endif
 }
 
 /* extent-tree.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d35ca6a3f51..dffb8dabd53 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,11 +26,7 @@
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 # include <linux/freezer.h>
-#else
-# include <linux/sched.h>
-#endif
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -373,21 +369,11 @@ out:
 	return ret;
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_workqueue_bio(struct bio *bio, int err)
-#else
-static int end_workqueue_bio(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	fs_info = end_io_wq->info;
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
@@ -397,10 +383,6 @@ static int end_workqueue_bio(struct bio *bio,
 				   &end_io_wq->work);
 	else
 		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1161,9 +1143,7 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
-#endif
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
@@ -1242,11 +1222,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	bio_endio(bio, bio->bi_size, error);
-#else
 	bio_endio(bio, error);
-#endif
 }
 
 static int cleaner_kthread(void *arg)
@@ -1673,9 +1649,7 @@ fail:
 
 	kfree(extent_root);
 	kfree(tree_root);
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
-#endif
 	kfree(fs_info);
 	return ERR_PTR(err);
 }
@@ -1936,9 +1910,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
-#endif
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 2b357a6d240..48b82cd7583 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,12 +7,6 @@
 #include "export.h"
 #include "compat.h"
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-#define FILEID_BTRFS_WITHOUT_PARENT		0x4d
-#define FILEID_BTRFS_WITH_PARENT 		0x4e
-#define FILEID_BTRFS_WITH_PARENT_ROOT 		0x4f
-#endif
-
 #define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
 #define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
 #define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 58ad25838a4..e3a25be5c66 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1397,12 +1397,7 @@ static int check_page_writeback(struct extent_io_tree *tree,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_writepage(struct bio *bio, int err)
-#else
-static int end_bio_extent_writepage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1412,10 +1407,6 @@ static int end_bio_extent_writepage(struct bio *bio,
 	int whole_page;
 	int ret;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1461,10 +1452,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 		else
 			check_page_writeback(tree, page);
 	} while (bvec >= bio->bi_io_vec);
+
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 /*
@@ -1478,12 +1467,7 @@ static int end_bio_extent_writepage(struct bio *bio,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_readpage(struct bio *bio, int err)
-#else
-static int end_bio_extent_readpage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1493,11 +1477,6 @@ static int end_bio_extent_readpage(struct bio *bio,
 	int whole_page;
 	int ret;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1556,9 +1535,6 @@ static int end_bio_extent_readpage(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 /*
@@ -1566,12 +1542,7 @@ static int end_bio_extent_readpage(struct bio *bio,
  * the structs in the extent tree when done, and set the uptodate bits
  * as appropriate.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_preparewrite(struct bio *bio, int err)
-#else
-static int end_bio_extent_preparewrite(struct bio *bio,
-				       unsigned int bytes_done, int err)
-#endif
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1579,11 +1550,6 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	u64 start;
 	u64 end;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1607,9 +1573,6 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 static struct bio *
@@ -2079,12 +2042,6 @@ done:
 	return 0;
 }
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-/* Taken directly from 2.6.23 with a mod for a lockpage hook */
-typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
-                                void *data);
-#endif
-
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -2201,10 +2158,9 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
-#endif
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
@@ -2560,18 +2516,10 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	 * by increasing the reference count.  So we know the page must
 	 * be in the radix tree.
 	 */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	rcu_read_lock();
-#else
-	read_lock_irq(&mapping->tree_lock);
-#endif
 	p = radix_tree_lookup(&mapping->page_tree, i);
-
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	rcu_read_unlock();
-#else
-	read_unlock_irq(&mapping->tree_lock);
-#endif
+
 	return p;
 }
 
@@ -2773,21 +2721,13 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			}
 		}
 		clear_page_dirty_for_io(page);
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 		spin_lock_irq(&page->mapping->tree_lock);
-#else
-		read_lock_irq(&page->mapping->tree_lock);
-#endif
 		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&page->mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 		spin_unlock_irq(&page->mapping->tree_lock);
-#else
-		read_unlock_irq(&page->mapping->tree_lock);
-#endif
 		unlock_page(page);
 	}
 	return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 48a702d41c8..8856570a0eb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -871,15 +871,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out_nolock;
 	if (count == 0)
 		goto out_nolock;
-#ifdef REMOVE_SUID_PATH
-	err = remove_suid(&file->f_path);
-#else
-# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+
 	err = file_remove_suid(file);
-# else
-	err = remove_suid(fdentry(file));
-# endif
-#endif
 	if (err)
 		goto out_nolock;
 	file_update_time(file);
@@ -1003,17 +996,10 @@ out_nolock:
 			btrfs_commit_transaction(trans, root);
 		}
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-		do_sync_file_range(file, start_pos,
-				      start_pos + num_written - 1,
-				      SYNC_FILE_RANGE_WRITE |
-				      SYNC_FILE_RANGE_WAIT_AFTER);
-#else
 		do_sync_mapping_range(inode->i_mapping, start_pos,
 				      start_pos + num_written - 1,
 				      SYNC_FILE_RANGE_WRITE |
 				      SYNC_FILE_RANGE_WAIT_AFTER);
-#endif
 		invalidate_mapping_pages(inode->i_mapping,
 		      start_pos >> PAGE_CACHE_SHIFT,
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1097,12 +1083,7 @@ out:
 }
 
 static struct vm_operations_struct btrfs_file_vm_ops = {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-	.nopage         = filemap_nopage,
-	.populate       = filemap_populate,
-#else
 	.fault		= filemap_fault,
-#endif
 	.page_mkwrite	= btrfs_page_mkwrite,
 };
 
@@ -1118,9 +1099,6 @@ struct file_operations btrfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	.sendfile	= generic_file_sendfile,
-#endif
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index adb169d739c..48a3dc03080 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2073,104 +2073,6 @@ err:
 	return ret;
 }
 
-/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd
-   will call the file system's ->lookup() method from within its
-   filldir callback, which in turn was called from the file system's
-   ->readdir() method. And will deadlock for many file systems. */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-
-struct nfshack_dirent {
-	u64		ino;
-	loff_t		offset;
-	int		namlen;
-	unsigned int	d_type;
-	char		name[];
-};
-
-struct nfshack_readdir {
-	char		*dirent;
-	size_t		used;
-	int		full;
-};
-
-
-
-static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
-			      loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct nfshack_readdir *buf = __buf;
-	struct nfshack_dirent *de = (void *)(buf->dirent + buf->used);
-	unsigned int reclen;
-
-	reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > PAGE_SIZE) {
-		buf->full = 1;
-		return -EINVAL;
-	}
-
-	de->namlen = namlen;
-	de->offset = offset;
-	de->ino = ino;
-	de->d_type = d_type;
-	memcpy(de->name, name, namlen);
-	buf->used += reclen;
-
-	return 0;
-}
-
-static int btrfs_nfshack_readdir(struct file *file, void *dirent,
-				 filldir_t filldir)
-{
-	struct nfshack_readdir buf;
-	struct nfshack_dirent *de;
-	int err;
-	int size;
-	loff_t offset;
-
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
-	if (!buf.dirent)
-		return -ENOMEM;
-
-	offset = file->f_pos;
-
-	do {
-		unsigned int reclen;
-
-		buf.used = 0;
-		buf.full = 0;
-		err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
-		if (err)
-			break;
-
-		size = buf.used;
-
-		if (!size)
-			break;
-
-		de = (struct nfshack_dirent *)buf.dirent;
-		while (size > 0) {
-			offset = de->offset;
-
-			if (filldir(dirent, de->name, de->namlen, de->offset,
-				    de->ino, de->d_type))
-				goto done;
-			offset = file->f_pos;
-
-			reclen = ALIGN(sizeof(*de) + de->namlen,
-				       sizeof(u64));
-			size -= reclen;
-			de = (struct nfshack_dirent *)((char *)de + reclen);
-		}
-	} while (buf.full);
-
- done:
-	free_page((unsigned long)buf.dirent);
-	file->f_pos = offset;
-
-	return err;
-}
-#endif
-
 int btrfs_write_inode(struct inode *inode, int wait)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3311,13 +3213,8 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 {
 	pgoff_t req_size = last_index - offset + 1;
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-	offset = page_cache_readahead(mapping, ra, file, offset, req_size);
-	return offset;
-#else
 	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
 	return offset + req_size;
-#endif
 }
 
 struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -3373,14 +3270,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 static void init_once(void *foo)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void init_once(struct kmem_cache * cachep, void *foo)
-#else
-static void init_once(void * foo, struct kmem_cache * cachep,
-		      unsigned long flags)
-#endif
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
 
@@ -3403,22 +3293,10 @@ void btrfs_destroy_cachep(void)
 
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 				       unsigned long extra_flags,
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
-				       void (*ctor)(void *)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-				       void (*ctor)(struct kmem_cache *, void *)
-#else
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long)
-#endif
-				     )
+				       void (*ctor)(void *))
 {
 	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
-				 SLAB_MEM_SPREAD | extra_flags), ctor
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-				 ,NULL
-#endif
-				);
+				 SLAB_MEM_SPREAD | extra_flags), ctor);
 }
 
 int btrfs_init_cachep(void)
@@ -3666,12 +3544,7 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 static int btrfs_permission(struct inode *inode, int mask)
-#else
-static int btrfs_permission(struct inode *inode, int mask,
-			    struct nameidata *nd)
-#endif
 {
 	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
@@ -3702,11 +3575,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-	.readdir	= btrfs_nfshack_readdir,
-#else /* NFSd readdir/lookup deadlock is fixed */
 	.readdir	= btrfs_real_readdir,
-#endif
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7b3eac7ac6..8399d6d05d6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -349,10 +349,7 @@ static int btrfs_fill_super(struct super_block * sb,
 
 	sb->s_root = root_dentry;
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 	save_mount_options(sb, data);
-#endif
-
 	return 0;
 
 fail_close:
@@ -566,11 +563,7 @@ static struct super_operations btrfs_super_ops = {
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
-	.read_inode     = btrfs_read_locked_inode,
-#else
 	.show_options	= generic_show_options,
-#endif
 	.write_inode	= btrfs_write_inode,
 	.dirty_inode	= btrfs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 61af5d8446e..300076e6676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,7 +28,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
@@ -267,35 +266,3 @@ void btrfs_exit_sysfs(void)
 	kset_unregister(btrfs_kset);
 }
 
-#else
-
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
-{
-	return 0;
-}
-
-int btrfs_sysfs_add_root(struct btrfs_root *root)
-{
-	return 0;
-}
-
-void btrfs_sysfs_del_root(struct btrfs_root *root)
-{
-	return;
-}
-
-void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
-{
-	return;
-}
-
-int btrfs_init_sysfs(void)
-{
-	return 0;
-}
-
-void btrfs_exit_sysfs(void)
-{
-	return;
-}
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9e5c2d82dd..ddf89626498 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2080,20 +2080,11 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 }
 
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
-#else
-static int end_bio_multi_stripe(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
 	int is_orig_bio = 0;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
 	if (err)
 		atomic_inc(&multi->error);
 
@@ -2122,17 +2113,10 @@ static int end_bio_multi_stripe(struct bio *bio,
 		}
 		kfree(multi);
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, err);
-#else
 		bio_endio(bio, err);
-#endif
 	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 struct async_sched {
@@ -2248,11 +2232,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-			bio_endio(bio, bio->bi_size, -EIO);
-#else
 			bio_endio(bio, -EIO);
-#endif
 		}
 		dev_nr++;
 	}
-- 
cgit v1.2.3


From 4434c33c7f233a7ee5dcaf357fbf795818ea8861 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 12:50:10 -0400
Subject: Btrfs: fix sleep with spinlock held during unmount

The code to free block groups needs to drop the space info spin lock
before calling btrfs_remove_free_space_cache (which can schedule).

This is safe because at unmount time, nobody else is going to play
with the block groups.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5258923d621..fe4e11b31a4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3157,9 +3157,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
 				       cache_node);
 
+		spin_unlock(&info->block_group_cache_lock);
 		btrfs_remove_free_space_cache(block_group);
+		spin_lock(&info->block_group_cache_lock);
+
 		rb_erase(&block_group->cache_node,
 			 &info->block_group_cache_tree);
+
 		spin_lock(&block_group->space_info->lock);
 		list_del(&block_group->list);
 		spin_unlock(&block_group->space_info->lock);
-- 
cgit v1.2.3


From 24ab9cd85c11bccacbd0cce7f8e1aebd4930404c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 14:51:30 -0400
Subject: Btrfs: Raise thresholds for metadata writeback

Btrfs metadata writeback is fairly expensive.  Once a tree block is written
it must be cowed before it can be changed again.  The btree writepages
code has a threshold based on a count of dirty btree bytes which is
updated as IO is sent out.

This changes btree_writepages to skip the writeout if there are less
than 32MB of dirty bytes from the btrees, improving performance
across many workloads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dffb8dabd53..71e81f3a765 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -556,7 +556,7 @@ static int btree_writepages(struct address_space *mapping,
 	if (wbc->sync_mode == WB_SYNC_NONE) {
 		u64 num_dirty;
 		u64 start = 0;
-		unsigned long thresh = 8 * 1024 * 1024;
+		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
@@ -690,7 +690,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
 	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
-				      buf->start + buf->len - 1, WB_SYNC_NONE);
+				      buf->start + buf->len - 1, WB_SYNC_ALL);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
-- 
cgit v1.2.3


From e8569813849b5da394a195c7e76b4faa452b12d1 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:05:48 -0400
Subject: Btrfs: allocator fixes for space balancing update

* Reserved extent accounting:  reserved extents have been
allocated in the rbtrees that track free space but have not
been allocated on disk.  They were never properly accounted for
in the past, making it hard to know how much space was really free.

* btrfs_find_block_group used to return NULL for block groups that
had been removed by the space balancing code.  This made it hard
to account for space during the final stages of a balance run.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c | 136 +++++++++++++++++++++++--------------------------
 2 files changed, 67 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b3c1ca50c5..c683aaa925f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -498,6 +498,7 @@ struct btrfs_space_info {
 	u64 total_bytes;
 	u64 bytes_used;
 	u64 bytes_pinned;
+	u64 bytes_reserved;
 	int full;
 	int force_alloc;
 	struct list_head list;
@@ -519,6 +520,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
 	u64 pinned;
+	u64 reserved;
 	u64 flags;
 	int cached;
 	int ro;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe4e11b31a4..3e2f969de42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -325,12 +325,9 @@ static int noinline find_free_space(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	struct btrfs_free_space *info = NULL;
 	u64 last;
-	u64 total_fs_bytes;
 	u64 search_start = *start_ret;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
-	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-
 	if (!cache)
 		goto out;
 
@@ -354,7 +351,7 @@ new_group:
 	last = cache->key.objectid + cache->key.offset;
 
 	cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	if (!cache || cache->key.objectid >= total_fs_bytes)
+	if (!cache)
 		goto out;
 
 	*cache_ret = cache;
@@ -385,7 +382,6 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 			return found;
 	}
 	return NULL;
-
 }
 
 static struct btrfs_block_group_cache *
@@ -396,7 +392,6 @@ __btrfs_find_block_group(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_space_info *sinfo;
 	u64 used;
 	u64 last = 0;
 	u64 free_check;
@@ -413,7 +408,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 		if (shint && block_group_bits(shint, data) && !shint->ro) {
 			spin_lock(&shint->lock);
 			used = btrfs_block_group_used(&shint->item);
-			if (used + shint->pinned <
+			if (used + shint->pinned + shint->reserved <
 			    div_factor(shint->key.offset, factor)) {
 				spin_unlock(&shint->lock);
 				return shint;
@@ -424,7 +419,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 	if (hint && !hint->ro && block_group_bits(hint, data)) {
 		spin_lock(&hint->lock);
 		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned <
+		if (used + hint->pinned + hint->reserved <
 		    div_factor(hint->key.offset, factor)) {
 			spin_unlock(&hint->lock);
 			return hint;
@@ -437,27 +432,9 @@ __btrfs_find_block_group(struct btrfs_root *root,
 		else
 			last = search_start;
 	}
-	sinfo = __find_space_info(root->fs_info, data);
-	if (!sinfo)
-		goto found;
 again:
-	while(1) {
-		struct list_head *l;
-
-		cache = NULL;
-
-		spin_lock(&sinfo->lock);
-		list_for_each(l, &sinfo->block_groups) {
-			struct btrfs_block_group_cache *entry;
-			entry = list_entry(l, struct btrfs_block_group_cache,
-					   list);
-			if ((entry->key.objectid >= last) &&
-			    (!cache || (entry->key.objectid <
-					cache->key.objectid)))
-				cache = entry;
-		}
-		spin_unlock(&sinfo->lock);
-
+	while (1) {
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 		if (!cache)
 			break;
 
@@ -467,7 +444,8 @@ again:
 
 		if (!cache->ro && block_group_bits(cache, data)) {
 			free_check = div_factor(cache->key.offset, factor);
-			if (used + cache->pinned < free_check) {
+			if (used + cache->pinned + cache->reserved <
+			    free_check) {
 				found_group = cache;
 				spin_unlock(&cache->lock);
 				goto found;
@@ -1414,6 +1392,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		if (!cache)
 			break;
 
+		cache->dirty = 0;
 		last += cache->key.offset;
 
 		err = write_one_cache_group(trans, root,
@@ -1427,8 +1406,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			werr = err;
 			continue;
 		}
-
-		cache->dirty = 0;
 	}
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
@@ -1460,6 +1437,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
 	found->bytes_pinned = 0;
+	found->bytes_reserved = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -1539,8 +1517,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
-	   (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
-	    thresh)
+	   (space_info->bytes_used + space_info->bytes_pinned +
+	    space_info->bytes_reserved + alloc_bytes) < thresh)
 		goto out;
 
 	mutex_lock(&extent_root->fs_info->chunk_mutex);
@@ -1621,7 +1599,6 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 	return cache->key.objectid;
 }
 
-
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1639,29 +1616,20 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	}
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		if (!cache) {
-			u64 first = first_logical_byte(root, bytenr);
-			WARN_ON(first < bytenr);
-			len = min(first - bytenr, num);
-		} else {
-			len = min(num, cache->key.offset -
-				  (bytenr - cache->key.objectid));
-		}
+		BUG_ON(!cache);
+		len = min(num, cache->key.offset -
+			  (bytenr - cache->key.objectid));
 		if (pin) {
-			if (cache) {
-				spin_lock(&cache->lock);
-				cache->pinned += len;
-				cache->space_info->bytes_pinned += len;
-				spin_unlock(&cache->lock);
-			}
+			spin_lock(&cache->lock);
+			cache->pinned += len;
+			cache->space_info->bytes_pinned += len;
+			spin_unlock(&cache->lock);
 			fs_info->total_pinned += len;
 		} else {
-			if (cache) {
-				spin_lock(&cache->lock);
-				cache->pinned -= len;
-				cache->space_info->bytes_pinned -= len;
-				spin_unlock(&cache->lock);
-			}
+			spin_lock(&cache->lock);
+			cache->pinned -= len;
+			cache->space_info->bytes_pinned -= len;
+			spin_unlock(&cache->lock);
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1670,6 +1638,36 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	return 0;
 }
 
+static int update_reserved_extents(struct btrfs_root *root,
+				   u64 bytenr, u64 num, int reserve)
+{
+	u64 len;
+	struct btrfs_block_group_cache *cache;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	while (num > 0) {
+		cache = btrfs_lookup_block_group(fs_info, bytenr);
+		BUG_ON(!cache);
+		len = min(num, cache->key.offset -
+			  (bytenr - cache->key.objectid));
+		if (reserve) {
+			spin_lock(&cache->lock);
+			cache->reserved += len;
+			cache->space_info->bytes_reserved += len;
+			spin_unlock(&cache->lock);
+		} else {
+			spin_lock(&cache->lock);
+			cache->reserved -= len;
+			cache->space_info->bytes_reserved -= len;
+			spin_unlock(&cache->lock);
+		}
+		bytenr += len;
+		num -= len;
+	}
+	return 0;
+}
+
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 {
 	u64 last = 0;
@@ -2126,6 +2124,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
 			BUG_ON(!cache);
 			btrfs_add_free_space(cache, bytenr, num_bytes);
+			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
 		pin = 1;
@@ -2225,14 +2224,11 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	search_start = max(search_start, first_logical_byte(root, 0));
 	orig_search_start = search_start;
 
-	if (search_end == (u64)-1)
-		search_end = btrfs_super_total_bytes(&info->super_copy);
-
 	search_start = max(search_start, hint_byte);
 	total_needed += empty_size;
 
 new_group:
-	block_group = btrfs_lookup_block_group(info, search_start);
+	block_group = btrfs_lookup_first_block_group(info, search_start);
 
 	/*
 	 * Ok this looks a little tricky, buts its really simple.  First if we
@@ -2257,12 +2253,8 @@ new_group:
 			ret = do_chunk_alloc(trans, root,
 					     num_bytes + 2 * 1024 * 1024,
 					     data, 1);
-			if (ret < 0) {
-				struct btrfs_space_info *info;
-
-				info = __find_space_info(root->fs_info, data);
+			if (ret < 0)
 				goto error;
-			}
 			BUG_ON(ret);
 			chunk_alloc_done = 1;
 			search_start = orig_search_start;
@@ -2378,22 +2370,24 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	struct list_head *l;
 
 	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
-	       info->total_bytes - info->bytes_used - info->bytes_pinned,
-	       (info->full) ? "" : "not ");
+	       info->total_bytes - info->bytes_used - info->bytes_pinned -
+	       info->bytes_reserved, (info->full) ? "" : "not ");
 
 	spin_lock(&info->lock);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
-		       "%Lu pinned\n",
+		       "%Lu pinned %Lu reserved\n",
 		       cache->key.objectid, cache->key.offset,
-		       btrfs_block_group_used(&cache->item), cache->pinned);
+		       btrfs_block_group_used(&cache->item),
+		       cache->pinned, cache->reserved);
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
 	spin_unlock(&info->lock);
 }
+
 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -2500,6 +2494,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2625,6 +2620,7 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
 					    root_objectid, ref_generation,
 					    owner, owner_offset, ins);
+	update_reserved_extents(root, ins->objectid, ins->offset, 0);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2685,6 +2681,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 					owner_objectid, owner_offset, ins);
 		BUG_ON(ret);
 
+	} else {
+		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
 	maybe_unlock_mutex(root);
 	return ret;
@@ -3974,10 +3972,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
 		BUG_ON(ret);
-
-		if (key.objectid >=
-		    btrfs_super_total_bytes(&info->super_copy))
-			break;
 	}
 	ret = 0;
 error:
-- 
cgit v1.2.3


From e465768938f95388723b0fd3c50a0ae48173edb9 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:04:53 -0400
Subject: Btrfs: Add shared reference cache

Btrfs has a cache of reference counts in leaves, allowing it to
avoid reading tree leaves while deleting snapshots.  To reduce
contention with multiple subvolumes, this cache is private to each
subvolume.

This patch adds shared reference cache support. The new space
balancing code plays with multiple subvols at the same time, so
the old per-subvol reference cache is not well suited.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 19 +++++++++++------
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 17 +++++++++++----
 fs/btrfs/ref-cache.c   | 58 +++++++++++++++++++++++++++++++++-----------------
 fs/btrfs/ref-cache.h   |  7 ++++--
 fs/btrfs/transaction.c |  2 +-
 6 files changed, 71 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c683aaa925f..b9f9f815ed0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -81,6 +81,10 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_LOG_OBJECTID -6ULL
 #define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
 
+/* for space balancing */
+#define BTRFS_TREE_RELOC_OBJECTID -8ULL
+#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -539,6 +543,12 @@ struct btrfs_block_group_cache {
 	struct list_head list;
 };
 
+struct btrfs_leaf_ref_tree {
+	struct rb_root root;
+	struct list_head list;
+	spinlock_t lock;
+};
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
@@ -637,6 +647,8 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
+	struct btrfs_leaf_ref_tree shared_ref_tree;
+
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
@@ -670,13 +682,6 @@ struct btrfs_fs_info {
 	void *bdev_holder;
 };
 
-struct btrfs_leaf_ref_tree {
-	struct rb_root root;
-	struct btrfs_leaf_ref *last;
-	struct list_head list;
-	spinlock_t lock;
-};
-
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 71e81f3a765..8969fee2331 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1406,6 +1406,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
+	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
+
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3e2f969de42..9ab099bc01a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1091,16 +1091,26 @@ out:
 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct extent_buffer *buf, u32 nr_extents)
 {
-	u32 nritems;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
+	u64 root_gen;
+	u32 nritems;
 	int i;
 	int level;
 	int ret = 0;
+	int shared = 0;
 
 	if (!root->ref_cows)
 		return 0;
 
+	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+		shared = 0;
+		root_gen = root->root_key.offset;
+	} else {
+		shared = 1;
+		root_gen = trans->transid - 1;
+	}
+
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
 
@@ -1114,7 +1124,7 @@ int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			goto out;
 		}
 
-		ref->root_gen = root->root_key.offset;
+		ref->root_gen = root_gen;
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
@@ -1143,8 +1153,7 @@ int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			info++;
 		}
 
-		BUG_ON(!root->ref_tree);
-		ret = btrfs_add_leaf_ref(root, ref);
+		ret = btrfs_add_leaf_ref(root, ref, shared);
 		WARN_ON(ret);
 		btrfs_free_leaf_ref(root, ref);
 	}
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 272b9890c98..c5809988c87 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -78,7 +78,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 	}
 
 	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
-	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
 	return NULL;
@@ -103,23 +102,29 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 	return NULL;
 }
 
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen)
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+			   int shared)
 {
 	struct btrfs_leaf_ref *ref = NULL;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
+	if (shared)
+		tree = &root->fs_info->shared_ref_tree;
 	if (!tree)
 		return 0;
 
 	spin_lock(&tree->lock);
 	while(!list_empty(&tree->list)) {
 		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
-		BUG_ON(!ref->in_tree);
+		BUG_ON(ref->tree != tree);
 		if (ref->root_gen > max_root_gen)
 			break;
+		if (!xchg(&ref->in_tree, 0)) {
+			cond_resched_lock(&tree->lock);
+			continue;
+		}
 
 		rb_erase(&ref->rb_node, &tree->root);
-		ref->in_tree = 0;
 		list_del_init(&ref->list);
 
 		spin_unlock(&tree->lock);
@@ -137,32 +142,43 @@ struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 	struct rb_node *rb;
 	struct btrfs_leaf_ref *ref = NULL;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-
-	if (!tree)
-		return NULL;
-
-	spin_lock(&tree->lock);
-	rb = tree_search(&tree->root, bytenr);
-	if (rb)
-		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
-	if (ref)
-		atomic_inc(&ref->usage);
-	spin_unlock(&tree->lock);
-	return ref;
+again:
+	if (tree) {
+		spin_lock(&tree->lock);
+		rb = tree_search(&tree->root, bytenr);
+		if (rb)
+			ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+		if (ref)
+			atomic_inc(&ref->usage);
+		spin_unlock(&tree->lock);
+		if (ref)
+			return ref;
+	}
+	if (tree != &root->fs_info->shared_ref_tree) {
+		tree = &root->fs_info->shared_ref_tree;
+		goto again;
+	}
+	return NULL;
 }
 
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+		       int shared)
 {
 	int ret = 0;
 	struct rb_node *rb;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
+	if (shared)
+		tree = &root->fs_info->shared_ref_tree;
+
 	spin_lock(&tree->lock);
 	rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
 	if (rb) {
 		ret = -EEXIST;
 	} else {
 		atomic_inc(&ref->usage);
+		ref->tree = tree;
+		ref->in_tree = 1;
 		list_add_tail(&ref->list, &tree->list);
 	}
 	spin_unlock(&tree->lock);
@@ -171,13 +187,15 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
-	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
+	struct btrfs_leaf_ref_tree *tree;
+
+	if (!xchg(&ref->in_tree, 0))
+		return 0;
 
-	BUG_ON(!ref->in_tree);
+	tree = ref->tree;
 	spin_lock(&tree->lock);
 
 	rb_erase(&ref->rb_node, &tree->root);
-	ref->in_tree = 0;
 	list_del_init(&ref->list);
 
 	spin_unlock(&tree->lock);
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index c361b321c0c..617564787f5 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -27,6 +27,7 @@ struct btrfs_extent_info {
 
 struct btrfs_leaf_ref {
 	struct rb_node rb_node;
+	struct btrfs_leaf_ref_tree *tree;
 	int in_tree;
 	atomic_t usage;
 
@@ -64,8 +65,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
 void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr);
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen);
+int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
+		       int shared);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
+			   int shared);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 151b00d5259..656baefa525 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -650,7 +650,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		ret = btrfs_remove_leaf_refs(root, max_useless);
+		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
 		BUG_ON(ret);
 
 		free_extent_buffer(dirty->root->node);
-- 
cgit v1.2.3


From 5b21f2ed3f2947b5195b65c9fdbdd9e52904cc03 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:05:38 -0400
Subject: Btrfs: extent_map and data=ordered fixes for space balancing

* Add an EXTENT_BOUNDARY state bit to keep the writepage code
from merging data extents that are in the process of being
relocated.  This allows us to do accounting for them properly.

* The balancing code relocates data extents independent of the underlying
inode.  The extent_map code was modified to properly account for
things moving around (invalidating extent_map caches in the inode).

* Don't take the drop_mutex in the create_subvol ioctl.  It isn't
required.

* Fix walking of the ordered extent list to avoid races with sys_unlink

* Change the lock ordering rules.  Transaction start goes outside
the drop_mutex.  This allows btrfs_commit_transaction to directly
drop the relocation trees.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c        |  9 +++++----
 fs/btrfs/ctree.h        | 11 ++++++++++-
 fs/btrfs/extent_io.c    | 13 +++++++++----
 fs/btrfs/extent_io.h    |  1 +
 fs/btrfs/file.c         | 31 +++++++++++++++++++++++------
 fs/btrfs/inode-map.c    |  4 ++++
 fs/btrfs/inode.c        | 52 +++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/ioctl.c        |  2 --
 fs/btrfs/ordered-data.c | 26 ++++++++++++-------------
 fs/btrfs/transaction.c  |  8 ++++----
 10 files changed, 108 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 50aea8cb653..f9cd40967d0 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -290,7 +290,6 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
 {
 	u64 search_start;
-	u64 header_trans;
 	int ret;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
@@ -304,9 +303,9 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	header_trans = btrfs_header_generation(buf);
 	spin_lock(&root->fs_info->hash_lock);
-	if (header_trans == trans->transid &&
+	if (btrfs_header_generation(buf) == trans->transid &&
+	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
 		spin_unlock(&root->fs_info->hash_lock);
@@ -1300,6 +1299,7 @@ again:
 			/* is a cow on this block not required */
 			spin_lock(&root->fs_info->hash_lock);
 			if (btrfs_header_generation(b) == trans->transid &&
+			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
 				spin_unlock(&root->fs_info->hash_lock);
 				goto cow_done;
@@ -1396,7 +1396,8 @@ cow_done:
 
 			/* this is only true while dropping a snapshot */
 			if (level == lowest_level) {
-				break;
+				ret = 0;
+				goto done;
 			}
 
 			blocknr = btrfs_node_blockptr(b, slot);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9f9f815ed0..3e62a1b0a1f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1486,6 +1486,9 @@ static inline struct dentry *fdentry(struct file *file)
 
 /* extent-tree.c */
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 bytenr,
+			    u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1812,6 +1815,8 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    struct btrfs_root *root, int wait);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
@@ -1824,13 +1829,17 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode);
+int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
+int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
+void btrfs_orphan_cleanup(struct btrfs_root *root);
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end);
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e3a25be5c66..8bd1b402f3f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -292,7 +292,7 @@ static int merge_state(struct extent_io_tree *tree,
 	struct extent_state *other;
 	struct rb_node *other_node;
 
-	if (state->state & EXTENT_IOBITS)
+	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 		return 0;
 
 	other_node = rb_prev(&state->rb_node);
@@ -1070,7 +1070,8 @@ search_again:
 
 	while(1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (found && state->start != cur_start) {
+		if (found && (state->start != cur_start ||
+			      (state->state & EXTENT_BOUNDARY))) {
 			goto out;
 		}
 		if (!(state->state & EXTENT_DELALLOC)) {
@@ -1078,7 +1079,7 @@ search_again:
 				*end = state->end;
 			goto out;
 		}
-		if (!found) {
+		if (!found && !(state->state & EXTENT_BOUNDARY)) {
 			struct extent_state *prev_state;
 			struct rb_node *prev_node = node;
 			while(1) {
@@ -1088,7 +1089,11 @@ search_again:
 				prev_state = rb_entry(prev_node,
 						      struct extent_state,
 						      rb_node);
-				if (!(prev_state->state & EXTENT_DELALLOC))
+				if ((prev_state->end + 1 != state->start) ||
+				    !(prev_state->state & EXTENT_DELALLOC))
+					break;
+				if ((cur_start - prev_state->start) * 2 >
+				     max_bytes)
 					break;
 				state = prev_state;
 				node = prev_node;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3cb411a5f4d..c9d1908a1ae 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
 #define EXTENT_BUFFER_FILLED (1 << 8)
 #define EXTENT_ORDERED (1 << 9)
 #define EXTENT_ORDERED_METADATA (1 << 10)
+#define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8856570a0eb..1b7e51a9db0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -294,7 +294,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 						       last_pos_in_file,
 						       0, 0, hole_size, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
-					last_pos_in_file + hole_size -1);
+					last_pos_in_file + hole_size - 1, 0);
 			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			btrfs_check_file(root, inode);
 		}
@@ -337,7 +337,7 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		inline_size -= start_pos;
 		err = insert_inline_extent(trans, root, inode, start_pos,
 					   inline_size, pages, 0, num_pages);
-		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
+		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
 		BUG_ON(err);
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
@@ -362,7 +362,8 @@ out_unlock:
 	return err;
 }
 
-int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			    int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
@@ -371,6 +372,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 	u64 len = end - start + 1;
 	int ret;
 	int testend = 1;
+	unsigned long flags;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -389,6 +391,23 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			spin_unlock(&em_tree->lock);
 			break;
 		}
+		flags = em->flags;
+		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+			spin_unlock(&em_tree->lock);
+			if (em->start <= start &&
+			    (!testend || em->start + em->len >= start + len)) {
+				free_extent_map(em);
+				break;
+			}
+			if (start < em->start) {
+				len = em->start - start;
+			} else {
+				len = start + len - (em->start + em->len);
+				start = em->start + em->len;
+			}
+			free_extent_map(em);
+			continue;
+		}
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
 
@@ -398,7 +417,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			split->len = start - em->start;
 			split->block_start = em->block_start;
 			split->bdev = em->bdev;
-			split->flags = em->flags;
+			split->flags = flags;
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
 			free_extent_map(split);
@@ -412,7 +431,7 @@ int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			split->start = start + len;
 			split->len = em->start + em->len - (start + len);
 			split->bdev = em->bdev;
-			split->flags = em->flags;
+			split->flags = flags;
 
 			split->block_start = em->block_start + diff;
 
@@ -541,7 +560,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int recow;
 	int ret;
 
-	btrfs_drop_extent_cache(inode, start, end - 1);
+	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index cd6171c2da4..80038c5ef7c 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -117,10 +117,14 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 					*objectid = last_ino;
 					goto found;
 				}
+			} else if (key.objectid > search_start) {
+				*objectid = search_start;
+				goto found;
 			}
 		}
 		if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
 			break;
+
 		start_found = 1;
 		last_ino = key.objectid + 1;
 		path->slots[0]++;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48a3dc03080..4516fbf0167 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,7 +135,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
-	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	while(num_bytes > 0) {
@@ -163,7 +163,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 				break;
 			}
 			btrfs_drop_extent_cache(inode, start,
-						start + ins.offset - 1);
+						start + ins.offset - 1, 0);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
@@ -587,7 +587,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
-				ordered_extent->len - 1);
+				ordered_extent->len - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
@@ -880,7 +880,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
 	/* don't do orphan cleanup if the fs is readonly. */
-	if (root->inode->i_sb->s_flags & MS_RDONLY)
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return;
 
 	path = btrfs_alloc_path();
@@ -892,8 +892,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
 	key.offset = (u64)-1;
 
-	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, root->inode);
 
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -933,7 +931,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * crossing root thing.  we store the inode number in the
 		 * offset of the orphan item.
 		 */
-		inode = btrfs_iget_locked(root->inode->i_sb,
+		inode = btrfs_iget_locked(root->fs_info->sb,
 					  found_key.offset, root);
 		if (!inode)
 			break;
@@ -965,7 +963,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
+			trans = btrfs_start_transaction(root, 1);
 			btrfs_orphan_del(trans, inode);
+			btrfs_end_transaction(trans, root);
 			iput(inode);
 			continue;
 		}
@@ -988,7 +988,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
 
 	btrfs_free_path(path);
-	btrfs_end_transaction(trans, root);
 }
 
 void btrfs_read_locked_inode(struct inode *inode)
@@ -1343,8 +1342,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 mask = root->sectorsize - 1;
 
 	if (root->ref_cows)
-		btrfs_drop_extent_cache(inode,
-					new_size & (~mask), (u64)-1);
+		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1677,7 +1675,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						       hole_start, 0, 0,
 						       hole_size, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
-						(u64)-1);
+						(u64)-1, 0);
 			btrfs_check_file(root, inode);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -1843,6 +1841,24 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    struct btrfs_root *root, int wait)
+{
+	struct inode *inode;
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = root;
+
+	if (wait) {
+		inode = ilookup5(s, objectid, btrfs_find_actor,
+				 (void *)&args);
+	} else {
+		inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
+					(void *)&args);
+	}
+	return inode;
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
@@ -3266,7 +3282,7 @@ void btrfs_destroy_inode(struct inode *inode)
 			btrfs_put_ordered_extent(ordered);
 		}
 	}
-	btrfs_drop_extent_cache(inode, 0, (u64)-1);
+	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
@@ -3412,16 +3428,22 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
+	struct inode *inode;
 	unsigned long flags;
 
 	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	while(!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
-		atomic_inc(&binode->vfs_inode.i_count);
+		inode = igrab(&binode->vfs_inode);
+		if (!inode)
+			list_del_init(&binode->delalloc_inodes);
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
-		filemap_write_and_wait(binode->vfs_inode.i_mapping);
-		iput(&binode->vfs_inode);
+		if (inode) {
+			filemap_write_and_wait(inode->i_mapping);
+			iput(inode);
+		}
+		cond_resched();
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	}
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4c6e0c15754..04de767a8db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -444,12 +444,10 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 		goto out;
 	}
 
-	mutex_lock(&root->fs_info->drop_mutex);
 	if (root == root->fs_info->tree_root)
 		ret = create_subvol(root, vol_args->name, namelen);
 	else
 		ret = create_snapshot(root, vol_args->name, namelen);
-	mutex_unlock(&root->fs_info->drop_mutex);
 out:
 	kfree(vol_args);
 	return ret;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index da6d43eb41d..951eacff242 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -309,7 +309,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 {
 	struct list_head splice;
 	struct list_head *cur;
-	struct list_head *tmp;
 	struct btrfs_ordered_extent *ordered;
 	struct inode *inode;
 
@@ -317,37 +316,38 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 
 	spin_lock(&root->fs_info->ordered_extent_lock);
 	list_splice_init(&root->fs_info->ordered_extents, &splice);
-	list_for_each_safe(cur, tmp, &splice) {
+	while (!list_empty(&splice)) {
 		cur = splice.next;
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
 		if (nocow_only &&
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+			list_move(&ordered->root_extent_list,
+				  &root->fs_info->ordered_extents);
 			cond_resched_lock(&root->fs_info->ordered_extent_lock);
 			continue;
 		}
 
 		list_del_init(&ordered->root_extent_list);
 		atomic_inc(&ordered->refs);
-		inode = ordered->inode;
 
 		/*
-		 * the inode can't go away until all the pages are gone
-		 * and the pages won't go away while there is still
-		 * an ordered extent and the ordered extent won't go
-		 * away until it is off this list.  So, we can safely
-		 * increment i_count here and call iput later
+		 * the inode may be getting freed (in sys_unlink path).
 		 */
-		atomic_inc(&inode->i_count);
+		inode = igrab(ordered->inode);
+
 		spin_unlock(&root->fs_info->ordered_extent_lock);
 
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-		iput(inode);
+		if (inode) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			iput(inode);
+		} else {
+			btrfs_put_ordered_extent(ordered);
+		}
 
 		spin_lock(&root->fs_info->ordered_extent_lock);
 	}
-	list_splice_init(&splice, &root->fs_info->ordered_extents);
 	spin_unlock(&root->fs_info->ordered_extent_lock);
 	return 0;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 656baefa525..8c83cf464c8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -109,6 +109,7 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 			spin_lock_init(&dirty->root->node_lock);
 			spin_lock_init(&dirty->root->list_lock);
 			mutex_init(&dirty->root->objectid_mutex);
+			mutex_init(&dirty->root->log_mutex);
 			INIT_LIST_HEAD(&dirty->root->dead_list);
 			dirty->root->node = root->commit_root;
 			dirty->root->commit_root = NULL;
@@ -590,13 +591,14 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		root = dirty->latest_root;
 		atomic_inc(&root->fs_info->throttles);
 
-		mutex_lock(&root->fs_info->drop_mutex);
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
+			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
 			if (ret != -EAGAIN) {
 				break;
 			}
+			mutex_unlock(&root->fs_info->drop_mutex);
 
 			err = btrfs_update_root(trans,
 					tree_root,
@@ -608,10 +610,8 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
 
-			mutex_unlock(&root->fs_info->drop_mutex);
 			btrfs_btree_balance_dirty(tree_root, nr);
 			cond_resched();
-			mutex_lock(&root->fs_info->drop_mutex);
 		}
 		BUG_ON(ret);
 		atomic_dec(&root->fs_info->throttles);
@@ -689,7 +689,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
-	key.offset = 1;
+	key.offset = trans->transid;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
-- 
cgit v1.2.3


From 1a40e23b95da45051ee4d74374c58ae87a14051c Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:09:34 -0400
Subject: Btrfs: update space balancing code

This patch updates the space balancing code to utilize the new
backref format.  Before, btrfs-vol -b would break any COW links
on data blocks or metadata.  This was slow and caused the amount
of space used to explode if a large number of snapshots were present.

The new code keeps the sharing of all data extents and
most of the tree blocks.

To maintain the sharing of data extents, the space balance code uses
a separate inode to hold data extent pointers, then updates the references
to point to the new location.

To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
the same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).

To relocate a tree block referenced by a subvol, there are two steps:
COW the block through the subvol's reloc tree, then update the block
pointer in the subvol to point to the new block. Since all reloc trees
share the same root key objectid, doing special handling for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block needs to be COWed again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  155 +++-
 fs/btrfs/ctree.h       |   26 +-
 fs/btrfs/disk-io.c     |    9 +
 fs/btrfs/extent-tree.c | 2074 ++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/root-tree.c   |    5 +-
 fs/btrfs/transaction.c |   15 +-
 fs/btrfs/volumes.c     |    9 +-
 7 files changed, 1848 insertions(+), 445 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f9cd40967d0..50e81f43e6d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -179,7 +179,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *cow;
 	u32 nritems;
 	int ret = 0;
-	int different_trans = 0;
 	int level;
 	int unlock_orig = 0;
 
@@ -233,13 +232,33 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		u32 nr_extents;
-		different_trans = 1;
 		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
 		if (ret)
 			return ret;
 
 		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
 		WARN_ON(ret);
+	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
+		/*
+		 * There are only two places that can drop reference to
+		 * tree blocks owned by living reloc trees, one is here,
+		 * the other place is btrfs_merge_path. In both places,
+		 * we check reference count while tree block is locked.
+		 * Furthermore, if reference count is one, it won't get
+		 * increased by someone else.
+		 */
+		u32 refs;
+		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
+					      buf->len, &refs);
+		BUG_ON(ret);
+		if (refs == 1) {
+			ret = btrfs_update_ref(trans, root, buf, cow,
+					       0, nritems);
+			clean_tree_block(trans, root, buf);
+		} else {
+			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
+		}
+		BUG_ON(ret);
 	} else {
 		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
 		if (ret)
@@ -247,6 +266,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		clean_tree_block(trans, root, buf);
 	}
 
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		ret = btrfs_add_reloc_mapping(root, buf->start,
+					      buf->len, cow->start);
+		BUG_ON(ret);
+		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
+		WARN_ON(ret);
+	}
+
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
 
@@ -1466,6 +1493,130 @@ done:
 	return ret;
 }
 
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *parent;
+	struct btrfs_key key;
+	u64 bytenr;
+	u64 generation;
+	u32 blocksize;
+	int level;
+	int slot;
+	int key_match;
+	int ret;
+
+	eb = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+	BUG_ON(ret);
+
+	parent = eb;
+	while (1) {
+		level = btrfs_header_level(parent);
+		if (level == 0 || level <= lowest_level)
+			break;
+
+		ret = bin_search(parent, &node_keys[lowest_level], level,
+				 &slot);
+		if (ret && slot > 0)
+			slot--;
+
+		bytenr = btrfs_node_blockptr(parent, slot);
+		if (nodes[level - 1] == bytenr)
+			break;
+
+		blocksize = btrfs_level_size(root, level - 1);
+		generation = btrfs_node_ptr_generation(parent, slot);
+		btrfs_node_key_to_cpu(eb, &key, slot);
+		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
+
+		/*
+		 * if node keys match and node pointer hasn't been modified
+		 * in the running transaction, we can merge the path. for
+		 * reloc trees, the node pointer check is skipped, this is
+		 * because the reloc trees are fully controlled by the space
+		 * balance code, no one else can modify them.
+		 */
+		if (!nodes[level - 1] || !key_match ||
+		    (generation == trans->transid &&
+		     root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
+next_level:
+			if (level == 1 || level == lowest_level + 1)
+				break;
+
+			eb = read_tree_block(root, bytenr, blocksize,
+					     generation);
+			btrfs_tree_lock(eb);
+
+			ret = btrfs_cow_block(trans, root, eb, parent, slot,
+					      &eb, 0);
+			BUG_ON(ret);
+
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			parent = eb;
+			continue;
+		}
+
+		if (generation == trans->transid) {
+			u32 refs;
+			BUG_ON(btrfs_header_owner(eb) !=
+			       BTRFS_TREE_RELOC_OBJECTID);
+			/*
+			 * lock the block to keep __btrfs_cow_block from
+			 * changing the reference count.
+			 */
+			eb = read_tree_block(root, bytenr, blocksize,
+					     generation);
+			btrfs_tree_lock(eb);
+
+			ret = btrfs_lookup_extent_ref(trans, root, bytenr,
+						      blocksize, &refs);
+			BUG_ON(ret);
+			/*
+			 * if we replace a block whose reference count is one,
+			 * we have to "drop the subtree", so skip it for
+			 * simplicity
+			 */
+			if (refs == 1) {
+				btrfs_tree_unlock(eb);
+				free_extent_buffer(eb);
+				goto next_level;
+			}
+		}
+
+		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
+		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
+		btrfs_mark_buffer_dirty(parent);
+
+		ret = btrfs_inc_extent_ref(trans, root,
+					nodes[level - 1],
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					level - 1, 0);
+		BUG_ON(ret);
+		ret = btrfs_free_extent(trans, root, bytenr,
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					level - 1, 0, 1);
+		BUG_ON(ret);
+
+		if (generation == trans->transid) {
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		break;
+	}
+	btrfs_tree_unlock(parent);
+	free_extent_buffer(parent);
+	return 0;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e62a1b0a1f..2775e270881 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -604,6 +604,7 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
+	struct mutex tree_reloc_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -647,6 +648,10 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
+	/* tree relocation related fields */
+	struct extent_io_tree reloc_mapping_tree;
+	struct list_head dead_reloc_roots;
+	struct btrfs_leaf_ref_tree reloc_ref_tree;
 	struct btrfs_leaf_ref_tree shared_ref_tree;
 
 	struct kobject super_kobj;
@@ -698,6 +703,7 @@ struct btrfs_root {
 	struct btrfs_leaf_ref_tree ref_tree_struct;
 	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
+	struct btrfs_root *reloc_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -1517,7 +1523,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
@@ -1582,10 +1587,29 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 new_bytenr);
+int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 *new_bytenr);
+void btrfs_free_reloc_mappings(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8969fee2331..45bc3132b05 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1406,6 +1406,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
+	extent_io_tree_init(&fs_info->reloc_mapping_tree,
+			    fs_info->btree_inode->i_mapping, GFP_NOFS);
+	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
 	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
 
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1421,6 +1425,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->tree_reloc_mutex);
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1627,6 +1632,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 	}
+
+	ret = btrfs_cleanup_reloc_trees(tree_root);
+	BUG_ON(ret);
+
 	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ab099bc01a..8043b9d584a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1834,6 +1834,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_owner = btrfs_header_owner(buf);
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			clean_tree_block(NULL, root, buf);
@@ -2487,6 +2488,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
+	update_reserved_extents(root, start, len, 0);
 	maybe_unlock_mutex(root);
 	return 0;
 }
@@ -2947,6 +2949,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		 */
 		if (*level == 1) {
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
+			if (ref && ref->generation != ptr_gen) {
+				btrfs_free_leaf_ref(root, ref);
+				ref = NULL;
+			}
 			if (ref) {
 				ret = cache_drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
@@ -3153,34 +3159,6 @@ out:
 	return ret;
 }
 
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
-	struct btrfs_block_group_cache *block_group;
-	struct rb_node *n;
-
-	mutex_lock(&info->alloc_mutex);
-	spin_lock(&info->block_group_cache_lock);
-	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
-		block_group = rb_entry(n, struct btrfs_block_group_cache,
-				       cache_node);
-
-		spin_unlock(&info->block_group_cache_lock);
-		btrfs_remove_free_space_cache(block_group);
-		spin_lock(&info->block_group_cache_lock);
-
-		rb_erase(&block_group->cache_node,
-			 &info->block_group_cache_tree);
-
-		spin_lock(&block_group->space_info->lock);
-		list_del(&block_group->list);
-		spin_unlock(&block_group->space_info->lock);
-		kfree(block_group);
-	}
-	spin_unlock(&info->block_group_cache_lock);
-	mutex_unlock(&info->alloc_mutex);
-	return 0;
-}
-
 static unsigned long calc_ra(unsigned long start, unsigned long last,
 			     unsigned long nr)
 {
@@ -3192,37 +3170,43 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 {
 	u64 page_start;
 	u64 page_end;
+	unsigned long first_index;
 	unsigned long last_index;
 	unsigned long i;
 	struct page *page;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
-	unsigned long total_read = 0;
-	unsigned long ra_pages;
 	struct btrfs_ordered_extent *ordered;
-	struct btrfs_trans_handle *trans;
+	unsigned int total_read = 0;
+	unsigned int total_dirty = 0;
+	int ret = 0;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
 	mutex_lock(&inode->i_mutex);
-	i = start >> PAGE_CACHE_SHIFT;
+	first_index = start >> PAGE_CACHE_SHIFT;
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-	ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
+	/* make sure the dirty trick played by the caller works */
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    first_index, last_index);
+	if (ret)
+		goto out_unlock;
 
 	file_ra_state_init(ra, inode->i_mapping);
 
-	for (; i <= last_index; i++) {
-		if (total_read % ra_pages == 0) {
+	for (i = first_index ; i <= last_index; i++) {
+		if (total_read % ra->ra_pages == 0) {
 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-				       calc_ra(i, last_index, ra_pages));
+				       calc_ra(i, last_index, ra->ra_pages));
 		}
 		total_read++;
 again:
 		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-			goto truncate_racing;
+			BUG_ON(1);
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page) {
+			ret = -ENOMEM;
 			goto out_unlock;
 		}
 		if (!PageUptodate(page)) {
@@ -3231,6 +3215,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto out_unlock;
 			}
 		}
@@ -3251,14 +3236,13 @@ again:
 		}
 		set_page_extent_mapped(page);
 
-		/*
-		 * make sure page_mkwrite is called for this page if userland
-		 * wants to change it from mmap
-		 */
-		clear_page_dirty_for_io(page);
-
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		if (i == first_index)
+			set_extent_bits(io_tree, page_start, page_end,
+					EXTENT_BOUNDARY, GFP_NOFS);
+
 		set_page_dirty(page);
+		total_dirty++;
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
@@ -3266,350 +3250,1460 @@ again:
 	}
 
 out_unlock:
-	/* we have to start the IO in order to get the ordered extents
-	 * instantiated.  This allows the relocation to code to wait
-	 * for all the ordered extents to hit the disk.
-	 *
-	 * Otherwise, it would constantly loop over the same extents
-	 * because the old ones don't get deleted  until the IO is
-	 * started
-	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
-			       WB_SYNC_NONE);
 	kfree(ra);
-	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
-	if (trans) {
-		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-		mark_inode_dirty(inode);
-	}
 	mutex_unlock(&inode->i_mutex);
-	return 0;
-
-truncate_racing:
-	vmtruncate(inode, inode->i_size);
-	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-					   total_read);
-	goto out_unlock;
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+	return ret;
 }
 
-/*
- * The back references tell us which tree holds a ref on a block,
- * but it is possible for the tree root field in the reference to
- * reflect the original root before a snapshot was made.  In this
- * case we should search through all the children of a given root
- * to find potential holders of references on a block.
- *
- * Instead, we do something a little less fancy and just search
- * all the roots for a given key/block combination.
- */
-static int find_root_for_ref(struct btrfs_root *root,
-			     struct btrfs_path *path,
-			     struct btrfs_key *key0,
-			     int level,
-			     int file_key,
-			     struct btrfs_root **found_root,
-			     u64 bytenr)
-{
-	struct btrfs_key root_location;
-	struct btrfs_root *cur_root = *found_root;
-	struct btrfs_file_extent_item *file_extent;
-	u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
-	u64 found_bytenr;
-	int ret;
+/* add a pinned extent mapping to reloc_inode, then relocate_inode_pages() */
+static int noinline relocate_data_extent(struct inode *reloc_inode,
+					 struct btrfs_key *extent_key,
+					 u64 offset)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+	struct extent_map *em;
 
-	root_location.offset = (u64)-1;
-	root_location.type = BTRFS_ROOT_ITEM_KEY;
-	path->lowest_level = level;
-	path->reada = 0;
-	while(1) {
-		ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
-		found_bytenr = 0;
-		if (ret == 0 && file_key) {
-			struct extent_buffer *leaf = path->nodes[0];
-			file_extent = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(leaf, file_extent) ==
-			    BTRFS_FILE_EXTENT_REG) {
-				found_bytenr =
-					btrfs_file_extent_disk_bytenr(leaf,
-							       file_extent);
-		       }
-		} else if (!file_key) {
-			if (path->nodes[level])
-				found_bytenr = path->nodes[level]->start;
-		}
-
-		btrfs_release_path(cur_root, path);
-
-		if (found_bytenr == bytenr) {
-			*found_root = cur_root;
-			ret = 0;
-			goto out;
-		}
-		ret = btrfs_search_root(root->fs_info->tree_root,
-					root_search_start, &root_search_start);
-		if (ret)
-			break;
+	em = alloc_extent_map(GFP_NOFS);
+	BUG_ON(!em || IS_ERR(em));
 
-		root_location.objectid = root_search_start;
-		cur_root = btrfs_read_fs_root_no_name(root->fs_info,
-						      &root_location);
-		if (!cur_root) {
-			ret = 1;
+	em->start = extent_key->objectid - offset;	/* disk bytenr - @offset */
+	em->len = extent_key->offset;
+	em->block_start = extent_key->objectid;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	/* setup extent map to cheat btrfs_readpage */
+	mutex_lock(&BTRFS_I(reloc_inode)->extent_mutex);
+	while (1) {
+		int ret;
+		spin_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {	/* anything but a collision ends the loop */
+			free_extent_map(em);
 			break;
 		}
+		btrfs_drop_extent_cache(reloc_inode, em->start,
+					em->start + em->len - 1, 0); /* then retry */
 	}
-out:
-	path->lowest_level = 0;
-	return ret;
-}
+	mutex_unlock(&BTRFS_I(reloc_inode)->extent_mutex);
 
-/*
- * note, this releases the path
- */
-static int noinline relocate_one_reference(struct btrfs_root *extent_root,
-				  struct btrfs_path *path,
-				  struct btrfs_key *extent_key,
-				  u64 *last_file_objectid,
-				  u64 *last_file_offset,
-				  u64 *last_file_root,
-				  u64 last_extent)
-{
-	struct inode *inode;
-	struct btrfs_root *found_root;
-	struct btrfs_key root_location;
-	struct btrfs_key found_key;
-	struct btrfs_extent_ref *ref;
-	u64 ref_root;
-	u64 ref_gen;
-	u64 ref_objectid;
-	u64 ref_offset;
-	int ret;
-	int level;
+	return relocate_inode_pages(reloc_inode, extent_key->objectid - offset,
+				    extent_key->offset);
+}
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+struct btrfs_ref_path {
+	u64 extent_start;	/* bytenr of the extent the walk starts at */
+	u64 nodes[BTRFS_MAX_LEVEL];	/* bytenr recorded per level, 0 = unset */
+	u64 root_objectid;	/* btrfs_ref_root() of the ref ending the walk */
+	u64 root_generation;	/* btrfs_ref_generation() of that ref */
+	u64 owner_objectid;	/* ref objectid read at lowest_level */
+	u64 owner_offset;	/* ref offset read at lowest_level */
+	u32 num_refs;		/* ref count read at lowest_level */
+	int lowest_level;	/* level the walk started at, -1 until known */
+	int current_level;	/* level the walk is currently at */
+};
 
-	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			     struct btrfs_extent_ref);
-	ref_root = btrfs_ref_root(path->nodes[0], ref);
-	ref_gen = btrfs_ref_generation(path->nodes[0], ref);
-	ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
-	ref_offset = btrfs_ref_offset(path->nodes[0], ref);
-	btrfs_release_path(extent_root, path);
+struct disk_extent {
+	u64 disk_bytenr;	/* disk_bytenr of a file extent item */
+	u64 disk_num_bytes;	/* disk_num_bytes of a file extent item */
+	u64 offset;		/* offset field of a file extent item */
+	u64 num_bytes;		/* num_bytes of a file extent item */
+};
 
-	root_location.objectid = ref_root;
-	if (ref_gen == 0)
-		root_location.offset = 0;
-	else
-		root_location.offset = (u64)-1;
-	root_location.type = BTRFS_ROOT_ITEM_KEY;
+/* 1 for the root, extent, chunk, dev and tree-log tree ids, 0 otherwise */
+static int is_cowonly_root(u64 root_objectid)
+{
+	return root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+	       root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+	       root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+	       root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+	       root_objectid == BTRFS_TREE_LOG_OBJECTID;
+}
+
 
-	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
-						&root_location);
-	BUG_ON(!found_root);
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+/* step @ref_path to the first (@first_time) or next chain of back refs that
+ * leads up from the extent.  returns 0 once a chain is recorded, 1 when none
+ * is left (or the extent went away), <0 if a tree search fails. */
+static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *extent_root,
+				    struct btrfs_ref_path *ref_path,
+				    int first_time)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 bytenr;
+	u32 nritems;
+	int level;
+	int ret = 1;
 
-	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-		found_key.objectid = ref_objectid;
-		found_key.type = BTRFS_EXTENT_DATA_KEY;
-		found_key.offset = ref_offset;
-		level = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
-		if (last_extent == extent_key->objectid &&
-		    *last_file_objectid == ref_objectid &&
-		    *last_file_offset == ref_offset &&
-		    *last_file_root == ref_root)
-			goto out;
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
 
-		ret = find_root_for_ref(extent_root, path, &found_key,
-					level, 1, &found_root,
-					extent_key->objectid);
+	if (first_time) {
+		ref_path->lowest_level = -1;	/* fixed up from the first ref */
+		ref_path->current_level = -1;
+		goto walk_up;
+	}
+walk_down:
+	level = ref_path->current_level - 1;
+	while (level >= -1) {
+		u64 parent;
+		if (level < ref_path->lowest_level)
+			break;
 
-		if (ret)
-			goto out;
+		if (level >= 0) {
+			bytenr = ref_path->nodes[level];
+		} else {
+			bytenr = ref_path->extent_start;
+		}
+		BUG_ON(bytenr == 0);
 
-		if (last_extent == extent_key->objectid &&
-		    *last_file_objectid == ref_objectid &&
-		    *last_file_offset == ref_offset &&
-		    *last_file_root == ref_root)
-			goto out;
+		parent = ref_path->nodes[level + 1];
+		ref_path->nodes[level + 1] = 0;
+		ref_path->current_level = level;
+		BUG_ON(parent == 0);
 
-		inode = btrfs_iget_locked(extent_root->fs_info->sb,
-					  ref_objectid, found_root);
-		if (inode->i_state & I_NEW) {
-			/* the inode and parent dir are two different roots */
-			BTRFS_I(inode)->root = found_root;
-			BTRFS_I(inode)->location.objectid = ref_objectid;
-			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-			BTRFS_I(inode)->location.offset = 0;
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
+		key.objectid = bytenr;
+		key.offset = parent + 1;	/* resume after the parent tried last */
+		key.type = BTRFS_EXTENT_REF_KEY;
 
-		}
-		/* this can happen if the reference is not against
-		 * the latest version of the tree root
-		 */
-		if (is_bad_inode(inode))
+		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+		if (ret < 0)
 			goto out;
+		BUG_ON(ret == 0);
 
-		*last_file_objectid = inode->i_ino;
-		*last_file_root = found_root->root_key.objectid;
-		*last_file_offset = ref_offset;
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				goto next;
+			leaf = path->nodes[0];
+		}
 
-		relocate_inode_pages(inode, ref_offset, extent_key->offset);
-		iput(inode);
-	} else {
-		struct btrfs_trans_handle *trans;
-		struct extent_buffer *eb;
-		int needs_lock = 0;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid == bytenr &&
+				found_key.type == BTRFS_EXTENT_REF_KEY)
+			goto found;
+next:
+		level--;
+		btrfs_release_path(extent_root, path);
+		if (need_resched()) {
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
+	}
+	/* reached lowest level */
+	ret = 1;
+	goto out;
+walk_up:
+	level = ref_path->current_level;
+	while (level < BTRFS_MAX_LEVEL - 1) {
+		u64 ref_objectid;
+		if (level >= 0) {
+			bytenr = ref_path->nodes[level];
+		} else {
+			bytenr = ref_path->extent_start;
+		}
+		BUG_ON(bytenr == 0);
 
-		eb = read_tree_block(found_root, extent_key->objectid,
-				     extent_key->offset, 0);
-		btrfs_tree_lock(eb);
-		level = btrfs_header_level(eb);
+		key.objectid = bytenr;
+		key.offset = 0;
+		key.type = BTRFS_EXTENT_REF_KEY;
 
-		if (level == 0)
-			btrfs_item_key_to_cpu(eb, &found_key, 0);
-		else
-			btrfs_node_key_to_cpu(eb, &found_key, 0);
+		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
 
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				/* the extent was freed by someone */
+				if (ref_path->lowest_level == level)
+					goto out;
+				btrfs_release_path(extent_root, path);
+				goto walk_down;
+			}
+			leaf = path->nodes[0];
+		}
 
-		ret = find_root_for_ref(extent_root, path, &found_key,
-					level, 0, &found_root,
-					extent_key->objectid);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != bytenr ||
+				found_key.type != BTRFS_EXTENT_REF_KEY) {
+			/* the extent was freed by someone */
+			if (ref_path->lowest_level == level) {
+				ret = 1;
+				goto out;
+			}
+			btrfs_release_path(extent_root, path);
+			goto walk_down;
+		}
+found:
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_extent_ref);
+		ref_objectid = btrfs_ref_objectid(leaf, ref);
+		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+			if (first_time) {
+				level = (int)ref_objectid; /* objectid used as level */
+				BUG_ON(level >= BTRFS_MAX_LEVEL);
+				ref_path->lowest_level = level;
+				ref_path->current_level = level;
+				ref_path->nodes[level] = bytenr;
+			} else {
+				WARN_ON(ref_objectid != level);
+			}
+		} else {
+			WARN_ON(level != -1);
+		}
+		first_time = 0;
 
-		if (ret)
-			goto out;
+		if (ref_path->lowest_level == level) {
+			ref_path->owner_objectid = ref_objectid;
+			ref_path->owner_offset = btrfs_ref_offset(leaf, ref);
+			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+		}
 
 		/*
-		 * right here almost anything could happen to our key,
-		 * but that's ok.  The cow below will either relocate it
-		 * or someone else will have relocated it.  Either way,
-		 * it is in a different spot than it was before and
-		 * we're happy.
+		 * the block is tree root or the block isn't in reference
+		 * counted tree.
 		 */
+		if (found_key.objectid == found_key.offset ||
+		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+			ref_path->root_generation =
+				btrfs_ref_generation(leaf, ref);
+			if (level < 0) {
+				/* special reference from the tree log */
+				ref_path->nodes[0] = found_key.offset;
+				ref_path->current_level = 0;
+			}
+			ret = 0;
+			goto out;
+		}
 
-		trans = btrfs_start_transaction(found_root, 1);
+		level++;
+		BUG_ON(ref_path->nodes[level] != 0);
+		ref_path->nodes[level] = found_key.offset; /* taken as the parent */
+		ref_path->current_level = level;
 
-		if (found_root == extent_root->fs_info->extent_root ||
-		    found_root == extent_root->fs_info->chunk_root ||
-		    found_root == extent_root->fs_info->dev_root) {
-			needs_lock = 1;
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		/* ref made in the running transaction: stop walking up here */
+		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+			ref_path->root_generation =
+				btrfs_ref_generation(leaf, ref);
+			ret = 0;
+			goto out;
 		}
 
-		path->lowest_level = level;
-		path->reada = 2;
-		ret = btrfs_search_slot(trans, found_root, &found_key, path,
-					0, 1);
-		path->lowest_level = 0;
-		btrfs_release_path(found_root, path);
-
-		if (found_root == found_root->fs_info->extent_root)
-			btrfs_extent_post_op(trans, found_root);
-		if (needs_lock)
+		btrfs_release_path(extent_root, path);
+		if (need_resched()) {
 			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-
-		btrfs_end_transaction(trans, found_root);
-
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
 	}
+	/* reached max tree level, but no tree root found. */
+	BUG();
 out:
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
-	return 0;
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+	btrfs_free_path(path);
+	return ret;
 }
 
-static int noinline del_extent_zero(struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct btrfs_key *extent_key)
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				struct btrfs_ref_path *ref_path,
+				u64 extent_start)
 {
-	int ret;
-	struct btrfs_trans_handle *trans;
+	memset(ref_path, 0, sizeof(*ref_path));	/* forget any earlier walk */
+	ref_path->extent_start = extent_start;
 
-	trans = btrfs_start_transaction(extent_root, 1);
-	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
-	if (ret > 0) {
-		ret = -EIO;
-		goto out;
-	}
-	if (ret < 0)
-		goto out;
-	ret = btrfs_del_item(trans, extent_root, path);
-out:
-	btrfs_end_transaction(trans, extent_root);
-	return ret;
+	return __next_ref_path(trans, extent_root, ref_path, 1); /* first_time */
 }
 
-static int noinline relocate_one_extent(struct btrfs_root *extent_root,
-					struct btrfs_path *path,
-					struct btrfs_key *extent_key)
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct btrfs_ref_path *ref_path)
 {
-	struct btrfs_key key;
-	struct btrfs_key found_key;
+	return __next_ref_path(trans, extent_root, ref_path, 0); /* continue */
+}
+
+/* collect the disk extents behind reloc_inode's file range for @extent_key
+ * into @extents / @nr_extents.  returns 0 on success, 1 if @no_fragment is
+ * set but the first extent does not cover the range, <0 on error. */
+static int noinline get_new_locations(struct inode *reloc_inode,
+				      struct btrfs_key *extent_key,
+				      u64 offset, int no_fragment,
+				      struct disk_extent **extents,
+				      int *nr_extents)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *leaf;
-	u64 last_file_objectid = 0;
-	u64 last_file_root = 0;
-	u64 last_file_offset = (u64)-1;
-	u64 last_extent = 0;
+	struct disk_extent *exts = *extents;
+	struct btrfs_key found_key;
+	u64 cur_pos, last_byte;
 	u32 nritems;
-	u32 item_size;
-	int ret = 0;
+	int nr = 0;
+	int max = *nr_extents;
+	int ret;
 
-	if (extent_key->objectid == 0) {
-		ret = del_extent_zero(extent_root, path, extent_key);
-		goto out;
+	WARN_ON(!no_fragment && *extents);
+	if (!exts) {
+		max = 1;
+		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+		if (!exts)
+			return -ENOMEM;
 	}
-	key.objectid = extent_key->objectid;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = 0;
 
-	while(1) {
-		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 
-		if (ret < 0)
-			goto out;
+	cur_pos = extent_key->objectid - offset;
+	last_byte = extent_key->objectid + extent_key->offset;
+	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+				       cur_pos, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = -ENOENT;
+		goto out;
+	}
 
-		ret = 0;
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] == nritems) {
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret > 0) {
-				ret = 0;
-				goto out;
-			}
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto out;
+			if (ret > 0)
+				break;
 			leaf = path->nodes[0];
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != extent_key->objectid) {
+		if (found_key.offset != cur_pos ||
+		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
+		    found_key.objectid != reloc_inode->i_ino)
 			break;
-		}
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
 			break;
+		if (nr == max) {
+			struct disk_extent *old = exts;
+			exts = kzalloc(sizeof(*exts) * max * 2, GFP_NOFS);
+			if (!exts) {
+				exts = old;	/* out: frees it if it is ours */
+				ret = -ENOMEM;
+				goto out;
+			}
+			max *= 2;
+			memcpy(exts, old, sizeof(*exts) * nr);
+			if (old != *extents)
+				kfree(old);
 		}
 
-		key.offset = found_key.offset + 1;
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		exts[nr].disk_bytenr =
+			btrfs_file_extent_disk_bytenr(leaf, fi);
+		exts[nr].disk_num_bytes =
+			btrfs_file_extent_disk_num_bytes(leaf, fi);
+		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		WARN_ON(exts[nr].offset > 0);
+		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
-		ret = relocate_one_reference(extent_root, path, extent_key,
-					     &last_file_objectid,
-					     &last_file_offset,
-					     &last_file_root, last_extent);
-		if (ret)
+		cur_pos += exts[nr].num_bytes;
+		nr++;
+		if (cur_pos + offset >= last_byte)
+			break;
+		if (no_fragment) {
+			ret = 1;
 			goto out;
-		last_extent = extent_key->objectid;
+		}
+		path->slots[0]++;
+	}
+	WARN_ON(cur_pos + offset > last_byte);
+	if (cur_pos + offset < last_byte) {
+		ret = -ENOENT;
+		goto out;
 	}
 	ret = 0;
 out:
-	btrfs_release_path(extent_root, path);
-	return ret;
-}
-
+	btrfs_free_path(path);
+	if (!ret) {
+		*extents = exts;
+		*nr_extents = nr;
+	} else if (exts != *extents) {
+		kfree(exts);
+	}
+	return ret;
+}
+
+/*
+ * walk the file extent items in @root that point at @extent_key and switch
+ * them over to @new_extents.  returns 0, or <0 if a tree search fails.
+ */
+static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_key *extent_key,
+					struct btrfs_key *leaf_key,
+					struct btrfs_ref_path *ref_path,
+					struct disk_extent *new_extents,
+					int nr_extents)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	struct btrfs_key key;
+	u64 lock_start = 0;
+	u64 lock_end = 0;
+	u64 num_bytes;
+	u64 ext_offset;
+	u64 first_pos;
+	u32 nritems;
+	int extent_locked = 0;
+	int ret;
+
+	first_pos = ref_path->owner_offset;
+	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+		key.objectid = ref_path->owner_objectid;
+		key.offset = ref_path->owner_offset;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+	} else {
+		memcpy(&key, leaf_key, sizeof(key));
+	}
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto out;
+
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+next:
+		if (extent_locked && ret > 0) {
+			/*
+			 * the file extent item was modified by someone
+			 * before the extent got locked.
+			 */
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+			extent_locked = 0;
+		}
+
+		if (path->slots[0] >= nritems) {
+			if (ref_path->owner_objectid ==
+			    BTRFS_MULTIPLE_OBJECTIDS)
+				break;
+			BUG_ON(extent_locked);
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+			if ((key.objectid > ref_path->owner_objectid) ||
+			    (key.objectid == ref_path->owner_objectid &&
+			     key.type > BTRFS_EXTENT_DATA_KEY) ||
+			    (key.offset >= first_pos + extent_key->offset))
+				break;
+		}
+
+		if (inode && key.objectid != inode->i_ino) {
+			BUG_ON(extent_locked);
+			btrfs_release_path(root, path);
+			mutex_unlock(&inode->i_mutex);
+			iput(inode);
+			inode = NULL;
+			continue;
+		}
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY) {
+			path->slots[0]++;
+			ret = 1;
+			goto next;
+		}
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if ((btrfs_file_extent_type(leaf, fi) !=
+		     BTRFS_FILE_EXTENT_REG) ||
+		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+		     extent_key->objectid)) {
+			path->slots[0]++;
+			ret = 1;
+			goto next;
+		}
+
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		ext_offset = btrfs_file_extent_offset(leaf, fi);
+		if (first_pos > key.offset - ext_offset)
+			first_pos = key.offset - ext_offset;
+
+		if (!extent_locked) {
+			lock_start = key.offset;
+			lock_end = lock_start + num_bytes - 1;
+		} else {
+			BUG_ON(lock_start != key.offset);
+			BUG_ON(lock_end - lock_start + 1 < num_bytes);
+		}
+
+		if (!inode) {
+			btrfs_release_path(root, path);
+
+			inode = btrfs_iget_locked(root->fs_info->sb,
+						  key.objectid, root);
+			if (inode->i_state & I_NEW) {
+				BTRFS_I(inode)->root = root;
+				BTRFS_I(inode)->location.objectid =
+					key.objectid;
+				BTRFS_I(inode)->location.type =
+					BTRFS_INODE_ITEM_KEY;
+				BTRFS_I(inode)->location.offset = 0;
+				btrfs_read_locked_inode(inode);
+				unlock_new_inode(inode);
+			}
+			/*
+			 * some code calls btrfs_commit_transaction while
+			 * holding the i_mutex, so we can't use mutex_lock
+			 * here.
+			 */
+			if (is_bad_inode(inode) ||
+			    !mutex_trylock(&inode->i_mutex)) {
+				iput(inode);
+				inode = NULL;
+				key.offset = (u64)-1;	/* skip this inode */
+				goto skip;
+			}
+		}
+
+		if (!extent_locked) {
+			struct btrfs_ordered_extent *ordered;
+
+			btrfs_release_path(root, path);
+			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				    lock_end, GFP_NOFS);
+			ordered = btrfs_lookup_first_ordered_extent(inode,
+								    lock_end);
+			if (ordered &&
+			    ordered->file_offset <= lock_end &&
+			    ordered->file_offset + ordered->len > lock_start) {
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      lock_start, lock_end, GFP_NOFS);
+				btrfs_start_ordered_extent(inode, ordered, 1);
+				btrfs_put_ordered_extent(ordered);
+				key.offset += num_bytes; /* try the next extent */
+				goto skip;
+			}
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+
+			mutex_lock(&BTRFS_I(inode)->extent_mutex);
+			extent_locked = 1;
+			continue;	/* search again with the range locked */
+		}
+
+		if (nr_extents == 1) {
+			/* update extent pointer in place */
+			btrfs_set_file_extent_generation(leaf, fi,
+						trans->transid);
+			btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extents[0].disk_bytenr);
+			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extents[0].disk_num_bytes);
+			ext_offset += new_extents[0].offset;
+			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
+			btrfs_mark_buffer_dirty(leaf);
+
+			btrfs_drop_extent_cache(inode, key.offset,
+						key.offset + num_bytes - 1, 0);
+
+			ret = btrfs_inc_extent_ref(trans, root,
+						new_extents[0].disk_bytenr,
+						new_extents[0].disk_num_bytes,
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						key.objectid, key.offset);
+			BUG_ON(ret);
+
+			ret = btrfs_free_extent(trans, root,
+						extent_key->objectid,
+						extent_key->offset,
+						leaf->start,
+						btrfs_header_owner(leaf),
+						btrfs_header_generation(leaf),
+						key.objectid, key.offset, 0);
+			BUG_ON(ret);
+
+			btrfs_release_path(root, path);
+			key.offset += num_bytes;
+		} else {
+			u64 alloc_hint;
+			u64 extent_len;
+			int i;
+			/*
+			 * drop the old extent pointer first, then insert the
+			 * new pointers one by one
+			 */
+			btrfs_release_path(root, path);
+			ret = btrfs_drop_extents(trans, root, inode, key.offset,
+						 key.offset + num_bytes,
+						 key.offset, &alloc_hint);
+			BUG_ON(ret);
+
+			for (i = 0; i < nr_extents; i++) {
+				if (ext_offset >= new_extents[i].num_bytes) {
+					ext_offset -= new_extents[i].num_bytes;
+					continue;
+				}
+				extent_len = min(new_extents[i].num_bytes -
+						 ext_offset, num_bytes);
+
+				ret = btrfs_insert_empty_item(trans, root,
+							      path, &key,
+							      sizeof(*fi));
+				BUG_ON(ret);
+
+				leaf = path->nodes[0];
+				fi = btrfs_item_ptr(leaf, path->slots[0],
+						struct btrfs_file_extent_item);
+				btrfs_set_file_extent_generation(leaf, fi,
+							trans->transid);
+				btrfs_set_file_extent_type(leaf, fi,
+							BTRFS_FILE_EXTENT_REG);
+				btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extents[i].disk_bytenr);
+				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_len);
+				ext_offset += new_extents[i].offset;
+				btrfs_set_file_extent_offset(leaf, fi,
+							ext_offset);
+				btrfs_mark_buffer_dirty(leaf);
+
+				btrfs_drop_extent_cache(inode, key.offset,
+						key.offset + extent_len - 1, 0);
+
+				ret = btrfs_inc_extent_ref(trans, root,
+						new_extents[i].disk_bytenr,
+						new_extents[i].disk_num_bytes,
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						key.objectid, key.offset);
+				BUG_ON(ret);
+				btrfs_release_path(root, path);
+
+				inode->i_blocks += extent_len >> 9;
+
+				ext_offset = 0;
+				num_bytes -= extent_len;
+				key.offset += extent_len;
+
+				if (num_bytes == 0)
+					break;
+			}
+			BUG_ON(i >= nr_extents);
+		}
+
+		if (extent_locked) {
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+			extent_locked = 0;
+		}
+skip:
+		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
+		    key.offset >= first_pos + extent_key->offset)
+			break;
+
+		cond_resched();
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	if (inode) {
+		mutex_unlock(&inode->i_mutex);
+		if (extent_locked) {
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+		}
+		iput(inode);
+	}
+	return ret;
+}
+
+/* remember that the range starting at orig_bytenr now maps to new_bytenr */
+int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 new_bytenr)
+{
+	struct extent_io_tree *tree = &root->fs_info->reloc_mapping_tree;
+	u64 orig_end = orig_bytenr + num_bytes - 1;
+	set_extent_bits(tree, orig_bytenr, orig_end, EXTENT_LOCKED, GFP_NOFS);
+	set_state_private(tree, orig_bytenr, new_bytenr);
+	return 0;
+}
+
+/*
+ * follow the chain of relocation mappings that starts at @orig_bytenr and
+ * store where it ends in @new_bytenr.  returns -ENOENT when @orig_bytenr
+ * has no mapping at all, 0 otherwise.
+ */
+int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 *new_bytenr)
+{
+	struct extent_io_tree *tree = &root->fs_info->reloc_mapping_tree;
+	u64 next;
+	u64 cur = orig_bytenr;
+	u64 prev = orig_bytenr;
+
+	while (get_state_private(tree, cur, &next) == 0) {
+		prev = cur;
+		cur = next;
+	}
+
+	if (cur == orig_bytenr)
+		return -ENOENT;
+
+	/* more than one hop: store the final target on the first entry */
+	if (prev != orig_bytenr)
+		set_state_private(tree, orig_bytenr, cur);
+	*new_bytenr = cur;
+	return 0;
+}
+
+void btrfs_free_reloc_mappings(struct btrfs_root *root)
+{
+	struct extent_io_tree *tree = &root->fs_info->reloc_mapping_tree;
+	clear_extent_bits(tree, 0, (u64)-1, -1, GFP_NOFS); /* every bit, all of it */
+}
+
+/*
+ * duplicate the leaf ref looked up at @orig_start and add the copy for
+ * @buf.  does nothing when @buf is not a leaf.
+ */
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start)
+{
+	struct btrfs_leaf_ref *orig_ref;
+	struct btrfs_leaf_ref *new_ref;
+	int err;
+
+	BUG_ON(btrfs_header_generation(buf) != trans->transid);
+	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+	if (btrfs_header_level(buf) != 0)
+		return 0;
+
+	orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
+	if (!orig_ref)
+		return -ENOENT;
+
+	new_ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
+	if (!new_ref) {
+		btrfs_free_leaf_ref(root, orig_ref);
+		return -ENOMEM;
+	}
+
+	new_ref->nritems = orig_ref->nritems;
+	memcpy(new_ref->extents, orig_ref->extents,
+	       sizeof(new_ref->extents[0]) * new_ref->nritems);
+	btrfs_free_leaf_ref(root, orig_ref);
+	new_ref->root_gen = trans->transid;
+	new_ref->bytenr = buf->start;
+	new_ref->owner = btrfs_header_owner(buf);
+	new_ref->generation = btrfs_header_generation(buf);
+	err = btrfs_add_leaf_ref(root, new_ref, 0);
+	WARN_ON(err);
+	btrfs_free_leaf_ref(root, new_ref);
+	return 0;
+}
+
+/*
+ * drop the cached extent mappings for every non-inline, non-hole file
+ * extent in @leaf whose inode btrfs_ilookup() finds in @target_root.
+ */
+static int noinline invalidate_extent_cache(struct btrfs_root *root,
+					struct extent_buffer *leaf,
+					struct btrfs_block_group_cache *group,
+					struct btrfs_root *target_root)
+{
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	struct btrfs_key key;
+	u64 skip_objectid = 0;
+	u64 start, end;
+	u32 nritems = btrfs_header_nritems(leaf);
+	u32 slot;
+
+	for (slot = 0; slot < nritems; slot++) {
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid == skip_objectid)
+			continue;
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE ||
+		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+			continue;
+		if (!inode || inode->i_ino != key.objectid) {
+			iput(inode);
+			inode = btrfs_ilookup(target_root->fs_info->sb,
+					      key.objectid, target_root, 1);
+			if (!inode) {
+				skip_objectid = key.objectid;
+				continue;
+			}
+		}
+
+		start = key.offset;
+		end = start + btrfs_file_extent_num_bytes(leaf, fi) - 1;
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
+		btrfs_drop_extent_cache(inode, start, end, 1);
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		cond_resched();
+	}
+	iput(inode);
+	return 0;
+}
+
+static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct extent_buffer *leaf,
+					struct btrfs_block_group_cache *group,
+					struct inode *reloc_inode)
+{
+	struct btrfs_key key;
+	struct btrfs_key extent_key;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_leaf_ref *ref;
+	struct disk_extent *new_extent;
+	u64 bytenr;
+	u64 num_bytes;
+	u32 nritems;
+	u32 i;
+	int ext_index;
+	int nr_extent;
+	int ret;
+	/* repoint file extents of @leaf that overlap @group at new locations */
+	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
+	BUG_ON(!new_extent);
+	/* cached leaf ref of this leaf; its extent array is updated in step */
+	ref = btrfs_lookup_leaf_ref(root, leaf->start);
+	BUG_ON(!ref);
+	/* ext_index numbers the non-inline extents with disk_bytenr != 0 */
+	ext_index = -1;
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		/* counted before the range check so the index stays in step */
+		ext_index++;
+		if (bytenr >= group->key.objectid + group->key.offset ||
+		    bytenr + num_bytes <= group->key.objectid)
+			continue;	/* entirely outside the block group */
+		/* NOTE(review): presumably asks for one replacement extent */
+		extent_key.objectid = bytenr;
+		extent_key.offset = num_bytes;
+		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+		nr_extent = 1;
+		ret = get_new_locations(reloc_inode, &extent_key,
+					group->key.objectid, 1,
+					&new_extent, &nr_extent);
+		if (ret > 0)
+			continue;	/* positive return: extent left untouched */
+		BUG_ON(ret < 0);
+		/* the cached ref must match the item before either is changed */
+		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
+		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
+		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
+		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
+		/* rewrite the file extent item to describe the new extent */
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extent->disk_bytenr);
+		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extent->disk_num_bytes);
+		new_extent->offset += btrfs_file_extent_offset(leaf, fi);
+		btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
+		btrfs_mark_buffer_dirty(leaf);
+		/* reference the new extent first, then drop the old one */
+		ret = btrfs_inc_extent_ref(trans, root,
+					new_extent->disk_bytenr,
+					new_extent->disk_num_bytes,
+					leaf->start,
+					root->root_key.objectid,
+					trans->transid,
+					key.objectid, key.offset);
+		BUG_ON(ret);
+		ret = btrfs_free_extent(trans, root,
+					bytenr, num_bytes, leaf->start,
+					btrfs_header_owner(leaf),
+					btrfs_header_generation(leaf),
+					key.objectid, key.offset, 0);
+		BUG_ON(ret);
+		cond_resched();
+	}
+	kfree(new_extent);
+	BUG_ON(ext_index + 1 != ref->nritems);	/* every ref entry was seen */
+	btrfs_free_leaf_ref(root, ref);
+	return 0;	/* failures are BUG_ON()s, never returned */
+}
+
+/*
+ * Detach the reloc tree, if any, from @root and queue it on the
+ * fs-wide dead_reloc_roots list.  Always returns 0.
+ */
+int btrfs_free_reloc_root(struct btrfs_root *root)
+{
+	struct btrfs_root *dead = root->reloc_root;
+
+	/* nothing to do for a root that never got a reloc tree */
+	if (!dead)
+		return 0;
+
+	root->reloc_root = NULL;
+	list_add(&dead->dead_list, &root->fs_info->dead_reloc_roots);
+	return 0;
+}
+
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root *prev_root = NULL;
+	struct list_head dead_roots;
+	int ret;
+	unsigned long nr;
+	/* drop every queued dead reloc tree and delete its root item */
+	INIT_LIST_HEAD(&dead_roots);
+	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
+	/* the shared list is now empty; only the private copy is walked */
+	while (!list_empty(&dead_roots)) {
+		reloc_root = list_entry(dead_roots.prev,
+					struct btrfs_root, dead_list);
+		list_del_init(&reloc_root->dead_list);
+
+		BUG_ON(reloc_root->commit_root != NULL);
+		while (1) {
+			trans = btrfs_join_transaction(root, 1);
+			BUG_ON(!trans);
+
+			mutex_lock(&root->fs_info->drop_mutex);
+			ret = btrfs_drop_snapshot(trans, reloc_root);
+			if (ret != -EAGAIN)
+				break;	/* leaves with drop_mutex and trans held */
+			mutex_unlock(&root->fs_info->drop_mutex);
+			/* -EAGAIN: end this transaction and retry in a new one */
+			nr = trans->blocks_used;
+			ret = btrfs_end_transaction(trans, root);
+			BUG_ON(ret);
+			btrfs_btree_balance_dirty(root, nr);
+		}
+		/* NOTE(review): a non-EAGAIN error in ret is never checked */
+		free_extent_buffer(reloc_root->node);
+
+		ret = btrfs_del_root(trans, root->fs_info->tree_root,
+				     &reloc_root->root_key);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->drop_mutex);
+
+		nr = trans->blocks_used;
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+		btrfs_btree_balance_dirty(root, nr);
+		/* each root is freed one iteration late; the last one below */
+		kfree(prev_root);
+		prev_root = reloc_root;
+	}
+	if (prev_root) {
+		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
+		kfree(prev_root);
+	}
+	return 0;	/* always 0; failures are BUG_ON()s */
+}
+
+/*
+ * Queue @root on the fs-wide list of dead reloc roots.  Always returns 0.
+ */
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+	struct list_head *dead_roots = &root->fs_info->dead_reloc_roots;
+
+	list_add(&root->dead_list, dead_roots);
+	return 0;
+}
+
+/*
+ * Clean up relocation state found in the tree of tree roots.
+ *
+ * Root items keyed by BTRFS_TREE_RELOC_OBJECTID are collected with
+ * btrfs_find_dead_roots() under tree_reloc_mutex.  If that leaves
+ * anything on fs_info->dead_reloc_roots, a transaction is started and
+ * committed.  Finally orphan cleanup is run on the data relocation tree.
+ *
+ * Returns 0 on success, or the negative errno carried by the error
+ * pointer if the data relocation tree cannot be read.  Other failures
+ * are BUG_ON()s.
+ */
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key location;
+	int found;
+	int ret;
+
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+	BUG_ON(ret);
+	found = !list_empty(&root->fs_info->dead_reloc_roots);
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	if (found) {
+		trans = btrfs_start_transaction(root, 1);
+		BUG_ON(!trans);
+		ret = btrfs_commit_transaction(trans, root);
+		BUG_ON(ret);
+	}
+
+	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+	location.offset = (u64)-1;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+
+	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	BUG_ON(!reloc_root);
+	/*
+	 * Lookup failures come back as an error pointer, not NULL (see the
+	 * IS_ERR() check in create_reloc_inode()).  Do not hand one to
+	 * btrfs_orphan_cleanup().
+	 */
+	if (IS_ERR(reloc_root))
+		return PTR_ERR(reloc_root);
+	btrfs_orphan_cleanup(reloc_root);
+	return 0;
+}
+
+/*
+ * Create the reloc tree for a reference counted root, unless it already
+ * has one.
+ *
+ * The new tree starts as a copy of root->commit_root made with
+ * btrfs_copy_root().  Its root item is a copy of the subvol's root item
+ * with refs, bytenr, level and drop progress reset, inserted into the
+ * tree of tree roots under the key
+ * (BTRFS_TREE_RELOC_OBJECTID, BTRFS_ROOT_ITEM_KEY, root's objectid).
+ * The in-memory root is then read back and stored in root->reloc_root.
+ *
+ * Returns 0; every failure is a BUG_ON().
+ */
+static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	int ret;
+
+	BUG_ON(!root->ref_cows);
+	if (root->reloc_root)
+		return 0;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	ret = btrfs_copy_root(trans, root, root->commit_root,
+			      &eb, BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(ret);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.offset = root->root_key.objectid;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+	/*
+	 * root_item is a pointer, so the size must come from the struct it
+	 * points to.  sizeof(root_item) would copy only pointer-size bytes
+	 * and leave the rest of the kmalloc'ed item uninitialized before it
+	 * is written to the tree.
+	 */
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_refs(root_item, 0);
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
+	root_item->drop_level = 0;
+
+	/* eb->start and the level are recorded; the buffer can go now */
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						 &root_key);
+	BUG_ON(!reloc_root);
+	reloc_root->last_trans = trans->transid;
+	reloc_root->commit_root = NULL;
+	/* all reloc trees use the one fs-wide reloc_ref_tree */
+	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * Core function of space balance.
+ *
+ * The idea is to use reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, all reloc
+ * trees share the same key objectid. Reloc trees are snapshots of the
+ * latest committed roots (subvol root->commit_root). To relocate a tree
+ * block referenced by a subvol, the code COWs the block through the reloc
+ * tree, then updates the pointer in the subvol to point to the new block.
+ * Since all reloc trees share the same key objectid, we can easily do special
+ * handling to share tree blocks between reloc trees. Once a tree block has
+ * been COWed in one reloc tree, we can use the result when the same block
+ * is COWed again through other reloc trees.
+ */
+static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct btrfs_key *first_key,
+				      struct btrfs_ref_path *ref_path,
+				      struct btrfs_block_group_cache *group,
+				      struct inode *reloc_inode)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb = NULL;
+	struct btrfs_key *keys;
+	u64 *nodes;
+	int level;
+	int lowest_merge;
+	int lowest_level = 0;
+	int update_refs;
+	int ret;
+	/* returns 0; every failure in here is a BUG_ON() */
+	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		lowest_level = ref_path->owner_objectid; /* used as a level */
+	/* cow-only roots get a plain COW search and no reloc tree */
+	if (is_cowonly_root(ref_path->root_objectid)) {
+		path->lowest_level = lowest_level;
+		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+		BUG_ON(ret < 0);
+		path->lowest_level = 0;
+		btrfs_release_path(root, path);
+		return 0;
+	}
+	/* per-level first keys and block numbers for btrfs_merge_path() */
+	keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
+	BUG_ON(!keys);
+	nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
+	BUG_ON(!nodes);
+	/* tree_reloc_mutex is held from here until after the final merge */
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+	ret = init_reloc_tree(trans, root);
+	BUG_ON(ret);
+	reloc_root = root->reloc_root;
+	/* first walk of the reloc tree is a lookup without COW */
+	path->lowest_level = lowest_level;
+	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0);
+	BUG_ON(ret);
+	/*
+	 * get relocation mapping for tree blocks in the path
+	 */
+	lowest_merge = BTRFS_MAX_LEVEL;
+	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
+		u64 new_bytenr;
+		eb = path->nodes[level];
+		if (!eb || eb == reloc_root->node)
+			continue;
+		ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
+					      &new_bytenr);
+		if (ret)
+			continue;	/* no mapping recorded for this block */
+		if (level == 0)
+			btrfs_item_key_to_cpu(eb, &keys[level], 0);
+		else
+			btrfs_node_key_to_cpu(eb, &keys[level], 0);
+		nodes[level] = new_bytenr;
+		lowest_merge = level;
+	}
+	/* data owner and leaf older than this transaction: update refs below */
+	update_refs = 0;
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		eb = path->nodes[0];
+		if (btrfs_header_generation(eb) < trans->transid)
+			update_refs = 1;
+	}
+
+	btrfs_release_path(reloc_root, path);
+	/*
+	 * merge tree blocks that already relocated in other reloc trees
+	 */
+	if (lowest_merge != BTRFS_MAX_LEVEL) {
+		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+				       lowest_merge);
+		BUG_ON(ret < 0);
+	}
+	/*
+	 * cow any tree blocks that still haven't been relocated
+	 */
+	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
+	BUG_ON(ret);
+	/*
+	 * if we are relocating data block group, update extent pointers
+	 * in the newly created tree leaf.
+	 */
+	eb = path->nodes[0];
+	if (update_refs && nodes[0] != eb->start) {
+		ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
+					      reloc_inode);
+		BUG_ON(ret);
+	}
+	/* re-collect keys and blocks, now from the COWed reloc tree path */
+	memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
+	memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
+	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
+		eb = path->nodes[level];
+		if (!eb || eb == reloc_root->node)
+			continue;
+		BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
+		nodes[level] = eb->start;
+		if (level == 0)
+			btrfs_item_key_to_cpu(eb, &keys[level], 0);
+		else
+			btrfs_node_key_to_cpu(eb, &keys[level], 0);
+	}
+	/* take an extra leaf reference that survives btrfs_release_path() */
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		eb = path->nodes[0];
+		extent_buffer_get(eb);
+	}
+	btrfs_release_path(reloc_root, path);
+	/*
+	 * replace tree blocks in the fs tree with tree blocks in
+	 * the reloc tree.
+	 */
+	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
+	BUG_ON(ret < 0);
+	/* invalidate cached extents for the leaf, then drop the extra ref */
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		ret = invalidate_extent_cache(reloc_root, eb, group, root);
+		BUG_ON(ret);
+		free_extent_buffer(eb);
+	}
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	path->lowest_level = 0;
+	kfree(nodes);
+	kfree(keys);
+	return 0;
+}
+
+/*
+ * Relocate one tree block of @root by running relocate_one_path() with
+ * no block group and no reloc inode.
+ *
+ * For the extent, chunk and dev roots the call is made with
+ * fs_info->alloc_mutex held.  For the extent root,
+ * btrfs_extent_post_op() is run afterwards.  Returns 0; a failing
+ * relocate_one_path() is a BUG_ON().
+ */
+static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_key *first_key,
+					struct btrfs_ref_path *ref_path)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int is_extent_root = (root == fs_info->extent_root);
+	int hold_alloc_mutex = is_extent_root ||
+			       root == fs_info->chunk_root ||
+			       root == fs_info->dev_root;
+	int err;
+
+	if (hold_alloc_mutex)
+		mutex_lock(&fs_info->alloc_mutex);
+
+	err = relocate_one_path(trans, root, path, first_key,
+				ref_path, NULL, NULL);
+	BUG_ON(err);
+
+	if (is_extent_root)
+		btrfs_extent_post_op(trans, root);
+
+	if (hold_alloc_mutex)
+		mutex_unlock(&fs_info->alloc_mutex);
+
+	return 0;
+}
+
+/*
+ * Look up @extent_key in the extent tree and delete the item, all under
+ * fs_info->alloc_mutex.  The path is released before returning.
+ *
+ * Returns the btrfs_search_slot() result if it is non-zero, otherwise
+ * the result of btrfs_del_item().
+ */
+static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct btrfs_key *extent_key)
+{
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	int err;
+
+	mutex_lock(&fs_info->alloc_mutex);
+
+	err = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+	if (err == 0)
+		err = btrfs_del_item(trans, extent_root, path);
+
+	btrfs_release_path(extent_root, path);
+	mutex_unlock(&fs_info->alloc_mutex);
+	return err;
+}
+
+/*
+ * Read the root a reference path is rooted at.
+ *
+ * The lookup key is (root_objectid, BTRFS_ROOT_ITEM_KEY, offset), where
+ * offset is 0 for cow-only roots and (u64)-1 for all others.  The result
+ * of btrfs_read_fs_root_no_name() is returned unchanged.
+ */
+static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
+						struct btrfs_ref_path *ref_path)
+{
+	u64 root_objectid = ref_path->root_objectid;
+	struct btrfs_key lookup;
+
+	lookup.objectid = root_objectid;
+	lookup.type = BTRFS_ROOT_ITEM_KEY;
+	lookup.offset = is_cowonly_root(root_objectid) ? 0 : (u64)-1;
+
+	return btrfs_read_fs_root_no_name(fs_info, &lookup);
+}
+
+static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+					struct btrfs_path *path,
+					struct btrfs_key *extent_key,
+					struct btrfs_block_group_cache *group,
+					struct inode *reloc_inode, int pass)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *found_root;
+	struct btrfs_ref_path *ref_path = NULL;
+	struct disk_extent *new_extents = NULL;
+	int nr_extents = 0;
+	int loops;
+	int ret;
+	int level;
+	struct btrfs_key first_key;
+	u64 prev_block = 0;
+	/* alloc_mutex is dropped on entry and retaken before returning */
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+	/* one transaction covers every reference path of this extent */
+	trans = btrfs_start_transaction(extent_root, 1);
+	BUG_ON(!trans);
+	/* an extent item at objectid 0 is deleted instead of relocated */
+	if (extent_key->objectid == 0) {
+		ret = del_extent_zero(trans, extent_root, path, extent_key);
+		goto out;
+	}
+
+	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
+	if (!ref_path) {
+	       ret = -ENOMEM;
+	       goto out;
+	}
+	/* walk the reference paths; a positive return ends the walk */
+	for (loops = 0; ; loops++) {
+		if (loops == 0) {
+			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
+						   extent_key->objectid);
+		} else {
+			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
+		}
+		if (ret < 0)
+			goto out;
+		if (ret > 0)
+			break;
+		/* references from log trees and reloc trees are skipped */
+		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+			continue;
+
+		found_root = read_ref_root(extent_root->fs_info, ref_path);
+		BUG_ON(!found_root);
+		/*
+		 * for reference counted tree, only process reference paths
+		 * rooted at the latest committed root.
+		 */
+		if (found_root->ref_cows &&
+		    ref_path->root_generation != found_root->root_key.offset)
+			continue;
+		/* owner >= FIRST_FREE_OBJECTID: data path; else a tree level */
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			if (pass == 0) {
+				/*
+				 * copy data extents to new locations
+				 */
+				u64 group_start = group->key.objectid;
+				ret = relocate_data_extent(reloc_inode,
+							   extent_key,
+							   group_start);
+				if (ret < 0)
+					goto out;
+				break;	/* pass 0 stops after the first copy */
+			}
+			level = 0;
+		} else {
+			level = ref_path->owner_objectid;
+		}
+		/* first_key is re-read only when the block at @level changes */
+		if (prev_block != ref_path->nodes[level]) {
+			struct extent_buffer *eb;
+			u64 block_start = ref_path->nodes[level];
+			u64 block_size = btrfs_level_size(found_root, level);
+			/* NOTE(review): read_tree_block() result is unchecked */
+			eb = read_tree_block(found_root, block_start,
+					     block_size, 0);
+			btrfs_tree_lock(eb);
+			BUG_ON(level != btrfs_header_level(eb));
+
+			if (level == 0)
+				btrfs_item_key_to_cpu(eb, &first_key, 0);
+			else
+				btrfs_node_key_to_cpu(eb, &first_key, 0);
+
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+			prev_block = block_start;
+		}
+
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+		    pass >= 2) {
+			/*
+			 * use fallback method to process the remaining
+			 * references.
+			 */
+			if (!new_extents) {
+				u64 group_start = group->key.objectid;
+				ret = get_new_locations(reloc_inode,
+							extent_key,
+							group_start, 0,
+							&new_extents,
+							&nr_extents);
+				if (ret < 0)
+					goto out;
+			}
+			btrfs_record_root_in_trans(found_root);
+			ret = replace_one_extent(trans, found_root,
+						path, extent_key,
+						&first_key, ref_path,
+						new_extents, nr_extents);
+			if (ret < 0)
+				goto out;
+			continue;
+		}
+		/* tree blocks in any pass, and data references in pass 1 */
+		btrfs_record_root_in_trans(found_root);
+		if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+			ret = relocate_tree_block(trans, found_root, path,
+						  &first_key, ref_path);
+		} else {
+			/*
+			 * try to update data extent references while
+			 * keeping metadata shared between snapshots.
+			 */
+			ret = relocate_one_path(trans, found_root, path,
+						&first_key, ref_path,
+						group, reloc_inode);
+		}
+		if (ret < 0)
+			goto out;
+	}
+	ret = 0;
+out:
+	btrfs_end_transaction(trans, extent_root); /* result is ignored */
+	kfree(new_extents);
+	kfree(ref_path);
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
+	return ret;
+}
+
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
@@ -3686,84 +4780,155 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
+/*
+ * Insert a bare inode item for @objectid into @root.
+ *
+ * The item is zeroed and then given generation 1, the requested @size,
+ * mode S_IFREG | 0600 and the BTRFS_INODE_NODATASUM flag.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * error from btrfs_insert_empty_inode().
+ */
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 u64 objectid, u64 size)
+{
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_inode_item *inode_item;
+	struct extent_buffer *eb;
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	err = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (!err) {
+		eb = path->nodes[0];
+		inode_item = btrfs_item_ptr(eb, path->slots[0],
+					    struct btrfs_inode_item);
+
+		/* start from an all-zero item, then fill the few fields used */
+		memset_extent_buffer(eb, 0, (unsigned long)inode_item,
+				     sizeof(*inode_item));
+		btrfs_set_inode_generation(eb, inode_item, 1);
+		btrfs_set_inode_size(eb, inode_item, size);
+		btrfs_set_inode_mode(eb, inode_item, S_IFREG | 0600);
+		btrfs_set_inode_flags(eb, inode_item, BTRFS_INODE_NODATASUM);
+		btrfs_mark_buffer_dirty(eb);
+		btrfs_release_path(root, path);
+	}
+
+	btrfs_free_path(path);
+	return err;
+}
+
+static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *group)
+{
+	struct inode *inode = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	int err = 0;
+	/* make an orphan inode in the data reloc tree, sized like @group */
+	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+	/* returns the new inode, or an ERR_PTR() when err is set below */
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+	if (err)
+		goto out;
+	/* the inode item's size is the block group length */
+	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+	BUG_ON(err);
+	/* NOTE(review): presumably one hole extent over the whole size */
+	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+				       group->key.offset, 0);
+	BUG_ON(err);
+	/* NOTE(review): btrfs_iget_locked() result is used unchecked */
+	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+	if (inode->i_state & I_NEW) {
+		BTRFS_I(inode)->root = root;
+		BTRFS_I(inode)->location.objectid = objectid;
+		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+		BTRFS_I(inode)->location.offset = 0;
+		btrfs_read_locked_inode(inode);
+		unlock_new_inode(inode);
+		BUG_ON(is_bad_inode(inode));
+	} else {
+		BUG_ON(1);	/* the inode was not newly created (no I_NEW) */
+	}
+	/* a failure here is turned into ERR_PTR(err) after the iput below */
+	err = btrfs_orphan_add(trans, inode);
+out:
+	btrfs_end_transaction(trans, root);
+	if (err) {
+		if (inode)
+			iput(inode);
+		inode = ERR_PTR(err);
+	}
+	return inode;
+}
+
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_path *path;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct extent_buffer *leaf;
+	struct inode *reloc_inode;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_key key;
 	u64 cur_byte;
 	u64 total_found;
-	u64 shrink_last_byte;
-	struct btrfs_block_group_cache *shrink_block_group;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
 	int progress;
+	int pass = 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
-						      shrink_start);
-	BUG_ON(!shrink_block_group);
+	root = root->fs_info->extent_root;
+
+	block_group = btrfs_lookup_block_group(info, group_start);
+	BUG_ON(!block_group);
 
-	shrink_last_byte = shrink_block_group->key.objectid +
-		shrink_block_group->key.offset;
+	printk("btrfs relocating block group %llu flags %llu\n",
+	       (unsigned long long)block_group->key.objectid,
+	       (unsigned long long)block_group->flags);
 
-	shrink_block_group->space_info->total_bytes -=
-		shrink_block_group->key.offset;
 	path = btrfs_alloc_path();
-	root = root->fs_info->extent_root;
-	path->reada = 2;
+	BUG_ON(!path);
 
-	printk("btrfs relocating block group %llu flags %llu\n",
-	       (unsigned long long)shrink_start,
-	       (unsigned long long)shrink_block_group->flags);
+	reloc_inode = create_reloc_inode(info, block_group);
+	BUG_ON(IS_ERR(reloc_inode));
 
-	__alloc_chunk_for_shrink(root, shrink_block_group, 1);
+	mutex_lock(&root->fs_info->alloc_mutex);
 
-again:
+	__alloc_chunk_for_shrink(root, block_group, 1);
+	block_group->ro = 1;
+	block_group->space_info->total_bytes -= block_group->key.offset;
 
-	shrink_block_group->ro = 1;
+	mutex_unlock(&root->fs_info->alloc_mutex);
 
+	btrfs_start_delalloc_inodes(info->tree_root);
+	btrfs_wait_ordered_extents(info->tree_root, 0);
+again:
 	total_found = 0;
 	progress = 0;
-	key.objectid = shrink_start;
+	key.objectid = block_group->key.objectid;
 	key.offset = 0;
 	key.type = 0;
 	cur_byte = key.objectid;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
 
-	btrfs_start_delalloc_inodes(root);
-	btrfs_wait_ordered_extents(tree_root, 0);
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_clean_old_snapshots(info->tree_root);
+	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
-	if (ret < 0)
-		goto out;
-
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid + found_key.offset > shrink_start &&
-		    found_key.objectid < shrink_last_byte) {
-			cur_byte = found_key.objectid;
-			key.objectid = cur_byte;
-		}
-	}
-	btrfs_release_path(root, path);
-
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
-
 next:
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
@@ -3779,109 +4944,76 @@ next:
 			nritems = btrfs_header_nritems(leaf);
 		}
 
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-		if (found_key.objectid >= shrink_last_byte)
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset)
 			break;
 
 		if (progress && need_resched()) {
-			memcpy(&key, &found_key, sizeof(key));
-			cond_resched();
 			btrfs_release_path(root, path);
-			btrfs_search_slot(NULL, root, &key, path, 0, 0);
+			mutex_unlock(&root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&root->fs_info->alloc_mutex);
 			progress = 0;
-			goto next;
+			continue;
 		}
 		progress = 1;
 
-		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
-		    found_key.objectid + found_key.offset <= cur_byte) {
-			memcpy(&key, &found_key, sizeof(key));
-			key.offset++;
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
+		    key.objectid + key.offset <= cur_byte) {
 			path->slots[0]++;
 			goto next;
 		}
 
 		total_found++;
-		cur_byte = found_key.objectid + found_key.offset;
-		key.objectid = cur_byte;
+		cur_byte = key.objectid + key.offset;
 		btrfs_release_path(root, path);
-		ret = relocate_one_extent(root, path, &found_key);
-		__alloc_chunk_for_shrink(root, shrink_block_group, 0);
-	}
-
-	btrfs_release_path(root, path);
-
-	if (total_found > 0) {
-		printk("btrfs relocate found %llu last extent was %llu\n",
-		       (unsigned long long)total_found,
-		       (unsigned long long)found_key.objectid);
-		mutex_unlock(&root->fs_info->alloc_mutex);
-		trans = btrfs_start_transaction(tree_root, 1);
-		btrfs_commit_transaction(trans, tree_root);
 
-		btrfs_clean_old_snapshots(tree_root);
+		__alloc_chunk_for_shrink(root, block_group, 0);
+		ret = relocate_one_extent(root, path, &key, block_group,
+					  reloc_inode, pass);
+		BUG_ON(ret < 0);
 
-		btrfs_start_delalloc_inodes(root);
-		btrfs_wait_ordered_extents(tree_root, 0);
-
-		trans = btrfs_start_transaction(tree_root, 1);
-		btrfs_commit_transaction(trans, tree_root);
-		mutex_lock(&root->fs_info->alloc_mutex);
-		goto again;
+		key.objectid = cur_byte;
+		key.type = 0;
+		key.offset = 0;
 	}
 
-	/*
-	 * we've freed all the extents, now remove the block
-	 * group item from the tree
-	 */
+	btrfs_release_path(root, path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	mutex_lock(&root->fs_info->alloc_mutex);
-	memcpy(&key, &shrink_block_group->key, sizeof(key));
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret > 0)
-		ret = -EIO;
-	if (ret < 0) {
-		btrfs_end_transaction(trans, root);
-		goto out;
+	if (pass == 0) {
+		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
+		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
+		WARN_ON(reloc_inode->i_mapping->nrpages);
 	}
 
-	spin_lock(&root->fs_info->block_group_cache_lock);
-	rb_erase(&shrink_block_group->cache_node,
-		 &root->fs_info->block_group_cache_tree);
-	spin_unlock(&root->fs_info->block_group_cache_lock);
-
-	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
-				      key.offset);
-	if (ret) {
-		btrfs_end_transaction(trans, root);
-		goto out;
+	if (total_found > 0) {
+		printk("btrfs found %llu extents in pass %d\n",
+		       (unsigned long long)total_found, pass);
+		pass++;
+		goto again;
 	}
-	/*
-	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
-	kfree(shrink_block_group);
-	*/
 
-	btrfs_del_item(trans, root, path);
-	btrfs_release_path(root, path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
-	btrfs_commit_transaction(trans, root);
+	/* delete reloc_inode */
+	iput(reloc_inode);
+
+	/* unpin extents in this range */
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 
-	/* the code to unpin extents might set a few bits in the free
-	 * space cache for this range again
-	 */
-	/* XXX? */
-	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
-				      key.offset);
+	spin_lock(&block_group->lock);
+	WARN_ON(block_group->pinned > 0);
+	WARN_ON(block_group->reserved > 0);
+	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
+	spin_unlock(&block_group->lock);
+	ret = 0;
 out:
-	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -3922,6 +5054,33 @@ out:
 	return ret;
 }
 
+/*
+ * Free every cached block group of @info.
+ *
+ * Runs under alloc_mutex.  Block groups are taken from the end of the
+ * block group cache tree one at a time; block_group_cache_lock is
+ * dropped around btrfs_remove_free_space_cache() and held for the
+ * rb_erase().  Each group is unlinked from its space_info list under
+ * the space_info lock and then freed.  Always returns 0.
+ */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *cache;
+	struct rb_node *node;
+
+	mutex_lock(&info->alloc_mutex);
+	spin_lock(&info->block_group_cache_lock);
+	for (;;) {
+		node = rb_last(&info->block_group_cache_tree);
+		if (node == NULL)
+			break;
+		cache = rb_entry(node, struct btrfs_block_group_cache,
+				 cache_node);
+
+		/* the free space cache is torn down without the spinlock */
+		spin_unlock(&info->block_group_cache_lock);
+		btrfs_remove_free_space_cache(cache);
+		spin_lock(&info->block_group_cache_lock);
+
+		rb_erase(&cache->cache_node, &info->block_group_cache_tree);
+
+		spin_lock(&cache->space_info->lock);
+		list_del(&cache->list);
+		spin_unlock(&cache->space_info->lock);
+
+		kfree(cache);
+	}
+	spin_unlock(&info->block_group_cache_lock);
+	mutex_unlock(&info->alloc_mutex);
+	return 0;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -4039,3 +5198,46 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	return 0;
 }
+
+/*
+ * Remove the block group that starts at @group_start.
+ *
+ * The group's free space cache is dropped, the group is unlinked from
+ * the block group cache tree and from its space_info list, and its item
+ * is deleted from the extent tree.
+ *
+ * The caller must hold fs_info->alloc_mutex.
+ *
+ * Returns 0 on success, -EIO if the block group item is not found, or
+ * the negative error from btrfs_search_slot() / btrfs_del_item().
+ */
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start)
+{
+	struct btrfs_path *path;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_key key;
+	int ret;
+
+	BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	root = root->fs_info->extent_root;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+	BUG_ON(!block_group);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	btrfs_remove_free_space_cache(block_group);
+
+	/*
+	 * The cache tree is modified under block_group_cache_lock, the same
+	 * way btrfs_free_block_groups() erases nodes from it.
+	 */
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	rb_erase(&block_group->cache_node,
+		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+
+	spin_lock(&block_group->space_info->lock);
+	list_del(&block_group->list);
+	spin_unlock(&block_group->space_info->lock);
+
+	/* NOTE(review): block_group itself is not freed here */
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e3984f902e7..0091c01abb0 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -210,7 +210,10 @@ again:
 			goto err;
 		}
 
-		ret = btrfs_add_dead_root(dead_root, latest);
+		if (objectid == BTRFS_TREE_RELOC_OBJECTID)
+			ret = btrfs_add_dead_reloc_root(dead_root);
+		else
+			ret = btrfs_add_dead_root(dead_root, latest);
 		if (ret)
 			goto err;
 		goto again;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8c83cf464c8..444abe0796a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -477,6 +477,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			dirty = root->dirty_root;
 
 			btrfs_free_log(trans, root);
+			btrfs_free_reloc_root(root);
 
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
@@ -855,6 +856,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 * with the tree-log code.
 	 */
 	mutex_lock(&root->fs_info->tree_log_mutex);
+	/*
+	 * keep tree reloc code from adding new reloc trees
+	 */
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+
 
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
@@ -865,6 +871,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
+	btrfs_free_reloc_mappings(root);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
@@ -910,10 +918,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->tree_log_mutex);
 
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
-	mutex_lock(&root->fs_info->trans_mutex);
-
 	kfree(pinned_copy);
 
+	btrfs_drop_dead_reloc_roots(root);
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+
 	cur_trans->commit_done = 1;
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ddf89626498..51f113119b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1268,7 +1268,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em_tree = &root->fs_info->mapping_tree.map_tree;
 
 	/* step one, relocate all the extents inside this chunk */
-	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 	BUG_ON(ret);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1308,15 +1308,18 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
 	kfree(map);
 	em->bdev = NULL;
 
 	/* once for the tree */
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
-
 	/* once for us */
 	free_extent_map(em);
 
-- 
cgit v1.2.3


From 8c8bee1d7ca47fc75b6bd24a8085c525a2394c02 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 11:19:10 -0400
Subject: Btrfs: Wait for IO on the block device inodes of newly added devices

btrfs-vol -a /dev/xxx will zero the first and last two MB of the device.
The kernel code needs to wait for this IO to finish before it adds
the device.

btrfs metadata IO does not happen through the block device inode.  A
separate address space is used, allowing the zero filled buffer heads in
the block device inode to be written to disk after FS metadata starts
going down to the disk via the btrfs metadata inode.

The end result is zero filled metadata blocks after adding new devices
into the filesystem.

The fix is a simple filemap_write_and_wait on the block device inode
before actually inserting it into the pool of available devices.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 11 ++++++++---
 fs/btrfs/inode.c   | 13 ++++++++++++-
 fs/btrfs/volumes.c |  1 +
 4 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2775e270881..0079b60b18f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -610,6 +610,7 @@ struct btrfs_fs_info {
 	struct list_head dead_roots;
 
 	atomic_t nr_async_submits;
+	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t tree_log_writers;
 	atomic_t tree_log_commit;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45bc3132b05..45b4f728527 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -460,6 +460,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+
+	while(atomic_read(&fs_info->async_submit_draining) &&
+	      atomic_read(&fs_info->nr_async_submits)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0));
+	}
+
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 
@@ -495,11 +502,8 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 offset;
 	int ret;
 
-	offset = bio->bi_sector << 9;
-
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
@@ -1360,6 +1364,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
 	atomic_set(&fs_info->throttle_gen, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4516fbf0167..404704d2682 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3440,13 +3440,24 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 			list_del_init(&binode->delalloc_inodes);
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 		if (inode) {
-			filemap_write_and_wait(inode->i_mapping);
+			filemap_flush(inode->i_mapping);
 			iput(inode);
 		}
 		cond_resched();
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	}
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+
+	/* the filemap_flush will queue IO into the worker threads, but
+	 * we have to make sure the IO is actually started and that
+	 * ordered extents get created before we return
+	 */
+	atomic_inc(&root->fs_info->async_submit_draining);
+	while(atomic_read(&root->fs_info->nr_async_submits)) {
+		wait_event(root->fs_info->async_submit_wait,
+		   (atomic_read(&root->fs_info->nr_async_submits) == 0));
+	}
+	atomic_dec(&root->fs_info->async_submit_draining);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f113119b2..f63cf7621a0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1038,6 +1038,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
-- 
cgit v1.2.3


From 9a5e1ea1e1e539e244a54afffc330fc368376ab9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 11:24:41 -0400
Subject: Btrfs: drop WARN_ON from btrfs_add_leaf_ref

btrfs_add_leaf_ref was doing checks on the objects it found in the
rbtree to make sure they were properly linked into the tree.  But, the field
it was checking can be safely changed outside of the tree spin lock.

The WARN_ON was for debugging the initial implementation and can be
safely removed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ref-cache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index c5809988c87..30fcb7aea5b 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -67,7 +67,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 	while(*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
-		WARN_ON(!entry->in_tree);
 
 		if (bytenr < entry->bytenr)
 			p = &(*p)->rb_left;
-- 
cgit v1.2.3


From d352ac68148b69937d39ca5d48bcc4478e118dbf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 15:18:18 -0400
Subject: Btrfs: add and improve comments

This improves the comments at the top of many functions.  It didn't
dive into the guts of functions because I was trying to
avoid merging problems with the new allocator and back reference work.

extent-tree.c and volumes.c were both skipped, and there is definitely
more work to do in cleaning and commenting the code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/TODO           |  20 -----
 fs/btrfs/async-thread.c |  10 ++-
 fs/btrfs/async-thread.h |   7 +-
 fs/btrfs/bit-radix.c    | 130 ---------------------------------
 fs/btrfs/bit-radix.h    |  33 ---------
 fs/btrfs/btrfs_inode.h  |  54 +++++++++++++-
 fs/btrfs/crc32c.h       |  18 +++++
 fs/btrfs/ctree.c        | 127 +++++++++++++++++++++++++++++---
 fs/btrfs/ctree.h        |   1 -
 fs/btrfs/dir-item.c     |  41 +++++++++++
 fs/btrfs/disk-io.c      |  33 ++++++++-
 fs/btrfs/extent_io.c    |  34 ++++++++-
 fs/btrfs/extent_map.c   |  10 +++
 fs/btrfs/file.c         |  44 ++++++++++-
 fs/btrfs/inode.c        | 189 +++++++++++++++++++++++++++++++-----------------
 fs/btrfs/locking.c      |  13 ++++
 fs/btrfs/ordered-data.c |  19 ++++-
 fs/btrfs/ref-cache.c    |  26 +++++++
 fs/btrfs/ref-cache.h    |   3 +
 fs/btrfs/root-tree.c    |  21 +++++-
 fs/btrfs/struct-funcs.c |  21 ++++++
 fs/btrfs/super.c        |   3 +
 fs/btrfs/transaction.c  |  67 ++++++++++++++++-
 fs/btrfs/tree-defrag.c  |   4 +
 25 files changed, 653 insertions(+), 277 deletions(-)
 delete mode 100644 fs/btrfs/TODO
 delete mode 100644 fs/btrfs/bit-radix.c
 delete mode 100644 fs/btrfs/bit-radix.h

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5c28557fba..48b7909ca8d 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -4,7 +4,7 @@ ifneq ($(KERNELRELEASE),)
 obj-m  := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
-	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
+	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
diff --git a/fs/btrfs/TODO b/fs/btrfs/TODO
deleted file mode 100644
index d9b6d38c603..00000000000
--- a/fs/btrfs/TODO
+++ /dev/null
@@ -1,20 +0,0 @@
-* cleanup, add more error checking, get rid of BUG_ONs
-* Fix ENOSPC handling
-* Make allocator smarter
-* add a block group to struct inode
-* Do actual block accounting
-* Check compat and incompat flags on the inode
-* Get rid of struct ctree_path, limiting tree levels held at one time
-* Add generation number to key pointer in nodes
-* Add generation number to inode
-* forbid cross subvolume renames and hardlinks
-* Release
-* Do real tree locking
-* Add extent mirroring (backup copies of blocks)
-* Add fancy interface to get access to incremental backups
-* Add fancy striped extents to make big reads faster
-* Use relocation to try and fix write errors
-* Make allocator much smarter
-* xattrs (directory streams for regular files)
-* Scrub & defrag
-
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4e780b279de..04fb9702d14 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -231,17 +231,25 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 
 	/*
 	 * if we pick a busy task, move the task to the end of the list.
-	 * hopefully this will keep things somewhat evenly balanced
+	 * hopefully this will keep things somewhat evenly balanced.
+	 * Do the move in batches based on the sequence number.  This groups
+	 * requests submitted at roughly the same time onto the same worker.
 	 */
 	next = workers->worker_list.next;
 	worker = list_entry(next, struct btrfs_worker_thread, worker_list);
 	atomic_inc(&worker->num_pending);
 	worker->sequence++;
+
 	if (worker->sequence % workers->idle_thresh == 0)
 		list_move_tail(next, &workers->worker_list);
 	return worker;
 }
 
+/*
+ * selects a worker thread to take the next job.  This will either find
+ * an idle worker, start a new worker up to the max count, or just return
+ * one of the existing busy workers.
+ */
 static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 43e44d115dd..4ec9a2ee0f9 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -63,14 +63,17 @@ struct btrfs_workers {
 	/* once a worker has this many requests or fewer, it is idle */
 	int idle_thresh;
 
-	/* list with all the work threads */
+	/* list with all the work threads.  The workers on the idle list
+	 * may be actively servicing jobs, but they haven't yet hit the
+	 * idle thresh limit above.
+	 */
 	struct list_head worker_list;
 	struct list_head idle_list;
 
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
 
-	/* extra name for this worker */
+	/* extra name for this worker, used for current->name */
 	char *name;
 };
 
diff --git a/fs/btrfs/bit-radix.c b/fs/btrfs/bit-radix.c
deleted file mode 100644
index e8bf876db39..00000000000
--- a/fs/btrfs/bit-radix.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include "bit-radix.h"
-
-#define BIT_ARRAY_BYTES 256
-#define BIT_RADIX_BITS_PER_ARRAY ((BIT_ARRAY_BYTES - sizeof(unsigned long)) * 8)
-
-extern struct kmem_cache *btrfs_bit_radix_cachep;
-int set_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-	int ret;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits) {
-		bits = kmem_cache_alloc(btrfs_bit_radix_cachep, GFP_NOFS);
-		if (!bits)
-			return -ENOMEM;
-		memset(bits + 1, 0, BIT_ARRAY_BYTES - sizeof(unsigned long));
-		bits[0] = slot;
-		ret = radix_tree_insert(radix, slot, bits);
-		if (ret)
-			return ret;
-	}
-	ret = test_and_set_bit(bit_slot, bits + 1);
-	if (ret < 0)
-		ret = 1;
-	return ret;
-}
-
-int test_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits)
-		return 0;
-	return test_bit(bit_slot, bits + 1);
-}
-
-int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit)
-{
-	unsigned long *bits;
-	unsigned long slot;
-	int bit_slot;
-	int i;
-	int empty = 1;
-
-	slot = bit / BIT_RADIX_BITS_PER_ARRAY;
-	bit_slot = bit % BIT_RADIX_BITS_PER_ARRAY;
-
-	bits = radix_tree_lookup(radix, slot);
-	if (!bits)
-		return 0;
-	clear_bit(bit_slot, bits + 1);
-	for (i = 1; i < BIT_ARRAY_BYTES / sizeof(unsigned long); i++) {
-		if (bits[i]) {
-			empty = 0;
-			break;
-		}
-	}
-	if (empty) {
-		bits = radix_tree_delete(radix, slot);
-		BUG_ON(!bits);
-		kmem_cache_free(btrfs_bit_radix_cachep, bits);
-	}
-	return 0;
-}
-
-int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 unsigned long start, int nr)
-{
-	unsigned long *bits;
-	unsigned long *gang[4];
-	int found;
-	int ret;
-	int i;
-	int total_found = 0;
-	unsigned long slot;
-
-	slot = start / BIT_RADIX_BITS_PER_ARRAY;
-	ret = radix_tree_gang_lookup(radix, (void **)gang, slot,
-				     ARRAY_SIZE(gang));
-	found = start % BIT_RADIX_BITS_PER_ARRAY;
-	for (i = 0; i < ret && nr > 0; i++) {
-		bits = gang[i];
-		while(nr > 0) {
-			found = find_next_bit(bits + 1,
-					      BIT_RADIX_BITS_PER_ARRAY,
-					      found);
-			if (found < BIT_RADIX_BITS_PER_ARRAY) {
-				*retbits = bits[0] *
-					BIT_RADIX_BITS_PER_ARRAY + found;
-				retbits++;
-				nr--;
-				total_found++;
-				found++;
-			} else
-				break;
-		}
-		found = 0;
-	}
-	return total_found;
-}
diff --git a/fs/btrfs/bit-radix.h b/fs/btrfs/bit-radix.h
deleted file mode 100644
index c100f54d5c3..00000000000
--- a/fs/btrfs/bit-radix.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2007 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __BIT_RADIX__
-#define __BIT_RADIX__
-#include <linux/radix-tree.h>
-
-int set_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int test_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int clear_radix_bit(struct radix_tree_root *radix, unsigned long bit);
-int find_first_radix_bit(struct radix_tree_root *radix, unsigned long *retbits,
-			 unsigned long start, int nr);
-
-static inline void init_bit_radix(struct radix_tree_root *radix)
-{
-	INIT_RADIX_TREE(radix, GFP_NOFS);
-}
-#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0577fda2168..0b2e623cf42 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -25,27 +25,58 @@
 
 /* in memory btrfs inode */
 struct btrfs_inode {
+	/* which subvolume this inode belongs to */
 	struct btrfs_root *root;
+
+	/* the block group preferred for allocations.  This pointer is buggy
+	 * and needs to be replaced with a bytenr instead
+	 */
 	struct btrfs_block_group_cache *block_group;
+
+	/* key used to find this inode on disk.  This is used by the code
+	 * to read in roots of subvolumes
+	 */
 	struct btrfs_key location;
+
+	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
+
+	/* the io_tree does range state (DIRTY, LOCKED etc) */
 	struct extent_io_tree io_tree;
+
+	/* special utility tree used to record which mirrors have already been
+	 * tried when checksums fail for a given block
+	 */
 	struct extent_io_tree io_failure_tree;
+
+	/* held while inserting checksums to avoid races */
 	struct mutex csum_mutex;
+
+	/* held while inserting or deleting extents from files */
 	struct mutex extent_mutex;
+
+	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
-	struct inode vfs_inode;
+
+	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
+	/* standard acl pointers */
 	struct posix_acl *i_acl;
 	struct posix_acl *i_default_acl;
 
 	/* for keeping track of orphaned inodes */
 	struct list_head i_orphan;
 
+	/* list of all the delalloc inodes in the FS.  There are times we need
+	 * to write all the delalloc pages to disk, and this list is used
+	 * to walk them all.
+	 */
 	struct list_head delalloc_inodes;
 
-	/* full 64 bit generation number */
+	/* full 64 bit generation number, struct inode doesn't have a big
+	 * enough field for this.
+	 */
 	u64 generation;
 
 	/*
@@ -57,10 +88,25 @@ struct btrfs_inode {
 	 */
 	u64 logged_trans;
 
-	/* trans that last made a change that should be fully fsync'd */
+	/*
+	 * trans that last made a change that should be fully fsync'd.  This
+	 * gets reset to zero each time the inode is logged
+	 */
 	u64 log_dirty_trans;
+
+	/* total number of bytes pending delalloc, used by stat to calc the
+	 * real block usage of the file
+	 */
 	u64 delalloc_bytes;
+
+	/*
+	 * the size of the file stored in the metadata on disk.  data=ordered
+	 * means the in-memory i_size might be larger than the size on disk
+	 * because not all the blocks are written yet.
+	 */
 	u64 disk_i_size;
+
+	/* flags field from the on disk inode */
 	u32 flags;
 
 	/*
@@ -68,6 +114,8 @@ struct btrfs_inode {
 	 * number for new files that are created
 	 */
 	u64 index_cnt;
+
+	struct inode vfs_inode;
 };
 
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index 4f0fefed132..1eaf11d334f 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -1,3 +1,21 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
 #ifndef __BTRFS_CRC32C__
 #define __BTRFS_CRC32C__
 #include <asm/byteorder.h>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 50e81f43e6d..ff3261ff2e1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007 Oracle.  All rights reserved.
+ * Copyright (C) 2007,2008 Oracle.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -54,12 +54,19 @@ struct btrfs_path *btrfs_alloc_path(void)
 	return path;
 }
 
+/* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
 	btrfs_release_path(NULL, p);
 	kmem_cache_free(btrfs_path_cachep, p);
 }
 
+/*
+ * path release drops references on the extent buffers in the path
+ * and it drops any locks held by this path
+ *
+ * It is safe to call this on paths that have no locks or extent buffers held.
+ */
 void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
@@ -77,6 +84,16 @@ void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 	}
 }
 
+/*
+ * safely gets a reference on the root node of a tree.  A lock
+ * is not taken, so a concurrent writer may put a different node
+ * at the root of the tree.  See btrfs_lock_root_node for the
+ * looping required.
+ *
+ * The extent buffer returned by this has a reference taken, so
+ * it won't disappear.  It may stop being the root of the tree
+ * at any time because there are no locks held.
+ */
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
@@ -87,6 +104,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
@@ -108,6 +129,10 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* cowonly roots (everything not a reference counted cow subvolume) just get
+ * put onto a simple dirty list.  transaction.c walks this to make sure they
+ * get properly updated on disk.
+ */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
 	if (root->track_dirty && list_empty(&root->dirty_list)) {
@@ -116,6 +141,11 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
 	}
 }
 
+/*
+ * used by snapshot creation to make a copy of a root for a tree with
+ * a given objectid.  The buffer with the new root node is returned in
+ * cow_ret, and this func returns zero on success or a negative error code.
+ */
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -167,6 +197,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * does the dirty work in cow of a single block.  The parent block
+ * (if supplied) is updated to point to the new cow copy.  The new
+ * buffer is marked dirty and returned locked.  If you modify the block
+ * it needs to be marked dirty again.
+ *
+ * search_start -- an allocation hint for the new block
+ *
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
+ * the allocator should try to find free next to the block it returns.  This is
+ * just a hint and may be ignored by the allocator.
+ *
+ * prealloc_dest -- if you have already reserved a destination for the cow,
+ * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
+ * is used to finish the allocation.
+ */
 int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
@@ -311,6 +357,11 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * cows a single block, see __btrfs_cow_block for the real work.
+ * This version of it has extra checks so that a block isn't cow'd more than
+ * once per transaction, as long as it hasn't been written yet
+ */
 int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -347,6 +398,10 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * helper function for defrag to decide if two blocks pointed to by a
+ * node are actually close by
+ */
 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 {
 	if (blocknr < other && other - (blocknr + blocksize) < 32768)
@@ -381,6 +436,11 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 }
 
 
+/*
+ * this is used by the defrag code to go through all the
+ * leaves pointed to by a node and reallocate them so that
+ * disk order is close to key order
+ */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
 		       int start_slot, int cache_only, u64 *last_ret,
@@ -521,6 +581,10 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
 	return btrfs_item_offset_nr(leaf, nr - 1);
 }
 
+/*
+ * extra debugging checks to make sure all the keys in a node are
+ * well formed and in the proper order
+ */
 static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
@@ -561,6 +625,10 @@ static int check_node(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
+/*
+ * extra checking to make sure all the items in a leaf are
+ * well formed and in the proper order
+ */
 static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		      int level)
 {
@@ -782,6 +850,10 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
+/* given a node and slot number, this reads the blocks it points to.  The
+ * extent buffer is returned with a reference taken (but unlocked).
+ * NULL is returned on error.
+ */
 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 				   struct extent_buffer *parent, int slot)
 {
@@ -798,6 +870,11 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 		       btrfs_node_ptr_generation(parent, slot));
 }
 
+/*
+ * node level balancing, used to make sure nodes are in proper order for
+ * item deletion.  We balance from the top down, so we have to make sure
+ * that a deletion won't leave a node completely empty later on.
+ */
 static noinline int balance_level(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 struct btrfs_path *path, int level)
@@ -1024,7 +1101,10 @@ enospc:
 	return ret;
 }
 
-/* returns zero if the push worked, non-zero otherwise */
+/* Node balancing for insertion.  Here we only split or push nodes around
+ * when they are completely full.  This is also done top down, so we
+ * have to be pessimistic.
+ */
 static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, int level)
@@ -1150,7 +1230,8 @@ static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
 }
 
 /*
- * readahead one full node of leaves
+ * readahead one full node of leaves, finding things that are close
+ * to the block in 'slot', and triggering ra on them.
  */
 static noinline void reada_for_search(struct btrfs_root *root,
 				      struct btrfs_path *path,
@@ -1226,6 +1307,19 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	}
 }
 
+/*
+ * when we walk down the tree, it is usually safe to unlock the higher layers in
+ * the tree.  The exceptions are when our path goes through slot 0, because operations
+ * on the tree might require changing key pointers higher up in the tree.
+ *
+ * callers might also have set path->keep_locks, which tells this code to
+ * keep the lock if the path points to the last slot in the block.  This is
+ * part of walking through the tree, and selecting the next slot in the higher
+ * block.
+ *
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
+ * so if lowest_unlock is 1, level 0 won't be unlocked
+ */
 static noinline void unlock_up(struct btrfs_path *path, int level,
 			       int lowest_unlock)
 {
@@ -2705,6 +2799,12 @@ again:
 	return ret;
 }
 
+/*
+ * make the item pointed to by the path smaller.  new_size indicates
+ * how small to make it, and from_end tells us if we just chop bytes
+ * off the end of the item or if we shift the item to chop bytes off
+ * the front.
+ */
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_path *path,
@@ -2818,6 +2918,9 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * make the item pointed to by the path bigger, data_size is the new size.
+ */
 int btrfs_extend_item(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, struct btrfs_path *path,
 		      u32 data_size)
@@ -2897,7 +3000,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 }
 
 /*
- * Given a key and some data, insert an item into the tree.
+ * Given a key and some data, insert items into the tree.
  * This does all the path init required, making room in the tree if needed.
  */
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -3046,9 +3149,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 /*
  * delete the pointer from a given node.
  *
- * If the delete empties a node, the node is removed from the tree,
- * continuing all the way the root if required.  The root is converted into
- * a leaf if all the nodes are emptied.
+ * the tree should have been previously balanced so the deletion does not
+ * empty a node.
  */
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot)
@@ -3233,6 +3335,9 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * search the tree again to find a leaf with lesser keys
  * returns 0 if it found something or 1 if there are no lesser leaves.
  * returns < 0 on io errors.
+ *
+ * This may release the path, and so you may lose any locks held at the
+ * time you call it.
  */
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
@@ -3265,9 +3370,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 /*
  * A helper function to walk down the tree starting at min_key, and looking
  * for nodes or leaves that are either in cache or have a minimum
- * transaction id.  This is used by the btree defrag code, but could
- * also be used to search for blocks that have changed since a given
- * transaction id.
+ * transaction id.  This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -3279,6 +3382,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  * This honors path->lowest_level to prevent descent past a given level
  * of the tree.
  *
+ * min_trans indicates the oldest transaction that you are interested
+ * in walking through.  Any nodes or leaves older than min_trans are
+ * skipped over (without reading them).
+ *
  * returns zero if something useful was found, < 0 on error and 1 if there
  * was nothing in the tree that matched the search criteria.
  */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0079b60b18f..ded1643c027 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -27,7 +27,6 @@
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <asm/kmap_types.h>
-#include "bit-radix.h"
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e4f30090d64..5040b71f190 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,6 +21,14 @@
 #include "hash.h"
 #include "transaction.h"
 
+/*
+ * insert a name into a directory, doing overflow properly if there is a hash
+ * collision.  data_size indicates how big the item inserted should be.  On
+ * success a struct btrfs_dir_item pointer is returned, otherwise it is
+ * an ERR_PTR.
+ *
+ * The name is not copied into the dir item, you have to do that yourself.
+ */
 static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 						   *trans,
 						   struct btrfs_root *root,
@@ -55,6 +63,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 	return (struct btrfs_dir_item *)ptr;
 }
 
+/*
+ * xattrs work a lot like directories, this inserts an xattr item
+ * into the tree
+ */
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, const char *name,
 			    u16 name_len, const void *data, u16 data_len,
@@ -109,6 +121,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * insert a directory item in the tree, doing all the magic for
+ * both indexes. 'dir' indicates which objectid to insert it into,
+ * 'location' is the key to stuff into the directory item, 'type' is the
+ * type of the inode we're pointing to, and 'index' is the sequence number
+ * to use for the second index (if one is created).
+ */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type, u64 index)
@@ -184,6 +203,11 @@ out:
 	return 0;
 }
 
+/*
+ * lookup a directory item based on name.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ */
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     struct btrfs_path *path, u64 dir,
@@ -222,6 +246,14 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+/*
+ * lookup a directory item based on index.  'dir' is the objectid
+ * we're searching in, and 'mod' tells us if you plan on deleting the
+ * item (use mod < 0) or changing the options (use mod > 0)
+ *
+ * The name is used to make sure the index really points to the name you were
+ * looking for.
+ */
 struct btrfs_dir_item *
 btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
@@ -282,6 +314,11 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+/*
+ * helper function to look at the directory item pointed to by 'path'
+ * this walks through all the entries in a dir item and finds one
+ * for a specific name.
+ */
 struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
@@ -313,6 +350,10 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	return NULL;
 }
 
+/*
+ * given a pointer into a directory item, delete it.  This
+ * handles items that have more than one entry in them.
+ */
 int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45b4f728527..5ee10d3136f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -55,6 +55,11 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
+/*
+ * end_io_wq structs are used to do processing in task context when an IO is
+ * complete.  This is used during reads to verify checksums, and it is used
+ * by writes to insert metadata for new file extents after IO is complete.
+ */
 struct end_io_wq {
 	struct bio *bio;
 	bio_end_io_t *end_io;
@@ -66,6 +71,11 @@ struct end_io_wq {
 	struct btrfs_work work;
 };
 
+/*
+ * async submit bios are used to offload expensive checksumming
+ * onto the worker threads.  They checksum file and metadata bios
+ * just before they are sent down the IO stack.
+ */
 struct async_submit_bio {
 	struct inode *inode;
 	struct bio *bio;
@@ -76,6 +86,10 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/*
+ * extents on the btree inode are pretty simple, there's one extent
+ * that covers the entire device
+ */
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 				    size_t page_offset, u64 start, u64 len,
 				    int create)
@@ -151,6 +165,10 @@ void btrfs_csum_final(u32 crc, char *result)
 	*(__le32 *)result = ~cpu_to_le32(crc);
 }
 
+/*
+ * compute the csum for a btree block, and either verify it or write it
+ * into the csum field of the block.
+ */
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
@@ -204,6 +222,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+/*
+ * we can't consider a given block up to date unless the transid of the
+ * block matches the transid in the parent node's pointer.  This is how we
+ * detect blocks that either didn't get written at all or got written
+ * in the wrong place.
+ */
 static int verify_parent_transid(struct extent_io_tree *io_tree,
 				 struct extent_buffer *eb, u64 parent_transid)
 {
@@ -228,9 +252,12 @@ out:
 	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
 		      GFP_NOFS);
 	return ret;
-
 }
 
+/*
+ * helper to read a given tree block, doing retries as required when
+ * the checksums don't match and we have alternate mirrors to try.
+ */
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 start, u64 parent_transid)
@@ -260,6 +287,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
 	return -EIO;
 }
 
+/*
+ * checksum a dirty tree block before IO.  This has extra checks to make
+ * sure we only fill in the checksum field in the first page of a multi-page block
+ */
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8bd1b402f3f..563b2d12f4f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -914,6 +914,10 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(wait_on_extent_writeback);
 
+/*
+ * either insert or lock state struct between start and end.  Use mask to tell
+ * us if waiting is desired.
+ */
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 {
 	int err;
@@ -982,6 +986,13 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(set_range_writeback);
 
+/*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits)
 {
@@ -1017,6 +1028,10 @@ out:
 }
 EXPORT_SYMBOL(find_first_extent_bit);
 
+/* find the first state struct with 'bits' set after 'start', and
+ * return it.  tree->lock must be held.  NULL will be returned if
+ * nothing was found after 'start'
+ */
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 						 u64 start, int bits)
 {
@@ -1046,8 +1061,14 @@ out:
 }
 EXPORT_SYMBOL(find_first_extent_bit_state);
 
-u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-			     u64 *start, u64 *end, u64 max_bytes)
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'.  start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
+					     u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1130,6 +1151,11 @@ out:
 	return found;
 }
 
+/*
+ * count the number of bytes in the tree that have a given bit(s)
+ * set.  This can be fairly slow, except for EXTENT_DIRTY which is
+ * cached.  The total number found is returned.
+ */
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end, u64 max_bytes,
 		     unsigned long bits)
@@ -1245,6 +1271,10 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 }
 EXPORT_SYMBOL(unlock_range);
 
+/*
+ * set the private field for a given byte offset in the tree.  If there isn't
+ * an extent_state there already, this does nothing.
+ */
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 {
 	struct rb_node *node;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 78ced11d18c..74b2a29880d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -114,6 +114,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
+/*
+ * search through the tree for an extent_map with a given offset.  If
+ * it can't be found, try to find some neighboring extents
+ */
 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
@@ -160,6 +164,10 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 	return NULL;
 }
 
+/*
+ * look for an offset in the tree, and if it can't be found, return
+ * the first offset we can find smaller than 'offset'.
+ */
 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 {
 	struct rb_node *prev;
@@ -170,6 +178,7 @@ static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
 	return ret;
 }
 
+/* check to see if two extent_map structs are adjacent and safe to merge */
 static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 {
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@@ -250,6 +259,7 @@ out:
 }
 EXPORT_SYMBOL(add_extent_mapping);
 
+/* simple helper to do math around the end of an extent, handling wrap */
 static u64 range_end(u64 start, u64 len)
 {
 	if (start + len < start)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1b7e51a9db0..3088a118448 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,9 @@
 #include "compat.h"
 
 
+/* simple helper to fault in pages and copy.  This should go away
+ * and be replaced with calls into generic code.
+ */
 static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
@@ -72,12 +75,19 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 	return page_fault ? -EFAULT : 0;
 }
 
+/*
+ * unlocks pages after btrfs_file_write is done with them
+ */
 static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
 		if (!pages[i])
 			break;
+		/* page checked is some magic around finding pages that
+		 * have been modified without going through btrfs_set_page_dirty.
+		 * Clear it here.
+		 */
 		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
 		mark_page_accessed(pages[i]);
@@ -85,6 +95,10 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
+/* this does all the hard work for inserting an inline extent into
+ * the btree.  Any existing inline extent is extended as required to make room,
+ * otherwise things are inserted as required into the btree
+ */
 static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 offset, size_t size,
@@ -228,6 +242,14 @@ fail:
 	return err;
 }
 
+/*
+ * after copy_from_user, pages need to be dirtied and we need to make
+ * sure holes are created between the current EOF and the start of
+ * any next extents (if required).
+ *
+ * this also makes the decision about creating an inline extent vs
+ * doing real data extents, marking pages dirty and delalloc as required.
+ */
 static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
@@ -362,6 +384,10 @@ out_unlock:
 	return err;
 }
 
+/*
+ * this drops all the extents in the cache that intersect the range
+ * [start, end].  Existing extents are split as required.
+ */
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned)
 {
@@ -536,6 +562,9 @@ out:
  * If an extent intersects the range but is not entirely inside the range
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
+ *
+ * inline_limit is used to tell this code which offsets in the file to keep
+ * if they contain inline extents.
  */
 int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
@@ -796,7 +825,9 @@ out:
 }
 
 /*
- * this gets pages into the page cache and locks them down
+ * this gets pages into the page cache and locks them down, it also properly
+ * waits for data=ordered extents to finish before allowing the pages to be
+ * modified.
  */
 static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
@@ -1034,6 +1065,17 @@ int btrfs_release_file(struct inode * inode, struct file * filp)
 	return 0;
 }
 
+/*
+ * fsync call for both files and directories.  This logs the inode into
+ * the tree log instead of forcing full commits whenever possible.
+ *
+ * It needs to call filemap_fdatawait so that all ordered extent updates
+ * in the metadata btree are up to date for copying to the log.
+ *
+ * It drops the inode mutex before doing the tree log commit.  This is an
+ * important optimization for directories because holding the mutex prevents
+ * new operations on the dir while we write to disk.
+ */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 404704d2682..f3abecc2d14 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static void btrfs_truncate(struct inode *inode);
 
+/*
+ * a very lame attempt at stopping writes when the FS is 85% full.  There
+ * are countless ways this is incorrect, but it is better than nothing.
+ */
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
@@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	return ret;
 }
 
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ */
 static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -185,6 +195,13 @@ out:
 	return ret;
 }
 
+/*
+ * the nocow writeback call back.  This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
 static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
 {
 	u64 extent_start;
@@ -291,6 +308,9 @@ out:
 	return err;
 }
 
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
 static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 	return ret;
 }
 
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
 int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
@@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
 int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
@@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio)
 {
@@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	return 0;
 }
 
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record
+ * are inserted into the btree
+ */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
@@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation on write,
+ * or reading the csums from the tree before a read
+ */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num)
 {
@@ -408,6 +452,10 @@ mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+/*
+ * given a list of ordered sums record them in the inode.  This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
@@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 				   GFP_NOFS);
 }
 
+/* see btrfs_writepage_start_hook for details on why this is required */
 struct btrfs_writepage_fixup {
 	struct page *page;
 	struct btrfs_work work;
 };
 
-/* see btrfs_writepage_start_hook for details on why this is required */
 void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
@@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	return -EAGAIN;
 }
 
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 	return 0;
 }
 
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
 int btrfs_clean_io_failures(struct inode *inode, u64 start)
 {
 	u64 private;
@@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
 	return 0;
 }
 
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish.  If not, we go through
+ * the io_failure_record routines to find good copies
+ */
 int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	btrfs_free_path(path);
 }
 
+/*
+ * read an inode from the btree into the in-memory inode
+ */
 void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
@@ -1083,6 +1155,9 @@ make_bad:
 	make_bad_inode(inode);
 }
 
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
 static void fill_inode_item(struct btrfs_trans_handle *trans,
 			    struct extent_buffer *leaf,
 			    struct btrfs_inode_item *item,
@@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 				    BTRFS_I(inode)->block_group->key.objectid);
 }
 
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
 int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode)
@@ -1151,6 +1229,11 @@ failed:
 }
 
 
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code.  It removes a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct inode *dir, struct inode *inode,
@@ -1309,7 +1392,7 @@ fail:
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
- * any higher than i_size.
+ * any higher than new_size
  *
  * csum items that cross the new i_size are truncated to the new size
  * as well.
@@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)
 	btrfs_end_transaction(trans, root);
 }
 
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
 static int btrfs_set_inode_index_count(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2175,6 +2263,10 @@ out:
 	return ret;
 }
 
+/*
+ * helper to find a free sequence number in a given directory.  This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
 static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
 				 u64 *index)
 {
@@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)
 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
 }
 
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a given name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
 int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index)
@@ -2611,6 +2709,10 @@ out_unlock:
 	return err;
 }
 
+/* helper for btrfs_get_extent.  Given an existing extent in the tree,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the new extent into the tree.
+ */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
 				struct extent_map *em,
@@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	return add_extent_mapping(em_tree, em);
 }
 
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the
+ * in-ram representation.  This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -2869,76 +2979,11 @@ out:
 	return em;
 }
 
-#if 0 /* waiting for O_DIRECT reads */
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
-			struct buffer_head *bh_result, int create)
-{
-	struct extent_map *em;
-	u64 start = (u64)iblock << inode->i_blkbits;
-	struct btrfs_multi_bio *multi = NULL;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 len;
-	u64 logical;
-	u64 map_length;
-	int ret = 0;
-
-	em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
-
-	if (!em || IS_ERR(em))
-		goto out;
-
-	if (em->start > start || em->start + em->len <= start) {
-	    goto out;
-	}
-
-	if (em->block_start == EXTENT_MAP_INLINE) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	len = em->start + em->len - start;
-	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
-	if (em->block_start == EXTENT_MAP_HOLE ||
-	    em->block_start == EXTENT_MAP_DELALLOC) {
-		bh_result->b_size = len;
-		goto out;
-	}
-
-	logical = start - em->start;
-	logical = em->block_start + logical;
-
-	map_length = len;
-	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      logical, &map_length, &multi, 0);
-	BUG_ON(ret);
-	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
-	bh_result->b_size = min(map_length, len);
-
-	bh_result->b_bdev = multi->stripes[0].dev->bdev;
-	set_buffer_mapped(bh_result);
-	kfree(multi);
-out:
-	free_extent_map(em);
-	return ret;
-}
-#endif
-
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	return -EINVAL;
-#if 0
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-
-	if (rw == WRITE)
-		return -EINVAL;
-
-	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-				  offset, nr_segs, btrfs_get_block, NULL);
-#endif
 }
 
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 	}
 }
 
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
 int btrfs_create_subvol_root(struct btrfs_root *new_root,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group)
@@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	return btrfs_update_inode(trans, new_root, inode);
 }
 
+/* helper function for file defrag and space balancing.  This
+ * forces readahead on a given range of bytes in an inode
+ */
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index)
@@ -3424,6 +3475,10 @@ out_unlock:
 	return ret;
 }
 
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 {
 	struct list_head *head = &root->fs_info->delalloc_inodes;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0cc314c10d6..e30aa6e2958 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,6 +25,15 @@
 #include "extent_io.h"
 #include "locking.h"
 
+/*
+ * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
+ * and the spin is not tuned very extensively.  The spinning does make a big
+ * difference in almost every workload, but spinning for the right amount of
+ * time needs some help.
+ *
+ * In general, we want to spin as long as the lock holder is doing btree searches,
+ * and we should give up if they are in more expensive code.
+ */
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
@@ -57,6 +66,10 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 	return mutex_is_locked(&eb->mutex);
 }
 
+/*
+ * btrfs_search_slot uses this to decide if it should drop its locks
+ * before doing something expensive like allocating free blocks for cow.
+ */
 int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
 {
 	int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 951eacff242..dcc1730dd83 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -26,7 +26,6 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 
-
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
 	if (entry->file_offset + entry->len < entry->file_offset)
@@ -34,6 +33,9 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
 	return entry->file_offset + entry->len;
 }
 
+/* returns NULL if the insertion worked, or it returns the node it did find
+ * in the tree
+ */
 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 				   struct rb_node *node)
 {
@@ -58,6 +60,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 	return NULL;
 }
 
+/*
+ * look for a given offset in the tree, and if it can't be found return the
+ * first lesser offset
+ */
 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 				     struct rb_node **prev_ret)
 {
@@ -108,6 +114,9 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	return NULL;
 }
 
+/*
+ * helper to check if a given offset is inside a given entry
+ */
 static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 {
 	if (file_offset < entry->file_offset ||
@@ -116,6 +125,10 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 	return 1;
 }
 
+/*
+ * find the first ordered struct that has this offset, otherwise
+ * the first one less than this offset
+ */
 static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 					  u64 file_offset)
 {
@@ -305,6 +318,10 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	return 0;
 }
 
+/*
+ * wait for all the ordered extents in a root.  This is done when balancing
+ * space between drives.
+ */
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 {
 	struct list_head splice;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 30fcb7aea5b..a50ebb67055 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,6 +21,16 @@
 #include "ref-cache.h"
 #include "transaction.h"
 
+/*
+ * leaf refs are used to cache the information about which extents
+ * a given leaf has references on.  This allows us to process that leaf
+ * in btrfs_drop_snapshot without needing to read it back from disk.
+ */
+
+/*
+ * kmalloc a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
 struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
 					    int nr_extents)
 {
@@ -40,6 +50,10 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
 	return ref;
 }
 
+/*
+ * free a leaf reference struct and update the counters for the
+ * total ref cache size
+ */
 void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	if (!ref)
@@ -135,6 +149,10 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 	return 0;
 }
 
+/*
+ * find the leaf ref for a given extent.  This returns the ref struct with
+ * a usage reference incremented
+ */
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr)
 {
@@ -160,6 +178,10 @@ again:
 	return NULL;
 }
 
+/*
+ * add a fully filled in leaf ref struct
+ * remove all the refs older than a given root generation
+ */
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 		       int shared)
 {
@@ -184,6 +206,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 	return ret;
 }
 
+/*
+ * remove a single leaf ref from the tree.  This drops the ref held by the tree
+ * only
+ */
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	struct btrfs_leaf_ref_tree *tree;
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 617564787f5..16f3183d7c5 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -19,8 +19,11 @@
 #define __REFCACHE__
 
 struct btrfs_extent_info {
+	/* bytenr and num_bytes find the extent in the extent allocation tree */
 	u64 bytenr;
 	u64 num_bytes;
+
+	/* objectid and offset find the back reference for the file */
 	u64 objectid;
 	u64 offset;
 };
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0091c01abb0..eb7f7655e9d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,8 +22,10 @@
 #include "print-tree.h"
 
 /*
- * returns 0 on finding something, 1 if no more roots are there
- * and < 0 on error
+ *  search forward for a root, starting with objectid 'search_start'
+ *  if a root key is found, the objectid we find is filled into 'found_objectid'
+ *  and 0 is returned.  < 0 is returned on error, 1 if there is nothing
+ *  left in the tree.
  */
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid)
@@ -66,6 +68,11 @@ out:
 	return ret;
 }
 
+/*
+ * lookup the root with the highest offset for a given objectid.  The key we do
+ * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
+ * on error.
+ */
 int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 			struct btrfs_root_item *item, struct btrfs_key *key)
 {
@@ -104,6 +111,9 @@ out:
 	return ret;
 }
 
+/*
+ * copy the data in 'item' into the btree
+ */
 int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_root_item
 		      *item)
@@ -147,6 +157,12 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
+/*
+ * at mount time we want to find all the old transaction snapshots that were in
+ * the process of being deleted if we crashed.  This is any root item with an offset
+ * lower than the latest root.  They need to be queued for deletion to finish
+ * what was happening when we crashed.
+ */
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest)
 {
@@ -227,6 +243,7 @@ err:
 	return ret;
 }
 
+/* drop the root item for 'key' from 'root' */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key)
 {
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index ad03a32d111..cdedbe144d4 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -17,6 +17,27 @@
  */
 
 #include <linux/highmem.h>
+
+/* this is some deeply nasty code.  ctree.h has a different
+ * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+ *
+ * The end result is that anyone who #includes ctree.h gets a
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions
+ *
+ * This file declares the macros and then #includes ctree.h, which results
+ * in cpp creating the function here based on the template below.
+ *
+ * These setget functions do all the extent_buffer related mapping
+ * required to efficiently read and write specific fields in the extent
+ * buffers.  Every pointer to metadata items in btrfs is really just
+ * an unsigned long offset into the extent buffer which has been
+ * cast to a specific type.  This gives us all the gcc type checking.
+ *
+ * The extent buffer api is used to do all the kmapping and page
+ * spanning work required to get extent buffers in highmem and have
+ * a metadata blocksize different from the page size.
+ */
+
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
 u##bits btrfs_##name(struct extent_buffer *eb,				\
 				   type *s)				\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8399d6d05d6..2e6039825b7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -519,6 +519,9 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+/*
+ * used by btrfsctl to scan devices when no FS is mounted
+ */
 static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 444abe0796a..11266d68a6c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -46,6 +46,9 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	}
 }
 
+/*
+ * either allocate a new transaction or hop into the existing one
+ */
 static noinline int join_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -85,6 +88,12 @@ static noinline int join_transaction(struct btrfs_root *root)
 	return 0;
 }
 
+/*
+ * this does all the record keeping required to make sure that a
+ * reference counted root is properly recorded in a given transaction.
+ * This is required to make sure the old root from before we joined the transaction
+ * is deleted when the transaction commits
+ */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
 	struct btrfs_dirty_root *dirty;
@@ -127,6 +136,10 @@ noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 	return 0;
 }
 
+/* wait for commit against the current transaction to become unblocked
+ * when this is done, it is safe to start a new transaction, but the current
+ * transaction might not be fully on disk.
+ */
 static void wait_current_trans(struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans;
@@ -198,7 +211,7 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
 	return start_transaction(r, num_blocks, 2);
 }
 
-
+/* wait for a transaction commit to be fully complete */
 static noinline int wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
@@ -218,6 +231,10 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+/*
+ * rate limit against the drop_snapshot code.  This helps to slow down new operations
+ * if the drop_snapshot code isn't able to keep up.
+ */
 static void throttle_on_drops(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -302,7 +319,11 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 	return __btrfs_end_transaction(trans, root, 1);
 }
 
-
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages)
 {
@@ -393,6 +414,16 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 					   &trans->transaction->dirty_pages);
 }
 
+/*
+ * this is used to update the root pointer in the tree of tree roots.
+ *
+ * But, in the case of the extent allocation tree, updating the root
+ * pointer may allocate blocks which may change the root of the extent
+ * allocation tree.
+ *
+ * So, this loops and repeats and makes sure the cowonly root didn't
+ * change while the root pointer was being updated in the metadata.
+ */
 static int update_cowonly_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
@@ -418,6 +449,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * update all the cowonly tree roots on disk
+ */
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
@@ -433,6 +467,11 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * dead roots are old snapshots that need to be deleted.  This allocates
+ * a dirty root struct and adds it into the list of dead roots that need to
+ * be deleted
+ */
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 {
 	struct btrfs_dirty_root *dirty;
@@ -449,6 +488,12 @@ int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
 	return 0;
 }
 
+/*
+ * at transaction commit time we need to schedule the old roots for
+ * deletion via btrfs_drop_snapshot.  This runs through all the
+ * reference counted roots that were modified in the current
+ * transaction and puts them into the drop list
+ */
 static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 				    struct radix_tree_root *radix,
 				    struct list_head *list)
@@ -541,6 +586,10 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	return err;
 }
 
+/*
+ * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
+ * otherwise every leaf in the btree is read and defragged.
+ */
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 {
 	struct btrfs_fs_info *info = root->fs_info;
@@ -570,6 +619,10 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	return 0;
 }
 
+/*
+ * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
+ * all of them
+ */
 static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 				     struct list_head *list)
 {
@@ -664,6 +717,10 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	return ret;
 }
 
+/*
+ * new snapshots need to be created at a very specific time in the
+ * transaction commit.  This does the actual creation
+ */
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -734,6 +791,9 @@ fail:
 	return ret;
 }
 
+/*
+ * create all the snapshots we've scheduled for creation
+ */
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
 {
@@ -944,6 +1004,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * interface function to delete all the snapshots we have scheduled for deletion
+ */
 int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
 	struct list_head dirty_roots;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b3bb5bbad76..6f57d0889b1 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,6 +23,10 @@
 #include "transaction.h"
 #include "locking.h"
 
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read things
+ * from disk, otherwise read all the leaves and try to get key order to
+ * better reflect disk order
+ */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
-- 
cgit v1.2.3


From 45b8c9a8b1e15bf79c2c17ec217adf96785f8011 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 30 Sep 2008 14:40:06 -0400
Subject: Btrfs: fix seekiness due to finding the wrong block group

This patch fixes a problem where we end up seeking too much when *last_ptr is
valid.  This happens because btrfs_lookup_first_block_group only returns a
block group that starts on or after the given search start, so if the
search_start is in the middle of a block group it will return the block group
after the given search_start, which is suboptimal.

This patch fixes that by doing a btrfs_lookup_block_group, which will return
the block group that contains the given search start.  If we fail to find a
block group, we fall back on btrfs_lookup_first_block_group so we can find the
next block group.  I am not sure if this is absolutely needed, but better safe
than sorry.

Also if we can't find the block group that we need, or it happens to not be of
the right type, we need to add empty_cluster since *last_ptr could point to a
mismatched block group, which means we need to start over with empty_cluster
added to total needed.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8043b9d584a..64e14ddf623 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2238,7 +2238,10 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	total_needed += empty_size;
 
 new_group:
-	block_group = btrfs_lookup_first_block_group(info, search_start);
+	block_group = btrfs_lookup_block_group(info, search_start);
+	if (!block_group)
+		block_group = btrfs_lookup_first_block_group(info,
+							     search_start);
 
 	/*
 	 * Ok this looks a little tricky, buts its really simple.  First if we
@@ -2255,8 +2258,10 @@ new_group:
 	if (!block_group || (!block_group_bits(block_group, data) &&
 			     last_ptr && *last_ptr)) {
 		if (search_start != orig_search_start) {
-			if (last_ptr && *last_ptr)
+			if (last_ptr && *last_ptr) {
+				total_needed += empty_cluster;
 				*last_ptr = 0;
+			}
 			search_start = orig_search_start;
 			goto new_group;
 		} else if (!chunk_alloc_done && allowed_chunk_alloc) {
-- 
cgit v1.2.3


From 75ccf47d13bfb66de7faf596bfe497b9af7aaa40 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 30 Sep 2008 19:24:06 -0400
Subject: Btrfs: fix multi-device code to use raid policies set by mkfs

When reading in block groups, a global mask of the available raid policies
should be adjusted based on the types of block groups found on disk.  This
global mask is then used to decide which raid policy to use for new
block groups.

The recent allocator changes dropped the call that updated the global
mask, making all the block groups allocated at run time single striped
onto a single drive.

This also fixes the async worker threads to set any thread that uses
the requeue mechanism as busy.  This allows us to avoid blocking
on get_request_wait for the async bio submission threads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 14 +++++++++++++-
 fs/btrfs/extent-tree.c  |  2 ++
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 04fb9702d14..d82efd722a4 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -302,8 +302,20 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
-	check_busy_worker(worker);
+
+	/* by definition we're busy, take ourselves off the idle
+	 * list
+	 */
+	if (worker->idle) {
+		spin_lock_irqsave(&worker->workers->lock, flags);
+		worker->idle = 0;
+		list_move_tail(&worker->worker_list,
+			       &worker->workers->worker_list);
+		spin_unlock_irqrestore(&worker->workers->lock, flags);
+	}
+
 	spin_unlock_irqrestore(&worker->lock, flags);
+
 out:
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 64e14ddf623..677d5e774fa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5145,6 +5145,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
 		BUG_ON(ret);
+
+		set_avail_alloc_bits(root->fs_info, cache->flags);
 	}
 	ret = 0;
 error:
-- 
cgit v1.2.3


From 0463bb4e8d2f717a9bf3be6cc12c0aec51cc261d Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Wed, 1 Oct 2008 19:09:04 -0400
Subject: Btrfs: disk-io.c (open_ctree): Don't deref. NULL upon failed kzalloc

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5ee10d3136f..41b7d24d07e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1371,7 +1371,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	struct btrfs_super_block *disk_super;
 
-	if (!extent_root || !tree_root || !fs_info) {
+	if (!extent_root || !tree_root || !fs_info ||
+	    !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-- 
cgit v1.2.3


From 83afeac42c5680b0b70d64fb8c4724cf05483fc2 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Wed, 1 Oct 2008 19:09:51 -0400
Subject: Btrfs: disk-io.c (open_ctree): avoid leaks upon allocation failure

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b7d24d07e..0be044bb619 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1699,6 +1699,8 @@ fail:
 	kfree(tree_root);
 	bdi_destroy(&fs_info->bdi);
 	kfree(fs_info);
+	kfree(chunk_root);
+	kfree(dev_root);
 	return ERR_PTR(err);
 }
 
-- 
cgit v1.2.3


From cf749823857230017c86504bfdc70524f929ba96 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 1 Oct 2008 19:11:18 -0400
Subject: Btrfs: fix deadlock between alloc_mutex/chunk_mutex

This fixes a deadlock that happens between the alloc_mutex and chunk_mutex.
Process A comes in, decides to do a do_chunk_alloc, which takes the
chunk_mutex, and is holding the alloc_mutex because the only way you get to
do_chunk_alloc is by holding the alloc_mutex.  btrfs_alloc_chunk does its thing
and goes to insert a new item, which results in a cow of the block.

We get into del_pending_extents from there, where if we need to be rescheduled
we drop the alloc_mutex and schedule.  At this point process B comes in to do
an allocation and gets the alloc_mutex, and because process A did not do the
chunk allocation completely it thinks it's a good time to do a chunk allocation
as well, and hangs on the chunk_mutex.

Process A wakes up and tries to take the alloc_mutex and cannot.  The way to
fix this is do a mutex_trylock() on chunk_mutex.  If we return 0 we didn't get
the lock, and if this is just a "hey it may be a good time to allocate a chunk"
then we just exit.  If we are trying to force an allocation then we reschedule
and keep trying to acquire the chunk_mutex.  If once we acquire it the space is
already full then we can just exit, otherwise we can continue with the chunk
allocation.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 677d5e774fa..db37b867e4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1505,7 +1505,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 thresh;
 	u64 start;
 	u64 num_bytes;
-	int ret = 0;
+	int ret = 0, waited = 0;
 
 	flags = reduce_alloc_profile(extent_root, flags);
 
@@ -1530,7 +1530,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	    space_info->bytes_reserved + alloc_bytes) < thresh)
 		goto out;
 
-	mutex_lock(&extent_root->fs_info->chunk_mutex);
+	while (!mutex_trylock(&extent_root->fs_info->chunk_mutex)) {
+		if (!force)
+			goto out;
+		mutex_unlock(&extent_root->fs_info->alloc_mutex);
+		cond_resched();
+		mutex_lock(&extent_root->fs_info->alloc_mutex);
+		waited = 1;
+	}
+
+	if (waited && space_info->full)
+		goto out_unlock;
+
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
 	if (ret == -ENOSPC) {
 printk("space info full %Lu\n", flags);
-- 
cgit v1.2.3


From 323ac95bce442bbde514e3ce57e840402f80d909 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 1 Oct 2008 19:05:46 -0400
Subject: Btrfs: don't read leaf blocks containing only checksums during
 truncate

Checksum items take up a significant portion of the metadata for large files.
It is possible to avoid reading them during truncates by checking the keys in
the higher level nodes.

If a given leaf is followed by another leaf where the lowest key is a checksum
item from the same file, we know we can safely delete the leaf without
reading it.

For a 32GB file on a 6 drive raid0 array, Btrfs needs 8s to delete
the file with a cold cache.  It is read bound during the run.

With this change, Btrfs is able to delete the file in 0.5s

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c |  69 +++++++++++++------------
 fs/btrfs/ctree.h |   4 +-
 fs/btrfs/inode.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 193 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ff3261ff2e1..2eab4643dcb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1388,7 +1388,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key prealloc_block;
 
 	lowest_level = p->lowest_level;
-	WARN_ON(lowest_level && ins_len);
+	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(cow && root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
@@ -3186,6 +3186,36 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * a helper function to delete the leaf pointed to by path->slots[1] and
+ * path->nodes[1].  bytenr is the node block pointer, but since the callers
+ * already know it, it is faster to have them pass it down than to
+ * read it out of the node again.
+ *
+ * This deletes the pointer in path->nodes[1] and frees the leaf
+ * block extent.  zero is returned if it all worked out, < 0 otherwise.
+ *
+ * The path must have already been setup for deleting the leaf, including
+ * all the proper balancing.  path->nodes[1] must be locked.
+ */
+noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 bytenr)
+{
+	int ret;
+	u64 root_gen = btrfs_header_generation(path->nodes[1]);
+
+	ret = del_ptr(trans, root, path, 1, path->slots[1]);
+	if (ret)
+		return ret;
+
+	ret = btrfs_free_extent(trans, root, bytenr,
+				btrfs_level_size(root, 0),
+				path->nodes[1]->start,
+				btrfs_header_owner(path->nodes[1]),
+				root_gen, 0, 0, 1);
+	return ret;
+}
 /*
  * delete the item at the leaf level in path.  If that empties
  * the leaf, remove it from the tree
@@ -3251,17 +3281,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
-			u64 root_gen = btrfs_header_generation(path->nodes[1]);
-			wret = del_ptr(trans, root, path, 1, path->slots[1]);
-			if (wret)
-				ret = wret;
-			wret = btrfs_free_extent(trans, root,
-					 leaf->start, leaf->len,
-					 path->nodes[1]->start,
-					 btrfs_header_owner(path->nodes[1]),
-					 root_gen, 0, 0, 1);
-			if (wret)
-				ret = wret;
+			ret = btrfs_del_leaf(trans, root, path, leaf->start);
+			BUG_ON(ret);
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, nritems);
@@ -3296,24 +3317,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 
 			if (btrfs_header_nritems(leaf) == 0) {
-				u64 root_gen;
-				u64 bytenr = leaf->start;
-				u32 blocksize = leaf->len;
-
-				root_gen = btrfs_header_generation(
-							   path->nodes[1]);
-
-				wret = del_ptr(trans, root, path, 1, slot);
-				if (wret)
-					ret = wret;
-
+				path->slots[1] = slot;
+				ret = btrfs_del_leaf(trans, root, path, leaf->start);
+				BUG_ON(ret);
 				free_extent_buffer(leaf);
-				wret = btrfs_free_extent(trans, root, bytenr,
-					     blocksize, path->nodes[1]->start,
-					     btrfs_header_owner(path->nodes[1]),
-					     root_gen, 0, 0, 1);
-				if (wret)
-					ret = wret;
 			} else {
 				/* if we're still in the path, make sure
 				 * we're dirty.  Otherwise, one of the
@@ -3418,8 +3425,8 @@ again:
 		level = btrfs_header_level(cur);
 		sret = bin_search(cur, min_key, level, &slot);
 
-		/* at level = 0, we're done, setup the path and exit */
-		if (level == 0) {
+		/* at the lowest level, we're done, setup the path and exit */
+		if (level == path->lowest_level) {
 			if (slot >= nritems)
 				goto find_next_key;
 			ret = 0;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ded1643c027..94e0cdfddc0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1649,7 +1649,9 @@ void btrfs_free_path(struct btrfs_path *p);
 void btrfs_init_path(struct btrfs_path *p);
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
-
+int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f3abecc2d14..e5c9261dcba 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1389,6 +1389,154 @@ fail:
 	return err;
 }
 
+/*
+ * when truncating bytes in a file, it is possible to avoid reading
+ * the leaves that contain only checksum items.  This can be the
+ * majority of the IO required to delete a large file, but it must
+ * be done carefully.
+ *
+ * The keys in the level just above the leaves are checked to make sure
+ * the lowest key in a given leaf is a csum key, and starts at an offset
+ * after the new  size.
+ *
+ * Then the key for the next leaf is checked to make sure it also has
+ * a checksum item for the same file.  If it does, we know our target leaf
+ * contains only checksum items, and it can be safely freed without reading
+ * it.
+ *
+ * This is just an optimization targeted at large files.  It may do
+ * nothing.  It will return 0 unless things went badly.
+ */
+static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct btrfs_path *path,
+				     struct inode *inode, u64 new_size)
+{
+	struct btrfs_key key;
+	int ret;
+	int nritems;
+	struct btrfs_key found_key;
+	struct btrfs_key other_key;
+
+	path->lowest_level = 1;
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_CSUM_ITEM_KEY;
+	key.offset = new_size;
+again:
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (path->nodes[1] == NULL) {
+		ret = 0;
+		goto out;
+	}
+	ret = 0;
+	btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
+	nritems = btrfs_header_nritems(path->nodes[1]);
+
+	if (!nritems)
+		goto out;
+
+	if (path->slots[1] >= nritems)
+		goto next_node;
+
+	/* did we find a key greater than anything we want to delete? */
+	if (found_key.objectid > inode->i_ino ||
+	   (found_key.objectid == inode->i_ino && found_key.type > key.type))
+		goto out;
+
+	/* we check the next key in the node to make sure the leaf contains
+	 * only checksum items.  This comparison doesn't work if our
+	 * leaf is the last one in the node
+	 */
+	if (path->slots[1] + 1 >= nritems) {
+next_node:
+		/* search forward from the last key in the node, this
+		 * will bring us into the next node in the tree
+		 */
+		btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
+
+		/* unlikely, but we inc below, so check to be safe */
+		if (found_key.offset == (u64)-1)
+			goto out;
+
+		/* search_forward needs a path with locks held, do the
+		 * search again for the original key.  It is possible
+		 * this will race with a balance and return a path that
+		 * we could modify, but this drop is just an optimization
+		 * and is allowed to miss some leaves.
+		 */
+		btrfs_release_path(root, path);
+		found_key.offset++;
+
+		/* setup a max key for search_forward */
+		other_key.offset = (u64)-1;
+		other_key.type = key.type;
+		other_key.objectid = key.objectid;
+
+		path->keep_locks = 1;
+		ret = btrfs_search_forward(root, &found_key, &other_key,
+					   path, 0, 0);
+		path->keep_locks = 0;
+		if (ret || found_key.objectid != key.objectid ||
+		    found_key.type != key.type) {
+			ret = 0;
+			goto out;
+		}
+
+		key.offset = found_key.offset;
+		btrfs_release_path(root, path);
+		cond_resched();
+		goto again;
+	}
+
+	/* we know there's one more slot after us in the tree,
+	 * read that key so we can verify it is also a checksum item
+	 */
+	btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
+
+	if (found_key.objectid < inode->i_ino)
+		goto next_key;
+
+	if (found_key.type != key.type || found_key.offset < new_size)
+		goto next_key;
+
+	/*
+	 * if the key for the next leaf isn't a csum key from this objectid,
+	 * we can't be sure there aren't good items inside this leaf.
+	 * Bail out
+	 */
+	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
+		goto out;
+
+	/*
+	 * it is safe to delete this leaf, it contains only
+	 * csum items from this inode at an offset >= new_size
+	 */
+	ret = btrfs_del_leaf(trans, root, path,
+			     btrfs_node_blockptr(path->nodes[1],
+						 path->slots[1]));
+	BUG_ON(ret);
+
+next_key:
+	btrfs_release_path(root, path);
+
+	if (other_key.objectid == inode->i_ino &&
+	    other_key.type == key.type && other_key.offset > key.offset) {
+		key.offset = other_key.offset;
+		cond_resched();
+		goto again;
+	}
+	ret = 0;
+out:
+	/* fixup any changes we've made to the path */
+	path->lowest_level = 0;
+	path->keep_locks = 0;
+	btrfs_release_path(root, path);
+	return ret;
+}
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.type = (u8)-1;
 
 	btrfs_init_path(path);
+
+	ret = drop_csum_leaves(trans, root, path, inode, new_size);
+	BUG_ON(ret);
+
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
-- 
cgit v1.2.3


From cb843a6f513a1a91c54951005e60bd9b95bdf973 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 12:30:02 -0400
Subject: Btrfs: O_DIRECT writes via buffered writes + invalidate

This reworks the btrfs O_DIRECT write code a bit.  It had always fallen
back to buffered IO and done an invalidate, but needed to be updated
for the data=ordered code.  The invalidate wasn't actually removing pages
because they were still inside an ordered extent.

This also combines the O_DIRECT/O_SYNC paths where possible, and kicks
off IO in the main btrfs_file_write loop to keep the pipe down to the
disk full as we process long writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c         | 64 +++++++++++++++++++++++++++----------------------
 fs/btrfs/ordered-data.c |  3 ++-
 fs/btrfs/ordered-data.h |  2 +-
 3 files changed, 38 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3088a118448..a03d1bbb19a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -905,6 +905,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
+	int will_write;
+
+	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
+		      (file->f_flags & O_DIRECT));
 
 	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
 		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
@@ -1001,15 +1005,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		if (ret)
 			goto out;
 
+		if (will_write) {
+			btrfs_fdatawrite_range(inode->i_mapping, pos,
+					       pos + write_bytes - 1,
+					       WB_SYNC_NONE);
+		} else {
+			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+							   num_pages);
+			if (num_pages <
+			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+				btrfs_btree_balance_dirty(root, 1);
+			btrfs_throttle(root);
+		}
+
 		buf += write_bytes;
 		count -= write_bytes;
 		pos += write_bytes;
 		num_written += write_bytes;
 
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages);
-		if (num_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-			btrfs_btree_balance_dirty(root, 1);
-		btrfs_throttle(root);
 		cond_resched();
 	}
 out:
@@ -1023,36 +1036,29 @@ out_nolock:
 		page_cache_release(pinned[1]);
 	*ppos = pos;
 
-	if (num_written > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if (num_written > 0 && will_write) {
 		struct btrfs_trans_handle *trans;
 
-		err = btrfs_fdatawrite_range(inode->i_mapping, start_pos,
-					     start_pos + num_written -1,
-					     WB_SYNC_NONE);
-		if (err < 0)
-			num_written = err;
-
-		err = btrfs_wait_on_page_writeback_range(inode->i_mapping,
-				 start_pos, start_pos + num_written - 1);
-		if (err < 0)
+		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
+		if (err)
 			num_written = err;
 
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-		if (ret == 0) {
-			btrfs_sync_log(trans, root);
-			btrfs_end_transaction(trans, root);
-		} else {
-			btrfs_commit_transaction(trans, root);
+		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
+			trans = btrfs_start_transaction(root, 1);
+			ret = btrfs_log_dentry_safe(trans, root,
+						    file->f_dentry);
+			if (ret == 0) {
+				btrfs_sync_log(trans, root);
+				btrfs_end_transaction(trans, root);
+			} else {
+				btrfs_commit_transaction(trans, root);
+			}
+		}
+		if (file->f_flags & O_DIRECT) {
+			invalidate_mapping_pages(inode->i_mapping,
+			      start_pos >> PAGE_CACHE_SHIFT,
+			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 		}
-	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-		do_sync_mapping_range(inode->i_mapping, start_pos,
-				      start_pos + num_written - 1,
-				      SYNC_FILE_RANGE_WRITE |
-				      SYNC_FILE_RANGE_WAIT_AFTER);
-		invalidate_mapping_pages(inode->i_mapping,
-		      start_pos >> PAGE_CACHE_SHIFT,
-		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index dcc1730dd83..2eb6caba57c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -397,7 +397,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
 	u64 orig_end;
@@ -451,6 +451,7 @@ again:
 		       (unsigned long long)orig_end);
 		goto again;
 	}
+	return 0;
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index fd45519f30a..f50f8870a14 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -135,7 +135,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
-- 
cgit v1.2.3


From 30c43e2444c16afe3b2130f40ad273541bf3dc36 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 12:24:01 -0400
Subject: Btrfs: remove last_log_alloc allocator optimization

The tree logging code was trying to separate tree log allocations
from normal metadata allocations to improve writeback patterns during
an fsync.

But, the code was not effective and ended up just mixing tree log
blocks with regular metadata.  That seems to be working fairly well,
so the last_log_alloc code can be removed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 1 -
 fs/btrfs/extent-tree.c | 7 -------
 fs/btrfs/transaction.c | 1 -
 3 files changed, 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 94e0cdfddc0..8566eb30f56 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -672,7 +672,6 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
-	u64 last_log_alloc;
 
 	spinlock_t ref_cache_lock;
 	u64 total_ref_cache_size;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db37b867e4f..29380467b67 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2228,13 +2228,6 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
 		last_ptr = &root->fs_info->last_data_alloc;
 
-	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-		last_ptr = &root->fs_info->last_log_alloc;
-		if (!last_ptr == 0 && root->fs_info->last_alloc) {
-			*last_ptr = root->fs_info->last_alloc + empty_cluster;
-		}
-	}
-
 	if (last_ptr) {
 		if (*last_ptr)
 			hint_byte = *last_ptr;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 11266d68a6c..5ecc24d634a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -61,7 +61,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 		root->fs_info->generation++;
 		root->fs_info->last_alloc = 0;
 		root->fs_info->last_data_alloc = 0;
-		root->fs_info->last_log_alloc = 0;
 		cur_trans->num_writers = 1;
 		cur_trans->num_joined = 0;
 		cur_trans->transid = root->fs_info->generation;
-- 
cgit v1.2.3


From a62b940160d8125016e85046e68ae621c99e751f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 16:31:08 -0400
Subject: Btrfs: cast bio->bi_sector to a u64 before shifting

On 32-bit machines without CONFIG_LBD, the bi_sector field is only 32 bits
wide.  Btrfs needs to cast it to u64 before shifting it left by 9; otherwise
the high bits are lost and we end up doing IO into the wrong place.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c   | 2 +-
 fs/btrfs/volumes.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e5c9261dcba..ff0c3597665 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -386,7 +386,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
-	u64 logical = bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	int ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f63cf7621a0..2eed7f91f51 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2187,7 +2187,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
-	u64 logical = bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	struct btrfs_multi_bio *multi = NULL;
-- 
cgit v1.2.3


From a76a3cd40c1127ca199d4f7f37bf0d541bf44eb2 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 9 Oct 2008 11:46:29 -0400
Subject: Btrfs: Count space allocated to file in bytes

This patch makes btrfs count the space allocated to a file in bytes instead
of in 512-byte sectors.

Everything else in btrfs uses a byte count instead of sector sizes or
block sizes, so this fits better.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       | 13 ++-----------
 fs/btrfs/extent-tree.c |  2 +-
 fs/btrfs/file.c        | 24 +++++++++++++-----------
 fs/btrfs/inode.c       | 23 ++++++++++++-----------
 fs/btrfs/ioctl.c       |  4 ++--
 fs/btrfs/tree-log.c    |  6 +++---
 6 files changed, 33 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8566eb30f56..50fbcc9ec45 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -411,7 +411,7 @@ struct btrfs_inode_item {
 	/* transid that last touched this inode */
 	__le64 transid;
 	__le64 size;
-	__le64 nblocks;
+	__le64 nbytes;
 	__le64 block_group;
 	__le32 nlink;
 	__le32 uid;
@@ -1017,7 +1017,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_FUNCS(inode_nblocks, struct btrfs_inode_item, nblocks, 64);
+BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
 BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64);
 BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32);
 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
@@ -1814,15 +1814,6 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio);
 
-static inline void dec_i_blocks(struct inode *inode, u64 dec)
-{
-	dec = dec >> 9;
-	if (dec <= inode->i_blocks)
-		inode->i_blocks -= dec;
-	else
-		inode->i_blocks = 0;
-}
-
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 29380467b67..69db54e09fb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3930,7 +3930,7 @@ next:
 				BUG_ON(ret);
 				btrfs_release_path(root, path);
 
-				inode->i_blocks += extent_len >> 9;
+				inode_add_bytes(inode, extent_len);
 
 				ext_offset = 0;
 				num_bytes -= extent_len;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a03d1bbb19a..18dfdf5f91d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -193,7 +193,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 			leaf = path->nodes[0];
 			ei = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
-			inode->i_blocks += (offset + size - found_end) >> 9;
+			inode_add_bytes(inode, offset + size - found_end);
 		}
 		if (found_end < offset) {
 			ptr = btrfs_file_extent_inline_start(ei) + found_size;
@@ -203,7 +203,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 insert:
 		btrfs_release_path(root, path);
 		datasize = offset + size - key.offset;
-		inode->i_blocks += datasize >> 9;
+		inode_add_bytes(inode, datasize);
 		datasize = btrfs_file_extent_calc_inline_size(datasize);
 		ret = btrfs_insert_empty_item(trans, root, path, &key,
 					      datasize);
@@ -713,7 +713,8 @@ next_slot:
 								      extent);
 				if (btrfs_file_extent_disk_bytenr(leaf,
 								  extent)) {
-					dec_i_blocks(inode, old_num - new_num);
+					inode_sub_bytes(inode, old_num -
+							new_num);
 				}
 				btrfs_set_file_extent_num_bytes(leaf, extent,
 								new_num);
@@ -724,14 +725,17 @@ next_slot:
 				u32 new_size;
 				new_size = btrfs_file_extent_calc_inline_size(
 						   inline_limit - key.offset);
-				dec_i_blocks(inode, (extent_end - key.offset) -
-					(inline_limit - key.offset));
+				inode_sub_bytes(inode, extent_end -
+						inline_limit);
 				btrfs_truncate_item(trans, root, path,
 						    new_size, 1);
 			}
 		}
 		/* delete the entire extent */
 		if (!keep) {
+			if (found_inline)
+				inode_sub_bytes(inode, extent_end -
+						key.offset);
 			ret = btrfs_del_item(trans, root, path);
 			/* TODO update progress marker and return */
 			BUG_ON(ret);
@@ -743,8 +747,7 @@ next_slot:
 			u32 new_size;
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
-			dec_i_blocks(inode, (extent_end - key.offset) -
-					(extent_end - end));
+			inode_sub_bytes(inode, end - key.offset);
 			ret = btrfs_truncate_item(trans, root, path,
 						  new_size, 0);
 			BUG_ON(ret);
@@ -791,9 +794,7 @@ next_slot:
 			}
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0) {
-				inode->i_blocks +=
-				      btrfs_file_extent_num_bytes(leaf,
-								  extent) >> 9;
+				inode_add_bytes(inode, extent_end - end);
 			}
 		}
 
@@ -801,7 +802,8 @@ next_slot:
 			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
 
 			if (disk_bytenr != 0) {
-				dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
+				inode_sub_bytes(inode,
+						le64_to_cpu(old.num_bytes));
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ff0c3597665..f9df89c5fdf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -652,7 +652,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
-	inode->i_blocks += ordered_extent->len >> 9;
+	inode_add_bytes(inode, ordered_extent->len);
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
@@ -1104,7 +1104,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
 
-	inode->i_blocks = btrfs_inode_nblocks(leaf, inode_item);
+	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
@@ -1184,7 +1184,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
 				inode->i_ctime.tv_nsec);
 
-	btrfs_set_inode_nblocks(leaf, item, inode->i_blocks);
+	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
@@ -1679,7 +1679,7 @@ search_again:
 				num_dec = (orig_num_bytes -
 					   extent_num_bytes);
 				if (root->ref_cows && extent_start != 0)
-					dec_i_blocks(inode, num_dec);
+					inode_sub_bytes(inode, num_dec);
 				btrfs_mark_buffer_dirty(leaf);
 			} else {
 				extent_num_bytes =
@@ -1690,7 +1690,7 @@ search_again:
 				if (extent_start != 0) {
 					found_extent = 1;
 					if (root->ref_cows)
-						dec_i_blocks(inode, num_dec);
+						inode_sub_bytes(inode, num_dec);
 				}
 				root_gen = btrfs_header_generation(leaf);
 				root_owner = btrfs_header_owner(leaf);
@@ -1700,8 +1700,8 @@ search_again:
 				u32 size = new_size - found_key.offset;
 
 				if (root->ref_cows) {
-					dec_i_blocks(inode, item_end + 1 -
-						    found_key.offset - size);
+					inode_sub_bytes(inode, item_end + 1 -
+							new_size);
 				}
 				size =
 				    btrfs_file_extent_calc_inline_size(size);
@@ -1709,8 +1709,8 @@ search_again:
 							  size, 1);
 				BUG_ON(ret);
 			} else if (root->ref_cows) {
-				dec_i_blocks(inode, item_end + 1 -
-					     found_key.offset);
+				inode_sub_bytes(inode, item_end + 1 -
+						found_key.offset);
 			}
 		}
 delete:
@@ -2514,7 +2514,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode->i_gid = current->fsgid;
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
-	inode->i_blocks = 0;
+	inode_set_bytes(inode, 0);
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
@@ -3557,7 +3557,8 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 	stat->blksize = PAGE_CACHE_SIZE;
-	stat->blocks = inode->i_blocks + (BTRFS_I(inode)->delalloc_bytes >> 9);
+	stat->blocks = (inode_get_bytes(inode) +
+			BTRFS_I(inode)->delalloc_bytes) >> 9;
 	return 0;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 04de767a8db..ab7a0f61ded 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -99,7 +99,7 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	inode_item->generation = cpu_to_le64(1);
 	inode_item->size = cpu_to_le64(3);
 	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
@@ -671,7 +671,7 @@ out:
 	btrfs_release_path(root, path);
 	if (ret == 0) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_blocks = src->i_blocks;
+		inode_set_bytes(inode, inode_get_bytes(src));
 		btrfs_i_size_write(inode, src->i_size);
 		BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
 		ret = btrfs_update_inode(trans, root, inode);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 88bbfd959f1..8df719a73d8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -113,7 +113,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->generation = cpu_to_le64(1);
 	inode_item->size = cpu_to_le64(3);
 	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nblocks = cpu_to_le64(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
@@ -598,8 +598,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	ret = overwrite_item(trans, root, path, eb, slot, key);
 	BUG_ON(ret);
 
-	/* btrfs_drop_extents changes i_blocks, update it here */
-	inode->i_blocks += (extent_end - start) >> 9;
+	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
+	inode_add_bytes(inode, extent_end - start);
 	btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
-- 
cgit v1.2.3


From 3bb1a1bc42f2ae9582c28adf620484efcd4da38d Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 9 Oct 2008 11:46:24 -0400
Subject: Btrfs: Remove offset field from struct btrfs_extent_ref

The offset field in struct btrfs_extent_ref records the position inside
the file at which the file extent is referenced. In the new back
reference system, the tree leaves holding references to a file extent
are recorded explicitly. We can scan these tree leaves very quickly, so the
offset field is not required.

This patch also makes the back reference system check the objectid
when extents are being deleted.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c       |  24 +++----
 fs/btrfs/ctree.h       |  25 ++-----
 fs/btrfs/extent-tree.c | 185 +++++++++++++++++++++++--------------------------
 fs/btrfs/file.c        |   6 +-
 fs/btrfs/inode.c       |   6 +-
 fs/btrfs/ioctl.c       |   2 +-
 fs/btrfs/print-tree.c  |   3 +-
 fs/btrfs/tree-log.c    |   8 +--
 8 files changed, 115 insertions(+), 144 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2eab4643dcb..9caeb377de6 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,8 +254,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
 						  root->root_key.objectid,
-						  trans->transid, level, 0,
-						  &ins);
+						  trans->transid, level, &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
 					    buf->len);
@@ -333,7 +332,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 					  buf->len, buf->start,
 					  root->root_key.objectid,
 					  btrfs_header_generation(buf),
-					  0, 0, 1);
+					  level, 1);
 		}
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
@@ -347,7 +346,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
 				  parent_start, btrfs_header_owner(parent),
-				  btrfs_header_generation(parent), 0, 0, 1);
+				  btrfs_header_generation(parent), level, 1);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -927,7 +926,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		ret = btrfs_update_extent_ref(trans, root, child->start,
 					      mid->start, child->start,
 					      root->root_key.objectid,
-					      trans->transid, level - 1, 0);
+					      trans->transid, level - 1);
 		BUG_ON(ret);
 
 		add_root_to_dirty_list(root);
@@ -940,7 +939,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
 					mid->start, root->root_key.objectid,
-					btrfs_header_generation(mid), 0, 0, 1);
+					btrfs_header_generation(mid),
+					level, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -1006,7 +1006,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			wret = btrfs_free_extent(trans, root, bytenr,
 						 blocksize, parent->start,
 						 btrfs_header_owner(parent),
-						 generation, 0, 0, 1);
+						 generation, level, 1);
 			if (wret)
 				ret = wret;
 		} else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
 					 parent->start,
 					 btrfs_header_owner(parent),
-					 root_gen, 0, 0, 1);
+					 root_gen, level, 1);
 		if (wret)
 			ret = wret;
 	} else {
@@ -1691,13 +1691,13 @@ next_level:
 					blocksize, parent->start,
 					btrfs_header_owner(parent),
 					btrfs_header_generation(parent),
-					level - 1, 0);
+					level - 1);
 		BUG_ON(ret);
 		ret = btrfs_free_extent(trans, root, bytenr,
 					blocksize, parent->start,
 					btrfs_header_owner(parent),
 					btrfs_header_generation(parent),
-					level - 1, 0, 1);
+					level - 1, 1);
 		BUG_ON(ret);
 
 		if (generation == trans->transid) {
@@ -1973,7 +1973,7 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	ret = btrfs_update_extent_ref(trans, root, lower->start,
 				      lower->start, c->start,
 				      root->root_key.objectid,
-				      trans->transid, level - 1, 0);
+				      trans->transid, level - 1);
 	BUG_ON(ret);
 
 	/* the super has an extra ref to root->node */
@@ -3213,7 +3213,7 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 				btrfs_level_size(root, 0),
 				path->nodes[1]->start,
 				btrfs_header_owner(path->nodes[1]),
-				root_gen, 0, 0, 1);
+				root_gen, 0, 1);
 	return ret;
 }
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50fbcc9ec45..a37fd783407 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,7 +374,6 @@ struct btrfs_extent_ref {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le64 offset;
 	__le32 num_refs;
 } __attribute__ ((__packed__));
 
@@ -1082,7 +1081,6 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
 BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_offset, struct btrfs_extent_ref, offset, 64);
 BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
 
 BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
@@ -1090,8 +1088,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
 			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_offset, struct btrfs_extent_ref,
-			 offset, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
 			 num_refs, 32);
 
@@ -1522,29 +1518,20 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path,
-				 u64 bytenr, u64 parent,
-				 u64 root_objectid, u64 ref_generation,
-				 u64 owner, u64 owner_offset);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
+		       u64 owner, u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins);
+				u64 owner, struct btrfs_key *ins);
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins);
+				u64 owner, struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1563,7 +1550,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin);
+		      u64 owner_objectid, int pin);
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1572,12 +1559,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner, u64 owner_offset);
+			 u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
 			    u64 orig_parent, u64 parent,
 			    u64 root_objectid, u64 ref_generation,
-			    u64 owner, u64 owner_offset);
+			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 69db54e09fb..ab36769c356 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -525,31 +525,28 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  * - Objectid of the subvolume root
  * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
- * - offset in the file corresponding to the key holding the reference
  * - number of references holding by parent node (alway 1 for tree blocks)
  *
  * Btree leaf may hold multiple references to a file extent. In most cases,
  * these references are from same file and the corresponding offsets inside
- * the file are close together. So inode objectid and offset in file are
- * just hints, they provide hints about where in the btree the references
- * can be found and when we can stop searching.
+ * the file are close together.
  *
  * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, offset in file, 1)
+ *     (root_key.objectid, trans->transid, inode objectid, 1)
  *
  * When a leaf is cow'd new references are added for every file extent found
  * in the leaf.  It looks similar to the create case, but trans->transid will
  * be different when the block is cow'd.
  *
- *     (root_key.objectid, trans->transid, inode objectid, offset in file,
+ *     (root_key.objectid, trans->transid, inode objectid,
  *      number of references in the leaf)
  *
- * Because inode objectid and offset in file are just hints, they are not
- * used when backrefs are deleted. When a file extent is removed either
- * during snapshot deletion or file truncation, we find the corresponding
- * back back reference and check the following fields.
+ * When a file extent is removed either during snapshot deletion or
+ * file truncation, we find the corresponding back reference and check
+ * the following fields:
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf))
+ *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
+ *      inode objectid)
  *
  * Btree extents can be referenced by:
  *
@@ -558,21 +555,21 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  *
  * When a tree block is created, back references are inserted:
  *
- * (root->root_key.objectid, trans->transid, level, 0, 1)
+ * (root->root_key.objectid, trans->transid, level, 1)
  *
  * When a tree block is cow'd, new back references are added for all the
  * blocks it points to. If the tree block isn't in reference counted root,
  * the old back references are removed. These new back references are of
  * the form (trans->transid will have increased since creation):
  *
- * (root->root_key.objectid, trans->transid, level, 0, 1)
+ * (root->root_key.objectid, trans->transid, level, 1)
  *
  * When a backref is in deleting, the following fields are checked:
  *
  * if backref was for a tree root:
- *     (btrfs_header_owner(itself), btrfs_header_generation(itself))
+ *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
  * else
- *     (btrfs_header_owner(parent), btrfs_header_generation(parent))
+ *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
  *
  * Back Reference Key composing:
  *
@@ -584,13 +581,15 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
-					  struct btrfs_path *path, u64 bytenr,
-					  u64 parent, u64 ref_root,
-					  u64 ref_generation, int del)
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 ref_root, u64 ref_generation,
+					  u64 owner_objectid, int del)
 {
 	struct btrfs_key key;
 	struct btrfs_extent_ref *ref;
 	struct extent_buffer *leaf;
+	u64 ref_objectid;
 	int ret;
 
 	key.objectid = bytenr;
@@ -607,8 +606,11 @@ static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	ref_objectid = btrfs_ref_objectid(leaf, ref);
 	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != ref_generation) {
+	    btrfs_ref_generation(leaf, ref) != ref_generation ||
+	    (ref_objectid != owner_objectid &&
+	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
 		ret = -EIO;
 		WARN_ON(1);
 		goto out;
@@ -623,7 +625,7 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
 					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid, u64 owner_offset)
+					  u64 owner_objectid)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
@@ -643,7 +645,6 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_root(leaf, ref, ref_root);
 		btrfs_set_ref_generation(leaf, ref, ref_generation);
 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_offset(leaf, ref, owner_offset);
 		btrfs_set_ref_num_refs(leaf, ref, 1);
 	} else if (ret == -EEXIST) {
 		u64 existing_owner;
@@ -663,14 +664,10 @@ static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
 
 		existing_owner = btrfs_ref_objectid(leaf, ref);
-		if (existing_owner == owner_objectid &&
-		    btrfs_ref_offset(leaf, ref) > owner_offset) {
-			btrfs_set_ref_offset(leaf, ref, owner_offset);
-		} else if (existing_owner != owner_objectid &&
-			   existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
+		if (existing_owner != owner_objectid &&
+		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
 			btrfs_set_ref_objectid(leaf, ref,
 					BTRFS_MULTIPLE_OBJECTIDS);
-			btrfs_set_ref_offset(leaf, ref, 0);
 		}
 		ret = 0;
 	} else {
@@ -711,7 +708,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     u64 orig_parent, u64 parent,
 				     u64 orig_root, u64 ref_root,
 				     u64 orig_generation, u64 ref_generation,
-				     u64 owner_objectid, u64 owner_offset)
+				     u64 owner_objectid)
 {
 	int ret;
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
@@ -762,7 +759,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, orig_parent, orig_root,
-				    orig_generation, 1);
+				    orig_generation, owner_objectid, 1);
 	if (ret)
 		goto out;
 	ret = remove_extent_backref(trans, extent_root, path);
@@ -770,7 +767,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	ret = insert_extent_backref(trans, extent_root, path, bytenr,
 				    parent, ref_root, ref_generation,
-				    owner_objectid, owner_offset);
+				    owner_objectid);
 	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	del_pending_extents(trans, extent_root);
@@ -783,7 +780,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr,
 			    u64 orig_parent, u64 parent,
 			    u64 ref_root, u64 ref_generation,
-			    u64 owner_objectid, u64 owner_offset)
+			    u64 owner_objectid)
 {
 	int ret;
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
@@ -793,7 +790,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
 					parent, ref_root, ref_root,
 					ref_generation, ref_generation,
-					owner_objectid, owner_offset);
+					owner_objectid);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -803,7 +800,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  u64 orig_parent, u64 parent,
 				  u64 orig_root, u64 ref_root,
 				  u64 orig_generation, u64 ref_generation,
-				  u64 owner_objectid, u64 owner_offset)
+				  u64 owner_objectid)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -845,7 +842,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent,
 				    ref_root, ref_generation,
-				    owner_objectid, owner_offset);
+				    owner_objectid);
 	BUG_ON(ret);
 	finish_current_insert(trans, root->fs_info->extent_root);
 	del_pending_extents(trans, root->fs_info->extent_root);
@@ -858,7 +855,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 ref_root, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset)
+			 u64 owner_objectid)
 {
 	int ret;
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
@@ -867,7 +864,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	maybe_lock_mutex(root);
 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
 				     0, ref_root, 0, ref_generation,
-				     owner_objectid, owner_offset);
+				     owner_objectid);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -1179,7 +1176,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret = 0;
 	int faili = 0;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
 	ref_generation = btrfs_header_generation(buf);
@@ -1223,7 +1220,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
-					   key.objectid, key.offset);
+					   key.objectid);
 			maybe_unlock_mutex(root);
 
 			if (ret) {
@@ -1238,7 +1235,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
-					   level - 1, 0);
+					   level - 1);
 			maybe_unlock_mutex(root);
 			if (ret) {
 				faili = i;
@@ -1314,7 +1311,7 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
-					    key.objectid, key.offset);
+					    key.objectid);
 			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
@@ -1325,7 +1322,7 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
-					    level - 1, 0);
+					    level - 1);
 			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
@@ -1781,13 +1778,14 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 						start, extent_op->parent,
 						extent_root->root_key.objectid,
 						extent_op->generation,
-						extent_op->level, 0);
+						extent_op->level);
 			BUG_ON(err);
 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
 			err = lookup_extent_backref(trans, extent_root, path,
 						start, extent_op->orig_parent,
 						extent_root->root_key.objectid,
-						extent_op->orig_generation, 0);
+						extent_op->orig_generation,
+						extent_op->level, 0);
 			BUG_ON(err);
 
 			clear_extent_bits(&info->extent_ins, start, end,
@@ -1870,8 +1868,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, u64 owner_offset,
-			 int pin, int mark_free)
+			 u64 owner_objectid, int pin, int mark_free)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -1894,8 +1891,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	path->reada = 1;
-	ret = lookup_extent_backref(trans, extent_root, path, bytenr, parent,
-				    root_objectid, ref_generation, 1);
+	ret = lookup_extent_backref(trans, extent_root, path,
+				    bytenr, parent, root_objectid,
+				    ref_generation, owner_objectid, 1);
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
@@ -1926,9 +1924,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk("Unable to find ref byte nr %Lu root %Lu "
-		       " gen %Lu owner %Lu offset %Lu\n", bytenr,
-		       root_objectid, ref_generation, owner_objectid,
-		       owner_offset);
+		       "gen %Lu owner %Lu\n", bytenr,
+		       root_objectid, ref_generation, owner_objectid);
 	}
 
 	leaf = path->nodes[0];
@@ -2068,7 +2065,7 @@ free_extent:
 					    extent_op->orig_parent,
 					    extent_root->root_key.objectid,
 					    extent_op->orig_generation,
-					    extent_op->level, 0, 0, mark_free);
+					    extent_op->level, 0, mark_free);
 			kfree(extent_op);
 		} else {
 			kfree(extent_op);
@@ -2107,7 +2104,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 root_objectid, u64 ref_generation,
-			       u64 owner_objectid, u64 owner_offset, int pin)
+			       u64 owner_objectid, int pin)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	int pending_ret;
@@ -2156,8 +2153,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		pin = 1;
 
 	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation, owner_objectid,
-			    owner_offset, pin, pin == 0);
+			    root_objectid, ref_generation,
+			    owner_objectid, pin, pin == 0);
 
 	finish_current_insert(trans, root->fs_info->extent_root);
 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
@@ -2168,14 +2165,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
 		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, u64 owner_offset, int pin)
+		      u64 owner_objectid, int pin)
 {
 	int ret;
 
 	maybe_lock_mutex(root);
 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
-				  owner_objectid, owner_offset, pin);
+				  owner_objectid, pin);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2522,8 +2519,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root, u64 parent,
 					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, u64 owner_offset,
-					 struct btrfs_key *ins)
+					 u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	int pending_ret;
@@ -2597,7 +2593,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_offset(path->nodes[0], ref, owner_offset);
 	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -2629,17 +2624,15 @@ out:
 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins)
+				u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
 	maybe_lock_mutex(root);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					    root_objectid, ref_generation,
-					    owner, owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+					    ref_generation, owner, ins);
 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
 	maybe_unlock_mutex(root);
 	return ret;
@@ -2653,8 +2646,7 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, u64 parent,
 				u64 root_objectid, u64 ref_generation,
-				u64 owner, u64 owner_offset,
-				struct btrfs_key *ins)
+				u64 owner, struct btrfs_key *ins)
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
@@ -2665,9 +2657,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
 	BUG_ON(ret);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-					    root_objectid, ref_generation,
-					    owner, owner_offset, ins);
+	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
+					    ref_generation, owner, ins);
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2683,8 +2674,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
 		       u64 root_objectid, u64 ref_generation,
-		       u64 owner_objectid, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
+		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
@@ -2698,7 +2688,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
 					root_objectid, ref_generation,
-					owner_objectid, owner_offset, ins);
+					owner_objectid, ins);
 		BUG_ON(ret);
 
 	} else {
@@ -2750,7 +2740,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *buf;
 
 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
-				 root_objectid, ref_generation, level, 0,
+				 root_objectid, ref_generation, level,
 				 empty_size, hint, (u64)-1, &ins, 0);
 	if (ret) {
 		BUG_ON(ret > 0);
@@ -2800,7 +2790,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
-				key.objectid, key.offset, 0);
+				key.objectid, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 
@@ -2824,7 +2814,7 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
-					  info->objectid, info->offset, 0);
+					  info->objectid, 0);
 		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2940,7 +2930,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
-						root_owner, root_gen, 0, 0, 1);
+						root_owner, root_gen,
+						*level - 1, 1);
 			BUG_ON(ret);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
@@ -2970,9 +2961,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
-			if (printk_ratelimit())
+			if (printk_ratelimit()) {
 				printk("leaf ref miss for bytenr %llu\n",
 				       (unsigned long long)bytenr);
+			}
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
@@ -3020,7 +3012,7 @@ out:
 	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
-				  0, 0, 1);
+				  *level, 1);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
@@ -3073,8 +3065,8 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len,
-						parent->start,
-						root_owner, root_gen, 0, 0, 1);
+						parent->start, root_owner,
+						root_gen, *level, 1);
 			BUG_ON(ret);
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -3308,7 +3300,6 @@ struct btrfs_ref_path {
 	u64 root_objectid;
 	u64 root_generation;
 	u64 owner_objectid;
-	u64 owner_offset;
 	u32 num_refs;
 	int lowest_level;
 	int current_level;
@@ -3480,7 +3471,6 @@ found:
 
 		if (ref_path->lowest_level == level) {
 			ref_path->owner_objectid = ref_objectid;
-			ref_path->owner_offset = btrfs_ref_offset(leaf, ref);
 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
 		}
 
@@ -3686,16 +3676,20 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 ext_offset;
 	u64 first_pos;
 	u32 nritems;
+	int nr_scaned = 0;
 	int extent_locked = 0;
 	int ret;
 
-	first_pos = ref_path->owner_offset;
+	memcpy(&key, leaf_key, sizeof(key));
+	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
-		key.objectid = ref_path->owner_objectid;
-		key.offset = ref_path->owner_offset;
-		key.type = BTRFS_EXTENT_DATA_KEY;
-	} else {
-		memcpy(&key, leaf_key, sizeof(key));
+		if (key.objectid < ref_path->owner_objectid ||
+		    (key.objectid == ref_path->owner_objectid &&
+		     key.type < BTRFS_EXTENT_DATA_KEY)) {
+			key.objectid = ref_path->owner_objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = 0;
+		}
 	}
 
 	while (1) {
@@ -3718,8 +3712,7 @@ next:
 		}
 
 		if (path->slots[0] >= nritems) {
-			if (ref_path->owner_objectid ==
-			    BTRFS_MULTIPLE_OBJECTIDS)
+			if (++nr_scaned > 2)
 				break;
 
 			BUG_ON(extent_locked);
@@ -3858,7 +3851,7 @@ next:
 						leaf->start,
 						root->root_key.objectid,
 						trans->transid,
-						key.objectid, key.offset);
+						key.objectid);
 			BUG_ON(ret);
 
 			ret = btrfs_free_extent(trans, root,
@@ -3867,7 +3860,7 @@ next:
 						leaf->start,
 						btrfs_header_owner(leaf),
 						btrfs_header_generation(leaf),
-						key.objectid, key.offset, 0);
+						key.objectid, 0);
 			BUG_ON(ret);
 
 			btrfs_release_path(root, path);
@@ -3925,8 +3918,7 @@ next:
 						new_extents[i].disk_num_bytes,
 						leaf->start,
 						root->root_key.objectid,
-						trans->transid,
-						key.objectid, key.offset);
+						trans->transid, key.objectid);
 				BUG_ON(ret);
 				btrfs_release_path(root, path);
 
@@ -4182,14 +4174,13 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					new_extent->disk_num_bytes,
 					leaf->start,
 					root->root_key.objectid,
-					trans->transid,
-					key.objectid, key.offset);
+					trans->transid, key.objectid);
 		BUG_ON(ret);
 		ret = btrfs_free_extent(trans, root,
 					bytenr, num_bytes, leaf->start,
 					btrfs_header_owner(leaf),
 					btrfs_header_generation(leaf),
-					key.objectid, key.offset, 0);
+					key.objectid, 0);
 		BUG_ON(ret);
 		cond_resched();
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 18dfdf5f91d..69abbe19add 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -788,8 +788,7 @@ next_slot:
 						le64_to_cpu(old.disk_num_bytes),
 						leaf->start,
 						root->root_key.objectid,
-						trans->transid,
-						ins.objectid, ins.offset);
+						trans->transid, ins.objectid);
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
@@ -808,8 +807,7 @@ next_slot:
 						disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
 						leaf_start, root_owner,
-						root_gen, key.objectid,
-						key.offset, 0);
+						root_gen, key.objectid, 0);
 				BUG_ON(ret);
 				*hint_byte = disk_bytenr;
 			}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f9df89c5fdf..3ab147dc3c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -647,8 +647,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 					  root->root_key.objectid,
-					  trans->transid, inode->i_ino,
-					  ordered_extent->file_offset, &ins);
+					  trans->transid, inode->i_ino, &ins);
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
@@ -1734,8 +1733,7 @@ delete:
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes,
 						leaf->start, root_owner,
-						root_gen, inode->i_ino,
-						found_key.offset, 0);
+						root_gen, inode->i_ino, 0);
 			BUG_ON(ret);
 		}
 next:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab7a0f61ded..50c8a066d1f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -658,7 +658,7 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 						     ds, dl, leaf->start,
 						     root->root_key.objectid,
 						     trans->transid,
-						     inode->i_ino, key.offset);
+						     inode->i_ino);
 					BUG_ON(ret);
 				}
 			}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 3577badfa5b..bd9ab3e9a7f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -102,11 +102,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
 			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu offset %llu num_refs %lu\n",
+			       "owner %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long long)btrfs_ref_offset(l, ref),
 			       (unsigned long)btrfs_ref_num_refs(l, ref));
 			break;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8df719a73d8..cf618cc8b34 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -460,8 +460,7 @@ insert:
 						ins.objectid, ins.offset,
 						path->nodes[0]->start,
 						root->root_key.objectid,
-						trans->transid,
-						key->objectid, key->offset);
+						trans->transid, key->objectid);
 			} else {
 				/*
 				 * insert the extent pointer in the extent
@@ -471,7 +470,7 @@ insert:
 						path->nodes[0]->start,
 						root->root_key.objectid,
 						trans->transid, key->objectid,
-						key->offset, &ins);
+						&ins);
 				BUG_ON(ret);
 			}
 		}
@@ -2534,8 +2533,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   dst_path->nodes[0]->start,
 						   BTRFS_TREE_LOG_OBJECTID,
 						   trans->transid,
-						   ins_keys[i].objectid,
-						   ins_keys[i].offset);
+						   ins_keys[i].objectid);
 					BUG_ON(ret);
 				}
 			}
-- 
cgit v1.2.3


From 5b84e8d6eef82fcf7b1b16d92e29375b85b6549a Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 9 Oct 2008 11:46:19 -0400
Subject: Btrfs: Fix leaf reference cache miss

Due to the optimization for truncate, tree leaves only containing
checksum items can be deleted without being COW'ed first. This causes
reference cache misses. The way to fix the misses is to create cache
entries for tree leaves that contain only checksum items.

This patch also fixes a -EEXIST issue in the shared reference cache.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c |  8 ++++++++
 fs/btrfs/inode.c       | 26 +++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ab36769c356..280ac1aa9b6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1151,6 +1151,14 @@ int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		ret = btrfs_add_leaf_ref(root, ref, shared);
+		if (ret == -EEXIST && shared) {
+			struct btrfs_leaf_ref *old;
+			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
+			BUG_ON(!old);
+			btrfs_remove_leaf_ref(root, old);
+			btrfs_free_leaf_ref(root, old);
+			ret = btrfs_add_leaf_ref(root, ref, shared);
+		}
 		WARN_ON(ret);
 		btrfs_free_leaf_ref(root, ref);
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ab147dc3c0..11bfe131fde 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -48,6 +48,7 @@
 #include "xattr.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "ref-cache.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -1416,6 +1417,9 @@ static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
 	int nritems;
 	struct btrfs_key found_key;
 	struct btrfs_key other_key;
+	struct btrfs_leaf_ref *ref;
+	u64 leaf_gen;
+	u64 leaf_start;
 
 	path->lowest_level = 1;
 	key.objectid = inode->i_ino;
@@ -1509,15 +1513,31 @@ next_node:
 	if (other_key.objectid != inode->i_ino || other_key.type != key.type)
 		goto out;
 
+	leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
+	leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
 	/*
 	 * it is safe to delete this leaf, it contains only
 	 * csum items from this inode at an offset >= new_size
 	 */
-	ret = btrfs_del_leaf(trans, root, path,
-			     btrfs_node_blockptr(path->nodes[1],
-						 path->slots[1]));
+	ret = btrfs_del_leaf(trans, root, path, leaf_start);
 	BUG_ON(ret);
 
+	if (root->ref_cows && leaf_gen < trans->transid) {
+		ref = btrfs_alloc_leaf_ref(root, 0);
+		if (ref) {
+			ref->root_gen = root->root_key.offset;
+			ref->bytenr = leaf_start;
+			ref->owner = 0;
+			ref->generation = leaf_gen;
+			ref->nritems = 0;
+
+			ret = btrfs_add_leaf_ref(root, ref, 0);
+			WARN_ON(ret);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			WARN_ON(1);
+		}
+	}
 next_key:
 	btrfs_release_path(root, path);
 
-- 
cgit v1.2.3


From 61f8c86ee8f9ca55488449db886104a0ab4e1f98 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 9 Oct 2008 11:52:35 -0400
Subject: Btrfs: Fix makefile for building btrfs static

This fixes the btrfs makefile for building in the tree and out of the tree
both as a module and static.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 48b7909ca8d..7125716e142 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,7 +1,7 @@
 ifneq ($(KERNELRELEASE),)
 # kbuild part of makefile
 
-obj-m  := btrfs.o
+obj-$(CONFIG_BTRFS_FS) := btrfs.o
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o inode.o file.o tree-defrag.o \
@@ -14,7 +14,7 @@ else
 
 KERNELDIR := /lib/modules/`uname -r`/build
 all:
-	$(MAKE) -C $(KERNELDIR) M=`pwd` modules
+	$(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules
 
 modules_install:
 	$(MAKE) -C $(KERNELDIR) M=`pwd` modules_install
-- 
cgit v1.2.3


From 833023e46c2a0180ff07d90252c24cb3fdea811d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 9 Oct 2008 11:55:03 -0400
Subject: Btrfs: Rev the disk format for the new back reference format

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a37fd783407..3fa9b8d6751 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,7 +39,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_B9RfS_M"
+#define BTRFS_MAGIC "_BBRfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From cb8e70901d36f32017614f16d2cf7cc089544574 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 9 Oct 2008 13:39:39 -0400
Subject: Btrfs: Fix subvolume creation locking rules

Creating a subvolume is in many ways like a normal VFS ->mkdir, and we
really need to play with the VFS topology locking rules.  So instead of
just creating the snapshot on disk and then later getting rid of
conflicting aliases, do it correctly from the start.  This will become
especially important once we allow for subvolumes anywhere in the tree,
and not just below a hidden root.

Note that snapshots will need the same treatment, but due to the delay
in creating them we can't do it currently.  Chris promised to fix that
issue, so I'll wait on that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.h |   2 +-
 fs/btrfs/inode.c |  10 ++++--
 fs/btrfs/ioctl.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 100 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3fa9b8d6751..8559f39fd47 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1791,7 +1791,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
-int btrfs_create_subvol_root(struct btrfs_root *new_root,
+int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 11bfe131fde..bf4bed6ca4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3420,11 +3420,12 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 /*
  * create a new subvolume directory/inode (helper for the ioctl).
  */
-int btrfs_create_subvol_root(struct btrfs_root *new_root,
+int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group)
 {
 	struct inode *inode;
+	int error;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -3438,7 +3439,12 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
 
-	return btrfs_update_inode(trans, new_root, inode);
+	error = btrfs_update_inode(trans, new_root, inode);
+	if (error)
+		return error;
+
+	d_instantiate(dentry, inode);
+	return 0;
 }
 
 /* helper function for file defrag and space balancing.  This
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 50c8a066d1f..3d85f18bbba 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -21,6 +21,7 @@
 #include <linux/buffer_head.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/fsnotify.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 #include <linux/time.h>
@@ -28,12 +29,15 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/backing-dev.h>
+#include <linux/mount.h>
 #include <linux/mpage.h>
+#include <linux/namei.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
+#include <linux/security.h>
 #include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
@@ -48,8 +52,9 @@
 
 
-static noinline int create_subvol(struct btrfs_root *root, char *name,
-				  int namelen)
+static noinline int create_subvol(struct btrfs_root *root,
+				  struct dentry *dentry,
+				  char *name, int namelen)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
@@ -151,14 +156,11 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	ret = btrfs_create_subvol_root(new_root, trans, new_dirid,
+	ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid,
 				       BTRFS_I(dir)->block_group);
 	if (ret)
 		goto fail;
 
-	/* Invalidate existing dcache entry for new subvolume. */
-	btrfs_invalidate_dcache_root(root, name, namelen);
-
 fail:
 	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, new_root);
@@ -210,6 +212,79 @@ fail_unlock:
 	return ret;
 }
 
+/* copy of may_create() in fs/namei.c */
+static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
+{
+	if (child->d_inode)
+		return -EEXIST;
+	if (IS_DEADDIR(dir))
+		return -ENOENT;
+	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+}
+
+/*
+ * Create a new subvolume below @parent.  This is largely modeled after
+ * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
+ * inside this filesystem so it's quite a bit simpler.
+ */
+static noinline int btrfs_mksubvol(struct path *parent, char *name,
+				   int mode, int namelen)
+{
+	struct dentry *dentry;
+	int error;
+
+	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+
+	dentry = lookup_one_len(name, parent->dentry, namelen);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+
+	error = -EEXIST;
+	if (dentry->d_inode)
+		goto out_dput;
+
+	if (!IS_POSIXACL(parent->dentry->d_inode))
+		mode &= ~current->fs->umask;
+	error = mnt_want_write(parent->mnt);
+	if (error)
+		goto out_dput;
+
+	error = btrfs_may_create(parent->dentry->d_inode, dentry);
+	if (error)
+		goto out_drop_write;
+
+	mode &= (S_IRWXUGO|S_ISVTX);
+	error = security_inode_mkdir(parent->dentry->d_inode, dentry, mode);
+	if (error)
+		goto out_drop_write;
+
+	/*
+	 * Actually perform the low-level subvolume creation after all
+	 * this VFS fuzz.
+	 *
+	 * Eventually we want to pass in an inode under which we create this
+	 * subvolume, but for now all are under the filesystem root.
+	 *
+	 * Also we should pass on the mode eventually to allow creating new
+	 * subvolume with specific mode bits.
+	 */
+	error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, dentry,
+			      name, namelen);
+	if (error)
+		goto out_drop_write;
+
+	fsnotify_mkdir(parent->dentry->d_inode, dentry);
+out_drop_write:
+	mnt_drop_write(parent->mnt);
+out_dput:
+	dput(dentry);
+out_unlock:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	return error;
+}
+
+
 int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
@@ -395,9 +470,10 @@ out:
 	return ret;
 }
 
-static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
+static noinline int btrfs_ioctl_snap_create(struct file *file,
 					    void __user *arg)
 {
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
@@ -444,10 +520,14 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 		goto out;
 	}
 
-	if (root == root->fs_info->tree_root)
-		ret = create_subvol(root, vol_args->name, namelen);
-	else
+	if (root == root->fs_info->tree_root) {
+		ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+				     file->f_path.dentry->d_inode->i_mode,
+				     namelen);
+	} else {
 		ret = create_snapshot(root, vol_args->name, namelen);
+	}
+
 out:
 	kfree(vol_args);
 	return ret;
@@ -761,7 +841,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 
 	switch (cmd) {
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(root, (void __user *)arg);
+		return btrfs_ioctl_snap_create(file, (void __user *)arg);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
-- 
cgit v1.2.3


From a3dddf3fc82a1f5942c0928abfd114e9a8c0d0c8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 10 Oct 2008 10:23:22 -0400
Subject: Btrfs: Don't call security_inode_mkdir during subvol creation

Subvol creation already requires privs, and security_inode_mkdir isn't
exported.  For now we don't need it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3d85f18bbba..1136ce2febc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -254,11 +254,6 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 	if (error)
 		goto out_drop_write;
 
-	mode &= (S_IRWXUGO|S_ISVTX);
-	error = security_inode_mkdir(parent->dentry->d_inode, dentry, mode);
-	if (error)
-		goto out_drop_write;
-
 	/*
 	 * Actually perform the low-level subvolume creation after all
 	 * this VFS fuzz.
-- 
cgit v1.2.3


From 37d3cdddf5378606f6eab99982ca16819745d8f4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Fri, 10 Oct 2008 10:24:32 -0400
Subject: Btrfs: make tree_search_offset more flexible in its searching

Sometimes we end up freeing a reserved extent because we don't need it, however
this means that it's possible for transaction->last_alloc to point to the middle
of a free area.

When we search for free space in find_free_space we do a tree_search_offset
with contains set to 0, because we want it to find the next best free area if
we do not have an offset starting on the given offset.

Unfortunately that currently means that if the offset we were given as a hint
points to the middle of a free area, we won't find anything.  This is especially
bad if we happened to last allocate from the big huge chunk of a newly formed
block group, since we won't find anything and have to go back and search the
long way around.

This fixes this problem by making it so that we return the free space area
regardless of the contains variable.  This made cache misses happen _a lot_
less, and speeds things up considerably.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/free-space-cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f1d9b6bc23b..96241f01fa0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -90,8 +90,8 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
 				ret = entry;
 			n = n->rb_left;
 		} else if (offset > entry->offset) {
-			if (contains &&
-			    (entry->offset + entry->bytes - 1) >= offset) {
+			if ((entry->offset + entry->bytes - 1) >= offset &&
+			    bytes <= entry->bytes) {
 				ret = entry;
 				break;
 			}
-- 
cgit v1.2.3


From 8f72fbdf0d92e6127583cc548bf043c60cd4720f Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Wed, 29 Oct 2008 17:13:08 -0400
Subject: ext4: fix printk format warning

fs/ext4/balloc.c:607: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64'
fs/ext4/inode.c:1822: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64'
fs/ext4/inode.c:1824: warning: format '%lld' expects type 'long long int', but argument 2 has type 's64'

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/balloc.c | 2 +-
 fs/ext4/inode.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38b3acf5683..152c390f3c3 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -614,7 +614,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
 		if (dirty_blocks < 0) {
 			printk(KERN_CRIT "Dirty block accounting "
 					"went wrong %lld\n",
-					dirty_blocks);
+					(long long)dirty_blocks);
 		}
 	}
 	/* Check whether we have space after
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6702a49992a..5b088121686 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1831,9 +1831,9 @@ static void ext4_print_free_blocks(struct inode *inode)
 			ext4_count_free_blocks(inode->i_sb));
 	printk(KERN_EMERG "Free/Dirty block details\n");
 	printk(KERN_EMERG "free_blocks=%lld\n",
-			percpu_counter_sum(&sbi->s_freeblocks_counter));
+			(long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
 	printk(KERN_EMERG "dirty_blocks=%lld\n",
-			percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+			(long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
 	printk(KERN_EMERG "Block reservation details\n");
 	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
 			EXT4_I(inode)->i_reserved_data_blocks);
-- 
cgit v1.2.3


From 5e1f8c9e20a92743eefc9a82c2db835213905e26 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 28 Oct 2008 13:21:55 -0400
Subject: ext3: Add support for non-native signed/unsigned htree hash
 algorithms

The original ext3 hash algorithms assumed that variables of type char
were signed, as God and K&R intended.  Unfortunately, this assumption
is not true on some architectures.  Userspace support for marking
filesystems with non-native signed/unsigned chars was added two years
ago, but the kernel-side support was never added (until now).

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: akpm@linux-foundation.org
Cc: linux-kernel@vger.kernel.org
---
 fs/ext3/hash.c  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/ext3/namei.c |  7 ++++++
 fs/ext3/super.c | 12 +++++++++
 3 files changed, 86 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c
index c30e149fbd2..7d215b4d4f2 100644
--- a/fs/ext3/hash.c
+++ b/fs/ext3/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
 	while (len--) {
-		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-		if (hash & 0x80000000) hash -= 0x7fffffff;
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
 		hash1 = hash0;
 		hash0 = hash;
 	}
-	return (hash0 << 1);
+	return hash0 << 1;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 {
 	__u32	pad, val;
 	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
+}
+
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
 
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	for (i=0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
-		val = msg[i] + (val << 8);
+		val = ((int) ucp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
 			val = pad;
@@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	const char	*p;
 	int		i;
 	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	}
 
 	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
 	case DX_HASH_LEGACY:
-		hash = dx_hack_hash(name, len);
+		hash = dx_hack_hash_signed(name, len);
 		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 8);
+			(*str2hashbuf)(p, len, in, 8);
 			half_md4_transform(buf, in);
 			len -= 32;
 			p += 32;
@@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 		minor_hash = buf[2];
 		hash = buf[1];
 		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 4);
+			(*str2hashbuf)(p, len, in, 4);
 			TEA_transform(buf, in);
 			len -= 16;
 			p += 16;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1dd2abe6313..287b304d42a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -368,6 +368,8 @@ dx_probe(struct qstr *entry, struct inode *dir,
 		goto fail;
 	}
 	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 	if (entry)
 		ext3fs_dirhash(entry->name, entry->len, hinfo);
@@ -636,6 +638,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
 		hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT3_SB(dir->i_sb)->s_hash_unsigned;
 		hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 					       start_hash, start_minor_hash);
@@ -1398,6 +1403,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
 	ext3fs_dirhash(name, namelen, &hinfo);
 	frame = frames;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec..541d5e4f7f6 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1744,6 +1744,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	for (i=0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
+	i = le32_to_cpu(es->s_flags);
+	if (i & EXT2_FLAGS_UNSIGNED_HASH)
+		sbi->s_hash_unsigned = 3;
+	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+		sbi->s_hash_unsigned = 3;
+#else
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+		sb->s_dirt = 1;
+	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
 		printk (KERN_ERR
-- 
cgit v1.2.3


From f99b25897a86fcfff9140396a97261ae65fed872 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 28 Oct 2008 13:21:44 -0400
Subject: ext4: Add support for non-native signed/unsigned htree hash
 algorithms

The original ext3 hash algorithms assumed that variables of type char
were signed, as God and K&R intended.  Unfortunately, this assumption
is not true on some architectures.  Userspace support for marking
filesystems with non-native signed/unsigned chars was added two years
ago, but the kernel-side support was never added (until now).

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  3 +++
 fs/ext4/ext4_sb.h |  1 +
 fs/ext4/hash.c    | 77 +++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/ext4/namei.c   |  7 +++++
 fs/ext4/super.c   | 12 +++++++++
 5 files changed, 90 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c82702..8370ffd2d62 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -891,6 +891,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
 #define DX_HASH_LEGACY		0
 #define DX_HASH_HALF_MD4	1
 #define DX_HASH_TEA		2
+#define DX_HASH_LEGACY_UNSIGNED	3
+#define DX_HASH_HALF_MD4_UNSIGNED	4
+#define DX_HASH_TEA_UNSIGNED		5
 
 #ifdef __KERNEL__
 
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index b21f16713db..ad7ea09baa7 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -57,6 +57,7 @@ struct ext4_sb_info {
 	u32 s_next_generation;
 	u32 s_hash_seed[4];
 	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be unsigned, 0 if not */
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 556ca8eba3d..ac8f168c8ab 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 
 
 /* The old legacy hash */
-static __u32 dx_hack_hash(const char *name, int len)
+static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
-	__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const unsigned char *ucp = (const unsigned char *) name;
+
+	while (len--) {
+		hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
+
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
+		hash1 = hash0;
+		hash0 = hash;
+	}
+	return hash0 << 1;
+}
+
+static __u32 dx_hack_hash_signed(const char *name, int len)
+{
+	__u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
+	const signed char *scp = (const signed char *) name;
+
 	while (len--) {
-		__u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
+		hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
 
-		if (hash & 0x80000000) hash -= 0x7fffffff;
+		if (hash & 0x80000000)
+			hash -= 0x7fffffff;
 		hash1 = hash0;
 		hash0 = hash;
 	}
-	return (hash0 << 1);
+	return hash0 << 1;
+}
+
+static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
+{
+	__u32	pad, val;
+	int	i;
+	const signed char *scp = (const signed char *) msg;
+
+	pad = (__u32)len | ((__u32)len << 8);
+	pad |= pad << 16;
+
+	val = pad;
+	if (len > num*4)
+		len = num * 4;
+	for (i = 0; i < len; i++) {
+		if ((i % 4) == 0)
+			val = pad;
+		val = ((int) scp[i]) + (val << 8);
+		if ((i % 4) == 3) {
+			*buf++ = val;
+			val = pad;
+			num--;
+		}
+	}
+	if (--num >= 0)
+		*buf++ = val;
+	while (--num >= 0)
+		*buf++ = pad;
 }
 
-static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
+static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
 {
 	__u32	pad, val;
 	int	i;
+	const unsigned char *ucp = (const unsigned char *) msg;
 
 	pad = (__u32)len | ((__u32)len << 8);
 	pad |= pad << 16;
@@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
 	for (i = 0; i < len; i++) {
 		if ((i % 4) == 0)
 			val = pad;
-		val = msg[i] + (val << 8);
+		val = ((int) ucp[i]) + (val << 8);
 		if ((i % 4) == 3) {
 			*buf++ = val;
 			val = pad;
@@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	const char	*p;
 	int		i;
 	__u32		in[8], buf[4];
+	void		(*str2hashbuf)(const char *, int, __u32 *, int) =
+				str2hashbuf_signed;
 
 	/* Initialize the default seed for the hash checksum functions */
 	buf[0] = 0x67452301;
@@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 	}
 
 	switch (hinfo->hash_version) {
+	case DX_HASH_LEGACY_UNSIGNED:
+		hash = dx_hack_hash_unsigned(name, len);
+		break;
 	case DX_HASH_LEGACY:
-		hash = dx_hack_hash(name, len);
+		hash = dx_hack_hash_signed(name, len);
 		break;
+	case DX_HASH_HALF_MD4_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_HALF_MD4:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 8);
+			(*str2hashbuf)(p, len, in, 8);
 			half_md4_transform(buf, in);
 			len -= 32;
 			p += 32;
@@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
 		minor_hash = buf[2];
 		hash = buf[1];
 		break;
+	case DX_HASH_TEA_UNSIGNED:
+		str2hashbuf = str2hashbuf_unsigned;
 	case DX_HASH_TEA:
 		p = name;
 		while (len > 0) {
-			str2hashbuf(p, len, in, 4);
+			(*str2hashbuf)(p, len, in, 4);
 			TEA_transform(buf, in);
 			len -= 16;
 			p += 16;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9fd2a5e1be4..315858db807 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -372,6 +372,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
 		goto fail;
 	}
 	hinfo->hash_version = root->info.hash_version;
+	if (hinfo->hash_version <= DX_HASH_TEA)
+		hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	if (d_name)
 		ext4fs_dirhash(d_name->name, d_name->len, hinfo);
@@ -641,6 +643,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+		if (hinfo.hash_version <= DX_HASH_TEA)
+			hinfo.hash_version +=
+				EXT4_SB(dir->i_sb)->s_hash_unsigned;
 		hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 		count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 					       start_hash, start_minor_hash);
@@ -1408,6 +1413,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 
 	/* Initialize as for dx_probe */
 	hinfo.hash_version = root->info.hash_version;
+	if (hinfo.hash_version <= DX_HASH_TEA)
+		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	ext4fs_dirhash(name, namelen, &hinfo);
 	frame = frames;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74db..08fc86a358d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2118,6 +2118,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
+	i = le32_to_cpu(es->s_flags);
+	if (i & EXT2_FLAGS_UNSIGNED_HASH)
+		sbi->s_hash_unsigned = 3;
+	else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+#ifdef __CHAR_UNSIGNED__
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+		sbi->s_hash_unsigned = 3;
+#else
+		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+#endif
+		sb->s_dirt = 1;
+	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
 		printk(KERN_ERR
-- 
cgit v1.2.3


From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/Kconfig              |   2 +
 fs/btrfs/Makefile       |   3 +-
 fs/btrfs/compression.c  | 454 ++++++++++++++++++++++++++++++++++
 fs/btrfs/compression.h  |  47 ++++
 fs/btrfs/ctree.h        |  99 ++++++--
 fs/btrfs/disk-io.c      |  18 +-
 fs/btrfs/disk-io.h      |   1 +
 fs/btrfs/extent-tree.c  |  27 +-
 fs/btrfs/extent_io.c    | 411 ++++++++++++++++++++++++-------
 fs/btrfs/extent_io.h    |  17 +-
 fs/btrfs/extent_map.c   |   9 +
 fs/btrfs/extent_map.h   |   6 +-
 fs/btrfs/file-item.c    |  75 +++++-
 fs/btrfs/file.c         | 263 +++++---------------
 fs/btrfs/inode.c        | 584 ++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ordered-data.c |   9 +-
 fs/btrfs/ordered-data.h |  10 +-
 fs/btrfs/print-tree.c   |   7 +-
 fs/btrfs/super.c        |  10 +-
 fs/btrfs/tree-log.c     |   3 +-
 fs/btrfs/volumes.c      |   2 +
 fs/btrfs/zlib.c         | 637 ++++++++++++++++++++++++++++++++++++++++++++++++
 22 files changed, 2315 insertions(+), 379 deletions(-)
 create mode 100644 fs/btrfs/compression.c
 create mode 100644 fs/btrfs/compression.h
 create mode 100644 fs/btrfs/zlib.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c..31cce5d88b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
 	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
 	depends on EXPERIMENTAL
 	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142..d2cf5a54a4b 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 00000000000..c5470367ca5
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* number of bios pending for this compressed extent */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* IO errors */
+	int errors;
+
+	/* for reads, this is the bio we are copying the data into */
+	struct bio *orig_bio;
+};
+
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	struct bio *bio;
+	int nr_vecs;
+
+	nr_vecs = bio_get_nr_vecs(bdev);
+	bio = bio_alloc(gfp_flags, nr_vecs);
+
+	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+
+	if (bio) {
+		bio->bi_size = 0;
+		bio->bi_bdev = bdev;
+		bio->bi_sector = first_byte >> 9;
+	}
+	return bio;
+}
+
+/* when we finish reading compressed pages from the disk, we
+ * decompress them and then run the bio end_io routines on the
+ * decompressed pages (in the inode address space).
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally
+ *
+ * The compressed pages are freed here, and it must be run
+ * in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, lets start
+	 * the decompression.
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+					cb->start,
+					cb->orig_bio->bi_io_vec,
+					cb->orig_bio->bi_vcnt,
+					cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* release the compressed pages */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(cb->orig_bio);
+	else
+		bio_endio(cb->orig_bio, 0);
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * Clear the writeback bits on all of the file
+ * pages for a compressed write
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	/* the inode may be gone now */
+	return 0;
+}
+
+/*
+ * do the cleanup once all the compressed pages hit the disk.
+ * This will clear writeback on the file pages and free the compressed
+ * pages.
+ *
+ * This also calls the writeback end hooks for the file pages so that
+ * metadata and checksums can be updated in the file.
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct extent_io_tree *tree;
+	struct compressed_bio *cb = bio->bi_private;
+	struct inode *inode;
+	struct page *page;
+	unsigned long index;
+
+	if (err)
+		cb->errors = 1;
+
+	/* if there are more bios still pending for this compressed
+	 * extent, just exit
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios))
+		goto out;
+
+	/* ok, we're the last bio for this extent, step one is to
+	 * call back into the FS and do all the end_io operations
+	 */
+	inode = cb->inode;
+	tree = &BTRFS_I(inode)->io_tree;
+	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					 cb->start,
+					 cb->start + cb->len - 1,
+					 NULL, 1);
+
+	end_compressed_writeback(inode, cb->start, cb->len);
+	/* note, our inode could be gone now */
+
+	/*
+	 * release the compressed pages, these came from alloc_page and
+	 * are not attached to the inode at all
+	 */
+	index = 0;
+	for (index = 0; index < cb->nr_pages; index++) {
+		page = cb->compressed_pages[index];
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+out:
+	bio_put(bio);
+}
+
+/*
+ * worker function to build and submit bios for previously compressed pages.
+ * The corresponding pages in the inode should be marked for writeback
+ * and the compressed pages should have a reference on them for dropping
+ * when the IO is complete.
+ *
+ * This also checksums the file bytes and gets things ready for
+ * the end io hooks.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+
+	/* create and submit bios for the compressed pages */
+	bytes_left = compressed_len;
+	while(bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * Every bio built here completes through end_compressed_bio_read; the
+ * last one to finish decompresses into the bio we were passed and ends it.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			/* set up the new compressed bio, not the caller's bio */
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	BUG_ON(ret);
+
+	bio_put(comp_bio);
+	return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 00000000000..421f5b4aa71
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47..793d8fdda24 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2,
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1,
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be.  So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption.  If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included).  This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		  offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of bytes used by the item on disk, minus the
+ * size of any extent headers.  If a file is compressed on disk, this is
+ * the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 disk_offset,
-			       u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 				  int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb619..dc95f636a11 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbb..4eb1f1408d2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6..bbf04e80a1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
@@ -3846,6 +3856,8 @@ next:
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[0].ram_bytes);
 			ext_offset += new_extents[0].offset;
 			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 						new_extents[i].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[i].ram_bytes);
+
+				btrfs_set_file_extent_compression(leaf, fi,
+						new_extents[i].compression);
+				btrfs_set_file_extent_encryption(leaf, fi,
+						new_extents[i].encryption);
+				btrfs_set_file_extent_other_encoding(leaf, fi,
+						new_extents[i].other_encoding);
+
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_len);
 				ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f..314041fdfa4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 				*end = state->end;
 			goto out;
 		}
-		if (!found && !(state->state & EXTENT_BOUNDARY)) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if ((prev_state->end + 1 != state->start) ||
-				    !(prev_state->state & EXTENT_DELALLOC))
-					break;
-				if ((cur_start - prev_state->start) * 2 >
-				     max_bytes)
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&tree->lock);
-			schedule();
-			spin_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		set_state_cb(tree, state, EXTENT_LOCKED);
-		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+/* unlock the cached pages in [start, end]; locked_page is left locked */
+static noinline int __unlock_for_delalloc(struct inode *inode,
+					  struct page *locked_page,
+					  u64 start, u64 end)
+{
+	int ret, i;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+		    min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);	/* drop lookup ref */
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * lock every cached page in [delalloc_start, delalloc_end] except
+ * locked_page, which the caller deals with.  Returns 0 on success or
+ * -EAGAIN, after unlocking earlier batches, when a lookup finds no pages.
+ */
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret, i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* walk the range in batches of at most ARRAY_SIZE(pages) pages */
+	nrpages = end_index - index + 1;
+	while(nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+		    min_t(unsigned long, nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* lock the batch; the caller takes care of locked_page */
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of delalloc bytes searching from *start, lock
+ * its pages (the caller handles locked_page) and lock the range in the
+ * tree.  *start and *end return the range; 'max_bytes' bounds the search.
+ * Returns nonzero if a range was found and locked, 0 if not.
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		/* stop at the last byte of the page holding delalloc_start */
+		delalloc_end = delalloc_start | ((u64)PAGE_CACHE_SIZE - 1);
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+/* clear the lock and delalloc bits (plus dirty if asked) on [start, end] and
+ * update then unlock every cached page in the range except locked_page
+ */
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int end_writeback)
+{
+	int ret, i;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+		    min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page) {
+				if (clear_dirty)
+					clear_page_dirty_for_io(pages[i]);
+				if (set_writeback)
+					set_page_writeback(pages[i]);
+				if (end_writeback)
+					end_page_writeback(pages[i]);
+				unlock_page(pages[i]);
+			}
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+			  unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
-	struct rb_node *node;
-	struct extent_state *state;
 	u64 start;
 	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
 	end = start + bvec->bv_len - 1;
 
-	spin_lock_irq(&tree->lock);
-	node = __etree_search(tree, start, NULL, NULL);
-	BUG_ON(!node);
-	state = rb_entry(node, struct extent_state, rb_node);
-	while(state->end < end) {
-		node = rb_next(node);
-		state = rb_entry(node, struct extent_state, rb_node);
-	}
-	BUG_ON(state->end != end);
-	spin_unlock_irq(&tree->lock);
-
 	bio->bi_private = NULL;
 
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num);
+					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
 			      bio_end_io_t end_io_func,
-			      int mirror_num)
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio *bio;
 	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min(size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		if (old_compressed)
+			contig = bio->bi_sector == sector;
+		else
+			contig = bio->bi_sector + (bio->bi_size >> 9) ==
+				sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
 		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio, mirror_num);
+		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+					       bio_flags)) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
 			bio = NULL;
 		} else {
 			return 0;
 		}
 	}
-	nr = bio_get_nr_vecs(bdev);
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
 
-
-	bio_add_page(bio, page, size, offset);
+	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio, mirror_num);
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
 	}
 
 	return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num)
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	int nr = 0;
 	size_t page_offset = 0;
 	size_t iosize;
+	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
 
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+		}
+	}
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		BUG_ON(end < cur);
 
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			this_bio_flag = EXTENT_BIO_COMPRESSED;
+
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
 		bdev = em->bdev;
 		block_start = em->block_start;
 		free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
 			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
+					 sector, disk_io_size, page_offset,
 					 bdev, bio, pnr,
-					 end_bio_extent_readpage, mirror_num);
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
 			nr++;
+			*bio_flags = this_bio_flag;
 		}
 		if (ret)
 			SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent)
 {
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
 	u64 nr_delalloc;
 	u64 delalloc_end;
+	int page_started;
+	int compressed;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	delalloc_start = start;
 	delalloc_end = 0;
+	page_started = 0;
 	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+						       page,
+						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
 		if (nr_delalloc == 0) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
+		tree->ops->fill_delalloc(inode, page, delalloc_start,
+					 delalloc_end, &page_started);
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/* did the fill delalloc function already unlock and start the IO? */
+	if (page_started) {
+		return 0;
+	}
+
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == EXTENT_MAP_HOLE ||
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, unlock_start, cur + iosize -1,
 				      GFP_NOFS);
 
-			if (tree->ops && tree->ops->writepage_end_io_hook)
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
-			cur = cur + iosize;
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent.  this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
 			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
-
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 				   EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
+
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage, 0);
+						 end_bio_extent_writepage,
+						 0, 0, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	struct pagevec pvec;
+	unsigned long bio_flags = 0;
 
 	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0);
+						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite, 0);
+					 end_bio_extent_preparewrite, 0,
+					 0, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED, 0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK |
+					    EXTENT_ORDERED,
+					    0)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num);
+						      mirror_num, &bio_flags);
 			if (err) {
 				ret = err;
 				printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio, mirror_num);
+		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
 	if (ret || !wait) {
 		if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae..86f859b87a6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
 #define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -28,14 +31,17 @@
 struct extent_state;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-				       struct bio *bio, int mirror_num);
+				       struct bio *bio, int mirror_num,
+				       unsigned long bio_flags);
 struct extent_io_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+			     u64 start, u64 end, int *page_started);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
-			      size_t size, struct bio *bio);
+			      size_t size, struct bio *bio,
+			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int clear_writeback);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d..fd3ebfb8c3c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
 		return 0;
 
+	/*
+	 * don't merge compressed extents, we need to know their
+	 * actual size
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
 			em->len += merge->len;
+			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		merge = rb_entry(rb, struct extent_map, rb_node);
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
+		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b26..abbcbeb28c7 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
 	u64 start;
 	u64 len;
 	u64 block_start;
+	u64 block_len;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
 
 static inline u64 extent_map_block_end(struct extent_map *em)
 {
-	if (em->block_start + em->len < em->block_start)
+	if (em->block_start + em->block_len < em->block_start)
 		return (u64)-1;
-	return em->block_start + em->len;
+	return em->block_start + em->block_len;
 }
 
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d..f4d3fa71bc4 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
 			     u64 disk_offset, u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset)
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
 	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, compression);
+	btrfs_set_file_extent_encryption(leaf, item, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
 	return 0;
 }
 
+/* checksum each page cache page in [start, start + len) and queue the sums
+ * on the ordered extent(s) covering the range.  Returns 0 or -ENOMEM.
+ */
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	char *data;
+	struct page *page;
+	unsigned long this_sum_bytes = 0;
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+
+	sector_sum = sums->sums;
+	sums->file_offset = start;
+	sums->len = len;
+	INIT_LIST_HEAD(&sums->list);
+	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+	BUG_ON(!ordered);
+	/* one sector_sum per page; a new sums struct per ordered extent */
+	while(len > 0) {
+		if (start >= ordered->file_offset + ordered->len ||
+		    start < ordered->file_offset) {
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, len),
+				       GFP_NOFS);
+			BUG_ON(!sums);
+			sector_sum = sums->sums;
+			sums->len = len;
+			sums->file_offset = start;
+			ordered = btrfs_lookup_ordered_extent(inode,
+						      sums->file_offset);
+			BUG_ON(!ordered);
+		}
+
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		/* NOTE(review): assumes the page is still in the page cache */
+		data = kmap_atomic(page, KM_USER0);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+						  PAGE_CACHE_SIZE);
+		kunmap_atomic(data, KM_USER0);
+		btrfs_csum_final(sector_sum->sum, (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(page);
+		page_cache_release(page);
+
+		sector_sum++;
+		this_sum_bytes += PAGE_CACHE_SIZE;
+		start += PAGE_CACHE_SIZE;
+		/* clamp so a partial trailing page cannot wrap len around */
+		WARN_ON(len < PAGE_CACHE_SIZE);
+		len -= min_t(unsigned long, len, PAGE_CACHE_SIZE);
+	}
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add..0aa15436590 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode,
-				u64 offset, size_t size,
-				struct page **pages, size_t page_offset,
-				int num_pages)
-{
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *kaddr;
-	unsigned long ptr;
-	struct btrfs_file_extent_item *ei;
-	struct page *page;
-	u32 datasize;
-	int err = 0;
-	int ret;
-	int i;
-	ssize_t cur_size;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	btrfs_set_trans_block_group(trans, inode);
-
-	key.objectid = inode->i_ino;
-	key.offset = offset;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto fail;
-	}
-	if (ret == 1) {
-		struct btrfs_key found_key;
-
-		if (path->slots[0] == 0)
-			goto insert;
-
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		if (found_key.objectid != inode->i_ino)
-			goto insert;
-
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto insert;
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			goto insert;
-		}
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		ret = 0;
-	}
-	if (ret == 0) {
-		u32 found_size;
-		u64 found_end;
-
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			err = ret;
-			btrfs_print_leaf(root, leaf);
-			printk("found wasn't inline offset %Lu inode %lu\n",
-			       offset, inode->i_ino);
-			goto fail;
-		}
-		found_size = btrfs_file_extent_inline_len(leaf,
-					  btrfs_item_nr(leaf, path->slots[0]));
-		found_end = key.offset + found_size;
-
-		if (found_end < offset + size) {
-			btrfs_release_path(root, path);
-			ret = btrfs_search_slot(trans, root, &key, path,
-						offset + size - found_end, 1);
-			BUG_ON(ret != 0);
-
-			ret = btrfs_extend_item(trans, root, path,
-						offset + size - found_end);
-			if (ret) {
-				err = ret;
-				goto fail;
-			}
-			leaf = path->nodes[0];
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_file_extent_item);
-			inode_add_bytes(inode, offset + size - found_end);
-		}
-		if (found_end < offset) {
-			ptr = btrfs_file_extent_inline_start(ei) + found_size;
-			memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-		}
-	} else {
-insert:
-		btrfs_release_path(root, path);
-		datasize = offset + size - key.offset;
-		inode_add_bytes(inode, datasize);
-		datasize = btrfs_file_extent_calc_inline_size(datasize);
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-					      datasize);
-		if (ret) {
-			err = ret;
-			printk("got bad ret %d\n", ret);
-			goto fail;
-		}
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-	}
-	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-	cur_size = size;
-	i = 0;
-	while (size > 0) {
-		page = pages[i];
-		kaddr = kmap_atomic(page, KM_USER0);
-		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-		kunmap_atomic(kaddr, KM_USER0);
-		page_offset = 0;
-		ptr += cur_size;
-		size -= cur_size;
-		if (i >= num_pages) {
-			printk("i %d num_pages %d\n", i, num_pages);
-		}
-		i++;
-	}
-	btrfs_mark_buffer_dirty(leaf);
-fail:
-	btrfs_free_path(path);
-	return err;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 start_pos;
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
-	u64 inline_size;
-	int did_inline = 0;
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size, 0);
+						       0, 0, hole_size, 0,
+						       hole_size, 0, 0, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size - 1, 0);
 			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			goto failed;
 	}
 
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
+	/* check for reserved extents on each page, we don't want
+	 * to reset the delalloc bit on things that already have
+	 * extents reserved.
 	 */
-	inline_size = end_pos;
-	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > root->fs_info->max_inline ||
-	    (inline_size & (root->sectorsize -1)) == 0 ||
-	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		/* check for reserved extents on each page, we don't want
-		 * to reset the delalloc bit on things that already have
-		 * extents reserved.
-		 */
-		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-		for (i = 0; i < num_pages; i++) {
-			struct page *p = pages[i];
-			SetPageUptodate(p);
-			ClearPageChecked(p);
-			set_page_dirty(p);
-		}
-	} else {
-		u64 aligned_end;
-		/* step one, delete the existing extents in this range */
-		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
-		err = btrfs_drop_extents(trans, root, inode, start_pos,
-					 aligned_end, aligned_end, &hint_byte);
-		if (err)
-			goto failed;
-		if (isize > inline_size)
-			inline_size = min_t(u64, isize, aligned_end);
-		inline_size -= start_pos;
-		err = insert_inline_extent(trans, root, inode, start_pos,
-					   inline_size, pages, 0, num_pages);
-		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-		BUG_ON(err);
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-		/*
-		 * an ugly way to do all the prop accounting around
-		 * the page bits and mapping tags
-		 */
-		set_page_writeback(pages[0]);
-		end_page_writeback(pages[0]);
-		did_inline = 1;
+	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = pages[i];
+		SetPageUptodate(p);
+		ClearPageChecked(p);
+		set_page_dirty(p);
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
-		if (did_inline)
-			BTRFS_I(inode)->disk_i_size = end_pos;
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	int ret;
 	int testend = 1;
 	unsigned long flags;
+	int compressed = 0;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(em);
 			continue;
 		}
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
 
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->start = em->start;
 			split->len = start - em->start;
 			split->block_start = em->block_start;
+
+			if (compressed)
+				split->block_len = em->block_len;
+			else
+				split->block_len = split->len;
+
 			split->bdev = em->bdev;
 			split->flags = flags;
 			ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 
-			split->block_start = em->block_start + diff;
+			if (compressed) {
+				split->block_len = em->block_len;
+				split->block_start = em->block_start;
+			} else {
+				split->block_len = split->len;
+				split->block_start = em->block_start + diff;
+			}
 
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			struct btrfs_item *item;
 			item = btrfs_item_nr(leaf, slot);
 			extent_end = found_key.offset +
-			     btrfs_file_extent_inline_len(leaf, item);
+			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
 				~((u64)root->sectorsize -1 );
 		}
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 extent_end = 0;
 	u64 search_start = start;
 	u64 leaf_start;
+	u64 ram_bytes = 0;
+	u8 compression = 0;
+	u8 encryption = 0;
+	u16 other_encoding = 0;
 	u64 root_gen;
 	u64 root_owner;
 	struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int recow;
 	int ret;
 
+	inline_limit = 0;
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
+			compression = btrfs_file_extent_compression(leaf,
+								    extent);
+			encryption = btrfs_file_extent_encryption(leaf,
+								  extent);
+			other_encoding = btrfs_file_extent_other_encoding(leaf,
+								  extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				extent_end =
 				     btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
 
 				extent_end = key.offset +
 				     btrfs_file_extent_num_bytes(leaf, extent);
+				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+								extent);
 				found_extent = 1;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item;
-				item = btrfs_item_nr(leaf, slot);
 				found_inline = 1;
 				extent_end = key.offset +
-				     btrfs_file_extent_inline_len(leaf, item);
+				     btrfs_file_extent_inline_len(leaf, extent);
 			}
 		} else {
 			extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
-		if (end <= extent_end && start >= key.offset && found_inline) {
+
+		if (end <= extent_end && start >= key.offset && found_inline)
 			*hint_byte = EXTENT_MAP_INLINE;
-			goto out;
-		}
 
 		if (found_extent) {
 			read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
 			write_extent_buffer(leaf, &old,
 					    (unsigned long)extent, sizeof(old));
 
+			btrfs_set_file_extent_compression(leaf, extent,
+							  compression);
+			btrfs_set_file_extent_encryption(leaf, extent,
+							 encryption);
+			btrfs_set_file_extent_other_encoding(leaf, extent,
+							     other_encoding);
 			btrfs_set_file_extent_offset(leaf, extent,
 				    le64_to_cpu(old.offset) + end - key.offset);
 			WARN_ON(le64_to_cpu(old.num_bytes) <
 				(extent_end - end));
 			btrfs_set_file_extent_num_bytes(leaf, extent,
 							extent_end - end);
+
+			/*
+			 * set the ram bytes to the size of the full extent
+			 * before splitting.  This is a worst case flag,
+			 * but it's the best we can do because we don't know
+			 * how splitting affects compression
+			 */
+			btrfs_set_file_extent_ram_bytes(leaf, extent,
+							ram_bytes);
 			btrfs_set_file_extent_type(leaf, extent,
 						   BTRFS_FILE_EXTENT_REG);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d..9797592dc86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
 #include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
+#include "compression.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 };
 
 static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	return ret;
 }
 
+/*
+ * insert an inline file extent item keyed at 'start' and copy 'size' bytes
+ * of file data into the leaf, or 'compressed_size' bytes from
+ * compressed_pages when both are supplied.  The caller should have done a
+ * btrfs_drop_extents so that no overlapping inline items exist in the btree.
+ * Returns 0 on success or -ENOMEM if the path allocation fails.
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
+				u64 start, size_t size, size_t compressed_size,
+				struct page **compressed_pages)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct page *page = NULL;
+	char *kaddr;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	int err = 0;
+	int ret;
+	size_t cur_size = size;
+	size_t datasize;
+	unsigned long offset;
+	int use_compress = 0;
+
+	if (compressed_size && compressed_pages) {
+		use_compress = 1;
+		cur_size = compressed_size;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	key.objectid = inode->i_ino;
+	key.offset = start;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	datasize = btrfs_file_extent_calc_inline_size(cur_size);
+	/* account the uncompressed size against the inode exactly once */
+	inode_add_bytes(inode, size);
+	ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		printk("got bad ret %d\n", ret);
+		goto fail;
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	if (use_compress) {
+		struct page *cpage;
+		int i = 0;
+		while(compressed_size > 0) {
+			cpage = compressed_pages[i];
+			cur_size = min(compressed_size, PAGE_CACHE_SIZE);
+
+			kaddr = kmap(cpage);
+			write_extent_buffer(leaf, kaddr, ptr, cur_size);
+			kunmap(cpage);
+
+			i++;
+			ptr += cur_size;
+			compressed_size -= cur_size;
+		}
+		btrfs_set_file_extent_compression(leaf, ei,
+						  BTRFS_COMPRESS_ZLIB);
+	} else {
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		btrfs_set_file_extent_compression(leaf, ei, 0);
+		kaddr = kmap_atomic(page, KM_USER0);
+		offset = start & (PAGE_CACHE_SIZE - 1);
+		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_cache_release(page);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	btrfs_update_inode(trans, root, inode);
+	return 0;
+fail:
+	btrfs_free_path(path);
+	return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This
+ * does the checks required to make sure the data is small enough
+ * to fit as an inline extent.  Returns 0 if the inline extent was
+ * inserted and 1 if the range does not qualify for inlining; a
+ * failure while dropping or inserting extents is a BUG.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct inode *inode, u64 start, u64 end,
+				 size_t compressed_size,
+				 struct page **compressed_pages)
+{
+	u64 isize = i_size_read(inode);
+	u64 actual_end = min(end + 1, isize);
+	u64 inline_len = actual_end - start;
+	u64 aligned_end = (end + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
+	u64 hint_byte;
+	u64 data_len = inline_len;
+	int ret;
+
+	if (compressed_size)
+		data_len = compressed_size;
+
+	if (start > 0 ||
+	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    (!compressed_size &&
+	    (actual_end & (root->sectorsize - 1)) == 0) ||
+	    end + 1 < isize ||
+	    data_len > root->fs_info->max_inline) {
+		return 1;
+	}
+
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 aligned_end, aligned_end, &hint_byte);
+	BUG_ON(ret);
+	/* end + 1 >= isize here, so inline_len already stops at isize */
+	ret = insert_inline_extent(trans, root, inode, start, inline_len,
+				   compressed_size, compressed_pages);
+	BUG_ON(ret);
+	/* the cache end is inclusive, stop one byte short of aligned_end */
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+	return 0;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
  * allocate extents on disk for the range, and create ordered data structs
  * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
  */
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+			  u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
+	unsigned long ram_size;
+	u64 orig_start;
+	u64 disk_num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_num_bytes;
+	u64 actual_end;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
+	struct page **pages = NULL;
+	unsigned long nr_pages;
+	unsigned long nr_pages_ret = 0;
+	unsigned long total_compressed = 0;
+	unsigned long total_in = 0;
+	unsigned long max_compressed = 128 * 1024;
+	unsigned long max_uncompressed = 256 * 1024;
+	int i;
+	int will_compress;
 
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	orig_start = start;
+
+	/*
+	 * compression made this loop a bit ugly, but the basic idea is to
+	 * compress some pages but keep the total size of the compressed
+	 * extent relatively small.  If compression is off, this goto target
+	 * is never used.
+	 */
+again:
+	will_compress = 0;
+	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	actual_end = min_t(u64, i_size_read(inode), end + 1);
+	total_compressed = actual_end - start;
+
+	/* we want to make sure that amount of ram required to uncompress
+	 * an extent is reasonable, so we limit the total size in ram
+	 * of a compressed extent to 256k
+	 */
+	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	orig_num_bytes = num_bytes;
+	disk_num_bytes = num_bytes;
+	total_in = 0;
+	ret = 0;
 
-	if (alloc_hint == EXTENT_MAP_INLINE)
-		goto out;
+	/* we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress
+	 */
+	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+	    btrfs_test_opt(root, COMPRESS)) {
+		WARN_ON(pages);
+		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+		/* we want to make sure the amount of IO required to satisfy
+		 * a random read is reasonably small, so we limit the size
+		 * of a compressed extent to 128k
+		 */
+		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+						total_compressed, pages,
+						nr_pages, &nr_pages_ret,
+						&total_in,
+						&total_compressed,
+						max_compressed);
+
+		if (!ret) {
+			unsigned long offset = total_compressed &
+				(PAGE_CACHE_SIZE - 1);
+			struct page *page = pages[nr_pages_ret - 1];
+			char *kaddr;
+
+			/* zero the tail end of the last page, we might be
+			 * sending it down to disk
+			 */
+			if (offset) {
+				kaddr = kmap_atomic(page, KM_USER0);
+				memset(kaddr + offset, 0,
+				       PAGE_CACHE_SIZE - offset);
+				kunmap_atomic(kaddr, KM_USER0);
+			}
+			will_compress = 1;
+		}
+	}
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		if (ret || total_in < (end - start + 1)) {
+			/* we didn't compress the entire range, try
+			 * to make an uncompressed inline extent.  This
+			 * is almost sure to fail, but maybe inline sizes
+			 * will get bigger later
+			 */
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end, 0, NULL);
+		} else {
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end,
+						    total_compressed, pages);
+		}
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start, end, NULL,
+						     1, 1, 1);
+			*page_started = 1;
+			ret = 0;
+			goto free_pages_out;
+		}
+	}
+
+	if (will_compress) {
+		/*
+		 * we aren't doing an inline extent, so round the compressed
+		 * size up to a block size boundary so the allocator does sane
+		 * things
+		 */
+		total_compressed = (total_compressed + blocksize - 1) &
+			~(blocksize - 1);
+
+		/*
+		 * one last check to make sure the compression is really a
+		 * win, compare the page count read with the blocks on disk
+		 */
+		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+			~(PAGE_CACHE_SIZE - 1);
+		if (total_compressed >= total_in) {
+			will_compress = 0;
+		} else {
+			disk_num_bytes = total_compressed;
+			num_bytes = total_in;
+		}
+	}
+	if (!will_compress && pages) {
+		/*
+		 * the compression code ran but failed to make things smaller,
+		 * free any pages it allocated and our page pointer array
+		 */
+		for (i = 0; i < nr_pages_ret; i++) {
+			page_cache_release(pages[i]);
+		}
+		kfree(pages);
+		pages = NULL;
+		total_compressed = 0;
+		nr_pages_ret = 0;
+
+		/* flag the file so we don't compress in the future */
+		btrfs_set_flag(inode, NOCOMPRESS);
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
-	while(num_bytes > 0) {
-		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+	while(disk_num_bytes > 0) {
+		unsigned long min_bytes;
+
+		/*
+		 * the max size of a compressed extent is pretty small,
+		 * make the code a little less complex by forcing
+		 * the allocator to find a whole compressed extent at once
+		 */
+		if (will_compress)
+			min_bytes = disk_num_bytes;
+		else
+			min_bytes = root->sectorsize;
+
+		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, alloc_hint,
+					   min_bytes, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
-			goto out;
+			goto free_pages_out_fail;
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
-		em->len = ins.offset;
+
+		if (will_compress) {
+			ram_size = num_bytes;
+			em->len = num_bytes;
+		} else {
+			/* ramsize == disk size */
+			ram_size = ins.offset;
+			em->len = ins.offset;
+		}
+
 		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+
 		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+		if (will_compress)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 				break;
 			}
 			btrfs_drop_extent_cache(inode, start,
-						start + ins.offset - 1, 0);
+						start + ram_size - 1, 0);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ins.offset, 0);
+					       ram_size, cur_alloc_size, 0,
+					       will_compress);
 		BUG_ON(ret);
-		if (num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+		if (disk_num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
 			       cur_alloc_size);
 			break;
 		}
+
+		if (will_compress) {
+			/*
+			 * we're doing compression, and we need to
+			 * submit the compressed extents down to the device.
+			 *
+			 * We lock down all the file pages, clearing their
+			 * dirty bits and setting them writeback.  Everyone
+			 * that wants to modify the page will wait on the
+			 * ordered extent above.
+			 *
+			 * The writeback bits on the file pages are
+			 * cleared when the compressed pages are on disk
+			 */
+			btrfs_end_transaction(trans, root);
+
+			if (start <= page_offset(locked_page) &&
+			    page_offset(locked_page) < start + ram_size) {
+				*page_started = 1;
+			}
+
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start,
+						     start + ram_size - 1,
+						     NULL, 1, 1, 0);
+
+			ret = btrfs_submit_compressed_write(inode, start,
+						 ram_size, ins.objectid,
+						 cur_alloc_size, pages,
+						 nr_pages_ret);
+
+			BUG_ON(ret);
+			trans = btrfs_join_transaction(root, 1);
+			if (start + ram_size < end) {
+				start += ram_size;
+				alloc_hint = ins.objectid + ins.offset;
+				/* pages will be freed at end_bio time */
+				pages = NULL;
+				goto again;
+			} else {
+				/* we've written everything, time to go */
+				break;
+			}
+		}
+		/* we're not doing compressed IO, don't unlock the first
+		 * page (which the caller expects to stay locked), don't
+		 * clear any dirty bits and don't set any writeback bits
+		 */
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					     start, start + ram_size - 1,
+					     locked_page, 0, 0, 0);
+		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+
+	ret = 0;
 out:
 	btrfs_end_transaction(trans, root);
+
 	return ret;
+
+free_pages_out_fail:
+	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+				     start, end, locked_page, 0, 0, 0);
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++)
+		page_cache_release(pages[i]);
+	if (pages)
+		kfree(pages);
+
+	goto out;
 }
 
 /*
@@ -203,7 +591,8 @@ out:
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	u64 extent_start;
 	u64 extent_end;
@@ -260,6 +649,11 @@ again:
 		extent_end = extent_start + extent_num_bytes;
 		err = 0;
 
+		if (btrfs_file_extent_compression(leaf, item) ||
+		    btrfs_file_extent_encryption(leaf,item) ||
+		    btrfs_file_extent_other_encoding(leaf, item))
+			goto not_found;
+
 		if (loops && start != extent_start)
 			goto not_found;
 
@@ -284,7 +678,8 @@ again:
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		extent_num_bytes = min(end + 1, extent_end) - start;
 		ret = btrfs_add_ordered_extent(inode, start, bytenr,
-						extent_num_bytes, 1);
+						extent_num_bytes,
+						extent_num_bytes, 1, 0);
 		if (ret) {
 			err = ret;
 			goto out;
@@ -300,7 +695,8 @@ again:
 not_found:
 		btrfs_end_transaction(trans, root);
 		btrfs_free_path(path);
-		return cow_file_range(inode, start, end);
+		return cow_file_range(inode, locked_page, start, end,
+				      page_started);
 	}
 out:
 	WARN_ON(err);
@@ -312,16 +708,19 @@ out:
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
-		ret = run_delalloc_nocow(inode, start, end);
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started);
 	else
-		ret = cow_file_range(inode, start, end);
+		ret = cow_file_range(inode, locked_page, start, end,
+				     page_started);
 
 	return ret;
 }
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
  * we don't create bios that span stripes or chunks
  */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio)
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * are inserted into the btree
  */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
  * or reading the csums from the tree before a read
  */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	if (!(rw & (1 << BIO_RW))) {
 		btrfs_lookup_bio_sums(root, inode, bio);
+
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
+			return btrfs_submit_compressed_read(inode, bio,
+						    mirror_num, bio_flags);
+		}
+
 		goto mapit;
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   __btrfs_submit_bio_hook);
+				   bio_flags, __btrfs_submit_bio_hook);
 mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
@@ -539,7 +945,7 @@ out_page:
  * good idea.  This causes problems because we want to make sure COW
  * properly happens and the data=ordered rules are followed.
  *
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
  * hasn't been properly setup for IO.  We kick off an async process
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
 					  ordered_extent->start);
 	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-					     ordered_extent->len);
+					     ordered_extent->disk_len);
 	btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+		btrfs_set_file_extent_compression(leaf, extent_item, 1);
+	else
+		btrfs_set_file_extent_compression(leaf, extent_item, 0);
+	btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+	/* ram bytes = extent_num_bytes for now */
 	btrfs_set_file_extent_num_bytes(leaf, extent_item,
 					ordered_extent->len);
+	btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+					ordered_extent->len);
 	btrfs_mark_buffer_dirty(leaf);
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->len;
+	ins.offset = ordered_extent->disk_len;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 					  root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
+	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			bio_flags = EXTENT_BIO_COMPRESSED;
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		rw = READ;
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror);
+						      failrec->last_mirror,
+						      bio_flags);
 	return 0;
 }
 
@@ -1644,10 +2065,8 @@ search_again:
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item = btrfs_item_nr(leaf,
-							        path->slots[0]);
 				item_end += btrfs_file_extent_inline_len(leaf,
-									 item);
+									 fi);
 			}
 			item_end--;
 		}
@@ -1715,7 +2134,14 @@ search_again:
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			if (!del_item) {
+			/*
+			 * we can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_compression(leaf, fi) == 0 &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
 				u32 size = new_size - found_key.offset;
 
 				if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size, 0);
+						       hole_size, 0, hole_size,
+						       0, 0, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						(u64)-1, 0);
 			btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	start_diff = map_start - em->start;
 	em->start = map_start;
 	em->len = map_len;
-	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 		em->block_start += start_diff;
+		em->block_len -= start_diff;
+	}
 	return add_extent_mapping(em_tree, em);
 }
 
+/* inflate an inline zlib extent into page; inflate errors zero the range */
+static noinline int uncompress_inline(struct btrfs_path *path,
+				      struct inode *inode, struct page *page,
+				      size_t pg_offset, u64 extent_offset,
+				      struct btrfs_file_extent_item *item)
+{
+	int ret;
+	struct extent_buffer *leaf = path->nodes[0];
+	char *tmp;
+	size_t max_size;
+	unsigned long inline_size, ptr;
+
+	WARN_ON(pg_offset != 0);
+	max_size = btrfs_file_extent_ram_bytes(leaf, item);
+	inline_size = btrfs_file_extent_inline_item_len(leaf,
+					btrfs_item_nr(leaf, path->slots[0]));
+	tmp = kmalloc(inline_size, GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;	/* caller BUG_ON()s, but no NULL deref */
+	ptr = btrfs_file_extent_inline_start(item);
+	read_extent_buffer(leaf, tmp, ptr, inline_size);
+	max_size = min(PAGE_CACHE_SIZE, max_size);
+	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+				    inline_size, max_size);
+	if (ret) {
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		unsigned long copy_size = min_t(u64,
+				  PAGE_CACHE_SIZE - pg_offset,
+				  max_size - extent_offset);
+		memset(kaddr + pg_offset, 0, copy_size);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+	kfree(tmp);
+	return 0;
+}
+
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
+	int compressed;
 
 again:
 	spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 
 	if (!path) {
 		path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
 
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
+	compressed = btrfs_file_extent_compression(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
 			em->block_start = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		bytenr += btrfs_file_extent_offset(leaf, item);
-		em->block_start = bytenr;
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
+		if (compressed) {
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->block_start = bytenr;
+			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+									 item);
+		} else {
+			bytenr += btrfs_file_extent_offset(leaf, item);
+			em->block_start = bytenr;
+			em->block_len = em->len;
+		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		u64 page_start;
@@ -3018,8 +3495,7 @@ again:
 		size_t extent_offset;
 		size_t copy_size;
 
-		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
-						    path->slots[0]));
+		size = btrfs_file_extent_inline_len(leaf, item);
 		extent_end = (extent_start + size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
 		}
 		em->block_start = EXTENT_MAP_INLINE;
 
-		if (!page) {
+		if (!page || create) {
 			em->start = extent_start;
-			em->len = size;
+			em->len = (size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 			goto out;
 		}
 
@@ -3048,11 +3525,22 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		map = kmap(page);
+		if (compressed)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
-			read_extent_buffer(leaf, map + pg_offset, ptr,
-					   copy_size);
+			if (btrfs_file_extent_compression(leaf, item) ==
+			    BTRFS_COMPRESS_ZLIB) {
+				ret = uncompress_inline(path, inode, page,
+							pg_offset,
+							extent_offset, item);
+				BUG_ON(ret);
+			} else {
+				map = kmap(page);
+				read_extent_buffer(leaf, map + pg_offset, ptr,
+						   copy_size);
+				kunmap(page);
+			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
 			if (!trans) {
@@ -3063,11 +3551,12 @@ again:
 				trans = btrfs_join_transaction(root, 1);
 				goto again;
 			}
+			map = kmap(page);
 			write_extent_buffer(leaf, map + pg_offset, ptr,
 					    copy_size);
+			kunmap(page);
 			btrfs_mark_buffer_dirty(leaf);
 		}
-		kunmap(page);
 		set_extent_uptodate(io_tree, em->start,
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 	btrfs_set_file_extent_type(leaf, ei,
 				   BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_compression(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
 	ptr = btrfs_file_extent_inline_start(ei);
 	write_extent_buffer(leaf, symname, ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c..b5745bb96d4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow)
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	entry->disk_len = disk_len;
 	entry->inode = inode;
 	if (nocow)
 		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+	if (compressed)
+		set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
 
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * for pdflush to find them
 	 */
 	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
-	if (wait)
+	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
+	}
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a14..1ef464145d2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
 	/* disk byte number */
 	u64 start;
 
-	/* length of the extent in bytes */
+	/* ram length of the extent in bytes */
 	u64 len;
 
+	/* extent length on disk */
+	u64 disk_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow);
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f..64725c13aa1 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
 				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, item));
+			           btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
 			printk("\t\textent data disk bytenr %llu nr %llu\n",
 			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
 			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu\n",
+			printk("\t\textent data offset %llu nr %llu ram %llu\n",
 			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7..431fdf144b5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
 #include "volumes.h"
 #include "version.h"
 #include "export.h"
+#include "compression.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
+	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
 	{Opt_noacl, "noacl"},
 	{Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
+		case Opt_compress:
+			printk(KERN_INFO "btrfs: use compression\n");
+			btrfs_set_opt(info->mount_opt, COMPRESS);
+			break;
 		case Opt_ssd:
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_interface_init();
 	if (err)
 		goto free_extent_map;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
+	btrfs_zlib_exit();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34..e6d579053a4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	if (found_type == BTRFS_FILE_EXTENT_REG)
 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		size = btrfs_file_extent_inline_len(eb,
-						    btrfs_item_nr(eb, slot));
+		size = btrfs_file_extent_inline_len(eb, item);
 		extent_end = (start + size + mask) & ~mask;
 	} else {
 		ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51..7db4cfd03a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
 	em->start = key.offset;
 	em->len = *num_bytes;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->start = logical;
 	em->len = length;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 00000000000..e99309180a1
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+	avail_out = *dstlen - 12 and flush == Z_FINISH.
+	If it doesn't manage to finish,	call it again with
+	avail_in == 0 and avail_out set to the remaining 12
+	bytes for it to clean up.
+   Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+	z_stream inf_strm;	/* inflate (decompress) stream state */
+	z_stream def_strm;	/* deflate (compress) stream state */
+	char *buf;		/* PAGE_CACHE_SIZE inflate output buffer */
+	struct list_head list;	/* entry on idle_workspace while unused */
+};
+
+static LIST_HEAD(idle_workspace);	/* workspaces not currently in use */
+static DEFINE_SPINLOCK(workspace_lock);	/* guards the idle list + count */
+static unsigned long num_workspace;	/* count of idle_workspace entries */
+static atomic_t alloc_workspace = ATOMIC_INIT(0);	/* total allocated */
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);	/* waiters for a workspace */
+
+/*
+ * this finds an idle zlib workspace or allocates a new one, waiting while
+ * too many are allocated.  An ERR_PTR (never NULL) is returned on failure.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+	struct workspace *workspace;
+	int ret;
+	int cpus = num_online_cpus();
+
+again:
+	spin_lock(&workspace_lock);
+	if (!list_empty(&idle_workspace)) {
+		workspace = list_entry(idle_workspace.next, struct workspace,
+				       list);
+		list_del(&workspace->list);
+		num_workspace--;
+		spin_unlock(&workspace_lock);
+		return workspace;
+
+	}
+	spin_unlock(&workspace_lock);
+	if (atomic_read(&alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&alloc_workspace) > cpus)
+			schedule();
+		finish_wait(&workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(&alloc_workspace);	/* counted before it is allocated */
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+	if (!workspace->def_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+	if (!workspace->inf_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail_inflate;
+	}
+	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+	if (!workspace->buf) {
+		ret = -ENOMEM;
+		goto fail_kmalloc;
+	}
+	return workspace;
+
+fail_kmalloc:
+	vfree(workspace->inf_strm.workspace);
+fail_inflate:
+	vfree(workspace->def_strm.workspace);
+fail:
+	kfree(workspace);	/* NULL here if kzalloc itself failed */
+	atomic_dec(&alloc_workspace);	/* undo the count taken above */
+	wake_up(&workspace_wait);	/* let a waiter retry */
+	return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around.  Always returns 0.
+ */
+static int free_workspace(struct workspace *workspace)
+{
+	spin_lock(&workspace_lock);
+	if (num_workspace < num_online_cpus()) {
+		list_add_tail(&workspace->list, &idle_workspace);
+		num_workspace++;
+		spin_unlock(&workspace_lock);
+		if (waitqueue_active(&workspace_wait))
+			wake_up(&workspace_wait);
+		return 0;
+	}
+	spin_unlock(&workspace_lock);
+	vfree(workspace->def_strm.workspace);
+	vfree(workspace->inf_strm.workspace);
+	kfree(workspace->buf);
+	kfree(workspace);
+
+	atomic_dec(&alloc_workspace);	/* only dropped when really freed */
+	if (waitqueue_active(&workspace_wait))
+		wake_up(&workspace_wait);
+	return 0;
+}
+
+/*
+ * cleanup function for module exit, frees all of the idle workspaces
+ */
+static void free_workspaces(void)
+{
+	struct workspace *workspace;
+	while(!list_empty(&idle_workspace)) {	/* workspace_lock not taken */
+		workspace = list_entry(idle_workspace.next, struct workspace,
+				       list);
+		list_del(&workspace->list);
+		vfree(workspace->def_strm.workspace);
+		vfree(workspace->inf_strm.workspace);
+		kfree(workspace->buf);
+		kfree(workspace);
+		atomic_dec(&alloc_workspace);
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages.  Returns 0 on success and -1 on any failure.
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out)
+{
+	int ret;
+	struct workspace *workspace;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	unsigned long bytes_left;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	workspace = find_zlib_workspace();
+	if (IS_ERR(workspace))	/* failures come back as ERR_PTR, not NULL */
+		return -1;
+
+	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+		printk(KERN_WARNING "deflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	workspace->def_strm.total_in = 0;
+	workspace->def_strm.total_out = 0;
+
+	/*
+	 * NOTE(review): the find_get_page() and alloc_page() results here
+	 * and in the loop below are mapped without a NULL check.
+	 */
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	cpage_out = kmap(out_page);
+	pages[0] = out_page;
+	nr_pages = 1;
+
+	workspace->def_strm.next_in = data_in;
+	workspace->def_strm.next_out = cpage_out;
+	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+	while (workspace->def_strm.total_in < len) {
+		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+		if (ret != Z_OK) {
+			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			       ret);
+			zlib_deflateEnd(&workspace->def_strm);
+			ret = -1;
+			goto out;
+		}
+
+		/* we're making it bigger, give up */
+		if (workspace->def_strm.total_in > 8192 &&
+		    workspace->def_strm.total_in <
+		    workspace->def_strm.total_out) {
+			ret = -1;
+			goto out;
+		}
+		/* we need another page for writing out.  Test this
+		 * before the total_in so we will pull in a new page for
+		 * the stream end if required
+		 */
+		if (workspace->def_strm.avail_out == 0) {
+			kunmap(out_page);
+			if (nr_pages == nr_dest_pages) {
+				out_page = NULL;
+				ret = -1;
+				goto out;
+			}
+			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			cpage_out = kmap(out_page);
+			pages[nr_pages] = out_page;
+			nr_pages++;
+			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+			workspace->def_strm.next_out = cpage_out;
+		}
+		/* we're all done */
+		if (workspace->def_strm.total_in >= len)
+			break;
+
+		/* we've read in a full page, get a new one */
+		if (workspace->def_strm.avail_in == 0) {
+			if (workspace->def_strm.total_out > max_out)
+				break;
+
+			bytes_left = len - workspace->def_strm.total_in;
+			kunmap(in_page);
+			page_cache_release(in_page);
+
+			start += PAGE_CACHE_SIZE;
+			in_page = find_get_page(mapping,
+						start >> PAGE_CACHE_SHIFT);
+			data_in = kmap(in_page);
+			workspace->def_strm.avail_in = min(bytes_left,
+							   PAGE_CACHE_SIZE);
+			workspace->def_strm.next_in = data_in;
+		}
+	}
+	/* no more input is offered: finish the stream with Z_FINISH */
+	workspace->def_strm.avail_in = 0;
+	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+	zlib_deflateEnd(&workspace->def_strm);
+
+	if (ret != Z_STREAM_END) {
+		ret = -1;
+		goto out;
+	}
+
+	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+		ret = -1;
+		goto out;
+	}
+
+	ret = 0;
+	*total_out = workspace->def_strm.total_out;
+	*total_in = workspace->def_strm.total_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * Inflate a zlib stream held in pages_in and copy the decompressed bytes
+ * into the pages of a bio_vec.
+ *
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ *
+ * Data is inflated one PAGE_CACHE_SIZE chunk at a time into workspace->buf
+ * and the parts of each chunk that overlap a bvec page are copied out.
+ *
+ * Returns 0 when the stream ended with Z_STREAM_END or when every page in
+ * bvec has been filled, -ENOMEM when no workspace could be found, and -1
+ * for any other failure.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	char *data_in;
+	unsigned char *hdr;
+	size_t total_out = 0;
+	unsigned long page_bytes_left;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	struct page *page_out;
+	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+					PAGE_CACHE_SIZE;
+	unsigned long buf_start;
+	unsigned long buf_offset;
+	unsigned long bytes;
+	unsigned long working_bytes;
+	unsigned long pg_offset;
+	unsigned long start_byte;
+	unsigned long current_buf_start;
+	char *kaddr;
+
+	workspace = find_zlib_workspace();
+	if (!workspace)
+		return -ENOMEM;
+
+	data_in = kmap(pages_in[page_in_index]);
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.total_out = 0;
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	page_out = bvec[page_out_index].bv_page;
+	page_bytes_left = PAGE_CACHE_SIZE;
+	pg_offset = 0;
+
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	/*
+	 * The two header bytes must be read as unsigned values: with a
+	 * signed char a flag byte >= 0x80 turns negative and the
+	 * "multiple of 31" header test below fails for a valid header.
+	 */
+	hdr = (unsigned char *)data_in;
+	if (srclen > 2 && !(hdr[1] & PRESET_DICT) &&
+	    ((hdr[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((hdr[0]<<8) + hdr[1]) % 31)) {
+
+		wbits = -((hdr[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		/*
+		 * the "out" label sits below the kunmap at "done", so the
+		 * first input page has to be unmapped here
+		 */
+		kunmap(pages_in[page_in_index]);
+		ret = -1;
+		goto out;
+	}
+	while(workspace->inf_strm.total_in < srclen) {
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END) {
+			break;
+		}
+
+		/*
+		 * buf_start is the offset, within the decompressed stream,
+		 * of the first byte held in our workspace buffer
+		 */
+		buf_start = total_out;
+
+		/*
+		 * total_out is the stream offset just past the last byte
+		 * held in the workspace buffer
+		 */
+		total_out = workspace->inf_strm.total_out;
+
+		working_bytes = total_out - buf_start;
+
+		/*
+		 * start byte is the first byte of the page we're currently
+		 * copying into relative to the start of the compressed data.
+		 */
+		start_byte = page_offset(page_out) - disk_start;
+
+		if (working_bytes == 0) {
+			/* we didn't make progress in this inflate
+			 * call, we're done
+			 */
+			if (ret != Z_STREAM_END)
+				ret = -1;
+			break;
+		}
+
+		/* we haven't yet hit data corresponding to this page */
+		if (total_out <= start_byte) {
+			goto next;
+		}
+
+		/*
+		 * the start of the data we care about is offset into
+		 * the middle of our working buffer
+		 */
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+			working_bytes -= buf_offset;
+		} else {
+			buf_offset = 0;
+		}
+		current_buf_start = buf_start;
+
+		/* copy bytes from the working buffer into the pages */
+		while(working_bytes > 0) {
+			bytes = min(PAGE_CACHE_SIZE - pg_offset,
+				    PAGE_CACHE_SIZE - buf_offset);
+			bytes = min(bytes, working_bytes);
+			kaddr = kmap_atomic(page_out, KM_USER0);
+			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+			       bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page_out);
+
+			pg_offset += bytes;
+			page_bytes_left -= bytes;
+			buf_offset += bytes;
+			working_bytes -= bytes;
+			current_buf_start += bytes;
+
+			/* check if we need to pick another page */
+			if (page_bytes_left == 0) {
+				page_out_index++;
+				if (page_out_index >= vcnt) {
+					/* every destination page is full */
+					ret = 0;
+					goto done;
+				}
+				page_out = bvec[page_out_index].bv_page;
+				pg_offset = 0;
+				page_bytes_left = PAGE_CACHE_SIZE;
+				start_byte = page_offset(page_out) - disk_start;
+
+				/*
+				 * make sure our new page is covered by this
+				 * working buffer
+				 */
+				if (total_out <= start_byte) {
+					goto next;
+				}
+
+				/* the next page in the biovec might not
+				 * be adjacent to the last page, but it
+				 * might still be found inside this working
+				 * buffer.  bump our offset pointer
+				 */
+				if (total_out > start_byte &&
+				    current_buf_start < start_byte) {
+					buf_offset = start_byte - buf_start;
+					working_bytes = total_out - start_byte;
+					current_buf_start = buf_start +
+						buf_offset;
+				}
+			}
+		}
+next:
+		/* reuse the workspace buffer for the next inflate call */
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+		/* the current input page is used up, move to the next one */
+		if (workspace->inf_strm.avail_in == 0) {
+			unsigned long tmp;
+			kunmap(pages_in[page_in_index]);
+			page_in_index++;
+			if (page_in_index >= total_pages_in) {
+				/* nothing is mapped any more, see "done" */
+				data_in = NULL;
+				break;
+			}
+			data_in = kmap(pages_in[page_in_index]);
+			workspace->inf_strm.next_in = data_in;
+			tmp = srclen - workspace->inf_strm.total_in;
+			workspace->inf_strm.avail_in = min(tmp,
+							   PAGE_CACHE_SIZE);
+		}
+	}
+	if (ret != Z_STREAM_END) {
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+done:
+	zlib_inflateEnd(&workspace->inf_strm);
+	if (data_in)
+		kunmap(pages_in[page_in_index]);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ *
+ * data_in points at srclen bytes of compressed data.
+ *
+ * start_byte is the offset we're interested in; it is compared against
+ * the inflate stream's total_out, i.e. it counts decompressed bytes.
+ *
+ * destlen is the number of bytes to place at the start of dest_page and
+ * may not exceed PAGE_CACHE_SIZE.
+ *
+ * Returns -ENOMEM when destlen is too large or no workspace could be
+ * found, -1 when inflate setup fails or the loop stops without
+ * Z_STREAM_END while bytes are still missing, and 0 otherwise.
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	unsigned long bytes_left = destlen;
+	unsigned long total_out = 0;
+	/*
+	 * write position inside dest_page.  It has to survive from one
+	 * inflate call to the next, otherwise a range that spans two
+	 * workspace buffers would be copied over the start of the page
+	 * twice instead of being appended.
+	 */
+	unsigned long pg_offset = 0;
+	char *kaddr;
+
+	if (destlen > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	workspace = find_zlib_workspace();
+	if (!workspace)
+		return -ENOMEM;
+
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = srclen;
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->inf_strm.total_out = 0;
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	while(bytes_left > 0) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END) {
+			break;
+		}
+
+		/* stream offsets covered by the workspace buffer */
+		buf_start = total_out;
+		total_out = workspace->inf_strm.total_out;
+
+		/* no output was produced by this call, give up */
+		if (total_out == buf_start) {
+			ret = -1;
+			break;
+		}
+
+		/* we haven't reached the range we were asked for yet */
+		if (total_out <= start_byte) {
+			goto next;
+		}
+
+		/* the range starts in the middle of this buffer */
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+		} else {
+			buf_offset = 0;
+		}
+
+		/*
+		 * NOTE(review): the copy length is limited by the page, the
+		 * workspace buffer size and bytes_left, but not by how many
+		 * bytes this inflate call actually produced -- confirm that
+		 * callers never ask for more than the stream contains.
+		 */
+		bytes = min(PAGE_CACHE_SIZE - pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, bytes_left);
+
+		kaddr = kmap_atomic(dest_page, KM_USER0);
+		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		pg_offset += bytes;
+		bytes_left -= bytes;
+next:
+		/* reuse the workspace buffer for the next inflate call */
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	}
+	if (ret != Z_STREAM_END && bytes_left != 0) {
+		ret = -1;
+	} else {
+		ret = 0;
+	}
+	zlib_inflateEnd(&workspace->inf_strm);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * exit hook for the zlib support code, it only has to hand the work
+ * over to free_workspaces()
+ */
+void btrfs_zlib_exit(void)
+{
+	free_workspaces();
+}
-- 
cgit v1.2.3


From f82d02d9d8222183b7945e893111a6d1bf67ae4a Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: Improve space balancing code

This patch improves the space balancing code to keep more sharing
of tree blocks. The only case that breaks sharing of tree blocks is
when data extents get fragmented during balancing. The main changes in
this patch are:

Add a 'drop sub-tree' function. This solves the problem in old code
that BTRFS_HEADER_FLAG_WRITTEN check breaks sharing of tree block.

Remove relocation mapping tree. Relocation mappings are stored in
struct btrfs_ref_path and updated dynamically during walking up/down
the reference path. This reduces CPU usage and simplifies code.

This patch also fixes a bug. Root items for reloc trees should be
updated in btrfs_free_reloc_root.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c       |  93 ++++++-------
 fs/btrfs/ctree.h       |  13 +-
 fs/btrfs/disk-io.c     |   2 -
 fs/btrfs/extent-tree.c | 364 ++++++++++++++++++++++++++++++-------------------
 fs/btrfs/transaction.c |   4 +-
 5 files changed, 277 insertions(+), 199 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9caeb377de6..73899d0f9d8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -287,7 +287,7 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		/*
 		 * There are only two places that can drop reference to
 		 * tree blocks owned by living reloc trees, one is here,
-		 * the other place is btrfs_merge_path. In both places,
+		 * the other place is btrfs_drop_subtree. In both places,
 		 * we check reference count while tree block is locked.
 		 * Furthermore, if reference count is one, it won't get
 		 * increased by someone else.
@@ -312,9 +312,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	}
 
 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_add_reloc_mapping(root, buf->start,
-					      buf->len, cow->start);
-		BUG_ON(ret);
 		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
 		WARN_ON(ret);
 	}
@@ -1627,61 +1624,57 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 		btrfs_node_key_to_cpu(eb, &key, slot);
 		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
 
+		if (generation == trans->transid) {
+			eb = read_tree_block(root, bytenr, blocksize,
+					     generation);
+			btrfs_tree_lock(eb);
+		}
+
 		/*
 		 * if node keys match and node pointer hasn't been modified
 		 * in the running transaction, we can merge the path. for
-		 * reloc trees, the node pointer check is skipped, this is
-		 * because the reloc trees are fully controlled by the space
-		 * balance code, no one else can modify them.
+		 * blocks owned by reloc trees, the node pointer check is
+		 * skipped, this is because these blocks are fully controlled
+		 * by the space balance code, no one else can modify them.
 		 */
 		if (!nodes[level - 1] || !key_match ||
 		    (generation == trans->transid &&
-		     root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
-next_level:
-			if (level == 1 || level == lowest_level + 1)
+		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
+			if (level == 1 || level == lowest_level + 1) {
+				if (generation == trans->transid) {
+					btrfs_tree_unlock(eb);
+					free_extent_buffer(eb);
+				}
 				break;
+			}
 
-			eb = read_tree_block(root, bytenr, blocksize,
-					     generation);
-			btrfs_tree_lock(eb);
+			if (generation != trans->transid) {
+				eb = read_tree_block(root, bytenr, blocksize,
+						generation);
+				btrfs_tree_lock(eb);
+			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
 					      &eb, 0);
 			BUG_ON(ret);
 
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID) {
+				if (!nodes[level - 1]) {
+					nodes[level - 1] = eb->start;
+					memcpy(&node_keys[level - 1], &key,
+					       sizeof(node_keys[0]));
+				} else {
+					WARN_ON(1);
+				}
+			}
+
 			btrfs_tree_unlock(parent);
 			free_extent_buffer(parent);
 			parent = eb;
 			continue;
 		}
 
-		if (generation == trans->transid) {
-			u32 refs;
-			BUG_ON(btrfs_header_owner(eb) !=
-			       BTRFS_TREE_RELOC_OBJECTID);
-			/*
-			 * lock the block to keep __btrfs_cow_block from
-			 * changing the reference count.
-			 */
-			eb = read_tree_block(root, bytenr, blocksize,
-					     generation);
-			btrfs_tree_lock(eb);
-
-			ret = btrfs_lookup_extent_ref(trans, root, bytenr,
-						      blocksize, &refs);
-			BUG_ON(ret);
-			/*
-			 * if replace block whose reference count is one,
-			 * we have to "drop the subtree". so skip it for
-			 * simplicity
-			 */
-			if (refs == 1) {
-				btrfs_tree_unlock(eb);
-				free_extent_buffer(eb);
-				goto next_level;
-			}
-		}
-
 		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
 		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
 		btrfs_mark_buffer_dirty(parent);
@@ -1693,16 +1686,24 @@ next_level:
 					btrfs_header_generation(parent),
 					level - 1);
 		BUG_ON(ret);
-		ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					level - 1, 1);
-		BUG_ON(ret);
 
+		/*
+		 * If the block was created in the running transaction,
+		 * it's possible this is the last reference to it, so we
+		 * should drop the subtree.
+		 */
 		if (generation == trans->transid) {
+			ret = btrfs_drop_subtree(trans, root, eb, parent);
+			BUG_ON(ret);
 			btrfs_tree_unlock(eb);
 			free_extent_buffer(eb);
+		} else {
+			ret = btrfs_free_extent(trans, root, bytenr,
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					level - 1, 1);
+			BUG_ON(ret);
 		}
 		break;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 793d8fdda24..117090995e7 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -684,7 +684,6 @@ struct btrfs_fs_info {
 	int thread_pool_size;
 
 	/* tree relocation relocated fields */
-	struct extent_io_tree reloc_mapping_tree;
 	struct list_head dead_reloc_roots;
 	struct btrfs_leaf_ref_tree reloc_ref_tree;
 	struct btrfs_leaf_ref_tree shared_ref_tree;
@@ -1636,13 +1635,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
-int btrfs_free_reloc_root(struct btrfs_root *root);
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
-int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
-			    u64 num_bytes, u64 new_bytenr);
-int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
-			    u64 num_bytes, u64 *new_bytenr);
-void btrfs_free_reloc_mappings(struct btrfs_root *root);
 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_buffer *buf, u64 orig_start);
@@ -1726,6 +1721,10 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			*root);
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct extent_buffer *node,
+			struct extent_buffer *parent);
 /* root-item.c */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dc95f636a11..796256440df 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1448,8 +1448,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-	extent_io_tree_init(&fs_info->reloc_mapping_tree,
-			    fs_info->btree_inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
 	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
 	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bbf04e80a1a..56e41369d71 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3031,6 +3031,95 @@ out:
 	return 0;
 }
 
+/*
+ * helper function for drop_subtree, this function is similar to
+ * walk_down_tree. The main difference is that it checks reference
+ * counts while tree blocks are locked.
+ *
+ * The walk starts at path->nodes[*level].  If that block has more than
+ * one reference, nothing below it is visited and only the final step
+ * (at the "out" label) runs.  Otherwise the loop follows the pointer at
+ * path->slots[*level]: the child is read and locked, and its reference
+ * count is looked up.  A child with more than one reference just has
+ * one reference freed and the slot is advanced; any other child becomes
+ * the new current block and is recorded in path->nodes, path->slots and
+ * path->locks.  The loop stops at a leaf (after btrfs_drop_leaf_ref and
+ * clean_tree_block) or at a node whose slots are used up (after
+ * clean_tree_block).
+ *
+ * The final step frees one reference on path->nodes[*level] on behalf
+ * of path->nodes[*level + 1], unlocks the block if path->locks says it
+ * is locked, releases the buffer, clears the path entry and moves
+ * *level up by one.
+ *
+ * Always returns 0; failures of the helpers trigger BUG_ON.
+ */
+static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path, int *level)
+{
+	struct extent_buffer *next;
+	struct extent_buffer *cur;
+	struct extent_buffer *parent;
+	u64 bytenr;
+	u64 ptr_gen;
+	u32 blocksize;
+	u32 refs;
+	int ret;
+
+	cur = path->nodes[*level];
+	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
+				      &refs);
+	BUG_ON(ret);
+	if (refs > 1)	/* start block is shared, only drop our reference */
+		goto out;
+
+	while (*level >= 0) {
+		cur = path->nodes[*level];
+		if (*level == 0) {	/* reached a leaf */
+			ret = btrfs_drop_leaf_ref(trans, root, cur);
+			BUG_ON(ret);
+			clean_tree_block(trans, root, cur);
+			break;
+		}
+		if (path->slots[*level] >= btrfs_header_nritems(cur)) {	/* no pointers left in this node */
+			clean_tree_block(trans, root, cur);
+			break;
+		}
+
+		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		blocksize = btrfs_level_size(root, *level - 1);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		btrfs_tree_lock(next);	/* lock first, then look at the refs */
+
+		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
+					      &refs);
+		BUG_ON(ret);
+		if (refs > 1) {	/* child is shared: drop one ref and skip it */
+			parent = path->nodes[*level];
+			ret = btrfs_free_extent(trans, root, bytenr,
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					*level - 1, 1);
+			BUG_ON(ret);
+			path->slots[*level]++;
+			btrfs_tree_unlock(next);
+			free_extent_buffer(next);
+			continue;
+		}
+
+		*level = btrfs_header_level(next);	/* descend into the child */
+		path->nodes[*level] = next;
+		path->slots[*level] = 0;
+		path->locks[*level] = 1;	/* next is still locked by us */
+		cond_resched();
+	}
+out:
+	parent = path->nodes[*level + 1];
+	bytenr = path->nodes[*level]->start;
+	blocksize = path->nodes[*level]->len;
+
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
+			parent->start, btrfs_header_owner(parent),
+			btrfs_header_generation(parent), *level, 1);
+	BUG_ON(ret);
+
+	if (path->locks[*level]) {	/* only set for blocks locked in the loop above */
+		btrfs_tree_unlock(path->nodes[*level]);
+		path->locks[*level] = 0;
+	}
+	free_extent_buffer(path->nodes[*level]);
+	path->nodes[*level] = NULL;
+	*level += 1;	/* continue from the parent's level */
+	cond_resched();
+	return 0;
+}
+
 /*
  * helper for dropping snapshots.  This walks back up the tree in the path
  * to find the first node higher up where we haven't yet gone through
@@ -3038,7 +3127,8 @@ out:
  */
 static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-				 struct btrfs_path *path, int *level)
+				 struct btrfs_path *path,
+				 int *level, int max_level)
 {
 	u64 root_owner;
 	u64 root_gen;
@@ -3047,7 +3137,7 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 	int slot;
 	int ret;
 
-	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+	for (i = *level; i < max_level && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
@@ -3070,12 +3160,18 @@ static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
 
 			root_owner = btrfs_header_owner(parent);
 			root_gen = btrfs_header_generation(parent);
+
+			clean_tree_block(trans, root, path->nodes[*level]);
 			ret = btrfs_free_extent(trans, root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len,
 						parent->start, root_owner,
 						root_gen, *level, 1);
 			BUG_ON(ret);
+			if (path->locks[*level]) {
+				btrfs_tree_unlock(path->nodes[*level]);
+				path->locks[*level] = 0;
+			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
 			*level = i + 1;
@@ -3145,7 +3241,8 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		if (wret < 0)
 			ret = wret;
 
-		wret = walk_up_tree(trans, root, path, &level);
+		wret = walk_up_tree(trans, root, path, &level,
+				    BTRFS_MAX_LEVEL);
 		if (wret > 0)
 			break;
 		if (wret < 0)
@@ -3168,6 +3265,50 @@ out:
 	return ret;
 }
 
+/*
+ * drop the subtree that starts at 'node', a block referenced from
+ * 'parent'.  Both buffers have to be locked by the caller, this is
+ * enforced with BUG_ON.
+ *
+ * A private path is seeded with the two buffers (an extra reference is
+ * taken on each of them) and then walk_down_subtree() and walk_up_tree()
+ * are called in turn until one of them returns something other than 0.
+ *
+ * Returns 0, or the negative value that ended the walk.
+ */
+int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct extent_buffer *node,
+			struct extent_buffer *parent)
+{
+	struct btrfs_path *path;
+	int parent_level;
+	int level;
+	int wret;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	/* the parent goes in with its slot set past the last item */
+	BUG_ON(!btrfs_tree_locked(parent));
+	parent_level = btrfs_header_level(parent);
+	extent_buffer_get(parent);
+	path->slots[parent_level] = btrfs_header_nritems(parent);
+	path->nodes[parent_level] = parent;
+
+	/* the walk itself begins at the first slot of node */
+	BUG_ON(!btrfs_tree_locked(node));
+	level = btrfs_header_level(node);
+	extent_buffer_get(node);
+	path->slots[level] = 0;
+	path->nodes[level] = node;
+
+	for (;;) {
+		wret = walk_down_subtree(trans, root, path, &level);
+		if (wret == 0)
+			wret = walk_up_tree(trans, root, path, &level,
+					    parent_level);
+		if (wret != 0)
+			break;
+	}
+	/* only a negative result is passed on to the caller */
+	if (wret < 0)
+		ret = wret;
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 static unsigned long calc_ra(unsigned long start, unsigned long last,
 			     unsigned long nr)
 {
@@ -3312,6 +3453,10 @@ struct btrfs_ref_path {
 	u32 num_refs;
 	int lowest_level;
 	int current_level;
+	int shared_level;
+
+	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
+	u64 new_nodes[BTRFS_MAX_LEVEL];
 };
 
 struct disk_extent {
@@ -3360,6 +3505,7 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
 	if (first_time) {
 		ref_path->lowest_level = -1;
 		ref_path->current_level = -1;
+		ref_path->shared_level = -1;
 		goto walk_up;
 	}
 walk_down:
@@ -3403,8 +3549,11 @@ walk_down:
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid == bytenr &&
-				found_key.type == BTRFS_EXTENT_REF_KEY)
+		    found_key.type == BTRFS_EXTENT_REF_KEY) {
+			if (level < ref_path->shared_level)
+				ref_path->shared_level = level;
 			goto found;
+		}
 next:
 		level--;
 		btrfs_release_path(extent_root, path);
@@ -3992,51 +4141,6 @@ out:
 	return ret;
 }
 
-int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
-			    u64 num_bytes, u64 new_bytenr)
-{
-	set_extent_bits(&root->fs_info->reloc_mapping_tree,
-			orig_bytenr, orig_bytenr + num_bytes - 1,
-			EXTENT_LOCKED, GFP_NOFS);
-	set_state_private(&root->fs_info->reloc_mapping_tree,
-			  orig_bytenr, new_bytenr);
-	return 0;
-}
-
-int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
-			    u64 num_bytes, u64 *new_bytenr)
-{
-	u64 bytenr;
-	u64 cur_bytenr = orig_bytenr;
-	u64 prev_bytenr = orig_bytenr;
-	int ret;
-
-	while (1) {
-		ret = get_state_private(&root->fs_info->reloc_mapping_tree,
-					cur_bytenr, &bytenr);
-		if (ret)
-			break;
-		prev_bytenr = cur_bytenr;
-		cur_bytenr = bytenr;
-	}
-
-	if (orig_bytenr == cur_bytenr)
-		return -ENOENT;
-
-	if (prev_bytenr != orig_bytenr) {
-		set_state_private(&root->fs_info->reloc_mapping_tree,
-				  orig_bytenr, cur_bytenr);
-	}
-	*new_bytenr = cur_bytenr;
-	return 0;
-}
-
-void btrfs_free_reloc_mappings(struct btrfs_root *root)
-{
-	clear_extent_bits(&root->fs_info->reloc_mapping_tree,
-			  0, (u64)-1, -1, GFP_NOFS);
-}
-
 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct extent_buffer *buf, u64 orig_start)
@@ -4222,15 +4326,30 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * detach the reloc tree from 'root' (if it has one), queue it on the
+ * fs_info dead_reloc_roots list and write its root item, refreshed
+ * with the current node and a cleared drop state, to the tree root.
+ *
+ * Always returns 0; a failing btrfs_update_root() triggers BUG_ON.
+ */
-int btrfs_free_reloc_root(struct btrfs_root *root)
+int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
 {
 	struct btrfs_root *reloc_root;
+	int ret;
 
 	if (root->reloc_root) {
 		reloc_root = root->reloc_root;
 		root->reloc_root = NULL;
 		list_add(&reloc_root->dead_list,
 			 &root->fs_info->dead_reloc_roots);
+
+		/*
+		 * every field set here belongs to the reloc tree's own
+		 * root item, which is the item written out below
+		 */
+		btrfs_set_root_bytenr(&reloc_root->root_item,
+				      reloc_root->node->start);
+		btrfs_set_root_level(&reloc_root->root_item,
+				     btrfs_header_level(reloc_root->node));
+		memset(&reloc_root->root_item.drop_progress, 0,
+			sizeof(struct btrfs_disk_key));
+		reloc_root->root_item.drop_level = 0;
+
+		ret = btrfs_update_root(trans, root->fs_info->tree_root,
+					&reloc_root->root_key,
+					&reloc_root->root_item);
+		BUG_ON(ret);
 	}
 	return 0;
 }
@@ -4356,8 +4475,6 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_root_refs(root_item, 0);
 	btrfs_set_root_bytenr(root_item, eb->start);
 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
-	memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
-	root_item->drop_level = 0;
 
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
@@ -4382,15 +4499,19 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
  * Core function of space balance.
  *
  * The idea is using reloc trees to relocate tree blocks in reference
- * counted roots. There is one reloc tree for each subvol, all reloc
- * trees share same key objectid. Reloc trees are snapshots of the
- * latest committed roots (subvol root->commit_root). To relocate a tree
- * block referenced by a subvol, the code COW the block through the reloc
- * tree, then update pointer in the subvol to point to the new block.
- * Since all reloc trees share same key objectid, we can easily do special
- * handing to share tree blocks between reloc trees. Once a tree block has
- * been COWed in one reloc tree, we can use the result when the same block
- * is COWed again through other reloc trees.
+ * counted roots. There is one reloc tree for each subvol, and all
+ * reloc trees share same root key objectid. Reloc trees are snapshots
+ * of the latest committed roots of subvols (root->commit_root).
+ *
+ * To relocate a tree block referenced by a subvol, there are two steps.
+ * COW the block through subvol's reloc tree, then update block pointer
+ * in the subvol to point to the new block. Since all reloc trees share
+ * same root key objectid, doing special handling for tree blocks owned
+ * by them is easy. Once a tree block has been COWed in one reloc tree,
+ * we can use the resulting new block directly when the same block is
+ * required to COW again through other reloc trees. By this way, relocated
+ * tree blocks are shared between reloc trees, so they are also shared
+ * between subvols.
  */
 static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
@@ -4405,15 +4526,14 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 	struct btrfs_key *keys;
 	u64 *nodes;
 	int level;
-	int lowest_merge;
+	int shared_level;
 	int lowest_level = 0;
-	int update_refs;
 	int ret;
 
 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		lowest_level = ref_path->owner_objectid;
 
-	if (is_cowonly_root(ref_path->root_objectid)) {
+	if (!root->ref_cows) {
 		path->lowest_level = lowest_level;
 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
 		BUG_ON(ret < 0);
@@ -4422,91 +4542,49 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
-	BUG_ON(!keys);
-	nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
-	BUG_ON(!nodes);
-
 	mutex_lock(&root->fs_info->tree_reloc_mutex);
 	ret = init_reloc_tree(trans, root);
 	BUG_ON(ret);
 	reloc_root = root->reloc_root;
 
-	path->lowest_level = lowest_level;
-	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0);
-	BUG_ON(ret);
-	/*
-	 * get relocation mapping for tree blocks in the path
-	 */
-	lowest_merge = BTRFS_MAX_LEVEL;
-	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
-		u64 new_bytenr;
-		eb = path->nodes[level];
-		if (!eb || eb == reloc_root->node)
-			continue;
-		ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
-					      &new_bytenr);
-		if (ret)
-			continue;
-		if (level == 0)
-			btrfs_item_key_to_cpu(eb, &keys[level], 0);
-		else
-			btrfs_node_key_to_cpu(eb, &keys[level], 0);
-		nodes[level] = new_bytenr;
-		lowest_merge = level;
-	}
+	shared_level = ref_path->shared_level;
+	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
 
-	update_refs = 0;
-	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-		eb = path->nodes[0];
-		if (btrfs_header_generation(eb) < trans->transid)
-			update_refs = 1;
-	}
+	keys = ref_path->node_keys;
+	nodes = ref_path->new_nodes;
+	memset(&keys[shared_level + 1], 0,
+	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
+	memset(&nodes[shared_level + 1], 0,
+	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
 
-	btrfs_release_path(reloc_root, path);
-	/*
-	 * merge tree blocks that already relocated in other reloc trees
-	 */
-	if (lowest_merge != BTRFS_MAX_LEVEL) {
+	if (nodes[lowest_level] == 0) {
+		path->lowest_level = lowest_level;
+		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+					0, 1);
+		BUG_ON(ret);
+		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
+			eb = path->nodes[level];
+			if (!eb || eb == reloc_root->node)
+				break;
+			nodes[level] = eb->start;
+			if (level == 0)
+				btrfs_item_key_to_cpu(eb, &keys[level], 0);
+			else
+				btrfs_node_key_to_cpu(eb, &keys[level], 0);
+		}
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			eb = path->nodes[0];
+			ret = replace_extents_in_leaf(trans, reloc_root, eb,
+						      group, reloc_inode);
+			BUG_ON(ret);
+		}
+		btrfs_release_path(reloc_root, path);
+	} else {
 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
-				       lowest_merge);
-		BUG_ON(ret < 0);
-	}
-	/*
-	 * cow any tree blocks that still haven't been relocated
-	 */
-	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
-	BUG_ON(ret);
-	/*
-	 * if we are relocating data block group, update extent pointers
-	 * in the newly created tree leaf.
-	 */
-	eb = path->nodes[0];
-	if (update_refs && nodes[0] != eb->start) {
-		ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
-					      reloc_inode);
+				       lowest_level);
 		BUG_ON(ret);
 	}
 
-	memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
-	memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
-	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
-		eb = path->nodes[level];
-		if (!eb || eb == reloc_root->node)
-			continue;
-		BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
-		nodes[level] = eb->start;
-		if (level == 0)
-			btrfs_item_key_to_cpu(eb, &keys[level], 0);
-		else
-			btrfs_node_key_to_cpu(eb, &keys[level], 0);
-	}
-
-	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-		eb = path->nodes[0];
-		extent_buffer_get(eb);
-	}
-	btrfs_release_path(reloc_root, path);
 	/*
 	 * replace tree blocks in the fs tree with tree blocks in
 	 * the reloc tree.
@@ -4515,15 +4593,19 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 	BUG_ON(ret < 0);
 
 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
+					0, 0);
+		BUG_ON(ret);
+		extent_buffer_get(path->nodes[0]);
+		eb = path->nodes[0];
+		btrfs_release_path(reloc_root, path);
 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
 		BUG_ON(ret);
 		free_extent_buffer(eb);
 	}
-	mutex_unlock(&root->fs_info->tree_reloc_mutex);
 
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
 	path->lowest_level = 0;
-	kfree(nodes);
-	kfree(keys);
 	return 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5ecc24d634a..1df67129cc3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -521,7 +521,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			dirty = root->dirty_root;
 
 			btrfs_free_log(trans, root);
-			btrfs_free_reloc_root(root);
+			btrfs_free_reloc_root(trans, root);
 
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
@@ -930,8 +930,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
-	btrfs_free_reloc_mappings(root);
-
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
-- 
cgit v1.2.3


From 80eb234af09dbe6c97b2e3d60a13ec391e98fbba Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: fix enospc when there is plenty of space

So there is an odd case where we can possibly return -ENOSPC when there is in
fact space to be had.  It only happens with Metadata writes, and happens _very_
infrequently.  What has to happen is that we have to have allocated out of
the first logical byte on the disk, which would set last_alloc to
first_logical_byte(root, 0), so search_start == orig_search_start.  We then
need to allocate for normal metadata, so BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DUP.  We will do a block lookup for the given search_start,
block_group_bits() won't match and we'll go to choose another block group.
However because search_start matches orig_search_start we go to see if we can
allocate a chunk.

If we are in the situation that we cannot allocate a chunk, we fail with ENOSPC.
This is kind of a big flaw of the way find_free_extent works, as it along with
find_free_space loop through _all_ of the block groups, not just the ones that
we want to allocate out of.  This patch completely kills find_free_space and
rolls it into find_free_extent.  I've introduced a sort of state machine into
this, which will make it easier to get cache miss information out of the
allocator, and will work well with my locking changes.

The basic flow is this:  We have the variable loop which is 0, meaning we are
in the hint phase.  We lookup the block group for the hint, and lookup the
space_info for what we want to allocate out of.  If the block group we were
pointed at by the hint either isn't of the correct type, or just doesn't have
the space we need, we set head to space_info->block_groups, so we start at the
beginning of the block groups for this particular space info, and loop through.

This is also where we add the empty_cluster to total_needed.  At this point
loop is set to 1 and we just loop through all of the block groups for this
particular space_info looking for the space we need, just as find_free_space
would have done, except we only hit the block groups we want and not _all_ of
the block groups.  If we come full circle we see if we can allocate a chunk.
If we cannot, of course, we exit with -ENOSPC and we are good.  Otherwise (the
chunk allocation succeeded) we start over at space_info->block_groups and loop
through again, with loop == 2.  If we come full circle and still haven't found
what we need then we exit with -ENOSPC.
I've been running this for a couple of days now and it seems stable, and I
haven't yet hit a -ENOSPC when there was plenty of space left.

Also I've made a groups_sem to handle the group list for the space_info.  This
is part of my locking changes, but is relatively safe and seems better than
holding the space_info spinlock over that entire search time.  Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/extent-tree.c | 322 ++++++++++++++++++++-----------------------------
 2 files changed, 132 insertions(+), 191 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 117090995e7..caa860a1c3e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -544,6 +544,7 @@ struct btrfs_space_info {
 	/* for block groups in our same type */
 	struct list_head block_groups;
 	spinlock_t lock;
+	struct rw_semaphore groups_sem;
 };
 
 struct btrfs_free_space {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56e41369d71..e3b3e13a481 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -317,59 +317,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	return cache;
 }
 
-static int noinline find_free_space(struct btrfs_root *root,
-				    struct btrfs_block_group_cache **cache_ret,
-				    u64 *start_ret, u64 num, int data)
-{
-	int ret;
-	struct btrfs_block_group_cache *cache = *cache_ret;
-	struct btrfs_free_space *info = NULL;
-	u64 last;
-	u64 search_start = *start_ret;
-
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
-	if (!cache)
-		goto out;
-
-	last = max(search_start, cache->key.objectid);
-
-again:
-	ret = cache_block_group(root, cache);
-	if (ret)
-		goto out;
-
-	if (cache->ro || !block_group_bits(cache, data))
-		goto new_group;
-
-	info = btrfs_find_free_space(cache, last, num);
-	if (info) {
-		*start_ret = info->offset;
-		return 0;
-	}
-
-new_group:
-	last = cache->key.objectid + cache->key.offset;
-
-	cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	if (!cache)
-		goto out;
-
-	*cache_ret = cache;
-	goto again;
-
-out:
-	return -ENOSPC;
-}
-
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
@@ -384,6 +331,15 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	return NULL;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
 static struct btrfs_block_group_cache *
 __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
@@ -1446,6 +1402,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	list_add(&found->list, &info->space_info);
 	INIT_LIST_HEAD(&found->block_groups);
+	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
@@ -2208,19 +2165,22 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     u64 exclude_start, u64 exclude_nr,
 				     int data)
 {
-	int ret;
-	u64 orig_search_start;
+	int ret = 0;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
-	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
-	struct btrfs_block_group_cache *block_group;
+	struct btrfs_block_group_cache *block_group = NULL;
 	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
+	struct list_head *head = NULL, *cur = NULL;
+	int loop = 0;
+	struct btrfs_space_info *space_info;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+	ins->objectid = 0;
+	ins->offset = 0;
 
 	if (orig_root->ref_cows || empty_size)
 		allowed_chunk_alloc = 1;
@@ -2239,152 +2199,132 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		else
 			empty_size += empty_cluster;
 	}
-
 	search_start = max(search_start, first_logical_byte(root, 0));
-	orig_search_start = search_start;
-
 	search_start = max(search_start, hint_byte);
 	total_needed += empty_size;
 
-new_group:
-	block_group = btrfs_lookup_block_group(info, search_start);
-	if (!block_group)
-		block_group = btrfs_lookup_first_block_group(info,
-							     search_start);
+	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+	space_info = __find_space_info(root->fs_info, data);
 
-	/*
-	 * Ok this looks a little tricky, buts its really simple.  First if we
-	 * didn't find a block group obviously we want to start over.
-	 * Secondly, if the block group we found does not match the type we
-	 * need, and we have a last_ptr and its not 0, chances are the last
-	 * allocation we made was at the end of the block group, so lets go
-	 * ahead and skip the looking through the rest of the block groups and
-	 * start at the beginning.  This helps with metadata allocations,
-	 * since you are likely to have a bunch of data block groups to search
-	 * through first before you realize that you need to start over, so go
-	 * ahead and start over and save the time.
-	 */
-	if (!block_group || (!block_group_bits(block_group, data) &&
-			     last_ptr && *last_ptr)) {
-		if (search_start != orig_search_start) {
-			if (last_ptr && *last_ptr) {
-				total_needed += empty_cluster;
-				*last_ptr = 0;
-			}
-			search_start = orig_search_start;
-			goto new_group;
-		} else if (!chunk_alloc_done && allowed_chunk_alloc) {
-			ret = do_chunk_alloc(trans, root,
-					     num_bytes + 2 * 1024 * 1024,
-					     data, 1);
-			if (ret < 0)
-				goto error;
-			BUG_ON(ret);
-			chunk_alloc_done = 1;
-			search_start = orig_search_start;
-			goto new_group;
-		} else {
-			ret = -ENOSPC;
-			goto error;
-		}
-	}
-
-	/*
-	 * this is going to seach through all of the existing block groups it
-	 * can find, so if we don't find something we need to see if we can
-	 * allocate what we need.
-	 */
-	ret = find_free_space(root, &block_group, &search_start,
-			      total_needed, data);
-	if (ret == -ENOSPC) {
+	down_read(&space_info->groups_sem);
+	while (1) {
+		struct btrfs_free_space *free_space;
 		/*
-		 * instead of allocating, start at the original search start
-		 * and see if there is something to be found, if not then we
-		 * allocate
+		 * the only way this happens if our hint points to a block
+		 * group thats not of the proper type, while looping this
+		 * should never happen
 		 */
-		if (search_start != orig_search_start) {
-			if (last_ptr && *last_ptr) {
-				*last_ptr = 0;
-				total_needed += empty_cluster;
-			}
-			search_start = orig_search_start;
+		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
-		}
 
-		/*
-		 * we've already allocated, we're pretty screwed
-		 */
-		if (chunk_alloc_done) {
-			goto error;
-		} else if (!allowed_chunk_alloc && block_group &&
-			   block_group_bits(block_group, data)) {
-			block_group->space_info->force_alloc = 1;
-			goto error;
-		} else if (!allowed_chunk_alloc) {
-			goto error;
-		}
+		ret = cache_block_group(root, block_group);
+		if (ret)
+			break;
 
-		ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
-				     data, 1);
-		if (ret < 0)
-			goto error;
+		if (block_group->ro)
+			goto new_group;
 
-		BUG_ON(ret);
-		chunk_alloc_done = 1;
-		if (block_group)
-			search_start = block_group->key.objectid +
+		free_space = btrfs_find_free_space(block_group, search_start,
+						   total_needed);
+		if (free_space) {
+			u64 start = block_group->key.objectid;
+			u64 end = block_group->key.objectid +
 				block_group->key.offset;
-		else
-			search_start = orig_search_start;
-		goto new_group;
-	}
 
-	if (ret)
-		goto error;
+			search_start = stripe_align(root, free_space->offset);
 
-	search_start = stripe_align(root, search_start);
-	ins->objectid = search_start;
-	ins->offset = num_bytes;
+			/* move on to the next group */
+			if (search_start + num_bytes >= search_end)
+				goto new_group;
 
-	if (ins->objectid + num_bytes >= search_end) {
-		search_start = orig_search_start;
-		if (chunk_alloc_done) {
-			ret = -ENOSPC;
-			goto error;
+			/* move on to the next group */
+			if (search_start + num_bytes > end)
+				goto new_group;
+
+			if (exclude_nr > 0 &&
+			    (search_start + num_bytes > exclude_start &&
+			     search_start < exclude_start + exclude_nr)) {
+				search_start = exclude_start + exclude_nr;
+				/*
+				 * if search_start is still in this block group
+				 * then we just re-search this block group
+				 */
+				if (search_start >= start &&
+				    search_start < end)
+					continue;
+
+				/* else we go to the next block group */
+				goto new_group;
+			}
+
+			ins->objectid = search_start;
+			ins->offset = num_bytes;
+			/* we are all good, lets return */
+			break;
 		}
-		goto new_group;
-	}
+new_group:
+		/*
+		 * Here's how this works.
+		 * loop == 0: we were searching a block group via a hint
+		 *		and didn't find anything, so we start at
+		 *		the head of the block groups and keep searching
+		 * loop == 1: we're searching through all of the block groups
+		 *		if we hit the head again we have searched
+		 *		all of the block groups for this space and we
+		 *		need to try and allocate, if we cant error out.
+		 * loop == 2: we allocated more space and are looping through
+		 *		all of the block groups again.
+		 */
+		if (loop == 0) {
+			head = &space_info->block_groups;
+			cur = head->next;
 
-	if (ins->objectid + num_bytes >
-	    block_group->key.objectid + block_group->key.offset) {
-		if (search_start == orig_search_start && chunk_alloc_done) {
-			ret = -ENOSPC;
-			goto error;
+			if (last_ptr && *last_ptr) {
+				total_needed += empty_cluster;
+				*last_ptr = 0;
+			}
+			loop++;
+		} else if (loop == 1 && cur == head) {
+			if (allowed_chunk_alloc && !chunk_alloc_done) {
+				up_read(&space_info->groups_sem);
+				ret = do_chunk_alloc(trans, root, num_bytes +
+						     2 * 1024 * 1024, data, 1);
+				if (ret < 0)
+					break;
+				down_read(&space_info->groups_sem);
+				loop++;
+				head = &space_info->block_groups;
+				cur = head->next;
+				chunk_alloc_done = 1;
+			} else if (!allowed_chunk_alloc) {
+				space_info->force_alloc = 1;
+				break;
+			} else {
+				break;
+			}
+		} else if (cur == head) {
+			break;
 		}
-		search_start = block_group->key.objectid +
-			block_group->key.offset;
-		goto new_group;
-	}
 
-	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
-	    ins->objectid < exclude_start + exclude_nr)) {
-		search_start = exclude_start + exclude_nr;
-		goto new_group;
+		block_group = list_entry(cur, struct btrfs_block_group_cache,
+					 list);
+		search_start = block_group->key.objectid;
+		cur = cur->next;
 	}
 
-	if (!(data & BTRFS_BLOCK_GROUP_DATA))
-		trans->block_group = block_group;
+	/* we found what we needed */
+	if (ins->objectid) {
+		if (!(data & BTRFS_BLOCK_GROUP_DATA))
+			trans->block_group = block_group;
 
-	ins->offset = num_bytes;
-	if (last_ptr) {
-		*last_ptr = ins->objectid + ins->offset;
-		if (*last_ptr ==
-		    btrfs_super_total_bytes(&root->fs_info->super_copy))
-			*last_ptr = 0;
+		if (last_ptr)
+			*last_ptr = ins->objectid + ins->offset;
+		ret = 0;
+	} else if (!ret) {
+		ret = -ENOSPC;
 	}
 
-	ret = 0;
-error:
+	up_read(&space_info->groups_sem);
 	return ret;
 }
 
@@ -2397,7 +2337,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	       info->total_bytes - info->bytes_used - info->bytes_pinned -
 	       info->bytes_reserved, (info->full) ? "" : "not ");
 
-	spin_lock(&info->lock);
+	down_read(&info->groups_sem);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
@@ -2409,7 +2349,7 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
-	spin_unlock(&info->lock);
+	up_read(&info->groups_sem);
 }
 
 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
@@ -5186,9 +5126,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 
 		rb_erase(&block_group->cache_node,
 			 &info->block_group_cache_tree);
-		spin_lock(&block_group->space_info->lock);
+		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
-		spin_unlock(&block_group->space_info->lock);
+		up_write(&block_group->space_info->groups_sem);
 		kfree(block_group);
 	}
 	spin_unlock(&info->block_group_cache_lock);
@@ -5249,9 +5189,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 					&space_info);
 		BUG_ON(ret);
 		cache->space_info = space_info;
-		spin_lock(&space_info->lock);
-		list_add(&cache->list, &space_info->block_groups);
-		spin_unlock(&space_info->lock);
+		down_write(&space_info->groups_sem);
+		list_add_tail(&cache->list, &space_info->block_groups);
+		up_write(&space_info->groups_sem);
 
 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
 		BUG_ON(ret);
@@ -5297,9 +5237,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
-	spin_lock(&cache->space_info->lock);
-	list_add(&cache->list, &cache->space_info->block_groups);
-	spin_unlock(&cache->space_info->lock);
+	down_write(&cache->space_info->groups_sem);
+	list_add_tail(&cache->list, &cache->space_info->block_groups);
+	up_write(&cache->space_info->groups_sem);
 
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	BUG_ON(ret);
@@ -5338,9 +5278,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	btrfs_remove_free_space_cache(block_group);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
-	spin_lock(&block_group->space_info->lock);
+	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
-	spin_unlock(&block_group->space_info->lock);
+	up_write(&block_group->space_info->groups_sem);
 
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
-- 
cgit v1.2.3


From 2517920135b0d29e70453e5b03d70d7b94207df3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: nuke fs wide allocation mutex V2

This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.

There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.

The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same.  Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.

Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so that if we cannot lock the extent, we move on to the next
one in the tree and come back to that one later.  I have tested this heavily
and it does not appear to break anything.  This has to be applied on top of my
find_free_extent redo patch.

I tested this patch on top of Yan's space rebalancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.c            |   3 +-
 fs/btrfs/ctree.h            |   8 +-
 fs/btrfs/disk-io.c          |   3 +-
 fs/btrfs/extent-tree.c      | 333 +++++++++++++++++++++-----------------------
 fs/btrfs/extent_io.c        |  14 ++
 fs/btrfs/extent_io.h        |   2 +
 fs/btrfs/free-space-cache.c |  92 ++++++++----
 fs/btrfs/transaction.c      |   2 -
 fs/btrfs/tree-defrag.c      |   3 -
 fs/btrfs/tree-log.c         |   4 +-
 fs/btrfs/volumes.c          |   2 -
 11 files changed, 250 insertions(+), 216 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 73899d0f9d8..f82f8db0227 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1387,8 +1387,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(cow && root == root->fs_info->extent_root &&
-		!mutex_is_locked(&root->fs_info->alloc_mutex));
+
 	if (ins_len < 0)
 		lowest_unlock = 2;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index caa860a1c3e..fdba4f1b634 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -558,6 +558,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
+	struct mutex alloc_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
@@ -635,7 +636,8 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex alloc_mutex;
+	struct mutex extent_ins_mutex;
+	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -1941,8 +1943,12 @@ int btrfs_acl_chmod(struct inode *inode);
 /* free-space-cache.c */
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 bytenr, u64 size);
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				   *block_group);
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 796256440df..d1137d7ea8d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1460,7 +1460,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->alloc_mutex);
+	mutex_init(&fs_info->extent_ins_mutex);
+	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3b3e13a481..564260872c7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -53,24 +53,6 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
 			 u64 search_start, int data, int owner);
 
-void maybe_lock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
-}
-
-void maybe_unlock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_unlock(&root->fs_info->alloc_mutex);
-	}
-}
-
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -164,6 +146,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
+	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -175,7 +158,8 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
-			ret = btrfs_add_free_space(block_group, start, size);
+			ret = btrfs_add_free_space_lock(block_group, start,
+							size);
 			BUG_ON(ret);
 			start = extent_end + 1;
 		} else {
@@ -185,9 +169,10 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
-		ret = btrfs_add_free_space(block_group, start, size);
+		ret = btrfs_add_free_space_lock(block_group, start, size);
 		BUG_ON(ret);
 	}
+	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -445,13 +430,11 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	maybe_lock_mutex(root);
 	key.objectid = start;
 	key.offset = len;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	maybe_unlock_mutex(root);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -676,8 +659,9 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
+				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
 			u64 priv;
 			ret = get_state_private(&root->fs_info->extent_ins,
 						bytenr, &priv);
@@ -686,6 +670,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 							(unsigned long)priv;
 			BUG_ON(extent_op->parent != orig_parent);
 			BUG_ON(extent_op->generation != orig_generation);
+
 			extent_op->parent = parent;
 			extent_op->generation = ref_generation;
 		} else {
@@ -703,10 +688,11 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 			set_extent_bits(&root->fs_info->extent_ins,
 					bytenr, bytenr + num_bytes - 1,
-					EXTENT_LOCKED, GFP_NOFS);
+					EXTENT_WRITEBACK, GFP_NOFS);
 			set_state_private(&root->fs_info->extent_ins,
 					  bytenr, (unsigned long)extent_op);
 		}
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 
@@ -742,12 +728,10 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
 					parent, ref_root, ref_root,
 					ref_generation, ref_generation,
 					owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -817,11 +801,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
 				     0, ref_root, 0, ref_generation,
 				     owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -886,7 +868,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -953,7 +934,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	}
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1179,13 +1159,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			nr_file_extents++;
 
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   key.objectid);
-			maybe_unlock_mutex(root);
 
 			if (ret) {
 				faili = i;
@@ -1194,13 +1172,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
-			maybe_unlock_mutex(root);
 			if (ret) {
 				faili = i;
 				WARN_ON(1);
@@ -1270,24 +1246,20 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (bytenr == 0)
 				continue;
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    key.objectid);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		}
@@ -1344,7 +1316,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
@@ -1378,7 +1349,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
@@ -1390,9 +1360,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	found = __find_space_info(info, flags);
 	if (found) {
+		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
 		found->full = 0;
+		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
 	}
@@ -1479,43 +1451,53 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+	spin_lock(&space_info->lock);
 	if (space_info->force_alloc) {
 		force = 1;
 		space_info->force_alloc = 0;
 	}
-	if (space_info->full)
+	if (space_info->full) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned +
-	    space_info->bytes_reserved + alloc_bytes) < thresh)
+	    space_info->bytes_reserved + alloc_bytes) < thresh) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
-	while (!mutex_trylock(&extent_root->fs_info->chunk_mutex)) {
-		if (!force)
-			goto out;
-		mutex_unlock(&extent_root->fs_info->alloc_mutex);
-		cond_resched();
-		mutex_lock(&extent_root->fs_info->alloc_mutex);
+	spin_unlock(&space_info->lock);
+
+	ret = mutex_trylock(&extent_root->fs_info->chunk_mutex);
+	if (!ret && !force) {
+		goto out;
+	} else if (!ret) {
+		mutex_lock(&extent_root->fs_info->chunk_mutex);
 		waited = 1;
 	}
 
-	if (waited && space_info->full)
-		goto out_unlock;
+	if (waited) {
+		spin_lock(&space_info->lock);
+		if (space_info->full) {
+			spin_unlock(&space_info->lock);
+			goto out_unlock;
+		}
+		spin_unlock(&space_info->lock);
+	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
-	if (ret == -ENOSPC) {
+	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
 		goto out_unlock;
 	}
-	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-
 out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
@@ -1533,7 +1515,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
@@ -1542,6 +1523,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
+		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
@@ -1551,11 +1533,13 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			cache->space_info->bytes_used += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->space_info->bytes_used -= num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			if (mark_free) {
 				int ret;
 				ret = btrfs_add_free_space(cache, bytenr,
@@ -1588,7 +1572,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1602,16 +1586,20 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
 		if (pin) {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned += len;
 			cache->space_info->bytes_pinned += len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned += len;
 		} else {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned -= len;
 			cache->space_info->bytes_pinned -= len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1627,23 +1615,23 @@ static int update_reserved_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
 		BUG_ON(!cache);
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
 		if (reserve) {
-			spin_lock(&cache->lock);
 			cache->reserved += len;
 			cache->space_info->bytes_reserved += len;
-			spin_unlock(&cache->lock);
 		} else {
-			spin_lock(&cache->lock);
 			cache->reserved -= len;
 			cache->space_info->bytes_reserved -= len;
-			spin_unlock(&cache->lock);
 		}
+		spin_unlock(&cache->lock);
+		spin_unlock(&cache->space_info->lock);
 		bytenr += len;
 		num -= len;
 	}
@@ -1658,6 +1646,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -1666,6 +1655,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1678,7 +1668,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *cache;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -1690,12 +1680,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		if (cache->cached)
 			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
-			mutex_unlock(&root->fs_info->alloc_mutex);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
+			mutex_lock(&root->fs_info->pinned_mutex);
 		}
 	}
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1705,6 +1695,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct btrfs_extent_ref *ref;
@@ -1714,20 +1705,37 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	int ret;
 	int err = 0;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	path = btrfs_alloc_path();
 
 	while(1) {
-		ret = find_first_extent_bit(&info->extent_ins, 0, &start,
-					    &end, EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(&info->extent_ins, search, &start,
+					    &end, EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(&info->extent_ins, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
+		mutex_unlock(&info->extent_ins_mutex);
+
 		if (extent_op->type == PENDING_EXTENT_INSERT) {
 			key.objectid = start;
 			key.offset = end + 1 - start;
@@ -1736,8 +1744,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					&extent_item, sizeof(extent_item));
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			err = insert_extent_backref(trans, extent_root, path,
 						start, extent_op->parent,
@@ -1753,8 +1763,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 						extent_op->level, 0);
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			key.objectid = start;
 			key.offset = extent_op->parent;
@@ -1772,12 +1784,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 			BUG_ON(1);
 		}
 		kfree(extent_op);
+		unlock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		search = 0;
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		cond_resched();
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1790,7 +1800,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	int err = 0;
 	struct extent_buffer *buf;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (is_data)
 		goto pinit;
 
@@ -1847,7 +1856,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -1935,8 +1943,10 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 #endif
 
 		if (pin) {
+			mutex_lock(&root->fs_info->pinned_mutex);
 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1956,6 +1966,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
+		btrfs_release_path(extent_root, path);
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
@@ -1994,70 +2005,91 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 {
 	int ret;
 	int err = 0;
-	int mark_free = 0;
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *extent_ins;
 	struct pending_extent_op *extent_op;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
 
 	while(1) {
-		ret = find_first_extent_bit(pending_del, 0, &start, &end,
-					    EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(pending_del, search, &start, &end,
+					    EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(pending_del, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
-		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
+		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
 				  GFP_NOFS);
-
-		ret = pin_down_bytes(trans, extent_root, start,
-				     end + 1 - start, 0);
-		mark_free = ret > 0;
 		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_LOCKED, 0)) {
+				    EXTENT_WRITEBACK, 0)) {
+			mutex_unlock(&info->extent_ins_mutex);
 free_extent:
 			ret = __free_extent(trans, extent_root,
 					    start, end + 1 - start,
 					    extent_op->orig_parent,
 					    extent_root->root_key.objectid,
 					    extent_op->orig_generation,
-					    extent_op->level, 0, mark_free);
+					    extent_op->level, 1, 0);
 			kfree(extent_op);
 		} else {
 			kfree(extent_op);
-			ret = get_state_private(extent_ins, start, &priv);
+
+			ret = get_state_private(&info->extent_ins, start,
+						&priv);
 			BUG_ON(ret);
 			extent_op = (struct pending_extent_op *)
-							(unsigned long)priv;
+						(unsigned long)priv;
+
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_WRITEBACK, GFP_NOFS);
 
-			clear_extent_bits(extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			if (extent_op->type == PENDING_BACKREF_UPDATE)
 				goto free_extent;
 
+			mutex_lock(&extent_root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, extent_root, start,
+					     end + 1 - start, 0);
+			mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
 			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, mark_free);
+						end + 1 - start, 0, ret > 0);
+
 			BUG_ON(ret);
 			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
+		unlock_extent(extent_ins, start, end, GFP_NOFS);
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		search = 0;
+		cond_resched();
 	}
 	return err;
 }
@@ -2091,11 +2123,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = ref_generation;
 		extent_op->level = (int)owner_objectid;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->pending_del,
 				  bytenr, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 	/* if metadata always pin */
@@ -2134,11 +2168,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
 				  owner_objectid, pin);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2214,12 +2246,16 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		 * group thats not of the proper type, while looping this
 		 * should never happen
 		 */
+		WARN_ON(!block_group);
+		mutex_lock(&block_group->alloc_mutex);
 		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
 
 		ret = cache_block_group(root, block_group);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
+		}
 
 		if (block_group->ro)
 			goto new_group;
@@ -2250,8 +2286,10 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				 * then we just re-search this block group
 				 */
 				if (search_start >= start &&
-				    search_start < end)
+				    search_start < end) {
+					mutex_unlock(&block_group->alloc_mutex);
 					continue;
+				}
 
 				/* else we go to the next block group */
 				goto new_group;
@@ -2259,10 +2297,15 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 			ins->objectid = search_start;
 			ins->offset = num_bytes;
+
+			btrfs_remove_free_space_lock(block_group, search_start,
+						     num_bytes);
 			/* we are all good, lets return */
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
 		}
 new_group:
+		mutex_unlock(&block_group->alloc_mutex);
 		/*
 		 * Here's how this works.
 		 * loop == 0: we were searching a block group via a hint
@@ -2363,7 +2406,6 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	u64 search_start = 0;
 	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_block_group_cache *cache;
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -2419,13 +2461,6 @@ again:
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
-	cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
-		return -ENOSPC;
-	}
-
-	ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
 
 	return ret;
 }
@@ -2434,16 +2469,13 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	struct btrfs_block_group_cache *cache;
 
-	maybe_lock_mutex(root);
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
 		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
-		maybe_unlock_mutex(root);
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
 	update_reserved_extents(root, start, len, 0);
-	maybe_unlock_mutex(root);
 	return 0;
 }
 
@@ -2455,12 +2487,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 data)
 {
 	int ret;
-	maybe_lock_mutex(root);
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2510,11 +2540,13 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = 0;
 		extent_op->level = (int)owner;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->extent_ins,
 				  ins->objectid, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		goto update_block;
 	}
 
@@ -2578,11 +2610,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2599,15 +2629,16 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *block_group;
 
-	maybe_lock_mutex(root);
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	mutex_lock(&block_group->alloc_mutex);
 	cache_block_group(root, block_group);
 
-	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
+	ret = btrfs_remove_free_space_lock(block_group, ins->objectid,
+					   ins->offset);
+	mutex_unlock(&block_group->alloc_mutex);
 	BUG_ON(ret);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2627,8 +2658,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
@@ -2642,7 +2671,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	} else {
 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2734,12 +2762,10 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2758,12 +2784,10 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_info *info = ref->extents;
 
 	for (i = 0; i < ref->nritems; i++) {
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
@@ -2875,13 +2899,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
 			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 
 			atomic_inc(&root->fs_info->throttle_gen);
 			wake_up(&root->fs_info->transaction_throttle);
@@ -2957,11 +2979,9 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -3440,8 +3460,6 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
-
 	if (first_time) {
 		ref_path->lowest_level = -1;
 		ref_path->current_level = -1;
@@ -3498,9 +3516,7 @@ next:
 		level--;
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached lowest level */
@@ -3613,15 +3629,12 @@ found:
 
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached max tree level, but no tree root found. */
 	BUG();
 out:
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -4556,14 +4569,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_ref_path *ref_path)
 {
 	int ret;
-	int needs_lock = 0;
-
-	if (root == root->fs_info->extent_root ||
-	    root == root->fs_info->chunk_root ||
-	    root == root->fs_info->dev_root) {
-		needs_lock = 1;
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
 
 	ret = relocate_one_path(trans, root, path, first_key,
 				ref_path, NULL, NULL);
@@ -4571,8 +4576,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 
 	if (root == root->fs_info->extent_root)
 		btrfs_extent_post_op(trans, root);
-	if (needs_lock)
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 	return 0;
 }
@@ -4584,14 +4587,12 @@ static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
 	if (ret)
 		goto out;
 	ret = btrfs_del_item(trans, extent_root, path);
 out:
 	btrfs_release_path(extent_root, path);
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4627,7 +4628,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	struct btrfs_key first_key;
 	u64 prev_block = 0;
 
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
 	trans = btrfs_start_transaction(extent_root, 1);
 	BUG_ON(!trans);
@@ -4754,7 +4754,6 @@ out:
 	btrfs_end_transaction(trans, extent_root);
 	kfree(new_extents);
 	kfree(ref_path);
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4807,10 +4806,8 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	spin_lock(&shrink_block_group->lock);
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
 		spin_unlock(&shrink_block_group->lock);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		trans = btrfs_start_transaction(root, 1);
-		mutex_lock(&root->fs_info->alloc_mutex);
 		spin_lock(&shrink_block_group->lock);
 
 		new_alloc_flags = update_block_group_flags(root,
@@ -4826,9 +4823,7 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
 
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
-		mutex_lock(&root->fs_info->alloc_mutex);
 	} else
 		spin_unlock(&shrink_block_group->lock);
 	return 0;
@@ -4952,14 +4947,10 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	reloc_inode = create_reloc_inode(info, block_group);
 	BUG_ON(IS_ERR(reloc_inode));
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	__alloc_chunk_for_shrink(root, block_group, 1);
 	block_group->ro = 1;
 	block_group->space_info->total_bytes -= block_group->key.offset;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
-
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
 again:
@@ -4978,8 +4969,6 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -5007,9 +4996,7 @@ next:
 
 		if (progress && need_resched()) {
 			btrfs_release_path(root, path);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
 			progress = 0;
 			continue;
 		}
@@ -5036,7 +5023,6 @@ next:
 	}
 
 	btrfs_release_path(root, path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	if (pass == 0) {
 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
@@ -5058,8 +5044,6 @@ next:
 	trans = btrfs_start_transaction(info->tree_root, 1);
 	btrfs_commit_transaction(trans, info->tree_root);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	spin_lock(&block_group->lock);
 	WARN_ON(block_group->pinned > 0);
 	WARN_ON(block_group->reserved > 0);
@@ -5067,7 +5051,6 @@ next:
 	spin_unlock(&block_group->lock);
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -5114,7 +5097,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_block_group_cache *block_group;
 	struct rb_node *n;
 
-	mutex_lock(&info->alloc_mutex);
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -5132,7 +5114,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		kfree(block_group);
 	}
 	spin_unlock(&info->block_group_cache_lock);
-	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
 
@@ -5155,7 +5136,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
@@ -5174,6 +5154,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		}
 
 		spin_lock_init(&cache->lock);
+		mutex_init(&cache->alloc_mutex);
 		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -5201,7 +5182,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -5214,7 +5194,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_root *extent_root;
 	struct btrfs_block_group_cache *cache;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
 
 	root->fs_info->last_trans_new_blockgroup = trans->transid;
@@ -5226,6 +5205,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
+	mutex_init(&cache->alloc_mutex);
 	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
@@ -5264,7 +5244,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret;
 
-	BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	root = root->fs_info->extent_root;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 314041fdfa4..7503bd46819 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -938,6 +938,32 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 }
 EXPORT_SYMBOL(lock_extent);
 
+/*
+ * Try to take EXTENT_LOCKED on [start, end] without waiting.
+ *
+ * Returns 1 when the whole range is now locked by the caller and 0 when
+ * set_extent_bit() reported -EEXIST, i.e. some part of the range already
+ * had the bit set.  In the failure case failed_start holds the offset of
+ * the conflict; whatever was locked in front of it (start up to
+ * failed_start - 1) is released again here, because the caller treats a
+ * 0 return as "nothing is locked" and will never unlock that prefix.
+ */
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+
+	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+			     &failed_start, mask);
+	if (err == -EEXIST) {
+		if (failed_start > start)
+			unlock_extent(tree, start, failed_start - 1, mask);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(try_lock_extent);
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 86f859b87a6..283110ec4ee 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,6 +128,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
 int __init extent_io_init(void);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 96241f01fa0..f4926c0f3c8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -184,8 +184,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+				  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *right_info;
 	struct btrfs_free_space *left_info;
@@ -202,8 +202,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	 * are adding, if there is remove that struct and add a new one to
 	 * cover the entire range
 	 */
-	spin_lock(&block_group->lock);
-
 	right_info = tree_search_offset(&block_group->free_space_offset,
 					offset+bytes, 0, 1);
 	left_info = tree_search_offset(&block_group->free_space_offset,
@@ -261,7 +259,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	if (ret)
 		kfree(info);
 out:
-	spin_unlock(&block_group->lock);
 	if (ret) {
 		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
 		if (ret == -EEXIST)
@@ -274,13 +271,13 @@ out:
 	return ret;
 }
 
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 offset, u64 bytes)
+static int
+__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
 	int ret = 0;
 
-	spin_lock(&block_group->lock);
 	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
 				  1);
 
@@ -334,17 +331,63 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 		/* step two, insert a new info struct to cover anything
 		 * before the hole
 		 */
-		spin_unlock(&block_group->lock);
-		ret = btrfs_add_free_space(block_group, old_start,
-					   offset - old_start);
+		ret = __btrfs_add_free_space(block_group, old_start,
+					     offset - old_start);
 		BUG_ON(ret);
-		goto out_nolock;
 	} else {
 		WARN_ON(1);
 	}
 out:
-	spin_unlock(&block_group->lock);
-out_nolock:
+	return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *fsp;
+	int err;
+
+	mutex_lock(&block_group->alloc_mutex);
+	err = __btrfs_add_free_space(block_group, offset, bytes);
+	/* the add must leave an entry that the offset search can find */
+	fsp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!fsp);
+	mutex_unlock(&block_group->alloc_mutex);
+	return err;
+}
+
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *fsp;
+	int err;
+
+	/* no locking here; NOTE(review): caller presumably holds alloc_mutex */
+	err = __btrfs_add_free_space(block_group, offset, bytes);
+	fsp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!fsp);
+	return err;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	int err;
+
+	/* locked variant: wraps the removal in block_group->alloc_mutex */
+	mutex_lock(&block_group->alloc_mutex);
+	err = __btrfs_remove_free_space(block_group, offset, bytes);
+	mutex_unlock(&block_group->alloc_mutex);
+	return err;
+}
+
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes)
+{
+	/*
+	 * Takes no lock itself; find_free_extent() and
+	 * btrfs_alloc_logged_extent() call it with alloc_mutex held.
+	 */
+	return __btrfs_remove_free_space(block_group, offset, bytes);
+}
 
@@ -386,18 +429,18 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	struct btrfs_free_space *info;
 	struct rb_node *node;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
 		kfree(info);
 		if (need_resched()) {
-			spin_unlock(&block_group->lock);
+			mutex_unlock(&block_group->alloc_mutex);
 			cond_resched();
-			spin_lock(&block_group->lock);
+			mutex_lock(&block_group->alloc_mutex);
 		}
 	}
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 }
 
 struct btrfs_free_space *btrfs_find_free_space_offset(struct
@@ -407,10 +450,10 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -422,10 +465,10 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 
 	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -434,16 +477,13 @@ struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
 					       u64 bytes)
 {
-	struct btrfs_free_space *ret;
+	struct btrfs_free_space *ret = NULL;
 
-	spin_lock(&block_group->lock);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
 	if (!ret)
 		ret = tree_search_bytes(&block_group->free_space_bytes,
 					offset, bytes);
 
-	spin_unlock(&block_group->lock);
-
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1df67129cc3..48b455fdaac 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -670,7 +670,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		atomic_dec(&root->fs_info->throttles);
 		wake_up(&root->fs_info->transaction_throttle);
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
@@ -678,7 +677,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 6f57d0889b1..a6a3956cedf 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -125,9 +125,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (is_extent)
 		btrfs_extent_post_op(trans, root);
 out:
-	if (is_extent)
-		mutex_unlock(&root->fs_info->alloc_mutex);
-
 	if (path)
 		btrfs_free_path(path);
 	if (ret == -EAGAIN) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e6d579053a4..835daed5561 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -271,10 +271,10 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct walk_control *wc, u64 gen)
 {
 	if (wc->pin) {
-		mutex_lock(&log->fs_info->alloc_mutex);
+		mutex_lock(&log->fs_info->pinned_mutex);
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
 					    eb->start, eb->len, 1);
-		mutex_unlock(&log->fs_info->alloc_mutex);
+		mutex_unlock(&log->fs_info->pinned_mutex);
 	}
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7db4cfd03a9..cbb9bb31431 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -58,14 +58,12 @@ void btrfs_unlock_volumes(void)
 
 static void lock_chunks(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->alloc_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
 }
 
 static void unlock_chunks(struct btrfs_root *root)
 {
 	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
-- 
cgit v1.2.3


From 84234f3a1f7c532e4afeba03cc8e7e4a8a5277ea Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: Add root tree pointer transaction ids

This patch adds transaction IDs to root tree pointers.
Transaction IDs in tree pointers are compared with the
generation numbers in block headers when reading root
blocks of trees. This can detect some types of IO errors.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       |  8 ++++++++
 fs/btrfs/disk-io.c     | 21 ++++++++++++++-------
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/ioctl.c       |  1 +
 fs/btrfs/transaction.c | 13 +++++++++++++
 fs/btrfs/tree-log.c    |  2 ++
 6 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fdba4f1b634..0621ab90b1a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -297,6 +297,7 @@ struct btrfs_super_block {
 	__le32 leafsize;
 	__le32 stripesize;
 	__le32 sys_chunk_array_size;
+	__le64 chunk_root_generation;
 	u8 root_level;
 	u8 chunk_root_level;
 	u8 log_root_level;
@@ -448,6 +449,7 @@ struct btrfs_dir_item {
 
 struct btrfs_root_item {
 	struct btrfs_inode_item inode;
+	__le64 generation;
 	__le64 root_dirid;
 	__le64 bytenr;
 	__le64 byte_limit;
@@ -1396,10 +1398,14 @@ static inline int btrfs_is_leaf(struct extent_buffer *eb)
 }
 
 /* struct btrfs_root_item */
+BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item,
+		   generation, 64);
 BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32);
 BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64);
 BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8);
 
+BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item,
+			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
@@ -1416,6 +1422,8 @@ BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
 BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
 			 struct btrfs_super_block, sys_chunk_array_size, 32);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation,
+			 struct btrfs_super_block, chunk_root_generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 			 root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d1137d7ea8d..94b4e50f6b2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -832,6 +832,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 {
 	int ret;
 	u32 blocksize;
+	u64 generation;
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
@@ -840,9 +841,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 				   &root->root_item, &root->root_key);
 	BUG_ON(ret);
 
+	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, 0);
+				     blocksize, generation);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -929,6 +931,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	struct btrfs_path *path;
 	struct extent_buffer *l;
 	u64 highest_inode;
+	u64 generation;
 	u32 blocksize;
 	int ret = 0;
 
@@ -970,9 +973,10 @@ out:
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, 0);
+				     blocksize, generation);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1357,6 +1361,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 leafsize;
 	u32 blocksize;
 	u32 stripesize;
+	u64 generation;
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
@@ -1596,13 +1601,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_chunk_root_level(disk_super));
+	generation = btrfs_super_chunk_root_generation(disk_super);
 
 	__setup_root(nodesize, leafsize, sectorsize, stripesize,
 		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize, 0);
+					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
@@ -1618,11 +1624,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
-
+	generation = btrfs_super_generation(disk_super);
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize, 0);
+					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
@@ -1672,15 +1678,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      blocksize, 0);
+						      blocksize,
+						      generation + 1);
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 	}
+	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 
 	ret = btrfs_cleanup_reloc_trees(tree_root);
 	BUG_ON(ret);
 
-	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
 fail_cleaner:
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 564260872c7..155c8dc56a2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4428,6 +4428,7 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_root_refs(root_item, 0);
 	btrfs_set_root_bytenr(root_item, eb->start);
 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
 
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1136ce2febc..fd3c8b5676c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -108,6 +108,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_generation(&root_item, trans->transid);
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 1);
 	btrfs_set_root_used(&root_item, 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 48b455fdaac..924af6f2aea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -439,6 +439,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 				       root->node->start);
 		btrfs_set_root_level(&root->root_item,
 				     btrfs_header_level(root->node));
+		btrfs_set_root_generation(&root->root_item, trans->transid);
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
@@ -456,6 +457,12 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
+	struct extent_buffer *eb;
+
+	eb = btrfs_lock_root_node(fs_info->tree_root);
+	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
 
 	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
@@ -559,6 +566,9 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 					      root->node->start);
 			btrfs_set_root_level(&root->root_item,
 					     btrfs_header_level(root->node));
+			btrfs_set_root_generation(&root->root_item,
+						  root->root_key.offset);
+
 			err = btrfs_insert_root(trans, root->fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
@@ -756,6 +766,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 
 	btrfs_set_root_bytenr(new_root_item, tmp->start);
 	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
+	btrfs_set_root_generation(new_root_item, trans->transid);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
 	btrfs_tree_unlock(tmp);
@@ -946,6 +957,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
+	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
+				btrfs_header_generation(chunk_root->node));
 
 	if (!root->fs_info->log_root_recovering) {
 		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 835daed5561..e0201c3a7dc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -117,6 +117,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
+	btrfs_set_root_generation(&root_item, trans->transid);
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 0);
 	btrfs_set_root_used(&root_item, 0);
@@ -2065,6 +2066,7 @@ static int update_log_root(struct btrfs_trans_handle *trans,
 		return 0;
 
 	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
 	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
 	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
 				&log->root_key, &log->root_item);
-- 
cgit v1.2.3


From 09fde3c9ba360926ce021c184a1ee343f4d8fa19 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:04 -0400
Subject: Btrfs: Rev the disk format for compression and root pointer
 generation fields

---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0621ab90b1a..abb27332c91 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,7 +39,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BBRfS_M"
+#define BTRFS_MAGIC "_BCRfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 87ef2bb46bfc4be0b40799e68115cbe28d80a1bd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Oct 2008 11:23:27 -0400
Subject: Btrfs: prevent looping forever in finish_current_insert and
 del_pending_extents

finish_current_insert and del_pending_extents process extent tree modifications
that build up while we are changing the extent tree.  It is a confusing
bit of code that prevents recursion.

Both functions run through a list of pending operations, and both also
add to that list.  If two processes are in either one of them at the same
time, they can end up looping forever, making more work for each other.

This patch makes them walk forward through the list of pending changes instead
of always trying to process the entire list.  At transaction commit
time, we catch any changes that were left over.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 54 ++++++++++++++++++++++++++++----------------------
 fs/btrfs/transaction.c | 12 +++++++++++
 2 files changed, 42 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 155c8dc56a2..fada9c22a02 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -45,9 +45,9 @@ struct pending_extent_op {
 };
 
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
-				 btrfs_root *extent_root);
+				 btrfs_root *extent_root, int all);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root);
+			       btrfs_root *extent_root, int all);
 static struct btrfs_block_group_cache *
 __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
@@ -711,8 +711,8 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				    parent, ref_root, ref_generation,
 				    owner_objectid);
 	BUG_ON(ret);
-	finish_current_insert(trans, extent_root);
-	del_pending_extents(trans, extent_root);
+	finish_current_insert(trans, extent_root, 0);
+	del_pending_extents(trans, extent_root, 0);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -784,8 +784,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				    ref_root, ref_generation,
 				    owner_objectid);
 	BUG_ON(ret);
-	finish_current_insert(trans, root->fs_info->extent_root);
-	del_pending_extents(trans, root->fs_info->extent_root);
+	finish_current_insert(trans, root->fs_info->extent_root, 0);
+	del_pending_extents(trans, root->fs_info->extent_root, 0);
 
 	btrfs_free_path(path);
 	return 0;
@@ -810,8 +810,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
-	finish_current_insert(trans, root->fs_info->extent_root);
-	del_pending_extents(trans, root->fs_info->extent_root);
+	finish_current_insert(trans, root->fs_info->extent_root, 1);
+	del_pending_extents(trans, root->fs_info->extent_root, 1);
 	return 0;
 }
 
@@ -1292,8 +1292,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(extent_root, path);
 fail:
-	finish_current_insert(trans, extent_root);
-	pending_ret = del_pending_extents(trans, extent_root);
+	finish_current_insert(trans, extent_root, 0);
+	pending_ret = del_pending_extents(trans, extent_root, 0);
 	if (ret)
 		return ret;
 	if (pending_ret)
@@ -1690,7 +1690,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 }
 
 static int finish_current_insert(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *extent_root)
+				 struct btrfs_root *extent_root, int all)
 {
 	u64 start;
 	u64 end;
@@ -1714,7 +1714,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
 			mutex_unlock(&info->extent_ins_mutex);
-			if (search) {
+			if (search && all) {
 				search = 0;
 				continue;
 			}
@@ -1723,7 +1723,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 
 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
-			search = end+1;
+			search = end + 1;
 			mutex_unlock(&info->extent_ins_mutex);
 			cond_resched();
 			continue;
@@ -1785,7 +1785,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 		}
 		kfree(extent_op);
 		unlock_extent(&info->extent_ins, start, end, GFP_NOFS);
-		search = 0;
+		if (all)
+			search = 0;
+		else
+			search = end + 1;
 
 		cond_resched();
 	}
@@ -1992,7 +1995,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 #endif
 	}
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root);
+	finish_current_insert(trans, extent_root, 0);
 	return ret;
 }
 
@@ -2001,7 +2004,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
  * them from the extent map
  */
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root)
+			       btrfs_root *extent_root, int all)
 {
 	int ret;
 	int err = 0;
@@ -2023,7 +2026,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 					    EXTENT_WRITEBACK);
 		if (ret) {
 			mutex_unlock(&info->extent_ins_mutex);
-			if (search) {
+			if (all && search) {
 				search = 0;
 				continue;
 			}
@@ -2088,7 +2091,10 @@ free_extent:
 			err = ret;
 		unlock_extent(extent_ins, start, end, GFP_NOFS);
 
-		search = 0;
+		if (all)
+			search = 0;
+		else
+			search = end + 1;
 		cond_resched();
 	}
 	return err;
@@ -2155,8 +2161,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			    root_objectid, ref_generation,
 			    owner_objectid, pin, pin == 0);
 
-	finish_current_insert(trans, root->fs_info->extent_root);
-	pending_ret = del_pending_extents(trans, root->fs_info->extent_root);
+	finish_current_insert(trans, root->fs_info->extent_root, 0);
+	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
 	return ret ? ret : pending_ret;
 }
 
@@ -2580,8 +2586,8 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	trans->alloc_exclude_start = 0;
 	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
-	finish_current_insert(trans, extent_root);
-	pending_ret = del_pending_extents(trans, extent_root);
+	finish_current_insert(trans, extent_root, 0);
+	pending_ret = del_pending_extents(trans, extent_root, 0);
 
 	if (ret)
 		goto out;
@@ -5229,8 +5235,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				sizeof(cache->item));
 	BUG_ON(ret);
 
-	finish_current_insert(trans, extent_root);
-	ret = del_pending_extents(trans, extent_root);
+	finish_current_insert(trans, extent_root, 0);
+	ret = del_pending_extents(trans, extent_root, 0);
 	BUG_ON(ret);
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 924af6f2aea..968b84f17a1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -430,7 +430,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	u64 old_root_bytenr;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 
+	btrfs_extent_post_op(trans, root);
 	btrfs_write_dirty_block_groups(trans, root);
+	btrfs_extent_post_op(trans, root);
+
 	while(1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
@@ -440,11 +443,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		btrfs_set_root_level(&root->root_item,
 				     btrfs_header_level(root->node));
 		btrfs_set_root_generation(&root->root_item, trans->transid);
+
+		btrfs_extent_post_op(trans, root);
+
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
 		BUG_ON(ret);
 		btrfs_write_dirty_block_groups(trans, root);
+		btrfs_extent_post_op(trans, root);
 	}
 	return 0;
 }
@@ -459,15 +466,20 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 	struct list_head *next;
 	struct extent_buffer *eb;
 
+	btrfs_extent_post_op(trans, fs_info->tree_root);
+
 	eb = btrfs_lock_root_node(fs_info->tree_root);
 	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 
+	btrfs_extent_post_op(trans, fs_info->tree_root);
+
 	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
+
 		update_cowonly_root(trans, root);
 	}
 	return 0;
-- 
cgit v1.2.3


From cfbc246eaae2a1089911016094b74b3055e8a906 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Oct 2008 13:22:14 -0400
Subject: Btrfs: walk compressed pages based on the nr_pages count instead of
 bytes

The byte walk counting was awkward and error prone.  This uses the
number of pages sent by the higher layer to build bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 7 +++++--
 fs/btrfs/inode.c       | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c5470367ca5..9adaa79adad 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -296,7 +296,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	/* create and submit bios for the compressed pages */
 	bytes_left = compressed_len;
-	while(bytes_left > 0) {
+	for (page_index = 0; page_index < cb->nr_pages; page_index++) {
 		page = compressed_pages[page_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
@@ -324,7 +324,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
 		}
-		page_index++;
+		if (bytes_left < PAGE_CACHE_SIZE) {
+			printk("bytes left %lu compress len %lu nr %lu\n",
+			       bytes_left, cb->compressed_len, cb->nr_pages);
+		}
 		bytes_left -= PAGE_CACHE_SIZE;
 		first_byte += PAGE_CACHE_SIZE;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9797592dc86..6739424c0fe 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -338,7 +338,7 @@ again:
 	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
-		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
 		/* we want to make sure the amount of IO required to satisfy
 		 * a random read is reasonably small, so we limit the size
-- 
cgit v1.2.3


From 19b9bdb054895ba07086f0264641c9f80e0eb2c4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 30 Oct 2008 14:23:13 -0400
Subject: Btrfs: Fix logic to avoid reading checksums for -o nodatasum,compress

When compression was on, we were improperly ignoring -o nodatasum.  This
reworks the logic a bit to properly honor all the flags.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6739424c0fe..dd9cd01042b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -833,28 +833,29 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
+	int skip_sum;
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM)) {
-		goto mapit;
-	}
+	skip_sum = btrfs_test_opt(root, NODATASUM) ||
+		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-		btrfs_lookup_bio_sums(root, inode, bio);
+		if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio);
 
-		if (bio_flags & EXTENT_BIO_COMPRESSED) {
+		if (bio_flags & EXTENT_BIO_COMPRESSED)
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
-		}
-
 		goto mapit;
-	}
-	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+	} else if (!skip_sum) {
+		/* we're doing a write, do the async checksumming */
+		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   bio_flags, __btrfs_submit_bio_hook);
+	}
+
 mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
-- 
cgit v1.2.3


From 9036c10208e1fc496cef7692ba66a78699b360dc Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:19:41 -0400
Subject: Btrfs: update hole handling v2

This patch splits the hole insertion code out of btrfs_setattr
into btrfs_cont_expand and updates btrfs_get_extent to properly
handle the case where file extent items are not contiguous.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h      |   1 +
 fs/btrfs/extent_map.h |   1 +
 fs/btrfs/file.c       |  41 ++---------
 fs/btrfs/inode.c      | 189 ++++++++++++++++++++++++++------------------------
 4 files changed, 108 insertions(+), 124 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index abb27332c91..ca5547af609 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1908,6 +1908,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_cont_expand(struct inode *inode, loff_t size);
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index abbcbeb28c7..e693e1b4ac4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -11,6 +11,7 @@
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
 #define EXTENT_FLAG_COMPRESSED 1
+#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 
 struct extent_map {
 	struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0aa15436590..b8a7637e14a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -142,40 +142,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	}
 	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
-	/* FIXME...EIEIO, ENOSPC and more */
-	/* insert any holes we need to create */
-	if (isize < start_pos) {
-		u64 last_pos_in_file;
-		u64 hole_size;
-		u64 mask = root->sectorsize - 1;
-		last_pos_in_file = (isize + mask) & ~mask;
-		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		if (hole_size > 0) {
-			btrfs_wait_ordered_range(inode, last_pos_in_file,
-						 last_pos_in_file + hole_size);
-			mutex_lock(&BTRFS_I(inode)->extent_mutex);
-			err = btrfs_drop_extents(trans, root, inode,
-						 last_pos_in_file,
-						 last_pos_in_file + hole_size,
-						 last_pos_in_file,
-						 &hint_byte);
-			if (err)
-				goto failed;
-
-			err = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       last_pos_in_file,
-						       0, 0, hole_size, 0,
-						       hole_size, 0, 0, 0);
-			btrfs_drop_extent_cache(inode, last_pos_in_file,
-					last_pos_in_file + hole_size - 1, 0);
-			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-			btrfs_check_file(root, inode);
-		}
-		if (err)
-			goto failed;
-	}
-
 	/* check for reserved extents on each page, we don't want
 	 * to reset the delalloc bit on things that already have
 	 * extents reserved.
@@ -191,7 +157,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		i_size_write(inode, end_pos);
 		btrfs_update_inode(trans, root, inode);
 	}
-failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
@@ -697,6 +662,12 @@ static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
 	start_pos = pos & ~((u64)root->sectorsize - 1);
 	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
+	if (start_pos > inode->i_size) {
+		err = btrfs_cont_expand(inode, start_pos);
+		if (err)
+			return err;
+	}
+
 	memset(pages, 0, num_pages * sizeof(struct page *));
 again:
 	for (i = 0; i < num_pages; i++) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dd9cd01042b..8254d6fa691 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2296,81 +2296,91 @@ out:
 	return ret;
 }
 
-static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+int btrfs_cont_expand(struct inode *inode, loff_t size)
 {
-	struct inode *inode = dentry->d_inode;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	u64 mask = root->sectorsize - 1;
+	u64 hole_start = (inode->i_size + mask) & ~mask;
+	u64 block_end = (size + mask) & ~mask;
+	u64 last_byte;
+	u64 cur_offset;
+	u64 hole_size;
 	int err;
 
-	err = inode_change_ok(inode, attr);
+	if (size <= hole_start)
+		return 0;
+
+	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		return err;
 
-	if (S_ISREG(inode->i_mode) &&
-	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
-		struct btrfs_trans_handle *trans;
-		struct btrfs_root *root = BTRFS_I(inode)->root;
-		struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		u64 mask = root->sectorsize - 1;
-		u64 hole_start = (inode->i_size + mask) & ~mask;
-		u64 block_end = (attr->ia_size + mask) & ~mask;
-		u64 hole_size;
-		u64 alloc_hint = 0;
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		btrfs_wait_ordered_range(inode, hole_start,
+					 block_end - hole_start);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+		if (!ordered)
+			break;
+		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		btrfs_put_ordered_extent(ordered);
+	}
 
-		if (attr->ia_size <= hole_start)
-			goto out;
+	trans = btrfs_start_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
 
-		err = btrfs_check_free_space(root, 1, 0);
-		if (err)
-			goto fail;
+	cur_offset = hole_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				block_end - cur_offset, 0);
+		BUG_ON(IS_ERR(em) || !em);
+		last_byte = min(extent_map_end(em), block_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+			hole_size = last_byte - cur_offset;
+			err = btrfs_insert_file_extent(trans, root,
+					inode->i_ino, cur_offset, 0,
+					0, hole_size, 0, hole_size,
+					0, 0, 0);
+			btrfs_drop_extent_cache(inode, hole_start,
+					last_byte - 1, 0);
+		}
+		free_extent_map(em);
+		cur_offset = last_byte;
+		if (err || cur_offset >= block_end)
+			break;
+	}
 
-		btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	btrfs_end_transaction(trans, root);
+	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	return err;
+}
 
-		hole_size = block_end - hole_start;
-		while(1) {
-			struct btrfs_ordered_extent *ordered;
-			btrfs_wait_ordered_range(inode, hole_start, hole_size);
-
-			lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
-			ordered = btrfs_lookup_ordered_extent(inode, hole_start);
-			if (ordered) {
-				unlock_extent(io_tree, hole_start,
-					      block_end - 1, GFP_NOFS);
-				btrfs_put_ordered_extent(ordered);
-			} else {
-				break;
-			}
-		}
+static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
 
-		trans = btrfs_start_transaction(root, 1);
-		btrfs_set_trans_block_group(trans, inode);
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
-		err = btrfs_drop_extents(trans, root, inode,
-					 hole_start, block_end, hole_start,
-					 &alloc_hint);
+	err = inode_change_ok(inode, attr);
+	if (err)
+		return err;
 
-		if (alloc_hint != EXTENT_MAP_INLINE) {
-			err = btrfs_insert_file_extent(trans, root,
-						       inode->i_ino,
-						       hole_start, 0, 0,
-						       hole_size, 0, hole_size,
-						       0, 0, 0);
-			btrfs_drop_extent_cache(inode, hole_start,
-						(u64)-1, 0);
-			btrfs_check_file(root, inode);
-		}
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-		btrfs_end_transaction(trans, root);
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	if (S_ISREG(inode->i_mode) &&
+	    attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+		err = btrfs_cont_expand(inode, attr->ia_size);
 		if (err)
 			return err;
 	}
-out:
+
 	err = inode_setattr(inode, attr);
 
 	if (!err && ((attr->ia_valid & ATTR_MODE)))
 		err = btrfs_acl_chmod(inode);
-fail:
 	return err;
 }
 
@@ -3456,27 +3466,44 @@ again:
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
-		err = 0;
-		if (start < extent_start || start >= extent_end) {
-			em->start = start;
-			if (start < extent_start) {
-				if (start + len <= extent_start)
-					goto not_found;
-				em->len = extent_end - extent_start;
-			} else {
-				em->len = len;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		size_t size;
+		size = btrfs_file_extent_inline_len(leaf, item);
+		extent_end = (extent_start + size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
+	}
+
+	if (start >= extent_end) {
+		path->slots[0]++;
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
 			}
-			goto not_found_em;
+			if (ret > 0)
+				goto not_found;
+			leaf = path->nodes[0];
 		}
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != objectid ||
+		    found_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto not_found;
+		if (start + len <= found_key.offset)
+			goto not_found;
+		em->start = start;
+		em->len = found_key.offset - start;
+		goto not_found_em;
+	}
+
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		em->start = extent_start;
+		em->len = extent_end - extent_start;
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
-			em->start = extent_start;
-			em->len = extent_end - extent_start;
 			em->block_start = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		em->start = extent_start;
-		em->len = extent_end - extent_start;
 		if (compressed) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->block_start = bytenr;
@@ -3489,38 +3516,21 @@ again:
 		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		u64 page_start;
 		unsigned long ptr;
 		char *map;
 		size_t size;
 		size_t extent_offset;
 		size_t copy_size;
 
-		size = btrfs_file_extent_inline_len(leaf, item);
-		extent_end = (extent_start + size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-		if (start < extent_start || start >= extent_end) {
-			em->start = start;
-			if (start < extent_start) {
-				if (start + len <= extent_start)
-					goto not_found;
-				em->len = extent_end - extent_start;
-			} else {
-				em->len = len;
-			}
-			goto not_found_em;
-		}
 		em->block_start = EXTENT_MAP_INLINE;
-
 		if (!page || create) {
 			em->start = extent_start;
-			em->len = (size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
+			em->len = extent_end - extent_start;
 			goto out;
 		}
 
-		page_start = page_offset(page) + pg_offset;
-		extent_offset = page_start - extent_start;
+		size = btrfs_file_extent_inline_len(leaf, item);
+		extent_offset = page_offset(page) + pg_offset - extent_start;
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
@@ -3570,6 +3580,7 @@ not_found:
 	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
+	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || extent_map_end(em) <= start) {
-- 
cgit v1.2.3


From 6643558db29006825dbb10012b3f8890aca4bcd5 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:19:50 -0400
Subject: Btrfs: Fix bookend extent race v2

When dropping the middle part of an extent, btrfs_drop_extents first
truncates the extent and then inserts a bookend extent.

Since truncation and insertion can't be done atomically, there is a small
window during which the bookend extent isn't in the tree. This causes
problems for functions that search the tree for file extent items. The fix
is to lock the range of the bookend extent before truncation.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 +++++++++++++---------------
 fs/btrfs/extent_io.c   |  6 +++++-
 fs/btrfs/file.c        | 31 +++++++++++++++++++++++++------
 fs/btrfs/inode.c       | 10 ----------
 4 files changed, 43 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fada9c22a02..535cee47fcf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3379,11 +3379,13 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
 	struct extent_map *em;
+	u64 start = extent_key->objectid - offset;
+	u64 end = start + extent_key->offset - 1;
 
 	em = alloc_extent_map(GFP_NOFS);
 	BUG_ON(!em || IS_ERR(em));
 
-	em->start = extent_key->objectid - offset;
+	em->start = start;
 	em->len = extent_key->offset;
 	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
@@ -3391,7 +3393,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	/* setup extent map to cheat btrfs_readpage */
-	mutex_lock(&BTRFS_I(reloc_inode)->extent_mutex);
+	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
 	while (1) {
 		int ret;
 		spin_lock(&em_tree->lock);
@@ -3401,13 +3403,11 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 			free_extent_map(em);
 			break;
 		}
-		btrfs_drop_extent_cache(reloc_inode, em->start,
-					em->start + em->len - 1, 0);
+		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
 	}
-	mutex_unlock(&BTRFS_I(reloc_inode)->extent_mutex);
+	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
 
-	return relocate_inode_pages(reloc_inode, extent_key->objectid - offset,
-				    extent_key->offset);
+	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
 }
 
 struct btrfs_ref_path {
@@ -3831,7 +3831,6 @@ next:
 			 * the file extent item was modified by someone
 			 * before the extent got locked.
 			 */
-			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
 				      lock_end, GFP_NOFS);
 			extent_locked = 0;
@@ -3896,8 +3895,12 @@ next:
 			lock_start = key.offset;
 			lock_end = lock_start + num_bytes - 1;
 		} else {
-			BUG_ON(lock_start != key.offset);
-			BUG_ON(lock_end - lock_start + 1 < num_bytes);
+			if (lock_start > key.offset ||
+			    lock_end + 1 < key.offset + num_bytes) {
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      lock_start, lock_end, GFP_NOFS);
+				extent_locked = 0;
+			}
 		}
 
 		if (!inode) {
@@ -3951,7 +3954,6 @@ next:
 			if (ordered)
 				btrfs_put_ordered_extent(ordered);
 
-			mutex_lock(&BTRFS_I(inode)->extent_mutex);
 			extent_locked = 1;
 			continue;
 		}
@@ -4073,7 +4075,6 @@ next:
 		}
 
 		if (extent_locked) {
-			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
 				      lock_end, GFP_NOFS);
 			extent_locked = 0;
@@ -4091,7 +4092,6 @@ out:
 	if (inode) {
 		mutex_unlock(&inode->i_mutex);
 		if (extent_locked) {
-			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
 				      lock_end, GFP_NOFS);
 		}
@@ -4180,10 +4180,8 @@ static int noinline invalidate_extent_cache(struct btrfs_root *root,
 
 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
 			    key.offset + num_bytes - 1, GFP_NOFS);
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		btrfs_drop_extent_cache(inode, key.offset,
 					key.offset + num_bytes - 1, 1);
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
 			      key.offset + num_bytes - 1, GFP_NOFS);
 		cond_resched();
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7503bd46819..65a0583027e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -946,8 +946,12 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 
 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
 			     &failed_start, mask);
-	if (err == -EEXIST)
+	if (err == -EEXIST) {
+		if (failed_start > start)
+			clear_extent_bit(tree, start, failed_start - 1,
+					 EXTENT_LOCKED, 1, 0, mask);
 		return 0;
+	}
 	return 1;
 }
 EXPORT_SYMBOL(try_lock_extent);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b8a7637e14a..1a0510ad030 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -364,6 +364,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
 	u64 extent_end = 0;
+	u64 locked_end = end;
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
@@ -479,12 +480,6 @@ next_slot:
 			goto next_slot;
 		}
 
-		if (found_inline) {
-			u64 mask = root->sectorsize - 1;
-			search_start = (extent_end + mask) & ~mask;
-		} else
-			search_start = extent_end;
-
 		if (end <= extent_end && start >= key.offset && found_inline)
 			*hint_byte = EXTENT_MAP_INLINE;
 
@@ -501,6 +496,26 @@ next_slot:
 			if (found_inline && start <= key.offset)
 				keep = 1;
 		}
+
+		if (bookend && found_extent && locked_end < extent_end) {
+			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+			if (!ret) {
+				btrfs_release_path(root, path);
+				lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+				locked_end = extent_end;
+				continue;
+			}
+			locked_end = extent_end;
+		}
+
+		if (found_inline) {
+			u64 mask = root->sectorsize - 1;
+			search_start = (extent_end + mask) & ~mask;
+		} else
+			search_start = extent_end;
+
 		/* truncate existing extent */
 		if (start > key.offset) {
 			u64 new_num;
@@ -638,6 +653,10 @@ next_slot:
 	}
 out:
 	btrfs_free_path(path);
+	if (locked_end > end) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+			      GFP_NOFS);
+	}
 	btrfs_check_file(root, inode);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8254d6fa691..e8511d14b11 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -246,7 +246,6 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
-	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	ret = btrfs_drop_extents(trans, root, inode, start,
 				 aligned_end, aligned_end, &hint_byte);
 	BUG_ON(ret);
@@ -258,7 +257,6 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   compressed_pages);
 	BUG_ON(ret);
 	btrfs_drop_extent_cache(inode, start, aligned_end, 0);
-	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 	return 0;
 }
 
@@ -437,9 +435,7 @@ again:
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
-	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	while(disk_num_bytes > 0) {
 		unsigned long min_bytes;
@@ -477,8 +473,6 @@ again:
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 		if (will_compress)
@@ -495,7 +489,6 @@ again:
 			btrfs_drop_extent_cache(inode, start,
 						start + ram_size - 1, 0);
 		}
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
@@ -1016,8 +1009,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 
 	INIT_LIST_HEAD(&list);
 
-	mutex_lock(&BTRFS_I(inode)->extent_mutex);
-
 	ret = btrfs_drop_extents(trans, root, inode,
 				 ordered_extent->file_offset,
 				 ordered_extent->file_offset +
@@ -1059,7 +1050,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
 				ordered_extent->file_offset +
 				ordered_extent->len - 1, 0);
-	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
 	ins.offset = ordered_extent->disk_len;
-- 
cgit v1.2.3


From 80ff385665b7fca29fefe358a60ab0d09f9b8e87 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:20:02 -0400
Subject: Btrfs: update nodatacow code v2

This patch simplifies the nodatacow checker. If all references
were created after the latest snapshot, then we can avoid COW
safely. This patch also updates run_delalloc_nocow to do more
fine-grained checking.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h        |   8 +-
 fs/btrfs/extent-tree.c  | 131 ++++-------------------------
 fs/btrfs/inode.c        | 213 ++++++++++++++++++++++++++++--------------------
 fs/btrfs/ioctl.c        |   1 +
 fs/btrfs/ordered-data.c |   9 +-
 fs/btrfs/ordered-data.h |   3 +-
 fs/btrfs/transaction.c  |   2 +
 7 files changed, 154 insertions(+), 213 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ca5547af609..8bf6a085a73 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -454,6 +454,7 @@ struct btrfs_root_item {
 	__le64 bytenr;
 	__le64 byte_limit;
 	__le64 bytes_used;
+	__le64 last_snapshot;
 	__le32 flags;
 	__le32 refs;
 	struct btrfs_disk_key drop_progress;
@@ -1413,6 +1414,8 @@ BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
 BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
 BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
+BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
+			 last_snapshot, 64);
 
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -1564,9 +1567,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   struct btrfs_key *key, u64 bytenr);
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 bytenr);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 535cee47fcf..1eb69a91b72 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -848,9 +848,8 @@ out:
 	return 0;
 }
 
-static int get_reference_status(struct btrfs_root *root, u64 bytenr,
-				u64 parent_gen, u64 ref_objectid,
-			        u64 *min_generation, u32 *ref_count)
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 bytenr)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_path *path;
@@ -858,8 +857,8 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	struct btrfs_extent_ref *ref_item;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u64 root_objectid = root->root_key.objectid;
-	u64 ref_generation;
+	u64 ref_root;
+	u64 last_snapshot;
 	u32 nritems;
 	int ret;
 
@@ -872,7 +871,9 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	if (ret < 0)
 		goto out;
 	BUG_ON(ret == 0);
-	if (ret < 0 || path->slots[0] == 0)
+
+	ret = -ENOENT;
+	if (path->slots[0] == 0)
 		goto out;
 
 	path->slots[0]--;
@@ -880,14 +881,10 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 	if (found_key.objectid != bytenr ||
-	    found_key.type != BTRFS_EXTENT_ITEM_KEY) {
-		ret = 1;
+	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
 		goto out;
-	}
-
-	*ref_count = 0;
-	*min_generation = (u64)-1;
 
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
 	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
@@ -910,114 +907,22 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 
 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_extent_ref);
-		ref_generation = btrfs_ref_generation(leaf, ref_item);
-		/*
-		 * For (parent_gen > 0 && parent_gen > ref_generation):
-		 *
-		 * we reach here through the oldest root, therefore
-		 * all other reference from same snapshot should have
-		 * a larger generation.
-		 */
-		if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
-		    (parent_gen > 0 && parent_gen > ref_generation) ||
-		    (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-		     ref_objectid != btrfs_ref_objectid(leaf, ref_item))) {
-			*ref_count = 2;
-			break;
-		}
-
-		*ref_count = 1;
-		if (*min_generation > ref_generation)
-			*min_generation = ref_generation;
-
-		path->slots[0]++;
-	}
-	ret = 0;
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_cross_ref_exists(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   struct btrfs_key *key, u64 bytenr)
-{
-	struct btrfs_root *old_root;
-	struct btrfs_path *path = NULL;
-	struct extent_buffer *eb;
-	struct btrfs_file_extent_item *item;
-	u64 ref_generation;
-	u64 min_generation;
-	u64 extent_start;
-	u32 ref_count;
-	int level;
-	int ret;
-
-	BUG_ON(trans == NULL);
-	BUG_ON(key->type != BTRFS_EXTENT_DATA_KEY);
-	ret = get_reference_status(root, bytenr, 0, key->objectid,
-				   &min_generation, &ref_count);
-	if (ret)
-		return ret;
-
-	if (ref_count != 1)
-		return 1;
-
-	old_root = root->dirty_root->root;
-	ref_generation = old_root->root_key.offset;
-
-	/* all references are created in running transaction */
-	if (min_generation > ref_generation) {
-		ret = 0;
-		goto out;
-	}
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	path->skip_locking = 1;
-	/* if no item found, the extent is referenced by other snapshot */
-	ret = btrfs_search_slot(NULL, old_root, key, path, 0, 0);
-	if (ret)
-		goto out;
-
-	eb = path->nodes[0];
-	item = btrfs_item_ptr(eb, path->slots[0],
-			      struct btrfs_file_extent_item);
-	if (btrfs_file_extent_type(eb, item) != BTRFS_FILE_EXTENT_REG ||
-	    btrfs_file_extent_disk_bytenr(eb, item) != bytenr) {
-		ret = 1;
-		goto out;
-	}
-
-	for (level = BTRFS_MAX_LEVEL - 1; level >= -1; level--) {
-		if (level >= 0) {
-			eb = path->nodes[level];
-			if (!eb)
-				continue;
-			extent_start = eb->start;
-		} else
-			extent_start = bytenr;
-
-		ret = get_reference_status(root, extent_start, ref_generation,
-					   0, &min_generation, &ref_count);
-		if (ret)
+		ref_root = btrfs_ref_root(leaf, ref_item);
+		if (ref_root != root->root_key.objectid &&
+		    ref_root != BTRFS_TREE_LOG_OBJECTID) {
+			ret = 1;
 			goto out;
-
-		if (ref_count != 1) {
+		}
+		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
 			ret = 1;
 			goto out;
 		}
-		if (level >= 0)
-			ref_generation = btrfs_header_generation(eb);
+
+		path->slots[0]++;
 	}
 	ret = 0;
 out:
-	if (path)
-		btrfs_free_path(path);
+	btrfs_free_path(path);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e8511d14b11..3e6f0568fdb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -298,6 +298,7 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
 	unsigned long max_compressed = 128 * 1024;
 	unsigned long max_uncompressed = 256 * 1024;
 	int i;
+	int ordered_type;
 	int will_compress;
 
 	trans = btrfs_join_transaction(root, 1);
@@ -491,9 +492,10 @@ again:
 		}
 
 		cur_alloc_size = ins.offset;
+		ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ram_size, cur_alloc_size, 0,
-					       will_compress);
+					       ram_size, cur_alloc_size,
+					       ordered_type);
 		BUG_ON(ret);
 
 		if (disk_num_bytes < cur_alloc_size) {
@@ -587,115 +589,148 @@ free_pages_out:
 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
 			      u64 start, u64 end, int *page_started)
 {
-	u64 extent_start;
-	u64 extent_end;
-	u64 bytenr;
-	u64 loops = 0;
-	u64 total_fs_bytes;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_block_group_cache *block_group;
 	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
-	int found_type;
 	struct btrfs_path *path;
-	struct btrfs_file_extent_item *item;
-	int ret;
-	int err = 0;
+	struct btrfs_file_extent_item *fi;
 	struct btrfs_key found_key;
+	u64 cow_start;
+	u64 cur_offset;
+	u64 extent_end;
+	u64 disk_bytenr;
+	u64 num_bytes;
+	int extent_type;
+	int ret;
+	int nocow;
+	int check_prev = 1;
 
-	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
-again:
-	ret = btrfs_lookup_file_extent(NULL, root, path,
-				       inode->i_ino, start, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-
-	if (ret != 0) {
-		if (path->slots[0] == 0)
-			goto not_found;
-		path->slots[0]--;
-	}
-
-	leaf = path->nodes[0];
-	item = btrfs_item_ptr(leaf, path->slots[0],
-			      struct btrfs_file_extent_item);
-
-	/* are we inside the extent that was found? */
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	found_type = btrfs_key_type(&found_key);
-	if (found_key.objectid != inode->i_ino ||
-	    found_type != BTRFS_EXTENT_DATA_KEY)
-		goto not_found;
-
-	found_type = btrfs_file_extent_type(leaf, item);
-	extent_start = found_key.offset;
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
-		u64 extent_num_bytes;
-
-		extent_num_bytes = btrfs_file_extent_num_bytes(leaf, item);
-		extent_end = extent_start + extent_num_bytes;
-		err = 0;
 
-		if (btrfs_file_extent_compression(leaf, item) ||
-		    btrfs_file_extent_encryption(leaf,item) ||
-		    btrfs_file_extent_other_encoding(leaf, item))
-			goto not_found;
+	cow_start = (u64)-1;
+	cur_offset = start;
+	while (1) {
+		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+					       cur_offset, 0);
+		BUG_ON(ret < 0);
+		if (ret > 0 && path->slots[0] > 0 && check_prev) {
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &found_key,
+					      path->slots[0] - 1);
+			if (found_key.objectid == inode->i_ino &&
+			    found_key.type == BTRFS_EXTENT_DATA_KEY)
+				path->slots[0]--;
+		}
+		check_prev = 0;
+next_slot:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				BUG_ON(1);
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+		}
 
-		if (loops && start != extent_start)
-			goto not_found;
+		nocow = 0;
+		disk_bytenr = 0;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-		if (start < extent_start || start >= extent_end)
-			goto not_found;
+		if (found_key.objectid > inode->i_ino ||
+		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
+		    found_key.offset > end)
+			break;
 
-		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
-		if (bytenr == 0)
-			goto not_found;
+		if (found_key.offset > cur_offset) {
+			extent_end = found_key.offset;
+			goto out_check;
+		}
 
-		if (btrfs_cross_ref_exists(trans, root, &found_key, bytenr))
-			goto not_found;
-		/*
-		 * we may be called by the resizer, make sure we're inside
-		 * the limits of the FS
-		 */
-		block_group = btrfs_lookup_block_group(root->fs_info,
-						       bytenr);
-		if (!block_group || block_group->ro)
-			goto not_found;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(leaf, fi);
 
-		bytenr += btrfs_file_extent_offset(leaf, item);
-		extent_num_bytes = min(end + 1, extent_end) - start;
-		ret = btrfs_add_ordered_extent(inode, start, bytenr,
-						extent_num_bytes,
-						extent_num_bytes, 1, 0);
-		if (ret) {
-			err = ret;
-			goto out;
+		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+			struct btrfs_block_group_cache *block_group;
+			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			extent_end = found_key.offset +
+				btrfs_file_extent_num_bytes(leaf, fi);
+			if (extent_end <= start) {
+				path->slots[0]++;
+				goto next_slot;
+			}
+			if (btrfs_file_extent_compression(leaf, fi) ||
+			    btrfs_file_extent_encryption(leaf, fi) ||
+			    btrfs_file_extent_other_encoding(leaf, fi))
+				goto out_check;
+			if (disk_bytenr == 0)
+				goto out_check;
+			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
+				goto out_check;
+			block_group = btrfs_lookup_block_group(root->fs_info,
+							       disk_bytenr);
+			if (!block_group || block_group->ro)
+				goto out_check;
+			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+			nocow = 1;
+		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			extent_end = found_key.offset +
+				btrfs_file_extent_inline_len(leaf, fi);
+			extent_end = ALIGN(extent_end, root->sectorsize);
+		} else {
+			BUG_ON(1);
+		}
+out_check:
+		if (extent_end <= start) {
+			path->slots[0]++;
+			goto next_slot;
+		}
+		if (!nocow) {
+			if (cow_start == (u64)-1)
+				cow_start = cur_offset;
+			cur_offset = extent_end;
+			if (cur_offset > end)
+				break;
+			path->slots[0]++;
+			goto next_slot;
 		}
 
 		btrfs_release_path(root, path);
-		start = extent_end;
-		if (start <= end) {
-			loops++;
-			goto again;
+		if (cow_start != (u64)-1) {
+			ret = cow_file_range(inode, locked_page, cow_start,
+					found_key.offset - 1, page_started);
+			BUG_ON(ret);
+			cow_start = (u64)-1;
 		}
-	} else {
-not_found:
-		btrfs_end_transaction(trans, root);
-		btrfs_free_path(path);
-		return cow_file_range(inode, locked_page, start, end,
-				      page_started);
+
+		disk_bytenr += cur_offset - found_key.offset;
+		num_bytes = min(end + 1, extent_end) - cur_offset;
+
+		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
+					       num_bytes, num_bytes,
+					       BTRFS_ORDERED_NOCOW);
+		cur_offset = extent_end;
+		if (cur_offset > end)
+			break;
 	}
-out:
-	WARN_ON(err);
-	btrfs_end_transaction(trans, root);
+	btrfs_release_path(root, path);
+
+	if (cur_offset <= end && cow_start == (u64)-1)
+		cow_start = cur_offset;
+	if (cow_start != (u64)-1) {
+		ret = cow_file_range(inode, locked_page, cow_start, end,
+				     page_started);
+		BUG_ON(ret);
+	}
+
+	ret = btrfs_end_transaction(trans, root);
+	BUG_ON(ret);
 	btrfs_free_path(path);
-	return err;
+	return 0;
 }
 
 /*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fd3c8b5676c..7f915d47839 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -112,6 +112,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	btrfs_set_root_level(&root_item, 0);
 	btrfs_set_root_refs(&root_item, 1);
 	btrfs_set_root_used(&root_item, 0);
+	btrfs_set_root_last_snapshot(&root_item, 0);
 
 	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
 	root_item.drop_level = 0;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5745bb96d4..e7317c8fda2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,8 +165,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int nocow,
-			     int compressed)
+			     u64 start, u64 len, u64 disk_len, int type)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -183,10 +182,8 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->len = len;
 	entry->disk_len = disk_len;
 	entry->inode = inode;
-	if (nocow)
-		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
-	if (compressed)
-		set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
+	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_COMPRESSED)
+		set_bit(type, &entry->flags);
 
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1ef464145d2..e6d9bc54c2b 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -132,8 +132,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int nocow,
-			     int compressed);
+			     u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 968b84f17a1..e72a013d24b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -763,6 +763,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
+	btrfs_record_root_in_trans(root);
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
-- 
cgit v1.2.3


From d899e05215178fed903ad0e7fc1cb4d8e0cc0a88 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 30 Oct 2008 14:25:28 -0400
Subject: Btrfs: Add fallocate support v2 This patch updates btrfs-progs for
 fallocate support.

fallocate is a little different in Btrfs because we need to tell the
COW system that a given preallocated extent doesn't need to be
cow'd as long as there are no snapshots of it.  This leverages the
-o nodatacow checks.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h        |   9 +-
 fs/btrfs/extent-tree.c  |  65 +++++-----
 fs/btrfs/extent_io.c    |  12 +-
 fs/btrfs/extent_map.h   |   1 +
 fs/btrfs/file.c         | 245 +++++++++++++++++++++++++++++++++++-
 fs/btrfs/inode.c        | 323 +++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ioctl.c        |   3 +-
 fs/btrfs/ordered-data.c |   5 +-
 fs/btrfs/ordered-data.h |   4 +-
 fs/btrfs/tree-log.c     |  13 +-
 10 files changed, 563 insertions(+), 117 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8bf6a085a73..d5ba3d1aaf9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -462,8 +462,9 @@ struct btrfs_root_item {
 	u8 level;
 } __attribute__ ((__packed__));
 
-#define BTRFS_FILE_EXTENT_REG 0
-#define BTRFS_FILE_EXTENT_INLINE 1
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
 
 struct btrfs_file_extent_item {
 	/*
@@ -868,6 +869,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
+#define BTRFS_INODE_PREALLOC		(1 << 4)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1924,6 +1926,9 @@ extern struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_block);
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
 
 /* tree-defrag.c */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1eb69a91b72..8af39521eb7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2147,6 +2147,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	total_needed += empty_size;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
+	if (!block_group)
+		block_group = btrfs_lookup_first_block_group(root->fs_info,
+							     search_start);
 	space_info = __find_space_info(root->fs_info, data);
 
 	down_read(&space_info->groups_sem);
@@ -3426,9 +3429,7 @@ walk_down:
 next:
 		level--;
 		btrfs_release_path(extent_root, path);
-		if (need_resched()) {
-			cond_resched();
-		}
+		cond_resched();
 	}
 	/* reached lowest level */
 	ret = 1;
@@ -3539,9 +3540,7 @@ found:
 		}
 
 		btrfs_release_path(extent_root, path);
-		if (need_resched()) {
-			cond_resched();
-		}
+		cond_resched();
 	}
 	/* reached max tree level, but no tree root found. */
 	BUG();
@@ -3654,8 +3653,9 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
 									   fi);
-		WARN_ON(exts[nr].offset > 0);
-		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
+		BUG_ON(exts[nr].offset > 0);
+		BUG_ON(exts[nr].compression || exts[nr].encryption);
+		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
 		cur_pos += exts[nr].num_bytes;
 		nr++;
@@ -3709,6 +3709,7 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
+	int extent_type;
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
@@ -3781,8 +3782,9 @@ next:
 		}
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
-		if ((btrfs_file_extent_type(leaf, fi) !=
-		     BTRFS_FILE_EXTENT_REG) ||
+		extent_type = btrfs_file_extent_type(leaf, fi);
+		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
+		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
 		     extent_key->objectid)) {
 			path->slots[0]++;
@@ -3865,16 +3867,10 @@ next:
 
 		if (nr_extents == 1) {
 			/* update extent pointer in place */
-			btrfs_set_file_extent_generation(leaf, fi,
-						trans->transid);
 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
-			btrfs_set_file_extent_ram_bytes(leaf, fi,
-						new_extents[0].ram_bytes);
-			ext_offset += new_extents[0].offset;
-			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
 
 			btrfs_drop_extent_cache(inode, key.offset,
@@ -3901,6 +3897,8 @@ next:
 			btrfs_release_path(root, path);
 			key.offset += num_bytes;
 		} else {
+			BUG_ON(1);
+#if 0
 			u64 alloc_hint;
 			u64 extent_len;
 			int i;
@@ -3977,6 +3975,7 @@ next:
 					break;
 			}
 			BUG_ON(i >= nr_extents);
+#endif
 		}
 
 		if (extent_locked) {
@@ -4156,15 +4155,10 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
-		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-		btrfs_set_file_extent_ram_bytes(leaf, fi,
-						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extent->disk_num_bytes);
-		new_extent->offset += btrfs_file_extent_offset(leaf, fi);
-		btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
 		btrfs_mark_buffer_dirty(leaf);
 
 		ret = btrfs_inc_extent_ref(trans, root,
@@ -4625,12 +4619,15 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 			 */
 			if (!new_extents) {
 				u64 group_start = group->key.objectid;
+				new_extents = kmalloc(sizeof(*new_extents),
+						      GFP_NOFS);
+				nr_extents = 1;
 				ret = get_new_locations(reloc_inode,
 							extent_key,
-							group_start, 0,
+							group_start, 1,
 							&new_extents,
 							&nr_extents);
-				if (ret < 0)
+				if (ret)
 					goto out;
 			}
 			btrfs_record_root_in_trans(found_root);
@@ -4762,7 +4759,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, size);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
+					  BTRFS_INODE_NOCOMPRESS);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root, path);
 out:
@@ -4835,6 +4833,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	struct inode *reloc_inode;
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_key key;
+	u64 skipped;
 	u64 cur_byte;
 	u64 total_found;
 	u32 nritems;
@@ -4864,6 +4863,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
 again:
+	skipped = 0;
 	total_found = 0;
 	progress = 0;
 	key.objectid = block_group->key.objectid;
@@ -4926,6 +4926,8 @@ next:
 		ret = relocate_one_extent(root, path, &key, block_group,
 					  reloc_inode, pass);
 		BUG_ON(ret < 0);
+		if (ret > 0)
+			skipped++;
 
 		key.objectid = cur_byte;
 		key.type = 0;
@@ -4944,6 +4946,11 @@ next:
 		printk("btrfs found %llu extents in pass %d\n",
 		       (unsigned long long)total_found, pass);
 		pass++;
+		if (total_found == skipped && pass > 2) {
+			iput(reloc_inode);
+			reloc_inode = create_reloc_inode(info, block_group);
+			pass = 0;
+		}
 		goto again;
 	}
 
@@ -5011,17 +5018,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
 				       cache_node);
-
-		spin_unlock(&info->block_group_cache_lock);
-		btrfs_remove_free_space_cache(block_group);
-		spin_lock(&info->block_group_cache_lock);
-
 		rb_erase(&block_group->cache_node,
 			 &info->block_group_cache_tree);
+		spin_unlock(&info->block_group_cache_lock);
+
+		btrfs_remove_free_space_cache(block_group);
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
 		kfree(block_group);
+
+		spin_lock(&info->block_group_cache_lock);
 	}
 	spin_unlock(&info->block_group_cache_lock);
 	return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 65a0583027e..eb3c12e7bea 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2015,6 +2015,8 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		bdev = em->bdev;
 		block_start = em->block_start;
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			block_start = EXTENT_MAP_HOLE;
 		free_extent_map(em);
 		em = NULL;
 
@@ -2769,14 +2771,18 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 	struct inode *inode = mapping->host;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
+	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	em = get_extent(inode, NULL, 0, start, (1 << inode->i_blkbits), 0);
+	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+		    GFP_NOFS);
+	em = get_extent(inode, NULL, 0, start, blksize, 0);
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+		      GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
-	if (em->block_start == EXTENT_MAP_INLINE ||
-	    em->block_start == EXTENT_MAP_HOLE)
+	if (em->block_start > EXTENT_MAP_LAST_BYTE)
 		goto out;
 
 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index e693e1b4ac4..accfedaeb51 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -12,6 +12,7 @@
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
+#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
 
 struct extent_map {
 	struct rb_node rb_node;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1a0510ad030..238a8e215eb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -381,7 +381,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int keep;
 	int slot;
 	int bookend;
-	int found_type;
+	int found_type = 0;
 	int found_extent;
 	int found_inline;
 	int recow;
@@ -442,7 +442,8 @@ next_slot:
 								  extent);
 			other_encoding = btrfs_file_extent_other_encoding(leaf,
 								  extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				extent_end =
 				     btrfs_file_extent_disk_bytenr(leaf,
 								   extent);
@@ -609,8 +610,7 @@ next_slot:
 			 */
 			btrfs_set_file_extent_ram_bytes(leaf, extent,
 							ram_bytes);
-			btrfs_set_file_extent_type(leaf, extent,
-						   BTRFS_FILE_EXTENT_REG);
+			btrfs_set_file_extent_type(leaf, extent, found_type);
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 
@@ -661,6 +661,243 @@ out:
 	return ret;
 }
 
+static int extent_mergeable(struct extent_buffer *leaf, int slot,
+			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 extent_end;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
+	    btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	if ((*start && *start != key.offset) || (*end && *end != extent_end))
+		return 0;
+
+	*start = key.offset;
+	*end = extent_end;
+	return 1;
+}
+
+/*
+ * Mark the extent covering the range [start, end) as written.
+ *
+ * This changes the extent type from 'pre-allocated' to 'regular'. If only
+ * part of the extent is marked as written, the extent is split into two
+ * or three file extent items.
+ */
+int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct inode *inode, u64 start, u64 end)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+	u64 bytenr;
+	u64 num_bytes;
+	u64 extent_end;
+	u64 extent_offset;
+	u64 other_start;
+	u64 other_end;
+	u64 split = start;
+	u64 locked_end = end;
+	int extent_type;
+	int split_end = 1;
+	int ret;
+
+	btrfs_drop_extent_cache(inode, start, end - 1, 0);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+again:
+	key.objectid = inode->i_ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (split == start)
+		key.offset = split;
+	else
+		key.offset = split - 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	BUG_ON(key.objectid != inode->i_ino ||
+	       key.type != BTRFS_EXTENT_DATA_KEY);
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	extent_type = btrfs_file_extent_type(leaf, fi);
+	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+	BUG_ON(key.offset > start || extent_end < end);
+
+	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+	extent_offset = btrfs_file_extent_offset(leaf, fi);
+
+	if (key.offset == start)
+		split = end;
+
+	if (key.offset == start && extent_end == end) {
+		int del_nr = 0;
+		int del_slot = 0;
+		u64 leaf_owner = btrfs_header_owner(leaf);
+		u64 leaf_gen = btrfs_header_generation(leaf);
+		other_start = end;
+		other_end = 0;
+		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			extent_end = other_end;
+			del_slot = path->slots[0] + 1;
+			del_nr++;
+			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+						leaf->start, leaf_owner,
+						leaf_gen, inode->i_ino, 0);
+			BUG_ON(ret);
+		}
+		other_start = 0;
+		other_end = start;
+		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			key.offset = other_start;
+			del_slot = path->slots[0];
+			del_nr++;
+			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+						leaf->start, leaf_owner,
+						leaf_gen, inode->i_ino, 0);
+			BUG_ON(ret);
+		}
+		split_end = 0;
+		if (del_nr == 0) {
+			btrfs_set_file_extent_type(leaf, fi,
+						   BTRFS_FILE_EXTENT_REG);
+			goto done;
+		}
+
+		fi = btrfs_item_ptr(leaf, del_slot - 1,
+				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_num_bytes(leaf, fi,
+						extent_end - key.offset);
+		btrfs_mark_buffer_dirty(leaf);
+
+		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+		BUG_ON(ret);
+		goto done;
+	} else if (split == start) {
+		if (locked_end < extent_end) {
+			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+			if (!ret) {
+				btrfs_release_path(root, path);
+				lock_extent(&BTRFS_I(inode)->io_tree,
+					locked_end, extent_end - 1, GFP_NOFS);
+				locked_end = extent_end;
+				goto again;
+			}
+			locked_end = extent_end;
+		}
+		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+		extent_offset += split - key.offset;
+	} else  {
+		BUG_ON(key.offset != start);
+		btrfs_set_file_extent_offset(leaf, fi, extent_offset +
+					     split - key.offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+		key.offset = split;
+		btrfs_set_item_key_safe(trans, root, path, &key);
+		extent_end = split;
+	}
+
+	if (extent_end == end) {
+		split_end = 0;
+		extent_type = BTRFS_FILE_EXTENT_REG;
+	}
+	if (extent_end == end && split == start) {
+		other_start = end;
+		other_end = 0;
+		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			path->slots[0]++;
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			key.offset = split;
+			btrfs_set_item_key_safe(trans, root, path, &key);
+			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							other_end - split);
+			goto done;
+		}
+	}
+	if (extent_end == end && split == end) {
+		other_start = 0;
+		other_end = start;
+		if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
+				     bytenr, &other_start, &other_end)) {
+			path->slots[0]--;
+			fi = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+							other_start);
+			goto done;
+		}
+	}
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+
+	key.offset = start;
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_type(leaf, fi, extent_type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, 0);
+	btrfs_set_file_extent_encryption(leaf, fi, 0);
+	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+
+	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+				   leaf->start, root->root_key.objectid,
+				   trans->transid, inode->i_ino);
+	BUG_ON(ret);
+done:
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+	if (split_end && split == start) {
+		split = end;
+		goto again;
+	}
+	if (locked_end > end) {
+		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+			      GFP_NOFS);
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 /*
  * this gets pages into the page cache and locks them down, it also properly
  * waits for data=ordered extents to finish before allowing the pages to be
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e6f0568fdb..789c376157f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
+#include <linux/falloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -587,7 +588,7 @@ free_pages_out:
  * blocks on disk
  */
 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started)
+			      u64 start, u64 end, int *page_started, int force)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -602,6 +603,7 @@ static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
 	u64 num_bytes;
 	int extent_type;
 	int ret;
+	int type;
 	int nocow;
 	int check_prev = 1;
 
@@ -654,7 +656,8 @@ next_slot:
 				    struct btrfs_file_extent_item);
 		extent_type = btrfs_file_extent_type(leaf, fi);
 
-		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			struct btrfs_block_group_cache *block_group;
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 			extent_end = found_key.offset +
@@ -669,6 +672,8 @@ next_slot:
 				goto out_check;
 			if (disk_bytenr == 0)
 				goto out_check;
+			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
+				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
 				goto out_check;
 			block_group = btrfs_lookup_block_group(root->fs_info,
@@ -709,10 +714,39 @@ out_check:
 
 		disk_bytenr += cur_offset - found_key.offset;
 		num_bytes = min(end + 1, extent_end) - cur_offset;
+		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			struct extent_map *em;
+			struct extent_map_tree *em_tree;
+			em_tree = &BTRFS_I(inode)->extent_tree;
+			em = alloc_extent_map(GFP_NOFS);
+			em->start = cur_offset;
+			em->len = num_bytes;
+			em->block_len = num_bytes;
+			em->block_start = disk_bytenr;
+			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			while (1) {
+				spin_lock(&em_tree->lock);
+				ret = add_extent_mapping(em_tree, em);
+				spin_unlock(&em_tree->lock);
+				if (ret != -EEXIST) {
+					free_extent_map(em);
+					break;
+				}
+				btrfs_drop_extent_cache(inode, em->start,
+						em->start + em->len - 1, 0);
+			}
+			type = BTRFS_ORDERED_PREALLOC;
+		} else {
+			type = BTRFS_ORDERED_NOCOW;
+		}
 
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
-					       num_bytes, num_bytes,
-					       BTRFS_ORDERED_NOCOW);
+					       num_bytes, num_bytes, type);
+		BUG_ON(ret);
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					cur_offset, cur_offset + num_bytes - 1,
+					locked_page, 0, 0, 0);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -745,7 +779,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started);
+					 page_started, 0);
+	else if (btrfs_test_flag(inode, PREALLOC))
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started, 1);
 	else
 		ret = cow_file_range(inode, locked_page, start, end,
 				     page_started);
@@ -1006,6 +1043,63 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	return -EAGAIN;
 }
 
+static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
+				       struct inode *inode, u64 file_pos,
+				       u64 disk_bytenr, u64 disk_num_bytes,
+				       u64 num_bytes, u64 ram_bytes,
+				       u8 compression, u8 encryption,
+				       u16 other_encoding, int extent_type)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key ins;
+	u64 hint;
+	int ret;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes, file_pos, &hint);
+	BUG_ON(ret);
+
+	ins.objectid = inode->i_ino;
+	ins.offset = file_pos;
+	ins.type = BTRFS_EXTENT_DATA_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
+	BUG_ON(ret);
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+	btrfs_set_file_extent_type(leaf, fi, extent_type);
+	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
+	btrfs_set_file_extent_offset(leaf, fi, 0);
+	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+	btrfs_set_file_extent_compression(leaf, fi, compression);
+	btrfs_set_file_extent_encryption(leaf, fi, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+	btrfs_mark_buffer_dirty(leaf);
+
+	inode_add_bytes(inode, num_bytes);
+	btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
+
+	ins.objectid = disk_bytenr;
+	ins.offset = disk_num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
+					  root->root_key.objectid,
+					  trans->transid, inode->i_ino, &ins);
+	BUG_ON(ret);
+
+	btrfs_free_path(path);
+	return 0;
+}
+
 /* as ordered data IO finishes, this gets called so we can finish
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
@@ -1016,12 +1110,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_file_extent_item *extent_item;
-	struct btrfs_path *path = NULL;
-	struct extent_buffer *leaf;
-	u64 alloc_hint = 0;
-	struct list_head list;
-	struct btrfs_key ins;
+	int compressed = 0;
 	int ret;
 
 	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
@@ -1035,67 +1124,30 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
 		goto nocow;
 
-	path = btrfs_alloc_path();
-	BUG_ON(!path);
-
 	lock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
 
-	INIT_LIST_HEAD(&list);
-
-	ret = btrfs_drop_extents(trans, root, inode,
-				 ordered_extent->file_offset,
-				 ordered_extent->file_offset +
-				 ordered_extent->len,
-				 ordered_extent->file_offset, &alloc_hint);
-	BUG_ON(ret);
-
-	ins.objectid = inode->i_ino;
-	ins.offset = ordered_extent->file_offset;
-	ins.type = BTRFS_EXTENT_DATA_KEY;
-	ret = btrfs_insert_empty_item(trans, root, path, &ins,
-				      sizeof(*extent_item));
-	BUG_ON(ret);
-	leaf = path->nodes[0];
-	extent_item = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_file_extent_item);
-	btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
-	btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
-	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
-					  ordered_extent->start);
-	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-					     ordered_extent->disk_len);
-	btrfs_set_file_extent_offset(leaf, extent_item, 0);
-
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
-		btrfs_set_file_extent_compression(leaf, extent_item, 1);
-	else
-		btrfs_set_file_extent_compression(leaf, extent_item, 0);
-	btrfs_set_file_extent_encryption(leaf, extent_item, 0);
-	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
-
-	/* ram bytes = extent_num_bytes for now */
-	btrfs_set_file_extent_num_bytes(leaf, extent_item,
-					ordered_extent->len);
-	btrfs_set_file_extent_ram_bytes(leaf, extent_item,
-					ordered_extent->len);
-	btrfs_mark_buffer_dirty(leaf);
-
-	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
-				ordered_extent->file_offset +
-				ordered_extent->len - 1, 0);
-
-	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->disk_len;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
-					  root->root_key.objectid,
-					  trans->transid, inode->i_ino, &ins);
-	BUG_ON(ret);
-	btrfs_release_path(root, path);
-
-	inode_add_bytes(inode, ordered_extent->len);
+		compressed = 1;
+	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+		BUG_ON(compressed);
+		ret = btrfs_mark_extent_written(trans, root, inode,
+						ordered_extent->file_offset,
+						ordered_extent->file_offset +
+						ordered_extent->len);
+		BUG_ON(ret);
+	} else {
+		ret = insert_reserved_file_extent(trans, inode,
+						ordered_extent->file_offset,
+						ordered_extent->start,
+						ordered_extent->disk_len,
+						ordered_extent->len,
+						ordered_extent->len,
+						compressed, 0, 0,
+						BTRFS_FILE_EXTENT_REG);
+		BUG_ON(ret);
+	}
 	unlock_extent(io_tree, ordered_extent->file_offset,
 		    ordered_extent->file_offset + ordered_extent->len - 1,
 		    GFP_NOFS);
@@ -1115,8 +1167,6 @@ nocow:
 	btrfs_put_ordered_extent(ordered_extent);
 
 	btrfs_end_transaction(trans, root);
-	if (path)
-		btrfs_free_path(path);
 	return 0;
 }
 
@@ -3488,7 +3538,8 @@ again:
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
 	compressed = btrfs_file_extent_compression(leaf, item);
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -3521,7 +3572,8 @@ again:
 		goto not_found_em;
 	}
 
-	if (found_type == BTRFS_FILE_EXTENT_REG) {
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
@@ -3538,6 +3590,8 @@ again:
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
 			em->block_len = em->len;
+			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -3969,6 +4023,7 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 	if (error)
 		return error;
 
+	atomic_inc(&inode->i_count);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -4318,6 +4373,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode->i_op = &btrfs_symlink_inode_operations;
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(inode, name_len - 1);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
@@ -4335,6 +4391,130 @@ out_fail:
 	return err;
 }
 
+/*
+ * Preallocate [start, end) of @inode as BTRFS_FILE_EXTENT_PREALLOC extents.
+ * Returns 0, or the btrfs_reserve_extent() error if a reservation fails.
+ */
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+			       u64 alloc_hint, int mode)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_key ins;
+	u64 alloc_size, cur_offset = start, num_bytes = end - start;
+	int ret = 0, err;
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
+
+	while (num_bytes > 0) {
+		alloc_size = min(num_bytes, root->fs_info->max_extent);
+		ret = btrfs_reserve_extent(trans, root, alloc_size,
+					   root->sectorsize, 0, alloc_hint,
+					   (u64)-1, &ins, 1);
+		if (WARN_ON(ret))
+			goto out;
+		err = insert_reserved_file_extent(trans, inode,
+						  cur_offset, ins.objectid,
+						  ins.offset, ins.offset,
+						  ins.offset, 0, 0, 0,
+						  BTRFS_FILE_EXTENT_PREALLOC);
+		BUG_ON(err);
+		num_bytes -= ins.offset;
+		cur_offset += ins.offset;
+		alloc_hint = ins.objectid + ins.offset;
+	}
+out:
+	if (cur_offset > start) {
+		inode->i_ctime = CURRENT_TIME;
+		btrfs_set_flag(inode, PREALLOC);
+		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+		    cur_offset > i_size_read(inode))
+			btrfs_i_size_write(inode, cur_offset);
+		/* separate status: must not clobber a reservation error */
+		err = btrfs_update_inode(trans, root, inode);
+		BUG_ON(err);
+	}
+	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+/* Preallocate each hole in the sector-aligned range; returns 0 or an error. */
+static long btrfs_fallocate(struct inode *inode, int mode,
+			    loff_t offset, loff_t len)
+{
+	u64 cur_offset;
+	u64 last_byte;
+	u64 alloc_start;
+	u64 alloc_end;
+	u64 alloc_hint = 0;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	struct extent_map *em;
+	int ret;
+
+	alloc_start = offset & ~mask;
+	alloc_end =  (offset + len + mask) & ~mask;
+	/* a range that starts past i_size goes through btrfs_cont_expand() */
+	mutex_lock(&inode->i_mutex);
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, alloc_start);
+		if (ret)
+			goto out;
+	}
+	/* lock the range, retrying until no ordered extent overlaps it */
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
+			    alloc_end - 1, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > alloc_start &&
+		    ordered->file_offset < alloc_end) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent(&BTRFS_I(inode)->io_tree,
+				      alloc_start, alloc_end - 1, GFP_NOFS);
+			btrfs_wait_ordered_range(inode, alloc_start,
+						 alloc_end - alloc_start);
+		} else {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+	}
+	/* walk the extent maps and preallocate each hole in the range */
+	cur_offset = alloc_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		BUG_ON(IS_ERR(em) || !em);
+		last_byte = min(extent_map_end(em), alloc_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (em->block_start == EXTENT_MAP_HOLE) {
+			ret = prealloc_file_range(inode, cur_offset,
+					last_byte, alloc_hint, mode);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+		}
+		if (em->block_start <= EXTENT_MAP_LAST_BYTE)
+			alloc_hint = em->block_start;
+		free_extent_map(em);
+		cur_offset = last_byte;
+		if (cur_offset >= alloc_end) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
+		      GFP_NOFS);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 static int btrfs_set_page_dirty(struct page *page)
 {
 	return __set_page_dirty_nobuffers(page);
@@ -4421,6 +4601,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.listxattr      = btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
+	.fallocate	= btrfs_fallocate,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7f915d47839..9ff2b4e0e92 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -724,7 +724,8 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
 								       extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e7317c8fda2..370bb428559 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -182,7 +182,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->len = len;
 	entry->disk_len = disk_len;
 	entry->inode = inode;
-	if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_COMPRESSED)
+	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
 	/* one ref for the tree */
@@ -339,7 +339,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 		ordered = list_entry(cur, struct btrfs_ordered_extent,
 				     root_extent_list);
 		if (nocow_only &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
 			list_move(&ordered->root_extent_list,
 				  &root->fs_info->ordered_extents);
 			cond_resched_lock(&root->fs_info->ordered_extent_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e6d9bc54c2b..260bf95dfe0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,6 +68,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
 
+#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -132,7 +134,7 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, u64 disk_len, int type);
+			     u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e0201c3a7dc..be4fc30a30e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -442,7 +442,8 @@ insert:
 
 		fi = (struct btrfs_file_extent_item *)dst_ptr;
 		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-		if (extent_type == BTRFS_FILE_EXTENT_REG) {
+		if (extent_type == BTRFS_FILE_EXTENT_REG ||
+		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			struct btrfs_key ins;
 			ins.objectid = btrfs_file_extent_disk_bytenr(
 							path->nodes[0], fi);
@@ -538,7 +539,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 	found_type = btrfs_file_extent_type(eb, item);
 
-	if (found_type == BTRFS_FILE_EXTENT_REG)
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size = btrfs_file_extent_inline_len(eb, item);
@@ -562,7 +564,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
 				       start, 0);
 
-	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
+	if (ret == 0 &&
+	    (found_type == BTRFS_FILE_EXTENT_REG ||
+	     found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 		struct btrfs_file_extent_item cmp1;
 		struct btrfs_file_extent_item cmp2;
 		struct btrfs_file_extent_item *existing;
@@ -2522,7 +2526,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						struct btrfs_file_extent_item);
 
 			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG) {
+			if (found_type == BTRFS_FILE_EXTENT_REG ||
+			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 				u64 ds = btrfs_file_extent_disk_bytenr(src,
 								   extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-- 
cgit v1.2.3


From 70b99e6959a4c28ae1b314985eca731f3db72f1d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 31 Oct 2008 12:46:39 -0400
Subject: Btrfs: Compression corner fixes

Make sure we keep page->mapping NULL on the pages we're getting
via alloc_page.  It gets set so a few of the callbacks can do the right
thing, but in general these pages don't have a mapping.

Don't try to truncate compressed inline items in btrfs_drop_extents.
The whole compressed item must be preserved.

Don't try to create multipage inline compressed items.  When we try to
overwrite just the first page of the file, we would have to read in and recow
all the pages after it in the same compressed inline items.  For now, only
create single page inline items.

Make sure we lock pages in the correct order during delalloc.  The
search into the state tree for delalloc bytes can return bytes before
the page we already have locked.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c |  4 ++++
 fs/btrfs/extent_io.c   | 11 ++++++++++-
 fs/btrfs/file.c        | 21 +++++++++++++++------
 fs/btrfs/inode.c       |  8 ++++++--
 4 files changed, 35 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9adaa79adad..354913177ba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -220,10 +220,12 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 	 */
 	inode = cb->inode;
 	tree = &BTRFS_I(inode)->io_tree;
+	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
 	tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
 					 cb->start,
 					 cb->start + cb->len - 1,
 					 NULL, 1);
+	cb->compressed_pages[0]->mapping = NULL;
 
 	end_compressed_writeback(inode, cb->start, cb->len);
 	/* note, our inode could be gone now */
@@ -306,6 +308,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		else
 			ret = 0;
 
+		page->mapping = NULL;
 		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
 		    PAGE_CACHE_SIZE) {
 			bio_get(bio);
@@ -423,6 +426,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		else
 			ret = 0;
 
+		page->mapping = NULL;
 		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eb3c12e7bea..9b37ce6e516 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1242,12 +1242,21 @@ again:
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
 				    max_bytes);
-	if (!found) {
+	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
 		return found;
 	}
 
+	/*
+	 * start comes from the offset of locked_page.  We have to lock
+	 * pages in order, so we can't process delalloc bytes before
+	 * locked_page
+	 */
+	if (delalloc_start < *start) {
+		delalloc_start = *start;
+	}
+
 	/*
 	 * make sure to limit the number of pages we try to lock down
 	 * if we're looping.
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 238a8e215eb..0c8cc35a8b9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,8 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
-	u8 compression = 0;
-	u8 encryption = 0;
+	u8 compression;
+	u8 encryption;
 	u16 other_encoding = 0;
 	u64 root_gen;
 	u64 root_owner;
@@ -415,6 +415,8 @@ next_slot:
 		leaf_start = 0;
 		root_gen = 0;
 		root_owner = 0;
+		compression = 0;
+		encryption = 0;
 		extent = NULL;
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -546,8 +548,12 @@ next_slot:
 						   inline_limit - key.offset);
 				inode_sub_bytes(inode, extent_end -
 						inline_limit);
-				btrfs_truncate_item(trans, root, path,
-						    new_size, 1);
+				btrfs_set_file_extent_ram_bytes(leaf, extent,
+							new_size);
+				if (!compression && !encryption) {
+					btrfs_truncate_item(trans, root, path,
+							    new_size, 1);
+				}
 			}
 		}
 		/* delete the entire extent */
@@ -567,8 +573,11 @@ next_slot:
 			new_size = btrfs_file_extent_calc_inline_size(
 						   extent_end - end);
 			inode_sub_bytes(inode, end - key.offset);
-			ret = btrfs_truncate_item(trans, root, path,
-						  new_size, 0);
+			btrfs_set_file_extent_ram_bytes(leaf, extent,
+							new_size);
+			if (!compression && !encryption)
+				ret = btrfs_truncate_item(trans, root, path,
+							  new_size, 0);
 			BUG_ON(ret);
 		}
 		/* create bookend, splitting the extent in two */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 789c376157f..806708dd7e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -239,6 +239,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		data_len = compressed_size;
 
 	if (start > 0 ||
+	    actual_end >= PAGE_CACHE_SIZE ||
 	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 	    (!compressed_size &&
 	    (actual_end & (root->sectorsize - 1)) == 0) ||
@@ -248,7 +249,7 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_drop_extents(trans, root, inode, start,
-				 aligned_end, aligned_end, &hint_byte);
+				 aligned_end, start, &hint_byte);
 	BUG_ON(ret);
 
 	if (isize > actual_end)
@@ -423,6 +424,7 @@ again:
 		 * free any pages it allocated and our page pointer array
 		 */
 		for (i = 0; i < nr_pages_ret; i++) {
+			WARN_ON(pages[i]->mapping);
 			page_cache_release(pages[i]);
 		}
 		kfree(pages);
@@ -572,8 +574,10 @@ free_pages_out_fail:
 	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				     start, end, locked_page, 0, 0, 0);
 free_pages_out:
-	for (i = 0; i < nr_pages_ret; i++)
+	for (i = 0; i < nr_pages_ret; i++) {
+		WARN_ON(pages[i]->mapping);
 		page_cache_release(pages[i]);
+	}
 	if (pages)
 		kfree(pages);
 
-- 
cgit v1.2.3


From 537fb0671549a9a6457ce42a25ab34b29d97a256 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 31 Oct 2008 12:54:14 -0400
Subject: Btrfs: rev the disk format for fallocate

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d5ba3d1aaf9..689df070c8e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,7 +39,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BCRfS_M"
+#define BTRFS_MAGIC "_BDRfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 59e315b4c410b00a9acd0f24a00dbadbe81ce692 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 6 Dec 2008 16:58:39 -0500
Subject: ext3/4: Fix loop index in do_split() so it is signed

This fixes a gcc warning, but it does not appear to be able to cause a
failure: the loop is normally exited via the first conditional in the
for loop, and, at least for a consistent filesystem, the signed/unsigned
mismatch should never be exposed in practice.

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext3/namei.c | 4 ++--
 fs/ext4/namei.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 287b304d42a..2c2d700c1cc 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1161,9 +1161,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split, move, size, i;
+	unsigned split, move, size;
 	struct ext3_dir_entry_2 *de = NULL, *de2;
-	int	err = 0;
+	int	err = 0, i;
 
 	bh2 = ext3_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 315858db807..84a68ae623c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1171,9 +1171,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 	u32 hash2;
 	struct dx_map_entry *map;
 	char *data1 = (*bh)->b_data, *data2;
-	unsigned split, move, size, i;
+	unsigned split, move, size;
 	struct ext4_dir_entry_2 *de = NULL, *de2;
-	int	err = 0;
+	int	err = 0, i;
 
 	bh2 = ext4_append (handle, dir, &newblock, &err);
 	if (!(bh2)) {
-- 
cgit v1.2.3


From 815a1130687ffac2c3e91513ce64aab629d6a54d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 1 Jan 2009 23:59:43 -0500
Subject: ext4: remove ext4_new_blocks() and call ext4_mb_new_blocks() directly

There was only one caller of the compatibility function
ext4_new_blocks(), in inode.c's ext4_alloc_blocks().  Change it to
call ext4_mb_new_blocks() directly, and remove ext4_new_blocks()
altogether.  This cleans up the code, by removing two extra functions
from the call chain, and hopefully saving some stack usage.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c | 20 --------------------
 fs/ext4/ext4.h   |  3 ---
 fs/ext4/inode.c  | 18 +++++++++++++-----
 3 files changed, 13 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 152c390f3c3..10ce275ebbf 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -741,26 +741,6 @@ ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
 	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
 }
 
-/*
- * ext4_new_blocks() -- allocate data blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @count:		total number of blocks need
- * @errp:               error code
- *
- * Return 1st allocated block numberon success, *count stores total account
- * error stores in errp pointer
- */
-
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, ext4_fsblk_t goal,
-				unsigned long *count, int *errp)
-{
-	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
-}
-
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:		superblock
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8370ffd2d62..74cb395e689 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1002,9 +1002,6 @@ extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, int *errp);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
-					ext4_lblk_t iblock, ext4_fsblk_t goal,
-					unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5b088121686..5120243024f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -547,6 +547,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 				int indirect_blks, int blks,
 				ext4_fsblk_t new_blocks[4], int *err)
 {
+	struct ext4_allocation_request ar;
 	int target, i;
 	unsigned long count = 0, blk_allocated = 0;
 	int index = 0;
@@ -595,10 +596,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 	if (!target)
 		goto allocated;
 	/* Now allocate data blocks */
-	count = target;
-	/* allocating blocks for data blocks */
-	current_block = ext4_new_blocks(handle, inode, iblock,
-						goal, &count, err);
+	memset(&ar, 0, sizeof(ar));
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = target;
+	ar.logical = iblock;
+	if (S_ISREG(inode->i_mode))
+		/* enable in-core preallocation only for regular files */
+		ar.flags = EXT4_MB_HINT_DATA;
+
+	current_block = ext4_mb_new_blocks(handle, &ar, err);
+
 	if (*err && (target == blks)) {
 		/*
 		 * if the allocation failed and we didn't allocate
@@ -614,7 +622,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
 		 */
 			new_blocks[index] = current_block;
 		}
-		blk_allocated += count;
+		blk_allocated += ar.len;
 	}
 allocated:
 	/* total number of blocks allocated for direct blocks */
-- 
cgit v1.2.3


From cfe82c856747b7841a3a00d591ce9ed46f579d27 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 7 Dec 2008 14:10:54 -0500
Subject: ext4: remove ext4_new_meta_block()

There were only two callers of the function ext4_new_meta_block(),
which is just a very simple wrapper function around
ext4_new_meta_blocks().  Change those two functions to call
ext4_new_meta_blocks() directly, to save code and stack space usage.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 17 -----------------
 fs/ext4/ext4.h    |  2 --
 fs/ext4/extents.c |  3 ++-
 fs/ext4/xattr.c   |  5 +++--
 4 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 10ce275ebbf..98a97129fc5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -724,23 +724,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	return ret;
 }
 
-/*
- * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
- *
- * @handle:             handle to this transaction
- * @inode:              file inode
- * @goal:               given target block(filesystem wide)
- * @errp:               error code
- *
- * Return allocated block number on success
- */
-ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-		ext4_fsblk_t goal, int *errp)
-{
-	unsigned long count = 1;
-	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
-}
-
 /**
  * ext4_count_free_blocks() -- count filesystem free blocks
  * @sb:		superblock
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 74cb395e689..ac8551e0b70 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -998,8 +998,6 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
 			ext4_group_t group);
-extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp);
 extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp);
 extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae6..e5b169b44b4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -189,9 +189,10 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_extent *ex, int *err)
 {
 	ext4_fsblk_t goal, newblock;
+	unsigned long count = 1;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_block(handle, inode, goal, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, &count, err);
 	return newblock;
 }
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 80626d516fe..f896e2c452f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -689,6 +689,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		     struct ext4_xattr_info *i,
 		     struct ext4_xattr_block_find *bs)
 {
+	unsigned long count = 1;
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
 	struct ext4_xattr_search *s = &bs->s;
@@ -810,8 +811,8 @@ inserted:
 			/* We need to allocate a new block */
 			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
-			ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
-							goal, &error);
+			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
+						  goal, &count, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
-- 
cgit v1.2.3


From 97df5d155dee478efe33b001f502e9630e1bba92 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 12 Dec 2008 12:41:28 -0500
Subject: ext4: remove do_blk_alloc()

The convenience function do_blk_alloc() is a static function with only
one caller, so fold it into ext4_new_meta_blocks() to simplify the
code and to make it easier to understand.

To save more stack space, if count is a null pointer in
ext4_new_meta_blocks() assume that caller wanted a single block (and
if there is an error, no blocks were allocated).

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 49 +++++++++++++++----------------------------------
 fs/ext4/extents.c |  3 +--
 fs/ext4/xattr.c   |  3 +--
 3 files changed, 17 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 98a97129fc5..35f5f9a2772 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -666,59 +666,40 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
 
-#define EXT4_META_BLOCK 0x1
-
-static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, ext4_fsblk_t goal,
-				unsigned long *count, int *errp, int flags)
-{
-	struct ext4_allocation_request ar;
-	ext4_fsblk_t ret;
-
-	memset(&ar, 0, sizeof(ar));
-	/* Fill with neighbour allocated blocks */
-
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = *count;
-	ar.logical = iblock;
-
-	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
-		/* enable in-core preallocation for data block allocation */
-		ar.flags = EXT4_MB_HINT_DATA;
-	else
-		/* disable in-core preallocation for non-regular files */
-		ar.flags = 0;
-
-	ret = ext4_mb_new_blocks(handle, &ar, errp);
-	*count = ar.len;
-	return ret;
-}
-
 /*
  * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
  *
  * @handle:             handle to this transaction
  * @inode:              file inode
  * @goal:               given target block(filesystem wide)
- * @count:		total number of blocks need
+ * @count:		pointer to total number of blocks needed
  * @errp:               error code
  *
- * Return 1st allocated block numberon success, *count stores total account
+ * Return 1st allocated block number on success, *count stores total account
  * error stores in errp pointer
  */
 ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 		ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
+	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
-	ret = do_blk_alloc(handle, inode, 0, goal,
-				count, errp, EXT4_META_BLOCK);
+
+	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+	ar.inode = inode;
+	ar.goal = goal;
+	ar.len = count ? *count : 1;
+
+	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	if (count)
+		*count = ar.len;
+
 	/*
 	 * Account for the allocated meta blocks
 	 */
 	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-		EXT4_I(inode)->i_allocated_meta_blocks += *count;
+		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
 	return ret;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e5b169b44b4..59401d057c6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -189,10 +189,9 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
 			struct ext4_extent *ex, int *err)
 {
 	ext4_fsblk_t goal, newblock;
-	unsigned long count = 1;
 
 	goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
-	newblock = ext4_new_meta_blocks(handle, inode, goal, &count, err);
+	newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
 	return newblock;
 }
 
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index f896e2c452f..9b4a368c572 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -689,7 +689,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		     struct ext4_xattr_info *i,
 		     struct ext4_xattr_block_find *bs)
 {
-	unsigned long count = 1;
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
 	struct ext4_xattr_search *s = &bs->s;
@@ -812,7 +811,7 @@ inserted:
 			ext4_fsblk_t goal = ext4_group_first_block_no(sb,
 						EXT4_I(inode)->i_block_group);
 			ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
-						  goal, &count, &error);
+						  goal, NULL, &error);
 			if (error)
 				goto cleanup;
 			ea_idebug(inode, "creating block %d", block);
-- 
cgit v1.2.3


From 2a21e37e48b94388f2cc8c0392f104f5443d4bb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 5 Nov 2008 09:22:24 -0500
Subject: ext4: tone down ext4_da_writepages warnings

If the filesystem has errors, ext4_da_writepages() will return a *lot*
of errors, including lots and lots of stack dumps.  While it's true
that we are dropping user data on the floor, which is unfortunate, the
stack dumps aren't helpful, and they tend to obscure the true original
root cause of the problem.  So in the case where the filesystem has
aborted, return an EROFS right away.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5120243024f..ac97348f85b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2397,6 +2397,20 @@ static int ext4_da_writepages(struct address_space *mapping,
 	 */
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
+
+	/*
+	 * If the filesystem has aborted, it is read-only, so return
+	 * right away instead of dumping stack traces later on that
+	 * will obscure the real source of the problem.  We test
+	 * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
+	 * the latter could be true if the filesystem is mounted
+	 * read-only, and in that case, ext4_da_writepages should
+	 * *never* be called, so if that ever happens, we would want
+	 * the stack trace.
+	 */
+	if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
+		return -EROFS;
+
 	/*
 	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
 	 * This make sure small files blocks are allocated in
@@ -2441,7 +2455,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			printk(KERN_EMERG "%s: jbd2_start: "
+			printk(KERN_CRIT "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d\n", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			dump_stack();
-- 
cgit v1.2.3


From 791b7f08954869d7b8ff438f3dac3cfb39778297 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:50:43 -0500
Subject: ext4: Fix the delalloc writepages to allocate blocks at the right
 offset.

When iterating through the pages which have mapped buffer_heads, we
failed to update the b_state value. This results in allocating blocks
at logical offset 0.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/inode.c | 56 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ac97348f85b..c77a7ac753f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1653,35 +1653,39 @@ struct mpage_da_data {
  */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
-	struct address_space *mapping = mpd->inode->i_mapping;
-	int ret = 0, err, nr_pages, i;
-	unsigned long index, end;
-	struct pagevec pvec;
 	long pages_skipped;
+	struct pagevec pvec;
+	unsigned long index, end;
+	int ret = 0, err, nr_pages, i;
+	struct inode *inode = mpd->inode;
+	struct address_space *mapping = inode->i_mapping;
 
 	BUG_ON(mpd->next_page <= mpd->first_page);
-	pagevec_init(&pvec, 0);
+	/*
+	 * We need to start from the first_page to the next_page - 1
+	 * to make sure we also write the mapped dirty buffer_heads.
+	 * If we look at mpd->lbh.b_blocknr we would only be looking
+	 * at the currently mapped buffer_heads.
+	 */
 	index = mpd->first_page;
 	end = mpd->next_page - 1;
 
+	pagevec_init(&pvec, 0);
 	while (index <= end) {
-		/*
-		 * We can use PAGECACHE_TAG_DIRTY lookup here because
-		 * even though we have cleared the dirty flag on the page
-		 * We still keep the page in the radix tree with tag
-		 * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
-		 * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
-		 * which is called via the below writepage callback.
-		 */
-		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					PAGECACHE_TAG_DIRTY,
-					min(end - index,
-					(pgoff_t)PAGEVEC_SIZE-1) + 1);
+		nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 		if (nr_pages == 0)
 			break;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			index = page->index;
+			if (index > end)
+				break;
+			index++;
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(PageWriteback(page));
+
 			pages_skipped = mpd->wbc->pages_skipped;
 			err = mapping->a_ops->writepage(page, mpd->wbc);
 			if (!err && (pages_skipped == mpd->wbc->pages_skipped))
@@ -2095,11 +2099,29 @@ static int __mpage_da_writepage(struct page *page,
 		bh = head;
 		do {
 			BUG_ON(buffer_locked(bh));
+			/*
+			 * We need to try to allocate
+			 * unmapped blocks in the same page.
+			 * Otherwise we won't make progress
+			 * with the page in ext4_da_writepage
+			 */
 			if (buffer_dirty(bh) &&
 				(!buffer_mapped(bh) || buffer_delay(bh))) {
 				mpage_add_bh_to_extent(mpd, logical, bh);
 				if (mpd->io_done)
 					return MPAGE_DA_EXTENT_TAIL;
+			} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
+				/*
+				 * mapped dirty buffer. We need to update
+				 * the b_state because we look at
+				 * b_state in mpage_da_map_blocks. We don't
+				 * update b_size because if we find an
+				 * unmapped buffer_head later we need to
+				 * use the b_state flag of that buffer_head.
+				 */
+				if (mpd->lbh.b_size == 0)
+					mpd->lbh.b_state =
+						bh->b_state & BH_FLAGS;
 			}
 			logical++;
 		} while ((bh = bh->b_this_page) != head);
-- 
cgit v1.2.3


From 565a9617b2151e21b22700e97a8b04e70e103153 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:51:07 -0500
Subject: ext4: avoid ext4_error when mounting a fs with a single bg

Remove some completely unneeded code which caused an ext4_error
to be generated when mounting a file system with only a single block
group.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/super.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 08fc86a358d..81aed8b825a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1445,7 +1445,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
-	__u64 block_bitmap = 0;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1468,9 +1467,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 		goto failed;
 	}
 
-	gdp = ext4_get_group_desc(sb, 1, &bh);
-	block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
-
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		gdp = ext4_get_group_desc(sb, i, &bh);
 
-- 
cgit v1.2.3


From 25f1ee3aba17584ba4810da892175acab7fff9c8 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Tue, 25 Nov 2008 17:24:23 -0500
Subject: ext4: fix build warning

Replace `if' with `goto' to assure gcc that ix has been initialized.

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
---
 fs/ext4/extents.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 59401d057c6..0917be51f10 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1160,15 +1160,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	while (--depth >= 0) {
 		ix = path[depth].p_idx;
 		if (ix != EXT_LAST_INDEX(path[depth].p_hdr))
-			break;
+			goto got_index;
 	}
 
-	if (depth < 0) {
-		/* we've gone up to the root and
-		 * found no index to the right */
-		return 0;
-	}
+	/* we've gone up to the root and found no index to the right */
+	return 0;
 
+got_index:
 	/* we've found index to the right, let's
 	 * follow it and find the closest allocated
 	 * block to the right */
@@ -1201,7 +1199,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	*phys = ext_pblock(ex);
 	put_bh(bh);
 	return 0;
-
 }
 
 /*
-- 
cgit v1.2.3


From 23475e264c4f5c8b635a31924851287ead1ebe32 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Wed, 26 Nov 2008 02:23:19 -0500
Subject: ext4: Use simple_strtoul() instead of simple_strtol() in
 ext4_ui_proc_write

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 81aed8b825a..8a0ae883f56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3521,18 +3521,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file)
 static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
 			       size_t cnt, loff_t *ppos)
 {
-	unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+	unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
 	char str[32];
-	unsigned long value;
 
 	if (cnt >= sizeof(str))
 		return -EINVAL;
 	if (copy_from_user(str, buf, cnt))
 		return -EFAULT;
-	value = simple_strtol(str, NULL, 0);
-	if (value < 0)
-		return -ERANGE;
-	*p = value;
+
+	*p = simple_strtoul(str, NULL, 0);
 	return cnt;
 }
 
-- 
cgit v1.2.3


From 93c0d86371a5b2e68473752a6e54ff03185c473e Mon Sep 17 00:00:00 2001
From: "Solofo.Ramangalahy@bull.net" <>
Date: Wed, 26 Nov 2008 23:44:10 -0500
Subject: ext4: When resizing set the EXT4_BG_INODE_ZEROED flag for new block
 groups

The inode table has been zeroed in setup_new_group_blocks().  Mark it as
such in ext4_group_add().  Since we are currently clearing inode table
for the new block group, we should set the EXT4_BG_INODE_ZEROED flag.
If at some point in the future we don't immediately zero out the inode
table as part of the resize operation, then obviously we shouldn't do
this.

Signed-off-by: Solofo.Ramangalahy@bull.net
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6ec1843a01..d448eb1d9ba 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -864,6 +864,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
 	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
 	gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
 	/*
-- 
cgit v1.2.3


From ff7ef329b268b603ea4a2303241ef1c3829fd574 Mon Sep 17 00:00:00 2001
From: Yasunori Goto <y-goto@jp.fujitsu.com>
Date: Wed, 17 Dec 2008 00:48:39 -0500
Subject: ext4: Widen type of ext4_sb_info.s_mb_maxs[]

I tracked down the cause of the following ext4 oops report, which came
from testing on an ia64 box.

http://bugzilla.kernel.org/show_bug.cgi?id=12018

The cause is the element type of the s_mb_maxs array, which is defined
as "unsigned short" in the ext4_sb_info structure.  If the file system's
block size is 8k or greater, an unsigned short is not wide enough to
contain the value fs->blocksize << 3.

Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: stable@kernel.org
---
 fs/ext4/ext4_sb.h | 3 ++-
 fs/ext4/mballoc.c | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index ad7ea09baa7..3db800f399a 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -102,7 +102,8 @@ struct ext4_sb_info {
 	spinlock_t s_reserve_lock;
 	spinlock_t s_md_lock;
 	tid_t s_last_transaction;
-	unsigned short *s_mb_offsets, *s_mb_maxs;
+	unsigned short *s_mb_offsets;
+	unsigned int *s_mb_maxs;
 
 	/* tunables */
 	unsigned long s_stripe;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 444ad998f72..7beab7141dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2493,6 +2493,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	if (sbi->s_mb_offsets == NULL) {
 		return -ENOMEM;
 	}
+
+	i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
 		kfree(sbi->s_mb_maxs);
-- 
cgit v1.2.3


From 0390131ba84fd3f726f9e24fc4553828125700bb Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Wed, 7 Jan 2009 00:06:22 -0500
Subject: ext4: Allow ext4 to run without a journal

A few weeks ago I posted a patch for discussion that allowed ext4 to run
without a journal.  Since that time I've integrated the excellent
comments from Andreas and fixed several serious bugs.  We're currently
running with this patch and generating some performance numbers against
both ext2 (with backported reservations code) and ext4 with and without
a journal.  It just so happens that running without a journal is
slightly faster for most everything.

We did
	iozone -T -t 4 -s 2g -r 256k -T -I -i0 -i1 -i2

which creates 4 threads, each of which creates and does reads and writes on
a 2G file, with a buffer size of 256K, using O_DIRECT for all file opens
to bypass the page cache.  Results:

                     ext2        ext4, default   ext4, no journal
  initial writes   13.0 MB/s        15.4 MB/s          15.7 MB/s
  rewrites         13.1 MB/s        15.6 MB/s          15.9 MB/s
  reads            15.2 MB/s        16.9 MB/s          17.2 MB/s
  re-reads         15.3 MB/s        16.9 MB/s          17.2 MB/s
  random readers    5.6 MB/s         5.6 MB/s           5.7 MB/s
  random writers    5.1 MB/s         5.3 MB/s           5.4 MB/s

So it seems that, so far, this was a useful exercise.

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c    |   4 +-
 fs/ext4/ext4_jbd2.c |  83 ++++++++++++++++-----
 fs/ext4/ext4_jbd2.h |  83 +++++++++++++++++----
 fs/ext4/extents.c   |  12 +--
 fs/ext4/ialloc.c    |  25 ++++---
 fs/ext4/inode.c     | 130 ++++++++++++++++++++++-----------
 fs/ext4/ioctl.c     |   2 +-
 fs/ext4/mballoc.c   |  17 +++--
 fs/ext4/migrate.c   |   5 +-
 fs/ext4/namei.c     |  56 +++++++-------
 fs/ext4/resize.c    |  31 ++++----
 fs/ext4/super.c     | 207 +++++++++++++++++++++++++++++++++++-----------------
 fs/ext4/xattr.c     |  21 +++---
 13 files changed, 452 insertions(+), 224 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 35f5f9a2772..31ebeb5e7b0 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -531,11 +531,11 @@ do_more:
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_journal_dirty_metadata(handle, gd_bh);
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
 	if (!err) err = ret;
 	*pdquot_freed_blocks += group_freed;
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index c75384b34f2..ad13a84644e 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -7,53 +7,96 @@
 int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_undo_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_undo_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_get_write_access(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_write_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_write_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_forget(const char *where, handle_t *handle,
 				struct buffer_head *bh)
 {
-	int err = jbd2_journal_forget(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_forget(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_revoke(const char *where, handle_t *handle,
 				ext4_fsblk_t blocknr, struct buffer_head *bh)
 {
-	int err = jbd2_journal_revoke(handle, blocknr, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_revoke(handle, blocknr, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
 int __ext4_journal_get_create_access(const char *where,
 				handle_t *handle, struct buffer_head *bh)
 {
-	int err = jbd2_journal_get_create_access(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_get_create_access(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	}
 	return err;
 }
 
-int __ext4_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh)
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+				 struct inode *inode, struct buffer_head *bh)
 {
-	int err = jbd2_journal_dirty_metadata(handle, bh);
-	if (err)
-		ext4_journal_abort_handle(where, __func__, bh, handle, err);
+	int err = 0;
+
+	if (ext4_handle_valid(handle)) {
+		err = jbd2_journal_dirty_metadata(handle, bh);
+		if (err)
+			ext4_journal_abort_handle(where, __func__, bh,
+						  handle, err);
+	} else {
+		mark_buffer_dirty(bh);
+		if (inode && inode_needs_sync(inode)) {
+			sync_dirty_buffer(bh);
+			if (buffer_req(bh) && !buffer_uptodate(bh)) {
+				ext4_error(inode->i_sb, __func__,
+					   "IO error syncing inode, "
+					   "inode=%lu, block=%llu",
+					   inode->i_ino,
+					   (unsigned long long) bh->b_blocknr);
+				err = -EIO;
+			}
+		}
+	}
 	return err;
 }
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b455c685a98..663197adae5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
  * been done yet.
  */
 
-static inline void ext4_journal_release_buffer(handle_t *handle,
-						struct buffer_head *bh)
-{
-	jbd2_journal_release_buffer(handle, bh);
-}
-
 void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 		struct buffer_head *bh, handle_t *handle, int err);
 
@@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
 int __ext4_journal_get_create_access(const char *where,
 				handle_t *handle, struct buffer_head *bh);
 
-int __ext4_journal_dirty_metadata(const char *where,
-				handle_t *handle, struct buffer_head *bh);
+int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
+				 struct inode *inode, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
 	__ext4_journal_get_undo_access(__func__, (handle), (bh))
@@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where,
 	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
 	__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_dirty_metadata(handle, bh) \
-	__ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
 	__ext4_journal_forget(__func__, (handle), (bh))
+#define ext4_handle_dirty_metadata(handle, inode, bh) \
+	__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
 
+#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)
+
+static inline int ext4_handle_valid(handle_t *handle)
+{
+	if (handle == EXT4_NOJOURNAL_HANDLE)
+		return 0;
+	return 1;
+}
+
+static inline void ext4_handle_sync(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		handle->h_sync = 1;
+}
+
+static inline void ext4_handle_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_release_buffer(handle, bh);
+}
+
+static inline int ext4_handle_is_aborted(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		return is_handle_aborted(handle);
+	return 0;
+}
+
+static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
+{
+	if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
+		return 0;
+	return 1;
+}
+
+static inline void ext4_journal_release_buffer(handle_t *handle,
+						struct buffer_head *bh)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_release_buffer(handle, bh);
+}
+
 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 {
 	return ext4_journal_start_sb(inode->i_sb, nblocks);
@@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void)
 
 static inline int ext4_journal_extend(handle_t *handle, int nblocks)
 {
-	return jbd2_journal_extend(handle, nblocks);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_extend(handle, nblocks);
+	return 0;
 }
 
 static inline int ext4_journal_restart(handle_t *handle, int nblocks)
 {
-	return jbd2_journal_restart(handle, nblocks);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_restart(handle, nblocks);
+	return 0;
 }
 
 static inline int ext4_journal_blocks_per_page(struct inode *inode)
 {
-	return jbd2_journal_blocks_per_page(inode);
+	if (EXT4_JOURNAL(inode) != NULL)
+		return jbd2_journal_blocks_per_page(inode);
+	return 0;
 }
 
 static inline int ext4_journal_force_commit(journal_t *journal)
 {
-	return jbd2_journal_force_commit(journal);
+	if (journal)
+		return jbd2_journal_force_commit(journal);
+	return 0;
 }
 
 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
 {
-	return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+	if (ext4_handle_valid(handle))
+		return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+	return 0;
 }
 
 /* super.c */
@@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb);
 
 static inline int ext4_should_journal_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 1;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
@@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode)
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
@@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode)
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
+	if (EXT4_JOURNAL(inode) == NULL)
+		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
 	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0917be51f10..743e3feb3e5 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
 {
 	int err;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
 	if (handle->h_buffer_credits > needed)
 		return 0;
 	err = ext4_journal_extend(handle, needed);
@@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode,
 	int err;
 	if (path->p_bh) {
 		/* path points to block */
-		err = ext4_journal_dirty_metadata(handle, path->p_bh);
+		err = ext4_handle_dirty_metadata(handle, inode, path->p_bh);
 	} else {
 		/* path points to leaf/index in inode body */
 		err = ext4_mark_inode_dirty(handle, inode);
@@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	err = ext4_journal_dirty_metadata(handle, bh);
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (err)
 		goto cleanup;
 	brelse(bh);
@@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 
-		err = ext4_journal_dirty_metadata(handle, bh);
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (err)
 			goto cleanup;
 		brelse(bh);
@@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 
-	err = ext4_journal_dirty_metadata(handle, bh);
+	err = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (err)
 		goto out;
 
@@ -2947,7 +2949,7 @@ void ext4_ext_truncate(struct inode *inode)
 	 * transaction synchronous.
 	 */
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 out_stop:
 	up_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 6e6052879aa..9dd21b75f4b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -253,12 +253,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 				spin_unlock(sb_bgl_lock(sbi, flex_group));
 			}
 		}
-		BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, bh2);
+		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bh2);
 		if (!fatal) fatal = err;
 	}
-	BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 	if (!fatal)
 		fatal = err;
 	sb->s_dirt = 1;
@@ -656,15 +656,16 @@ repeat_in_this_group:
 						ino, bitmap_bh->b_data)) {
 				/* we won it */
 				BUFFER_TRACE(bitmap_bh,
-					"call ext4_journal_dirty_metadata");
-				err = ext4_journal_dirty_metadata(handle,
+					"call ext4_handle_dirty_metadata");
+				err = ext4_handle_dirty_metadata(handle,
+								inode,
 								bitmap_bh);
 				if (err)
 					goto fail;
 				goto got;
 			}
 			/* we lost it */
-			jbd2_journal_release_buffer(handle, bitmap_bh);
+			ext4_handle_release_buffer(handle, bitmap_bh);
 
 			if (++ino < EXT4_INODES_PER_GROUP(sb))
 				goto repeat_in_this_group;
@@ -726,7 +727,8 @@ got:
 		/* Don't need to dirty bitmap block if we didn't change it */
 		if (free) {
 			BUFFER_TRACE(block_bh, "dirty block bitmap");
-			err = ext4_journal_dirty_metadata(handle, block_bh);
+			err = ext4_handle_dirty_metadata(handle,
+							NULL, block_bh);
 		}
 
 		brelse(block_bh);
@@ -771,8 +773,8 @@ got:
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group));
-	BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bh2);
+	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, bh2);
 	if (err) goto fail;
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
@@ -825,7 +827,7 @@ got:
 
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 	if (insert_inode_locked(inode) < 0) {
 		err = -EINVAL;
 		goto fail_drop;
@@ -1028,4 +1030,3 @@ unsigned long ext4_count_dirs(struct super_block * sb)
 	}
 	return count;
 }
-
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c77a7ac753f..45d0f70a1f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
  * "bh" may be NULL: a metadata block may have been freed from memory
  * but there may still be a record of it in the journal, and that record
  * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling so there's nothing to do.
  */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 			struct buffer_head *bh, ext4_fsblk_t blocknr)
 {
 	int err;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	might_sleep();
 
 	BUFFER_TRACE(bh, "enter");
@@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode)
  */
 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
 {
-	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+	if (!ext4_handle_valid(handle))
+		return 0;
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 		return 0;
 	if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
 		return 0;
@@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
  */
 static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 {
+	BUG_ON(EXT4_JOURNAL(inode) == NULL);
 	jbd_debug(2, "restarting handle %p\n", handle);
 	return ext4_journal_restart(handle, blocks_for_truncate(inode));
 }
@@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode)
 	}
 
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 	inode->i_size = 0;
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
@@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode)
 	 * enough credits left in the handle to remove the inode from
 	 * the orphan list and set the dtime field.
 	 */
-	if (handle->h_buffer_credits < 3) {
+	if (!ext4_handle_has_enough_credits(handle, 3)) {
 		err = ext4_journal_extend(handle, 3);
 		if (err > 0)
 			err = ext4_journal_restart(handle, 3);
@@ -717,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
 
-		BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (err)
 			goto failed;
 	}
@@ -800,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 		 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
 		 */
 		jbd_debug(5, "splicing indirect only\n");
-		BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata");
-		err = ext4_journal_dirty_metadata(handle, where->bh);
+		BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, where->bh);
 		if (err)
 			goto err_out;
 	} else {
@@ -1229,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				set_buffer_uptodate(bh);
 			}
 			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			err = ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			err = ext4_handle_dirty_metadata(handle, inode, bh);
 			if (!fatal)
 				fatal = err;
 		} else {
@@ -1395,7 +1403,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 	if (!buffer_mapped(bh) || buffer_freed(bh))
 		return 0;
 	set_buffer_uptodate(bh);
-	return ext4_journal_dirty_metadata(handle, bh);
+	return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
 /*
@@ -2762,7 +2770,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+	BUG_ON(!EXT4_JOURNAL(inode) &&
+	       EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
+
+	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
 		 * bmap on dirty files is expected to be extremely rare:
@@ -3033,7 +3044,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
 	if (offset == 0)
 		ClearPageChecked(page);
 
-	jbd2_journal_invalidatepage(journal, page, offset);
+	if (journal)
+		jbd2_journal_invalidatepage(journal, page, offset);
+	else
+		block_invalidatepage(page, offset);
 }
 
 static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3043,7 +3057,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 	WARN_ON(PageChecked(page));
 	if (!page_has_buffers(page))
 		return 0;
-	return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page, wait);
+	else
+		return try_to_free_buffers(page);
 }
 
 /*
@@ -3315,7 +3332,7 @@ int ext4_block_truncate_page(handle_t *handle,
 
 	err = 0;
 	if (ext4_should_journal_data(inode)) {
-		err = ext4_journal_dirty_metadata(handle, bh);
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
 	} else {
 		if (ext4_should_order_data(inode))
 			err = ext4_jbd2_file_inode(handle, inode);
@@ -3439,8 +3456,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
 	__le32 *p;
 	if (try_to_extend_transaction(handle, inode)) {
 		if (bh) {
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			ext4_handle_dirty_metadata(handle, inode, bh);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ext4_journal_test_restart(handle, inode);
@@ -3540,7 +3557,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 				  count, block_to_free_p, p);
 
 	if (this_bh) {
-		BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
+		BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
 
 		/*
 		 * The buffer head should have an attached journal head at this
@@ -3549,7 +3566,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		 * the block was cleared. Check for this instead of OOPSing.
 		 */
 		if (bh2jh(this_bh))
-			ext4_journal_dirty_metadata(handle, this_bh);
+			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
 			ext4_error(inode->i_sb, __func__,
 				   "circular indirect block detected, "
@@ -3579,7 +3596,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t nr;
 	__le32 *p;
 
-	if (is_handle_aborted(handle))
+	if (ext4_handle_is_aborted(handle))
 		return;
 
 	if (depth--) {
@@ -3649,7 +3666,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * will merely complain about releasing a free block,
 			 * rather than leaking blocks.
 			 */
-			if (is_handle_aborted(handle))
+			if (ext4_handle_is_aborted(handle))
 				return;
 			if (try_to_extend_transaction(handle, inode)) {
 				ext4_mark_inode_dirty(handle, inode);
@@ -3668,9 +3685,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 								   parent_bh)){
 					*p = 0;
 					BUFFER_TRACE(parent_bh,
-					"call ext4_journal_dirty_metadata");
-					ext4_journal_dirty_metadata(handle,
-								    parent_bh);
+					"call ext4_handle_dirty_metadata");
+					ext4_handle_dirty_metadata(handle,
+								   inode,
+								   parent_bh);
 				}
 			}
 		}
@@ -3858,7 +3876,7 @@ do_indirects:
 	 * synchronous
 	 */
 	if (IS_SYNC(inode))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 out_stop:
 	/*
 	 * If this was a simple ftruncate(), and the file will remain alive
@@ -4357,8 +4375,8 @@ static int ext4_do_update_inode(handle_t *handle,
 			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
 			sb->s_dirt = 1;
-			handle->h_sync = 1;
-			err = ext4_journal_dirty_metadata(handle,
+			ext4_handle_sync(handle);
+			err = ext4_handle_dirty_metadata(handle, inode,
 					EXT4_SB(sb)->s_sbh);
 		}
 	}
@@ -4385,9 +4403,8 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 
-
-	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-	rc = ext4_journal_dirty_metadata(handle, bh);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	rc = ext4_handle_dirty_metadata(handle, inode, bh);
 	if (!err)
 		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
@@ -4450,6 +4467,25 @@ int ext4_write_inode(struct inode *inode, int wait)
 	return ext4_force_commit(inode->i_sb);
 }
 
+int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
+{
+	int err = 0;	/* stays 0 unless the synchronous write below fails */
+
+	mark_buffer_dirty(bh);
+	if (inode && inode_needs_sync(inode)) {	/* NULL inode: no sync */
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) { /* I/O failed */
+			ext4_error(inode->i_sb, __func__,
+				   "IO error syncing inode, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)bh->b_blocknr);
+			err = -EIO;
+		}
+	}
+	return err;
+}
+
 /*
  * ext4_setattr()
  *
@@ -4754,16 +4790,15 @@ int
 ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 			 struct ext4_iloc *iloc)
 {
-	int err = 0;
-	if (handle) {
-		err = ext4_get_inode_loc(inode, iloc);
-		if (!err) {
-			BUFFER_TRACE(iloc->bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle, iloc->bh);
-			if (err) {
-				brelse(iloc->bh);
-				iloc->bh = NULL;
-			}
+	int err;
+
+	err = ext4_get_inode_loc(inode, iloc);
+	if (!err) {
+		BUFFER_TRACE(iloc->bh, "get_write_access");
+		err = ext4_journal_get_write_access(handle, iloc->bh);
+		if (err) {
+			brelse(iloc->bh);
+			iloc->bh = NULL;
 		}
 	}
 	ext4_std_error(inode->i_sb, err);
@@ -4835,7 +4870,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 
 	might_sleep();
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
-	if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+	if (ext4_handle_valid(handle) &&
+	    EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
 	    !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
 		/*
 		 * We need extra buffer credits since we may write into EA block
@@ -4887,6 +4923,11 @@ void ext4_dirty_inode(struct inode *inode)
 	handle_t *current_handle = ext4_journal_current_handle();
 	handle_t *handle;
 
+	if (!ext4_handle_valid(current_handle)) {
+		ext4_mark_inode_dirty(current_handle, inode);
+		return;
+	}
+
 	handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		goto out;
@@ -4924,8 +4965,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
 			BUFFER_TRACE(iloc.bh, "get_write_access");
 			err = jbd2_journal_get_write_access(handle, iloc.bh);
 			if (!err)
-				err = ext4_journal_dirty_metadata(handle,
-								  iloc.bh);
+				err = ext4_handle_dirty_metadata(handle,
+								 inode,
+								 iloc.bh);
 			brelse(iloc.bh);
 		}
 	}
@@ -4951,6 +4993,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 */
 
 	journal = EXT4_JOURNAL(inode);
+	if (!journal)
+		return 0;
 	if (is_journal_aborted(journal))
 		return -EROFS;
 
@@ -4980,7 +5024,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		return PTR_ERR(handle);
 
 	err = ext4_mark_inode_dirty(handle, inode);
-	handle->h_sync = 1;
+	ext4_handle_sync(handle);
 	ext4_journal_stop(handle);
 	ext4_std_error(inode->i_sb, err);
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index dc99b4776d5..42dc83fb247 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			goto flags_out;
 		}
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 		err = ext4_reserve_inode_write(handle, inode, &iloc);
 		if (err)
 			goto flags_err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7beab7141dd..edb512b2ec4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2553,7 +2553,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	ext4_mb_init_per_dev_proc(sb);
 	ext4_mb_history_init(sb);
 
-	sbi->s_journal->j_commit_callback = release_blocks_on_commit;
+	if (sbi->s_journal)
+		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
 
 	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
@@ -2854,7 +2855,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
 				bitmap_bh->b_data, ac->ac_b_ex.fe_start,
 				ac->ac_b_ex.fe_len);
-		err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!err)
 			err = -EAGAIN;
 		goto out_err;
@@ -2901,10 +2902,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		spin_unlock(sb_bgl_lock(sbi, flex_group));
 	}
 
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 	if (err)
 		goto out_err;
-	err = ext4_journal_dirty_metadata(handle, gdp_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
 
 out_err:
 	sb->s_dirt = 1;
@@ -4414,7 +4415,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node;
 
-
+	BUG_ON(!ext4_handle_valid(handle));
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
@@ -4600,7 +4601,7 @@ do_more:
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 
 	if (ac) {
 		ac->ac_b_ex.fe_group = block_group;
@@ -4609,7 +4610,7 @@ do_more:
 		ext4_mb_store_history(ac);
 	}
 
-	if (metadata) {
+	if (metadata && ext4_handle_valid(handle)) {
 		/* blocks being freed are metadata. these blocks shouldn't
 		 * be used until this transaction is committed */
 		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
@@ -4639,7 +4640,7 @@ do_more:
 
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
-	ret = ext4_journal_dirty_metadata(handle, gd_bh);
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
 	if (!err)
 		err = ret;
 
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f2a9cf498ec..e7cd488da4b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
 	/*
 	 * Make sure the credit we accumalated is not really high
 	 */
-	if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) {
+	if (needed && ext4_handle_has_enough_credits(handle,
+						EXT4_RESERVE_TRANS_BLOCKS)) {
 		retval = ext4_journal_restart(handle, needed);
 		if (retval)
 			goto err_out;
@@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
 {
 	int retval = 0, needed;
 
-	if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS)
+	if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
 		return 0;
 	/*
 	 * We are freeing a blocks. During this we touch
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 84a68ae623c..08873e938ab 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1233,10 +1233,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
 		de = de2;
 	}
 	dx_insert_block(frame, hash2 + continued, newblock);
-	err = ext4_journal_dirty_metadata(handle, bh2);
+	err = ext4_handle_dirty_metadata(handle, dir, bh2);
 	if (err)
 		goto journal_error;
-	err = ext4_journal_dirty_metadata(handle, frame->bh);
+	err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
 	if (err)
 		goto journal_error;
 	brelse(bh2);
@@ -1340,8 +1340,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
 	ext4_mark_inode_dirty(handle, dir);
-	BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-	err = ext4_journal_dirty_metadata(handle, bh);
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, dir, bh);
 	if (err)
 		ext4_std_error(dir->i_sb, err);
 	brelse(bh);
@@ -1581,7 +1581,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			dxtrace(dx_show_index("node", frames[1].entries));
 			dxtrace(dx_show_index("node",
 			       ((struct dx_node *) bh2->b_data)->entries));
-			err = ext4_journal_dirty_metadata(handle, bh2);
+			err = ext4_handle_dirty_metadata(handle, inode, bh2);
 			if (err)
 				goto journal_error;
 			brelse (bh2);
@@ -1607,7 +1607,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
 			if (err)
 				goto journal_error;
 		}
-		ext4_journal_dirty_metadata(handle, frames[0].bh);
+		ext4_handle_dirty_metadata(handle, inode, frames[0].bh);
 	}
 	de = do_split(handle, dir, &bh, frame, &hinfo, &err);
 	if (!de)
@@ -1653,8 +1653,8 @@ static int ext4_delete_entry(handle_t *handle,
 			else
 				de->inode = 0;
 			dir->i_version++;
-			BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata");
-			ext4_journal_dirty_metadata(handle, bh);
+			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+			ext4_handle_dirty_metadata(handle, dir, bh);
 			return 0;
 		}
 		i += ext4_rec_len_from_disk(de->rec_len);
@@ -1732,7 +1732,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode (handle, dir, mode);
 	err = PTR_ERR(inode);
@@ -1766,7 +1766,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, mode);
 	err = PTR_ERR(inode);
@@ -1802,7 +1802,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
 	err = PTR_ERR(inode);
@@ -1831,8 +1831,8 @@ retry:
 	strcpy(de->name, "..");
 	ext4_set_de_type(dir->i_sb, de, S_IFDIR);
 	inode->i_nlink = 2;
-	BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
-	ext4_journal_dirty_metadata(handle, dir_block);
+	BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+	ext4_handle_dirty_metadata(handle, dir, dir_block);
 	brelse(dir_block);
 	ext4_mark_inode_dirty(handle, inode);
 	err = ext4_add_entry(handle, dentry, inode);
@@ -1944,6 +1944,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	struct ext4_iloc iloc;
 	int err = 0, rc;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	lock_super(sb);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
@@ -1972,7 +1975,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	/* Insert this inode at the head of the on-disk orphan list... */
 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
 	EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-	err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
 	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
 	if (!err)
 		err = rc;
@@ -2010,6 +2013,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	struct ext4_iloc iloc;
 	int err = 0;
 
+	if (!ext4_handle_valid(handle))
+		return 0;
+
 	lock_super(inode->i_sb);
 	if (list_empty(&ei->i_orphan)) {
 		unlock_super(inode->i_sb);
@@ -2028,7 +2034,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	 * transaction handle with which to update the orphan list on
 	 * disk, but we still need to remove the inode from the linked
 	 * list in memory. */
-	if (!handle)
+	if (sbi->s_journal && !handle)
 		goto out;
 
 	err = ext4_reserve_inode_write(handle, inode, &iloc);
@@ -2042,7 +2048,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		if (err)
 			goto out_brelse;
 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-		err = ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+		err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
 	} else {
 		struct ext4_iloc iloc2;
 		struct inode *i_prev =
@@ -2093,7 +2099,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 		goto end_rmdir;
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = dentry->d_inode;
 
@@ -2147,7 +2153,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	retval = -ENOENT;
 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
@@ -2204,7 +2210,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
 	err = PTR_ERR(inode);
@@ -2267,7 +2273,7 @@ retry:
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	inode->i_ctime = ext4_current_time(inode);
 	ext4_inc_count(handle, inode);
@@ -2316,7 +2322,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		return PTR_ERR(handle);
 
 	if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
-		handle->h_sync = 1;
+		ext4_handle_sync(handle);
 
 	old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
 	/*
@@ -2370,8 +2376,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_dir->i_ctime = new_dir->i_mtime =
 					ext4_current_time(new_dir);
 		ext4_mark_inode_dirty(handle, new_dir);
-		BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata");
-		ext4_journal_dirty_metadata(handle, new_bh);
+		BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
+		ext4_handle_dirty_metadata(handle, new_dir, new_bh);
 		brelse(new_bh);
 		new_bh = NULL;
 	}
@@ -2421,8 +2427,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 		BUFFER_TRACE(dir_bh, "get_write_access");
 		ext4_journal_get_write_access(handle, dir_bh);
 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
-		BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata");
-		ext4_journal_dirty_metadata(handle, dir_bh);
+		BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
+		ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
 		ext4_dec_count(handle, old_dir);
 		if (new_inode) {
 			/* checked empty_dir above, can't have another parent,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d448eb1d9ba..1665aa131d1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
 {
 	int err;
 
-	if (handle->h_buffer_credits >= thresh)
+	if (ext4_handle_has_enough_credits(handle, thresh))
 		return 0;
 
 	err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA);
@@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 		memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
 		set_buffer_uptodate(gdb);
 		unlock_buffer(gdb);
-		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_handle_dirty_metadata(handle, NULL, gdb);
 		ext4_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
@@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(bh);
 			goto exit_bh;
 		}
-		ext4_journal_dirty_metadata(handle, gdb);
+		ext4_handle_dirty_metadata(handle, NULL, gdb);
 		ext4_set_bit(bit, bh->b_data);
 		brelse(gdb);
 	}
@@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 			err = PTR_ERR(it);
 			goto exit_bh;
 		}
-		ext4_journal_dirty_metadata(handle, it);
+		ext4_handle_dirty_metadata(handle, NULL, it);
 		brelse(it);
 		ext4_set_bit(bit, bh->b_data);
 	}
@@ -286,7 +286,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
 			bh->b_data);
-	ext4_journal_dirty_metadata(handle, bh);
+	ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
 
 	/* Mark unused entries in inode bitmap used */
@@ -299,7 +299,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 
 	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
 			bh->b_data);
-	ext4_journal_dirty_metadata(handle, bh);
+	ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
 	brelse(bh);
 
@@ -486,12 +486,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	 * reserved inode, and will become GDT blocks (primary and backup).
 	 */
 	data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0;
-	ext4_journal_dirty_metadata(handle, dind);
+	ext4_handle_dirty_metadata(handle, NULL, dind);
 	brelse(dind);
 	inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
 	ext4_mark_iloc_dirty(handle, inode, &iloc);
 	memset((*primary)->b_data, 0, sb->s_blocksize);
-	ext4_journal_dirty_metadata(handle, *primary);
+	ext4_handle_dirty_metadata(handle, NULL, *primary);
 
 	o_group_desc = EXT4_SB(sb)->s_group_desc;
 	memcpy(n_group_desc, o_group_desc,
@@ -502,7 +502,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
 	kfree(o_group_desc);
 
 	le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 
 	return 0;
 
@@ -618,7 +618,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 		       primary[i]->b_blocknr, gdbackups,
 		       blk + primary[i]->b_blocknr); */
 		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
-		err2 = ext4_journal_dirty_metadata(handle, primary[i]);
+		err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
 		if (!err)
 			err = err2;
 	}
@@ -676,7 +676,8 @@ static void update_backups(struct super_block *sb,
 		struct buffer_head *bh;
 
 		/* Out of journal space, and can't get more - abort - so sad */
-		if (handle->h_buffer_credits == 0 &&
+		if (ext4_handle_valid(handle) &&
+		    handle->h_buffer_credits == 0 &&
 		    ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) &&
 		    (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
 			break;
@@ -696,7 +697,7 @@ static void update_backups(struct super_block *sb,
 			memset(bh->b_data + size, 0, rest);
 		set_buffer_uptodate(bh);
 		unlock_buffer(bh);
-		ext4_journal_dirty_metadata(handle, bh);
+		ext4_handle_dirty_metadata(handle, NULL, bh);
 		brelse(bh);
 	}
 	if ((err2 = ext4_journal_stop(handle)) && !err)
@@ -916,7 +917,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
 
-	ext4_journal_dirty_metadata(handle, primary);
+	ext4_handle_dirty_metadata(handle, NULL, primary);
 
 	/* Update the reserved block counts only once the new group is
 	 * active. */
@@ -938,7 +939,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 			EXT4_INODES_PER_GROUP(sb);
 	}
 
-	ext4_journal_dirty_metadata(handle, sbi->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
 	sb->s_dirt = 1;
 
 exit_journal:
@@ -1072,7 +1073,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
-	ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8a0ae883f56..9b9076d9c4f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -136,13 +136,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
 	journal = EXT4_SB(sb)->s_journal;
-	if (is_journal_aborted(journal)) {
-		ext4_abort(sb, __func__,
-			   "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+	if (journal) {
+		if (is_journal_aborted(journal)) {
+			ext4_abort(sb, __func__,
+				   "Detected aborted journal");
+			return ERR_PTR(-EROFS);
+		}
+		return jbd2_journal_start(journal, nblocks);
 	}
-
-	return jbd2_journal_start(journal, nblocks);
+	/*
+	 * We're not journaling, return the appropriate indication.
+	 */
+	current->journal_info = EXT4_NOJOURNAL_HANDLE;
+	return current->journal_info;
 }
 
 /*
@@ -157,6 +163,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
 	int err;
 	int rc;
 
+	if (!ext4_handle_valid(handle)) {
+		/*
+		 * Do this here since we don't call jbd2_journal_stop() in
+		 * no-journal mode.
+		 */
+		current->journal_info = NULL;
+		return 0;
+	}
 	sb = handle->h_transaction->t_journal->j_private;
 	err = handle->h_err;
 	rc = jbd2_journal_stop(handle);
@@ -174,6 +188,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn,
 	char nbuf[16];
 	const char *errstr = ext4_decode_error(NULL, err, nbuf);
 
+	BUG_ON(!ext4_handle_valid(handle));
+
 	if (bh)
 		BUFFER_TRACE(bh, "abort");
 
@@ -448,11 +464,13 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
-	err = jbd2_journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
-	if (err < 0)
-		ext4_abort(sb, __func__, "Couldn't clean up the journal");
-
+	if (sbi->s_journal) {
+		err = jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+		if (err < 0)
+			ext4_abort(sb, __func__,
+				   "Couldn't clean up the journal");
+	}
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -522,6 +540,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
 	INIT_LIST_HEAD(&ei->i_prealloc_list);
 	spin_lock_init(&ei->i_prealloc_lock);
+	/*
+	 * Note:  We can be called before EXT4_SB(sb)->s_journal is set,
+	 * therefore it can be null here.  Don't check it, just initialize
+	 * jinode.
+	 */
 	jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
 	ei->i_reserved_data_blocks = 0;
 	ei->i_reserved_meta_blocks = 0;
@@ -588,7 +611,8 @@ static void ext4_clear_inode(struct inode *inode)
 	}
 #endif
 	ext4_discard_preallocations(inode);
-	jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+	if (EXT4_JOURNAL(inode))
+		jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
 				       &EXT4_I(inode)->jinode);
 }
 
@@ -1406,20 +1430,15 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		printk(KERN_WARNING
 		       "EXT4-fs warning: checktime reached, "
 		       "running e2fsck is recommended\n");
-#if 0
-		/* @@@ We _will_ want to clear the valid bit if we find
-		 * inconsistencies, to force a fsck at reboot.  But for
-		 * a plain journaled filesystem we can keep it set as
-		 * valid forever! :)
-		 */
-	es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
-#endif
+	if (!sbi->s_journal) 
+		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
 	le16_add_cpu(&es->s_mnt_count, 1);
 	es->s_mtime = cpu_to_le32(get_seconds());
 	ext4_update_dynamic_rev(sb);
-	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	if (sbi->s_journal)
+		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
 	ext4_commit_super(sb, es, 1);
 	if (test_opt(sb, DEBUG))
@@ -1431,9 +1450,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-	       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-	       "external", EXT4_SB(sb)->s_journal->j_devname);
+	if (EXT4_SB(sb)->s_journal) {
+		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+		       "external", EXT4_SB(sb)->s_journal->j_devname);
+	} else {
+		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+	}
 	return res;
 }
 
@@ -1867,6 +1890,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned long def_mount_opts;
 	struct inode *root;
 	char *cp;
+	const char *descr;
 	int ret = -EINVAL;
 	int blocksize;
 	int db_count;
@@ -2278,21 +2302,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 				ext4_commit_super(sb, es, 1);
-				printk(KERN_CRIT
-				       "EXT4-fs (device %s): mount failed\n",
-				      sb->s_id);
 				goto failed_mount4;
 			}
 		}
 	} else if (journal_inum) {
 		if (ext4_create_journal(sb, es, journal_inum))
 			goto failed_mount3;
+	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
+	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+		printk(KERN_ERR "EXT4-fs: required journal recovery "
+		       "suppressed and not mounted read-only\n");
+		goto failed_mount4;
 	} else {
-		if (!silent)
-			printk(KERN_ERR
-			       "ext4: No journal on filesystem on %s\n",
-			       sb->s_id);
-		goto failed_mount3;
+		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
+		set_opt(sbi->s_mount_opt, WRITEBACK_DATA);
+		sbi->s_journal = NULL;
+		needs_recovery = 0;
+		goto no_journal;
 	}
 
 	if (ext4_blocks_count(es) > 0xffffffffULL &&
@@ -2344,6 +2370,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		break;
 	}
 
+no_journal:
+
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
@@ -2428,13 +2456,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-	if (needs_recovery)
+	if (needs_recovery) {
 		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
-	ext4_mark_recovery_complete(sb, es);
-	printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
-	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
-	       test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
-	       "writeback");
+		ext4_mark_recovery_complete(sb, es);
+	}
+	if (EXT4_SB(sb)->s_journal) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			descr = " journalled data mode";
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			descr = " ordered data mode";
+		else
+			descr = " writeback data mode";
+	} else
+		descr = "out journal";
+
+	printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
+	       sb->s_id, descr);
 
 	lock_kernel();
 	return 0;
@@ -2446,8 +2483,11 @@ cantfind_ext4:
 	goto failed_mount;
 
 failed_mount4:
-	jbd2_journal_destroy(sbi->s_journal);
-	sbi->s_journal = NULL;
+	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	if (sbi->s_journal) {
+		jbd2_journal_destroy(sbi->s_journal);
+		sbi->s_journal = NULL;
+	}
 failed_mount3:
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -2508,6 +2548,8 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 	struct inode *journal_inode;
 	journal_t *journal;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	/* First, test for the existence of a valid inode on disk.  Bad
 	 * things happen if we iget() an unused inode, as the subsequent
 	 * iput() will try to delete it. */
@@ -2556,6 +2598,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	struct ext4_super_block *es;
 	struct block_device *bdev;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	bdev = ext4_blkdev_get(j_dev);
 	if (bdev == NULL)
 		return NULL;
@@ -2643,6 +2687,8 @@ static int ext4_load_journal(struct super_block *sb,
 	int err = 0;
 	int really_read_only;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
@@ -2817,6 +2863,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		BUG_ON(journal != NULL);
+		return;
+	}
 	jbd2_journal_lock_updates(journal);
 	if (jbd2_journal_flush(journal) < 0)
 		goto out;
@@ -2846,6 +2896,8 @@ static void ext4_clear_journal_err(struct super_block *sb,
 	int j_errno;
 	const char *errstr;
 
+	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+
 	journal = EXT4_SB(sb)->s_journal;
 
 	/*
@@ -2878,14 +2930,17 @@ static void ext4_clear_journal_err(struct super_block *sb,
 int ext4_force_commit(struct super_block *sb)
 {
 	journal_t *journal;
-	int ret;
+	int ret = 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	sb->s_dirt = 0;
-	ret = ext4_journal_force_commit(journal);
+	if (journal) {
+		sb->s_dirt = 0;
+		ret = ext4_journal_force_commit(journal);
+	}
+
 	return ret;
 }
 
@@ -2897,9 +2952,13 @@ int ext4_force_commit(struct super_block *sb)
  */
 static void ext4_write_super(struct super_block *sb)
 {
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
+	if (EXT4_SB(sb)->s_journal) {
+		if (mutex_trylock(&sb->s_lock) != 0)
+			BUG();
+		sb->s_dirt = 0;
+	} else {
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+	}
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -2908,10 +2967,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
-	if (wait)
-		ret = ext4_force_commit(sb);
-	else
-		jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	if (EXT4_SB(sb)->s_journal) {
+		if (wait)
+			ret = ext4_force_commit(sb);
+		else
+ 			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+	} else {
+		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+	}
 	return ret;
 }
 
@@ -2926,15 +2989,17 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal_t *journal = EXT4_SB(sb)->s_journal;
 
-		/* Now we set up the journal barrier. */
-		jbd2_journal_lock_updates(journal);
+		if (journal) {
+			/* Now we set up the journal barrier. */
+			jbd2_journal_lock_updates(journal);
 
-		/*
-		 * We don't want to clear needs_recovery flag when we failed
-		 * to flush the journal.
-		 */
-		if (jbd2_journal_flush(journal) < 0)
-			return;
+			/*
+			 * We don't want to clear needs_recovery flag when we
+			 * failed to flush the journal.
+			 */
+			if (jbd2_journal_flush(journal) < 0)
+				return;
+		}
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2948,7 +3013,7 @@ static void ext4_write_super_lockfs(struct super_block *sb)
  */
 static void ext4_unlockfs(struct super_block *sb)
 {
-	if (!(sb->s_flags & MS_RDONLY)) {
+	if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
 		lock_super(sb);
 		/* Reser the needs_recovery flag before the fs is unlocked. */
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2999,7 +3064,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 	es = sbi->s_es;
 
-	ext4_init_journal_params(sb, sbi->s_journal);
+	if (sbi->s_journal)
+		ext4_init_journal_params(sb, sbi->s_journal);
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > ext4_blocks_count(es)) {
@@ -3028,9 +3094,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * We have to unlock super so that we can wait for
 			 * transactions.
 			 */
-			unlock_super(sb);
-			ext4_mark_recovery_complete(sb, es);
-			lock_super(sb);
+			if (sbi->s_journal) {
+				unlock_super(sb);
+				ext4_mark_recovery_complete(sb, es);
+				lock_super(sb);
+			}
 		} else {
 			__le32 ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -3084,7 +3152,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * been changed by e2fsck since we originally mounted
 			 * the partition.)
 			 */
-			ext4_clear_journal_err(sb, es);
+			if (sbi->s_journal)
+				ext4_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
 			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
 				goto restore_opts;
@@ -3092,6 +3161,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
+	if (sbi->s_journal == NULL)
+		ext4_commit_super(sb, es, 1);
+
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -3368,7 +3440,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	 * When we journal data on quota file, we have to flush journal to see
 	 * all updates to the file when we bypass pagecache...
 	 */
-	if (ext4_should_journal_data(path.dentry->d_inode)) {
+	if (EXT4_SB(sb)->s_journal &&
+	    ext4_should_journal_data(path.dentry->d_inode)) {
 		/*
 		 * We don't need to lock updates but journal_flush() could
 		 * otherwise be livelocked...
@@ -3442,7 +3515,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	struct buffer_head *bh;
 	handle_t *handle = journal_current_handle();
 
-	if (!handle) {
+	if (EXT4_SB(sb)->s_journal && !handle) {
 		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
 			" cancelled because transaction is not started.\n",
 			(unsigned long long)off, (unsigned long long)len);
@@ -3467,7 +3540,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 		flush_dcache_page(bh->b_page);
 		unlock_buffer(bh);
 		if (journal_quota)
-			err = ext4_journal_dirty_metadata(handle, bh);
+			err = ext4_handle_dirty_metadata(handle, NULL, bh);
 		else {
 			/* Always do at least ordered writes for quotas */
 			err = ext4_jbd2_file_inode(handle, inode);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 9b4a368c572..157ce6589c5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
 	if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
 		EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
 		sb->s_dirt = 1;
-		ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
+		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	}
 }
 
@@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
 	} else {
 		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-		error = ext4_journal_dirty_metadata(handle, bh);
+		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 		DQUOT_FREE_BLOCK(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
@@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			if (error == -EIO)
 				goto bad_block;
 			if (!error)
-				error = ext4_journal_dirty_metadata(handle,
-								    bs->bh);
+				error = ext4_handle_dirty_metadata(handle,
+								   inode,
+								   bs->bh);
 			if (error)
 				goto cleanup;
 			goto inserted;
@@ -794,8 +795,9 @@ inserted:
 				ea_bdebug(new_bh, "reusing; refcount now=%d",
 					le32_to_cpu(BHDR(new_bh)->h_refcount));
 				unlock_buffer(new_bh);
-				error = ext4_journal_dirty_metadata(handle,
-								    new_bh);
+				error = ext4_handle_dirty_metadata(handle,
+								   inode,
+								   new_bh);
 				if (error)
 					goto cleanup_dquot;
 			}
@@ -833,7 +835,8 @@ getblk_failed:
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
 			ext4_xattr_cache_insert(new_bh);
-			error = ext4_journal_dirty_metadata(handle, new_bh);
+			error = ext4_handle_dirty_metadata(handle,
+							   inode, new_bh);
 			if (error)
 				goto cleanup;
 		}
@@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		 */
 		is.iloc.bh = NULL;
 		if (IS_SYNC(inode))
-			handle->h_sync = 1;
+			ext4_handle_sync(handle);
 	}
 
 cleanup:
-- 
cgit v1.2.3


From fd98496f467b3d26d05ab1498f41718b5ef13de5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 5 Jan 2009 21:34:13 -0500
Subject: jbd2: Add barrier not supported test to journal_wait_on_commit_record

Xen doesn't report that barriers are not supported until buffer I/O is
reported as completed, instead of when the buffer I/O is submitted.
Add a check and a fallback codepath to journal_wait_on_commit_record()
to detect this case, so that attempts to mount ext4 filesystems on
LVM/devicemapper devices on Xen guests don't blow up with an "Aborting
journal on device XXX"; "Remounting filesystem read-only" error.

Thanks to Andreas Sundstrom for reporting this issue.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/jbd2/commit.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a..6393fd0d804 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
+#include <linux/bio.h>
 
 /*
  * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal,
  * This function along with journal_submit_commit_record
  * allows to write the commit record asynchronously.
  */
-static int journal_wait_on_commit_record(struct buffer_head *bh)
+static int journal_wait_on_commit_record(journal_t *journal,
+					 struct buffer_head *bh)
 {
 	int ret = 0;
 
+retry:
 	clear_buffer_dirty(bh);
 	wait_on_buffer(bh);
+	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
+		printk(KERN_WARNING
+		       "JBD2: wait_on_commit_record: sync failed on %s - "
+		       "disabling barriers\n", journal->j_devname);
+		spin_lock(&journal->j_state_lock);
+		journal->j_flags &= ~JBD2_BARRIER;
+		spin_unlock(&journal->j_state_lock);
+
+		lock_buffer(bh);
+		clear_buffer_dirty(bh);
+		set_buffer_uptodate(bh);
+		bh->b_end_io = journal_end_buffer_io_sync;
+
+		ret = submit_bh(WRITE_SYNC, bh);
+		if (ret) {
+			unlock_buffer(bh);
+			return ret;
+		}
+		goto retry;
+	}
 
 	if (unlikely(!buffer_uptodate(bh)))
 		ret = -EIO;
@@ -799,7 +822,7 @@ wait_for_iobuf:
 			__jbd2_journal_abort_hard(journal);
 	}
 	if (!err && !is_journal_aborted(journal))
-		err = journal_wait_on_commit_record(cbh);
+		err = journal_wait_on_commit_record(journal, cbh);
 
 	if (err)
 		jbd2_journal_abort(journal, err);
-- 
cgit v1.2.3


From fde4d95ad8711c84a36735a17136c45b19746af9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 5 Jan 2009 22:17:35 -0500
Subject: ext4: remove extraneous newlines from calls to ext4_error() and
 ext4_warning()

This removes annoying blank syslog entries emitted by ext4_error() or
ext4_warning(), since these functions add their own newline.

Signed-off-by: Nick Warne <nick@ukfsn.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  2 +-
 fs/ext4/ialloc.c  |  2 +-
 fs/ext4/mballoc.c | 24 ++++++++++++------------
 fs/ext4/resize.c  |  7 +++----
 fs/ext4/super.c   |  4 ++--
 5 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 31ebeb5e7b0..0cb1c4572f5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -100,7 +100,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 			ext4_error(sb, __func__,
-				  "Checksum bad for group %lu\n", block_group);
+				  "Checksum bad for group %lu", block_group);
 			gdp->bg_free_blocks_count = 0;
 			gdp->bg_free_inodes_count = 0;
 			gdp->bg_itable_unused = 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9dd21b75f4b..4794d2ce613 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %lu\n",
+		ext4_error(sb, __func__, "Checksum bad for group %lu",
 			   block_group);
 		gdp->bg_free_blocks_count = 0;
 		gdp->bg_free_inodes_count = 0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index edb512b2ec4..48d606cd740 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -447,7 +447,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 
 			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)\n",
+				   " %lu's block %llu(bit %u in group %lu)",
 				   inode ? inode->i_ino : 0, blocknr,
 				   first + i, e4b->bd_group);
 		}
@@ -691,7 +691,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 	if (free != grp->bb_free) {
 		ext4_error(sb, __func__,
-			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n",
+			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd",
 			group, free, grp->bb_free);
 		/*
 		 * If we intent to continue, we consider group descritor
@@ -1096,7 +1096,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_unlock_group(sb, e4b->bd_group);
 			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)\n",
+				   " %lu's block %llu(bit %u in group %lu)",
 				   inode ? inode->i_ino : 0, blocknr, block,
 				   e4b->bd_group);
 			ext4_lock_group(sb, e4b->bd_group);
@@ -1576,7 +1576,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * we have free blocks
 			 */
 			ext4_error(sb, __func__, "%d free blocks as per "
-					"group info. But bitmap says 0\n",
+					"group info. But bitmap says 0",
 					free);
 			break;
 		}
@@ -1585,7 +1585,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		BUG_ON(ex.fe_len <= 0);
 		if (free < ex.fe_len) {
 			ext4_error(sb, __func__, "%d free blocks as per "
-					"group info. But got %d blocks\n",
+					"group info. But got %d blocks",
 					free, ex.fe_len);
 			/*
 			 * The number of free blocks differs. This mostly
@@ -3629,7 +3629,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		ext4_error(sb, __func__, "free %u, pa_free %u\n",
+		ext4_error(sb, __func__, "free %u, pa_free %u",
 						free, pa->pa_free);
 		/*
 		 * pa is already deleted so we use the value obtained
@@ -3703,14 +3703,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
 		ext4_error(sb, __func__, "Error in reading block "
-				"bitmap for %lu\n", group);
+				"bitmap for %lu", group);
 		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
 		ext4_error(sb, __func__, "Error in loading buddy "
-				"information for %lu\n", group);
+				"information for %lu", group);
 		put_bh(bitmap_bh);
 		return 0;
 	}
@@ -3877,14 +3877,14 @@ repeat:
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu\n", group);
+					"information for %lu", group);
 			continue;
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, __func__, "Error in reading block "
-					"bitmap for %lu\n", group);
+					"bitmap for %lu", group);
 			ext4_mb_release_desc(&e4b);
 			continue;
 		}
@@ -4149,7 +4149,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu\n", group);
+					"information for %lu", group);
 			continue;
 		}
 		ext4_lock_group(sb, group);
@@ -4446,7 +4446,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else {
 			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
-			    "Double free of blocks %d (%d %d)\n",
+			    "Double free of blocks %d (%d %d)",
 			    block, entry->start_blk, entry->count);
 			return 0;
 		}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 1665aa131d1..41133811744 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -762,13 +762,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	if (ext4_blocks_count(es) + input->blocks_count <
 	    ext4_blocks_count(es)) {
-		ext4_warning(sb, __func__, "blocks_count overflow\n");
+		ext4_warning(sb, __func__, "blocks_count overflow");
 		return -EINVAL;
 	}
 
 	if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) <
 	    le32_to_cpu(es->s_inodes_count)) {
-		ext4_warning(sb, __func__, "inodes_count overflow\n");
+		ext4_warning(sb, __func__, "inodes_count overflow");
 		return -EINVAL;
 	}
 
@@ -999,8 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
 		return -EINVAL;
 	}
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9b9076d9c4f..dc27d4c613c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1309,7 +1309,7 @@ set_qf_format:
 					EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 				ext4_warning(sb, __func__,
 					"extents feature not enabled "
-					"on this filesystem, use tune2fs\n");
+					"on this filesystem, use tune2fs");
 				return 0;
 			}
 			set_opt(sbi->s_mount_opt, EXTENTS);
@@ -1993,7 +1993,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	else
 		ext4_warning(sb, __func__,
 			"extents feature not enabled on this filesystem, "
-			"use tune2fs.\n");
+			"use tune2fs.");
 
 	/*
 	 * enable delayed allocation by default
-- 
cgit v1.2.3


From 032115fcef837a00336ddf7bda584e89789ea498 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:34:30 -0500
Subject: ext4: Don't overwrite allocation_context ac_status

We can call ext4_mb_check_limits even after successfully allocating
the requested blocks.  In that case, make sure we don't overwrite
ac_status if it already has the status AC_STATUS_FOUND.  This fixes
the lockdep warning:

=============================================
[ INFO: possible recursive locking detected ]
2.6.28-rc6-autokern1 #1
---------------------------------------------
fsstress/11948 is trying to acquire lock:
 (&meta_group_info[i]->alloc_sem){----}, at: [<c04d9a49>] ext4_mb_load_buddy+0x9f/0x278
.....

stack backtrace:
.....
 [<c04db974>] ext4_mb_regular_allocator+0xbb5/0xd44
.....

but task is already holding lock:
 (&meta_group_info[i]->alloc_sem){----}, at: [<c04d9a49>] ext4_mb_load_buddy+0x9f/0x278

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 48d606cd740..6dea637b020 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1326,6 +1326,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
 	struct ext4_free_extent ex;
 	int max;
 
+	if (ac->ac_status == AC_STATUS_FOUND)
+		return;
 	/*
 	 * We don't want to scan for a whole year
 	 */
-- 
cgit v1.2.3


From e07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 26 Nov 2008 01:14:26 -0500
Subject: jbd2: improve jbd2 fsync batching

This patch removes the static sleep time in favor of a more
self-optimizing approach where we measure the average amount of time
it takes to commit a transaction to disk and the amount of time a
transaction has been running.  Traditionally, if somebody did a sync
write or an fsync(), we would sleep for 1 jiffy, which depending on
the value of HZ could be a significant amount of time compared to how
long it takes to commit a transaction to the underlying storage.  With
this patch, instead of sleeping for a jiffy, we check to see if the
amount of time this transaction has been running is less than the
average commit time, and if it is we sleep for the delta using
schedule_hrtimeout to give us a higher precision sleep time.  This
greatly benefits high-end storage, where you could otherwise end up
sleeping for longer than it takes to commit the transaction and
therefore sitting idle instead of allowing the transaction to be
committed; keeping the sleep time to a minimum ensures you are always
doing something.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c      | 14 +++++++++++++
 fs/jbd2/transaction.c | 58 ++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 58 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6393fd0d804..f22d1828ea8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -355,6 +355,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int flags;
 	int err;
 	unsigned long long blocknr;
+	ktime_t start_time;
+	u64 commit_time;
 	char *tagp = NULL;
 	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
@@ -481,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
 	spin_unlock(&journal->j_state_lock);
@@ -995,6 +998,17 @@ restart_loop:
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in the commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time +
+				journal->j_average_commit_time*3) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
 	spin_unlock(&journal->j_state_lock);
 
 	if (journal->j_commit_callback)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599..13dcbc990f4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
 {
 	transaction->t_journal = journal;
 	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
@@ -1193,7 +1195,7 @@ int jbd2_journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int old_handle_count, err;
+	int err;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1216,24 +1218,52 @@ int jbd2_journal_stop(handle_t *handle)
 	/*
 	 * Implement synchronous transaction batching.  If the handle
 	 * was synchronous, don't force a commit immediately.  Let's
-	 * yield and let another thread piggyback onto this transaction.
-	 * Keep doing that while new threads continue to arrive.
-	 * It doesn't cost much - we're about to run a commit and sleep
-	 * on IO anyway.  Speeds up many-threaded, many-dir operations
-	 * by 30x or more...
+	 * yield and let another thread piggyback onto this
+	 * transaction.  Keep doing that while new threads continue to
+	 * arrive.  It doesn't cost much - we're about to run a commit
+	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
+	 * operations by 30x or more...
+	 *
+	 * We try and optimize the sleep time against what the
+	 * underlying disk can do, instead of having a static sleep
+	 * time.  This is useful for the case where our storage is so
+	 * fast that it is more optimal to go ahead and force a flush
+	 * and wait for the transaction to be committed than it is to
+	 * wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We achieve this by measuring how
+	 * long it takes to commit a transaction, and compare it with
+	 * how long this transaction has been running, and if run time
+	 * < commit time then we sleep for the delta and commit.  This
+	 * greatly helps super fast disks that would see slowdowns as
+	 * more threads started doing fsyncs.
 	 *
-	 * But don't do this if this process was the most recent one to
-	 * perform a synchronous write.  We do this to detect the case where a
-	 * single process is doing a stream of sync writes.  No point in waiting
-	 * for joiners in that case.
+	 * But don't do this if this process was the most recent one
+	 * to perform a synchronous write.  We do this to detect the
+	 * case where a single process is doing a stream of sync
+	 * writes.  No point in waiting for joiners in that case.
 	 */
 	pid = current->pid;
 	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		u64 commit_time, trans_time;
+
 		journal->j_last_sync_writer = pid;
-		do {
-			old_handle_count = transaction->t_handle_count;
-			schedule_timeout_uninterruptible(1);
-		} while (old_handle_count != transaction->t_handle_count);
+
+		spin_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		spin_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = min_t(u64, commit_time,
+				    1000*jiffies_to_usecs(1));
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
 	}
 
 	current->journal_info = NULL;
-- 
cgit v1.2.3


From d7cfa4684d82f58e5d7cb73b8a3c88c169937f25 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 17 Dec 2008 00:20:45 -0500
Subject: ext4: display average commit time

Display the average commit time (which is used by the ext4 fsync
batching patch) in /proc/fs/jbd2/*/info for performance tuning
purposes.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f..74d87290381 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -40,6 +40,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
+#include <asm/div64.h>
 
 EXPORT_SYMBOL(jbd2_journal_start);
 EXPORT_SYMBOL(jbd2_journal_restart);
@@ -824,6 +825,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
 	seq_printf(seq, "  %ums logging transaction\n",
 	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+	seq_printf(seq, "  %luus average transaction commit time\n",
+		   do_div(s->journal->j_average_commit_time, 1000));
 	seq_printf(seq, "  %lu handles per transaction\n",
 	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
 	seq_printf(seq, "  %lu blocks per transaction\n",
-- 
cgit v1.2.3


From 30773840c19cea60dcef39545960d541b1ac1cf8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Jan 2009 20:27:38 -0500
Subject: ext4: add fsync batch tuning knobs

Add new mount options, min_batch_time and max_batch_time, which
control how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h        |  7 +++++++
 fs/ext4/ext4_sb.h     |  2 ++
 fs/ext4/super.c       | 47 ++++++++++++++++++++++++++++++++++++++++-------
 fs/jbd2/journal.c     |  2 ++
 fs/jbd2/transaction.c |  4 +++-
 5 files changed, 54 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ac8551e0b70..9ba9fd6d14d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -328,6 +328,7 @@ struct ext4_mount_options {
 	uid_t s_resuid;
 	gid_t s_resgid;
 	unsigned long s_commit_interval;
+	u32 s_min_batch_time, s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	int s_jquota_fmt;
 	char *s_qf_names[MAXQUOTAS];
@@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEFM_JMODE_ORDERED	0x0040
 #define EXT4_DEFM_JMODE_WBACK	0x0060
 
+/*
+ * Default journal batch times
+ */
+#define EXT4_DEF_MIN_BATCH_TIME	0
+#define EXT4_DEF_MAX_BATCH_TIME	15000 /* 15ms */
+
 /*
  * Structure of a directory entry
  */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 3db800f399a..039b6ea1a04 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -74,6 +74,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
 	struct block_device *journal_bdev;
 #ifdef CONFIG_JBD2_DEBUG
 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc27d4c613c..da377f9521b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 #endif
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
-	if (sbi->s_commit_interval) {
+	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
 		seq_printf(seq, ",commit=%u",
 			   (unsigned) (sbi->s_commit_interval / HZ));
 	}
+	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
+		seq_printf(seq, ",min_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
+		seq_printf(seq, ",max_batch_time=%u",
+			   (unsigned) sbi->s_min_batch_time);
+	}
+
 	/*
 	 * We're changing the default of barrier mount option, so
 	 * let's always display its mount state so it's clear what its
@@ -874,7 +883,8 @@ enum {
 	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
-	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
+	Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
@@ -913,6 +923,8 @@ static const match_table_t tokens = {
 	{Opt_nobh, "nobh"},
 	{Opt_bh, "bh"},
 	{Opt_commit, "commit=%u"},
+	{Opt_min_batch_time, "min_batch_time=%u"},
+	{Opt_max_batch_time, "max_batch_time=%u"},
 	{Opt_journal_update, "journal=update"},
 	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
 				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			break;
+		case Opt_max_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = option;
+			break;
+		case Opt_min_batch_time:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_min_batch_time = option;
+			break;
 		case Opt_data_journal:
 			data_opt = EXT4_MOUNT_JOURNAL_DATA;
 			goto datacheck;
@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
 	sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
+	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
+	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
 
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (sbi->s_commit_interval)
-		journal->j_commit_interval = sbi->s_commit_interval;
-	/* We could also set up an ext4-specific default for the commit
-	 * interval here, but for now we'll just fall back to the jbd
-	 * default. */
+	journal->j_commit_interval = sbi->s_commit_interval;
+	journal->j_min_batch_time = sbi->s_min_batch_time;
+	journal->j_max_batch_time = sbi->s_max_batch_time;
 
 	spin_lock(&journal->j_state_lock);
 	if (test_opt(sb, BARRIER))
@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	old_opts.s_resuid = sbi->s_resuid;
 	old_opts.s_resgid = sbi->s_resgid;
 	old_opts.s_commit_interval = sbi->s_commit_interval;
+	old_opts.s_min_batch_time = sbi->s_min_batch_time;
+	old_opts.s_max_batch_time = sbi->s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -3178,6 +3209,8 @@ restore_opts:
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sbi->s_commit_interval = old_opts.s_commit_interval;
+	sbi->s_min_batch_time = old_opts.s_min_batch_time;
+	sbi->s_max_batch_time = old_opts.s_max_batch_time;
 #ifdef CONFIG_QUOTA
 	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
 	for (i = 0; i < MAXQUOTAS; i++) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 74d87290381..fd1d7557a09 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_state_lock);
 
 	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
+	journal->j_min_batch_time = 0;
+	journal->j_max_batch_time = 15000; /* 15ms */
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 13dcbc990f4..48c21bac5a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
 		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
 						   transaction->t_start_time));
 
+		commit_time = max_t(u64, commit_time,
+				    1000*journal->j_min_batch_time);
 		commit_time = min_t(u64, commit_time,
-				    1000*jiffies_to_usecs(1));
+				    1000*journal->j_max_batch_time);
 
 		if (trans_time < commit_time) {
 			ktime_t expires = ktime_add_ns(ktime_get(),
-- 
cgit v1.2.3


From cde6436004ad9cd8cab5a874b6fa8b01f1da91bf Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 4 Nov 2008 18:46:03 -0500
Subject: ext4: Remove i_ext_generation from ext4_inode_info structure

The i_ext_generation was incremented, but never used.  Remove it to
slim down the ext4_inode_info structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_extents.h | 5 -----
 fs/ext4/ext4_i.h       | 1 -
 fs/ext4/extents.c      | 2 --
 3 files changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index bec7ce59fc0..18cb67b2cbb 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode)
 	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
 }
 
-static inline void ext4_ext_tree_changed(struct inode *inode)
-{
-	EXT4_I(inode)->i_ext_generation++;
-}
-
 static inline void
 ext4_ext_invalidate_cache(struct inode *inode)
 {
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 5c124c0ac6d..acc0b726d8a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -117,7 +117,6 @@ struct ext4_inode_info {
 	struct inode vfs_inode;
 	struct jbd2_inode jinode;
 
-	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
 	/*
 	 * File creation time. Its function is same as that of
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 743e3feb3e5..b9e27bc3155 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1621,7 +1621,6 @@ cleanup:
 		ext4_ext_drop_refs(npath);
 		kfree(npath);
 	}
-	ext4_ext_tree_changed(inode);
 	ext4_ext_invalidate_cache(inode);
 	return err;
 }
@@ -2232,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 		}
 	}
 out:
-	ext4_ext_tree_changed(inode);
 	ext4_ext_drop_refs(path);
 	kfree(path);
 	ext4_journal_stop(handle);
-- 
cgit v1.2.3


From a9df9a49102f3578909cba7bd33784eb3b9caaa4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 5 Jan 2009 22:18:16 -0500
Subject: ext4: Make ext4_group_t be an unsigned int

Nearly all places in the ext3/4 code which use "unsigned long" are
probably bugs, since on 32-bit systems a ulong is 32 bits, which means
we are wasting stack space on 64-bit systems.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 10 +++++-----
 fs/ext4/ext4.h    |  2 +-
 fs/ext4/ext4_i.h  |  2 +-
 fs/ext4/ialloc.c  |  8 ++++----
 fs/ext4/mballoc.c | 58 +++++++++++++++++++++++++++----------------------------
 fs/ext4/resize.c  |  4 ++--
 fs/ext4/super.c   | 14 +++++++-------
 7 files changed, 49 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 0cb1c4572f5..a711898923f 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -100,7 +100,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 * essentially implementing a per-group read-only flag. */
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 			ext4_error(sb, __func__,
-				  "Checksum bad for group %lu", block_group);
+				  "Checksum bad for group %u", block_group);
 			gdp->bg_free_blocks_count = 0;
 			gdp->bg_free_inodes_count = 0;
 			gdp->bg_itable_unused = 0;
@@ -213,7 +213,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	if (block_group >= sbi->s_groups_count) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
-			   "block_group = %lu, groups_count = %lu",
+			   "block_group = %u, groups_count = %u",
 			   block_group, sbi->s_groups_count);
 
 		return NULL;
@@ -225,7 +225,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	if (!sbi->s_group_desc[group_desc]) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "Group descriptor not loaded - "
-			   "block_group = %lu, group_desc = %lu, desc = %lu",
+			   "block_group = %u, group_desc = %lu, desc = %lu",
 			   block_group, group_desc, offset);
 		return NULL;
 	}
@@ -315,7 +315,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (unlikely(!bh)) {
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %lu, block_bitmap = %llu",
+			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
@@ -337,7 +337,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		put_bh(bh);
 		ext4_error(sb, __func__,
 			    "Cannot read block bitmap - "
-			    "block_group = %lu, block_bitmap = %llu",
+			    "block_group = %u, block_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9ba9fd6d14d..e9aacecfbf4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
 #define ERR_BAD_DX_DIR	-75000
 
 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
-			unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+			ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp);
 
 extern struct proc_dir_entry *ext4_proc_root;
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index acc0b726d8a..0a9ebe58019 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t;
 typedef __u32 ext4_lblk_t;
 
 /* data type for block group number */
-typedef unsigned long ext4_group_t;
+typedef unsigned int ext4_group_t;
 
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4794d2ce613..cac3617ec78 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -74,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	/* If checksum is bad mark all blocks and inodes use to prevent
 	 * allocation, essentially implementing a per-group read-only flag. */
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
-		ext4_error(sb, __func__, "Checksum bad for group %lu",
+		ext4_error(sb, __func__, "Checksum bad for group %u",
 			   block_group);
 		gdp->bg_free_blocks_count = 0;
 		gdp->bg_free_inodes_count = 0;
@@ -111,7 +111,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (unlikely(!bh)) {
 		ext4_error(sb, __func__,
 			    "Cannot read inode bitmap - "
-			    "block_group = %lu, inode_bitmap = %llu",
+			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
@@ -133,7 +133,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		put_bh(bh);
 		ext4_error(sb, __func__,
 			    "Cannot read inode bitmap - "
-			    "block_group = %lu, inode_bitmap = %llu",
+			    "block_group = %u, inode_bitmap = %llu",
 			    block_group, bitmap_blk);
 		return NULL;
 	}
@@ -690,7 +690,7 @@ got:
 	    ino > EXT4_INODES_PER_GROUP(sb)) {
 		ext4_error(sb, __func__,
 			   "reserved inode or inode > inodes count - "
-			   "block_group = %lu, inode=%lu", group,
+			   "block_group = %u, inode=%lu", group,
 			   ino + group * EXT4_INODES_PER_GROUP(sb));
 		err = -EIO;
 		goto fail;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6dea637b020..6cfe68a7e07 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -447,7 +447,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 
 			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)",
+				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr,
 				   first + i, e4b->bd_group);
 		}
@@ -477,7 +477,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 		b2 = (unsigned char *) bitmap;
 		for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 			if (b1[i] != b2[i]) {
-				printk(KERN_ERR "corruption in group %lu "
+				printk(KERN_ERR "corruption in group %u "
 				       "at byte %u(%u): %x in copy != %x "
 				       "on disk/prealloc\n",
 				       e4b->bd_group, i, i * 8, b1[i], b2[i]);
@@ -691,7 +691,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 	if (free != grp->bb_free) {
 		ext4_error(sb, __func__,
-			"EXT4-fs: group %lu: %u blocks in bitmap, %u in gd",
+			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
 			group, free, grp->bb_free);
 		/*
 		 * If we intent to continue, we consider group descritor
@@ -800,7 +800,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		get_bh(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
-		mb_debug("read bitmap for group %lu\n", first_group + i);
+		mb_debug("read bitmap for group %u\n", first_group + i);
 	}
 
 	/* wait for I/O completion */
@@ -895,7 +895,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	struct page *page;
 	int ret;
 
-	mb_debug("load group %lu\n", group);
+	mb_debug("load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 
@@ -1096,7 +1096,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
 			ext4_unlock_group(sb, e4b->bd_group);
 			ext4_error(sb, __func__, "double-free of inode"
-				   " %lu's block %llu(bit %u in group %lu)",
+				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr, block,
 				   e4b->bd_group);
 			ext4_lock_group(sb, e4b->bd_group);
@@ -1934,13 +1934,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
 			"%-5u %-5s %-5u %-6u\n";
-		sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-		sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
 			hs->orig.fe_start, hs->orig.fe_len,
 			hs->orig.fe_logical);
-		sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group,
+		sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
 			hs->goal.fe_start, hs->goal.fe_len,
 			hs->goal.fe_logical);
 		seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
@@ -1949,20 +1949,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
 				hs->buddy ? 1 << hs->buddy : 0);
 	} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
 		fmt = "%-5u %-8u %-23s %-23s %-23s\n";
-		sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len,
 			hs->result.fe_logical);
-		sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group,
+		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
 			hs->orig.fe_start, hs->orig.fe_len,
 			hs->orig.fe_logical);
 		seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
 	} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
-		sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len);
 		seq_printf(seq, "%-5u %-8u %-23s discard\n",
 				hs->pid, hs->ino, buf2);
 	} else if (hs->op == EXT4_MB_HISTORY_FREE) {
-		sprintf(buf2, "%lu/%d/%u", hs->result.fe_group,
+		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
 			hs->result.fe_start, hs->result.fe_len);
 		seq_printf(seq, "%-5u %-8u %-23s free\n",
 				hs->pid, hs->ino, buf2);
@@ -2075,7 +2075,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 		return NULL;
 
 	group = *pos + 1;
-	return (void *) group;
+	return (void *) ((unsigned long) group);
 }
 
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -2088,13 +2088,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (*pos < 0 || *pos >= sbi->s_groups_count)
 		return NULL;
 	group = *pos + 1;
-	return (void *) group;;
+	return (void *) ((unsigned long) group);
 }
 
 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
 	struct super_block *sb = seq->private;
-	long group = (long) v;
+	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
 	int i;
 	int err;
 	struct ext4_buddy e4b;
@@ -2116,7 +2116,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 		sizeof(struct ext4_group_info);
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
-		seq_printf(seq, "#%-5lu: I/O error\n", group);
+		seq_printf(seq, "#%-5u: I/O error\n", group);
 		return 0;
 	}
 	ext4_lock_group(sb, group);
@@ -2124,7 +2124,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_unlock_group(sb, group);
 	ext4_mb_release_desc(&e4b);
 
-	seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free,
+	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
 	for (i = 0; i <= 13; i++)
 		seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
@@ -2459,7 +2459,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			printk(KERN_ERR
-				"EXT4-fs: can't read descriptor %lu\n", i);
+				"EXT4-fs: can't read descriptor %u\n", i);
 			goto err_freebuddy;
 		}
 		if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2657,7 +2657,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 	list_for_each_safe(l, ltmp, &txn->t_private_list) {
 		entry = list_entry(l, struct ext4_free_data, list);
 
-		mb_debug("gonna free %u blocks in group %lu (0x%p):",
+		mb_debug("gonna free %u blocks in group %u (0x%p):",
 			 entry->count, entry->group, entry);
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2829,7 +2829,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (!gdp)
 		goto out_err;
 
-	ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
 			gdp->bg_free_blocks_count);
 
 	err = ext4_journal_get_write_access(handle, gdp_bh);
@@ -3351,7 +3351,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		preallocated += len;
 		count++;
 	}
-	mb_debug("prellocated %u for group %lu\n", preallocated, group);
+	mb_debug("prellocated %u for group %u\n", preallocated, group);
 }
 
 static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3368,7 +3368,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 			struct super_block *sb, struct ext4_prealloc_space *pa)
 {
-	unsigned long grp;
+	ext4_group_t grp;
 
 	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
 		return;
@@ -3697,7 +3697,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	int busy = 0;
 	int free = 0;
 
-	mb_debug("discard preallocation for group %lu\n", group);
+	mb_debug("discard preallocation for group %u\n", group);
 
 	if (list_empty(&grp->bb_prealloc_list))
 		return 0;
@@ -3705,14 +3705,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (bitmap_bh == NULL) {
 		ext4_error(sb, __func__, "Error in reading block "
-				"bitmap for %lu", group);
+				"bitmap for %u", group);
 		return 0;
 	}
 
 	err = ext4_mb_load_buddy(sb, group, &e4b);
 	if (err) {
 		ext4_error(sb, __func__, "Error in loading buddy "
-				"information for %lu", group);
+				"information for %u", group);
 		put_bh(bitmap_bh);
 		return 0;
 	}
@@ -3879,14 +3879,14 @@ repeat:
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu", group);
+					"information for %u", group);
 			continue;
 		}
 
 		bitmap_bh = ext4_read_block_bitmap(sb, group);
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, __func__, "Error in reading block "
-					"bitmap for %lu", group);
+					"bitmap for %u", group);
 			ext4_mb_release_desc(&e4b);
 			continue;
 		}
@@ -4151,7 +4151,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
 		if (ext4_mb_load_buddy(sb, group, &e4b)) {
 			ext4_error(sb, __func__, "Error in loading buddy "
-					"information for %lu", group);
+					"information for %u", group);
 			continue;
 		}
 		ext4_lock_group(sb, group);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 41133811744..1865d6a53de 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb,
 	ext4_get_group_no_and_offset(sb, start, NULL, &offset);
 	if (group != sbi->s_groups_count)
 		ext4_warning(sb, __func__,
-			     "Cannot add at group %u (only %lu groups)",
+			     "Cannot add at group %u (only %u groups)",
 			     input->group, sbi->s_groups_count);
 	else if (offset != 0)
 			ext4_warning(sb, __func__, "Last group not full");
@@ -716,7 +716,7 @@ static void update_backups(struct super_block *sb,
 exit_err:
 	if (err) {
 		ext4_warning(sb, __func__,
-			     "can't update backup for group %lu (err %d), "
+			     "can't update backup for group %u (err %d), "
 			     "forcing fsck on next reboot", group, err);
 		sbi->s_mount_state &= ~EXT4_VALID_FS;
 		sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index da377f9521b..8fa57be5040 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1470,7 +1470,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 
 	ext4_commit_super(sb, es, 1);
 	if (test_opt(sb, DEBUG))
-		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, "
+		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
 				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
 			sb->s_blocksize,
 			sbi->s_groups_count,
@@ -1514,7 +1514,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 				     sizeof(struct flex_groups), GFP_KERNEL);
 	if (sbi->s_flex_groups == NULL) {
 		printk(KERN_ERR "EXT4-fs: not enough memory for "
-				"%lu flex groups\n", flex_group_count);
+				"%u flex groups\n", flex_group_count);
 		goto failed;
 	}
 
@@ -1599,14 +1599,14 @@ static int ext4_check_descriptors(struct super_block *sb)
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Block bitmap for group %lu not in group "
+			       "Block bitmap for group %u not in group "
 			       "(block %llu)!\n", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Inode bitmap for group %lu not in group "
+			       "Inode bitmap for group %u not in group "
 			       "(block %llu)!\n", i, inode_bitmap);
 			return 0;
 		}
@@ -1614,14 +1614,14 @@ static int ext4_check_descriptors(struct super_block *sb)
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Inode table for group %lu not in group "
+			       "Inode table for group %u not in group "
 			       "(block %llu)!\n", i, inode_table);
 			return 0;
 		}
 		spin_lock(sb_bgl_lock(sbi, i));
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %lu failed (%u!=%u)\n",
+			       "Checksum for group %u failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
@@ -3154,7 +3154,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
 					printk(KERN_ERR
 	       "EXT4-fs: ext4_remount: "
-		"Checksum for group %lu failed (%u!=%u)\n",
+		"Checksum for group %u failed (%u!=%u)\n",
 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
 					err = -EINVAL;
-- 
cgit v1.2.3


From 498e5f24158da7bf8fa48074a70e370e22844492 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 5 Nov 2008 00:14:04 -0500
Subject: ext4: Change unsigned long to unsigned int

Convert the unsigned longs that are most responsible for bloating the
stack usage on 64-bit systems.

Nearly all places in the ext3/4 code which use "unsigned long" are
probably bugs, since on 32-bit systems a ulong is 32 bits, which means
we are wasting stack space on 64-bit systems.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 14 +++++++-------
 fs/ext4/bitmap.c  |  5 ++---
 fs/ext4/dir.c     |  8 ++++----
 fs/ext4/ext4.h    | 22 ++++++++++------------
 fs/ext4/ext4_i.h  | 13 +++++++------
 fs/ext4/extents.c | 24 ++++++++++++------------
 fs/ext4/inode.c   | 25 +++++++++++++------------
 fs/ext4/mballoc.c | 20 ++++++++++----------
 fs/ext4/namei.c   | 25 ++++++++++++-------------
 9 files changed, 77 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a711898923f..a0c23b03a26 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -205,8 +205,8 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 					     ext4_group_t block_group,
 					     struct buffer_head **bh)
 {
-	unsigned long group_desc;
-	unsigned long offset;
+	unsigned int group_desc;
+	unsigned int offset;
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -225,7 +225,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 	if (!sbi->s_group_desc[group_desc]) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "Group descriptor not loaded - "
-			   "block_group = %u, group_desc = %lu, desc = %lu",
+			   "block_group = %u, group_desc = %u, desc = %u",
 			   block_group, group_desc, offset);
 		return NULL;
 	}
@@ -372,8 +372,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
 	ext4_grpblk_t bit;
-	unsigned long i;
-	unsigned long overflow;
+	unsigned int i;
+	unsigned int overflow;
 	struct ext4_group_desc *desc;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
@@ -720,7 +720,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
-	unsigned long x;
+	unsigned int x;
 	struct buffer_head *bitmap_bh = NULL;
 
 	es = EXT4_SB(sb)->s_es;
@@ -740,7 +740,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
+		printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
 			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
 		bitmap_count += x;
 	}
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 0a7a6663c19..fa3af81ac56 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,10 +15,9 @@
 
 static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
 
-unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
+unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
 {
-	unsigned int i;
-	unsigned long sum = 0;
+	unsigned int i, sum = 0;
 
 	if (!map)
 		return 0;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index fed5b610df5..cf3ccf4a94b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
 int ext4_check_dir_entry(const char *function, struct inode *dir,
 			 struct ext4_dir_entry_2 *de,
 			 struct buffer_head *bh,
-			 unsigned long offset)
+			 unsigned int offset)
 {
 	const char *error_msg = NULL;
 	const int rlen = ext4_rec_len_from_disk(de->rec_len);
@@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 	if (error_msg != NULL)
 		ext4_error(dir->i_sb, function,
 			"bad entry in directory #%lu: %s - "
-			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+			"offset=%u, inode=%u, rec_len=%d, name_len=%d",
 			dir->i_ino, error_msg, offset,
-			(unsigned long) le32_to_cpu(de->inode),
+			le32_to_cpu(de->inode),
 			rlen, de->name_len);
 	return error_msg == NULL ? 1 : 0;
 }
@@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp,
 			 void *dirent, filldir_t filldir)
 {
 	int error = 0;
-	unsigned long offset;
+	unsigned int offset;
 	int i, stored;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e9aacecfbf4..558545d1fea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -94,9 +94,9 @@ struct ext4_allocation_request {
 	/* phys. block for ^^^ */
 	ext4_fsblk_t pright;
 	/* how many blocks we want to allocate */
-	unsigned long len;
+	unsigned int len;
 	/* flags. see above EXT4_MB_HINT_* */
-	unsigned long flags;
+	unsigned int flags;
 };
 
 /*
@@ -997,6 +997,9 @@ do {									\
 # define ATTRIB_NORET	__attribute__((noreturn))
 # define NORET_AND	noreturn,
 
+/* bitmap.c */
+extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+
 /* balloc.c */
 extern unsigned int ext4_block_group(struct super_block *sb,
 			ext4_fsblk_t blocknr);
@@ -1024,7 +1027,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
 				struct ext4_dir_entry_2 *,
-				struct buffer_head *, unsigned long);
+				struct buffer_head *, unsigned int);
 extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 				    __u32 minor_hash,
 				    struct ext4_dir_entry_2 *dirent);
@@ -1044,7 +1047,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
-extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1074,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 				struct buffer_head *bh_result, int create);
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-				ext4_lblk_t iblock, unsigned long maxblocks,
-				struct buffer_head *bh_result,
-				int create, int extend_disksize);
 
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int  ext4_write_inode(struct inode *, int);
@@ -1276,16 +1274,16 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock,
-			unsigned long max_blocks, struct buffer_head *bh_result,
-			int create, int extend_disksize);
+			       ext4_lblk_t iblock, unsigned int max_blocks,
+			       struct buffer_head *bh_result,
+			       int create, int extend_disksize);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-			sector_t block, unsigned long max_blocks,
+			sector_t block, unsigned int max_blocks,
 			struct buffer_head *bh, int create,
 			int extend_disksize, int flag);
 #endif	/* __KERNEL__ */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 0a9ebe58019..e69acc16f5c 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -100,9 +100,6 @@ struct ext4_inode_info {
 	 */
 	loff_t	i_disksize;
 
-	/* on-disk additional length */
-	__u16 i_extra_isize;
-
 	/*
 	 * i_data_sem is for serialising ext4_truncate() against
 	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
@@ -129,10 +126,14 @@ struct ext4_inode_info {
 	spinlock_t i_prealloc_lock;
 
 	/* allocation reservation info for delalloc */
-	unsigned long i_reserved_data_blocks;
-	unsigned long i_reserved_meta_blocks;
-	unsigned long i_allocated_meta_blocks;
+	unsigned int i_reserved_data_blocks;
+	unsigned int i_reserved_meta_blocks;
+	unsigned int i_allocated_meta_blocks;
 	unsigned short i_delalloc_reserved_flag;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+
 	spinlock_t i_block_reservation_lock;
 };
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b9e27bc3155..b92cb60737b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2377,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 						struct inode *inode,
 						struct ext4_ext_path *path,
 						ext4_lblk_t iblock,
-						unsigned long max_blocks)
+						unsigned int max_blocks)
 {
 	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex1 = NULL;
@@ -2675,26 +2675,26 @@ fix_extent_len:
  */
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
-			unsigned long max_blocks, struct buffer_head *bh_result,
+			unsigned int max_blocks, struct buffer_head *bh_result,
 			int create, int extend_disksize)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex;
-	ext4_fsblk_t goal, newblock;
-	int err = 0, depth, ret;
-	unsigned long allocated = 0;
+	ext4_fsblk_t newblock;
+	int err = 0, depth, ret, cache_type;
+	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
-	ext_debug("blocks %u/%lu requested for inode %u\n",
+	ext_debug("blocks %u/%u requested for inode %u\n",
 			iblock, max_blocks, inode->i_ino);
 
 	/* check in cache */
-	goal = ext4_ext_in_cache(inode, iblock, &newex);
-	if (goal) {
-		if (goal == EXT4_EXT_CACHE_GAP) {
+	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+	if (cache_type) {
+		if (cache_type == EXT4_EXT_CACHE_GAP) {
 			if (!create) {
 				/*
 				 * block isn't allocated yet and
@@ -2703,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				goto out2;
 			}
 			/* we should allocate requested block */
-		} else if (goal == EXT4_EXT_CACHE_EXTENT) {
+		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
 			newblock = iblock
 				   - le32_to_cpu(newex.ee_block)
@@ -2851,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (!newblock)
 		goto out2;
 	ext_debug("allocate new block: goal %llu, found %llu/%lu\n",
-			goal, newblock, allocated);
+		  ar.goal, newblock, allocated);
 
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
@@ -3001,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	handle_t *handle;
 	ext4_lblk_t block;
 	loff_t new_size;
-	unsigned long max_blocks;
+	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 45d0f70a1f0..91e06f88f08 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -514,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
  *	return the total number of blocks to be allocate, including the
  *	direct and indirect blocks.
  */
-static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
 		int blocks_to_boundary)
 {
-	unsigned long count = 0;
+	unsigned int count = 0;
 
 	/*
 	 * Simple case, [t,d]Indirect block(s) has not allocated yet
@@ -856,10 +856,10 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
-int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
-		ext4_lblk_t iblock, unsigned long maxblocks,
-		struct buffer_head *bh_result,
-		int create, int extend_disksize)
+static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+				  ext4_lblk_t iblock, unsigned int maxblocks,
+				  struct buffer_head *bh_result,
+				  int create, int extend_disksize)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
@@ -1061,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * It returns the error in case of allocation failure.
  */
 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-			unsigned long max_blocks, struct buffer_head *bh,
+			unsigned int max_blocks, struct buffer_head *bh,
 			int create, int extend_disksize, int flag)
 {
 	int retval;
@@ -1641,7 +1641,7 @@ struct mpage_da_data {
 	get_block_t *get_block;
 	struct writeback_control *wbc;
 	int io_done;
-	long pages_written;
+	int pages_written;
 	int retval;
 };
 
@@ -1855,9 +1855,9 @@ static void ext4_print_free_blocks(struct inode *inode)
 	printk(KERN_EMERG "dirty_blocks=%lld\n",
 			(long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
 	printk(KERN_EMERG "Block reservation details\n");
-	printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+	printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
 			EXT4_I(inode)->i_reserved_data_blocks);
-	printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+	printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
 			EXT4_I(inode)->i_reserved_meta_blocks);
 	return;
 }
@@ -2307,7 +2307,7 @@ static int ext4_da_writepage(struct page *page,
 {
 	int ret = 0;
 	loff_t size;
-	unsigned long len;
+	unsigned int len;
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
 
@@ -2416,7 +2416,8 @@ static int ext4_da_writepages(struct address_space *mapping,
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
 	int no_nrwrite_index_update;
-	long pages_written = 0, pages_skipped;
+	int pages_written = 0;
+	long pages_skipped;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6cfe68a7e07..1d78435ce38 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2796,7 +2796,7 @@ void exit_ext4_mballoc(void)
  */
 static noinline_for_stack int
 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
-				handle_t *handle, unsigned long reserv_blks)
+				handle_t *handle, unsigned int reserv_blks)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_super_block *es;
@@ -3036,7 +3036,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	/* check we don't cross already preallocated blocks */
 	rcu_read_lock();
 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		unsigned long pa_end;
+		ext4_lblk_t pa_end;
 
 		if (pa->pa_deleted)
 			continue;
@@ -3080,7 +3080,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	/* XXX: extra loop to check we really don't overlap preallocations */
 	rcu_read_lock();
 	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		unsigned long pa_end;
+		ext4_lblk_t pa_end;
 		spin_lock(&pa->pa_lock);
 		if (pa->pa_deleted == 0) {
 			pa_end = pa->pa_lstart + pa->pa_len;
@@ -3584,8 +3584,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	unsigned long end;
-	unsigned long next;
+	unsigned int end;
+	unsigned int next;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 	sector_t start;
@@ -4029,8 +4029,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
 	ext4_group_t group;
-	unsigned long len;
-	unsigned long goal;
+	unsigned int len;
+	ext4_fsblk_t goal;
 	ext4_grpblk_t block;
 
 	/* we can't allocate > group size */
@@ -4291,8 +4291,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	unsigned long inquota;
-	unsigned long reserv_blks = 0;
+	unsigned int inquota;
+	unsigned int reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
@@ -4504,7 +4504,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
-	unsigned long overflow;
+	unsigned int overflow;
 	ext4_grpblk_t bit;
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 08873e938ab..183a09a8b14 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -811,7 +811,7 @@ static inline int ext4_match (int len, const char * const name,
 static inline int search_dirblock(struct buffer_head *bh,
 				  struct inode *dir,
 				  const struct qstr *d_name,
-				  unsigned long offset,
+				  unsigned int offset,
 				  struct ext4_dir_entry_2 ** res_dir)
 {
 	struct ext4_dir_entry_2 * de;
@@ -1048,11 +1048,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 	bh = ext4_find_entry(dir, &dentry->d_name, &de);
 	inode = NULL;
 	if (bh) {
-		unsigned long ino = le32_to_cpu(de->inode);
+		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
 			ext4_error(dir->i_sb, "ext4_lookup",
-				   "bad inode number: %lu", ino);
+				   "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
@@ -1065,7 +1065,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 
 struct dentry *ext4_get_parent(struct dentry *child)
 {
-	unsigned long ino;
+	__u32 ino;
 	struct inode *inode;
 	static const struct qstr dotdot = {
 		.name = "..",
@@ -1083,7 +1083,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
 		ext4_error(child->d_inode->i_sb, "ext4_get_parent",
-			   "bad inode number: %lu", ino);
+			   "bad inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
 
@@ -1271,7 +1271,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	struct inode	*dir = dentry->d_parent->d_inode;
 	const char	*name = dentry->d_name.name;
 	int		namelen = dentry->d_name.len;
-	unsigned long	offset = 0;
+	unsigned int	offset = 0;
 	unsigned short	reclen;
 	int		nlen, rlen, err;
 	char		*top;
@@ -1444,7 +1444,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 			  struct inode *inode)
 {
 	struct inode *dir = dentry->d_parent->d_inode;
-	unsigned long offset;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de;
 	struct super_block *sb;
@@ -1466,7 +1465,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		ext4_mark_inode_dirty(handle, dir);
 	}
 	blocks = dir->i_size >> sb->s_blocksize_bits;
-	for (block = 0, offset = 0; block < blocks; block++) {
+	for (block = 0; block < blocks; block++) {
 		bh = ext4_bread(handle, dir, block, 0, &retval);
 		if(!bh)
 			return retval;
@@ -1861,7 +1860,7 @@ out_stop:
  */
 static int empty_dir(struct inode *inode)
 {
-	unsigned long offset;
+	unsigned int offset;
 	struct buffer_head *bh;
 	struct ext4_dir_entry_2 *de, *de1;
 	struct super_block *sb;
@@ -1906,7 +1905,7 @@ static int empty_dir(struct inode *inode)
 				if (err)
 					ext4_error(sb, __func__,
 						   "error %d reading directory"
-						   " #%lu offset %lu",
+						   " #%lu offset %u",
 						   err, inode->i_ino, offset);
 				offset += sb->s_blocksize;
 				continue;
@@ -2009,7 +2008,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	struct list_head *prev;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi;
-	unsigned long ino_next;
+	__u32 ino_next;
 	struct ext4_iloc iloc;
 	int err = 0;
 
@@ -2042,7 +2041,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		goto out_err;
 
 	if (prev == &sbi->s_orphan) {
-		jbd_debug(4, "superblock will point to %lu\n", ino_next);
+		jbd_debug(4, "superblock will point to %u\n", ino_next);
 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
 		err = ext4_journal_get_write_access(handle, sbi->s_sbh);
 		if (err)
@@ -2054,7 +2053,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		struct inode *i_prev =
 			&list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
 
-		jbd_debug(4, "orphan inode %lu will point to %lu\n",
+		jbd_debug(4, "orphan inode %lu will point to %u\n",
 			  i_prev->i_ino, ino_next);
 		err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
 		if (err)
-- 
cgit v1.2.3


From 1a0d3786dd57dbd74f340322054c3d618b999dcf Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 5 Nov 2008 00:09:22 -0500
Subject: jbd2: Remove a large array of bh's from the stack of the checkpoint
 routine

jbd2_log_do_checkpoint() is one of the kernel's largest stack users.
Move the array of buffer heads from the stack of jbd2_log_do_checkpoint()
to the in-core journal structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 22 +++++++++-------------
 fs/jbd2/journal.c    |  2 ++
 2 files changed, 11 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9497718fe92..adc08ec875e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -249,16 +249,14 @@ restart:
 	return ret;
 }
 
-#define NR_BATCH	64
-
 static void
-__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
+__flush_batch(journal_t *journal, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, bhs);
+	ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
 	for (i = 0; i < *batch_count; i++) {
-		struct buffer_head *bh = bhs[i];
+		struct buffer_head *bh = journal->j_chkpt_bhs[i];
 		clear_buffer_jwrite(bh);
 		BUFFER_TRACE(bh, "brelse");
 		__brelse(bh);
@@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
-			struct buffer_head **bhs, int *batch_count,
-			transaction_t *transaction)
+			    int *batch_count, transaction_t *transaction)
 {
 	struct buffer_head *bh = jh2bh(jh);
 	int ret = 0;
@@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
 		get_bh(bh);
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
 		set_buffer_jwrite(bh);
-		bhs[*batch_count] = bh;
+		journal->j_chkpt_bhs[*batch_count] = bh;
 		__buffer_relink_io(jh);
 		jbd_unlock_bh_state(bh);
 		transaction->t_chp_stats.cs_written++;
 		(*batch_count)++;
-		if (*batch_count == NR_BATCH) {
+		if (*batch_count == JBD2_NR_BATCH) {
 			spin_unlock(&journal->j_list_lock);
-			__flush_batch(journal, bhs, batch_count);
+			__flush_batch(journal, batch_count);
 			ret = 1;
 		}
 	}
@@ -388,7 +385,6 @@ restart:
 	if (journal->j_checkpoint_transactions == transaction &&
 			transaction->t_tid == this_tid) {
 		int batch_count = 0;
-		struct buffer_head *bhs[NR_BATCH];
 		struct journal_head *jh;
 		int retry = 0, err;
 
@@ -402,7 +398,7 @@ restart:
 				retry = 1;
 				break;
 			}
-			retry = __process_buffer(journal, jh, bhs, &batch_count,
+			retry = __process_buffer(journal, jh, &batch_count,
 						 transaction);
 			if (retry < 0 && !result)
 				result = retry;
@@ -419,7 +415,7 @@ restart:
 				spin_unlock(&journal->j_list_lock);
 				retry = 1;
 			}
-			__flush_batch(journal, bhs, &batch_count);
+			__flush_batch(journal, &batch_count);
 		}
 
 		if (retry) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fd1d7557a09..34ef9805720 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1477,7 +1477,9 @@ int jbd2_journal_destroy(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
+		mutex_lock(&journal->j_checkpoint_mutex);
 		jbd2_log_do_checkpoint(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
 	}
 
-- 
cgit v1.2.3


From 3a06d778dfeda7eaeeb79bfa49cf97f2aae132b4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 22 Nov 2008 15:04:59 -0500
Subject: ext4: sparse fixes

* Change EXT4_HAS_*_FEATURE to return a boolean
* Add a function prototype for ext4_fiemap() in ext4.h
* Make ext4_ext_fiemap_cb() and ext4_xattr_fiemap() be static functions
* Add lock annotations to mb_free_blocks()

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  9 ++++++---
 fs/ext4/extents.c |  5 +++--
 fs/ext4/file.c    |  3 ---
 fs/ext4/inode.c   |  2 +-
 fs/ext4/mballoc.c |  4 +++-
 fs/ext4/super.c   | 19 +++++++++++--------
 6 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 558545d1fea..5125c1f6e7e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -727,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
  */
 
 #define EXT4_HAS_COMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0)
 #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask)			\
-	(EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
+	((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0)
 #define EXT4_SET_COMPAT_FEATURE(sb,mask)			\
 	EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
 #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask)			\
@@ -1286,6 +1286,9 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
 			sector_t block, unsigned int max_blocks,
 			struct buffer_head *bh, int create,
 			int extend_disksize, int flag);
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+			__u64 start, __u64 len);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b92cb60737b..c64080e4949 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3080,7 +3080,7 @@ retry:
 /*
  * Callback function called for each extent to gather FIEMAP information.
  */
-int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		       struct ext4_ext_cache *newex, struct ext4_extent *ex,
 		       void *data)
 {
@@ -3149,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 /* fiemap flags we can handle specified here */
 #define EXT4_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
 
-int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+static int ext4_xattr_fiemap(struct inode *inode,
+				struct fiemap_extent_info *fieinfo)
 {
 	__u64 physical = 0;
 	__u64 length;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6bd11fba71f..f731cb545a0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		__u64 start, __u64 len);
-
 const struct file_operations ext4_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 91e06f88f08..bcd5ffa76c0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3907,7 +3907,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
 	ext4_fsblk_t		block;
 	int			inodes_per_block, inode_offset;
 
-	iloc->bh = 0;
+	iloc->bh = NULL;
 	if (!ext4_valid_inum(sb, inode->i_ino))
 		return -EIO;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1d78435ce38..edf9730ba72 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1056,6 +1056,8 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 
 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			  int first, int count)
+__releases(bitlock)
+__acquires(bitlock)
 {
 	int block = 0;
 	int max = 0;
@@ -2244,7 +2246,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 
 
 /* Create and initialize ext4_group_info data for the given group. */
-int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
 	int i, len;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8fa57be5040..a9dd1170bfe 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1924,7 +1924,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int db_count;
 	int i;
 	int needs_recovery, has_huge_files;
-	__le32 features;
+	int features;
 	__u64 blocks_count;
 	int err;
 
@@ -2056,15 +2056,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		       "unsupported optional features (%x).\n", sb->s_id,
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n",
-		       sb->s_id, le32_to_cpu(features));
+		       "unsupported optional features (%x).\n", sb->s_id,
+			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
 	}
 	has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -3131,13 +3133,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				lock_super(sb);
 			}
 		} else {
-			__le32 ret;
+			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
 				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
 				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n",
-				       sb->s_id, le32_to_cpu(ret));
+				       "optional features (%x).\n", sb->s_id,
+				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+					~EXT4_FEATURE_RO_COMPAT_SUPP));
 				err = -EROFS;
 				goto restore_opts;
 			}
-- 
cgit v1.2.3


From e21675d4b63975d09eb75c443c48ebe663d23e18 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:36:02 -0500
Subject: ext4: Add blocks added during resize to bitmap

With this change, new blocks added during resize
are marked as free in the block bitmap and the
group is flagged with the EXT4_GROUP_INFO_NEED_INIT_BIT
flag.  This makes sure that when mballoc tries to allocate
blocks from the new group, it reloads the
buddy information using the bitmap present on disk.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/balloc.c | 136 ++++++++++++-------------------------------------------
 fs/ext4/ext4.h   |   5 +-
 fs/ext4/resize.c |  11 +----
 3 files changed, 34 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a0c23b03a26..c54192e2384 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -20,6 +20,7 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "group.h"
+#include "mballoc.h"
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -350,62 +351,43 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 }
 
 /**
- * ext4_free_blocks_sb() -- Free given blocks and update quota
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
  * @handle:			handle to this transaction
  * @sb:				super block
- * @block:			start physcial block to free
+ * @block:			start physcial block to add to the block group
  * @count:			number of blocks to free
- * @pdquot_freed_blocks:	pointer to quota
  *
- * XXX This function is only used by the on-line resizing code, which
- * should probably be fixed up to call the mballoc variant.  There
- * this needs to be cleaned up later; in fact, I'm not convinced this
- * is 100% correct in the face of the mballoc code.  The online resizing
- * code needs to be fixed up to more tightly (and correctly) interlock
- * with the mballoc code.
+ * This marks the blocks as free in the bitmap. We ask the
+ * mballoc to reload the buddy after this by setting group
+ * EXT4_GROUP_INFO_NEED_INIT_BIT flag
  */
-void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-			 ext4_fsblk_t block, unsigned long count,
-			 unsigned long *pdquot_freed_blocks)
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count)
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct buffer_head *gd_bh;
 	ext4_group_t block_group;
 	ext4_grpblk_t bit;
 	unsigned int i;
-	unsigned int overflow;
 	struct ext4_group_desc *desc;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
 	int err = 0, ret;
-	ext4_grpblk_t group_freed;
+	ext4_grpblk_t blocks_freed;
+	struct ext4_group_info *grp;
 
-	*pdquot_freed_blocks = 0;
 	sbi = EXT4_SB(sb);
 	es = sbi->s_es;
-	if (block < le32_to_cpu(es->s_first_data_block) ||
-	    block + count < block ||
-	    block + count > ext4_blocks_count(es)) {
-		ext4_error(sb, "ext4_free_blocks",
-			   "Freeing blocks not in datazone - "
-			   "block = %llu, count = %lu", block, count);
-		goto error_return;
-	}
-
-	ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
+	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
-do_more:
-	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
 	/*
 	 * Check to see if we are freeing blocks across a group
 	 * boundary.
 	 */
 	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
-		overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
-		count -= overflow;
+		goto error_return;
 	}
-	brelse(bitmap_bh);
 	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
@@ -418,18 +400,17 @@ do_more:
 	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
 	    in_range(block + count - 1, ext4_inode_table(sb, desc),
 		     sbi->s_itb_per_group)) {
-		ext4_error(sb, "ext4_free_blocks",
-			   "Freeing blocks in system zones - "
+		ext4_error(sb, __func__,
+			   "Adding blocks in system zones - "
 			   "Block = %llu, count = %lu",
 			   block, count);
 		goto error_return;
 	}
 
 	/*
-	 * We are about to start releasing blocks in the bitmap,
+	 * We are about to add blocks to the bitmap,
 	 * so we need undo access.
 	 */
-	/* @@@ check errors */
 	BUFFER_TRACE(bitmap_bh, "getting undo access");
 	err = ext4_journal_get_undo_access(handle, bitmap_bh);
 	if (err)
@@ -445,87 +426,28 @@ do_more:
 	if (err)
 		goto error_return;
 
-	jbd_lock_bh_state(bitmap_bh);
-
-	for (i = 0, group_freed = 0; i < count; i++) {
-		/*
-		 * An HJ special.  This is expensive...
-		 */
-#ifdef CONFIG_JBD2_DEBUG
-		jbd_unlock_bh_state(bitmap_bh);
-		{
-			struct buffer_head *debug_bh;
-			debug_bh = sb_find_get_block(sb, block + i);
-			if (debug_bh) {
-				BUFFER_TRACE(debug_bh, "Deleted!");
-				if (!bh2jh(bitmap_bh)->b_committed_data)
-					BUFFER_TRACE(debug_bh,
-						"No commited data in bitmap");
-				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
-				__brelse(debug_bh);
-			}
-		}
-		jbd_lock_bh_state(bitmap_bh);
-#endif
-		if (need_resched()) {
-			jbd_unlock_bh_state(bitmap_bh);
-			cond_resched();
-			jbd_lock_bh_state(bitmap_bh);
-		}
-		/* @@@ This prevents newly-allocated data from being
-		 * freed and then reallocated within the same
-		 * transaction.
-		 *
-		 * Ideally we would want to allow that to happen, but to
-		 * do so requires making jbd2_journal_forget() capable of
-		 * revoking the queued write of a data block, which
-		 * implies blocking on the journal lock.  *forget()
-		 * cannot block due to truncate races.
-		 *
-		 * Eventually we can fix this by making jbd2_journal_forget()
-		 * return a status indicating whether or not it was able
-		 * to revoke the buffer.  On successful revoke, it is
-		 * safe not to set the allocation bit in the committed
-		 * bitmap, because we know that there is no outstanding
-		 * activity on the buffer any more and so it is safe to
-		 * reallocate it.
-		 */
-		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
-		J_ASSERT_BH(bitmap_bh,
-				bh2jh(bitmap_bh)->b_committed_data != NULL);
-		ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
-				bh2jh(bitmap_bh)->b_committed_data);
-
-		/*
-		 * We clear the bit in the bitmap after setting the committed
-		 * data bit, because this is the reverse order to that which
-		 * the allocator uses.
-		 */
+	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
 						bit + i, bitmap_bh->b_data)) {
-			jbd_unlock_bh_state(bitmap_bh);
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
 				   (ext4_fsblk_t)(block + i));
-			jbd_lock_bh_state(bitmap_bh);
 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
 		} else {
-			group_freed++;
+			blocks_freed++;
 		}
 	}
-	jbd_unlock_bh_state(bitmap_bh);
-
 	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
+	le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
-	percpu_counter_add(&sbi->s_freeblocks_counter, count);
+	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
-		sbi->s_flex_groups[flex_group].free_blocks += count;
+		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
 		spin_unlock(sb_bgl_lock(sbi, flex_group));
 	}
 
@@ -536,15 +458,17 @@ do_more:
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
 	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-	if (!err) err = ret;
-	*pdquot_freed_blocks += group_freed;
-
-	if (overflow && !err) {
-		block += count;
-		count = overflow;
-		goto do_more;
-	}
+	if (!err)
+		err = ret;
 	sb->s_dirt = 1;
+	/*
+	 * request to reload the buddy with the
+	 * new bitmap information
+	 */
+	grp = ext4_get_group_info(sb, block_group);
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+	ext4_mb_update_group_info(grp, blocks_freed);
+
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5125c1f6e7e..8021bf558d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1014,9 +1014,8 @@ extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
 extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
-				ext4_fsblk_t block, unsigned long count,
-				unsigned long *pdquot_freed_blocks);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+				ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
 extern void ext4_check_blocks_bitmap(struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 1865d6a53de..526db73701b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -977,9 +977,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	struct buffer_head *bh;
 	handle_t *handle;
 	int err;
-	unsigned long freed_blocks;
 	ext4_group_t group;
-	struct ext4_group_info *grp;
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
@@ -1077,7 +1075,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	unlock_super(sb);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
-	ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
+	/* We add the blocks to the bitmap and set the group need init bit */
+	ext4_add_groupblocks(handle, sb, o_blocks_count, add);
 	ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	if ((err = ext4_journal_stop(handle)))
@@ -1120,12 +1119,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			ClearPageUptodate(page);
 			page_cache_release(page);
 		}
-
-		/* Get the info on the last group */
-		grp = ext4_get_group_info(sb, group);
-
-		/* Update free blocks in group info */
-		ext4_mb_update_group_info(grp, add);
 	}
 
 	if (test_opt(sb, DEBUG))
-- 
cgit v1.2.3


From 920313a726e04fef0f2c0bcb04ad8229c0e700d8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:36:19 -0500
Subject: ext4: Use EXT4_GROUP_INFO_NEED_INIT_BIT during resize

The new groups added during resize are flagged as
need_init groups. Make sure we properly initialize these
groups. When the block size is smaller than the page size and we
are adding new groups, the page may still be marked uptodate even
though we haven't initialized the group. While forcing the init
of the buddy cache, we need to make sure that other groups sharing
the same buddy cache page are not using the cache.
group_info->alloc_sem is added to ensure this.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
cc: stable@kernel.org
---
 fs/ext4/balloc.c  |  21 +++--
 fs/ext4/ext4.h    |   7 +-
 fs/ext4/mballoc.c | 261 ++++++++++++++++++++++++++++++++++++++++++------------
 fs/ext4/mballoc.h |   3 +
 fs/ext4/resize.c  |  49 ++--------
 5 files changed, 230 insertions(+), 111 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index c54192e2384..404d81cc915 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -381,6 +381,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
 
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	grp = ext4_get_group_info(sb, block_group);
 	/*
 	 * Check to see if we are freeing blocks across a group
 	 * boundary.
@@ -425,7 +426,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	err = ext4_journal_get_write_access(handle, gd_bh);
 	if (err)
 		goto error_return;
-
+	/*
+	 * make sure we don't allow a parallel init on other groups in the
+	 * same buddy cache
+	 */
+	down_write(&grp->alloc_sem);
 	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
 		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
@@ -450,6 +455,13 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
 		spin_unlock(sb_bgl_lock(sbi, flex_group));
 	}
+	/*
+	 * request to reload the buddy with the
+	 * new bitmap information
+	 */
+	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+	ext4_mb_update_group_info(grp, blocks_freed);
+	up_write(&grp->alloc_sem);
 
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -461,13 +473,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	if (!err)
 		err = ret;
 	sb->s_dirt = 1;
-	/*
-	 * request to reload the buddy with the
-	 * new bitmap information
-	 */
-	grp = ext4_get_group_info(sb, block_group);
-	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
-	ext4_mb_update_group_info(grp, blocks_freed);
 
 error_return:
 	brelse(bitmap_bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8021bf558d1..8152b5603f0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1060,12 +1060,13 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 		unsigned long, unsigned long, int, unsigned long *);
-extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
 		ext4_grpblk_t add);
-
-
+extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
+extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
+						ext4_group_t, int);
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 		struct buffer_head *bh, ext4_fsblk_t blocknr);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index edf9730ba72..d2b1bcaf88e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -886,18 +886,20 @@ static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
 {
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
 	int blocks_per_page;
 	int block;
 	int pnum;
 	int poff;
 	struct page *page;
 	int ret;
+	struct ext4_group_info *grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
 
 	mb_debug("load group %u\n", group);
 
 	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	grp = ext4_get_group_info(sb, group);
 
 	e4b->bd_blkbits = sb->s_blocksize_bits;
 	e4b->bd_info = ext4_get_group_info(sb, group);
@@ -905,6 +907,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	e4b->bd_group = group;
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
+	e4b->alloc_semp = &grp->alloc_sem;
+
+	/* Take the read lock on the group alloc
+	 * sem. This would make sure a parallel
+	 * ext4_mb_init_group happening on other
+	 * groups mapped by the page is blocked
+	 * till we are done with allocation
+	 */
+	down_read(e4b->alloc_semp);
 
 	/*
 	 * the buddy cache inode stores the block bitmap
@@ -920,6 +931,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	page = find_get_page(inode->i_mapping, pnum);
 	if (page == NULL || !PageUptodate(page)) {
 		if (page)
+			/*
+			 * drop the page reference and try
+			 * to get the page with lock. If we
+			 * are not uptodate that implies
+			 * somebody just created the page but
+			 * is yet to initialize the same. So
+			 * wait for it to initialize.
+			 */
 			page_cache_release(page);
 		page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 		if (page) {
@@ -985,6 +1004,9 @@ err:
 		page_cache_release(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
+
+	/* Done with the buddy cache */
+	up_read(e4b->alloc_semp);
 	return ret;
 }
 
@@ -994,6 +1016,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
+	/* Done with the buddy cache */
+	up_read(e4b->alloc_semp);
 }
 
 
@@ -1696,6 +1720,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
+/*
+ * Lock the group_info alloc_sem of all the groups
+ * belonging to the same buddy cache page. This
+ * makes sure no other parallel operation on the buddy
+ * cache happens while we are holding the buddy cache
+ * lock.
+ */
+int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	int groups_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+
+	groups_per_page = blocks_per_page >> 1;
+	if (groups_per_page == 0)
+		groups_per_page = 1;
+	/* read all groups the page covers into the cache */
+	for (i = 0; i < groups_per_page; i++) {
+
+		if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+			break;
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* take all groups write allocation
+		 * semaphore. This make sure there is
+		 * no block allocation going on in any
+		 * of that groups
+		 */
+		down_write(&grp->alloc_sem);
+	}
+	return i;
+}
+
+void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
+					ext4_group_t group, int locked_group)
+{
+	int i;
+	int block, pnum;
+	int blocks_per_page;
+	ext4_group_t first_group;
+	struct ext4_group_info *grp;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	first_group = pnum * blocks_per_page / 2;
+	/* release locks on all the groups */
+	for (i = 0; i < locked_group; i++) {
+
+		grp = ext4_get_group_info(sb, first_group + i);
+		/* drop the write allocation
+		 * semaphore that was taken on each
+		 * of these groups by
+		 * ext4_mb_get_buddy_cache_lock()
+		 */
+		up_write(&grp->alloc_sem);
+	}
+
+}
+
+static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+{
+
+	int ret;
+	void *bitmap;
+	int blocks_per_page;
+	int block, pnum, poff;
+	int num_grp_locked = 0;
+	struct ext4_group_info *this_grp;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode = sbi->s_buddy_cache;
+	struct page *page = NULL, *bitmap_page = NULL;
+
+	mb_debug("init group %lu\n", group);
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	this_grp = ext4_get_group_info(sb, group);
+	/*
+	 * This ensures we don't add group
+	 * to this buddy cache via resize
+	 */
+	num_grp_locked =  ext4_mb_get_buddy_cache_lock(sb, group);
+	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+		/*
+		 * somebody initialized the group
+		 * return without doing anything
+		 */
+		ret = 0;
+		goto err;
+	}
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, NULL);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+	bitmap_page = page;
+	bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	/* init buddy cache */
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (page == bitmap_page) {
+		/*
+		 * If both the bitmap and buddy are in
+		 * the same page we don't need to force
+		 * init the buddy
+		 */
+		unlock_page(page);
+	} else if (page) {
+		BUG_ON(page->mapping != inode->i_mapping);
+		ret = ext4_mb_init_cache(page, bitmap);
+		if (ret) {
+			unlock_page(page);
+			goto err;
+		}
+		unlock_page(page);
+	}
+	if (page == NULL || !PageUptodate(page)) {
+		ret = -EIO;
+		goto err;
+	}
+	mark_page_accessed(page);
+err:
+	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+	if (bitmap_page)
+		page_cache_release(bitmap_page);
+	if (page)
+		page_cache_release(page);
+	return ret;
+}
+
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -1779,7 +1970,7 @@ repeat:
 				group = 0;
 
 			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(ac->ac_sb, group);
+			grp = ext4_get_group_info(sb, group);
 			if (grp->bb_free == 0)
 				continue;
 
@@ -1792,10 +1983,9 @@ repeat:
 				 * we need full data about the group
 				 * to make a good selection
 				 */
-				err = ext4_mb_load_buddy(sb, group, &e4b);
+				err = ext4_mb_init_group(sb, group);
 				if (err)
 					goto out;
-				ext4_mb_release_desc(&e4b);
 			}
 
 			/*
@@ -2246,7 +2436,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 
 
 /* Create and initialize ext4_group_info data for the given group. */
-static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
 	int i, len;
@@ -2304,6 +2494,7 @@ static int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	}
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root.rb_node = NULL;;
 
 #ifdef DOUBLE_CHECK
@@ -2330,54 +2521,6 @@ exit_meta_group_info:
 	return -ENOMEM;
 } /* ext4_mb_add_groupinfo */
 
-/*
- * Add a group to the existing groups.
- * This function is used for online resize
- */
-int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
-			       struct ext4_group_desc *desc)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	int blocks_per_page;
-	int block;
-	int pnum;
-	struct page *page;
-	int err;
-
-	/* Add group based on group descriptor*/
-	err = ext4_mb_add_groupinfo(sb, group, desc);
-	if (err)
-		return err;
-
-	/*
-	 * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
-	 * datas) are set not up to date so that they will be re-initilaized
-	 * during the next call to ext4_mb_load_buddy
-	 */
-
-	/* Set buddy page as not up to date */
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	page = find_get_page(inode->i_mapping, pnum);
-	if (page != NULL) {
-		ClearPageUptodate(page);
-		page_cache_release(page);
-	}
-
-	/* Set bitmap page as not up to date */
-	block++;
-	pnum = block / blocks_per_page;
-	page = find_get_page(inode->i_mapping, pnum);
-	if (page != NULL) {
-		ClearPageUptodate(page);
-		page_cache_release(page);
-	}
-
-	return 0;
-}
-
 /*
  * Update an existing group.
  * This function is used for online resize
@@ -4588,11 +4731,6 @@ do_more:
 	err = ext4_journal_get_write_access(handle, gd_bh);
 	if (err)
 		goto error_return;
-
-	err = ext4_mb_load_buddy(sb, block_group, &e4b);
-	if (err)
-		goto error_return;
-
 #ifdef AGGRESSIVE_CHECK
 	{
 		int i;
@@ -4606,6 +4744,8 @@ do_more:
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+	if (err)
+		goto error_return;
 
 	if (ac) {
 		ac->ac_b_ex.fe_group = block_group;
@@ -4614,6 +4754,9 @@ do_more:
 		ext4_mb_store_history(ac);
 	}
 
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
 	if (metadata && ext4_handle_valid(handle)) {
 		/* blocks being freed are metadata. these blocks shouldn't
 		 * be used until this transaction is committed */
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b5dff1fff1e..a931b6b4f6a 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -20,6 +20,7 @@
 #include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/marker.h>
+#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -130,6 +131,7 @@ struct ext4_group_info {
 #ifdef DOUBLE_CHECK
 	void		*bb_bitmap;
 #endif
+	struct rw_semaphore alloc_sem;
 	unsigned short	bb_counters[];
 };
 
@@ -250,6 +252,7 @@ struct ext4_buddy {
 	struct super_block *bd_sb;
 	__u16 bd_blkbits;
 	ext4_group_t bd_group;
+	struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 526db73701b..92034d2c8a7 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -748,6 +748,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	struct inode *inode = NULL;
 	handle_t *handle;
 	int gdb_off, gdb_num;
+	int num_grp_locked = 0;
 	int err, err2;
 
 	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -788,6 +789,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		}
 	}
 
+
 	if ((err = verify_group_input(sb, input)))
 		goto exit_put;
 
@@ -856,6 +858,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
          * using the new disk blocks.
          */
 
+	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 	/* Update group descriptor block for new group */
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
@@ -872,9 +875,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 * We can allocate memory for mb_alloc based on the new group
 	 * descriptor
 	 */
-	err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
-	if (err)
+	err = ext4_mb_add_groupinfo(sb, input->group, gdp);
+	if (err) {
+		ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 		goto exit_journal;
+	}
 
 	/*
 	 * Make the new blocks and inodes valid next.  We do this before
@@ -916,6 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 
 	/* Update the global fs size fields */
 	sbi->s_groups_count++;
+	ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 
 	ext4_handle_dirty_metadata(handle, NULL, primary);
 
@@ -1082,45 +1088,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 	if ((err = ext4_journal_stop(handle)))
 		goto exit_put;
 
-	/*
-	 * Mark mballoc pages as not up to date so that they will be updated
-	 * next time they are loaded by ext4_mb_load_buddy.
-	 *
-	 * XXX Bad, Bad, BAD!!!  We should not be overloading the
-	 * Uptodate flag, particularly on thte bitmap bh, as way of
-	 * hinting to ext4_mb_load_buddy() that it needs to be
-	 * overloaded.  A user could take a LVM snapshot, then do an
-	 * on-line fsck, and clear the uptodate flag, and this would
-	 * not be a bug in userspace, but a bug in the kernel.  FIXME!!!
-	 */
-	{
-		struct ext4_sb_info *sbi = EXT4_SB(sb);
-		struct inode *inode = sbi->s_buddy_cache;
-		int blocks_per_page;
-		int block;
-		int pnum;
-		struct page *page;
-
-		/* Set buddy page as not up to date */
-		blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-		block = group * 2;
-		pnum = block / blocks_per_page;
-		page = find_get_page(inode->i_mapping, pnum);
-		if (page != NULL) {
-			ClearPageUptodate(page);
-			page_cache_release(page);
-		}
-
-		/* Set bitmap page as not up to date */
-		block++;
-		pnum = block / blocks_per_page;
-		page = find_get_page(inode->i_mapping, pnum);
-		if (page != NULL) {
-			ClearPageUptodate(page);
-			page_cache_release(page);
-		}
-	}
-
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
 		       ext4_blocks_count(es));
-- 
cgit v1.2.3


From c3a326a657562dab81acf05aee106dc1fe345eb4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 25 Nov 2008 15:11:52 -0500
Subject: ext4: cleanup mballoc header files

Move some of the forward declarations of the static functions
to mballoc.c, where they are used. This enables us to include
mballoc.h in other .c files. Also correct the buddy cache
documentation.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 23 +++++++++++++++++++----
 fs/ext4/mballoc.h | 18 +-----------------
 2 files changed, 20 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d2b1bcaf88e..c17063ddb30 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
  * inode as:
  *
  *  {                        page                        }
- *  [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.  So for each group we
@@ -330,6 +330,16 @@
  *        object
  *
  */
+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
+static struct kmem_cache *ext4_free_ext_cachep;
+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+					ext4_group_t group);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+
+
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -716,7 +726,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
  * stored in the inode as
  *
  * {                        page                        }
- * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
+ * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
  *
  *
  * one block each for bitmap and buddy information.
@@ -1322,8 +1332,13 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	ac->ac_tail = ret & 0xffff;
 	ac->ac_buddy = ret >> 16;
 
-	/* XXXXXXX: SUCH A HORRIBLE **CK */
-	/*FIXME!! Why ? */
+	/*
+	 * take the page reference. We want the page to be pinned
+	 * so that we don't get an ext4_mb_init_cache call for this
+	 * group until we update the bitmap. That would mean we
+	 * double allocate blocks. The reference is dropped
+	 * in ext4_mb_release_context
+	 */
 	ac->ac_bitmap_page = e4b->bd_bitmap_page;
 	get_page(ac->ac_bitmap_page);
 	ac->ac_buddy_page = e4b->bd_buddy_page;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index a931b6b4f6a..997f78fff12 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -99,9 +99,6 @@
  */
 #define MB_DEFAULT_GROUP_PREALLOC	512
 
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
 
 struct ext4_free_data {
 	/* this links the free block information from group_info */
@@ -262,25 +259,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 {
 	return;
 }
-#else
-static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif
 
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
-					ext4_group_t group);
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-					struct ext4_buddy *e4b, sector_t block,
-					int count);
-static void ext4_mb_put_pa(struct ext4_allocation_context *,
-			struct super_block *, struct ext4_prealloc_space *pa);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
-
 
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
@@ -306,7 +290,7 @@ static inline int ext4_is_group_locked(struct super_block *sb,
 						&(grinfo->bb_state));
 }
 
-static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
+static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
 	ext4_fsblk_t block;
-- 
cgit v1.2.3


From fb68407b0d9efba962c03f55009c797e22f024bc Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 6 Nov 2008 17:50:21 -0500
Subject: jbd2: Call journal commit callback without holding j_list_lock

Avoid freeing the transaction in __jbd2_journal_drop_transaction() so
the journal commit callback can run without holding j_list_lock, to
avoid lock contention on this spinlock.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c |  2 +-
 fs/jbd2/commit.c     | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index adc08ec875e..17159cacbd9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -682,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 	   safely remove this transaction from the log */
 
 	__jbd2_journal_drop_transaction(journal, transaction);
+	kfree(transaction);
 
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
@@ -756,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 	J_ASSERT(journal->j_running_transaction != transaction);
 
 	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-	kfree(transaction);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f22d1828ea8..0ad84162c42 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -363,7 +363,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	int space_left = 0;
 	int first_tag = 0;
 	int tag_flag;
-	int i;
+	int i, to_free = 0;
 	int tag_bytes = journal_tag_bytes(journal);
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
@@ -1011,12 +1011,10 @@ restart_loop:
 		journal->j_average_commit_time = commit_time;
 	spin_unlock(&journal->j_state_lock);
 
-	if (journal->j_commit_callback)
-		journal->j_commit_callback(journal, commit_transaction);
-
 	if (commit_transaction->t_checkpoint_list == NULL &&
 	    commit_transaction->t_checkpoint_io_list == NULL) {
 		__jbd2_journal_drop_transaction(journal, commit_transaction);
+		to_free = 1;
 	} else {
 		if (journal->j_checkpoint_transactions == NULL) {
 			journal->j_checkpoint_transactions = commit_transaction;
@@ -1035,11 +1033,16 @@ restart_loop:
 	}
 	spin_unlock(&journal->j_list_lock);
 
+	if (journal->j_commit_callback)
+		journal->j_commit_callback(journal, commit_transaction);
+
 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
-		   journal->j_devname, journal->j_commit_sequence,
+		   journal->j_devname, commit_transaction->t_tid,
 		   journal->j_tail_sequence);
 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
+	if (to_free)
+		kfree(commit_transaction);
 
 	wake_up(&journal->j_wait_done_commit);
 }
-- 
cgit v1.2.3


From 4a69a41009c4ac691f7d9c289f5f37fabeddce46 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 6 Nov 2008 22:03:00 -0500
Subject: Btrfs: Add ordered async work queues

Btrfs uses kernel threads to create async work queues for cpu intensive
operations such as checksumming and decompression.  These work well,
but they make it difficult to keep IO order intact.

A single writepages call from pdflush or fsync will turn into a number
of bios, and each bio is checksummed in parallel.  Once the checksum is
computed, the bio is sent down to the disk, and since we don't control
the order in which the parallel operations happen, they might go down to
the disk in almost any order.

The code deals with this somewhat by having deep work queues for a single
kernel thread, making it very likely that a single thread will process all
the bios for a single inode.

This patch introduces an explicitly ordered work queue.  As work structs
are placed into the queue they are put onto the tail of a list.  They have
three callbacks:

->func (cpu intensive processing here)
->ordered_func (order sensitive processing here)
->ordered_free (free the work struct, all processing is done)

The work struct has three callbacks.  The func callback does the cpu intensive
work, and when it completes the work struct is marked as done.

Every time a work struct completes, the list is checked to see if the head
is marked as done.  If so the ordered_func callback is used to do the
order sensitive processing and the ordered_free callback is used to do
any cleanup.  Then we loop back and check the head of the list again.

This patch also changes the checksumming code to use the ordered workqueues.
On a 4 drive array, it increases streaming writes from 280MB/s to 350MB/s.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 67 ++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/async-thread.h | 18 ++++++++++-
 fs/btrfs/disk-io.c      | 81 ++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/disk-io.h      |  4 ++-
 fs/btrfs/inode.c        | 19 ++++++++++--
 5 files changed, 158 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index d82efd722a4..e1e49715459 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -23,6 +23,10 @@
 # include <linux/freezer.h>
 #include "async-thread.h"
 
+#define WORK_QUEUED_BIT 0
+#define WORK_DONE_BIT 1
+#define WORK_ORDER_DONE_BIT 2
+
 /*
  * container for the kthread task pointer and the list of pending work
  * One of these is allocated per thread.
@@ -88,6 +92,47 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
 	}
 }
 
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+					    struct btrfs_work *work)
+{
+	unsigned long flags;
+
+	if (!workers->ordered)
+		return 0;
+
+	set_bit(WORK_DONE_BIT, &work->flags);
+
+	spin_lock_irqsave(&workers->lock, flags);
+
+	while(!list_empty(&workers->order_list)) {
+		work = list_entry(workers->order_list.next,
+				  struct btrfs_work, order_list);
+
+		if (!test_bit(WORK_DONE_BIT, &work->flags))
+			break;
+
+		/* we are going to call the ordered done function, but
+		 * we leave the work item on the list as a barrier so
+		 * that later work items that are done don't have their
+		 * functions called before this one returns
+		 */
+		if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
+			break;
+
+		spin_unlock_irqrestore(&workers->lock, flags);
+
+		work->ordered_func(work);
+
+		/* now take the lock again and call the freeing code */
+		spin_lock_irqsave(&workers->lock, flags);
+		list_del(&work->order_list);
+		work->ordered_free(work);
+	}
+
+	spin_unlock_irqrestore(&workers->lock, flags);
+	return 0;
+}
+
 /*
  * main loop for servicing work items
  */
@@ -102,7 +147,7 @@ static int worker_loop(void *arg)
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
-			clear_bit(0, &work->flags);
+			clear_bit(WORK_QUEUED_BIT, &work->flags);
 
 			work->worker = worker;
 			spin_unlock_irq(&worker->lock);
@@ -110,8 +155,15 @@ static int worker_loop(void *arg)
 			work->func(work);
 
 			atomic_dec(&worker->num_pending);
+			/*
+			 * unless this is an ordered work queue,
+			 * 'work' was probably freed by func above.
+			 */
+			run_ordered_completions(worker->workers, work);
+
 			spin_lock_irq(&worker->lock);
 			check_idle_worker(worker);
+
 		}
 		worker->working = 0;
 		if (freezing(current)) {
@@ -154,10 +206,12 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	workers->num_workers = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
+	INIT_LIST_HEAD(&workers->order_list);
 	spin_lock_init(&workers->lock);
 	workers->max_workers = max;
 	workers->idle_thresh = 32;
 	workers->name = name;
+	workers->ordered = 0;
 }
 
 /*
@@ -296,7 +350,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
 
-	if (test_and_set_bit(0, &work->flags))
+	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
@@ -330,10 +384,17 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	int wake = 0;
 
 	/* don't requeue something already on a list */
-	if (test_and_set_bit(0, &work->flags))
+	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
 
 	worker = find_worker(workers);
+	if (workers->ordered) {
+		spin_lock_irqsave(&workers->lock, flags);
+		list_add_tail(&work->order_list, &workers->order_list);
+		spin_unlock_irqrestore(&workers->lock, flags);
+	} else {
+		INIT_LIST_HEAD(&work->order_list);
+	}
 
 	spin_lock_irqsave(&worker->lock, flags);
 	atomic_inc(&worker->num_pending);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 4ec9a2ee0f9..31be4ed8b63 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -37,10 +37,16 @@ struct btrfs_worker_thread;
  */
 struct btrfs_work {
 	/*
-	 * only func should be set to the function you want called
+	 * func should be set to the function you want called
 	 * your work struct is passed as the only arg
+	 *
+	 * ordered_func must be set for work sent to an ordered work queue,
+	 * and it is called to complete a given work item in the same
+	 * order they were sent to the queue.
 	 */
 	void (*func)(struct btrfs_work *work);
+	void (*ordered_func)(struct btrfs_work *work);
+	void (*ordered_free)(struct btrfs_work *work);
 
 	/*
 	 * flags should be set to zero.  It is used to make sure the
@@ -51,6 +57,7 @@ struct btrfs_work {
 	/* don't touch these */
 	struct btrfs_worker_thread *worker;
 	struct list_head list;
+	struct list_head order_list;
 };
 
 struct btrfs_workers {
@@ -63,6 +70,9 @@ struct btrfs_workers {
 	/* once a worker has this many requests or fewer, it is idle */
 	int idle_thresh;
 
+	/* force completions in the order they were queued */
+	int ordered;
+
 	/* list with all the work threads.  The workers on the idle thread
 	 * may be actively servicing jobs, but they haven't yet hit the
 	 * idle thresh limit above.
@@ -70,6 +80,12 @@ struct btrfs_workers {
 	struct list_head worker_list;
 	struct list_head idle_list;
 
+	/*
+	 * when operating in ordered mode, this maintains the list
+	 * of work items waiting for completion
+	 */
+	struct list_head order_list;
+
 	/* lock for finding the next worker thread to queue on */
 	spinlock_t lock;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94b4e50f6b2..e0a28f705a6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -80,7 +80,8 @@ struct async_submit_bio {
 	struct inode *inode;
 	struct bio *bio;
 	struct list_head list;
-	extent_submit_bio_hook_t *submit_bio_hook;
+	extent_submit_bio_hook_t *submit_bio_start;
+	extent_submit_bio_hook_t *submit_bio_done;
 	int rw;
 	int mirror_num;
 	unsigned long bio_flags;
@@ -452,7 +453,18 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 		btrfs_async_submit_limit(info);
 }
 
-static void run_one_async_submit(struct btrfs_work *work)
+static void run_one_async_start(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	async->submit_bio_start(async->inode, async->rw, async->bio,
+			       async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_done(struct btrfs_work *work)
 {
 	struct btrfs_fs_info *fs_info;
 	struct async_submit_bio *async;
@@ -470,15 +482,23 @@ static void run_one_async_submit(struct btrfs_work *work)
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
-	async->submit_bio_hook(async->inode, async->rw, async->bio,
+	async->submit_bio_done(async->inode, async->rw, async->bio,
 			       async->mirror_num, async->bio_flags);
+}
+
+static void run_one_async_free(struct btrfs_work *work)
+{
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct  async_submit_bio, work);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags,
-			extent_submit_bio_hook_t *submit_bio_hook)
+			extent_submit_bio_hook_t *submit_bio_start,
+			extent_submit_bio_hook_t *submit_bio_done)
 {
 	struct async_submit_bio *async;
 	int limit = btrfs_async_submit_limit(fs_info);
@@ -491,8 +511,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->rw = rw;
 	async->bio = bio;
 	async->mirror_num = mirror_num;
-	async->submit_bio_hook = submit_bio_hook;
-	async->work.func = run_one_async_submit;
+	async->submit_bio_start = submit_bio_start;
+	async->submit_bio_done = submit_bio_done;
+
+	async->work.func = run_one_async_start;
+	async->work.ordered_func = run_one_async_done;
+	async->work.ordered_free = run_one_async_free;
+
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
 
@@ -533,29 +558,25 @@ static int btree_csum_one_bio(struct bio *bio)
 	return 0;
 }
 
-static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+static int __btree_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	if (rw & (1 << BIO_RW)) {
-		btree_csum_one_bio(bio);
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-				     mirror_num, 1);
-	}
+	btree_csum_one_bio(bio);
+	return 0;
+}
 
+static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
 	/*
-	 * called for a read, do the setup so that checksum validation
-	 * can happen in the async kernel threads
+	 * when we're called for a write, we're already in the async
+	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
-	BUG_ON(ret);
-
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
@@ -567,11 +588,22 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
+		int ret;
+		/*
+		 * called for a read, do the setup so that checksum validation
+		 * can happen in the async kernel threads
+		 */
+		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+					  bio, 1);
+		BUG_ON(ret);
+
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 1);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
-				   __btree_submit_bio_hook);
+				   __btree_submit_bio_start,
+				   __btree_submit_bio_done);
 }
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
@@ -1534,7 +1566,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * were sent by the writeback daemons, improving overall locality
 	 * of the IO going down the pipe.
 	 */
-	fs_info->workers.idle_thresh = 128;
+	fs_info->workers.idle_thresh = 8;
+	fs_info->workers.ordered = 1;
 
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4eb1f1408d2..b8d5948fa27 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,7 +72,9 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags,
-			extent_submit_bio_hook_t *submit_bio_hook);
+			extent_submit_bio_hook_t *submit_bio_start,
+			extent_submit_bio_hook_t *submit_bio_done);
+
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 806708dd7e3..3df0ffad976 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -881,7 +881,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -889,7 +889,21 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	ret = btrfs_csum_one_bio(root, inode, bio);
 	BUG_ON(ret);
+	return 0;
+}
 
+/*
+ * second half of an async data write submission.  The checksums were
+ * already computed by __btrfs_submit_bio_start and attached onto the
+ * ordered extent record.
+ *
+ * This is passed as the submit_bio_done hook, so it runs as the ordered
+ * step of the work item and only passes the bio on to btrfs_map_bio.
+ */
+int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num, unsigned long bio_flags)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
@@ -922,7 +936,8 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   bio_flags, __btrfs_submit_bio_hook);
+				   bio_flags, __btrfs_submit_bio_start,
+				   __btrfs_submit_bio_done);
 	}
 
 mapit:
-- 
cgit v1.2.3


From 771ed689d2cd53439e28e095bc38fbe40a71429e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 6 Nov 2008 22:02:51 -0500
Subject: Btrfs: Optimize compressed writeback and reads

When reading compressed extents, try to put pages into the page cache
for any pages covered by the compressed extent that readpages didn't already
preload.

Add an async work queue to handle transformations at delayed allocation processing
time.  Right now this is just compression.  The workflow is:

1) Find offsets in the file marked for delayed allocation
2) Lock the pages
3) Lock the state bits
4) Call the async delalloc code

The async delalloc code clears the state lock bits and delalloc bits.  It is
important this happens before the range goes into the work queue because
otherwise it might deadlock with other work queue items that try to lock
those extent bits.

The file pages are compressed, and if the compression doesn't work the
pages are written back directly.

An ordered work queue is used to make sure the inodes are written in the same
order that pdflush or writepages sent them down.

This changes extent_write_cache_pages to let the writepage function
update the wbc nr_to_write count.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  | 150 ++++++++++-
 fs/btrfs/ctree.h        |   4 +-
 fs/btrfs/disk-io.c      |  27 +-
 fs/btrfs/extent-tree.c  |   6 +-
 fs/btrfs/extent_io.c    | 140 +++++++++--
 fs/btrfs/extent_io.h    |  13 +-
 fs/btrfs/file.c         |  53 ++--
 fs/btrfs/inode.c        | 643 +++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ordered-data.c |  13 +-
 fs/btrfs/super.c        |   4 +
 fs/btrfs/zlib.c         |   3 +-
 11 files changed, 849 insertions(+), 207 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 354913177ba..284f21025bc 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -33,6 +33,7 @@
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
+#include <linux/pagevec.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -145,9 +146,9 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	}
 
 	/* do io completion on the original bio */
-	if (cb->errors)
+	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
-	else
+	} else
 		bio_endio(cb->orig_bio, 0);
 
 	/* finally free the cb struct */
@@ -333,6 +334,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		}
 		bytes_left -= PAGE_CACHE_SIZE;
 		first_byte += PAGE_CACHE_SIZE;
+		cond_resched();
 	}
 	bio_get(bio);
 
@@ -346,6 +348,130 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	return 0;
 }
 
+static noinline int add_ra_bio_pages(struct inode *inode,
+				     u64 compressed_end,
+				     struct compressed_bio *cb)
+{
+	unsigned long end_index;
+	unsigned long page_index;
+	u64 last_offset;
+	u64 isize = i_size_read(inode);
+	int ret;
+	struct page *page;
+	unsigned long nr_pages = 0;
+	struct extent_map *em;
+	struct address_space *mapping = inode->i_mapping;
+	struct pagevec pvec;
+	struct extent_map_tree *em_tree;
+	struct extent_io_tree *tree;
+	u64 end;
+	int misses = 0;
+
+	page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
+	last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	tree = &BTRFS_I(inode)->io_tree;
+
+	if (isize == 0)
+		return 0;
+
+	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+
+	pagevec_init(&pvec, 0);
+	while(last_offset < compressed_end) {
+		page_index = last_offset >> PAGE_CACHE_SHIFT;
+
+		if (page_index > end_index)
+			break;
+
+		rcu_read_lock();
+		page = radix_tree_lookup(&mapping->page_tree, page_index);
+		rcu_read_unlock();
+		if (page) {
+			misses++;
+			if (misses > 4)
+				break;
+			goto next;
+		}
+
+		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		if (!page)
+			break;
+
+		page->index = page_index;
+		/*
+		 * what we want to do here is call add_to_page_cache_lru,
+		 * but that isn't exported, so we reproduce it here
+		 */
+		if (add_to_page_cache(page, mapping,
+				      page->index, GFP_NOFS)) {
+			page_cache_release(page);
+			goto next;
+		}
+
+		/* open coding of lru_cache_add, also not exported */
+		page_cache_get(page);
+		if (!pagevec_add(&pvec, page))
+			__pagevec_lru_add(&pvec);
+
+		end = last_offset + PAGE_CACHE_SIZE - 1;
+		/*
+		 * at this point, we have a locked page in the page cache
+		 * for these bytes in the file.  But, we have to make
+		 * sure they map to this compressed extent on disk.
+		 */
+		set_page_extent_mapped(page);
+		lock_extent(tree, last_offset, end, GFP_NOFS);
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, last_offset,
+					   PAGE_CACHE_SIZE);
+		spin_unlock(&em_tree->lock);
+
+		if (!em || last_offset < em->start ||
+		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+		    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+			free_extent_map(em);
+			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+		free_extent_map(em);
+
+		if (page->index == end_index) {
+			char *userpage;
+			size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+
+			if (zero_offset) {
+				int zeros;
+				zeros = PAGE_CACHE_SIZE - zero_offset;
+				userpage = kmap_atomic(page, KM_USER0);
+				memset(userpage + zero_offset, 0, zeros);
+				flush_dcache_page(page);
+				kunmap_atomic(userpage, KM_USER0);
+			}
+		}
+
+		ret = bio_add_page(cb->orig_bio, page,
+				   PAGE_CACHE_SIZE, 0);
+
+		if (ret == PAGE_CACHE_SIZE) {
+			nr_pages++;
+			page_cache_release(page);
+		} else {
+			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			break;
+		}
+next:
+		last_offset += PAGE_CACHE_SIZE;
+	}
+	if (pagevec_count(&pvec))
+		__pagevec_lru_add(&pvec);
+	return 0;
+}
+
 /*
  * for a compressed read, the bio we get passed has all the inode pages
  * in it.  We don't actually do IO on those pages but allocate new ones
@@ -373,6 +499,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	u64 em_len;
 	struct extent_map *em;
 	int ret;
 
@@ -393,6 +520,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->start = em->start;
 	compressed_len = em->block_len;
+	em_len = em->len;
 	free_extent_map(em);
 
 	cb->len = uncompressed_len;
@@ -411,6 +539,17 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
+	add_ra_bio_pages(inode, cb->start + em_len, cb);
+
+	if (!btrfs_test_opt(root, NODATASUM) &&
+	    !btrfs_test_flag(inode, NODATASUM)) {
+		btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
+	}
+
+	/* include any pages we added in add_ra_bio_pages */
+	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	cb->len = uncompressed_len;
+
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
@@ -442,9 +581,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
 			atomic_inc(&cb->pending_bios);
-			bio->bi_private = cb;
-			bio->bi_end_io = end_compressed_bio_write;
-			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
 		}
 		cur_disk_byte += PAGE_CACHE_SIZE;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 689df070c8e..c83cc5b2ded 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -625,8 +625,8 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
-	wait_queue_head_t async_submit_wait;
 
+	wait_queue_head_t async_submit_wait;
 	wait_queue_head_t tree_log_wait;
 
 	struct btrfs_super_block super_copy;
@@ -653,6 +653,7 @@ struct btrfs_fs_info {
 	atomic_t nr_async_submits;
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
+	atomic_t async_delalloc_pages;
 	atomic_t tree_log_writers;
 	atomic_t tree_log_commit;
 	unsigned long tree_log_batch;
@@ -677,6 +678,7 @@ struct btrfs_fs_info {
 	 * two
 	 */
 	struct btrfs_workers workers;
+	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e0a28f705a6..8efc123d222 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -539,6 +539,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   (atomic_read(&fs_info->nr_async_bios) < limit),
 			   HZ/10);
 	}
+
+	while(atomic_read(&fs_info->async_submit_draining) &&
+	      atomic_read(&fs_info->nr_async_submits)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0));
+	}
+
 	return 0;
 }
 
@@ -1437,6 +1444,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
@@ -1550,6 +1558,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
 
+	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
+			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1560,15 +1571,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	fs_info->submit_workers.idle_thresh = 64;
 
-	/* fs_info->workers is responsible for checksumming file data
-	 * blocks and metadata.  Using a larger idle thresh allows each
-	 * worker thread to operate on things in roughly the order they
-	 * were sent by the writeback daemons, improving overall locality
-	 * of the IO going down the pipe.
-	 */
-	fs_info->workers.idle_thresh = 8;
+	fs_info->workers.idle_thresh = 16;
 	fs_info->workers.ordered = 1;
 
+	fs_info->delalloc_workers.idle_thresh = 2;
+	fs_info->delalloc_workers.ordered = 1;
+
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
 			   fs_info->thread_pool_size);
@@ -1584,6 +1592,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
+	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
@@ -1732,6 +1741,7 @@ fail_tree_root:
 fail_sys_array:
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -1988,6 +1998,7 @@ int close_ctree(struct btrfs_root *root)
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
 	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
@@ -2062,7 +2073,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 96 * 1024 * 1024;
+	unsigned long thresh = 32 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8af39521eb7..ebd8275a193 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -768,7 +768,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	l = path->nodes[0];
 
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	BUG_ON(key.objectid != bytenr);
+	if (key.objectid != bytenr) {
+		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
+		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
+		BUG();
+	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9b37ce6e516..bbe3bcfcf4a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -47,6 +47,11 @@ struct extent_page_data {
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+
+	/* tells writepage not to lock the state bits for this range
+	 * it still does the unlocking
+	 */
+	int extent_locked;
 };
 
 int __init extent_io_init(void)
@@ -1198,11 +1203,18 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 			 * the caller is taking responsibility for
 			 * locked_page
 			 */
-			if (pages[i] != locked_page)
+			if (pages[i] != locked_page) {
 				lock_page(pages[i]);
+				if (pages[i]->mapping != inode->i_mapping) {
+					ret = -EAGAIN;
+					unlock_page(pages[i]);
+					page_cache_release(pages[i]);
+					goto done;
+				}
+			}
 			page_cache_release(pages[i]);
+			pages_locked++;
 		}
-		pages_locked += ret;
 		nrpages -= ret;
 		index += ret;
 		cond_resched();
@@ -1262,8 +1274,7 @@ again:
 	 * if we're looping.
 	 */
 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
-		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
-			~((u64)PAGE_CACHE_SIZE - 1);
+		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
 	}
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
@@ -1306,7 +1317,10 @@ out_failed:
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int clear_dirty, int set_writeback,
+				int unlock_pages,
+				int clear_unlock,
+				int clear_delalloc, int clear_dirty,
+				int set_writeback,
 				int end_writeback)
 {
 	int ret;
@@ -1315,12 +1329,19 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
-	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+	int clear_bits = 0;
 
+	if (clear_unlock)
+		clear_bits |= EXTENT_LOCKED;
 	if (clear_dirty)
 		clear_bits |= EXTENT_DIRTY;
 
+	if (clear_delalloc)
+		clear_bits |= EXTENT_DELALLOC;
+
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+		return 0;
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1336,7 +1357,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				set_page_writeback(pages[i]);
 			if (end_writeback)
 				end_page_writeback(pages[i]);
-			unlock_page(pages[i]);
+			if (unlock_pages)
+				unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
 		nr_pages -= ret;
@@ -1741,9 +1763,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			}
 		}
 
-		if (uptodate)
+		if (uptodate) {
 			set_extent_uptodate(tree, start, end,
 					    GFP_ATOMIC);
+		}
 		unlock_extent(tree, start, end, GFP_ATOMIC);
 
 		if (whole_page) {
@@ -1925,6 +1948,7 @@ void set_page_extent_mapped(struct page *page)
 		set_page_private(page, EXTENT_PAGE_PRIVATE);
 	}
 }
+EXPORT_SYMBOL(set_page_extent_mapped);
 
 void set_page_extent_head(struct page *page, unsigned long len)
 {
@@ -2143,12 +2167,17 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	u64 delalloc_end;
 	int page_started;
 	int compressed;
+	unsigned long nr_written = 0;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0);
+		if (epd->extent_locked) {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		}
 		unlock_page(page);
 		return 0;
 	}
@@ -2169,27 +2198,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_start = start;
 	delalloc_end = 0;
 	page_started = 0;
-	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(inode, tree,
+	if (!epd->extent_locked) {
+		while(delalloc_end < page_end) {
+			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
 						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
-		if (nr_delalloc == 0) {
+			if (nr_delalloc == 0) {
+				delalloc_start = delalloc_end + 1;
+				continue;
+			}
+			tree->ops->fill_delalloc(inode, page, delalloc_start,
+						 delalloc_end, &page_started,
+						 &nr_written);
 			delalloc_start = delalloc_end + 1;
-			continue;
 		}
-		tree->ops->fill_delalloc(inode, page, delalloc_start,
-					 delalloc_end, &page_started);
-		delalloc_start = delalloc_end + 1;
-	}
 
-	/* did the fill delalloc function already unlock and start the IO? */
-	if (page_started) {
-		return 0;
+		/* did the fill delalloc function already unlock and start
+		 * the IO?
+		 */
+		if (page_started) {
+			ret = 0;
+			goto update_nr_written;
+		}
 	}
-
 	lock_extent(tree, start, page_end, GFP_NOFS);
+
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
@@ -2199,10 +2234,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
 			unlock_page(page);
-			return 0;
+			ret = 0;
+			goto update_nr_written;
 		}
 	}
 
+	nr_written++;
+
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
 		printk("found delalloc bits after lock_extent\n");
@@ -2333,6 +2371,12 @@ done:
 	if (unlock_start <= page_end)
 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
+
+update_nr_written:
+	wbc->nr_to_write -= nr_written;
+	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
+	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
+		page->mapping->writeback_index = page->index + nr_written;
 	return 0;
 }
 
@@ -2431,7 +2475,7 @@ retry:
 				unlock_page(page);
 				ret = 0;
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
+			if (ret || wbc->nr_to_write <= 0)
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
 				wbc->encountered_congestion = 1;
@@ -2452,6 +2496,8 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
+		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+			range_whole = 1;
 
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
@@ -2469,6 +2515,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.bio = NULL,
 		.tree = tree,
 		.get_extent = get_extent,
+		.extent_locked = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.bdi		= wbc->bdi,
@@ -2491,6 +2538,52 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 }
 EXPORT_SYMBOL(extent_write_full_page);
 
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode)
+{
+	int ret = 0;
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page;
+	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.get_extent = get_extent,
+		.extent_locked = 1,
+	};
+	struct writeback_control wbc_writepages = {
+		.bdi		= inode->i_mapping->backing_dev_info,
+		.sync_mode	= mode,
+		.older_than_this = NULL,
+		.nr_to_write	= nr_pages * 2,
+		.range_start	= start,
+		.range_end	= end + 1,
+	};
+
+	while(start <= end) {
+		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		if (clear_page_dirty_for_io(page))
+			ret = __extent_writepage(page, &wbc_writepages, &epd);
+		else {
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, start,
+						 start + PAGE_CACHE_SIZE - 1,
+						 NULL, 1);
+			unlock_page(page);
+		}
+		page_cache_release(page);
+		start += PAGE_CACHE_SIZE;
+	}
+
+	if (epd.bio)
+		submit_one_bio(WRITE, epd.bio, 0, 0);
+	return ret;
+}
+EXPORT_SYMBOL(extent_write_locked_range);
+
 
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
@@ -2502,6 +2595,7 @@ int extent_writepages(struct extent_io_tree *tree,
 		.bio = NULL,
 		.tree = tree,
 		.get_extent = get_extent,
+		.extent_locked = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 283110ec4ee..2d5f67065b6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -35,7 +35,8 @@ typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       unsigned long bio_flags);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
-			     u64 start, u64 end, int *page_started);
+			     u64 start, u64 end, int *page_started,
+			     unsigned long *nr_written);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
@@ -172,6 +173,9 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc);
+int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
+			      u64 start, u64 end, get_extent_t *get_extent,
+			      int mode);
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
@@ -256,6 +260,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int clear_dirty, int set_writeback,
-				int clear_writeback);
+				int unlock_page,
+				int clear_unlock,
+				int clear_delalloc, int clear_dirty,
+				int set_writeback,
+				int end_writeback);
 #endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0c8cc35a8b9..337221ecca2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -368,6 +368,8 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 search_start = start;
 	u64 leaf_start;
 	u64 ram_bytes = 0;
+	u64 orig_parent = 0;
+	u64 disk_bytenr = 0;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
@@ -500,17 +502,31 @@ next_slot:
 				keep = 1;
 		}
 
-		if (bookend && found_extent && locked_end < extent_end) {
-			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
-			if (!ret) {
-				btrfs_release_path(root, path);
-				lock_extent(&BTRFS_I(inode)->io_tree,
-					locked_end, extent_end - 1, GFP_NOFS);
+		if (bookend && found_extent) {
+			if (locked_end < extent_end) {
+				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+						locked_end, extent_end - 1,
+						GFP_NOFS);
+				if (!ret) {
+					btrfs_release_path(root, path);
+					lock_extent(&BTRFS_I(inode)->io_tree,
+						locked_end, extent_end - 1,
+						GFP_NOFS);
+					locked_end = extent_end;
+					continue;
+				}
 				locked_end = extent_end;
-				continue;
 			}
-			locked_end = extent_end;
+			orig_parent = path->nodes[0]->start;
+			disk_bytenr = le64_to_cpu(old.disk_bytenr);
+			if (disk_bytenr != 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+					   disk_bytenr,
+					   le64_to_cpu(old.disk_num_bytes),
+					   orig_parent, root->root_key.objectid,
+					   trans->transid, inode->i_ino);
+				BUG_ON(ret);
+			}
 		}
 
 		if (found_inline) {
@@ -537,8 +553,12 @@ next_slot:
 					inode_sub_bytes(inode, old_num -
 							new_num);
 				}
-				btrfs_set_file_extent_num_bytes(leaf, extent,
-								new_num);
+				if (!compression && !encryption) {
+					btrfs_set_file_extent_ram_bytes(leaf,
+							extent, new_num);
+				}
+				btrfs_set_file_extent_num_bytes(leaf,
+							extent, new_num);
 				btrfs_mark_buffer_dirty(leaf);
 			} else if (key.offset < inline_limit &&
 				   (end > extent_end) &&
@@ -582,11 +602,11 @@ next_slot:
 		}
 		/* create bookend, splitting the extent in two */
 		if (bookend && found_extent) {
-			u64 disk_bytenr;
 			struct btrfs_key ins;
 			ins.objectid = inode->i_ino;
 			ins.offset = end;
 			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+
 			btrfs_release_path(root, path);
 			ret = btrfs_insert_empty_item(trans, root, path, &ins,
 						      sizeof(*extent));
@@ -623,14 +643,13 @@ next_slot:
 
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 
-			disk_bytenr = le64_to_cpu(old.disk_bytenr);
 			if (disk_bytenr != 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						disk_bytenr,
-						le64_to_cpu(old.disk_num_bytes),
-						leaf->start,
+				ret = btrfs_update_extent_ref(trans, root,
+						disk_bytenr, orig_parent,
+					        leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
+
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3df0ffad976..e01c0d0310a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -86,6 +86,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static void btrfs_truncate(struct inode *inode);
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written, int unlock);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -262,35 +266,72 @@ static int cow_file_range_inline(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+struct async_extent {
+	u64 start;
+	u64 ram_size;
+	u64 compressed_size;
+	struct page **pages;
+	unsigned long nr_pages;
+	struct list_head list;
+};
+
+struct async_cow {
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct page *locked_page;
+	u64 start;
+	u64 end;
+	struct list_head extents;
+	struct btrfs_work work;
+};
+
+static noinline int add_async_extent(struct async_cow *cow,
+				     u64 start, u64 ram_size,
+				     u64 compressed_size,
+				     struct page **pages,
+				     unsigned long nr_pages)
+{
+	struct async_extent *async_extent;
+
+	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+	async_extent->start = start;
+	async_extent->ram_size = ram_size;
+	async_extent->compressed_size = compressed_size;
+	async_extent->pages = pages;
+	async_extent->nr_pages = nr_pages;
+	list_add_tail(&async_extent->list, &cow->extents);
+	return 0;
+}
+
 /*
- * when extent_io.c finds a delayed allocation range in the file,
- * the call backs end up in this code.  The basic idea is to
- * allocate extents on disk for the range, and create ordered data structs
- * in ram to track those extents.
+ * we create compressed extents in two phases.  The first
+ * phase compresses a range of pages that have already been
+ * locked (both pages and state bits are locked).
  *
- * locked_page is the page that writepage had locked already.  We use
- * it to make sure we don't do extra locks or unlocks.
+ * This is done inside an ordered work queue, and the compression
+ * is spread across many cpus.  The actual IO submission is step
+ * two, and the ordered work queue takes care of making sure that
+ * happens in the same order things were put onto the queue by
+ * writepages and friends.
  *
- * *page_started is set to one if we unlock locked_page and do everything
- * required to start IO on it.  It may be clean and already done with
- * IO when we return.
+ * If this code finds it can't get good compression, it puts an
+ * entry onto the work queue to write the uncompressed bytes.  This
+ * makes sure that both compressed inodes and uncompressed inodes
+ * are written in the same order that pdflush sent them down.
  */
-static int cow_file_range(struct inode *inode, struct page *locked_page,
-			  u64 start, u64 end, int *page_started)
+static noinline int compress_file_range(struct inode *inode,
+					struct page *locked_page,
+					u64 start, u64 end,
+					struct async_cow *async_cow,
+					int *num_added)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	u64 alloc_hint = 0;
 	u64 num_bytes;
-	unsigned long ram_size;
 	u64 orig_start;
 	u64 disk_num_bytes;
-	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
-	struct btrfs_key ins;
-	struct extent_map *em;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 	struct page **pages = NULL;
 	unsigned long nr_pages;
@@ -298,22 +339,12 @@ static int cow_file_range(struct inode *inode, struct page *locked_page,
 	unsigned long total_compressed = 0;
 	unsigned long total_in = 0;
 	unsigned long max_compressed = 128 * 1024;
-	unsigned long max_uncompressed = 256 * 1024;
+	unsigned long max_uncompressed = 128 * 1024;
 	int i;
-	int ordered_type;
 	int will_compress;
 
-	trans = btrfs_join_transaction(root, 1);
-	BUG_ON(!trans);
-	btrfs_set_trans_block_group(trans, inode);
 	orig_start = start;
 
-	/*
-	 * compression made this loop a bit ugly, but the basic idea is to
-	 * compress some pages but keep the total size of the compressed
-	 * extent relatively small.  If compression is off, this goto target
-	 * is never used.
-	 */
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
@@ -324,7 +355,13 @@ again:
 
 	/* we want to make sure that amount of ram required to uncompress
 	 * an extent is reasonable, so we limit the total size in ram
-	 * of a compressed extent to 256k
+	 * of a compressed extent to 128k.  This is a crucial number
+	 * because it also controls how easily we can spread reads across
+	 * cpus for decompression.
+	 *
+	 * We also want to make sure the amount of IO required to do
+	 * a random read is reasonably small, so we limit the size of
+	 * a compressed extent to 128k.
 	 */
 	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -333,18 +370,16 @@ again:
 	total_in = 0;
 	ret = 0;
 
-	/* we do compression for mount -o compress and when the
-	 * inode has not been flagged as nocompress
+	/*
+	 * we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress.  This flag can
+	 * change at any time if we discover bad compression ratios.
 	 */
 	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
-		/* we want to make sure the amount of IO required to satisfy
-		 * a random read is reasonably small, so we limit the size
-		 * of a compressed extent to 128k
-		 */
 		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
 						total_compressed, pages,
 						nr_pages, &nr_pages_ret,
@@ -371,26 +406,34 @@ again:
 		}
 	}
 	if (start == 0) {
+		trans = btrfs_join_transaction(root, 1);
+		BUG_ON(!trans);
+		btrfs_set_trans_block_group(trans, inode);
+
 		/* lets try to make an inline extent */
-		if (ret || total_in < (end - start + 1)) {
+		if (ret || total_in < (actual_end - start)) {
 			/* we didn't compress the entire range, try
-			 * to make an uncompressed inline extent.  This
-			 * is almost sure to fail, but maybe inline sizes
-			 * will get bigger later
+			 * to make an uncompressed inline extent.
 			 */
 			ret = cow_file_range_inline(trans, root, inode,
 						    start, end, 0, NULL);
 		} else {
+			/* try making a compressed inline extent */
 			ret = cow_file_range_inline(trans, root, inode,
 						    start, end,
 						    total_compressed, pages);
 		}
+		btrfs_end_transaction(trans, root);
 		if (ret == 0) {
+			/*
+			 * inline extent creation worked, we don't need
+			 * to create any more async work items.  Unlock
+			 * and free up our temp pages.
+			 */
 			extent_clear_unlock_delalloc(inode,
 						     &BTRFS_I(inode)->io_tree,
-						     start, end, NULL,
-						     1, 1, 1);
-			*page_started = 1;
+						     start, end, NULL, 1, 0,
+						     0, 1, 1, 1);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -435,53 +478,280 @@ again:
 		/* flag the file so we don't compress in the future */
 		btrfs_set_flag(inode, NOCOMPRESS);
 	}
+	if (will_compress) {
+		*num_added += 1;
 
-	BUG_ON(disk_num_bytes >
-	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+		/* the async work queues will take care of doing actual
+		 * allocation on disk for these compressed pages,
+		 * and will submit them to the elevator.
+		 */
+		add_async_extent(async_cow, start, num_bytes,
+				 total_compressed, pages, nr_pages_ret);
 
-	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+		if (start + num_bytes < end) {
+			start += num_bytes;
+			pages = NULL;
+			cond_resched();
+			goto again;
+		}
+	} else {
+		/*
+		 * No compression, but we still need to write the pages in
+		 * the file we've been given so far.  redirty the locked
+		 * page if it corresponds to our extent and set things up
+		 * for the async work queue to run cow_file_range to do
+		 * the normal delalloc dance
+		 */
+		if (page_offset(locked_page) >= start &&
+		    page_offset(locked_page) <= end) {
+			__set_page_dirty_nobuffers(locked_page);
+			/* unlocked later on in the async handlers */
+		}
+		add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0);
+		*num_added += 1;
+	}
 
-	while(disk_num_bytes > 0) {
-		unsigned long min_bytes;
+out:
+	return 0;
+
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++) {
+		WARN_ON(pages[i]->mapping);
+		page_cache_release(pages[i]);
+	}
+	if (pages)
+		kfree(pages);
+
+	goto out;
+}
+
+/*
+ * phase two of compressed writeback.  This is the ordered portion
+ * of the code, which only gets called in the order the work was
+ * queued.  We walk all the async extents created by compress_file_range
+ * and send them down to the disk.
+ */
+static noinline int submit_compressed_extents(struct inode *inode,
+					      struct async_cow *async_cow)
+{
+	struct async_extent *async_extent;
+	u64 alloc_hint = 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree;
+	int ret;
+
+	if (list_empty(&async_cow->extents))
+		return 0;
+
+	trans = btrfs_join_transaction(root, 1);
+
+	while(!list_empty(&async_cow->extents)) {
+		async_extent = list_entry(async_cow->extents.next,
+					  struct async_extent, list);
+		list_del(&async_extent->list);
 
+		io_tree = &BTRFS_I(inode)->io_tree;
+
+		/* did the compression code fall back to uncompressed IO? */
+		if (!async_extent->pages) {
+			int page_started = 0;
+			unsigned long nr_written = 0;
+
+			lock_extent(io_tree, async_extent->start,
+				    async_extent->start + async_extent->ram_size - 1,
+				    GFP_NOFS);
+
+			/* allocate blocks */
+			cow_file_range(inode, async_cow->locked_page,
+				       async_extent->start,
+				       async_extent->start +
+				       async_extent->ram_size - 1,
+				       &page_started, &nr_written, 0);
+
+			/*
+			 * if page_started, cow_file_range inserted an
+			 * inline extent and took care of all the unlocking
+			 * and IO for us.  Otherwise, we need to submit
+			 * all those pages down to the drive.
+			 */
+			if (!page_started)
+				extent_write_locked_range(io_tree,
+						  inode, async_extent->start,
+					          async_extent->start +
+						  async_extent->ram_size - 1,
+						  btrfs_get_extent,
+						  WB_SYNC_ALL);
+			kfree(async_extent);
+			cond_resched();
+			continue;
+		}
+
+		lock_extent(io_tree, async_extent->start,
+			    async_extent->start + async_extent->ram_size - 1,
+			    GFP_NOFS);
 		/*
-		 * the max size of a compressed extent is pretty small,
-		 * make the code a little less complex by forcing
-		 * the allocator to find a whole compressed extent at once
+		 * here we're doing allocation and writeback of the
+		 * compressed pages
 		 */
-		if (will_compress)
-			min_bytes = disk_num_bytes;
-		else
-			min_bytes = root->sectorsize;
+		btrfs_drop_extent_cache(inode, async_extent->start,
+					async_extent->start +
+					async_extent->ram_size - 1, 0);
+
+		ret = btrfs_reserve_extent(trans, root,
+					   async_extent->compressed_size,
+					   async_extent->compressed_size,
+					   0, alloc_hint,
+					   (u64)-1, &ins, 1);
+		BUG_ON(ret);
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = async_extent->start;
+		em->len = async_extent->ram_size;
+
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
+		}
+
+		ret = btrfs_add_ordered_extent(inode, async_extent->start,
+					       ins.objectid,
+					       async_extent->ram_size,
+					       ins.offset,
+					       BTRFS_ORDERED_COMPRESSED);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, root);
+
+		/*
+		 * clear dirty, set writeback and unlock the pages.
+		 */
+		extent_clear_unlock_delalloc(inode,
+					     &BTRFS_I(inode)->io_tree,
+					     async_extent->start,
+					     async_extent->start +
+					     async_extent->ram_size - 1,
+					     NULL, 1, 1, 0, 1, 1, 0);
+
+		ret = btrfs_submit_compressed_write(inode,
+				         async_extent->start,
+					 async_extent->ram_size,
+					 ins.objectid,
+					 ins.offset, async_extent->pages,
+					 async_extent->nr_pages);
+
+		BUG_ON(ret);
+		trans = btrfs_join_transaction(root, 1);
+		alloc_hint = ins.objectid + ins.offset;
+		kfree(async_extent);
+		cond_resched();
+	}
+
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
+ */
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	u64 alloc_hint = 0;
+	u64 num_bytes;
+	unsigned long ram_size;
+	u64 disk_num_bytes;
+	u64 cur_alloc_size;
+	u64 blocksize = root->sectorsize;
+	u64 actual_end;
+	struct btrfs_key ins;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
 
+	actual_end = min_t(u64, i_size_read(inode), end + 1);
+
+	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = max(blocksize,  num_bytes);
+	disk_num_bytes = num_bytes;
+	ret = 0;
+
+	if (start == 0) {
+		/* let's try to make an inline extent */
+		ret = cow_file_range_inline(trans, root, inode,
+					    start, end, 0, NULL);
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start, end, NULL, 1, 1,
+						     1, 1, 1, 1);
+			*nr_written = *nr_written +
+			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+			*page_started = 1;
+			ret = 0;
+			goto out;
+		}
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
+
+	while(disk_num_bytes > 0) {
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   min_bytes, 0, alloc_hint,
+					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
-			WARN_ON(1);
-			goto free_pages_out_fail;
+			BUG();
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 
-		if (will_compress) {
-			ram_size = num_bytes;
-			em->len = num_bytes;
-		} else {
-			/* ramsize == disk size */
-			ram_size = ins.offset;
-			em->len = ins.offset;
-		}
+		ram_size = ins.offset;
+		em->len = ins.offset;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-		if (will_compress)
-			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -495,10 +765,8 @@ again:
 		}
 
 		cur_alloc_size = ins.offset;
-		ordered_type = will_compress ? BTRFS_ORDERED_COMPRESSED : 0;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ram_size, cur_alloc_size,
-					       ordered_type);
+					       ram_size, cur_alloc_size, 0);
 		BUG_ON(ret);
 
 		if (disk_num_bytes < cur_alloc_size) {
@@ -506,82 +774,145 @@ again:
 			       cur_alloc_size);
 			break;
 		}
-
-		if (will_compress) {
-			/*
-			 * we're doing compression, we and we need to
-			 * submit the compressed extents down to the device.
-			 *
-			 * We lock down all the file pages, clearing their
-			 * dirty bits and setting them writeback.  Everyone
-			 * that wants to modify the page will wait on the
-			 * ordered extent above.
-			 *
-			 * The writeback bits on the file pages are
-			 * cleared when the compressed pages are on disk
-			 */
-			btrfs_end_transaction(trans, root);
-
-			if (start <= page_offset(locked_page) &&
-			    page_offset(locked_page) < start + ram_size) {
-				*page_started = 1;
-			}
-
-			extent_clear_unlock_delalloc(inode,
-						     &BTRFS_I(inode)->io_tree,
-						     start,
-						     start + ram_size - 1,
-						     NULL, 1, 1, 0);
-
-			ret = btrfs_submit_compressed_write(inode, start,
-						 ram_size, ins.objectid,
-						 cur_alloc_size, pages,
-						 nr_pages_ret);
-
-			BUG_ON(ret);
-			trans = btrfs_join_transaction(root, 1);
-			if (start + ram_size < end) {
-				start += ram_size;
-				alloc_hint = ins.objectid + ins.offset;
-				/* pages will be freed at end_bio time */
-				pages = NULL;
-				goto again;
-			} else {
-				/* we've written everything, time to go */
-				break;
-			}
-		}
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
 		 */
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
-					     locked_page, 0, 0, 0);
+					     locked_page, unlock, 1,
+					     1, 0, 0, 0);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-
-	ret = 0;
 out:
+	ret = 0;
 	btrfs_end_transaction(trans, root);
 
 	return ret;
+}
 
-free_pages_out_fail:
-	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				     start, end, locked_page, 0, 0, 0);
-free_pages_out:
-	for (i = 0; i < nr_pages_ret; i++) {
-		WARN_ON(pages[i]->mapping);
-		page_cache_release(pages[i]);
+/*
+ * work queue call back to start compression on a file and pages
+ */
+static noinline void async_cow_start(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	int num_added = 0;
+	async_cow = container_of(work, struct async_cow, work);
+
+	compress_file_range(async_cow->inode, async_cow->locked_page,
+			    async_cow->start, async_cow->end, async_cow,
+			    &num_added);
+	if (num_added == 0)
+		async_cow->inode = NULL;
+}
+
+/*
+ * work queue call back to submit previously compressed pages
+ */
+static noinline void async_cow_submit(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root;
+	unsigned long nr_pages;
+
+	async_cow = container_of(work, struct async_cow, work);
+
+	root = async_cow->root;
+	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
+		PAGE_CACHE_SHIFT;
+
+	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+
+	if (atomic_read(&root->fs_info->async_delalloc_pages) <
+	    5 * 1042 * 1024 &&
+	    waitqueue_active(&root->fs_info->async_submit_wait))
+		wake_up(&root->fs_info->async_submit_wait);
+
+	if (async_cow->inode) {
+		submit_compressed_extents(async_cow->inode, async_cow);
 	}
-	if (pages)
-		kfree(pages);
+}
 
-	goto out;
+static noinline void async_cow_free(struct btrfs_work *work)
+{
+	struct async_cow *async_cow;
+	async_cow = container_of(work, struct async_cow, work);
+	kfree(async_cow);
+}
+
+static int cow_file_range_async(struct inode *inode, struct page *locked_page,
+				u64 start, u64 end, int *page_started,
+				unsigned long *nr_written)
+{
+	struct async_cow *async_cow;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long nr_pages;
+	u64 cur_end;
+	int limit = 10 * 1024 * 1042;
+
+	if (!btrfs_test_opt(root, COMPRESS)) {
+		return cow_file_range(inode, locked_page, start, end,
+				      page_started, nr_written, 1);
+	}
+
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
+			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+	while(start < end) {
+		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+		async_cow->inode = inode;
+		async_cow->root = root;
+		async_cow->locked_page = locked_page;
+		async_cow->start = start;
+
+		if (btrfs_test_flag(inode, NOCOMPRESS))
+			cur_end = end;
+		else
+			cur_end = min(end, start + 512 * 1024 - 1);
+
+		async_cow->end = cur_end;
+		INIT_LIST_HEAD(&async_cow->extents);
+
+		async_cow->work.func = async_cow_start;
+		async_cow->work.ordered_func = async_cow_submit;
+		async_cow->work.ordered_free = async_cow_free;
+		async_cow->work.flags = 0;
+
+		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			     (atomic_read(&root->fs_info->async_delalloc_pages)
+			      == 0));
+		}
+
+		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
+			PAGE_CACHE_SHIFT;
+		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
+
+		btrfs_queue_worker(&root->fs_info->delalloc_workers,
+				   &async_cow->work);
+
+		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
+			wait_event(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->async_delalloc_pages) <
+			    limit));
+		}
+
+		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		      atomic_read(&root->fs_info->async_delalloc_pages)) {
+			wait_event(root->fs_info->async_submit_wait,
+			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
+			   0));
+		}
+
+		*nr_written += nr_pages;
+		start = cur_end + 1;
+	}
+	*page_started = 1;
+	return 0;
 }
 
 /*
@@ -592,7 +923,8 @@ free_pages_out:
  * blocks on disk
  */
 static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started, int force)
+			      u64 start, u64 end, int *page_started, int force,
+			      unsigned long *nr_written)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -711,7 +1043,8 @@ out_check:
 		btrfs_release_path(root, path);
 		if (cow_start != (u64)-1) {
 			ret = cow_file_range(inode, locked_page, cow_start,
-					found_key.offset - 1, page_started);
+					found_key.offset - 1, page_started,
+					nr_written, 1);
 			BUG_ON(ret);
 			cow_start = (u64)-1;
 		}
@@ -748,9 +1081,10 @@ out_check:
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 0, 0, 0);
+					locked_page, 1, 1, 1, 0, 0, 0);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -761,7 +1095,7 @@ out_check:
 		cow_start = cur_offset;
 	if (cow_start != (u64)-1) {
 		ret = cow_file_range(inode, locked_page, cow_start, end,
-				     page_started);
+				     page_started, nr_written, 1);
 		BUG_ON(ret);
 	}
 
@@ -775,7 +1109,8 @@ out_check:
  * extent_io.c call back to do delayed allocation processing
  */
 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
-			      u64 start, u64 end, int *page_started)
+			      u64 start, u64 end, int *page_started,
+			      unsigned long *nr_written)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -783,13 +1118,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 0);
+					 page_started, 0, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 1);
+					 page_started, 1, nr_written);
 	else
-		ret = cow_file_range(inode, locked_page, start, end,
-				     page_started);
+		ret = cow_file_range_async(inode, locked_page, start, end,
+				     page_started, nr_written);
 
 	return ret;
 }
@@ -861,6 +1196,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 map_length;
 	int ret;
 
+	if (bio_flags & EXTENT_BIO_COMPRESSED)
+		return 0;
+
 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
@@ -925,12 +1263,12 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-		if (!skip_sum)
-			btrfs_lookup_bio_sums(root, inode, bio);
 
 		if (bio_flags & EXTENT_BIO_COMPRESSED)
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
+		else if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio);
 		goto mapit;
 	} else if (!skip_sum) {
 		/* we're doing a write, do the async checksumming */
@@ -966,6 +1304,9 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 {
+	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
+		WARN_ON(1);
+	}
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   GFP_NOFS);
 }
@@ -2105,6 +2446,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
+	int encoding;
 	u64 mask = root->sectorsize - 1;
 
 	if (root->ref_cows)
@@ -2144,6 +2486,7 @@ search_again:
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
+		encoding = 0;
 
 		if (found_key.objectid != inode->i_ino)
 			break;
@@ -2156,6 +2499,10 @@ search_again:
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_type = btrfs_file_extent_type(leaf, fi);
+			encoding = btrfs_file_extent_compression(leaf, fi);
+			encoding |= btrfs_file_extent_encryption(leaf, fi);
+			encoding |= btrfs_file_extent_other_encoding(leaf, fi);
+
 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
@@ -2200,7 +2547,7 @@ search_again:
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-			if (!del_item) {
+			if (!del_item && !encoding) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
 				extent_num_bytes = new_size -
@@ -2436,7 +2783,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		last_byte = min(extent_map_end(em), block_end);
 		last_byte = (last_byte + mask) & ~mask;
 		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+			u64 hint_byte = 0;
 			hole_size = last_byte - cur_offset;
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
+						 cur_offset + hole_size,
+						 cur_offset, &hint_byte);
+			if (err)
+				break;
 			err = btrfs_insert_file_extent(trans, root,
 					inode->i_ino, cur_offset, 0,
 					0, hole_size, 0, hole_size,
@@ -3785,6 +4139,7 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
 }
@@ -4285,9 +4640,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	 * ordered extents get created before we return
 	 */
 	atomic_inc(&root->fs_info->async_submit_draining);
-	while(atomic_read(&root->fs_info->nr_async_submits)) {
+	while(atomic_read(&root->fs_info->nr_async_submits) ||
+	      atomic_read(&root->fs_info->async_delalloc_pages)) {
 		wait_event(root->fs_info->async_submit_wait,
-		   (atomic_read(&root->fs_info->nr_async_submits) == 0));
+		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
 	return 0;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 370bb428559..027ad6b3839 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -390,7 +390,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
+	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
 	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -421,6 +421,12 @@ again:
 	 */
 	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE);
 
+	/* The compression code will leave pages locked but return from
+	 * writepage without setting the page writeback.  Starting again
+	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+	 */
+	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+
 	btrfs_wait_on_page_writeback_range(inode->i_mapping,
 					   start >> PAGE_CACHE_SHIFT,
 					   orig_end >> PAGE_CACHE_SHIFT);
@@ -448,10 +454,7 @@ again:
 	}
 	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
 			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
-		printk("inode %lu still ordered or delalloc after wait "
-		       "%llu %llu\n", inode->i_ino,
-		       (unsigned long long)start,
-		       (unsigned long long)orig_end);
+		schedule_timeout(1);
 		goto again;
 	}
 	return 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 431fdf144b5..ab9d5e89ed1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -375,6 +375,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
 	}
+
+	btrfs_start_delalloc_inodes(root);
+	btrfs_wait_ordered_extents(root, 0);
+
 	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index e99309180a1..ba2527d0873 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -423,8 +423,9 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 			/* we didn't make progress in this inflate
 			 * call, we're done
 			 */
-			if (ret != Z_STREAM_END)
+			if (ret != Z_STREAM_END) {
 				ret = -1;
+			}
 			break;
 		}
 
-- 
cgit v1.2.3


From 3b7885bf96e03271a9fff41124c38ed7176616e2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 6 Nov 2008 21:48:27 -0500
Subject: Btrfs: enforce metadata allocation clustering

The allocator uses the last allocation as a starting point for metadata
allocations, and tries to allocate in clusters of at least 256k.

If the search for a free block fails to find the expected block, this patch
forces a new cluster to be found in the free list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 17 +++++++++++++++++
 fs/btrfs/extent_io.c   |  7 -------
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ebd8275a193..b8c6541c33f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2195,6 +2195,23 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			if (search_start + num_bytes > end)
 				goto new_group;
 
+			if (last_ptr && *last_ptr && search_start != *last_ptr) {
+				total_needed += empty_cluster;
+				*last_ptr = 0;
+				/*
+				 * if search_start is still in this block group
+				 * then we just re-search this block group
+				 */
+				if (search_start >= start &&
+				    search_start < end) {
+					mutex_unlock(&block_group->alloc_mutex);
+					continue;
+				}
+
+				/* else we go to the next block group */
+				goto new_group;
+			}
+
 			if (exclude_nr > 0 &&
 			    (search_start + num_bytes > exclude_start &&
 			     search_start < exclude_start + exclude_nr)) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bbe3bcfcf4a..ad75a9cc360 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2494,13 +2494,6 @@ retry:
 		index = 0;
 		goto retry;
 	}
-	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
-		mapping->writeback_index = index;
-		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
-			range_whole = 1;
-
-	if (wbc->range_cont)
-		wbc->range_start = index << PAGE_CACHE_SHIFT;
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
-- 
cgit v1.2.3


From 4366211ccd050e00674b3276b5ed81981b2d7793 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 09:06:11 -0500
Subject: Btrfs: More metadata allocator optimizations

This lowers the empty cluster target for metadata allocations.  The lower
target makes it easier to do allocations and still seems to perform well.

It also fixes the allocator loop to drop the empty cluster when things
start getting difficult, avoiding false enospc warnings.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b8c6541c33f..0d73a53c676 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2116,6 +2116,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
+	u64 last_wanted = 0;
 	struct btrfs_block_group_cache *block_group = NULL;
 	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
@@ -2134,22 +2135,28 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 256 * 1024;
+		empty_cluster = 64 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
 		last_ptr = &root->fs_info->last_data_alloc;
 
 	if (last_ptr) {
-		if (*last_ptr)
+		if (*last_ptr) {
 			hint_byte = *last_ptr;
-		else
+			last_wanted = *last_ptr;
+		} else
 			empty_size += empty_cluster;
+	} else {
+		empty_cluster = 0;
 	}
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
 	total_needed += empty_size;
 
+	if (search_start != last_wanted)
+		last_wanted = 0;
+
 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!block_group)
 		block_group = btrfs_lookup_first_block_group(root->fs_info,
@@ -2195,9 +2202,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			if (search_start + num_bytes > end)
 				goto new_group;
 
-			if (last_ptr && *last_ptr && search_start != *last_ptr) {
+			if (last_wanted && search_start != last_wanted) {
 				total_needed += empty_cluster;
-				*last_ptr = 0;
+				last_wanted = 0;
 				/*
 				 * if search_start is still in this block group
 				 * then we just re-search this block group
@@ -2223,6 +2230,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				if (search_start >= start &&
 				    search_start < end) {
 					mutex_unlock(&block_group->alloc_mutex);
+					last_wanted = 0;
 					continue;
 				}
 
@@ -2240,6 +2248,11 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			break;
 		}
 new_group:
+		last_wanted = 0;
+		if (loop > 0) {
+			total_needed -= empty_cluster;
+			empty_cluster = 0;
+		}
 		mutex_unlock(&block_group->alloc_mutex);
 		/*
 		 * Here's how this works.
@@ -2256,11 +2269,6 @@ new_group:
 		if (loop == 0) {
 			head = &space_info->block_groups;
 			cur = head->next;
-
-			if (last_ptr && *last_ptr) {
-				total_needed += empty_cluster;
-				*last_ptr = 0;
-			}
 			loop++;
 		} else if (loop == 1 && cur == head) {
 			if (allowed_chunk_alloc && !chunk_alloc_done) {
-- 
cgit v1.2.3


From af09abfece59aa50bfbf16f6f1f85822554e061f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 12:35:44 -0500
Subject: Btrfs: make sure compressed bios don't complete too soon

When writing a compressed extent, a number of bios are created that
point to a single struct compressed_bio.  At end_io time an atomic counter in
the compressed_bio struct makes sure that all of the bios have finished
before final end_io processing is done.

But when multiple bios are needed to write a compressed extent, the
counter was being incremented after the first bio was sent to submit_bio.
It is possible the bio will complete before the counter is incremented,
making the end_io handler free the compressed_bio struct before
processing is finished.

The fix is to increment the atomic counter before bio submission,
both for compressed reads and writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 284f21025bc..7397c622fb6 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -314,6 +314,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		    PAGE_CACHE_SIZE) {
 			bio_get(bio);
 
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
@@ -323,7 +330,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			bio_put(bio);
 
 			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
-			atomic_inc(&cb->pending_bios);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -573,6 +579,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 			BUG_ON(ret);
 
+			/*
+			 * inc the count before we submit the bio so
+			 * we know the end IO handler won't happen before
+			 * we inc the count.  Otherwise, the cb might get
+			 * freed before we're done setting it up
+			 */
+			atomic_inc(&cb->pending_bios);
+
 			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
 			BUG_ON(ret);
 
@@ -580,7 +594,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
-			atomic_inc(&cb->pending_bios);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
-- 
cgit v1.2.3


From 42e70e7a2f9d96fd843723fa46d5121cb3e551d0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 18:17:11 -0500
Subject: Btrfs: Fix more false enospc errors and an oops from empty clustering

In some cases the empty cluster was added twice to the total number of
bytes the allocator was trying to find.

With empty clustering on, the hint byte was sometimes outside of the
block group.  Add an extra goto to find the correct block group.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d73a53c676..b92e92c29e3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2152,11 +2152,13 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	}
 	search_start = max(search_start, first_logical_byte(root, 0));
 	search_start = max(search_start, hint_byte);
-	total_needed += empty_size;
 
-	if (search_start != last_wanted)
+	if (last_wanted && search_start != last_wanted) {
 		last_wanted = 0;
+		empty_size += empty_cluster;
+	}
 
+	total_needed += empty_size;
 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!block_group)
 		block_group = btrfs_lookup_first_block_group(root->fs_info,
@@ -2171,7 +2173,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		 * group thats not of the proper type, while looping this
 		 * should never happen
 		 */
-		WARN_ON(!block_group);
+		if (!block_group)
+			goto new_group_no_lock;
+
 		mutex_lock(&block_group->alloc_mutex);
 		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
@@ -2248,12 +2252,13 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			break;
 		}
 new_group:
+		mutex_unlock(&block_group->alloc_mutex);
+new_group_no_lock:
 		last_wanted = 0;
-		if (loop > 0) {
+		if (!allowed_chunk_alloc && loop > 0) {
 			total_needed -= empty_cluster;
 			empty_cluster = 0;
 		}
-		mutex_unlock(&block_group->alloc_mutex);
 		/*
 		 * Here's how this works.
 		 * loop == 0: we were searching a block group via a hint
@@ -2271,6 +2276,10 @@ new_group:
 			cur = head->next;
 			loop++;
 		} else if (loop == 1 && cur == head) {
+
+			total_needed -= empty_cluster;
+			empty_cluster = 0;
+
 			if (allowed_chunk_alloc && !chunk_alloc_done) {
 				up_read(&space_info->groups_sem);
 				ret = do_chunk_alloc(trans, root, num_bytes +
-- 
cgit v1.2.3


From 5f2cc086ccab27ac5252b3883ac004347860b4c7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 18:22:45 -0500
Subject: Btrfs: Avoid unplug storms during commit

While doing a commit, btrfs makes sure all the metadata blocks
were properly written to disk, calling wait_on_page_writeback for
each page.  This writeback happens after allowing another transaction
to start, so it competes for the disk with other processes in the FS.

If the page writeback bit is still set, each wait_on_page_writeback might
trigger an unplug, even though the page might be waiting for checksumming
to finish or might be waiting for the async work queue to submit the
bio.

This trades wait_on_page_writeback for waiting on the extent writeback
bits.  It won't trigger any unplugs and substantially improves performance
in a number of workloads.

This also changes the async bio submission to avoid requeueing if there
is only one device.  The requeue just wastes CPU time because there are
no other devices to service.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 30 +++++++++++++++++++++++++++++-
 fs/btrfs/volumes.c     |  3 ++-
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e72a013d24b..202c1b6df4a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
+#include <linux/blkdev.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -331,6 +332,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	int werr = 0;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	u64 start = 0;
 	u64 end;
 	unsigned long index;
@@ -371,6 +373,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
+	/*
+	 * we unplug once and then use the wait_on_extent_bit for
+	 * everything else
+	 */
+	blk_run_address_space(btree_inode->i_mapping);
 	while(1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -391,7 +398,28 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 				if (err)
 					werr = err;
 			}
-			wait_on_page_writeback(page);
+			if (PageWriteback(page)) {
+				/*
+				 * we don't wait on the page writeback bit
+				 * because that triggers a lot of unplugs.
+				 * The extent bits are much nicer to
+				 * the disks, but come with a slightly
+				 * higher latency because we aren't forcing
+				 * unplugs.
+				 */
+				wait_on_extent_writeback(io_tree,
+					 page_offset(page),
+					 page_offset(page) +
+					 PAGE_CACHE_SIZE - 1);
+			}
+			if (PageWriteback(page)) {
+				/*
+				 * the state bits get cleared before the
+				 * page bits, lets add some extra
+				 * paranoia here
+				 */
+				wait_on_page_writeback(page);
+			}
 			page_cache_release(page);
 			cond_resched();
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cbb9bb31431..80a27284dbf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -200,7 +200,8 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi) &&
+		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
-- 
cgit v1.2.3


From 5b7c3fcc46b5deb8a368d5319cf87c78c2df65fe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 07:26:33 -0500
Subject: Btrfs: Don't subtract too much from the allocation target (avoid
 wrapping)

When metadata allocation clustering has to fall back to unclustered
allocs because large free areas could not be found, it was sometimes
subtracting too much from the total bytes to allocate.  This would
make it wrap below zero.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b92e92c29e3..2451717d36d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2255,9 +2255,9 @@ new_group:
 		mutex_unlock(&block_group->alloc_mutex);
 new_group_no_lock:
 		last_wanted = 0;
-		if (!allowed_chunk_alloc && loop > 0) {
-			total_needed -= empty_cluster;
-			empty_cluster = 0;
+		if (!allowed_chunk_alloc) {
+			total_needed -= empty_size;
+			empty_size = 0;
 		}
 		/*
 		 * Here's how this works.
@@ -2277,8 +2277,8 @@ new_group_no_lock:
 			loop++;
 		} else if (loop == 1 && cur == head) {
 
-			total_needed -= empty_cluster;
-			empty_cluster = 0;
+			total_needed -= empty_size;
+			empty_size = 0;
 
 			if (allowed_chunk_alloc && !chunk_alloc_done) {
 				up_read(&space_info->groups_sem);
-- 
cgit v1.2.3


From f2b1c41cf94d7f839fe9ede5f3ead92698a93fb3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 07:31:30 -0500
Subject: Btrfs: Make sure pages are dirty before doing delalloc for them

This adds a PageDirty check to the writeback path that locks pages
for delalloc.  If a page wasn't dirty at this point, it is in the
process of being truncated away.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ad75a9cc360..69ea09659d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1205,7 +1205,8 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 			 */
 			if (pages[i] != locked_page) {
 				lock_page(pages[i]);
-				if (pages[i]->mapping != inode->i_mapping) {
+				if (!PageDirty(pages[i]) ||
+				    pages[i]->mapping != inode->i_mapping) {
 					ret = -EAGAIN;
 					unlock_page(pages[i]);
 					page_cache_release(pages[i]);
-- 
cgit v1.2.3


From ff5b7ee33d82414bf4baf299c21fb703bcc89629 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 10 Nov 2008 07:34:43 -0500
Subject: Btrfs: Fix csum error for compressed data

The decompress code doesn't take the logical offset in extent
pointer into account. If the logical offset isn't zero, data
will be decompressed into wrong pages.

The solution used here is to record the starting offset of the extent
in the file separately from the logical start of the extent_map struct.
This allows us to avoid problems inserting overlapping extents.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/compression.c | 6 ++----
 fs/btrfs/extent_map.h  | 1 +
 fs/btrfs/file.c        | 2 ++
 fs/btrfs/inode.c       | 3 +++
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7397c622fb6..8e7a78acf81 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -505,7 +505,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
-	u64 em_len;
 	struct extent_map *em;
 	int ret;
 
@@ -524,9 +523,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->errors = 0;
 	cb->inode = inode;
 
-	cb->start = em->start;
+	cb->start = em->orig_start;
 	compressed_len = em->block_len;
-	em_len = em->len;
 	free_extent_map(em);
 
 	cb->len = uncompressed_len;
@@ -545,7 +543,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, cb->start + em_len, cb);
+	add_ra_bio_pages(inode, em->start + em->len, cb);
 
 	if (!btrfs_test_opt(root, NODATASUM) &&
 	    !btrfs_test_flag(inode, NODATASUM)) {
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index accfedaeb51..fb6eeef06bb 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -20,6 +20,7 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 orig_start;
 	u64 block_start;
 	u64 block_len;
 	unsigned long flags;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 337221ecca2..85841c53880 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -222,6 +222,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		    em->start < start) {
 			split->start = em->start;
 			split->len = start - em->start;
+			split->orig_start = em->orig_start;
 			split->block_start = em->block_start;
 
 			if (compressed)
@@ -243,6 +244,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			split->start = start + len;
 			split->len = em->start + em->len - (start + len);
+			split->orig_start = em->orig_start;
 			split->bdev = em->bdev;
 			split->flags = flags;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e01c0d0310a..59660293d29 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3949,6 +3949,8 @@ again:
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
+		em->orig_start = extent_start -
+				 btrfs_file_extent_offset(leaf, item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -3988,6 +3990,7 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
+		em->orig_start = EXTENT_MAP_INLINE;
 		if (compressed)
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
-- 
cgit v1.2.3


From e04ca626baee684bea9d6239e4e1119b696101b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 11:44:58 -0500
Subject: Btrfs: Fix use after free during compressed reads

Yan's fix to use the correct file offset during compressed reads used the
extent_map struct pointer after it had been freed.  This saves the
fields we want for later use instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8e7a78acf81..b582c6fd80f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -505,6 +505,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct block_device *bdev;
 	struct bio *comp_bio;
 	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	u64 em_len;
+	u64 em_start;
 	struct extent_map *em;
 	int ret;
 
@@ -525,7 +527,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	cb->start = em->orig_start;
 	compressed_len = em->block_len;
+	em_len = em->len;
+	em_start = em->start;
 	free_extent_map(em);
+	em = NULL;
 
 	cb->len = uncompressed_len;
 	cb->compressed_len = compressed_len;
@@ -543,7 +548,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, em->start + em->len, cb);
+	add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	if (!btrfs_test_opt(root, NODATASUM) &&
 	    !btrfs_test_flag(inode, NODATASUM)) {
-- 
cgit v1.2.3


From f5a31e166772a7b9fff6725b697eb8b57633671e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 11:47:09 -0500
Subject: Btrfs: Try harder while searching for free space

The loop searching for free space would exit out too soon when
metadata clustering was trying to allocate a large extent.  This makes
sure a full scan of the free space is done searching for only the
minimum extent size requested by the higher layers.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2451717d36d..55d6a66c622 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2123,6 +2123,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	int allowed_chunk_alloc = 0;
 	struct list_head *head = NULL, *cur = NULL;
 	int loop = 0;
+	int extra_loop = 0;
 	struct btrfs_space_info *space_info;
 
 	WARN_ON(num_bytes < root->sectorsize);
@@ -2191,6 +2192,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 		free_space = btrfs_find_free_space(block_group, search_start,
 						   total_needed);
+		if (empty_size)
+			extra_loop = 1;
+
 		if (free_space) {
 			u64 start = block_group->key.objectid;
 			u64 end = block_group->key.objectid +
@@ -2254,11 +2258,11 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 new_group:
 		mutex_unlock(&block_group->alloc_mutex);
 new_group_no_lock:
+		/* don't try to compare new allocations against the
+		 * last allocation any more
+		 */
 		last_wanted = 0;
-		if (!allowed_chunk_alloc) {
-			total_needed -= empty_size;
-			empty_size = 0;
-		}
+
 		/*
 		 * Here's how this works.
 		 * loop == 0: we were searching a block group via a hint
@@ -2276,9 +2280,21 @@ new_group_no_lock:
 			cur = head->next;
 			loop++;
 		} else if (loop == 1 && cur == head) {
-
+			int keep_going;
+
+			/* at this point we give up on the empty_size
+			 * allocations and just try to allocate the min
+			 * space.
+			 *
+			 * The extra_loop field was set if an empty_size
+			 * allocation was attempted above, and if this
+			 * is true we need to try the loop again without
+			 * the additional empty_size.
+			 */
 			total_needed -= empty_size;
 			empty_size = 0;
+			keep_going = extra_loop;
+			loop++;
 
 			if (allowed_chunk_alloc && !chunk_alloc_done) {
 				up_read(&space_info->groups_sem);
@@ -2287,13 +2303,19 @@ new_group_no_lock:
 				if (ret < 0)
 					break;
 				down_read(&space_info->groups_sem);
-				loop++;
 				head = &space_info->block_groups;
-				cur = head->next;
+				/*
+				 * we've allocated a new chunk, keep
+				 * trying
+				 */
+				keep_going = 1;
 				chunk_alloc_done = 1;
 			} else if (!allowed_chunk_alloc) {
 				space_info->force_alloc = 1;
-				break;
+			}
+			if (keep_going) {
+				cur = head->next;
+				extra_loop = 0;
 			} else {
 				break;
 			}
-- 
cgit v1.2.3


From 39be25cd89450940b0e5f8a6aad71d1ec99b17bf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 11:50:50 -0500
Subject: Btrfs: Use invalidatepage when writepage finds a page outside of
 i_size

With all the recent fixes to the delalloc locking, it is now safe
again to use invalidatepage inside the writepage code for
pages outside of i_size.  This used to deadlock against some of the
code to write locked ranges of pages, but all of that has been fixed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 6 +-----
 fs/btrfs/file-item.c | 1 +
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 69ea09659d4..af2d9a9300a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2174,11 +2174,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		if (epd->extent_locked) {
-			if (tree->ops && tree->ops->writepage_end_io_hook)
-				tree->ops->writepage_end_io_hook(page, start,
-							 page_end, NULL, 1);
-		}
+		page->mapping->a_ops->invalidatepage(page, 0);
 		unlock_page(page);
 		return 0;
 	}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f4d3fa71bc4..f7637883140 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,6 +185,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				       "%llu\n", inode->i_ino,
 				       (unsigned long long)offset);
 				item = NULL;
+				btrfs_release_path(root, path);
 				goto found;
 			}
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-- 
cgit v1.2.3


From 445a69449994a37615cd47e47bcab2e42a070adf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 11:53:33 -0500
Subject: Btrfs: Fix usage of struct extent_map->orig_start

This makes sure the orig_start field in struct extent_map gets set
everywhere the extent_map structs are created or modified.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c  | 3 ++-
 fs/btrfs/inode.c | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 85841c53880..4119f9a9532 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -244,16 +244,17 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			split->start = start + len;
 			split->len = em->start + em->len - (start + len);
-			split->orig_start = em->orig_start;
 			split->bdev = em->bdev;
 			split->flags = flags;
 
 			if (compressed) {
 				split->block_len = em->block_len;
 				split->block_start = em->block_start;
+				split->orig_start = em->orig_start;
 			} else {
 				split->block_len = split->len;
 				split->block_start = em->block_start + diff;
+				split->orig_start = split->start;
 			}
 
 			ret = add_extent_mapping(em_tree, split);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 59660293d29..5ca9c067237 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -610,6 +610,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
+		em->orig_start = em->start;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
@@ -743,6 +744,7 @@ static noinline int cow_file_range(struct inode *inode,
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
+		em->orig_start = em->start;
 
 		ram_size = ins.offset;
 		em->len = ins.offset;
@@ -1057,6 +1059,7 @@ out_check:
 			em_tree = &BTRFS_I(inode)->extent_tree;
 			em = alloc_extent_map(GFP_NOFS);
 			em->start = cur_offset;
+			em->orig_start = em->start;
 			em->len = num_bytes;
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
@@ -3876,6 +3879,7 @@ again:
 	}
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
+	em->orig_start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
 	em->block_len = (u64)-1;
 
-- 
cgit v1.2.3


From b47eda8690a10f4fc01eb7b795078fa3fa57149f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 12:34:40 -0500
Subject: Btrfs: Turn off extent state leak debugging

The extent_io.c code has a #define to find and cleanup extent state leaks
on module unmount.  This adds a very highly contended spinlock to a
hot path for most FS operations.

Turn it off by default.  A later changeset will add a .config option
for it.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index af2d9a9300a..5cc0082379c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,7 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
-#define LEAK_DEBUG 1
+#define LEAK_DEBUG 0
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
-- 
cgit v1.2.3


From 240d5d482b69415b92ffce4b8dfca10799c9890b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 13:08:31 -0500
Subject: Btrfs: tune btrfs unplug functions for a small number of devices

When btrfs unplugs, it tries to find the correct device to unplug
via search through the extent_map tree.  This avoids unplugging
a device that doesn't need it, but is a waste of time for filesystems
with a small number of devices.

This patch checks the total number of devices before doing the
search.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8efc123d222..1bb54d69fbb 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1201,6 +1201,16 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 		return;
 
 	inode = mapping->host;
+
+	/*
+	 * don't do the expensive searching for a small number of
+	 * devices
+	 */
+	if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
+		__unplug_io_fn(bdi, page);
+		return;
+	}
+
 	offset = page_offset(page);
 
 	em_tree = &BTRFS_I(inode)->extent_tree;
-- 
cgit v1.2.3


From 8a1413a296d38b54ded651e76ef16c033d38fd5d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 10 Nov 2008 16:13:54 -0500
Subject: Btrfs: empty_size allocation fixes again

The allocator wasn't catching all of the cases where it needed to do
extra loops because the check to enforce them wasn't happening early
enough.

When the allocator decided to increase the size of the allocation
for metadata clustering, it wasn't always setting the empty_size to
include the extra (optional) bytes.  This also fixes the empty_size field
to be correct.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 55d6a66c622..b7530c3ac20 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2174,6 +2174,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		 * group thats not of the proper type, while looping this
 		 * should never happen
 		 */
+		if (empty_size)
+			extra_loop = 1;
+
 		if (!block_group)
 			goto new_group_no_lock;
 
@@ -2192,9 +2195,6 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 		free_space = btrfs_find_free_space(block_group, search_start,
 						   total_needed);
-		if (empty_size)
-			extra_loop = 1;
-
 		if (free_space) {
 			u64 start = block_group->key.objectid;
 			u64 end = block_group->key.objectid +
@@ -2212,6 +2212,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 			if (last_wanted && search_start != last_wanted) {
 				total_needed += empty_cluster;
+				empty_size += empty_cluster;
 				last_wanted = 0;
 				/*
 				 * if search_start is still in this block group
-- 
cgit v1.2.3


From 8247b41ac980d125de8aeba6f33f381056ac0ecb Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 11 Nov 2008 09:33:29 -0500
Subject: Btrfs: Fix starting search offset inside btrfs_drop_extents

btrfs_drop_extents will drop paths and search again when it needs to
force COW of higher nodes.  It was using the key it found during the last
search as the offset for the next search.

But, this wasn't always correct.  The key could be from before our desired
range, and because we're dropping the path, it is possible for the file's
items to change while we do the search again.

The fix here is to make sure we don't search for something smaller than
the offset btrfs_drop_extents was called with.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4119f9a9532..934bc094bf1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -436,7 +436,7 @@ next_slot:
 			goto out;
 		}
 		if (recow) {
-			search_start = key.offset;
+			search_start = max(key.offset, start);
 			continue;
 		}
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
-- 
cgit v1.2.3


From 5b050f04c8ce911c5b6831305a24d70eab95e732 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 11 Nov 2008 09:34:41 -0500
Subject: Btrfs: Fix compile warnings on 32 bit machines

Simple casting here and there to fix things up.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/extent_io.c   | 11 +++++++----
 fs/btrfs/inode.c       |  4 ++--
 fs/btrfs/zlib.c        |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b582c6fd80f..bfd1512cce0 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -174,7 +174,8 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
 		if (ret == 0) {
 			nr_pages -= 1;
 			index += 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5cc0082379c..54d013c3bb8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1157,7 +1157,8 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 			if (pages[i] != locked_page)
 				unlock_page(pages[i]);
@@ -1192,7 +1193,8 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 	nrpages = end_index - index + 1;
 	while(nrpages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nrpages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nrpages, ARRAY_SIZE(pages)), pages);
 		if (ret == 0) {
 			ret = -EAGAIN;
 			goto done;
@@ -1346,7 +1348,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 
 	while(nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
-				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+				     min_t(unsigned long,
+				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
@@ -1896,7 +1899,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	int contig = 0;
 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
-	size_t page_size = min(size, PAGE_CACHE_SIZE);
+	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5ca9c067237..2ed2deacde9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -184,7 +184,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 		int i = 0;
 		while(compressed_size > 0) {
 			cpage = compressed_pages[i];
-			cur_size = min(compressed_size,
+			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
 			kaddr = kmap(cpage);
@@ -3812,7 +3812,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 	read_extent_buffer(leaf, tmp, ptr, inline_size);
 
-	max_size = min(PAGE_CACHE_SIZE, max_size);
+	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
 	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
 				    inline_size, max_size);
 	if (ret) {
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ba2527d0873..5b9f7002513 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -370,7 +370,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 
 	data_in = kmap(pages_in[page_in_index]);
 	workspace->inf_strm.next_in = data_in;
-	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+	workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
 	workspace->inf_strm.total_in = 0;
 
 	workspace->inf_strm.total_out = 0;
-- 
cgit v1.2.3


From ebeb0406f153db51ab2d4771faf2342bd6ca14dd Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Wed, 12 Nov 2008 07:48:00 +0900
Subject: fat: drop negative dentry on rename() path

Drop the negative dentry on the rename() path, in order to make sure we
use the case-sensitive name specified by the user if this is for
creation.

To do so, this uses the newly added LOOKUP_RENAME_TARGET, like LOOKUP_CREATE.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/namei_vfat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a..8ae32e37673 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 	 * for creation.
 	 */
 	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
-		if (nd->flags & LOOKUP_CREATE)
+		if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
 			return 0;
 	}
 
-- 
cgit v1.2.3


From 985eafcc5480b0d98419b96869f2560abb2764c7 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Wed, 12 Nov 2008 07:48:01 +0900
Subject: fat: fix duplicate addition of ->llseek handler

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/dir.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e05835709..3a7f603b698 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
 	.fsync		= file_fsync,
-	.llseek		= generic_file_llseek,
 };
 
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
-- 
cgit v1.2.3


From 5a6bb10393eb9a1985e97af12f0cb2906bcbf1af Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 12 Nov 2008 07:48:01 +0900
Subject: fat: make sure to set d_ops in fat_get_parent

fat_get_parent needs to setup the dentry operations, otherwise we might
lose them when the NFS server needs to reconnect out of cache inodes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index bdd8fb7be2c..37a8af159a1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
 	brelse(bh);
 
 	parent = d_obtain_alias(inode);
+	if (!IS_ERR(parent))
+		parent->d_op = sb->s_root->d_op;
 out:
 	unlock_super(sb);
 
-- 
cgit v1.2.3


From 6f3577bdc768e6dae3c4d419e89b5a904f470728 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Nov 2008 09:59:36 -0500
Subject: Btrfs: Improve metadata read latencies

This fixes latency problems on metadata reads by making sure they
don't go through the async submit queue, and by tuning down the amount
of readahead done during btree searches.

Also, the btrfs bdi congestion function is tuned to ignore the
number of pending async bios and checksums pending.  There is additional
code that throttles new async bios now and the congestion function
doesn't need to worry about it anymore.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   | 8 ++++----
 fs/btrfs/disk-io.c | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f82f8db0227..ac61c50a331 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1285,16 +1285,16 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		}
 		search = btrfs_node_blockptr(node, nr);
 		if ((search >= lowest_read && search <= highest_read) ||
-		    (search < lowest_read && lowest_read - search <= 32768) ||
-		    (search > highest_read && search - highest_read <= 32768)) {
+		    (search < lowest_read && lowest_read - search <= 16384) ||
+		    (search > highest_read && search - highest_read <= 16384)) {
 			readahead_tree_block(root, search, blocksize,
 				     btrfs_node_ptr_generation(node, nr));
 			nread += blocksize;
 		}
 		nscan++;
-		if (path->reada < 2 && (nread > (256 * 1024) || nscan > 32))
+		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
 			break;
-		if(nread > (1024 * 1024) || nscan > 128)
+		if(nread > (256 * 1024) || nscan > 128)
 			break;
 
 		if (search < lowest_read)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bb54d69fbb..3b0e974a9e9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -605,7 +605,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		BUG_ON(ret);
 
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-				     mirror_num, 1);
+				     mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
@@ -1139,11 +1139,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
-
+#if 0
 	if ((bdi_bits & (1 << BDI_write_congested)) &&
 	    btrfs_congested_async(info, 0))
 		return 1;
-
+#endif
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->bdev)
-- 
cgit v1.2.3


From 2ed6d66408527be0d1c6131d44cec7e86008ba26 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 13 Nov 2008 09:59:33 -0500
Subject: Btrfs: Fix handling of space info full during allocations

When we fail to allocate a new block group, we should still do the
checks to make sure allocations try again with the minimum requested
allocation size.

This also fixes a deadlock that comes from a missed down_read in
the chunk allocation failure handling.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b7530c3ac20..22820f91d2b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2301,9 +2301,9 @@ new_group_no_lock:
 				up_read(&space_info->groups_sem);
 				ret = do_chunk_alloc(trans, root, num_bytes +
 						     2 * 1024 * 1024, data, 1);
-				if (ret < 0)
-					break;
 				down_read(&space_info->groups_sem);
+				if (ret < 0)
+					goto loop_check;
 				head = &space_info->block_groups;
 				/*
 				 * we've allocated a new chunk, keep
@@ -2314,6 +2314,7 @@ new_group_no_lock:
 			} else if (!allowed_chunk_alloc) {
 				space_info->force_alloc = 1;
 			}
+loop_check:
 			if (keep_going) {
 				cur = head->next;
 				extra_loop = 0;
-- 
cgit v1.2.3


From c5c9cd4d1b827fe545ed2a945e91e3a6909f3886 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 12 Nov 2008 14:32:25 -0500
Subject: Btrfs: allow clone of an arbitrary file range

This patch adds an additional CLONE_RANGE ioctl to clone an arbitrary
(block-aligned) file range to another file.  The original CLONE ioctl
becomes a special case of cloning the entire file range.  The logic is a
bit more complex now since ranges may be cloned to different offsets, and
because we may only be cloning the beginning or end of a particular extent
or checksum item.

An additional sanity check ensures the source and destination files aren't
the same (which would previously deadlock), although eventually this could
be extended to allow the duplication of file data at a different offset
within the same file.

Any extents within the destination range in the target file are dropped.

We currently do not cope with the case where a compressed inline extent
needs to be split.  This will probably require decompressing the extent
into a temporary address_space, and inserting just the cloned portion as a
new compressed inline extent.  For now, just return -EINVAL in this case.
Note that this never comes up in the more common case of cloning an entire
file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/ioctl.h |   9 ++
 2 files changed, 221 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9ff2b4e0e92..4d7cc7c504d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -592,7 +592,8 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
+long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
+		       u64 olen, u64 destoff)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -606,12 +607,29 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 	u32 nritems;
 	int slot;
 	int ret;
+	u64 len = olen;
+	u64 bs = root->fs_info->sb->s_blocksize;
+	u64 hint_byte;
 
-	src_file = fget(src_fd);
+	/*
+	 * TODO:
+	 * - split compressed inline extents.  annoying: we need to
+	 *   decompress into destination's address_space (the file offset
+	 *   may change, so source mapping won't do), then recompress (or
+	 *   otherwise reinsert) a subrange.
+	 * - allow ranges within the same file to be cloned (provided
+	 *   they don't overlap)?
+	 */
+
+	src_file = fget(srcfd);
 	if (!src_file)
 		return -EBADF;
 	src = src_file->f_dentry->d_inode;
 
+	ret = -EINVAL;
+	if (src == inode)
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
@@ -640,27 +658,46 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		mutex_lock(&inode->i_mutex);
 	}
 
-	ret = -ENOTEMPTY;
-	if (inode->i_size)
+	/* determine range to clone */
+	ret = -EINVAL;
+	if (off >= src->i_size || off + len > src->i_size)
 		goto out_unlock;
+	if (len == 0)
+		olen = len = src->i_size - off;
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == src->i_size)
+		len = ((src->i_size + bs-1) & ~(bs-1))
+			- off;
+
+	/* verify the end result is block aligned */
+	if ((off & (bs-1)) ||
+	    ((off + len) & (bs-1)))
+		goto out_unlock;
+
+	printk("final src extent is %llu~%llu\n", off, len);
+	printk("final dst extent is %llu~%llu\n", destoff, len);
 
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode, off+len);
 		if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered)
 			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 		if (ordered)
 			btrfs_put_ordered_extent(ordered);
-		btrfs_wait_ordered_range(src, 0, (u64)-1);
+		btrfs_wait_ordered_range(src, off, off+len);
 	}
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
+	/* punch hole in destination first */
+	btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte);
+
+	/* clone data */
 	key.objectid = src->i_ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = 0;
@@ -691,56 +728,178 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		    key.objectid != src->i_ino)
 			break;
 
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY ||
-		    btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+			struct btrfs_file_extent_item *extent;
+			int type;
 			u32 size;
 			struct btrfs_key new_key;
+			u64 disko = 0, diskl = 0;
+			u64 datao = 0, datal = 0;
+			u8 comp;
 
 			size = btrfs_item_size_nr(leaf, slot);
 			read_extent_buffer(leaf, buf,
 					   btrfs_item_ptr_offset(leaf, slot),
 					   size);
+
+			extent = btrfs_item_ptr(leaf, slot,
+						struct btrfs_file_extent_item);
+			comp = btrfs_file_extent_compression(leaf, extent);
+			type = btrfs_file_extent_type(leaf, extent);
+			if (type == BTRFS_FILE_EXTENT_REG) {
+				disko = btrfs_file_extent_disk_bytenr(leaf, extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+				datao = btrfs_file_extent_offset(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf, extent);
+			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+				/* take upper bound, may be compressed */
+				datal = btrfs_file_extent_ram_bytes(leaf,
+								    extent);
+			}
 			btrfs_release_path(root, path);
 
+			if (key.offset + datal < off ||
+			    key.offset >= off+len)
+				goto next;
+
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.objectid = inode->i_ino;
-			ret = btrfs_insert_empty_item(trans, root, path,
-						      &new_key, size);
-			if (ret)
-				goto out;
+			new_key.offset = key.offset + destoff - off;
 
-			leaf = path->nodes[0];
-			slot = path->slots[0];
-			write_extent_buffer(leaf, buf,
+			if (type == BTRFS_FILE_EXTENT_REG) {
+				ret = btrfs_insert_empty_item(trans, root, path,
+							      &new_key, size);
+				if (ret)
+					goto out;
+
+				leaf = path->nodes[0];
+				slot = path->slots[0];
+				write_extent_buffer(leaf, buf,
 					    btrfs_item_ptr_offset(leaf, slot),
 					    size);
-			btrfs_mark_buffer_dirty(leaf);
-		}
-
-		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
-			struct btrfs_file_extent_item *extent;
-			int found_type;
 
-			extent = btrfs_item_ptr(leaf, slot,
+				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
-			found_type = btrfs_file_extent_type(leaf, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG ||
-			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
-				u64 ds = btrfs_file_extent_disk_bytenr(leaf,
-								       extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(leaf,
-								 extent);
-				/* ds == 0 means there's a hole */
-				if (ds != 0) {
+				printk("  orig disk %llu~%llu data %llu~%llu\n",
+				       disko, diskl, datao, datal);
+
+				if (off > key.offset) {
+					datao += off - key.offset;
+					datal -= off - key.offset;
+				}
+				if (key.offset + datao + datal + key.offset >
+				    off + len)
+					datal = off + len - key.offset - datao;
+				/* disko == 0 means it's a hole */
+				if (!disko)
+					datao = 0;
+				printk(" final disk %llu~%llu data %llu~%llu\n",
+				       disko, diskl, datao, datal);
+
+				btrfs_set_file_extent_offset(leaf, extent,
+							     datao);
+				btrfs_set_file_extent_num_bytes(leaf, extent,
+								datal);
+				if (disko) {
+					inode_add_bytes(inode, datal);
 					ret = btrfs_inc_extent_ref(trans, root,
-						     ds, dl, leaf->start,
-						     root->root_key.objectid,
-						     trans->transid,
-						     inode->i_ino);
+						   disko, diskl, leaf->start,
+						   root->root_key.objectid,
+						   trans->transid,
+						   inode->i_ino);
 					BUG_ON(ret);
 				}
+			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
+				u64 skip = 0;
+				u64 trim = 0;
+				if (off > key.offset) {
+					skip = off - key.offset;
+					new_key.offset += skip;
+				}
+				if (key.offset + datal > off+len)
+					trim = key.offset + datal - (off+len);
+				printk("len %lld skip %lld trim %lld\n",
+				       datal, skip, trim);
+				if (comp && (skip || trim)) {
+					printk("btrfs clone_range can't split compressed inline extents yet\n");
+					ret = -EINVAL;
+					goto out;
+				}
+				size -= skip + trim;
+				datal -= skip + trim;
+				ret = btrfs_insert_empty_item(trans, root, path,
+							      &new_key, size);
+				if (ret)
+					goto out;
+
+				if (skip) {
+					u32 start = btrfs_file_extent_calc_inline_size(0);
+					memmove(buf+start, buf+start+skip,
+						datal);
+				}
+
+				leaf = path->nodes[0];
+				slot = path->slots[0];
+				write_extent_buffer(leaf, buf,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    size);
+				inode_add_bytes(inode, datal);
 			}
+
+			btrfs_mark_buffer_dirty(leaf);
 		}
+
+		if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
+			u32 size;
+			struct btrfs_key new_key;
+			u64 coverslen;
+			int coff, clen;
+
+			size = btrfs_item_size_nr(leaf, slot);
+			coverslen = (size / BTRFS_CRC32_SIZE) <<
+				root->fs_info->sb->s_blocksize_bits;
+			printk("csums for %llu~%llu\n",
+			       key.offset, coverslen);
+			if (key.offset + coverslen < off ||
+			    key.offset >= off+len)
+				goto next;
+
+			read_extent_buffer(leaf, buf,
+					   btrfs_item_ptr_offset(leaf, slot),
+					   size);
+			btrfs_release_path(root, path);
+
+			coff = 0;
+			if (off > key.offset)
+				coff = ((off - key.offset) >>
+					root->fs_info->sb->s_blocksize_bits) *
+					BTRFS_CRC32_SIZE;
+			clen = size - coff;
+			if (key.offset + coverslen > off+len)
+				clen -= ((key.offset+coverslen-off-len) >>
+					 root->fs_info->sb->s_blocksize_bits) *
+					BTRFS_CRC32_SIZE;
+			printk(" will dup %d~%d of %d\n",
+			       coff, clen, size);
+
+			memcpy(&new_key, &key, sizeof(new_key));
+			new_key.objectid = inode->i_ino;
+			new_key.offset = key.offset + destoff - off;
+
+			ret = btrfs_insert_empty_item(trans, root, path,
+						      &new_key, clen);
+			if (ret)
+				goto out;
+
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			write_extent_buffer(leaf, buf + coff,
+					    btrfs_item_ptr_offset(leaf, slot),
+					    clen);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
+	next:
 		btrfs_release_path(root, path);
 		key.offset++;
 	}
@@ -749,13 +908,13 @@ out:
 	btrfs_release_path(root, path);
 	if (ret == 0) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode_set_bytes(inode, inode_get_bytes(src));
-		btrfs_i_size_write(inode, src->i_size);
+		if (destoff + olen > inode->i_size)
+			btrfs_i_size_write(inode, destoff + olen);
 		BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
 		ret = btrfs_update_inode(trans, root, inode);
 	}
 	btrfs_end_transaction(trans, root);
-	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
+	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
 	if (ret)
 		vmtruncate(inode, 0);
 out_unlock:
@@ -768,6 +927,16 @@ out_fput:
 	return ret;
 }
 
+long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
+{
+	struct btrfs_ioctl_clone_range_args args;
+
+	if (copy_from_user(&args, (void *)argptr, sizeof(args)))
+		return -EFAULT;
+	return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
+				 args.src_length, args.dest_offset);
+}
+
 /*
  * there are many ways the trans_start and trans_end ioctls can lead
  * to deadlocks.  They should only be used by applications that
@@ -851,7 +1020,9 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
-		return btrfs_ioctl_clone(file, arg);
+		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
+	case BTRFS_IOC_CLONE_RANGE:
+		return btrfs_ioctl_clone_range(file, arg);
 	case BTRFS_IOC_TRANS_START:
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 85ed35a775b..989ba8a0121 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -52,4 +52,13 @@ struct btrfs_ioctl_vol_args {
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
 
+struct btrfs_ioctl_clone_range_args {
+  __s64 src_fd;
+  __u64 src_offset, src_length;
+  __u64 dest_offset;
+};
+
+#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
+				  struct btrfs_ioctl_clone_range_args)
+
 #endif
-- 
cgit v1.2.3


From f3465ca44e2a51fd647c167045768a8ab5a96603 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 12 Nov 2008 14:19:50 -0500
Subject: Btrfs: batch extent inserts/updates/deletions on the extent root

While profiling the allocator I noticed a good amount of time was being spent in
finish_current_insert and del_pending_extents, and as the filesystem filled up
more and more time was being spent in those functions.  This patch aims to try
and reduce that problem.  This happens in two ways:

1) track if we tried to delete an extent that we are going to update or insert.
Once we get into finish_current_insert we discard any of the extents that were
marked for deletion.  This saves us from doing unnecessary work almost every
time finish_current_insert runs.

2) Batch insertion/updates/deletions.  Instead of doing a btrfs_search_slot for
each individual extent and doing the needed operation, we instead keep the leaf
around and see if there is anything else we can do on that leaf.  On the insert
case I introduced a btrfs_insert_some_items, which will take an array of keys
with an array of data_sizes and try and squeeze in as many of those keys as
possible, and then return how many keys it was able to insert.  In the update
case we search for an extent ref, update the ref and then loop through the leaf
to see if any of the other refs we are looking to update are on that leaf, and
then once we are done we release the path and search for the next ref we need to
update.  And finally for the deletion we try and delete the extent+ref in pairs,
so we will try to find extent+ref pairs next to the extent we are trying to free
and free them in bulk if possible.

This along with the other cluster fix that Chris pushed out a bit ago helps make
the allocator perform more uniformly as it fills up the disk.  There is still a
slight drop as we fill up the disk since we start having to stick new blocks in
odd places which results in more COWs than on an empty fs, but the drop is not
nearly as severe as it was before.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.c       | 170 ++++++++++
 fs/btrfs/ctree.h       |   5 +
 fs/btrfs/extent-tree.c | 827 +++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 909 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ac61c50a331..8bb452456d9 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -431,6 +431,25 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 	return 0;
 }
 
+/*
+ * same as comp_keys only with two btrfs_key's
+ */
+static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+{
+	if (k1->objectid > k2->objectid)
+		return 1;
+	if (k1->objectid < k2->objectid)
+		return -1;
+	if (k1->type > k2->type)
+		return 1;
+	if (k1->type < k2->type)
+		return -1;
+	if (k1->offset > k2->offset)
+		return 1;
+	if (k1->offset < k2->offset)
+		return -1;
+	return 0;
+}
 
 /*
  * this is used by the defrag code to go through all the
@@ -2999,6 +3018,157 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * Insert as many of the nr items described by cpu_key[]/data_size[] as will
+ * fit in the leaf found for cpu_key[0]; only keys, offsets and sizes are set.
+ * Returns the number of items inserted, -EEXIST if cpu_key[0] exists, or < 0.
+ */
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+	int ret = 0;
+	int slot;
+	int slot_orig;
+	int i;
+	u32 nritems;
+	u32 total_data = 0;
+	u32 total_size = 0;
+	unsigned int data_end;
+	struct btrfs_disk_key disk_key;
+	struct btrfs_key found_key;
+
+	found_key.objectid = 0;
+	nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
+
+	for (i = 0; i < nr; i++)
+		total_data += data_size[i];
+
+	total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
+	total_size = total_data + (nr * sizeof(struct btrfs_item));
+	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+	if (ret == 0)
+		return -EEXIST;
+	if (ret < 0)
+		goto out;
+
+	slot_orig = path->slots[0];
+	leaf = path->nodes[0];
+
+	nritems = btrfs_header_nritems(leaf);
+	data_end = leaf_data_end(root, leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < total_size) {
+		for (i = nr - 1; i >= 0; i--) {
+			total_data -= data_size[i];
+			total_size -= data_size[i] + sizeof(struct btrfs_item);
+			if (total_size < btrfs_leaf_free_space(root, leaf))
+				break;
+		}
+		nr = i;	/* items 0..i-1 are what still fits */
+	}
+
+	slot = path->slots[0];
+	BUG_ON(slot < 0);
+
+	if (slot != nritems) {
+		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
+
+		item = btrfs_item_nr(leaf, slot);
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/* figure out how many keys we can insert in here */
+		total_data = data_size[0];
+		for (i = 1; i < nr; i++) {
+			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+				break;
+			total_data += data_size[i];
+		}
+		nr = i;
+
+		if (old_data < data_end) {
+			btrfs_print_leaf(root, leaf);
+			printk("slot %d old_data %d data_end %d\n",
+			       slot, old_data, data_end);
+			BUG_ON(1);
+		}
+		/*
+		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
+		 */
+		/* first correct the data pointers */
+		WARN_ON(leaf->map_token);
+		for (i = slot; i < nritems; i++) {
+			u32 ioff;
+
+			item = btrfs_item_nr(leaf, i);
+			if (!leaf->map_token) {
+				map_extent_buffer(leaf, (unsigned long)item,
+					sizeof(struct btrfs_item),
+					&leaf->map_token, &leaf->kaddr,
+					&leaf->map_start, &leaf->map_len,
+					KM_USER1);
+			}
+
+			ioff = btrfs_item_offset(leaf, item);
+			btrfs_set_item_offset(leaf, item, ioff - total_data);
+		}
+		if (leaf->map_token) {
+			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+			leaf->map_token = NULL;
+		}
+
+		/* shift the items */
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
+			      btrfs_item_nr_offset(slot),
+			      (nritems - slot) * sizeof(struct btrfs_item));
+
+		/* shift the data */
+		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
+			      data_end - total_data, btrfs_leaf_data(leaf) +
+			      data_end, old_data - data_end);
+		data_end = old_data;
+	} else {
+		/*
+		 * this sucks but it has to be done, if we are inserting at
+		 * the end of the leaf only insert 1 of the items, since we
+		 * have no way of knowing whats on the next leaf and we'd have
+		 * to drop our current locks to figure it out
+		 */
+		nr = 1;
+	}
+
+	/* setup the item for the new data */
+	for (i = 0; i < nr; i++) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
+		btrfs_set_item_key(leaf, &disk_key, slot + i);
+		item = btrfs_item_nr(leaf, slot + i);
+		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+		data_end -= data_size[i];
+		btrfs_set_item_size(leaf, item, data_size[i]);
+	}
+	btrfs_set_header_nritems(leaf, nritems + nr);
+	btrfs_mark_buffer_dirty(leaf);
+
+	ret = 0;
+	if (slot == 0) {
+		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
+		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+	}
+
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+out:
+	if (!ret)
+		ret = nr;
+	return ret;
+}
+
 /*
  * Given a key and some data, insert items into the tree.
  * This does all the path init required, making room in the tree if needed.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c83cc5b2ded..f575939e025 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1719,6 +1719,11 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, void *data, u32 data_size);
+int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    int nr);
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 22820f91d2b..e785f0a0632 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -42,6 +42,8 @@ struct pending_extent_op {
 	u64 generation;
 	u64 orig_generation;
 	int level;
+	struct list_head list;
+	int del;
 };
 
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
@@ -52,6 +54,13 @@ static struct btrfs_block_group_cache *
 __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
 			 u64 search_start, int data, int owner);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 bytenr, u64 num_bytes, int is_data);
+static int update_block_group(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      u64 bytenr, u64 num_bytes, int alloc,
+			      int mark_free);
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
@@ -559,6 +568,251 @@ out:
 	return ret;
 }
 
+/*
+ * Re-key every backref queued on update_list from (bytenr, orig_parent) to
+ * (bytenr, parent) and set its generation; each op is unlocked and freed.
+ */
+static int noinline update_backrefs(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct list_head *update_list)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct pending_extent_op *op;
+	struct extent_buffer *leaf;
+	int ret = 0;
+	struct list_head *cur = update_list->next;
+	u64 ref_objectid;
+	u64 ref_root = extent_root->root_key.objectid;
+
+	op = list_entry(cur, struct pending_extent_op, list);
+
+search:
+	key.objectid = op->bytenr;
+	key.type = BTRFS_EXTENT_REF_KEY;
+	key.offset = op->orig_parent;
+
+	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
+	BUG_ON(ret);	/* any non-zero search result is fatal here */
+
+	leaf = path->nodes[0];
+
+loop:
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+
+	ref_objectid = btrfs_ref_objectid(leaf, ref);
+
+	if (btrfs_ref_root(leaf, ref) != ref_root ||
+	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
+	    (ref_objectid != op->level &&
+	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
+		printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
+		       "owner %u\n", op->bytenr, op->orig_parent,
+		       ref_root, op->level);
+		btrfs_print_leaf(extent_root, leaf);
+		BUG();
+	}
+
+	key.objectid = op->bytenr;
+	key.offset = op->parent;
+	key.type = BTRFS_EXTENT_REF_KEY;
+	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
+	BUG_ON(ret);
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
+	btrfs_set_ref_generation(leaf, ref, op->generation);
+
+	cur = cur->next;	/* advance before op is unlinked and freed */
+
+	list_del_init(&op->list);
+	unlock_extent(&info->extent_ins, op->bytenr,
+		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
+	kfree(op);
+
+	if (cur == update_list) {
+		btrfs_mark_buffer_dirty(path->nodes[0]);
+		btrfs_release_path(extent_root, path);
+		goto out;
+	}
+
+	op = list_entry(cur, struct pending_extent_op, list);
+
+	path->slots[0]++;
+	while (path->slots[0] < btrfs_header_nritems(leaf)) {
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid == op->bytenr &&
+		    key.type == BTRFS_EXTENT_REF_KEY)
+			goto loop;	/* NOTE(review): offset not compared */
+		path->slots[0]++;
+	}
+
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(extent_root, path);
+	goto search;	/* no ref for the next op on this leaf */
+
+out:
+	return 0;
+}
+
+static int noinline insert_extents(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *extent_root,
+				   struct btrfs_path *path,
+				   struct list_head *insert_list, int nr)
+{
+	struct btrfs_key *keys;
+	u32 *data_size;
+	struct pending_extent_op *op;
+	struct extent_buffer *leaf;
+	struct list_head *cur = insert_list->next;
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	u64 ref_root = extent_root->root_key.objectid;
+	int i = 0, last = 0, ret;
+	int total = nr * 2;	/* two keys per op: extent item + backref */
+	/* inserts an EXTENT_ITEM + EXTENT_REF pair per op; no-op if !nr */
+	if (!nr)
+		return 0;
+
+	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
+	if (!keys)
+		return -ENOMEM;
+
+	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
+	if (!data_size) {
+		kfree(keys);
+		return -ENOMEM;
+	}
+
+	list_for_each_entry(op, insert_list, list) {
+		keys[i].objectid = op->bytenr;
+		keys[i].offset = op->num_bytes;
+		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
+		data_size[i] = sizeof(struct btrfs_extent_item);
+		i++;
+
+		keys[i].objectid = op->bytenr;
+		keys[i].offset = op->parent;
+		keys[i].type = BTRFS_EXTENT_REF_KEY;
+		data_size[i] = sizeof(struct btrfs_extent_ref);
+		i++;
+	}
+
+	op = list_entry(cur, struct pending_extent_op, list);
+	i = 0;
+	while (i < total) {
+		int c;
+		ret = btrfs_insert_some_items(trans, extent_root, path,
+					      keys+i, data_size+i, total-i);
+		BUG_ON(ret < 0);
+
+		if (last && ret > 1)
+			BUG();
+
+		leaf = path->nodes[0];
+		for (c = 0; c < ret; c++) {
+			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
+
+			/*
+			 * if the first item we inserted was a backref, then
+			 * the EXTENT_ITEM will be the odd c's, else it will
+			 * be the even c's
+			 */
+			if ((ref_first && (c % 2)) ||
+			    (!ref_first && !(c % 2))) {
+				struct btrfs_extent_item *itm;
+
+				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
+						     struct btrfs_extent_item);
+				btrfs_set_extent_refs(path->nodes[0], itm, 1);
+				op->del++;
+			} else {
+				struct btrfs_extent_ref *ref;
+
+				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
+						     struct btrfs_extent_ref);
+				btrfs_set_ref_root(leaf, ref, ref_root);
+				btrfs_set_ref_generation(leaf, ref,
+							 op->generation);
+				btrfs_set_ref_objectid(leaf, ref, op->level);
+				btrfs_set_ref_num_refs(leaf, ref, 1);
+				op->del++;
+			}
+
+			/*
+			 * using del to see when it's ok to free up the
+			 * pending_extent_op.  In the case where we insert the
+			 * last item on the list in order to help do batching
+			 * we need to not free the extent op until we actually
+			 * insert the extent_item
+			 */
+			if (op->del == 2) {
+				unlock_extent(&info->extent_ins, op->bytenr,
+					      op->bytenr + op->num_bytes - 1,
+					      GFP_NOFS);
+				cur = cur->next;
+				list_del_init(&op->list);
+				kfree(op);
+				if (cur != insert_list)
+					op = list_entry(cur,
+						struct pending_extent_op,
+						list);
+			}
+		}
+		btrfs_mark_buffer_dirty(leaf);
+		btrfs_release_path(extent_root, path);
+
+		/*
+		 * Ok backrefs and items usually go right next to each other,
+		 * but if we could only insert 1 item that means that we
+		 * inserted on the end of a leaf, and we have no idea what may
+		 * be on the next leaf so we just play it safe.  In order to
+		 * try and help this case we insert the last thing on our
+		 * insert list so hopefully it will end up being the last
+		 * thing on the leaf and everything else will be before it,
+		 * which will let us insert a whole bunch of items at the same
+		 * time.
+		 */
+		if (ret == 1 && !last && (i + ret < total)) {
+			/*
+			 * last: where we will pick up the next time around
+			 * i: our current key to insert, will be total - 1
+			 * cur: the last op on the list, whose backref key
+			 * op:  (keys[total - 1]) is inserted next
+			 */
+			last = i + ret;
+			i = total - 1;
+			cur = insert_list->prev;
+			op = list_entry(cur, struct pending_extent_op, list);
+		} else if (last) {
+			/*
+			 * ok we successfully inserted the last item on the
+			 * list, lets reset everything
+			 *
+			 * i: our current key to insert, so where we left off
+			 *    last time
+			 * last: done with this
+			 * cur: back to the first op still on the list
+			 * op: the pending_extent_op for cur
+			 * total: since we inserted the last key, we need to
+			 *        decrement total so we dont overflow
+			 */
+			i = last;
+			last = 0;
+			cur = insert_list->next;
+			op = list_entry(cur, struct pending_extent_op, list);
+			total--;
+		} else {
+			i += ret;
+		}
+
+		cond_resched();
+	}
+	ret = 0;
+	kfree(keys);
+	kfree(data_size);
+	return ret;
+}
+
 static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -642,6 +896,267 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int noinline free_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *extent_root,
+				 struct list_head *del_list)
+{
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_path *path;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	struct list_head *cur;
+	struct pending_extent_op *op;
+	struct btrfs_extent_item *ei;
+	int ret, num_to_del, extent_slot = 0, found_extent = 0;
+	u32 refs;
+	u64 bytes_freed = 0;
+	/* drops one ref per op on del_list, deleting items in bulk if it can */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+search:
+	/* search for the backref for the current ref we want to delete */
+	cur = del_list->next;
+	op = list_entry(cur, struct pending_extent_op, list);
+	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
+				    op->orig_parent,
+				    extent_root->root_key.objectid,
+				    op->orig_generation, op->level, 1);
+	if (ret) {
+		printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
+		       "owner %u\n", op->bytenr,
+		       extent_root->root_key.objectid, op->orig_generation,
+		       op->level);
+		btrfs_print_leaf(extent_root, path->nodes[0]);
+		WARN_ON(1);
+		goto out;
+	}
+
+	extent_slot = path->slots[0];
+	num_to_del = 1;
+	found_extent = 0;
+
+	/*
+	 * if we aren't the first item on the leaf we can move back one and see
+	 * if our ref is right next to our extent item
+	 */
+	if (likely(extent_slot)) {
+		extent_slot--;
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      extent_slot);
+		if (found_key.objectid == op->bytenr &&
+		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+		    found_key.offset == op->num_bytes) {
+			num_to_del++;
+			found_extent = 1;
+		}
+	}
+
+	/*
+	 * if we didn't find the extent we need to delete the backref and then
+	 * search for the extent item key so we can update its ref count
+	 */
+	if (!found_extent) {
+		key.objectid = op->bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = op->num_bytes;
+
+		ret = remove_extent_backref(trans, extent_root, path);
+		BUG_ON(ret);
+		btrfs_release_path(extent_root, path);
+		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
+		BUG_ON(ret);
+		extent_slot = path->slots[0];
+	}
+
+	/* this is where we update the ref count for the extent */
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	BUG_ON(refs == 0);
+	refs--;
+	btrfs_set_extent_refs(leaf, ei, refs);
+
+	btrfs_mark_buffer_dirty(leaf);
+
+	/*
+	 * This extent needs deleting.  cur_slot starts at extent_slot +
+	 * num_to_del: if the backref was not next to the extent we delete
+	 * 1 item and scan from the slot after extent_slot; if it was, we
+	 * delete 2 items and scan from the slot after the ref.
+	 *
+	 * NOTE(review): pin_down_bytes() is only called for the ops that follow
+	 * 'op' in the loop below, never for 'op' itself -- confirm intended.
+	 */
+	if (!refs) {
+		struct list_head *pos, *n, *end;
+		int cur_slot = extent_slot+num_to_del;
+		u64 super_used;
+		u64 root_used;
+
+		path->slots[0] = extent_slot;
+		bytes_freed = op->num_bytes;
+
+		/*
+		 * we need to see if we can delete multiple things at once, so
+		 * start looping through the list of extents we are wanting to
+		 * delete and see if their extent/backrefs are right next to
+		 * each other and the extents only have 1 ref
+		 */
+		for (pos = cur->next; pos != del_list; pos = pos->next) {
+			struct pending_extent_op *tmp;
+
+			tmp = list_entry(pos, struct pending_extent_op, list);
+
+			/* we only want to delete extent+ref at this stage */
+			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
+				break;
+
+			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
+			if (found_key.objectid != tmp->bytenr ||
+			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
+			    found_key.offset != tmp->num_bytes)
+				break;
+
+			/* check to make sure this extent only has one ref */
+			ei = btrfs_item_ptr(leaf, cur_slot,
+					    struct btrfs_extent_item);
+			if (btrfs_extent_refs(leaf, ei) != 1)
+				break;
+
+			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
+			if (found_key.objectid != tmp->bytenr ||
+			    found_key.type != BTRFS_EXTENT_REF_KEY ||
+			    found_key.offset != tmp->orig_parent)
+				break;
+
+			/*
+			 * the ref is right next to the extent, we can set the
+			 * ref count to 0 since we will delete them both now
+			 */
+			btrfs_set_extent_refs(leaf, ei, 0);
+
+			/* pin down the bytes for this extent */
+			mutex_lock(&info->pinned_mutex);
+			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
+					     tmp->num_bytes, tmp->level >=
+					     BTRFS_FIRST_FREE_OBJECTID);
+			mutex_unlock(&info->pinned_mutex);
+			BUG_ON(ret < 0);
+
+			/*
+			 * use the del field to tell if we need to go ahead and
+			 * free up the extent when we delete the item or not.
+			 */
+			tmp->del = ret;
+			bytes_freed += tmp->num_bytes;
+
+			num_to_del += 2;
+			cur_slot += 2;
+		}
+		end = pos;	/* first op that was NOT folded into this batch */
+
+		/* update the free space counters */
+		spin_lock_irq(&info->delalloc_lock);
+		super_used = btrfs_super_bytes_used(&info->super_copy);
+		btrfs_set_super_bytes_used(&info->super_copy,
+					   super_used - bytes_freed);
+		spin_unlock_irq(&info->delalloc_lock);
+
+		root_used = btrfs_root_used(&extent_root->root_item);
+		btrfs_set_root_used(&extent_root->root_item,
+				    root_used - bytes_freed);
+
+		/* delete the items */
+		ret = btrfs_del_items(trans, extent_root, path,
+				      path->slots[0], num_to_del);
+		BUG_ON(ret);
+
+		/*
+		 * loop through the extents we deleted and do the cleanup work
+		 * on them
+		 */
+		for (pos = cur, n = pos->next; pos != end;
+		     pos = n, n = pos->next) {
+			struct pending_extent_op *tmp;
+#ifdef BIO_RW_DISCARD
+			u64 map_length;
+			struct btrfs_multi_bio *multi = NULL;
+#endif
+			tmp = list_entry(pos, struct pending_extent_op, list);
+
+			/*
+			 * remember tmp->del tells us whether or not we pinned
+			 * down the extent
+			 */
+			ret = update_block_group(trans, extent_root,
+						 tmp->bytenr, tmp->num_bytes, 0,
+						 tmp->del);
+			BUG_ON(ret);
+
+#ifdef BIO_RW_DISCARD
+			ret = btrfs_map_block(&info->mapping_tree, READ,
+					      tmp->bytenr, &map_length, &multi,
+					      0);
+			if (!ret) {
+				struct btrfs_bio_stripe *stripe;
+				int i;
+
+				stripe = multi->stripe;
+
+				if (map_length > tmp->num_bytes)
+					map_length = tmp->num_bytes;
+
+				for (i = 0; i < multi->num_stripes;
+				     i++, stripe++)
+					blkdev_issue_discard(stripe->dev->bdev,
+							stripe->physical >> 9,
+							map_length >> 9);
+				kfree(multi);
+			}
+#endif
+			list_del_init(&tmp->list);
+			unlock_extent(&info->extent_ins, tmp->bytenr,
+				      tmp->bytenr + tmp->num_bytes - 1,
+				      GFP_NOFS);
+			kfree(tmp);
+		}
+	} else if (refs && found_extent) {
+		/*
+		 * the ref and extent were right next to each other, but the
+		 * extent still has a ref, so just free the backref and keep
+		 * going
+		 */
+		ret = remove_extent_backref(trans, extent_root, path);
+		BUG_ON(ret);
+
+		list_del_init(&op->list);
+		unlock_extent(&info->extent_ins, op->bytenr,
+			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
+		kfree(op);
+	} else {
+		/*
+		 * the extent has multiple refs and the backref we were looking
+		 * for was not right next to it, so just unlock and go next,
+		 * we're good to go
+		 */
+		list_del_init(&op->list);
+		unlock_extent(&info->extent_ins, op->bytenr,
+			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
+		kfree(op);
+	}
+
+	btrfs_release_path(extent_root, path);
+	if (!list_empty(del_list))
+		goto search;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 bytenr,
 				     u64 orig_parent, u64 parent,
@@ -685,6 +1200,8 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			extent_op->generation = ref_generation;
 			extent_op->orig_generation = orig_generation;
 			extent_op->level = (int)owner_objectid;
+			INIT_LIST_HEAD(&extent_op->list);
+			extent_op->del = 0;
 
 			set_extent_bits(&root->fs_info->extent_ins,
 					bytenr, bytenr + num_bytes - 1,
@@ -1426,9 +1943,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
-		if (!cache) {
+		if (!cache)
 			return -1;
-		}
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
@@ -1605,102 +2121,176 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 end;
 	u64 priv;
 	u64 search = 0;
+	u64 skipped = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
-	struct btrfs_extent_ref *ref;
-	struct pending_extent_op *extent_op;
-	struct btrfs_key key;
-	struct btrfs_extent_item extent_item;
+	struct pending_extent_op *extent_op, *tmp;
+	struct list_head insert_list, update_list;
 	int ret;
-	int err = 0;
+	int num_inserts = 0, max_inserts;
 
-	btrfs_set_stack_extent_refs(&extent_item, 1);
 	path = btrfs_alloc_path();
+	INIT_LIST_HEAD(&insert_list);
+	INIT_LIST_HEAD(&update_list);
 
-	while(1) {
-		mutex_lock(&info->extent_ins_mutex);
+	max_inserts = extent_root->leafsize /
+		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
+		 sizeof(struct btrfs_extent_ref) +
+		 sizeof(struct btrfs_extent_item));
+again:
+	mutex_lock(&info->extent_ins_mutex);
+	while (1) {
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			mutex_unlock(&info->extent_ins_mutex);
-			if (search && all) {
-				search = 0;
+			if (skipped && all && !num_inserts) {
+				skipped = 0;
 				continue;
 			}
+			mutex_unlock(&info->extent_ins_mutex);
 			break;
 		}
 
 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
+			skipped = 1;
 			search = end + 1;
-			mutex_unlock(&info->extent_ins_mutex);
-			cond_resched();
+			if (need_resched()) {
+				mutex_unlock(&info->extent_ins_mutex);
+				cond_resched();
+				mutex_lock(&info->extent_ins_mutex);
+			}
 			continue;
 		}
-		BUG_ON(ret < 0);
 
 		ret = get_state_private(&info->extent_ins, start, &priv);
 		BUG_ON(ret);
-		extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-		mutex_unlock(&info->extent_ins_mutex);
+		extent_op = (struct pending_extent_op *)(unsigned long) priv;
 
 		if (extent_op->type == PENDING_EXTENT_INSERT) {
-			key.objectid = start;
-			key.offset = end + 1 - start;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			err = btrfs_insert_item(trans, extent_root, &key,
-					&extent_item, sizeof(extent_item));
-			BUG_ON(err);
+			num_inserts++;
+			list_add_tail(&extent_op->list, &insert_list);
+			search = end + 1;
+			if (num_inserts == max_inserts) {
+				mutex_unlock(&info->extent_ins_mutex);
+				break;
+			}
+		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
+			list_add_tail(&extent_op->list, &update_list);
+			search = end + 1;
+		} else {
+			BUG();
+		}
+	}
 
-			mutex_lock(&info->extent_ins_mutex);
-			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_WRITEBACK, GFP_NOFS);
-			mutex_unlock(&info->extent_ins_mutex);
+	/*
+	 * process the update list, clear the writeback bit for it, and if
+	 * somebody marked this thing for deletion then just unlock it and be
+	 * done, the free_extents will handle it
+	 */
+	mutex_lock(&info->extent_ins_mutex);
+	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
+		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+				  extent_op->bytenr + extent_op->num_bytes - 1,
+				  EXTENT_WRITEBACK, GFP_NOFS);
+		if (extent_op->del) {
+			list_del_init(&extent_op->list);
+			unlock_extent(&info->extent_ins, extent_op->bytenr,
+				      extent_op->bytenr + extent_op->num_bytes
+				      - 1, GFP_NOFS);
+			kfree(extent_op);
+		}
+	}
+	mutex_unlock(&info->extent_ins_mutex);
 
-			err = insert_extent_backref(trans, extent_root, path,
-						start, extent_op->parent,
-						extent_root->root_key.objectid,
-						extent_op->generation,
-						extent_op->level);
-			BUG_ON(err);
-		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-			err = lookup_extent_backref(trans, extent_root, path,
-						start, extent_op->orig_parent,
-						extent_root->root_key.objectid,
-						extent_op->orig_generation,
-						extent_op->level, 0);
-			BUG_ON(err);
+	/*
+	 * still have things left on the update list, go ahead and update
+	 * everything
+	 */
+	if (!list_empty(&update_list)) {
+		ret = update_backrefs(trans, extent_root, path, &update_list);
+		BUG_ON(ret);
+	}
 
-			mutex_lock(&info->extent_ins_mutex);
-			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_WRITEBACK, GFP_NOFS);
-			mutex_unlock(&info->extent_ins_mutex);
+	/*
+	 * if no inserts need to be done, but we skipped some extents and we
+	 * need to make sure everything is cleaned then reset everything and
+	 * go back to the beginning
+	 */
+	if (!num_inserts && all && skipped) {
+		search = 0;
+		skipped = 0;
+		INIT_LIST_HEAD(&update_list);
+		INIT_LIST_HEAD(&insert_list);
+		goto again;
+	} else if (!num_inserts) {
+		goto out;
+	}
 
-			key.objectid = start;
-			key.offset = extent_op->parent;
-			key.type = BTRFS_EXTENT_REF_KEY;
-			err = btrfs_set_item_key_safe(trans, extent_root, path,
-						      &key);
-			BUG_ON(err);
-			ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					     struct btrfs_extent_ref);
-			btrfs_set_ref_generation(path->nodes[0], ref,
-						 extent_op->generation);
-			btrfs_mark_buffer_dirty(path->nodes[0]);
-			btrfs_release_path(extent_root, path);
-		} else {
-			BUG_ON(1);
+	/*
+	 * process the insert extents list.  Again if we are deleting this
+	 * extent, then just unlock it, pin down the bytes if need be, and be
+	 * done with it.  Saves us from having to actually insert the extent
+	 * into the tree and then subsequently come along and delete it
+	 */
+	mutex_lock(&info->extent_ins_mutex);
+	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
+		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
+				  extent_op->bytenr + extent_op->num_bytes - 1,
+				  EXTENT_WRITEBACK, GFP_NOFS);
+		if (extent_op->del) {
+			list_del_init(&extent_op->list);
+			unlock_extent(&info->extent_ins, extent_op->bytenr,
+				      extent_op->bytenr + extent_op->num_bytes
+				      - 1, GFP_NOFS);
+
+			mutex_lock(&extent_root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, extent_root,
+					     extent_op->bytenr,
+					     extent_op->num_bytes, 0);
+			mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
+			ret = update_block_group(trans, extent_root,
+						 extent_op->bytenr,
+						 extent_op->num_bytes,
+						 0, ret > 0);
+			BUG_ON(ret);
+			kfree(extent_op);
+			num_inserts--;
 		}
-		kfree(extent_op);
-		unlock_extent(&info->extent_ins, start, end, GFP_NOFS);
-		if (all)
-			search = 0;
-		else
-			search = end + 1;
+	}
+	mutex_unlock(&info->extent_ins_mutex);
 
-		cond_resched();
+	ret = insert_extents(trans, extent_root, path, &insert_list,
+			     num_inserts);
+	BUG_ON(ret);
+
+	/*
+	 * if we broke out of the loop in order to insert stuff because we hit
+	 * the maximum number of inserts at a time we can handle, then loop
+	 * back and pick up where we left off
+	 */
+	if (num_inserts == max_inserts) {
+		INIT_LIST_HEAD(&insert_list);
+		INIT_LIST_HEAD(&update_list);
+		num_inserts = 0;
+		goto again;
 	}
+
+	/*
+	 * again, if we need to make absolutely sure there are no more pending
+	 * extent operations left and we know that we skipped some, go back to
+	 * the beginning and do it all again
+	 */
+	if (all && skipped) {
+		INIT_LIST_HEAD(&insert_list);
+		INIT_LIST_HEAD(&update_list);
+		search = 0;
+		skipped = 0;
+		num_inserts = 0;
+		goto again;
+	}
+out:
 	btrfs_free_path(path);
 	return 0;
 }
@@ -1802,6 +2392,12 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			btrfs_release_path(extent_root, path);
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
+			if (ret) {
+				printk(KERN_ERR "umm, got %d back from search"
+				       ", was looking for %Lu\n", ret,
+				       bytenr);
+				btrfs_print_leaf(extent_root, path->nodes[0]);
+			}
 			BUG_ON(ret);
 			extent_slot = path->slots[0];
 		}
@@ -1921,32 +2517,42 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	u64 end;
 	u64 priv;
 	u64 search = 0;
+	int nr = 0, skipped = 0;
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *extent_ins;
 	struct pending_extent_op *extent_op;
 	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct list_head delete_list;
 
+	INIT_LIST_HEAD(&delete_list);
 	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
 
+again:
+	mutex_lock(&info->extent_ins_mutex);
 	while(1) {
-		mutex_lock(&info->extent_ins_mutex);
 		ret = find_first_extent_bit(pending_del, search, &start, &end,
 					    EXTENT_WRITEBACK);
 		if (ret) {
-			mutex_unlock(&info->extent_ins_mutex);
-			if (all && search) {
+			if (all && skipped && !nr) {
 				search = 0;
 				continue;
 			}
+			mutex_unlock(&info->extent_ins_mutex);
 			break;
 		}
 
 		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
 			search = end+1;
-			mutex_unlock(&info->extent_ins_mutex);
-			cond_resched();
+			skipped = 1;
+
+			if (need_resched()) {
+				mutex_unlock(&info->extent_ins_mutex);
+				cond_resched();
+				mutex_lock(&info->extent_ins_mutex);
+			}
+
 			continue;
 		}
 		BUG_ON(ret < 0);
@@ -1959,15 +2565,8 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 				  GFP_NOFS);
 		if (!test_range_bit(extent_ins, start, end,
 				    EXTENT_WRITEBACK, 0)) {
-			mutex_unlock(&info->extent_ins_mutex);
-free_extent:
-			ret = __free_extent(trans, extent_root,
-					    start, end + 1 - start,
-					    extent_op->orig_parent,
-					    extent_root->root_key.objectid,
-					    extent_op->orig_generation,
-					    extent_op->level, 1, 0);
-			kfree(extent_op);
+			list_add_tail(&extent_op->list, &delete_list);
+			nr++;
 		} else {
 			kfree(extent_op);
 
@@ -1980,10 +2579,12 @@ free_extent:
 			clear_extent_bits(&info->extent_ins, start, end,
 					  EXTENT_WRITEBACK, GFP_NOFS);
 
-			mutex_unlock(&info->extent_ins_mutex);
-
-			if (extent_op->type == PENDING_BACKREF_UPDATE)
-				goto free_extent;
+			if (extent_op->type == PENDING_BACKREF_UPDATE) {
+				list_add_tail(&extent_op->list, &delete_list);
+				search = end + 1;
+				nr++;
+				continue;
+			}
 
 			mutex_lock(&extent_root->fs_info->pinned_mutex);
 			ret = pin_down_bytes(trans, extent_root, start,
@@ -1993,19 +2594,34 @@ free_extent:
 			ret = update_block_group(trans, extent_root, start,
 						end + 1 - start, 0, ret > 0);
 
+			unlock_extent(extent_ins, start, end, GFP_NOFS);
 			BUG_ON(ret);
 			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
-		unlock_extent(extent_ins, start, end, GFP_NOFS);
 
-		if (all)
-			search = 0;
-		else
-			search = end + 1;
-		cond_resched();
+		search = end + 1;
+
+		if (need_resched()) {
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			mutex_lock(&info->extent_ins_mutex);
+		}
 	}
+
+	if (nr) {
+		ret = free_extents(trans, extent_root, &delete_list);
+		BUG_ON(ret);
+	}
+
+	if (all && skipped) {
+		INIT_LIST_HEAD(&delete_list);
+		search = 0;
+		nr = 0;
+		goto again;
+	}
+
 	return err;
 }
 
@@ -2024,7 +2640,29 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 	WARN_ON(num_bytes < root->sectorsize);
 	if (root == extent_root) {
-		struct pending_extent_op *extent_op;
+		struct pending_extent_op *extent_op = NULL;
+
+		mutex_lock(&root->fs_info->extent_ins_mutex);
+		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
+				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
+			u64 priv;
+			ret = get_state_private(&root->fs_info->extent_ins,
+						bytenr, &priv);
+			BUG_ON(ret);
+			extent_op = (struct pending_extent_op *)
+						(unsigned long)priv;
+
+			extent_op->del = 1;
+			if (extent_op->type == PENDING_EXTENT_INSERT) {
+				mutex_unlock(&root->fs_info->extent_ins_mutex);
+				return 0;
+			}
+		}
+
+		if (extent_op) {
+			ref_generation = extent_op->orig_generation;
+			parent = extent_op->orig_parent;
+		}
 
 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
 		BUG_ON(!extent_op);
@@ -2037,8 +2675,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		extent_op->generation = ref_generation;
 		extent_op->orig_generation = ref_generation;
 		extent_op->level = (int)owner_objectid;
+		INIT_LIST_HEAD(&extent_op->list);
+		extent_op->del = 0;
 
-		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
 				EXTENT_WRITEBACK, GFP_NOFS);
@@ -2515,6 +3154,8 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 		extent_op->generation = ref_generation;
 		extent_op->orig_generation = 0;
 		extent_op->level = (int)owner;
+		INIT_LIST_HEAD(&extent_op->list);
+		extent_op->del = 0;
 
 		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-- 
cgit v1.2.3


From c146afad2c7fea6a366d4945c1bab9b03880f526 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 12 Nov 2008 14:34:12 -0500
Subject: Btrfs: mount ro and remount support

This patch adds mount ro and remount support. The main
changes in the patch are: adding btrfs_remount and related
helper functions; splitting the transaction-related code
out of close_ctree into btrfs_commit_super; updating the
allocator to properly handle read-only block groups.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     | 83 ++++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/disk-io.h     |  2 ++
 fs/btrfs/extent-tree.c | 61 +++++++++++++++++++++----------------
 fs/btrfs/inode.c       | 15 +++------
 fs/btrfs/ioctl.c       | 22 +++++++++++++
 fs/btrfs/super.c       | 37 ++++++++++++++++++++--
 7 files changed, 163 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f575939e025..c4c6c127323 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -541,6 +541,7 @@ struct btrfs_space_info {
 	u64 bytes_used;
 	u64 bytes_pinned;
 	u64 bytes_reserved;
+	u64 bytes_readonly;
 	int full;
 	int force_alloc;
 	struct list_head list;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3b0e974a9e9..c599f0ee997 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1075,10 +1075,12 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-	ret = btrfs_find_dead_roots(fs_info->tree_root,
-				    root->root_key.objectid, root);
-	BUG_ON(ret);
-
+	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_find_dead_roots(fs_info->tree_root,
+					    root->root_key.objectid, root);
+		BUG_ON(ret);
+		btrfs_orphan_cleanup(root);
+	}
 	return root;
 }
 
@@ -1700,7 +1702,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_read_block_groups(extent_root);
 
-	fs_info->generation = btrfs_super_generation(disk_super) + 1;
+	fs_info->generation = generation + 1;
+	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
@@ -1715,6 +1718,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!fs_info->transaction_kthread)
 		goto fail_cleaner;
 
+	if (sb->s_flags & MS_RDONLY)
+		return tree_root;
+
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u32 blocksize;
 		u64 bytenr = btrfs_super_log_root(disk_super);
@@ -1735,7 +1741,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 	}
-	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 
 	ret = btrfs_cleanup_reloc_trees(tree_root);
 	BUG_ON(ret);
@@ -1955,28 +1960,69 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
-int close_ctree(struct btrfs_root *root)
+/* call btrfs_find_dead_roots() and btrfs_orphan_cleanup() on every fs root */
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 {
+	u64 root_objectid = 0;
+	struct btrfs_root *gang[8];
+	int i, nr;	/* nr: roots returned by the current lookup */
 	int ret;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	fs_info->closing = 1;
-	smp_mb();
+	while (1) {
+		nr = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					    (void **)gang, root_objectid,
+					    ARRAY_SIZE(gang));
+		if (!nr)
+			break;
+		root_objectid = gang[nr - 1]->root_key.objectid + 1;
+		for (i = 0; i < nr; i++) {
+			ret = btrfs_find_dead_roots(fs_info->tree_root,
+					gang[i]->root_key.objectid, gang[i]);
+			BUG_ON(ret);
+			btrfs_orphan_cleanup(gang[i]);
+		}
+	}
+	return 0;
+}
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
+int btrfs_commit_super(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
 
+	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_clean_old_snapshots(root);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
-	/* run commit again to  drop the original snapshot */
+	BUG_ON(ret);
+	/* run commit again to drop the original snapshot */
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 
-	write_ctree_super(NULL, root);
+	ret = write_ctree_super(NULL, root);
+	return ret;
+}
+
+int close_ctree(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	int ret;
+
+	fs_info->closing = 1;
+	smp_mb();
+
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
+	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
+		ret =  btrfs_commit_super(root);
+		if (ret) {
+			printk("btrfs: commit super returns %d\n", ret);
+		}
+	}
 
 	if (fs_info->delalloc_bytes) {
 		printk("btrfs: at unmount delalloc count %Lu\n",
@@ -2000,12 +2046,10 @@ int close_ctree(struct btrfs_root *root)
 		free_extent_buffer(root->fs_info->dev_root->node);
 
 	btrfs_free_block_groups(root->fs_info);
-	fs_info->closing = 2;
-	del_fs_roots(fs_info);
 
-	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+	del_fs_roots(fs_info);
 
-	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
+	iput(fs_info->btree_inode);
 
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2014,7 +2058,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
-	iput(fs_info->btree_inode);
 #if 0
 	while(!list_empty(&fs_info->hashers)) {
 		struct btrfs_hasher *hasher;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b8d5948fa27..717e94811e4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -38,6 +38,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
+int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -49,6 +50,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 					       struct btrfs_key *location);
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location);
+int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
 int btrfs_insert_dev_radix(struct btrfs_root *root,
 			   struct block_device *bdev,
 			   u64 device_id,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e785f0a0632..af2de30dbea 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1794,7 +1794,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		*space_info = found;
 		return 0;
 	}
-	found = kmalloc(sizeof(*found), GFP_NOFS);
+	found = kzalloc(sizeof(*found), GFP_NOFS);
 	if (!found)
 		return -ENOMEM;
 
@@ -1807,6 +1807,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_used = bytes_used;
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
+	found->bytes_readonly = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
@@ -1829,6 +1830,19 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
+static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
+{
+	spin_lock(&cache->space_info->lock);
+	spin_lock(&cache->lock);
+	if (!cache->ro) {
+		cache->space_info->bytes_readonly += cache->key.offset -
+					btrfs_block_group_used(&cache->item);
+		cache->ro = 1;
+	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&cache->space_info->lock);
+}
+
 static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices = root->fs_info->fs_devices->num_devices;
@@ -1865,7 +1879,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 thresh;
 	u64 start;
 	u64 num_bytes;
-	int ret = 0, waited = 0;
+	int ret = 0;
+
+	mutex_lock(&extent_root->fs_info->chunk_mutex);
 
 	flags = reduce_alloc_profile(extent_root, flags);
 
@@ -1887,46 +1903,28 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	thresh = div_factor(space_info->total_bytes, 6);
+	thresh = space_info->total_bytes - space_info->bytes_readonly;
+	thresh = div_factor(thresh, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned +
 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
-
 	spin_unlock(&space_info->lock);
 
-	ret = mutex_trylock(&extent_root->fs_info->chunk_mutex);
-	if (!ret && !force) {
-		goto out;
-	} else if (!ret) {
-		mutex_lock(&extent_root->fs_info->chunk_mutex);
-		waited = 1;
-	}
-
-	if (waited) {
-		spin_lock(&space_info->lock);
-		if (space_info->full) {
-			spin_unlock(&space_info->lock);
-			goto out_unlock;
-		}
-		spin_unlock(&space_info->lock);
-	}
-
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
 	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		goto out_unlock;
+		goto out;
 	}
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-out_unlock:
-	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
+	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
 }
 
@@ -1956,12 +1954,18 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (alloc) {
 			old_val += num_bytes;
 			cache->space_info->bytes_used += num_bytes;
+			if (cache->ro) {
+				cache->space_info->bytes_readonly -= num_bytes;
+				WARN_ON(1);
+			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->space_info->bytes_used -= num_bytes;
+			if (cache->ro)
+				cache->space_info->bytes_readonly += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
@@ -5560,8 +5564,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	BUG_ON(IS_ERR(reloc_inode));
 
 	__alloc_chunk_for_shrink(root, block_group, 1);
-	block_group->ro = 1;
-	block_group->space_info->total_bytes -= block_group->key.offset;
+	set_block_group_readonly(block_group);
 
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
@@ -5868,6 +5871,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
 	BUG_ON(!block_group);
+	BUG_ON(!block_group->ro);
 
 	memcpy(&key, &block_group->key, sizeof(key));
 
@@ -5881,6 +5885,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	list_del(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
 
+	spin_lock(&block_group->space_info->lock);
+	block_group->space_info->total_bytes -= block_group->key.offset;
+	block_group->space_info->bytes_readonly -= block_group->key.offset;
+	spin_unlock(&block_group->space_info->lock);
+
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
 	kfree(shrink_block_group);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2ed2deacde9..3e3620e69bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1808,10 +1808,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
-	/* don't do orphan cleanup if the fs is readonly. */
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return;
@@ -3050,7 +3046,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret, new, do_orphan = 0;
+	int ret, new;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -3076,13 +3072,9 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		if (new && root != sub_root) {
 			igrab(inode);
 			sub_root->inode = inode;
-			do_orphan = 1;
 		}
 	}
 
-	if (unlikely(do_orphan))
-		btrfs_orphan_cleanup(sub_root);
-
 	return d_splice_alias(inode, dentry);
 }
 
@@ -3237,7 +3229,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (root->fs_info->closing > 1)
+	if (root->fs_info->btree_inode == inode)
 		return 0;
 
 	if (wait) {
@@ -4625,6 +4617,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	struct inode *inode;
 	unsigned long flags;
 
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	while(!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4d7cc7c504d..52863cebd59 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	int namelen;
 	int mod = 0;
 
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
 	if (!vol_args)
@@ -478,6 +481,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 	int namelen;
 	int ret;
 
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
 	if (!vol_args)
@@ -534,6 +540,11 @@ static int btrfs_ioctl_defrag(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		return ret;
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
@@ -575,6 +586,9 @@ long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
 	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
 	if (!vol_args)
@@ -621,6 +635,10 @@ long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
 	 *   they don't overlap)?
 	 */
 
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		return ret;
+
 	src_file = fget(srcfd);
 	if (!src_file)
 		return -EBADF;
@@ -958,6 +976,10 @@ long btrfs_ioctl_trans_start(struct file *file)
 		goto out;
 	}
 
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		goto out;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	root->fs_info->open_ioctl_trans++;
 	mutex_unlock(&root->fs_info->trans_mutex);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ab9d5e89ed1..04a3bf81650 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -370,6 +370,9 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	int ret;
 	root = btrfs_sb(sb);
 
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
 	sb->s_dirt = 0;
 	if (!wait) {
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
@@ -438,7 +441,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			up_write(&s->s_umount);
 			deactivate_super(s);
 			error = -EBUSY;
-			goto error_bdev;
+			goto error_close_devices;
 		}
 
 	} else {
@@ -487,7 +490,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 
 error_s:
 	error = PTR_ERR(s);
-error_bdev:
+error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
@@ -495,6 +498,35 @@ error:
 	return error;
 }
 
+static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct btrfs_root *root = btrfs_sb(sb);
+	int ret;
+
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+		return 0;
+
+	if (*flags & MS_RDONLY) {
+		sb->s_flags |= MS_RDONLY;
+
+		ret =  btrfs_commit_super(root);
+		WARN_ON(ret);
+	} else {
+		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+			return -EINVAL;
+
+		ret = btrfs_cleanup_reloc_trees(root);
+		WARN_ON(ret);
+
+		ret = btrfs_cleanup_fs_roots(root->fs_info);
+		WARN_ON(ret);
+
+		sb->s_flags &= ~MS_RDONLY;
+	}
+
+	return 0;
+}
+
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -582,6 +614,7 @@ static struct super_operations btrfs_super_ops = {
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
+	.remount_fs	= btrfs_remount,
 	.write_super_lockfs = btrfs_write_super_lockfs,
 	.unlockfs	= btrfs_unlockfs,
 };
-- 
cgit v1.2.3


From 2b82032c34ec40515d3c45c36cd1961f37977de8 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 17 Nov 2008 21:11:30 -0500
Subject: Btrfs: Seed device support

A seed device is a special btrfs with the SEEDING super flag
set; it can only be mounted in read-only mode. Seed
devices allow people to create a new btrfs on top of them.

The new FS contains the same contents as the seed device,
but it can be mounted in read-write mode.

This patch does the following:

1) split code in btrfs_alloc_chunk into two parts. The first part makes
the newly allocated chunk usable, but does not do any operation that modifies
the chunk tree. The second part does the chunk tree modifications. This
division is for the bootstrap step of adding storage to the seed device.

2) Update device management code to handle seed devices.
The basic idea is: For an FS grown from seed devices, its
seed devices are put into a list. Seed devices are
opened on demand at mounting time. If any seed device is
missing or has been changed, the btrfs kernel module will
refuse to mount the FS.

3) make btrfs_find_block_group not return NULL when all
block groups are read-only.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c       |    8 +
 fs/btrfs/ctree.h       |   18 +-
 fs/btrfs/disk-io.c     |   56 ++-
 fs/btrfs/extent-tree.c |   31 +-
 fs/btrfs/ioctl.c       |    2 +-
 fs/btrfs/super.c       |    9 +
 fs/btrfs/volumes.c     | 1131 ++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/volumes.h     |   20 +-
 8 files changed, 946 insertions(+), 329 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8bb452456d9..dd1c03aea2d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -185,6 +185,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(cow, new_root_objectid);
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
+	write_extent_buffer(cow, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(cow),
+			    BTRFS_FSID_SIZE);
+
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
 	kfree(new_root);
@@ -274,6 +278,10 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(cow, root->root_key.objectid);
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
+	write_extent_buffer(cow, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(cow),
+			    BTRFS_FSID_SIZE);
+
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		u32 nr_extents;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4c6c127323..5ff74282a62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -177,6 +177,9 @@ struct btrfs_dev_item {
 	/* type and info about this device */
 	__le64 type;
 
+	/* expected generation for this device */
+	__le64 generation;
+
 	/* grouping information for allocation decisions */
 	__le32 dev_group;
 
@@ -188,6 +191,9 @@ struct btrfs_dev_item {
 
 	/* btrfs generated uuid for this device */
 	u8 uuid[BTRFS_UUID_SIZE];
+
+	/* uuid of FS who owns this device */
+	u8 fsid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
@@ -263,6 +269,7 @@ struct btrfs_header {
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
 
 /*
  * this is a very generous portion of the super block, giving us
@@ -278,7 +285,7 @@ struct btrfs_header {
 struct btrfs_super_block {
 	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 4 fields must match struct btrfs_header */
-	u8 fsid[16];    /* FS specific uuid */
+	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
 	__le64 bytenr; /* this block number */
 	__le64 flags;
 
@@ -941,6 +948,7 @@ BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
 BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
 BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
 
 BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
@@ -960,12 +968,19 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
 			 seek_speed, 8);
 BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
 			 bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+			 generation, 64);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
+static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+	return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
 BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
@@ -1661,6 +1676,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *buf, u64 orig_start);
 int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c599f0ee997..82833e5d84b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -345,6 +345,25 @@ out:
 	return 0;
 }
 
+/*
+ * Return 0 when the fsid stored in eb's header equals the fsid of
+ * root's fs_devices or of any entry on its ->seed chain, 1 otherwise.
+ */
+static int check_tree_block_fsid(struct btrfs_root *root,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	u8 fsid[BTRFS_FSID_SIZE];	/* same constant as the read/compare */
+
+	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+			   BTRFS_FSID_SIZE);
+	for (; fs_devices; fs_devices = fs_devices->seed) {
+		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE))
+			return 0;
+	}
+	return 1;
+}
+
 int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -382,9 +401,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ret = -EIO;
 		goto err;
 	}
-	if (memcmp_extent_buffer(eb, root->fs_info->fsid,
-				 (unsigned long)btrfs_header_fsid(eb),
-				 BTRFS_FSID_SIZE)) {
+	if (check_tree_block_fsid(root, eb)) {
 		printk("bad fsid on block %Lu\n", eb->start);
 		ret = -EIO;
 		goto err;
@@ -1558,9 +1575,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
-	err = btrfs_parse_options(tree_root, options);
-	if (err)
+	ret = btrfs_parse_options(tree_root, options);
+	if (ret) {
+		err = ret;
 		goto fail_sb_buffer;
+	}
 
 	/*
 	 * we need to start all the end_io workers up front because the
@@ -1610,18 +1629,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
-	err = -EINVAL;
-	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
-		printk("Btrfs: wanted %llu devices, but found %llu\n",
-		       (unsigned long long)btrfs_super_num_devices(disk_super),
-		       (unsigned long long)fs_devices->open_devices);
-		if (btrfs_test_opt(tree_root, DEGRADED))
-			printk("continuing in degraded mode\n");
-		else {
-			goto fail_sb_buffer;
-		}
-	}
-
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
 				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
@@ -1672,7 +1679,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
 	mutex_unlock(&fs_info->chunk_mutex);
-	BUG_ON(ret);
+	if (ret) {
+		printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
+		goto fail_chunk_root;
+	}
 
 	btrfs_close_extra_devices(fs_devices);
 
@@ -1684,7 +1694,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
 	if (!tree_root->node)
-		goto fail_sb_buffer;
+		goto fail_chunk_root;
 
 
 	ret = find_and_setup_root(tree_root, fs_info,
@@ -1753,6 +1763,8 @@ fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
+fail_chunk_root:
+	free_extent_buffer(chunk_root->node);
 fail_sys_array:
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -1823,9 +1835,10 @@ int write_all_supers(struct btrfs_root *root)
 			total_errors++;
 			continue;
 		}
-		if (!dev->in_fs_metadata)
+		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
+		btrfs_set_stack_device_generation(dev_item, 0);
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
 		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
@@ -1834,6 +1847,7 @@ int write_all_supers(struct btrfs_root *root)
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
@@ -1881,7 +1895,7 @@ int write_all_supers(struct btrfs_root *root)
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata)
+		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
 		BUG_ON(!dev->pending_io);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index af2de30dbea..197422c1dc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -355,7 +355,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_first_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data) && !shint->ro) {
+		if (shint && block_group_bits(shint, data)) {
 			spin_lock(&shint->lock);
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned + shint->reserved <
@@ -366,7 +366,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			spin_unlock(&shint->lock);
 		}
 	}
-	if (hint && !hint->ro && block_group_bits(hint, data)) {
+	if (hint && block_group_bits(hint, data)) {
 		spin_lock(&hint->lock);
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned + hint->reserved <
@@ -392,7 +392,7 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (!cache->ro && block_group_bits(cache, data)) {
+		if (block_group_bits(cache, data)) {
 			free_check = div_factor(cache->key.offset, factor);
 			if (used + cache->pinned + cache->reserved <
 			    free_check) {
@@ -1843,9 +1843,9 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
 	spin_unlock(&cache->space_info->lock);
 }
 
-static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	u64 num_devices = root->fs_info->fs_devices->num_devices;
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 
 	if (num_devices == 1)
 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -1877,13 +1877,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_space_info *space_info;
 	u64 thresh;
-	u64 start;
-	u64 num_bytes;
 	int ret = 0;
 
 	mutex_lock(&extent_root->fs_info->chunk_mutex);
 
-	flags = reduce_alloc_profile(extent_root, flags);
+	flags = btrfs_reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
@@ -1913,16 +1911,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&space_info->lock);
 
-	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
+	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		goto out;
 	}
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
-		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
-	BUG_ON(ret);
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -3040,7 +3033,7 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	data = reduce_alloc_profile(root, data);
+	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
@@ -5136,7 +5129,8 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 			else
 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
 		}
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		if (nodes[0] &&
+		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			eb = path->nodes[0];
 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
 						      group, reloc_inode);
@@ -5377,7 +5371,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = root->fs_info->fs_devices->num_devices;
+	num_devices = root->fs_info->fs_devices->rw_devices;
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -5801,6 +5795,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		BUG_ON(ret);
 
 		set_avail_alloc_bits(root->fs_info, cache->flags);
+		if (btrfs_chunk_readonly(root, cache->key.objectid))
+			set_block_group_readonly(cache);
 	}
 	ret = 0;
 error:
@@ -5889,6 +5885,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	spin_unlock(&block_group->space_info->lock);
+	block_group->space_info->full = 0;
 
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 52863cebd59..f43df72b0e1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -405,7 +405,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		devid = simple_strtoull(devstr, &end, 10);
 		printk(KERN_INFO "resizing devid %llu\n", devid);
 	}
-	device = btrfs_find_device(root, devid, NULL);
+	device = btrfs_find_device(root, devid, NULL, NULL);
 	if (!device) {
 		printk(KERN_INFO "resizer unable to find device %llu\n", devid);
 		ret = -EINVAL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 04a3bf81650..92393cc60d0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -431,6 +431,11 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (error)
 		goto error_free_subvol_name;
 
+	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
+		goto error_close_devices;
+	}
+
 	bdev = fs_devices->latest_bdev;
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
 	if (IS_ERR(s))
@@ -444,6 +449,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
+		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -512,6 +518,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		ret =  btrfs_commit_super(root);
 		WARN_ON(ret);
 	} else {
+		if (root->fs_info->fs_devices->rw_devices == 0)
+			return -EACCES;
+
 		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 80a27284dbf..d6f1996de62 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,12 @@ struct map_lookup {
 	struct btrfs_bio_stripe stripes[];
 };
 
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -69,25 +75,31 @@ static void unlock_chunks(struct btrfs_root *root)
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct list_head *uuid_cur;
-	struct list_head *devices_cur;
 	struct btrfs_device *dev;
 
-	list_for_each(uuid_cur, &fs_uuids) {
-		fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
-					list);
+	while (!list_empty(&fs_uuids)) {
+		fs_devices = list_entry(fs_uuids.next,
+					struct btrfs_fs_devices, list);
+		list_del(&fs_devices->list);
 		while(!list_empty(&fs_devices->devices)) {
-			devices_cur = fs_devices->devices.next;
-			dev = list_entry(devices_cur, struct btrfs_device,
-					 dev_list);
+			dev = list_entry(fs_devices->devices.next,
+					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
 				fs_devices->open_devices--;
 			}
+			fs_devices->num_devices--;
+			if (dev->writeable)
+				fs_devices->rw_devices--;
 			list_del(&dev->dev_list);
+			list_del(&dev->dev_alloc_list);
 			kfree(dev->name);
 			kfree(dev);
 		}
+		WARN_ON(fs_devices->num_devices);
+		WARN_ON(fs_devices->open_devices);
+		WARN_ON(fs_devices->rw_devices);
+		kfree(fs_devices);
 	}
 	return 0;
 }
@@ -257,6 +269,9 @@ static noinline int device_list_add(const char *path,
 				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
+		if (fs_devices->opened)
+			return -EBUSY;
+
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device) {
 			/* we can safely leave the fs_devices entry around */
@@ -273,8 +288,9 @@ static noinline int device_list_add(const char *path,
 			kfree(device);
 			return -ENOMEM;
 		}
+		INIT_LIST_HEAD(&device->dev_alloc_list);
 		list_add(&device->dev_list, &fs_devices->devices);
-		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
 
@@ -288,58 +304,94 @@ static noinline int device_list_add(const char *path,
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
+	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each(cur, head) {
+	list_for_each_safe(cur, tmp, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		if (!device->in_fs_metadata) {
-			struct block_device *bdev;
-			list_del(&device->dev_list);
-			list_del(&device->dev_alloc_list);
+		if (device->in_fs_metadata)
+			continue;
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			fs_devices->open_devices--;
+		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			device->writeable = 0;
+			fs_devices->rw_devices--;
+		}
+		if (!seed_devices) {
+			list_del_init(&device->dev_list);
 			fs_devices->num_devices--;
-			if (device->bdev) {
-				bdev = device->bdev;
-				fs_devices->open_devices--;
-				mutex_unlock(&uuid_mutex);
-				close_bdev_excl(bdev);
-				mutex_lock(&uuid_mutex);
-			}
 			kfree(device->name);
 			kfree(device);
-			goto again;
 		}
 	}
+
+	if (fs_devices->seed) {
+		fs_devices = fs_devices->seed;
+		seed_devices = 1;
+		goto again;
+	}
+
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
 
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
+again:
+	if (--fs_devices->opened > 0)
+		return 0;
 
-	mutex_lock(&uuid_mutex);
-	list_for_each(cur, head) {
+	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
 			fs_devices->open_devices--;
 		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			fs_devices->rw_devices--;
+		}
+
 		device->bdev = NULL;
+		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
-	fs_devices->mounted = 0;
-	mutex_unlock(&uuid_mutex);
+	fs_devices->opened = 0;
+	fs_devices->seeding = 0;
+	fs_devices->sprouted = 0;
+
+	seed_devices = fs_devices->seed;
+	fs_devices->seed = NULL;
+	if (seed_devices) {
+		fs_devices = seed_devices;
+		goto again;
+	}
 	return 0;
 }
 
-int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);	/* locked wrapper around __btrfs_close_devices() */
+	ret = __btrfs_close_devices(fs_devices);
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -350,24 +402,18 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct btrfs_super_block *disk_super;
 	u64 latest_devid = 0;
 	u64 latest_transid = 0;
-	u64 transid;
 	u64 devid;
+	int seeding = 1;
 	int ret = 0;
 
-	mutex_lock(&uuid_mutex);
-	if (fs_devices->mounted)
-		goto out;
-
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
 			continue;
-
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, flags, holder);
-
+		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -387,16 +433,32 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (devid != device->devid)
 			goto error_brelse;
 
-		transid = btrfs_super_generation(disk_super);
-		if (!latest_transid || transid > latest_transid) {
+		if (memcmp(device->uuid, disk_super->dev_item.uuid,
+			   BTRFS_UUID_SIZE))
+			goto error_brelse;
+
+		device->generation = btrfs_super_generation(disk_super);
+		if (!latest_transid || device->generation > latest_transid) {
 			latest_devid = devid;
-			latest_transid = transid;
+			latest_transid = device->generation;
 			latest_bdev = bdev;
 		}
 
+		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+			device->writeable = 0;
+		} else {
+			device->writeable = !bdev_read_only(bdev);
+			seeding = 0;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
 		fs_devices->open_devices++;
+		if (device->writeable) {
+			fs_devices->rw_devices++;
+			list_add(&device->dev_alloc_list,
+				 &fs_devices->alloc_list);
+		}
 		continue;
 
 error_brelse:
@@ -410,11 +472,32 @@ error:
 		ret = -EIO;
 		goto out;
 	}
-	fs_devices->mounted = 1;
+	fs_devices->seeding = seeding;
+	fs_devices->opened = 1;
 	fs_devices->latest_bdev = latest_bdev;
 	fs_devices->latest_devid = latest_devid;
 	fs_devices->latest_trans = latest_transid;
+	fs_devices->total_rw_bytes = 0;
 out:
+	return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	if (fs_devices->opened) {
+		if (fs_devices->sprouted) {	/* sprouted seed: refuse another open */
+			ret = -EBUSY;
+		} else {
+			fs_devices->opened++;	/* already open: only bump the open count */
+			ret = 0;
+		}
+	} else {
+		ret = __btrfs_open_devices(fs_devices, holder);	/* flags is not passed on */
+	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
 }
@@ -481,12 +564,12 @@ error:
  */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_device *device,
-					 struct btrfs_path *path,
 					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
 	u64 hole_size = 0;
 	u64 last_byte = 0;
 	u64 search_start = 0;
@@ -496,8 +579,11 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	int start_found;
 	struct extent_buffer *l;
 
-	start_found = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 	path->reada = 2;
+	start_found = 0;
 
 	/* FIXME use last free of some kind */
 
@@ -581,7 +667,6 @@ check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
 	 */
-	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
 	if (*start + num_bytes > search_end) {
@@ -589,10 +674,10 @@ check_pending:
 		goto error;
 	}
 	/* check for pending inserts here */
-	return 0;
+	ret = 0;
 
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -644,11 +729,10 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset,
-			   u64 num_bytes, u64 *start)
+			   u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -662,13 +746,8 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret) {
-		goto err;
-	}
-
 	key.objectid = device->devid;
-	key.offset = *start;
+	key.offset = start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*extent));
@@ -687,7 +766,6 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
-err:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -735,12 +813,18 @@ error:
 	return ret;
 }
 
-static noinline int find_next_devid(struct btrfs_root *root,
-				    struct btrfs_path *path, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	struct btrfs_path *path;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
@@ -763,7 +847,7 @@ static noinline int find_next_devid(struct btrfs_root *root,
 	}
 	ret = 0;
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -781,7 +865,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
@@ -789,13 +872,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_next_devid(root, path, &free_devid);
-	if (ret)
-		goto out;
-
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
-	key.offset = free_devid;
+	key.offset = device->devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*dev_item));
@@ -805,8 +884,8 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
-	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_generation(leaf, dev_item, 0);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
@@ -819,9 +898,11 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+	ptr = (unsigned long)btrfs_device_fsid(dev_item);
+	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
-	ret = 0;
 
+	ret = 0;
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -832,11 +913,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_path *path;
-	struct block_device *bdev = device->bdev;
-	struct btrfs_device *next_dev;
 	struct btrfs_key key;
-	u64 total_bytes;
-	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_trans_handle *trans;
 
 	root = root->fs_info->chunk_root;
@@ -863,25 +940,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	ret = btrfs_del_item(trans, root, path);
 	if (ret)
 		goto out;
-
-	/*
-	 * at this point, the device is zero sized.  We want to
-	 * remove it from the devices list and zero out the old super
-	 */
-	list_del_init(&device->dev_list);
-	list_del_init(&device->dev_alloc_list);
-	fs_devices = root->fs_info->fs_devices;
-
-	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
-			      dev_list);
-	if (bdev == root->fs_info->sb->s_bdev)
-		root->fs_info->sb->s_bdev = next_dev->bdev;
-	if (bdev == fs_devices->latest_bdev)
-		fs_devices->latest_bdev = next_dev->bdev;
-
-	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
-	btrfs_set_super_num_devices(&root->fs_info->super_copy,
-				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
 	unlock_chunks(root);
@@ -892,11 +950,14 @@ out:
 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
+	struct btrfs_device *next_device;
 	struct block_device *bdev;
 	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
+	u64 num_devices;
+	u8 *dev_uuid;
 	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
@@ -907,14 +968,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
+	    root->fs_info->fs_devices->rw_devices <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
+	    root->fs_info->fs_devices->rw_devices <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
@@ -941,15 +1002,15 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			printk("btrfs: no missing devices found to remove\n");
 			goto out;
 		}
-
 	} else {
-		bdev = open_bdev_excl(device_path, 0,
+		bdev = open_bdev_excl(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
 		}
 
+		set_blocksize(bdev, 4096);
 		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
 		if (!bh) {
 			ret = -EIO;
@@ -957,45 +1018,97 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
-		if (memcmp(disk_super->fsid, root->fs_info->fsid,
-			   BTRFS_FSID_SIZE)) {
+			    sizeof(disk_super->magic))) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
-		device = btrfs_find_device(root, devid, NULL);
+		dev_uuid = disk_super->dev_item.uuid;
+		device = btrfs_find_device(root, devid, dev_uuid,
+					   disk_super->fsid);
 		if (!device) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
+	}
 
+	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+		printk("btrfs: unable to remove the only writeable device\n");
+		ret = -EINVAL;
+		goto error_brelse;
+	}
+
+	if (device->writeable) {
+		list_del_init(&device->dev_alloc_list);
+		root->fs_info->fs_devices->rw_devices--;
 	}
-	root->fs_info->fs_devices->num_devices--;
-	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
 		goto error_brelse;
 
-
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
 		goto error_brelse;
 
-	if (bh) {
+	device->in_fs_metadata = 0;
+	if (device->fs_devices == root->fs_info->fs_devices) {
+		list_del_init(&device->dev_list);
+		root->fs_info->fs_devices->num_devices--;
+		if (device->bdev)
+			device->fs_devices->open_devices--;
+	}
+
+	next_device = list_entry(root->fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (device->bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_device->bdev;
+	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		brelse(bh);
+		if (bdev)
+			close_bdev_excl(bdev);
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			device->fs_devices->open_devices--;
+		}
+		if (device->fs_devices->open_devices == 0) {
+			struct btrfs_fs_devices *fs_devices;
+			fs_devices = root->fs_info->fs_devices;
+			while (fs_devices) {
+				if (fs_devices->seed == device->fs_devices)
+					break;
+				fs_devices = fs_devices->seed;
+			}
+			fs_devices->seed = device->fs_devices->seed;
+			device->fs_devices->seed = NULL;
+			__btrfs_close_devices(device->fs_devices);
+		}
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * at this point, the device is zero sized.  We want to
+	 * remove it from the devices list and zero out the old super
+	 */
+	if (device->writeable) {
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
-
-		brelse(bh);
 	}
+	brelse(bh);
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
@@ -1021,6 +1134,129 @@ out:
 	return ret;
 }
 
+/* does all the dirty work required for changing the file system's UUID;
+ * returns 0, -EINVAL if not a once-opened seeding fs, or -ENOMEM.
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *old_devices;
+	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_device *device;
+	u64 super_flags;
+
+	BUG_ON(!mutex_is_locked(&uuid_mutex));	/* caller must hold uuid_mutex */
+	if (!fs_devices->seeding || fs_devices->opened != 1)
+		return -EINVAL;
+
+	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!old_devices)
+		return -ENOMEM;
+	/* old_devices: a copy of fs_devices that takes over both device lists */
+	memcpy(old_devices, fs_devices, sizeof(*old_devices));
+	old_devices->opened = 1;
+	old_devices->sprouted = 1;
+	INIT_LIST_HEAD(&old_devices->devices);
+	INIT_LIST_HEAD(&old_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &old_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
+	list_for_each_entry(device, &old_devices->devices, dev_list) {
+		device->fs_devices = old_devices;
+	}
+	list_add(&old_devices->list, &fs_uuids);
+	/* the mounted fs_devices restarts with emptied lists, seeded by old_devices */
+	fs_devices->seeding = 0;
+	fs_devices->num_devices = 0;
+	fs_devices->open_devices = 0;
+	fs_devices->seed = old_devices;
+	/* new random fsid for fs_devices, fs_info and super_copy; drop SEEDING */
+	generate_random_uuid(fs_devices->fsid);
+	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	super_flags = btrfs_super_flags(disk_super) &
+		      ~BTRFS_SUPER_FLAG_SEEDING;
+	btrfs_set_super_flags(disk_super, super_flags);
+
+	return 0;
+}
+
+/* store the expected generation for seed devices in device items: walk the
+ * DEV_ITEMs in the chunk tree and stamp those whose fs_devices is seeding.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_device *device;
+	struct btrfs_key key;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+	u64 devid;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	/* device items are keyed in the chunk tree; begin the scan at offset 0 */
+	root = root->fs_info->chunk_root;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_DEV_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto error;
+
+		leaf = path->nodes[0];
+next_slot:
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret > 0)
+				break;	/* NOTE(review): presumably no further leaves - confirm */
+			if (ret < 0)
+				goto error;
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+			btrfs_release_path(root, path);
+			continue;	/* search again from the key just read */
+		}
+		/* the scan ends at the first key that is not a device item */
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+		    key.type != BTRFS_DEV_ITEM_KEY)
+			break;
+		/* find the in-memory device matching devid, device uuid and fsid */
+		dev_item = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_dev_item);
+		devid = btrfs_device_id(leaf, dev_item);
+		read_extent_buffer(leaf, dev_uuid,
+				   (unsigned long)btrfs_device_uuid(dev_item),
+				   BTRFS_UUID_SIZE);
+		read_extent_buffer(leaf, fs_uuid,
+				   (unsigned long)btrfs_device_fsid(dev_item),
+				   BTRFS_UUID_SIZE);
+		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+		BUG_ON(!device);
+		/* only devices whose fs_devices is seeding get a generation stored */
+		if (device->fs_devices->seeding) {
+			btrfs_set_device_generation(leaf, dev_item,
+						    device->generation);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
+		path->slots[0]++;
+		goto next_slot;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -1028,26 +1264,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct block_device *bdev;
 	struct list_head *cur;
 	struct list_head *devices;
+	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
+	int seeding_dev = 0;
 	int ret = 0;
 
+	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+		return -EINVAL;
 
 	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
 
+	if (root->fs_info->fs_devices->seeding) {
+		seeding_dev = 1;
+		down_write(&sb->s_umount);
+		mutex_lock(&uuid_mutex);
+	}
+
 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
-	trans = btrfs_start_transaction(root, 1);
-	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
-			goto out;
+			goto error;
 		}
 	}
 
@@ -1055,18 +1299,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!device) {
 		/* we can safely leave the fs_devices entry around */
 		ret = -ENOMEM;
-		goto out_close_bdev;
+		goto error;
 	}
 
-	device->barriers = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
 	if (!device->name) {
 		kfree(device);
-		goto out_close_bdev;
+		ret = -ENOMEM;
+		goto error;
 	}
+
+	ret = find_next_devid(root, &device->devid);
+	if (ret) {
+		kfree(device);
+		goto error;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
+
+	device->barriers = 1;
+	device->writeable = 1;
+	device->work.func = pending_bios_fn;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->generation = trans->transid;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
 	device->sector_size = root->sectorsize;
@@ -1074,12 +1331,22 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	set_blocksize(device->bdev, 4096);
 
-	ret = btrfs_add_device(trans, root, device);
-	if (ret)
-		goto out_close_bdev;
+	if (seeding_dev) {
+		sb->s_flags &= ~MS_RDONLY;
+		ret = btrfs_prepare_sprout(trans, root);
+		BUG_ON(ret);
+	}
 
-	set_blocksize(device->bdev, 4096);
+	device->fs_devices = root->fs_info->fs_devices;
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
+	root->fs_info->fs_devices->rw_devices++;
+	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
@@ -1089,20 +1356,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes + 1);
 
-	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &root->fs_info->fs_devices->alloc_list);
-	root->fs_info->fs_devices->num_devices++;
-	root->fs_info->fs_devices->open_devices++;
-out:
+	if (seeding_dev) {
+		ret = init_first_rw_device(trans, root, device);
+		BUG_ON(ret);
+		ret = btrfs_finish_sprout(trans, root);
+		BUG_ON(ret);
+	} else {
+		ret = btrfs_add_device(trans, root, device);
+	}
+
 	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->volume_mutex);
+	btrfs_commit_transaction(trans, root);
 
-	return ret;
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
 
-out_close_bdev:
+		ret = btrfs_relocate_sys_chunks(root);
+		BUG_ON(ret);
+	}
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	return ret;
+error:
 	close_bdev_excl(bdev);
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
+	}
 	goto out;
 }
 
@@ -1160,7 +1441,15 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = new_size - device->total_bytes;
 
+	if (!device->writeable)
+		return -EACCES;
+	if (new_size <= device->total_bytes)
+		return -EINVAL;
+
 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
+	device->fs_devices->total_rw_bytes += diff;
+
+	device->total_bytes = new_size;
 	return btrfs_update_device(trans, device);
 }
 
@@ -1248,7 +1537,6 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-
 int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
@@ -1308,24 +1596,82 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
-	BUG_ON(ret);
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	/* once for us */
+	free_extent_map(em);
+
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 chunk_tree = chunk_root->root_key.objectid;
+	u64 chunk_type;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0);
+
+		ret = btrfs_previous_item(chunk_root, path, key.objectid,
+					  key.type);
+		if (ret < 0)
+			goto error;
+		if (ret > 0)
+			break;
 
-	spin_lock(&em_tree->lock);
-	remove_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-	kfree(map);
-	em->bdev = NULL;
+		chunk = btrfs_item_ptr(leaf, path->slots[0],
+				       struct btrfs_chunk);
+		chunk_type = btrfs_chunk_type(leaf, chunk);
+		btrfs_release_path(chunk_root, path);
 
-	/* once for the tree */
-	free_extent_map(em);
-	/* once for us */
-	free_extent_map(em);
+		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+						   found_key.objectid,
+						   found_key.offset);
+			BUG_ON(ret);
+		}
 
-	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	return 0;
+		if (found_key.offset == 0)
+			break;
+		key.offset = found_key.offset - 1;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static u64 div_factor(u64 num, int factor)
@@ -1337,7 +1683,6 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
@@ -1353,6 +1698,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
 
+	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
 
 	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
@@ -1363,7 +1710,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
-		if (device->total_bytes - device->bytes_used > size_to_free)
+		if (!device->writeable ||
+		    device->total_bytes - device->bytes_used > size_to_free)
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -1453,6 +1801,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = device->total_bytes - new_size;
 
+	if (new_size >= device->total_bytes)
+		return -EINVAL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1469,6 +1819,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	lock_chunks(root);
 
 	device->total_bytes = new_size;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes -= diff;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
 		unlock_chunks(root);
@@ -1561,32 +1913,27 @@ static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
 		return calc_size * num_stripes;
 }
 
-
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u64 type)
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct map_lookup **map_ret,
+			       u64 *num_bytes, u64 *stripe_size,
+			       u64 start, u64 type)
 {
-	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-	struct btrfs_path *path;
-	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
-	struct btrfs_chunk *chunk;
-	struct list_head private_devs;
-	struct list_head *dev_list;
+	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct list_head *cur;
+	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
-	struct map_lookup *map;
 	struct extent_map *em;
+	struct list_head private_devs;
 	int min_stripe_size = 1 * 1024 * 1024;
-	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
 	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
-	u64 percent_max;
+	u64 dev_offset;
 	int num_stripes = 1;
 	int min_stripes = 1;
 	int sub_stripes = 0;
@@ -1594,19 +1941,17 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int ret;
 	int index;
 	int stripe_len = 64 * 1024;
-	struct btrfs_key key;
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
 		WARN_ON(1);
 		type &= ~BTRFS_BLOCK_GROUP_DUP;
 	}
-	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
-	if (list_empty(dev_list))
+	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1614,14 +1959,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2,
-			    extent_root->fs_info->fs_devices->open_devices);
+		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
@@ -1641,15 +1985,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* we don't want a chunk larger than 10% of the FS */
-	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
-	max_chunk_size = min(percent_max, max_chunk_size);
+	/* we don't want a chunk larger than 10% of writeable space */
+	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+			     max_chunk_size);
 
 again:
+	if (!map || map->num_stripes != num_stripes) {
+		kfree(map);
+		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+		if (!map)
+			return -ENOMEM;
+		map->num_stripes = num_stripes;
+	}
+
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -1662,8 +2010,7 @@ again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
-	INIT_LIST_HEAD(&private_devs);
-	cur = dev_list->next;
+	cur = fs_devices->alloc_list.next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
@@ -1679,10 +2026,10 @@ again:
 	if (!looped)
 		min_free += 1024 * 1024;
 
-	/* build a private list of devices we will allocate from */
+	INIT_LIST_HEAD(&private_devs);
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-
+		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
 			avail = device->total_bytes - device->bytes_used;
 		else
@@ -1690,24 +2037,28 @@ again:
 		cur = cur->next;
 
 		if (device->in_fs_metadata && avail >= min_free) {
-			u64 ignored_start = 0;
-			ret = find_free_dev_extent(trans, device, path,
-						   min_free,
-						   &ignored_start);
+			ret = find_free_dev_extent(trans, device,
+						   min_free, &dev_offset);
 			if (ret == 0) {
 				list_move_tail(&device->dev_alloc_list,
 					       &private_devs);
+				map->stripes[index].dev = device;
+				map->stripes[index].physical = dev_offset;
 				index++;
-				if (type & BTRFS_BLOCK_GROUP_DUP)
+				if (type & BTRFS_BLOCK_GROUP_DUP) {
+					map->stripes[index].dev = device;
+					map->stripes[index].physical =
+						dev_offset + calc_size;
 					index++;
+				}
 			}
 		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
-		if (cur == dev_list)
+		if (cur == &fs_devices->alloc_list)
 			break;
 	}
+	list_splice(&private_devs, &fs_devices->alloc_list);
 	if (index < num_stripes) {
-		list_splice(&private_devs, dev_list);
 		if (index >= min_stripes) {
 			num_stripes = index;
 			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -1722,115 +2073,246 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
-		btrfs_free_path(path);
+		kfree(map);
 		return -ENOSPC;
 	}
-	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
-	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-			      &key.offset);
-	if (ret) {
-		btrfs_free_path(path);
-		return ret;
-	}
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
-	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
+	*map_ret = map;
+	*stripe_size = calc_size;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-	if (!map) {
-		kfree(chunk);
-		btrfs_free_path(path);
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		kfree(map);
 		return -ENOMEM;
 	}
-	btrfs_free_path(path);
-	path = NULL;
+	em->bdev = (struct block_device *)map;
+	em->start = start;
+	em->len = *num_bytes;
+	em->block_start = 0;
+	em->block_len = em->len;
 
-	stripes = &chunk->stripe;
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
+	free_extent_map(em);
 
-	index = 0;
-	while(index < num_stripes) {
-		struct btrfs_stripe *stripe;
-		BUG_ON(list_empty(&private_devs));
-		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, *num_bytes);
+	BUG_ON(ret);
 
-		/* loop over this device again if we're doing a dup group */
-		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
-		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_alloc_list, dev_list);
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-			     info->chunk_root->root_key.objectid,
-			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
-			     calc_size, &dev_offset);
+				info->chunk_root->root_key.objectid,
+				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				start, dev_offset, calc_size);
 		BUG_ON(ret);
-		device->bytes_used += calc_size;
+		index++;
+	}
+
+	return 0;
+}
+
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				struct map_lookup *map, u64 chunk_offset,
+				u64 chunk_size, u64 stripe_size)
+{
+	u64 dev_offset;
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_device *device;
+	struct btrfs_chunk *chunk;
+	struct btrfs_stripe *stripe;
+	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+	int index = 0;
+	int ret;
+
+	chunk = kzalloc(item_size, GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		device->bytes_used += stripe_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
+		index++;
+	}
+
+	index = 0;
+	stripe = &chunk->stripe;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
-		map->stripes[index].dev = device;
-		map->stripes[index].physical = dev_offset;
-		stripe = stripes + index;
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
-		physical = dev_offset;
+		stripe++;
 		index++;
 	}
-	BUG_ON(!list_empty(&private_devs));
 
-	/* key was set above */
-	btrfs_set_stack_chunk_length(chunk, *num_bytes);
+	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
-	btrfs_set_stack_chunk_type(chunk, type);
-	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
-	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
+	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_type(chunk, map->type);
+	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
-	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = stripe_len;
-	map->io_align = stripe_len;
-	map->io_width = stripe_len;
-	map->type = type;
-	map->num_stripes = num_stripes;
-	map->sub_stripes = sub_stripes;
+	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
 
-	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
-				btrfs_chunk_item_size(num_stripes));
-	BUG_ON(ret);
-	*start = key.offset;;
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
 
-	em = alloc_extent_map(GFP_NOFS);
-	if (!em)
-		return -ENOMEM;
-	em->bdev = (struct block_device *)map;
-	em->start = key.offset;
-	em->len = *num_bytes;
-	em->block_start = 0;
-	em->block_len = em->len;
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+	BUG_ON(ret);
 
-	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
-				    chunk, btrfs_chunk_item_size(num_stripes));
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+					     item_size);
 		BUG_ON(ret);
 	}
 	kfree(chunk);
+	return 0;
+}
 
-	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-	spin_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+/*
+ * Chunk allocation falls into two parts. The first part does works
+ * that make the new allocated chunk useable, but not do any operation
+ * that modifies the chunk tree. The second part does the works that
+ * require modifying the chunk tree. This division is important for the
+ * bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 type)
+{
+	u64 chunk_offset;
+	u64 chunk_size;
+	u64 stripe_size;
+	struct map_lookup *map;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	int ret;
+
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &chunk_offset);
+	if (ret)
+		return ret;
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, type);
+	if (ret)
+		return ret;
+
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+	return 0;
+}
+
+static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_device *device)
+{
+	u64 chunk_offset;
+	u64 sys_chunk_offset;
+	u64 chunk_size;
+	u64 sys_chunk_size;
+	u64 stripe_size;
+	u64 sys_stripe_size;
+	u64 alloc_profile;
+	struct map_lookup *map;
+	struct map_lookup *sys_map;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+
+	ret = find_next_chunk(fs_info->chunk_root,
+			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+	BUG_ON(ret);
+
+	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+			(fs_info->metadata_alloc_profile &
+			 fs_info->avail_metadata_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	sys_chunk_offset = chunk_offset + chunk_size;
+
+	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+			(fs_info->system_alloc_profile &
+			 fs_info->avail_system_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+				  &sys_chunk_size, &sys_stripe_size,
+				  sys_chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+	BUG_ON(ret);
+
+	/*
+	 * Modifying chunk tree needs allocating new blocks from both
+	 * system block group and metadata block group. So we only can
+	 * do operations require modifying the chunk tree after both
+	 * block groups were created.
+	 */
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+
+	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+				   sys_chunk_offset, sys_chunk_size,
+				   sys_stripe_size);
 	BUG_ON(ret);
+	return 0;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	int readonly = 0;
+	int i;
+
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	spin_unlock(&map_tree->map_tree.lock);
+	if (!em)
+		return 1;
+
+	map = (struct map_lookup *)em->bdev;
+	for (i = 0; i < map->num_stripes; i++) {
+		if (!map->stripes[i].dev->writeable) {
+			readonly = 1;
+			break;
+		}
+	}
 	free_extent_map(em);
-	return ret;
+	return readonly;
 }
 
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
@@ -2227,6 +2709,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+		BUG_ON(rw == WRITE && !dev->writeable);
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
@@ -2246,11 +2729,23 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 }
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
-				       u8 *uuid)
+				       u8 *uuid, u8 *fsid)
 {
-	struct list_head *head = &root->fs_info->fs_devices->devices;
-
-	return __find_device(head, devid, uuid);
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *cur_devices;
+
+	cur_devices = root->fs_info->fs_devices;
+	while (cur_devices) {
+		if (!fsid ||
+		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			device = __find_device(&cur_devices->devices,
+					       devid, uuid);
+			if (device)
+				return device;
+		}
+		cur_devices = cur_devices->seed;
+	}
+	return NULL;
 }
 
 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
@@ -2262,8 +2757,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device = kzalloc(sizeof(*device), GFP_NOFS);
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &fs_devices->alloc_list);
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
@@ -2274,7 +2767,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	return device;
 }
 
-
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -2339,8 +2831,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+							NULL);
 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
@@ -2387,6 +2879,50 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	return 0;
 }
 
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devices;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+
+	fs_devices = root->fs_info->fs_devices->seed;
+	while (fs_devices) {
+		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			ret = 0;
+			goto out;
+		}
+		fs_devices = fs_devices->seed;
+	}
+
+	fs_devices = find_fsid(fsid);
+	if (!fs_devices) {
+		ret = -ENOENT;
+		goto out;
+	}
+	if (fs_devices->opened) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	if (ret)
+		goto out;
+
+	if (!fs_devices->seeding) {
+		__btrfs_close_devices(fs_devices);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fs_devices->seed = root->fs_info->fs_devices->seed;
+	root->fs_info->fs_devices->seed = fs_devices;
+	fs_devices->sprouted = 1;
+out:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
 static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
@@ -2394,23 +2930,50 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	int seed_devices = 0;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
 	read_extent_buffer(leaf, dev_uuid,
 			   (unsigned long)btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
-	device = btrfs_find_device(root, devid, dev_uuid);
-	if (!device) {
-		printk("warning devid %Lu missing\n", devid);
-		device = add_missing_dev(root, devid, dev_uuid);
-		if (!device)
-			return -ENOMEM;
+	read_extent_buffer(leaf, fs_uuid,
+			   (unsigned long)btrfs_device_fsid(dev_item),
+			   BTRFS_UUID_SIZE);
+
+	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+		ret = open_seed_devices(root, fs_uuid);
+		if (ret)
+			return ret;
+		seed_devices = 1;
+	}
+
+	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+	if (!device || !device->bdev) {
+		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+			return -EIO;
+
+		if (!device) {
+			printk("warning devid %Lu missing\n", devid);
+			device = add_missing_dev(root, devid, dev_uuid);
+			if (!device)
+				return -ENOMEM;
+		}
+	}
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		if (device->generation !=
+		    btrfs_device_generation(leaf, dev_item))
+			return -EINVAL;
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -2528,12 +3091,15 @@ again:
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
 				ret = read_one_dev(root, leaf, dev_item);
-				BUG_ON(ret);
+				if (ret)
+					goto error;
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
 		}
 		path->slots[0]++;
 	}
@@ -2542,9 +3108,8 @@ again:
 		btrfs_release_path(root, path);
 		goto again;
 	}
-
-	btrfs_free_path(path);
 	ret = 0;
 error:
+	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c50e50580b5..1f6f25a5787 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,6 +26,7 @@ struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
+	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
 	struct bio *pending_bios;
@@ -34,6 +35,7 @@ struct btrfs_device {
 	u64 generation;
 
 	int barriers;
+	int writeable;
 	int in_fs_metadata;
 
 	spinlock_t io_lock;
@@ -77,6 +79,8 @@ struct btrfs_fs_devices {
 	u64 latest_trans;
 	u64 num_devices;
 	u64 open_devices;
+	u64 rw_devices;
+	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
 	/* all of the devices in the FS */
 	struct list_head devices;
@@ -84,7 +88,12 @@ struct btrfs_fs_devices {
 	/* devices not currently being allocated */
 	struct list_head alloc_list;
 	struct list_head list;
-	int mounted;
+
+	struct btrfs_fs_devices *seed;
+	int seeding;
+	int sprouted;
+
+	int opened;
 };
 
 struct btrfs_bio_stripe {
@@ -109,16 +118,14 @@ struct btrfs_multi_bio {
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset,
-			   u64 num_bytes, u64 *start);
+			   u64 chunk_offset, u64 start, u64 num_bytes);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u64 type);
+		      struct btrfs_root *extent_root, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
@@ -141,10 +148,11 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size);
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
-				       u8 *uuid);
+				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
 void btrfs_unlock_volumes(void);
 void btrfs_lock_volumes(void);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 #endif
-- 
cgit v1.2.3


From c36047d729a3fa080dd194b20b684cc9fe73e90c Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 12 Nov 2008 14:19:50 -0500
Subject: Btrfs: Fix race in btrfs_mark_extent_written

When an extent needs to be split, btrfs_mark_extent_written truncates the
extent first, then inserts a new extent and increases the reference count.

The race happens if someone else deletes the old extent before the new extent
is inserted. The fix here is to increase the reference count in advance. This
race is similar to the race in btrfs_drop_extents that was recently fixed.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/file.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 934bc094bf1..1e8c024c69c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -746,6 +746,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	u64 other_end;
 	u64 split = start;
 	u64 locked_end = end;
+	u64 orig_parent;
 	int extent_type;
 	int split_end = 1;
 	int ret;
@@ -890,6 +891,12 @@ again:
 	}
 
 	btrfs_mark_buffer_dirty(leaf);
+
+	orig_parent = leaf->start;
+	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
+				   orig_parent, root->root_key.objectid,
+				   trans->transid, inode->i_ino);
+	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
 	key.offset = start;
@@ -910,10 +917,13 @@ again:
 	btrfs_set_file_extent_encryption(leaf, fi, 0);
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
 
-	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-				   leaf->start, root->root_key.objectid,
-				   trans->transid, inode->i_ino);
-	BUG_ON(ret);
+	if (orig_parent != leaf->start) {
+		ret = btrfs_update_extent_ref(trans, root, bytenr,
+					      orig_parent, leaf->start,
+					      root->root_key.objectid,
+					      trans->transid, inode->i_ino);
+		BUG_ON(ret);
+	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root, path);
-- 
cgit v1.2.3


From 0df49b911db2b22ea808b596070b1cc65c23d148 Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Wed, 12 Nov 2008 14:36:58 -0500
Subject: Btrfs: Check kthread_should_stop() before schedule() in worker_loop

In worker_loop(), the func should check whether it has been requested to stop
before it decides to schedule out.

Otherwise, if the stop request (also the last wake_up()) sent by
btrfs_stop_workers() arrives while worker_loop() is running after the "while"
check and before schedule(), worker_loop() will schedule away and never be
woken up, which will also cause btrfs_stop_workers() to wait forever.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index e1e49715459..4229450b759 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -171,7 +171,8 @@ static int worker_loop(void *arg)
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&worker->lock);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
-- 
cgit v1.2.3


From 7cbd8a839118eede2393f0926b8e15027162bcd6 Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Wed, 12 Nov 2008 14:38:54 -0500
Subject: Btrfs: Add a missing return pointer check

Add a missing kzalloc() return pointer check in add_missing_dev().

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6f1996de62..ecf0633ab8c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2755,6 +2755,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
 	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
 	device->barriers = 1;
-- 
cgit v1.2.3


From e3e469f86eebb1b3364c118add362d00c6cff956 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Mon, 17 Nov 2008 21:11:49 -0500
Subject: Btrfs: fix free space leak

In my batch delete/update/insert patch I introduced a free space leak.  The
extent that we do the original search on in free_extents is never pinned, so we
always update the block saying that it has free space, but the free space never
actually gets added to the free space tree, since op->del will always be 0 and
it's never actually added to the pinned extents tree.

This patch fixes this problem by making sure we call pin_down_bytes on the
pending extent op and set op->del to the return value of pin_down_bytes so
update_block_group is called with the right value.  This seems to fix the case
where we were getting ENOSPC when there was plenty of space available.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 197422c1dc4..6c29669d81a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -999,6 +999,14 @@ search:
 		path->slots[0] = extent_slot;
 		bytes_freed = op->num_bytes;
 
+		mutex_lock(&info->pinned_mutex);
+		ret = pin_down_bytes(trans, extent_root, op->bytenr,
+				     op->num_bytes, op->level >=
+				     BTRFS_FIRST_FREE_OBJECTID);
+		mutex_unlock(&info->pinned_mutex);
+		BUG_ON(ret < 0);
+		op->del = ret;
+
 		/*
 		 * we need to see if we can delete multiple things at once, so
 		 * start looping through the list of extents we are wanting to
-- 
cgit v1.2.3


From 4ce4cb526f67775c1cce3e3fa01c292672ba874e Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Mon, 17 Nov 2008 21:12:00 -0500
Subject: Btrfs: Add some debugging around the ENOSPC bugs

Some people are still reporting problems with early enospc.  This
will help narrow down the cause.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6c29669d81a..b0f2241274f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2984,6 +2984,9 @@ loop_check:
 			*last_ptr = ins->objectid + ins->offset;
 		ret = 0;
 	} else if (!ret) {
+		printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
+		       " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
+		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
 
-- 
cgit v1.2.3


From 3de4586c5278a28107030c336956381f69ff7a9d Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Nov 2008 21:02:50 -0500
Subject: Btrfs: Allow subvolumes and snapshots anywhere in the directory tree

Before, all snapshots and subvolumes lived in a single flat directory.  This
was awkward and confusing because the single flat directory was only writable
with the ioctls.

This commit changes the ioctls to create subvols and snapshots at any
point in the directory tree.  This requires making separate ioctls for
snapshot and subvol creation instead of combining them into one.

The subvol ioctl does:

btrfsctl -S subvol_name parent_dir

After the ioctl is done subvol_name lives inside parent_dir.

The snapshot ioctl does:

btrfsctl -s path_for_snapshot root_to_snapshot

path_for_snapshot can be an absolute or relative path.  btrfsctl breaks it up
into directory and basename components.

root_to_snapshot can be any file or directory in the FS.  The snapshot
is taken of the entire root where that file lives.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  7 +++--
 fs/btrfs/disk-io.c     | 14 +++++++---
 fs/btrfs/inode.c       | 50 ++++++++++++++++++++++-------------
 fs/btrfs/ioctl.c       | 71 ++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/ioctl.h       |  7 +++--
 fs/btrfs/super.c       | 10 +++----
 fs/btrfs/transaction.c | 66 +++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/transaction.h |  2 ++
 8 files changed, 162 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5ff74282a62..5611f8e035a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -606,6 +606,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
+	struct btrfs_root *fs_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -758,7 +759,6 @@ struct btrfs_root {
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
 	struct btrfs_fs_info *fs_info;
-	struct inode *inode;
 	struct extent_io_tree dirty_log_pages;
 
 	struct kobject root_kobj;
@@ -1876,6 +1876,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 #define PageChecked PageFsMisc
 #endif
 
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+int btrfs_set_inode_index(struct inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct inode *dir, struct inode *inode,
@@ -1896,9 +1898,6 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 		struct btrfs_trans_handle *trans, u64 new_dirid,
 		struct btrfs_block_group_cache *block_group);
 
-void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
-				  int namelen);
-
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 82833e5d84b..0a5350573f6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -838,7 +838,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 			u64 objectid)
 {
 	root->node = NULL;
-	root->inode = NULL;
 	root->commit_root = NULL;
 	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
@@ -1430,6 +1429,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 blocksize;
 	u32 stripesize;
 	u64 generation;
+	struct btrfs_key location;
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
@@ -1729,7 +1729,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_cleaner;
 
 	if (sb->s_flags & MS_RDONLY)
-		return tree_root;
+		goto read_fs_root;
 
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u32 blocksize;
@@ -1755,6 +1755,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_cleanup_reloc_trees(tree_root);
 	BUG_ON(ret);
 
+	location.objectid = BTRFS_FS_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = (u64)-1;
+
+read_fs_root:
+	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
+	if (!fs_info->fs_root)
+		goto fail_cleaner;
 	return tree_root;
 
 fail_cleaner:
@@ -1944,8 +1952,6 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 			  (unsigned long)root->root_key.objectid);
 	if (root->in_sysfs)
 		btrfs_sysfs_del_root(root);
-	if (root->inode)
-		iput(root->inode);
 	if (root->node)
 		free_extent_buffer(root->node);
 	if (root->commit_root)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3e3620e69bb..e163b1b7470 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3038,8 +3038,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 	return inode;
 }
 
-static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
-				   struct nameidata *nd)
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
 	struct inode * inode;
 	struct btrfs_inode *bi = BTRFS_I(dir);
@@ -3067,13 +3066,21 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
-
-		/* the inode and parent dir are two different roots */
-		if (new && root != sub_root) {
-			igrab(inode);
-			sub_root->inode = inode;
-		}
 	}
+	return inode;
+}
+
+static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode;
+
+	if (dentry->d_name.len > BTRFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	inode = btrfs_lookup_dentry(dir, dentry);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	return d_splice_alias(inode, dentry);
 }
@@ -3129,7 +3136,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			return 0;
 		filp->f_pos = 2;
 	}
-
 	path = btrfs_alloc_path();
 	path->reada = 2;
 
@@ -3159,6 +3165,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 				path->slots[0]++;
 			}
 		}
+
 		advance = 1;
 		item = btrfs_item_nr(leaf, slot);
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3194,16 +3201,25 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
+
+			/* is this a reference to our own snapshot? If so
+			 * skip it
+			 */
+			if (location.type == BTRFS_ROOT_ITEM_KEY &&
+			    location.objectid == root->root_key.objectid) {
+				over = 0;
+				goto skip;
+			}
 			over = filldir(dirent, name_ptr, name_len,
 				       found_key.offset, location.objectid,
 				       d_type);
 
+skip:
 			if (name_ptr != tmp_name)
 				kfree(name_ptr);
 
 			if (over)
 				goto nopos;
-
 			di_len = btrfs_dir_name_len(leaf, di) +
 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
 			di_cur += di_len;
@@ -3318,8 +3334,7 @@ out:
  * helper to find a free sequence number in a given directory.  This current
  * code is very simple, later versions will do smarter things in the btree
  */
-static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
-				 u64 *index)
+int btrfs_set_inode_index(struct inode *dir, u64 *index)
 {
 	int ret = 0;
 
@@ -3365,7 +3380,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		return ERR_PTR(-ENOMEM);
 
 	if (dir) {
-		ret = btrfs_set_inode_index(dir, inode, index);
+		ret = btrfs_set_inode_index(dir, index);
 		if (ret)
 			return ERR_PTR(ret);
 	}
@@ -3651,7 +3666,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
-	err = btrfs_set_inode_index(dir, inode, &index);
+	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
 
@@ -4349,13 +4364,13 @@ out:
  * Invalidate a single dcache entry at the root of the filesystem.
  * Needed after creation of snapshot or subvolume.
  */
-void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
+void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
 				  int namelen)
 {
 	struct dentry *alias, *entry;
 	struct qstr qstr;
 
-	alias = d_find_alias(root->fs_info->sb->s_root->d_inode);
+	alias = d_find_alias(dir);
 	if (alias) {
 		qstr.name = name;
 		qstr.len = namelen;
@@ -4387,7 +4402,6 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
-	new_root->inode = inode;
 
 	inode->i_nlink = 1;
 	btrfs_i_size_write(inode, 0);
@@ -4590,7 +4604,7 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		}
 
 	}
-	ret = btrfs_set_inode_index(new_dir, old_inode, &index);
+	ret = btrfs_set_inode_index(new_dir, &index);
 	if (ret)
 		goto out_fail;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f43df72b0e1..ec45b308613 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -67,6 +67,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	int err;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+	u64 index = 0;
 	unsigned long nr = 1;
 
 	ret = btrfs_check_free_space(root, 1, 0);
@@ -126,6 +127,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	key.objectid = objectid;
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+printk("inserting root objectid %Lu\n", objectid);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
 	if (ret)
@@ -135,24 +137,27 @@ static noinline int create_subvol(struct btrfs_root *root,
 	 * insert the directory item
 	 */
 	key.offset = (u64)-1;
-	dir = root->fs_info->sb->s_root->d_inode;
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
+	dir = dentry->d_parent->d_inode;
+	ret = btrfs_set_inode_index(dir, &index);
+	BUG_ON(ret);
+
+	ret = btrfs_insert_dir_item(trans, root,
 				    name, namelen, dir->i_ino, &key,
-				    BTRFS_FT_DIR, 0);
+				    BTRFS_FT_DIR, index);
 	if (ret)
 		goto fail;
-
+#if 0
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     name, namelen, objectid,
 			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
 	if (ret)
 		goto fail;
-
+#endif
 	ret = btrfs_commit_transaction(trans, root);
 	if (ret)
 		goto fail_commit;
 
-	new_root = btrfs_read_fs_root(root->fs_info, &key, name, namelen);
+	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	BUG_ON(!new_root);
 
 	trans = btrfs_start_transaction(new_root, 1);
@@ -170,14 +175,16 @@ fail:
 		ret = err;
 fail_commit:
 	btrfs_btree_balance_dirty(root, nr);
+printk("all done ret %d\n", ret);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+			   char *name, int namelen)
 {
 	struct btrfs_pending_snapshot *pending_snapshot;
 	struct btrfs_trans_handle *trans;
-	int ret;
+	int ret = 0;
 	int err;
 	unsigned long nr = 0;
 
@@ -188,7 +195,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (ret)
 		goto fail_unlock;
 
-	pending_snapshot = kmalloc(sizeof(*pending_snapshot), GFP_NOFS);
+	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot) {
 		ret = -ENOMEM;
 		goto fail_unlock;
@@ -201,12 +208,12 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	}
 	memcpy(pending_snapshot->name, name, namelen);
 	pending_snapshot->name[namelen] = '\0';
+	pending_snapshot->dentry = dentry;
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 	pending_snapshot->root = root;
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
-	ret = btrfs_update_inode(trans, root, root->inode);
 	err = btrfs_commit_transaction(trans, root);
 
 fail_unlock:
@@ -230,7 +237,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
  * inside this filesystem so it's quite a bit simpler.
  */
 static noinline int btrfs_mksubvol(struct path *parent, char *name,
-				   int mode, int namelen)
+				   int mode, int namelen,
+				   struct btrfs_root *snap_src)
 {
 	struct dentry *dentry;
 	int error;
@@ -248,6 +256,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 
 	if (!IS_POSIXACL(parent->dentry->d_inode))
 		mode &= ~current->fs->umask;
+
 	error = mnt_want_write(parent->mnt);
 	if (error)
 		goto out_dput;
@@ -266,8 +275,12 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 	 * Also we should pass on the mode eventually to allow creating new
 	 * subvolume with specific mode bits.
 	 */
-	error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, dentry,
-			      name, namelen);
+	if (snap_src) {
+		error = create_snapshot(snap_src, dentry, name, namelen);
+	} else {
+		error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
+				      dentry, name, namelen);
+	}
 	if (error)
 		goto out_drop_write;
 
@@ -471,15 +484,16 @@ out:
 }
 
 static noinline int btrfs_ioctl_snap_create(struct file *file,
-					    void __user *arg)
+					    void __user *arg, int subvol)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
+	struct file *src_file;
 	u64 root_dirid;
 	int namelen;
-	int ret;
+	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
@@ -523,12 +537,29 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 		goto out;
 	}
 
-	if (root == root->fs_info->tree_root) {
+	if (subvol) {
 		ret = btrfs_mksubvol(&file->f_path, vol_args->name,
 				     file->f_path.dentry->d_inode->i_mode,
-				     namelen);
+				     namelen, NULL);
 	} else {
-		ret = create_snapshot(root, vol_args->name, namelen);
+		struct inode *src_inode;
+		src_file = fget(vol_args->fd);
+		if (!src_file) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		src_inode = src_file->f_path.dentry->d_inode;
+		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
+			printk("btrfs: Snapshot src from another FS\n");
+			ret = -EINVAL;
+			fput(src_file);
+			goto out;
+		}
+		ret = btrfs_mksubvol(&file->f_path, vol_args->name,
+			     file->f_path.dentry->d_inode->i_mode,
+			     namelen, BTRFS_I(src_inode)->root);
+		fput(src_file);
 	}
 
 out:
@@ -1030,7 +1061,9 @@ long btrfs_ioctl(struct file *file, unsigned int
 
 	switch (cmd) {
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(file, (void __user *)arg);
+		return btrfs_ioctl_snap_create(file, (void __user *)arg, 0);
+	case BTRFS_IOC_SUBVOL_CREATE:
+		return btrfs_ioctl_snap_create(file, (void __user *)arg, 1);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 989ba8a0121..78049ea208d 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,9 +22,10 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 4095
+#define BTRFS_PATH_NAME_MAX 3072
 
 struct btrfs_ioctl_vol_args {
+	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
@@ -51,7 +52,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
-
 struct btrfs_ioctl_clone_range_args {
   __s64 src_fd;
   __u64 src_offset, src_length;
@@ -61,4 +61,7 @@ struct btrfs_ioctl_clone_range_args {
 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
 				  struct btrfs_ioctl_clone_range_args)
 
+#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
+				   struct btrfs_ioctl_vol_args)
+
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 92393cc60d0..77c5eff3e20 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -285,11 +285,11 @@ static int btrfs_parse_early_options(const char *options, int flags,
  out:
 	/*
 	 * If no subvolume name is specified we use the default one.  Allocate
-	 * a copy of the string "default" here so that code later in the
+	 * a copy of the string "." here so that code later in the
 	 * mount path doesn't care if it's the default volume or another one.
 	 */
 	if (!*subvol_name) {
-		*subvol_name = kstrdup("default", GFP_KERNEL);
+		*subvol_name = kstrdup(".", GFP_KERNEL);
 		if (!*subvol_name)
 			return -ENOMEM;
 	}
@@ -323,12 +323,12 @@ static int btrfs_fill_super(struct super_block * sb,
 	}
 	sb->s_fs_info = tree_root;
 	disk_super = &tree_root->fs_info->super_copy;
-	inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
-				  tree_root);
+	inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
+				  tree_root->fs_info->fs_root);
 	bi = BTRFS_I(inode);
 	bi->location.objectid = inode->i_ino;
 	bi->location.offset = 0;
-	bi->root = tree_root;
+	bi->root = tree_root->fs_info->fs_root;
 
 	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 202c1b6df4a..eec8b246503 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -779,7 +779,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
 	int ret;
-	int namelen;
 	u64 objectid;
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
@@ -816,28 +815,48 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
+	key.offset = (u64)-1;
+	memcpy(&pending->root_key, &key, sizeof(key));
+fail:
+	kfree(new_root_item);
+	return ret;
+}
+
+static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
+				   struct btrfs_pending_snapshot *pending)
+{
+	int ret;
+	int namelen;
+	u64 index = 0;
+	struct btrfs_trans_handle *trans;
+	struct inode *parent_inode;
+	struct inode *inode;
+
+	trans = btrfs_start_transaction(fs_info->fs_root, 1);
+
 	/*
 	 * insert the directory item
 	 */
-	key.offset = (u64)-1;
 	namelen = strlen(pending->name);
-	ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
-				    pending->name, namelen,
-				    root->fs_info->sb->s_root->d_inode->i_ino,
-				    &key, BTRFS_FT_DIR, 0);
+	parent_inode = pending->dentry->d_parent->d_inode;
+	ret = btrfs_set_inode_index(parent_inode, &index);
+	ret = btrfs_insert_dir_item(trans,
+			    BTRFS_I(parent_inode)->root,
+			    pending->name, namelen,
+			    parent_inode->i_ino,
+			    &pending->root_key, BTRFS_FT_DIR, index);
 
 	if (ret)
 		goto fail;
-
+#if 0
 	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
 			     pending->name, strlen(pending->name), objectid,
 			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
-
-	/* Invalidate existing dcache entry for new snapshot. */
-	btrfs_invalidate_dcache_root(root, pending->name, namelen);
-
+#endif
+	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
+	d_instantiate(pending->dentry, inode);
 fail:
-	kfree(new_root_item);
+	btrfs_end_transaction(trans, fs_info->fs_root);
 	return ret;
 }
 
@@ -846,6 +865,22 @@ fail:
  */
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_pending_snapshot *pending;
+	struct list_head *head = &trans->transaction->pending_snapshots;
+	struct list_head *cur;
+	int ret;
+
+	list_for_each(cur, head) {
+		pending = list_entry(cur, struct btrfs_pending_snapshot, list);
+		ret = create_pending_snapshot(trans, fs_info, pending);
+		BUG_ON(ret);
+	}
+	return 0;
+}
+
+static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
+					     struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_pending_snapshot *pending;
 	struct list_head *head = &trans->transaction->pending_snapshots;
@@ -854,7 +889,7 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 	while(!list_empty(head)) {
 		pending = list_entry(head->next,
 				     struct btrfs_pending_snapshot, list);
-		ret = create_pending_snapshot(trans, fs_info, pending);
+		ret = finish_pending_snapshot(fs_info, pending);
 		BUG_ON(ret);
 		list_del(&pending->list);
 		kfree(pending->name);
@@ -1033,11 +1068,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_drop_dead_reloc_roots(root);
 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
 
+	/* do the directory inserts of any pending snapshot creations */
+	finish_pending_snapshots(trans, root->fs_info);
+
 	mutex_lock(&root->fs_info->trans_mutex);
 
 	cur_trans->commit_done = 1;
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
+
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
@@ -1046,6 +1085,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
+
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eef2cb7d7e7..202c8be6c05 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,8 +47,10 @@ struct btrfs_trans_handle {
 };
 
 struct btrfs_pending_snapshot {
+	struct dentry *dentry;
 	struct btrfs_root *root;
 	char *name;
+	struct btrfs_key root_key;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From 3394e1607eaf870ebba37d303fbd590a4c569908 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Nov 2008 20:42:26 -0500
Subject: Btrfs: Give each subvol and snapshot their own anonymous devid

Each subvolume has its own private inode number space, and so we need
to fill in different device numbers for each subvolume to avoid confusing
applications.

This commit puts a struct super_block into struct btrfs_root so it can
call set_anon_super() and get a different device number generated for
each root.

btrfs_rename is changed to prevent renames across subvols.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  6 ++++++
 fs/btrfs/disk-io.c     | 13 +++++++++++++
 fs/btrfs/inode.c       | 15 +++++++++++++--
 fs/btrfs/ioctl.c       |  2 --
 fs/btrfs/transaction.c |  4 ++--
 5 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5611f8e035a..b3bc65b08c6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -799,6 +799,12 @@ struct btrfs_root {
 	spinlock_t list_lock;
 	struct list_head dead_list;
 	struct list_head orphan_list;
+
+	/*
+	 * right now this just gets used so that a root has its own devid
+	 * for stat.  It may be used for more later
+	 */
+	struct super_block anon_super;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0a5350573f6..8d7866b733d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -877,6 +877,12 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
+	root->anon_super.s_root = NULL;
+	root->anon_super.s_dev = 0;
+	INIT_LIST_HEAD(&root->anon_super.s_list);
+	INIT_LIST_HEAD(&root->anon_super.s_instances);
+	init_rwsem(&root->anon_super.s_umount);
+
 	return 0;
 }
 
@@ -1083,6 +1089,9 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
+
+	set_anon_super(&root->anon_super, NULL);
+
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
@@ -1950,6 +1959,10 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
+	if (root->anon_super.s_dev) {
+		down_write(&root->anon_super.s_umount);
+		kill_anon_super(&root->anon_super);
+	}
 	if (root->in_sysfs)
 		btrfs_sysfs_del_root(root);
 	if (root->node)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e163b1b7470..7ef79ce86e2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2212,7 +2212,12 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+	/*
+	 * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
+	 * the root of a subvolume or snapshot
+	 */
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
+	    inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
 		return -ENOTEMPTY;
 	}
 
@@ -4410,7 +4415,6 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
 	if (error)
 		return error;
 
-	atomic_inc(&inode->i_count);
 	d_instantiate(dentry, inode);
 	return 0;
 }
@@ -4548,6 +4552,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
 {
 	struct inode *inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
+	stat->dev = BTRFS_I(inode)->root->anon_super.s_dev;
 	stat->blksize = PAGE_CACHE_SIZE;
 	stat->blocks = (inode_get_bytes(inode) +
 			BTRFS_I(inode)->delalloc_bytes) >> 9;
@@ -4565,6 +4570,11 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	u64 index = 0;
 	int ret;
 
+	/* we're not allowed to rename between subvolumes */
+	if (BTRFS_I(old_inode)->root->root_key.objectid !=
+	    BTRFS_I(new_dir)->root->root_key.objectid)
+		return -EXDEV;
+
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
 		return -ENOTEMPTY;
@@ -4920,6 +4930,7 @@ static int btrfs_permission(struct inode *inode, int mask)
 }
 
 static struct inode_operations btrfs_dir_inode_operations = {
+	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
 	.create		= btrfs_create,
 	.unlink		= btrfs_unlink,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ec45b308613..773db07b5f7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -127,7 +127,6 @@ static noinline int create_subvol(struct btrfs_root *root,
 	key.objectid = objectid;
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-printk("inserting root objectid %Lu\n", objectid);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
 	if (ret)
@@ -175,7 +174,6 @@ fail:
 		ret = err;
 fail_commit:
 	btrfs_btree_balance_dirty(root, nr);
-printk("all done ret %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index eec8b246503..93f23a456a3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -832,13 +832,13 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 	struct inode *parent_inode;
 	struct inode *inode;
 
-	trans = btrfs_start_transaction(fs_info->fs_root, 1);
+	parent_inode = pending->dentry->d_parent->d_inode;
+	trans = btrfs_start_transaction(BTRFS_I(parent_inode)->root, 1);
 
 	/*
 	 * insert the directory item
 	 */
 	namelen = strlen(pending->name);
-	parent_inode = pending->dentry->d_parent->d_inode;
 	ret = btrfs_set_inode_index(parent_inode, &index);
 	ret = btrfs_insert_dir_item(trans,
 			    BTRFS_I(parent_inode)->root,
-- 
cgit v1.2.3


From 0660b5af3f7ac0fac69de975914e1f4a3a586fb3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Nov 2008 20:37:39 -0500
Subject: Btrfs: Add backrefs and forward refs for subvols and snapshots

Subvols and snapshots can now be referenced from any point in the directory
tree.  We need to maintain back refs for them so we can find lost
subvols.

Forward refs are added so that we know all of the subvols and
snapshots referenced anywhere in the directory tree of a single subvol.  They
can be used to do recursive snapshotting (but aren't yet) and are
also used to detect and prevent directory loops when creating new snapshots.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 67 +++++++++++++++++++++++++++++++++-----------
 fs/btrfs/inode.c       |  6 ++++
 fs/btrfs/ioctl.c       | 24 +++++++++++-----
 fs/btrfs/root-tree.c   | 76 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/transaction.c | 31 ++++++++++++++------
 5 files changed, 172 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b3bc65b08c6..ad2cbe63503 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -469,6 +469,15 @@ struct btrfs_root_item {
 	u8 level;
 } __attribute__ ((__packed__));
 
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+	__le64 dirid;
+	__le64 sequence;
+	__le16 name_len;
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -814,27 +823,27 @@ struct btrfs_root {
  * the FS
  */
 #define BTRFS_INODE_ITEM_KEY		1
-#define BTRFS_INODE_REF_KEY		2
-#define BTRFS_XATTR_ITEM_KEY		8
-#define BTRFS_ORPHAN_ITEM_KEY		9
+#define BTRFS_INODE_REF_KEY		12
+#define BTRFS_XATTR_ITEM_KEY		24
+#define BTRFS_ORPHAN_ITEM_KEY		48
 /* reserve 2-15 close to the inode for later flexibility */
 
 /*
  * dir items are the name -> inode pointers in a directory.  There is one
  * for every name in a directory.
  */
-#define BTRFS_DIR_LOG_ITEM_KEY  14
-#define BTRFS_DIR_LOG_INDEX_KEY 15
-#define BTRFS_DIR_ITEM_KEY	16
-#define BTRFS_DIR_INDEX_KEY	17
+#define BTRFS_DIR_LOG_ITEM_KEY  60
+#define BTRFS_DIR_LOG_INDEX_KEY 72
+#define BTRFS_DIR_ITEM_KEY	84
+#define BTRFS_DIR_INDEX_KEY	96
 /*
  * extent data is for file data
  */
-#define BTRFS_EXTENT_DATA_KEY	18
+#define BTRFS_EXTENT_DATA_KEY	108
 /*
  * csum items have the checksums for data in the extents
  */
-#define BTRFS_CSUM_ITEM_KEY	19
+#define BTRFS_CSUM_ITEM_KEY	120
 
 
 /* reserve 21-31 for other file/dir stuff */
@@ -843,23 +852,37 @@ struct btrfs_root {
  * root items point to tree roots.  There are typically in the root
  * tree used by the super block to find all the other trees
  */
-#define BTRFS_ROOT_ITEM_KEY	32
+#define BTRFS_ROOT_ITEM_KEY	132
+
+/*
+ * root backrefs tie subvols and snapshots to the directory entries that
+ * reference them
+ */
+#define BTRFS_ROOT_BACKREF_KEY	144
+
+/*
+ * root refs make a fast index for listing all of the snapshots and
+ * subvolumes referenced by a given root.  They point directly to the
+ * directory item in the root that references the subvol
+ */
+#define BTRFS_ROOT_REF_KEY	156
+
 /*
  * extent items are in the extent map tree.  These record which blocks
  * are used, and how many references there are to each block
  */
-#define BTRFS_EXTENT_ITEM_KEY	33
-#define BTRFS_EXTENT_REF_KEY	34
+#define BTRFS_EXTENT_ITEM_KEY	168
+#define BTRFS_EXTENT_REF_KEY	180
 
 /*
  * block groups give us hints into the extent allocation trees.  Which
  * blocks are free etc etc
  */
-#define BTRFS_BLOCK_GROUP_ITEM_KEY 50
+#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
 
-#define BTRFS_DEV_EXTENT_KEY	75
-#define BTRFS_DEV_ITEM_KEY	76
-#define BTRFS_CHUNK_ITEM_KEY	77
+#define BTRFS_DEV_EXTENT_KEY	204
+#define BTRFS_DEV_ITEM_KEY	216
+#define BTRFS_CHUNK_ITEM_KEY	228
 
 /*
  * string items are for debugging.  They just store a short string of
@@ -1274,6 +1297,13 @@ static inline void btrfs_set_item_key(struct extent_buffer *eb,
 
 BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64);
 
+/*
+ * struct btrfs_root_ref
+ */
+BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64);
+BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64);
+BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16);
+
 /* struct btrfs_dir_item */
 BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
@@ -1771,6 +1801,11 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct extent_buffer *node,
 			struct extent_buffer *parent);
 /* root-item.c */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u8 type, u64 ref_id,
+		       u64 dirid, u64 sequence,
+		       const char *name, int name_len);
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7ef79ce86e2..6854bf41856 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4580,6 +4580,12 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		return -ENOTEMPTY;
 	}
 
+	/* to rename a snapshot or subvolume, we need to juggle the
+	 * backrefs.  This isn't coded yet
+	 */
+	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		return -EXDEV;
+
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto out_unlock;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 773db07b5f7..536ae883780 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -145,13 +145,23 @@ static noinline int create_subvol(struct btrfs_root *root,
 				    BTRFS_FT_DIR, index);
 	if (ret)
 		goto fail;
-#if 0
-	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
-			     name, namelen, objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
-	if (ret)
-		goto fail;
-#endif
+
+	/* add the backref first */
+	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+				 objectid, BTRFS_ROOT_BACKREF_KEY,
+				 root->root_key.objectid,
+				 dir->i_ino, index, name, namelen);
+
+	BUG_ON(ret);
+
+	/* now add the forward ref */
+	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+				 root->root_key.objectid, BTRFS_ROOT_REF_KEY,
+				 objectid,
+				 dir->i_ino, index, name, namelen);
+
+	BUG_ON(ret);
+
 	ret = btrfs_commit_transaction(trans, root);
 	if (ret)
 		goto fail_commit;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index eb7f7655e9d..e9be3abcb36 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -84,7 +84,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	int slot;
 
 	search_key.objectid = objectid;
-	search_key.type = (u8)-1;
+	search_key.type = BTRFS_ROOT_ITEM_KEY;
 	search_key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
@@ -275,3 +275,77 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u8 type, u64 ref_id)
+{
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_path *path;
+
+	path = btrfs_alloc_path();
+
+	key.objectid = root_id;
+	key.type = type;
+	key.offset = ref_id;
+
+	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
+	BUG_ON(ret);
+
+	ret = btrfs_del_item(trans, tree_root, path);
+	BUG_ON(ret);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
+ * or BTRFS_ROOT_BACKREF_KEY.
+ *
+ * The dirid, sequence, name and name_len refer to the directory entry
+ * that is referencing the root.
+ *
+ * For a forward ref, the root_id is the id of the tree referencing
+ * the root and ref_id is the id of the subvol  or snapshot.
+ *
+ * For a back ref the root_id is the id of the subvol or snapshot and
+ * ref_id is the id of the tree referencing it.
+ */
+int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *tree_root,
+		       u64 root_id, u8 type, u64 ref_id,
+		       u64 dirid, u64 sequence,
+		       const char *name, int name_len)
+{
+	struct btrfs_key key;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root_ref *ref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+
+	path = btrfs_alloc_path();
+
+	key.objectid = root_id;
+	key.type = type;
+	key.offset = ref_id;
+
+	ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
+				      sizeof(*ref) + name_len);
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+	btrfs_set_root_ref_dirid(leaf, ref, dirid);
+	btrfs_set_root_ref_sequence(leaf, ref, sequence);
+	btrfs_set_root_ref_name_len(leaf, ref, name_len);
+	ptr = (unsigned long)(ref + 1);
+	write_extent_buffer(leaf, name, ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 93f23a456a3..e9c8ebeedd7 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -831,28 +831,43 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 	struct btrfs_trans_handle *trans;
 	struct inode *parent_inode;
 	struct inode *inode;
+	struct btrfs_root *parent_root;
 
 	parent_inode = pending->dentry->d_parent->d_inode;
-	trans = btrfs_start_transaction(BTRFS_I(parent_inode)->root, 1);
+	parent_root = BTRFS_I(parent_inode)->root;
+	trans = btrfs_start_transaction(parent_root, 1);
 
 	/*
 	 * insert the directory item
 	 */
 	namelen = strlen(pending->name);
 	ret = btrfs_set_inode_index(parent_inode, &index);
-	ret = btrfs_insert_dir_item(trans,
-			    BTRFS_I(parent_inode)->root,
+	ret = btrfs_insert_dir_item(trans, parent_root,
 			    pending->name, namelen,
 			    parent_inode->i_ino,
 			    &pending->root_key, BTRFS_FT_DIR, index);
 
 	if (ret)
 		goto fail;
-#if 0
-	ret = btrfs_insert_inode_ref(trans, root->fs_info->tree_root,
-			     pending->name, strlen(pending->name), objectid,
-			     root->fs_info->sb->s_root->d_inode->i_ino, 0);
-#endif
+
+	/* add the backref first */
+	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+				 pending->root_key.objectid,
+				 BTRFS_ROOT_BACKREF_KEY,
+				 parent_root->root_key.objectid,
+				 parent_inode->i_ino, index, pending->name,
+				 namelen);
+
+	BUG_ON(ret);
+
+	/* now add the forward ref */
+	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+				 parent_root->root_key.objectid,
+				 BTRFS_ROOT_REF_KEY,
+				 pending->root_key.objectid,
+				 parent_inode->i_ino, index, pending->name,
+				 namelen);
+
 	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
 	d_instantiate(pending->dentry, inode);
 fail:
-- 
cgit v1.2.3


From ea9e8b11bd1252dcbc23afefcf1a52ec6aa3c113 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 17 Nov 2008 21:14:24 -0500
Subject: Btrfs: prevent loops in the directory tree when creating snapshots

For a directory tree:

/mnt/subvolA/subvolB

btrfsctl -s /mnt/subvolA/subvolB /mnt

Will create a directory loop with subvolA under subvolB.  This
commit uses the forward refs for each subvol and snapshot to error out
before creating the loop.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  3 +++
 fs/btrfs/disk-io.c   |  5 ++++-
 fs/btrfs/ioctl.c     | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/root-tree.c | 16 ++++++++++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad2cbe63503..70b3dbb4de1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1801,6 +1801,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct extent_buffer *node,
 			struct extent_buffer *parent);
 /* root-item.c */
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+		   struct btrfs_path *path,
+		   u64 root_id, u64 ref_id);
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
 		       u64 root_id, u8 type, u64 ref_id,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d7866b733d..e18250a6fd2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1129,7 +1129,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
-
+#if 0
 	ret = btrfs_sysfs_add_root(root);
 	if (ret) {
 		free_extent_buffer(root->node);
@@ -1137,6 +1137,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 		kfree(root);
 		return ERR_PTR(ret);
 	}
+#endif
 	root->in_sysfs = 1;
 	return root;
 }
@@ -1963,8 +1964,10 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
+#if 0
 	if (root->in_sysfs)
 		btrfs_sysfs_del_root(root);
+#endif
 	if (root->node)
 		free_extent_buffer(root->node);
 	if (root->commit_root)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 536ae883780..8828109fa58 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -284,6 +284,56 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 	 * subvolume with specific mode bits.
 	 */
 	if (snap_src) {
+		struct dentry *dir = dentry->d_parent;
+		struct dentry *test = dir->d_parent;
+		struct btrfs_path *path = btrfs_alloc_path();
+		int ret;
+		u64 test_oid;
+		u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
+
+		test_oid = snap_src->root_key.objectid;
+
+		ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+					  path, parent_oid, test_oid);
+		if (ret == 0)
+			goto create;
+		btrfs_release_path(snap_src->fs_info->tree_root, path);
+
+		/* we need to make sure we aren't creating a directory loop
+		 * by taking a snapshot of something that has our current
+		 * subvol in its directory tree.  So, this loops through
+		 * the dentries and checks the forward refs for each subvolume
+		 * to see if is references the subvolume where we are
+		 * placing this new snapshot.
+		 */
+		while(1) {
+			if (!test ||
+			    dir == snap_src->fs_info->sb->s_root ||
+			    test == snap_src->fs_info->sb->s_root ||
+			    test->d_inode->i_sb != snap_src->fs_info->sb) {
+				break;
+			}
+			if (S_ISLNK(test->d_inode->i_mode)) {
+				printk("Symlink in snapshot path, failed\n");
+				error = -EMLINK;
+				btrfs_free_path(path);
+				goto out_drop_write;
+			}
+			test_oid =
+				BTRFS_I(test->d_inode)->root->root_key.objectid;
+			ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
+				  path, test_oid, parent_oid);
+			if (ret == 0) {
+				printk("Snapshot creation failed, looping\n");
+				error = -EMLINK;
+				btrfs_free_path(path);
+				goto out_drop_write;
+			}
+			btrfs_release_path(snap_src->fs_info->tree_root, path);
+			test = test->d_parent;
+		}
+create:
+		btrfs_free_path(path);
 		error = create_snapshot(snap_src, dentry, name, namelen);
 	} else {
 		error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e9be3abcb36..dbe20d4c6ea 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -300,6 +300,22 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_find_root_ref(struct btrfs_root *tree_root,
+		   struct btrfs_path *path,
+		   u64 root_id, u64 ref_id)
+{
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_REF_KEY;
+	key.offset = ref_id;
+
+	ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+	return ret;
+}
+
+
 /*
  * add a btrfs_root_ref item.  type is either BTRFS_ROOT_REF_KEY
  * or BTRFS_ROOT_BACKREF_KEY.
-- 
cgit v1.2.3


From b4eec2ca1167bae46295aeb66abd15fd52387845 Mon Sep 17 00:00:00 2001
From: Liu Hui <onlyflyer@gmail.com>
Date: Tue, 18 Nov 2008 11:30:10 -0500
Subject: Btrfs: Some fixes for batching extent insert.

In insert_extents(), when ret==1 and last is not zero, it should
check whether the item just inserted is the last item in this batch of
inserts. If so, it should just break out of the loop. Without that check,
'cur = insert_list->next' makes no sense because the list is empty now,
and 'op' will point to an unexpected place.

There are also some trivial fixes in this patch, including one comment
typo fix and the removal of two redundant lines.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  2 --
 fs/btrfs/extent-tree.c | 10 +++++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dd1c03aea2d..71ef0a2e2da 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3041,7 +3041,6 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 	struct btrfs_item *item;
 	int ret = 0;
 	int slot;
-	int slot_orig;
 	int i;
 	u32 nritems;
 	u32 total_data = 0;
@@ -3064,7 +3063,6 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 
-	slot_orig = path->slots[0];
 	leaf = path->nodes[0];
 
 	nritems = btrfs_header_nritems(leaf);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b0f2241274f..1121d518bf8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -798,9 +798,12 @@ static int noinline insert_extents(struct btrfs_trans_handle *trans,
 			 */
 			i = last;
 			last = 0;
-			cur = insert_list->next;
-			op = list_entry(cur, struct pending_extent_op, list);
 			total--;
+			if (i < total) {
+				cur = insert_list->next;
+				op = list_entry(cur, struct pending_extent_op,
+						list);
+			}
 		} else {
 			i += ret;
 		}
@@ -2150,6 +2153,7 @@ again:
 		if (ret) {
 			if (skipped && all && !num_inserts) {
 				skipped = 0;
+				search = 0;
 				continue;
 			}
 			mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2193,7 @@ again:
 	}
 
 	/*
-	 * process teh update list, clear the writeback bit for it, and if
+	 * process the update list, clear the writeback bit for it, and if
 	 * somebody marked this thing for deletion then just unlock it and be
 	 * done, the free_extents will handle it
 	 */
-- 
cgit v1.2.3


From 9f0ba5bd91d42706ba78f97ec638c6a821db1c5e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Nov 2008 10:31:22 -0500
Subject: Btrfs: unplug all devices in the unplug call back

For larger multi-device filesystems, there was logic to limit the
number of devices unplugged to just the page that was sent to our sync_page
function.

But, the code wasn't always unplugging the right device.  Since this was
just an optimization, disable it for now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e18250a6fd2..0e8d31274c9 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1214,7 +1214,7 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	u64 offset;
 
 	/* the generic O_DIRECT read code does this */
-	if (!page) {
+	if (1 || !page) {
 		__unplug_io_fn(bdi, page);
 		return;
 	}
-- 
cgit v1.2.3


From 73e9f5beb16f568f797bba87f082556fac18dede Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Nov 2008 11:50:33 -0500
Subject: Btrfs: Update the disk format for the seed device and new root code
 Signed-off-by: Chris Mason <chris.mason@oracle.com>

---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 70b3dbb4de1..0f2a9b584fb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,7 +39,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BDRfS_M"
+#define BTRFS_MAGIC "_BFRfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From 105d931d482b7d1b1b2dd4b0ea30365db8630b9f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 18 Nov 2008 12:13:12 -0500
Subject: Btrfs: switch back to wait_on_page_writeback to wait on metadata
 writes

The extent based waiting was using more CPU, and other fixes have helped
with the unplug storm problems.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 29 +----------------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e9c8ebeedd7..c2c3b428196 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -332,7 +332,6 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	int werr = 0;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	u64 start = 0;
 	u64 end;
 	unsigned long index;
@@ -373,11 +372,6 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
-	/*
-	 * we unplug once and then use the wait_on_extent_bit for
-	 * everything else
-	 */
-	blk_run_address_space(btree_inode->i_mapping);
 	while(1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -398,28 +392,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 				if (err)
 					werr = err;
 			}
-			if (PageWriteback(page)) {
-				/*
-				 * we don't wait on the page writeback bit
-				 * because that triggers a lot of unplugs.
-				 * The extent bits are much nicer to
-				 * the disks, but come with a slightly
-				 * higher latency because we aren't forcing
-				 * unplugs.
-				 */
-				wait_on_extent_writeback(io_tree,
-					 page_offset(page),
-					 page_offset(page) +
-					 PAGE_CACHE_SIZE - 1);
-			}
-			if (PageWriteback(page)) {
-				/*
-				 * the state bits get cleared before the
-				 * page bits, lets add some extra
-				 * paranoia here
-				 */
-				wait_on_page_writeback(page);
-			}
+			wait_on_page_writeback(page);
 			page_cache_release(page);
 			cond_resched();
 		}
-- 
cgit v1.2.3


From d2c3f4f695edac4d75c1b3eb01a1d16072de63bb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 12:44:22 -0500
Subject: Btrfs: Avoid writeback stalls

While building large bios in writepages, btrfs may end up waiting
for other page writeback to finish if WB_SYNC_ALL is used.

While it is waiting, the bio it is building has a number of pages with the
writeback bit set and they aren't getting to the disk any time soon.  This
lowers the latencies of writeback in general by sending down the bio being
built before waiting for other pages.

The bio submission code tries to limit the total number of async bios in
flight by waiting when we're over a certain number of async bios.  But,
the waits are happening while writepages is building bios, and this can easily
lead to stalls and other problems for people calling wait_on_page_writeback.

The current fix is to let the congestion tests take care of waiting.

sync() and others make sure to drain the current async requests to make
sure that everything that was pending when the sync was started really gets
to disk.  The code would drain pending requests both before and after
submitting a new request.

But, if one of the requests is waiting for page writeback to finish,
the draining waits might block that page writeback.  This changes the
draining code to only wait after submitting the bio being processed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 12 +++---------
 fs/btrfs/extent_io.c | 24 ++++++++++++++++++++----
 fs/btrfs/inode.c     |  7 -------
 3 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0e8d31274c9..8d03e4a3c4e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -538,15 +538,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
 
-	while(atomic_read(&fs_info->async_submit_draining) &&
-	      atomic_read(&fs_info->nr_async_submits)) {
-		wait_event(fs_info->async_submit_wait,
-			   (atomic_read(&fs_info->nr_async_submits) == 0));
-	}
-
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
-
+#if 0
 	if (atomic_read(&fs_info->nr_async_submits) > limit) {
 		wait_event_timeout(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) < limit),
@@ -556,7 +550,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   (atomic_read(&fs_info->nr_async_bios) < limit),
 			   HZ/10);
 	}
-
+#endif
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
@@ -1765,11 +1759,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_cleanup_reloc_trees(tree_root);
 	BUG_ON(ret);
 
+read_fs_root:
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = (u64)-1;
 
-read_fs_root:
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_cleaner;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 54d013c3bb8..a0f3804efe4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2398,7 +2398,8 @@ update_nr_written:
 int extent_write_cache_pages(struct extent_io_tree *tree,
 			     struct address_space *mapping,
 			     struct writeback_control *wbc,
-			     writepage_t writepage, void *data)
+			     writepage_t writepage, void *data,
+			     void (*flush_fn)(void *))
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ret = 0;
@@ -2460,8 +2461,10 @@ retry:
 				continue;
 			}
 
-			if (wbc->sync_mode != WB_SYNC_NONE)
+			if (wbc->sync_mode != WB_SYNC_NONE) {
+				flush_fn(data);
 				wait_on_page_writeback(page);
+			}
 
 			if (PageWriteback(page) ||
 			    !clear_page_dirty_for_io(page)) {
@@ -2498,6 +2501,15 @@ retry:
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
 
+static noinline void flush_write_bio(void *data)
+{
+	struct extent_page_data *epd = data;
+	if (epd->bio) {
+		submit_one_bio(WRITE, epd->bio, 0, 0);
+		epd->bio = NULL;
+	}
+}
+
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent,
 			  struct writeback_control *wbc)
@@ -2523,7 +2535,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	ret = __extent_writepage(page, wbc, &epd);
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
-				 __extent_writepage, &epd);
+				 __extent_writepage, &epd, flush_write_bio);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
@@ -2592,7 +2604,8 @@ int extent_writepages(struct extent_io_tree *tree,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
-				       __extent_writepage, &epd);
+				       __extent_writepage, &epd,
+				       flush_write_bio);
 	if (epd.bio) {
 		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
@@ -3087,6 +3100,9 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
+		if (!set && !PageDirty(page))
+			continue;
+
 		lock_page(page);
 		if (i == 0)
 			set_page_extent_head(page, eb->len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6854bf41856..806caacff86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -883,13 +883,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		async_cow->work.ordered_free = async_cow_free;
 		async_cow->work.flags = 0;
 
-		while(atomic_read(&root->fs_info->async_submit_draining) &&
-		      atomic_read(&root->fs_info->async_delalloc_pages)) {
-			wait_event(root->fs_info->async_submit_wait,
-			     (atomic_read(&root->fs_info->async_delalloc_pages)
-			      == 0));
-		}
-
 		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
 			PAGE_CACHE_SHIFT;
 		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
-- 
cgit v1.2.3


From 7c2ca4682a2b401cb0c43f96b05eced25eb83eec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 15:13:35 -0500
Subject: Btrfs: Do fsync log replay when mount -o ro, except when on readonly
 media

fsync log replay can change the filesystem, so it cannot be delayed until
mount -o rw,remount, and it can't be forgotten entirely.  So, this patch
changes btrfs to do what reiserfs, ext3 and xfs do, which is to do the
log replay even when mounted readonly.

On a readonly device if log replay is required, the mount is aborted.

Getting all of this right required fixing up some of the error
handling in open_ctree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8d03e4a3c4e..c8dcb47b6d7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -518,7 +518,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
 	struct async_submit_bio *async;
-	int limit = btrfs_async_submit_limit(fs_info);
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
@@ -541,6 +540,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
+	int limit = btrfs_async_submit_limit(fs_info);
 	if (atomic_read(&fs_info->nr_async_submits) > limit) {
 		wait_event_timeout(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) < limit),
@@ -1732,13 +1732,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!fs_info->transaction_kthread)
 		goto fail_cleaner;
 
-	if (sb->s_flags & MS_RDONLY)
-		goto read_fs_root;
-
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u32 blocksize;
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
+		if (fs_devices->rw_devices == 0) {
+			printk("Btrfs log replay required on RO media\n");
+			err = -EIO;
+			goto fail_trans_kthread;
+		}
 		blocksize =
 		     btrfs_level_size(tree_root,
 				      btrfs_super_log_root_level(disk_super));
@@ -1756,21 +1758,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_cleanup_reloc_trees(tree_root);
-	BUG_ON(ret);
+	if (!(sb->s_flags & MS_RDONLY)) {
+		ret = btrfs_cleanup_reloc_trees(tree_root);
+		BUG_ON(ret);
+	}
 
-read_fs_root:
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = (u64)-1;
 
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
-		goto fail_cleaner;
+		goto fail_trans_kthread;
 	return tree_root;
 
+fail_trans_kthread:
+	kthread_stop(fs_info->transaction_kthread);
 fail_cleaner:
 	kthread_stop(fs_info->cleaner_kthread);
+
+	/*
+	 * make sure we're done with the btree inode before we stop our
+	 * kthreads
+	 */
+	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
+
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1778,6 +1791,7 @@ fail_tree_root:
 fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
 fail_sys_array:
+	free_extent_buffer(dev_root->node);
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -1786,6 +1800,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
-- 
cgit v1.2.3


From 07103a3cdb24099324a11be1f35279b463cdfc31 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 19 Nov 2008 15:17:55 -0500
Subject: Btrfs: fix free space accounting when unpinning extents

This patch fixes what I hope is the last early ENOSPC bug left.  I did not know
that pinned extents would merge into one big extent when inserted on to the
pinned extent tree, so I was adding free space to a single block group for an
extent that could possibly span multiple block groups.

This is a big issue because first that space doesn't exist in that block group,
and second we won't actually use that space because there are a bunch of other
checks to make sure we're allocating within the constraints of the block group.

This patch fixes the problem by adding the btrfs_add_free_space to
btrfs_update_pinned_extents which makes sure we are adding the appropriate
amount of free space to the appropriate block group.  Thanks much to Lee Trager
for running my myriad of debug patches to help me track this problem down.
Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/extent-tree.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1121d518bf8..ee73efe7542 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2034,6 +2034,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
+			if (cache->cached)
+				btrfs_add_free_space(cache, bytenr, len);
 		}
 		bytenr += len;
 		num -= len;
@@ -2099,7 +2101,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
-	struct btrfs_block_group_cache *cache;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
@@ -2109,9 +2110,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			break;
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
-		cache = btrfs_lookup_block_group(root->fs_info, start);
-		if (cache->cached)
-			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
 			mutex_unlock(&root->fs_info->pinned_mutex);
 			cond_resched();
-- 
cgit v1.2.3


From 15916de835a683bd8133a0d1ac0c982b795ab4ff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 21:17:22 -0500
Subject: Btrfs: Fixes for 2.6.28-rc API changes

* open/close_bdev_excl -> open/close_bdev_exclusive
* blkdev_issue_discard takes a GFP mask now
* Fix blkdev_issue_discard usage now that it is enabled

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h      |  7 +++++++
 fs/btrfs/compression.c |  4 ++--
 fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++-------
 fs/btrfs/extent_io.c   |  4 ++--
 fs/btrfs/version.sh    | 34 +++++++++++++++++-----------------
 fs/btrfs/volumes.c     | 41 +++++++++++++++++++++++------------------
 fs/btrfs/volumes.h     |  3 +++
 7 files changed, 74 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index cd6598b169d..dd1defdbfa3 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -21,4 +21,11 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+# define  __pagevec_lru_add_file __pagevec_lru_add
+# define open_bdev_exclusive open_bdev_excl
+# define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
+#endif
+
+
 #endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfd1512cce0..df05f513e1e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -419,7 +419,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		/* open coding of lru_cache_add, also not exported */
 		page_cache_get(page);
 		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add(&pvec);
+			__pagevec_lru_add_file(&pvec);
 
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
@@ -475,7 +475,7 @@ next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
 	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
+		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ee73efe7542..62d49705d14 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -899,6 +900,17 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void btrfs_issue_discard(struct block_device *bdev,
+				u64 start, u64 len)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+#else
+	blkdev_issue_discard(bdev, start >> 9, len >> 9);
+#endif
+}
+
+
 static int noinline free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
@@ -1108,6 +1120,7 @@ search:
 			BUG_ON(ret);
 
 #ifdef BIO_RW_DISCARD
+			map_length = tmp->num_bytes;
 			ret = btrfs_map_block(&info->mapping_tree, READ,
 					      tmp->bytenr, &map_length, &multi,
 					      0);
@@ -1115,16 +1128,16 @@ search:
 				struct btrfs_bio_stripe *stripe;
 				int i;
 
-				stripe = multi->stripe;
+				stripe = multi->stripes;
 
 				if (map_length > tmp->num_bytes)
 					map_length = tmp->num_bytes;
 
 				for (i = 0; i < multi->num_stripes;
 				     i++, stripe++)
-					blkdev_issue_discard(stripe->dev->bdev,
-							stripe->physical >> 9,
-							map_length >> 9);
+					btrfs_issue_discard(stripe->dev->bdev,
+							    stripe->physical,
+							    map_length);
 				kfree(multi);
 			}
 #endif
@@ -2498,9 +2511,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				map_length = num_bytes;
 
 			for (i = 0; i < multi->num_stripes; i++, stripe++) {
-				blkdev_issue_discard(stripe->dev->bdev,
-						     stripe->physical >> 9,
-						     map_length >> 9);
+				btrfs_issue_discard(stripe->dev->bdev,
+						    stripe->physical,
+						     map_length);
 			}
 			kfree(multi);
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a0f3804efe4..3a65c10dce3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2639,14 +2639,14 @@ int extent_readpages(struct extent_io_tree *tree,
 			/* open coding of lru_cache_add, also not exported */
 			page_cache_get(page);
 			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add(&pvec);
+				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
 	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
+		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
index 0f57f24404d..1ca1952fd91 100644
--- a/fs/btrfs/version.sh
+++ b/fs/btrfs/version.sh
@@ -8,24 +8,24 @@
  
 v="v0.16"
 
-which hg > /dev/null
-if [ -d .hg ] && [ $? == 0 ]; then
-	last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
-	 
-	# now check if the repo has commits since then...
-	if [[ $(hg id -t) == $last || \
-	    $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
-	then
-	    # check if it's dirty
-	    if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
-		v=$last+
-	    else
-		v=$last
+which git &> /dev/null
+if [ $? == 0 ]; then
+    git branch >& /dev/null
+    if [ $? == 0 ]; then
+	    if head=`git rev-parse --verify HEAD 2>/dev/null`; then
+		if tag=`git describe --tags 2>/dev/null`; then
+		    v="$tag"
+		fi
+
+		# Are there uncommitted changes?
+		git update-index --refresh --unmerged > /dev/null
+		if git diff-index --name-only HEAD | \
+		    grep -v "^scripts/package" \
+		    | read dummy; then
+		    v="$v"-dirty
+		fi
 	    fi
-	else
-	    # includes dirty flag
-	    v=$last+$(hg id -i)
-	fi
+    fi
 fi
  
 echo "#ifndef __BUILD_VERSION" > .build-version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecf0633ab8c..c3ee63f92a5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -85,7 +85,7 @@ int btrfs_cleanup_fs_uuids(void)
 			dev = list_entry(fs_devices->devices.next,
 					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
-				close_bdev_excl(dev->bdev);
+				close_bdev_exclusive(dev->bdev, dev->mode);
 				fs_devices->open_devices--;
 			}
 			fs_devices->num_devices--;
@@ -317,7 +317,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -356,7 +356,7 @@ again:
 	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -391,7 +391,8 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+			 int flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -413,7 +414,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
+		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -453,6 +454,8 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		device->mode = flags;
+
 		fs_devices->open_devices++;
 		if (device->writeable) {
 			fs_devices->rw_devices++;
@@ -464,7 +467,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 error:
 		continue;
 	}
@@ -496,7 +499,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = 0;
 		}
 	} else {
-		ret = __btrfs_open_devices(fs_devices, holder);
+		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -514,7 +517,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_excl(path, flags, holder);
+	bdev = open_bdev_exclusive(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -551,7 +554,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1003,7 +1006,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_excl(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1073,10 +1076,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_excl(bdev);
+			close_bdev_exclusive(bdev, MS_RDONLY);
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			device->fs_devices->open_devices--;
 		}
@@ -1112,11 +1115,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
-		close_bdev_excl(device->bdev);
+		close_bdev_exclusive(device->bdev, device->mode);
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1127,7 +1130,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -1272,7 +1275,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
@@ -1331,6 +1334,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	device->mode = 0;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -1379,7 +1383,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, 0);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
@@ -2907,7 +2911,8 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1f6f25a5787..9b41e4d3984 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,6 +42,9 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
+	/* the mode sent to open_bdev_exclusive */
+	fmode_t mode;
+
 	char *name;
 
 	/* the internal btrfs device id */
-- 
cgit v1.2.3


From 79683f2d685cfb6ef9c97c5194e3ce3319e80cac Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 22:00:53 -0500
Subject: Btrfs: Use current_fsuid/gid

This fixes compile problems with linux-next

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 806caacff86..2c77e0957f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3422,8 +3422,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (objectid > root->highest_inode)
 		root->highest_inode = objectid;
 
-	inode->i_uid = current->fsuid;
-	inode->i_gid = current->fsgid;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
-- 
cgit v1.2.3


From 4b4e25f2a6ddb070bab7f7dd2bd2926fb8db9e04 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Nov 2008 10:22:27 -0500
Subject: Btrfs: compat code fixes

The btrfs git kernel tree is used to build a standalone tree for
compiling against older kernels.  This commit makes the standalone tree
work with 2.6.27.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h      | 1 +
 fs/btrfs/compression.c | 2 +-
 fs/btrfs/disk-io.c     | 3 ++-
 fs/btrfs/extent-tree.c | 5 ++++-
 fs/btrfs/inode.c       | 2 +-
 fs/btrfs/ioctl.c       | 1 +
 fs/btrfs/super.c       | 2 ++
 fs/btrfs/volumes.c     | 2 ++
 8 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index dd1defdbfa3..75e4426d6fb 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -25,6 +25,7 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 # define  __pagevec_lru_add_file __pagevec_lru_add
 # define open_bdev_exclusive open_bdev_excl
 # define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
+typedef unsigned __bitwise__ fmode_t;
 #endif
 
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index df05f513e1e..4febe2eb0b8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -34,13 +34,13 @@
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
 #include <linux/pagevec.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "ordered-data.h"
-#include "compat.h"
 #include "compression.h"
 #include "extent_io.h"
 #include "extent_map.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c8dcb47b6d7..981652233f7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,7 +26,8 @@
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
-# include <linux/freezer.h>
+#include <linux/freezer.h>
+#include "compat.h"
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62d49705d14..b33e0bfb99e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,6 +19,8 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/version.h>
+#include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -900,6 +902,7 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#ifdef BIO_RW_DISCARD
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
@@ -909,7 +912,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
 #endif
 }
-
+#endif
 
 static int noinline free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c77e0957f7..7a8ad4292f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -47,7 +48,6 @@
 #include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
-#include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8828109fa58..f3d68457e66 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
 #include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 77c5eff3e20..1975ea273dc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,6 +37,8 @@
 #include <linux/ctype.h>
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
+#include <linux/version.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c3ee63f92a5..724ead54529 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,9 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/version.h>
 #include <asm/div64.h>
+#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
-- 
cgit v1.2.3


From e556ce2c9d2163aea122e91a1512c9a110fece5d Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 20 Nov 2008 10:25:19 -0500
Subject: Btrfs: Drop dirty roots created by log replay immediately when mounted ro

The log replay produces dirty roots. These dirty roots
should be dropped immediately if the fs is mounted as
ro. Otherwise they can be added to the dirty root list
again when remounting the fs as rw. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 981652233f7..fb04665e500 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1757,6 +1757,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						      generation + 1);
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
+
+		if (sb->s_flags & MS_RDONLY) {
+			ret =  btrfs_commit_super(tree_root);
+			BUG_ON(ret);
+		}
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-- 
cgit v1.2.3


From 0e6bd956ed238eb2f69386f251847fe3163532e1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Nov 2008 10:46:35 -0500
Subject: Btrfs: only flush down bios for writeback pages

The btrfs write_cache_pages call has a flush function so that it submits
the bio it has been building before it waits on any writeback pages.

This adds a check so that flush only happens on writeback pages.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3a65c10dce3..87dba851793 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2462,7 +2462,8 @@ retry:
 			}
 
 			if (wbc->sync_mode != WB_SYNC_NONE) {
-				flush_fn(data);
+				if (PageWriteback(page))
+					flush_fn(data);
 				wait_on_page_writeback(page);
 			}
 
-- 
cgit v1.2.3


From ea6a478ed9758cb0f5af228104b9434840aa20ff Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Thu, 20 Nov 2008 12:16:16 -0500
Subject: Btrfs: Fix for lockdep warnings with alloc_mutex and pinned_mutex

This fixes the lockdep complaint by having a different mutex to guard caching
the block group, so you don't end up with this backwards dependency.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c | 32 ++++++++++++++++++--------------
 2 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0f2a9b584fb..166896dd44c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -580,6 +580,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
 	struct mutex alloc_mutex;
+	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b33e0bfb99e..a970472eab1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -170,8 +170,8 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
-			ret = btrfs_add_free_space_lock(block_group, start,
-							size);
+			ret = btrfs_add_free_space(block_group, start,
+						   size);
 			BUG_ON(ret);
 			start = extent_end + 1;
 		} else {
@@ -181,7 +181,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
-		ret = btrfs_add_free_space_lock(block_group, start, size);
+		ret = btrfs_add_free_space(block_group, start, size);
 		BUG_ON(ret);
 	}
 	mutex_unlock(&info->pinned_mutex);
@@ -2842,17 +2842,19 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		if (!block_group)
 			goto new_group_no_lock;
 
+		if (unlikely(!block_group->cached)) {
+			mutex_lock(&block_group->cache_mutex);
+			ret = cache_block_group(root, block_group);
+			mutex_unlock(&block_group->cache_mutex);
+			if (ret)
+				break;
+		}
+
 		mutex_lock(&block_group->alloc_mutex);
 		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
 
-		ret = cache_block_group(root, block_group);
-		if (ret) {
-			mutex_unlock(&block_group->alloc_mutex);
-			break;
-		}
-
-		if (block_group->ro)
+		if (unlikely(block_group->ro))
 			goto new_group;
 
 		free_space = btrfs_find_free_space(block_group, search_start,
@@ -3273,12 +3275,12 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	mutex_lock(&block_group->alloc_mutex);
+	mutex_lock(&block_group->cache_mutex);
 	cache_block_group(root, block_group);
+	mutex_unlock(&block_group->cache_mutex);
 
-	ret = btrfs_remove_free_space_lock(block_group, ins->objectid,
-					   ins->offset);
-	mutex_unlock(&block_group->alloc_mutex);
+	ret = btrfs_remove_free_space(block_group, ins->objectid,
+				      ins->offset);
 	BUG_ON(ret);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
@@ -5801,6 +5803,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		spin_lock_init(&cache->lock);
 		mutex_init(&cache->alloc_mutex);
+		mutex_init(&cache->cache_mutex);
 		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -5854,6 +5857,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
 	mutex_init(&cache->alloc_mutex);
+	mutex_init(&cache->cache_mutex);
 	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
-- 
cgit v1.2.3


From 7a2fcbf7f85737735fd44eb34b62315bccf6d6e4 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:36:55 -0500
Subject: ext4: don't use blocks freed but not yet committed in buddy cache
 init

When we generate buddy cache (especially during resize) we need to
make sure we don't use the blocks freed but not yet committed.  This
makes sure we have the right value of free blocks count in the group
info and also in the bitmap.  This also ensures ordered mode
consistency.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 82 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c17063ddb30..860766421fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -335,6 +335,8 @@ static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group);
 static int ext4_mb_init_per_dev_proc(struct super_block *sb);
 static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -858,7 +860,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			/*
 			 * incore got set to the group block bitmap below
 			 */
+			ext4_lock_group(sb, group);
 			ext4_mb_generate_buddy(sb, data, incore, group);
+			ext4_unlock_group(sb, group);
 			incore = NULL;
 		} else {
 			/* this is block of bitmap */
@@ -872,6 +876,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 			/* mark all preallocated blks used in in-core bitmap */
 			ext4_mb_generate_from_pa(sb, data, group);
+			ext4_mb_generate_from_freelist(sb, data, group);
 			ext4_unlock_group(sb, group);
 
 			/* set incore so that the buddy information can be
@@ -3471,6 +3476,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	return 0;
 }
 
+/*
+ * the function goes through all block freed in the group
+ * but not yet committed and marks them used in in-core bitmap.
+ * buddy must be generated from this bitmap
+ * Need to be called with ext4 group lock (ext4_lock_group)
+ */
+static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+						ext4_group_t group)
+{
+	struct rb_node *n;
+	struct ext4_group_info *grp;
+	struct ext4_free_data *entry;
+
+	grp = ext4_get_group_info(sb, group);
+	n = rb_first(&(grp->bb_free_root));
+
+	while (n) {
+		entry = rb_entry(n, struct ext4_free_data, node);
+		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
+				bitmap, entry->start_blk,
+				entry->count);
+		n = rb_next(n);
+	}
+	return;
+}
+
 /*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
@@ -4568,12 +4599,13 @@ static int can_merge(struct ext4_free_data *entry1,
 
 static noinline_for_stack int
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
-			  ext4_group_t group, ext4_grpblk_t block, int count)
+		      struct ext4_free_data *new_entry)
 {
+	ext4_grpblk_t block;
+	struct ext4_free_data *entry;
 	struct ext4_group_info *db = e4b->bd_info;
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_free_data *entry, *new_entry;
 	struct rb_node **n = &db->bb_free_root.rb_node, *node;
 	struct rb_node *parent = NULL, *new_node;
 
@@ -4581,14 +4613,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
-	new_entry->start_blk = block;
-	new_entry->group  = group;
-	new_entry->count = count;
-	new_entry->t_tid = handle->h_transaction->t_tid;
 	new_node = &new_entry->node;
+	block = new_entry->start_blk;
 
-	ext4_lock_group(sb, group);
 	if (!*n) {
 		/* first free block exent. We need to
 		   protect buddy cache from being freed,
@@ -4606,7 +4633,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			ext4_unlock_group(sb, group);
 			ext4_error(sb, __func__,
 			    "Double free of blocks %d (%d %d)",
 			    block, entry->start_blk, entry->count);
@@ -4648,7 +4674,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	spin_lock(&sbi->s_md_lock);
 	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
 	spin_unlock(&sbi->s_md_lock);
-	ext4_unlock_group(sb, group);
 	return 0;
 }
 
@@ -4753,15 +4778,6 @@ do_more:
 			BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-			bit, count);
-
-	/* We dirtied the bitmap block */
-	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-	if (err)
-		goto error_return;
-
 	if (ac) {
 		ac->ac_b_ex.fe_group = block_group;
 		ac->ac_b_ex.fe_start = bit;
@@ -4773,11 +4789,29 @@ do_more:
 	if (err)
 		goto error_return;
 	if (metadata && ext4_handle_valid(handle)) {
-		/* blocks being freed are metadata. these blocks shouldn't
-		 * be used until this transaction is committed */
-		ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
+		struct ext4_free_data *new_entry;
+		/*
+		 * blocks being freed are metadata. these blocks shouldn't
+		 * be used until this transaction is committed
+		 */
+		new_entry  = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		new_entry->start_blk = bit;
+		new_entry->group  = block_group;
+		new_entry->count = count;
+		new_entry->t_tid = handle->h_transaction->t_tid;
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
+		ext4_mb_free_metadata(handle, &e4b, new_entry);
+		ext4_unlock_group(sb, block_group);
 	} else {
 		ext4_lock_group(sb, block_group);
+		/* need to update group_info->bb_free and bitmap
+		 * with group lock held. generate_buddy look at
+		 * them with group lock_held
+		 */
+		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
+				bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
 		ext4_unlock_group(sb, block_group);
@@ -4800,6 +4834,10 @@ do_more:
 
 	*freed += count;
 
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
 	/* And the group descriptor block */
 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
 	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
-- 
cgit v1.2.3


From b7be019e80da4db96d283734d55366014509911c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sun, 23 Nov 2008 23:51:53 -0500
Subject: ext4: Fix lockdep recursive locking warning

In ext4_mb_init_group(), if the filesystem block size is less than
PAGE_SIZE/2, the code tries to grab alloc_sem for multiple block
groups in a loop.  We need to allow for this by using
down_write_nested() and passing in the loop index as a lock subclass
number.  This works because no other code path needs to take multiple
alloc_sem's.  Note that lockdep will fail for a filesystem blocksize
smaller than PAGE_SIZE/16 (e.g., a 1k filesystem blocksize with
a 32k page size, or a 2k filesystem blocksize with a 64k page size,
etc.).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 860766421fe..0bf4c4c06b1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1780,7 +1780,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
 		 * no block allocation going on in any
 		 * of that groups
 		 */
-		down_write(&grp->alloc_sem);
+		down_write_nested(&grp->alloc_sem, i);
 	}
 	return i;
 }
-- 
cgit v1.2.3


From 1729a16c2c92bbd9e54ac7cad3101fea2e073aa5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 26 Nov 2008 12:03:54 +0100
Subject: fuse: style fixes

Fix coding style errors reported by checkpatch and others.  Update
copyright date to 2008.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/control.c |  6 ++++--
 fs/fuse/dev.c     | 22 +++++++++++++---------
 fs/fuse/dir.c     | 12 +++++++-----
 fs/fuse/file.c    |  4 ++--
 fs/fuse/fuse_i.h  | 38 +++++++++++++++++++-------------------
 fs/fuse/inode.c   | 19 +++++++++----------
 6 files changed, 54 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 4f3cab32141..99c99dfb037 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
 	size_t size;
 
 	if (!*ppos) {
+		long value;
 		struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
 		if (!fc)
 			return 0;
 
-		file->private_data=(void *)(long)atomic_read(&fc->num_waiting);
+		value = atomic_read(&fc->num_waiting);
+		file->private_data = (void *)value;
 		fuse_conn_put(fc);
 	}
 	size = sprintf(tmp, "%ld\n", (long)file->private_data);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b72361479be..85a23bb524f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -539,8 +539,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 		BUG_ON(!cs->nr_segs);
 		cs->seglen = cs->iov[0].iov_len;
 		cs->addr = (unsigned long) cs->iov[0].iov_base;
-		cs->iov ++;
-		cs->nr_segs --;
+		cs->iov++;
+		cs->nr_segs--;
 	}
 	down_read(&current->mm->mmap_sem);
 	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
@@ -589,9 +589,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
 		kunmap_atomic(mapaddr, KM_USER1);
 	}
 	while (count) {
-		int err;
-		if (!cs->len && (err = fuse_copy_fill(cs)))
-			return err;
+		if (!cs->len) {
+			int err = fuse_copy_fill(cs);
+			if (err)
+				return err;
+		}
 		if (page) {
 			void *mapaddr = kmap_atomic(page, KM_USER1);
 			void *buf = mapaddr + offset;
@@ -631,9 +633,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
 	while (size) {
-		int err;
-		if (!cs->len && (err = fuse_copy_fill(cs)))
-			return err;
+		if (!cs->len) {
+			int err = fuse_copy_fill(cs);
+			if (err)
+				return err;
+		}
 		fuse_copy_do(cs, &val, &size);
 	}
 	return 0;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fd03330cade..9e7c5385699 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 				return 0;
 			}
 			spin_lock(&fc->lock);
-			fi->nlookup ++;
+			fi->nlookup++;
 			spin_unlock(&fc->lock);
 		}
 		fuse_put_request(fc, forget_req);
@@ -637,9 +637,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 	if (!err) {
 		struct inode *inode = entry->d_inode;
 
-		/* Set nlink to zero so the inode can be cleared, if
-                   the inode does have more links this will be
-                   discovered at the next lookup/getattr */
+		/*
+		 * Set nlink to zero so the inode can be cleared, if the inode
+		 * does have more links this will be discovered at the next
+		 * lookup/getattr.
+		 */
 		clear_nlink(inode);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b8..86054f437d1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -543,7 +543,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 		}
 	}
 	req->pages[req->num_pages] = page;
-	req->num_pages ++;
+	req->num_pages++;
 	return 0;
 }
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 35accfdd747..4fc5131f5c9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -355,19 +355,19 @@ struct fuse_conn {
 	/** Connection failed (version mismatch).  Cannot race with
 	    setting other bitfields since it is only set once in INIT
 	    reply, before any other request, and never cleared */
-	unsigned conn_error : 1;
+	unsigned conn_error:1;
 
 	/** Connection successful.  Only set in INIT */
-	unsigned conn_init : 1;
+	unsigned conn_init:1;
 
 	/** Do readpages asynchronously?  Only set in INIT */
-	unsigned async_read : 1;
+	unsigned async_read:1;
 
 	/** Do not send separate SETATTR request before open(O_TRUNC)  */
-	unsigned atomic_o_trunc : 1;
+	unsigned atomic_o_trunc:1;
 
 	/** Filesystem supports NFS exporting.  Only set in INIT */
-	unsigned export_support : 1;
+	unsigned export_support:1;
 
 	/*
 	 * The following bitfields are only for optimization purposes
@@ -375,43 +375,43 @@ struct fuse_conn {
 	 */
 
 	/** Is fsync not implemented by fs? */
-	unsigned no_fsync : 1;
+	unsigned no_fsync:1;
 
 	/** Is fsyncdir not implemented by fs? */
-	unsigned no_fsyncdir : 1;
+	unsigned no_fsyncdir:1;
 
 	/** Is flush not implemented by fs? */
-	unsigned no_flush : 1;
+	unsigned no_flush:1;
 
 	/** Is setxattr not implemented by fs? */
-	unsigned no_setxattr : 1;
+	unsigned no_setxattr:1;
 
 	/** Is getxattr not implemented by fs? */
-	unsigned no_getxattr : 1;
+	unsigned no_getxattr:1;
 
 	/** Is listxattr not implemented by fs? */
-	unsigned no_listxattr : 1;
+	unsigned no_listxattr:1;
 
 	/** Is removexattr not implemented by fs? */
-	unsigned no_removexattr : 1;
+	unsigned no_removexattr:1;
 
 	/** Are file locking primitives not implemented by fs? */
-	unsigned no_lock : 1;
+	unsigned no_lock:1;
 
 	/** Is access not implemented by fs? */
-	unsigned no_access : 1;
+	unsigned no_access:1;
 
 	/** Is create not implemented by fs? */
-	unsigned no_create : 1;
+	unsigned no_create:1;
 
 	/** Is interrupt not implemented by fs? */
-	unsigned no_interrupt : 1;
+	unsigned no_interrupt:1;
 
 	/** Is bmap not implemented by fs? */
-	unsigned no_bmap : 1;
+	unsigned no_bmap:1;
 
 	/** Do multi-page cached writes */
-	unsigned big_writes : 1;
+	unsigned big_writes:1;
 
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2e99f34b443..739595b4196 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -37,10 +37,10 @@ struct fuse_mount_data {
 	unsigned rootmode;
 	unsigned user_id;
 	unsigned group_id;
-	unsigned fd_present : 1;
-	unsigned rootmode_present : 1;
-	unsigned user_id_present : 1;
-	unsigned group_id_present : 1;
+	unsigned fd_present:1;
+	unsigned rootmode_present:1;
+	unsigned user_id_present:1;
+	unsigned group_id_present:1;
 	unsigned flags;
 	unsigned max_read;
 	unsigned blksize;
@@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 	fi = get_fuse_inode(inode);
 	spin_lock(&fc->lock);
-	fi->nlookup ++;
+	fi->nlookup++;
 	spin_unlock(&fc->lock);
 	fuse_change_attributes(inode, attr, attr_valid, attr_version);
 
@@ -553,8 +553,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 	return fuse_iget(sb, 1, 0, &attr, 0, 0);
 }
 
-struct fuse_inode_handle
-{
+struct fuse_inode_handle {
 	u64 nodeid;
 	u32 generation;
 };
@@ -952,7 +951,7 @@ static inline void unregister_fuseblk(void)
 
 static void fuse_inode_init_once(void *foo)
 {
-	struct inode * inode = foo;
+	struct inode *inode = foo;
 
 	inode_init_once(inode);
 }
@@ -1031,7 +1030,7 @@ static int __init fuse_init(void)
 {
 	int res;
 
-	printk("fuse init (API version %i.%i)\n",
+	printk(KERN_INFO "fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
 	INIT_LIST_HEAD(&fuse_conn_list);
-- 
cgit v1.2.3


From e9bb09dd6c5b8ec6a971ed6251df5eba3a4c8d3c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:54 +0100
Subject: fuse: don't let fuse_req->end() put the base reference

fuse_req->end() was supposed to put the base reference but there's
no reason why it should.  It only makes things more complex.  Move it
out of ->end() and make it the responsibility of request_end().

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c   | 5 ++---
 fs/fuse/file.c  | 5 ++---
 fs/fuse/inode.c | 1 -
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 85a23bb524f..225388f54ae 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -293,8 +293,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 	wake_up(&req->waitq);
 	if (end)
 		end(fc, req);
-	else
-		fuse_put_request(fc, req);
+	fuse_put_request(fc, req);
 }
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
@@ -1006,11 +1005,11 @@ static void end_io_requests(struct fuse_conn *fc)
 		wake_up(&req->waitq);
 		if (end) {
 			req->end = NULL;
-			/* The end function will consume this reference */
 			__fuse_get_request(req);
 			spin_unlock(&fc->lock);
 			wait_event(req->waitq, !req->locked);
 			end(fc, req);
+			fuse_put_request(fc, req);
 			spin_lock(&fc->lock);
 		}
 	}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 86054f437d1..61726980391 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -79,7 +79,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	dput(req->misc.release.dentry);
 	mntput(req->misc.release.vfsmount);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_file_put(struct fuse_file *ff)
@@ -493,7 +492,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 	}
 	if (req->ff)
 		fuse_file_put(req->ff);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file,
@@ -513,6 +511,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 	} else {
 		request_send(fc, req);
 		fuse_readpages_end(fc, req);
+		fuse_put_request(fc, req);
 	}
 }
 
@@ -1042,7 +1041,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
 	__free_page(req->pages[0]);
 	fuse_file_put(req->ff);
-	fuse_put_request(fc, req);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1086,6 +1084,7 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_finish(fc, req);
 	spin_unlock(&fc->lock);
 	fuse_writepage_free(fc, req);
+	fuse_put_request(fc, req);
 	spin_lock(&fc->lock);
 }
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 739595b4196..fa474989ec7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -760,7 +760,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fuse_put_request(fc, req);
 	fc->blocked = 0;
 	wake_up_all(&fc->blocked_waitq);
 }
-- 
cgit v1.2.3


From 59efec7b903987dcb60b9ebc85c7acd4443a11a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: implement ioctl support

Generic ioctl support is tricky to implement because only the ioctl
implementation itself knows which memory regions need to be read
and/or written.  To support this, fuse client can request retry of
ioctl specifying memory regions to read and write.  Deep copying
(nested pointers) can be implemented by retrying multiple times
resolving one depth of dereference at a time.

For security and cleanliness considerations, ioctl implementation has
restricted mode where the kernel determines data transfer directions
and sizes using the _IOC_*() macros on the ioctl command.  In this
mode, retry is not allowed.

For all FUSE servers, restricted mode is enforced.  Unrestricted ioctl
will be used by CUSE.

Please read the comment on top of fs/fuse/file.c::fuse_file_do_ioctl()
for more information.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 280 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 61726980391..baed06ea762 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1469,6 +1469,282 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
 	return retval;
 }
 
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+			unsigned int nr_segs, size_t bytes, bool to_user)
+{
+	struct iov_iter ii;
+	int page_idx = 0;
+
+	if (!bytes)
+		return 0;
+
+	iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+
+	while (iov_iter_count(&ii)) {
+		struct page *page = pages[page_idx++];
+		size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+		void *kaddr, *map;
+
+		kaddr = map = kmap(page);
+
+		while (todo) {
+			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			size_t copy = min(todo, iov_len);
+			size_t left;
+
+			if (!to_user)
+				left = copy_from_user(kaddr, uaddr, copy);
+			else
+				left = copy_to_user(uaddr, kaddr, copy);
+
+			if (unlikely(left))
+				return -EFAULT;
+
+			iov_iter_advance(&ii, copy);
+			todo -= copy;
+			kaddr += copy;
+		}
+
+		kunmap(map);
+	}
+
+	return 0;
+}
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written.  Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs.  Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ *	char	*buf;
+ *	size_t	buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be zero; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
+ *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY.  This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg, unsigned int flags)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_ioctl_in inarg = {
+		.fh = ff->fh,
+		.cmd = cmd,
+		.arg = arg,
+		.flags = flags
+	};
+	struct fuse_ioctl_out outarg;
+	struct fuse_req *req = NULL;
+	struct page **pages = NULL;
+	struct page *iov_page = NULL;
+	struct iovec *in_iov = NULL, *out_iov = NULL;
+	unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+	size_t in_size, out_size, transferred;
+	int err;
+
+	/* assume all the iovs returned by client always fits in a page */
+	BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+	if (!fuse_allow_task(fc, current))
+		return -EACCES;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	err = -ENOMEM;
+	pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+	iov_page = alloc_page(GFP_KERNEL);
+	if (!pages || !iov_page)
+		goto out;
+
+	/*
+	 * If restricted, initialize IO parameters as encoded in @cmd.
+	 * RETRY from server is not allowed.
+	 */
+	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+		struct iovec *iov = page_address(iov_page);
+
+		iov->iov_base = (void *)arg;
+		iov->iov_len = _IOC_SIZE(cmd);
+
+		if (_IOC_DIR(cmd) & _IOC_WRITE) {
+			in_iov = iov;
+			in_iovs = 1;
+		}
+
+		if (_IOC_DIR(cmd) & _IOC_READ) {
+			out_iov = iov;
+			out_iovs = 1;
+		}
+	}
+
+ retry:
+	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+	inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+	/*
+	 * Out data can be used either for actual out data or iovs,
+	 * make sure there always is at least one page.
+	 */
+	out_size = max_t(size_t, out_size, PAGE_SIZE);
+	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+	/* make sure there are enough buffer pages and init request with them */
+	err = -ENOMEM;
+	if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+		goto out;
+	while (num_pages < max_pages) {
+		pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+		if (!pages[num_pages])
+			goto out;
+		num_pages++;
+	}
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		req = NULL;
+		goto out;
+	}
+	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+	req->num_pages = num_pages;
+
+	/* okay, let's send it to the client */
+	req->in.h.opcode = FUSE_IOCTL;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	if (in_size) {
+		req->in.numargs++;
+		req->in.args[1].size = in_size;
+		req->in.argpages = 1;
+
+		err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+					   false);
+		if (err)
+			goto out;
+	}
+
+	req->out.numargs = 2;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	req->out.args[1].size = out_size;
+	req->out.argpages = 1;
+	req->out.argvar = 1;
+
+	request_send(fc, req);
+	err = req->out.h.error;
+	transferred = req->out.args[1].size;
+	fuse_put_request(fc, req);
+	req = NULL;
+	if (err)
+		goto out;
+
+	/* did it ask for retry? */
+	if (outarg.flags & FUSE_IOCTL_RETRY) {
+		char *vaddr;
+
+		/* no retry if in restricted mode */
+		err = -EIO;
+		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+			goto out;
+
+		in_iovs = outarg.in_iovs;
+		out_iovs = outarg.out_iovs;
+
+		/*
+		 * Make sure things are in boundary, separate checks
+		 * are to protect against overflow.
+		 */
+		err = -ENOMEM;
+		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+		    out_iovs > FUSE_IOCTL_MAX_IOV ||
+		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+			goto out;
+
+		err = -EIO;
+		if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred)
+			goto out;
+
+		/* okay, copy in iovs and retry */
+		vaddr = kmap_atomic(pages[0], KM_USER0);
+		memcpy(page_address(iov_page), vaddr, transferred);
+		kunmap_atomic(vaddr, KM_USER0);
+
+		in_iov = page_address(iov_page);
+		out_iov = in_iov + in_iovs;
+
+		goto retry;
+	}
+
+	err = -EIO;
+	if (transferred > inarg.out_size)
+		goto out;
+
+	err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+	if (req)
+		fuse_put_request(fc, req);
+	if (iov_page)
+		__free_page(iov_page);
+	while (num_pages)
+		__free_page(pages[--num_pages]);
+	kfree(pages);
+
+	return err ? err : outarg.result;
+}
+
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	return fuse_file_do_ioctl(file, cmd, arg, 0);
+}
+
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -1483,6 +1759,8 @@ static const struct file_operations fuse_file_operations = {
 	.lock		= fuse_file_lock,
 	.flock		= fuse_file_flock,
 	.splice_read	= generic_file_splice_read,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -1495,6 +1773,8 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.fsync		= fuse_fsync,
 	.lock		= fuse_file_lock,
 	.flock		= fuse_file_flock,
+	.unlocked_ioctl	= fuse_file_ioctl,
+	.compat_ioctl	= fuse_file_compat_ioctl,
 	/* no mmap and splice_read */
 };
 
-- 
cgit v1.2.3


From acf99433d98c2570a619d8fb8b51abce4e532059 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: add file kernel handle

The file handle, fuse_file->fh, is an opaque value supplied by userland
FUSE server and uniqueness is not guaranteed.  Add file kernel handle,
fuse_file->kh, which is allocated by the kernel on file allocation and
guaranteed to be unique.

This will be used by poll to match notification to the respective file
but can be used for other purposes where unique file handle is
necessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c    | 2 +-
 fs/fuse/file.c   | 8 ++++++--
 fs/fuse/fuse_i.h | 8 +++++++-
 fs/fuse/inode.c  | 1 +
 4 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 9e7c5385699..16ae55d347b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 		goto out_put_forget_req;
 
 	err = -ENOMEM;
-	ff = fuse_file_alloc();
+	ff = fuse_file_alloc(fc);
 	if (!ff)
 		goto out_put_request;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index baed06ea762..a28ced678d3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -46,7 +46,7 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	return err;
 }
 
-struct fuse_file *fuse_file_alloc(void)
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 {
 	struct fuse_file *ff;
 	ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
@@ -58,6 +58,9 @@ struct fuse_file *fuse_file_alloc(void)
 		} else {
 			INIT_LIST_HEAD(&ff->write_entry);
 			atomic_set(&ff->count, 0);
+			spin_lock(&fc->lock);
+			ff->kh = ++fc->khctr;
+			spin_unlock(&fc->lock);
 		}
 	}
 	return ff;
@@ -108,6 +111,7 @@ void fuse_finish_open(struct inode *inode, struct file *file,
 
 int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 {
+	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_open_out outarg;
 	struct fuse_file *ff;
 	int err;
@@ -120,7 +124,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir)
 	if (err)
 		return err;
 
-	ff = fuse_file_alloc();
+	ff = fuse_file_alloc(fc);
 	if (!ff)
 		return -ENOMEM;
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 4fc5131f5c9..86f01330382 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -100,6 +100,9 @@ struct fuse_file {
 	/** Request reserved for flush and release */
 	struct fuse_req *reserved_req;
 
+	/** Kernel file handle guaranteed to be unique */
+	u64 kh;
+
 	/** File handle used by userspace */
 	u64 fh;
 
@@ -322,6 +325,9 @@ struct fuse_conn {
 	/** The list of requests under I/O */
 	struct list_head io;
 
+	/** The next unique kernel file handle */
+	u64 khctr;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
@@ -499,7 +505,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file,
  */
 int fuse_open_common(struct inode *inode, struct file *file, int isdir);
 
-struct fuse_file *fuse_file_alloc(void);
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
 void fuse_file_free(struct fuse_file *ff);
 void fuse_finish_open(struct inode *inode, struct file *file,
 		      struct fuse_file *ff, struct fuse_open_out *outarg);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fa474989ec7..0e15bc221d2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -485,6 +485,7 @@ static struct fuse_conn *new_conn(struct super_block *sb)
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
 		/* fuse does it's own writeback accounting */
 		fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+		fc->khctr = 0;
 		fc->dev = sb->s_dev;
 		err = bdi_init(&fc->bdi);
 		if (err)
-- 
cgit v1.2.3


From 8599396b5062bf6bd2a0b433503849e2322df1c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: implement unsolicited notification

Clients always used to write only in response to read requests.  To
implement poll efficiently, clients should be able to issue
unsolicited notifications.  This patch implements basic notification
support.

Zero fuse_out_header.unique is now accepted and considered unsolicited
notification and the error field contains notification code.  This
patch doesn't implement any actual notification.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 225388f54ae..ffd670bb8c8 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -816,6 +816,15 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	return err;
 }
 
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+		       unsigned int size, struct fuse_copy_state *cs)
+{
+	switch (code) {
+	default:
+		return -EINVAL;
+	}
+}
+
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
@@ -879,9 +888,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	err = fuse_copy_one(&cs, &oh, sizeof(oh));
 	if (err)
 		goto err_finish;
+
+	err = -EINVAL;
+	if (oh.len != nbytes)
+		goto err_finish;
+
+	/*
+	 * Zero oh.unique indicates unsolicited notification message
+	 * and error contains notification code.
+	 */
+	if (!oh.unique) {
+		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+		fuse_copy_finish(&cs);
+		return err ? err : nbytes;
+	}
+
 	err = -EINVAL;
-	if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
-	    oh.len != nbytes)
+	if (oh.error <= -1000 || oh.error > 0)
 		goto err_finish;
 
 	spin_lock(&fc->lock);
-- 
cgit v1.2.3


From 95668a69a4bb862063c4d28a746e55107dee7b98 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: implement poll support

Implement poll support.  Polled files are indexed using kh in a RB
tree rooted at fuse_conn->polled_files.

Client should send FUSE_NOTIFY_POLL notification once after processing
FUSE_POLL which has FUSE_POLL_SCHEDULE_NOTIFY set.  Sending
notification unconditionally after the latest poll or every time file
content might have changed is inefficient but won't cause malfunction.

fuse_file_poll() can sleep and requires patches from the following
thread which allows f_op->poll() to sleep.

  http://thread.gmane.org/gmane.linux.kernel/726176

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c    |  19 ++++++++
 fs/fuse/file.c   | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/fuse/fuse_i.h |  20 +++++++++
 fs/fuse/inode.c  |   1 +
 4 files changed, 172 insertions(+)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ffd670bb8c8..6176e444c76 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -816,10 +816,29 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
 	return err;
 }
 
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+			    struct fuse_copy_state *cs)
+{
+	struct fuse_notify_poll_wakeup_out outarg;
+	int err;
+
+	if (size != sizeof(outarg))
+		return -EINVAL;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		return err;
+
+	return fuse_notify_poll_wakeup(fc, &outarg);
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
 	switch (code) {
+	case FUSE_NOTIFY_POLL:
+		return fuse_notify_poll(fc, size, cs);
+
 	default:
 		return -EINVAL;
 	}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a28ced678d3..b3a944e4bb9 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -62,6 +62,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 			ff->kh = ++fc->khctr;
 			spin_unlock(&fc->lock);
 		}
+		RB_CLEAR_NODE(&ff->polled_node);
+		init_waitqueue_head(&ff->poll_wait);
 	}
 	return ff;
 }
@@ -170,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir)
 
 		spin_lock(&fc->lock);
 		list_del(&ff->write_entry);
+		if (!RB_EMPTY_NODE(&ff->polled_node))
+			rb_erase(&ff->polled_node, &fc->polled_files);
 		spin_unlock(&fc->lock);
+
+		wake_up_interruptible_sync(&ff->poll_wait);
 		/*
 		 * Normally this will send the RELEASE request,
 		 * however if some asynchronous READ or WRITE requests
@@ -1749,6 +1755,130 @@ static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
 	return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT);
 }
 
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+					      struct rb_node **parent_out)
+{
+	struct rb_node **link = &fc->polled_files.rb_node;
+	struct rb_node *last = NULL;
+
+	while (*link) {
+		struct fuse_file *ff;
+
+		last = *link;
+		ff = rb_entry(last, struct fuse_file, polled_node);
+
+		if (kh < ff->kh)
+			link = &last->rb_left;
+		else if (kh > ff->kh)
+			link = &last->rb_right;
+		else
+			return link;
+	}
+
+	if (parent_out)
+		*parent_out = last;
+	return link;
+}
+
+/*
+ * The file is about to be polled.  Make sure it's on the polled_files
+ * RB tree.  Note that files once added to the polled_files tree are
+ * not removed before the file is released.  This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+				      struct fuse_file *ff)
+{
+	spin_lock(&fc->lock);
+	if (RB_EMPTY_NODE(&ff->polled_node)) {
+		struct rb_node **link, *parent;
+
+		link = fuse_find_polled_node(fc, ff->kh, &parent);
+		BUG_ON(*link);
+		rb_link_node(&ff->polled_node, parent, link);
+		rb_insert_color(&ff->polled_node, &fc->polled_files);
+	}
+	spin_unlock(&fc->lock);
+}
+
+static unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+	struct fuse_poll_out outarg;
+	struct fuse_req *req;
+	int err;
+
+	if (fc->no_poll)
+		return DEFAULT_POLLMASK;
+
+	poll_wait(file, &ff->poll_wait, wait);
+
+	/*
+	 * Ask for notification iff there's someone waiting for it.
+	 * The client may ignore the flag and always notify.
+	 */
+	if (waitqueue_active(&ff->poll_wait)) {
+		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+		fuse_register_polled_file(fc, ff);
+	}
+
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->in.h.opcode = FUSE_POLL;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->out.numargs = 1;
+	req->out.args[0].size = sizeof(outarg);
+	req->out.args[0].value = &outarg;
+	request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+
+	if (!err)
+		return outarg.revents;
+	if (err == -ENOSYS) {
+		fc->no_poll = 1;
+		return DEFAULT_POLLMASK;
+	}
+	return POLLERR;
+}
+
+/*
+ * This is called from fuse_notify_poll() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg)
+{
+	u64 kh = outarg->kh;
+	struct rb_node **link;
+
+	spin_lock(&fc->lock);
+
+	link = fuse_find_polled_node(fc, kh, NULL);
+	if (*link) {
+		struct fuse_file *ff;
+
+		ff = rb_entry(*link, struct fuse_file, polled_node);
+		wake_up_interruptible_sync(&ff->poll_wait);
+	}
+
+	spin_unlock(&fc->lock);
+	return 0;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -1765,6 +1895,7 @@ static const struct file_operations fuse_file_operations = {
 	.splice_read	= generic_file_splice_read,
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -1779,6 +1910,7 @@ static const struct file_operations fuse_direct_io_file_operations = {
 	.flock		= fuse_file_flock,
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
+	.poll		= fuse_file_poll,
 	/* no mmap and splice_read */
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 86f01330382..986fbd4c1ff 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -19,6 +19,8 @@
 #include <linux/backing-dev.h>
 #include <linux/mutex.h>
 #include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -111,6 +113,12 @@ struct fuse_file {
 
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
+
+	/** RB node to be linked on fuse_conn->polled_files */
+	struct rb_node polled_node;
+
+	/** Wait queue head for poll */
+	wait_queue_head_t poll_wait;
 };
 
 /** One input argument of a request */
@@ -328,6 +336,9 @@ struct fuse_conn {
 	/** The next unique kernel file handle */
 	u64 khctr;
 
+	/** rbtree of fuse_files waiting for poll events indexed by kh */
+	struct rb_root polled_files;
+
 	/** Number of requests currently in the background */
 	unsigned num_background;
 
@@ -416,6 +427,9 @@ struct fuse_conn {
 	/** Is bmap not implemented by fs? */
 	unsigned no_bmap:1;
 
+	/** Is poll not implemented by fs? */
+	unsigned no_poll:1;
+
 	/** Do multi-page cached writes */
 	unsigned big_writes:1;
 
@@ -524,6 +538,12 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir);
 int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 		      int isdir);
 
+/**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+			    struct fuse_notify_poll_wakeup_out *outarg);
+
 /**
  * Initialize file operations on a regular file
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0e15bc221d2..ba725612808 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -486,6 +486,7 @@ static struct fuse_conn *new_conn(struct super_block *sb)
 		/* fuse does it's own writeback accounting */
 		fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
 		fc->khctr = 0;
+		fc->polled_files = RB_ROOT;
 		fc->dev = sb->s_dev;
 		err = bdi_init(&fc->bdi);
 		if (err)
-- 
cgit v1.2.3


From b93f858ab2a4bee779c360002f313ad6c3504cdc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: add fuse_ prefix to several functions

Add fuse_ prefix to request_send*() and get_root_inode() as some of
those functions will be exported for CUSE.  With or without CUSE
export, having the function names scoped is a good idea for
debuggability.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c    | 23 ++++++++++++-----------
 fs/fuse/dir.c    | 34 +++++++++++++++++-----------------
 fs/fuse/file.c   | 28 ++++++++++++++--------------
 fs/fuse/fuse_i.h |  9 +++++----
 fs/fuse/inode.c  | 12 ++++++------
 5 files changed, 54 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6176e444c76..3c44ce359a0 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -379,7 +379,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
 	spin_lock(&fc->lock);
@@ -398,8 +398,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 	spin_unlock(&fc->lock);
 }
 
-static void request_send_nowait_locked(struct fuse_conn *fc,
-				       struct fuse_req *req)
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+					    struct fuse_req *req)
 {
 	req->background = 1;
 	fc->num_background++;
@@ -413,11 +413,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc,
 	flush_bg_queue(fc);
 }
 
-static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
 	if (fc->connected) {
-		request_send_nowait_locked(fc, req);
+		fuse_request_send_nowait_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
 		req->out.h.error = -ENOTCONN;
@@ -425,16 +425,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 	}
 }
 
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 0;
-	request_send_nowait(fc, req);
+	fuse_request_send_nowait(fc, req);
 }
 
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	request_send_nowait(fc, req);
+	fuse_request_send_nowait(fc, req);
 }
 
 /*
@@ -442,10 +442,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
  *
  * fc->connected must have been checked previously
  */
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req)
 {
 	req->isreply = 1;
-	request_send_nowait_locked(fc, req);
+	fuse_request_send_nowait_locked(fc, req);
 }
 
 /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 16ae55d347b..f310768a02f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 		parent = dget_parent(entry);
 		fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
 				 &entry->d_name, &outarg);
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		dput(parent);
 		err = req->out.h.error;
 		fuse_put_request(fc, req);
@@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 	attr_version = fuse_get_attr_version(fc);
 
 	fuse_lookup_init(fc, req, nodeid, name, outarg);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	/* Zero nodeid is same as -ENOENT, but with valid timeout */
@@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff,
 {
 	fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
-	request_send(fc, ff->reserved_req);
+	fuse_request_send(fc, ff->reserved_req);
 	fuse_put_request(fc, ff->reserved_req);
 	kfree(ff);
 }
@@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	req->out.args[0].value = &outentry;
 	req->out.args[1].size = sizeof(outopen);
 	req->out.args[1].value = &outopen;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	if (err) {
 		if (err == -ENOSYS)
@@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err)
@@ -631,7 +631,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -664,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	req->in.numargs = 1;
 	req->in.args[0].size = entry->d_name.len + 1;
 	req->in.args[0].value = entry->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -697,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	req->in.args[1].value = oldent->d_name.name;
 	req->in.args[2].size = newent->d_name.len + 1;
 	req->in.args[2].value = newent->d_name.name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -813,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err) {
@@ -906,7 +906,7 @@ static int fuse_access(struct inode *inode, int mask)
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -1028,7 +1028,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	req->num_pages = 1;
 	req->pages[0] = page;
 	fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1062,7 +1062,7 @@ static char *read_link(struct dentry *dentry)
 	req->out.numargs = 1;
 	req->out.args[0].size = PAGE_SIZE - 1;
 	req->out.args[0].value = link;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	if (req->out.h.error) {
 		free_page((unsigned long) link);
 		link = ERR_PTR(req->out.h.error);
@@ -1268,7 +1268,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	else
 		req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err) {
@@ -1362,7 +1362,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	req->in.args[1].value = name;
 	req->in.args[2].size = size;
 	req->in.args[2].value = value;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -1408,7 +1408,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 		req->out.args[0].size = sizeof(outarg);
 		req->out.args[0].value = &outarg;
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	ret = req->out.h.error;
 	if (!ret)
 		ret = size ? req->out.args[0].size : outarg.size;
@@ -1458,7 +1458,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 		req->out.args[0].size = sizeof(outarg);
 		req->out.args[0].value = &outarg;
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	ret = req->out.h.error;
 	if (!ret)
 		ret = size ? req->out.args[0].size : outarg.size;
@@ -1491,7 +1491,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	req->in.numargs = 1;
 	req->in.args[0].size = strlen(name) + 1;
 	req->in.args[0].value = name;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b3a944e4bb9..80b5fa80f5e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -39,7 +39,7 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(*outargp);
 	req->out.args[0].value = outargp;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
@@ -93,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff)
 		struct inode *inode = req->misc.release.dentry->d_inode;
 		struct fuse_conn *fc = get_fuse_conn(inode);
 		req->end = fuse_release_end;
-		request_send_background(fc, req);
+		fuse_request_send_background(fc, req);
 		kfree(ff);
 	}
 }
@@ -289,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->force = 1;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -353,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS) {
@@ -405,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file,
 		inarg->read_flags |= FUSE_READ_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	return req->out.args[0].size;
 }
 
@@ -517,9 +517,9 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file,
 		struct fuse_file *ff = file->private_data;
 		req->ff = fuse_file_get(ff);
 		req->end = fuse_readpages_end;
-		request_send_background(fc, req);
+		fuse_request_send_background(fc, req);
 	} else {
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		fuse_readpages_end(fc, req);
 		fuse_put_request(fc, req);
 	}
@@ -645,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file,
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	return req->misc.write.out.size;
 }
 
@@ -1087,7 +1087,7 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
 
 	req->in.args[1].size = inarg->size;
 	fi->writectr++;
-	request_send_background_locked(fc, req);
+	fuse_request_send_background_locked(fc, req);
 	return;
 
  out_free:
@@ -1334,7 +1334,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (!err)
@@ -1366,7 +1366,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 		return PTR_ERR(req);
 
 	fuse_lk_fill(req, file, fl, opcode, pid, flock);
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	/* locking is restartable */
 	if (err == -EINTR)
@@ -1442,7 +1442,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 	if (err == -ENOSYS)
@@ -1681,7 +1681,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
 	req->out.argpages = 1;
 	req->out.argvar = 1;
 
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	transferred = req->out.args[1].size;
 	fuse_put_request(fc, req);
@@ -1842,7 +1842,7 @@ static unsigned fuse_file_poll(struct file *file, poll_table *wait)
 	req->out.numargs = 1;
 	req->out.args[0].size = sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 986fbd4c1ff..c08e7e89092 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -619,19 +619,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 /**
  * Send a request (synchronous)
  */
-void request_send(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
  * Send a request with no reply
  */
-void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
  * Send a request in the background
  */
-void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
-void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req);
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+					 struct fuse_req *req);
 
 /* Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ba725612808..ee91639c6d3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req,
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(struct fuse_forget_in);
 	req->in.args[0].value = inarg;
-	request_send_noreply(fc, req);
+	fuse_request_send_noreply(fc, req);
 }
 
 static void fuse_clear_inode(struct inode *inode)
@@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc)
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
 		req->force = 1;
-		request_send(fc, req);
+		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
 }
@@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
 	req->out.args[0].size =
 		fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
 	req->out.args[0].value = &outarg;
-	request_send(fc, req);
+	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	if (!err)
 		convert_fuse_statfs(buf, &outarg.st);
@@ -544,7 +544,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
 	return fc;
 }
 
-static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
 	memset(&attr, 0, sizeof(attr));
@@ -787,7 +787,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	req->out.args[0].size = sizeof(struct fuse_init_out);
 	req->out.args[0].value = &req->misc.init_out;
 	req->end = process_init_reply;
-	request_send_background(fc, req);
+	fuse_request_send_background(fc, req);
 }
 
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -841,7 +841,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_fs_info = fc;
 
 	err = -ENOMEM;
-	root = get_root_inode(sb, d.rootmode);
+	root = fuse_get_root_inode(sb, d.rootmode);
 	if (!root)
 		goto err;
 
-- 
cgit v1.2.3


From 0d179aa59285ceef529c125e181cbb79ff5245c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:55 +0100
Subject: fuse: separate out fuse_conn_init() from new_conn()

Separate out fuse_conn_init() from new_conn() and while at it
initialize fuse_conn->entry during conn initialization.

This will be used by CUSE.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/fuse_i.h |   5 +++
 fs/fuse/inode.c  | 119 +++++++++++++++++++++++++++++--------------------------
 2 files changed, 67 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c08e7e89092..eb488d48b83 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -649,6 +649,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
  */
 struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
 
+/**
+ * Initialize fuse_conn
+ */
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb);
+
 /**
  * Release reference to fuse_conn
  */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ee91639c6d3..6c9fa03aa36 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -462,70 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 	return 0;
 }
 
-static struct fuse_conn *new_conn(struct super_block *sb)
+int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb)
 {
-	struct fuse_conn *fc;
 	int err;
 
-	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
-	if (fc) {
-		spin_lock_init(&fc->lock);
-		mutex_init(&fc->inst_mutex);
-		atomic_set(&fc->count, 1);
-		init_waitqueue_head(&fc->waitq);
-		init_waitqueue_head(&fc->blocked_waitq);
-		init_waitqueue_head(&fc->reserved_req_waitq);
-		INIT_LIST_HEAD(&fc->pending);
-		INIT_LIST_HEAD(&fc->processing);
-		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->interrupts);
-		INIT_LIST_HEAD(&fc->bg_queue);
-		atomic_set(&fc->num_waiting, 0);
-		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-		fc->bdi.unplug_io_fn = default_unplug_io_fn;
-		/* fuse does it's own writeback accounting */
-		fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
-		fc->khctr = 0;
-		fc->polled_files = RB_ROOT;
-		fc->dev = sb->s_dev;
-		err = bdi_init(&fc->bdi);
-		if (err)
-			goto error_kfree;
-		if (sb->s_bdev) {
-			err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
-					   MAJOR(fc->dev), MINOR(fc->dev));
-		} else {
-			err = bdi_register_dev(&fc->bdi, fc->dev);
-		}
-		if (err)
-			goto error_bdi_destroy;
-		/*
-		 * For a single fuse filesystem use max 1% of dirty +
-		 * writeback threshold.
-		 *
-		 * This gives about 1M of write buffer for memory maps on a
-		 * machine with 1G and 10% dirty_ratio, which should be more
-		 * than enough.
-		 *
-		 * Privileged users can raise it by writing to
-		 *
-		 *    /sys/class/bdi/<bdi>/max_ratio
-		 */
-		bdi_set_max_ratio(&fc->bdi, 1);
-		fc->reqctr = 0;
-		fc->blocked = 1;
-		fc->attr_version = 1;
-		get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+	memset(fc, 0, sizeof(*fc));
+	spin_lock_init(&fc->lock);
+	mutex_init(&fc->inst_mutex);
+	atomic_set(&fc->count, 1);
+	init_waitqueue_head(&fc->waitq);
+	init_waitqueue_head(&fc->blocked_waitq);
+	init_waitqueue_head(&fc->reserved_req_waitq);
+	INIT_LIST_HEAD(&fc->pending);
+	INIT_LIST_HEAD(&fc->processing);
+	INIT_LIST_HEAD(&fc->io);
+	INIT_LIST_HEAD(&fc->interrupts);
+	INIT_LIST_HEAD(&fc->bg_queue);
+	INIT_LIST_HEAD(&fc->entry);
+	atomic_set(&fc->num_waiting, 0);
+	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+	fc->bdi.unplug_io_fn = default_unplug_io_fn;
+	/* fuse does it's own writeback accounting */
+	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
+	fc->khctr = 0;
+	fc->polled_files = RB_ROOT;
+	fc->dev = sb->s_dev;
+	err = bdi_init(&fc->bdi);
+	if (err)
+		goto error_mutex_destroy;
+	if (sb->s_bdev) {
+		err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+				   MAJOR(fc->dev), MINOR(fc->dev));
+	} else {
+		err = bdi_register_dev(&fc->bdi, fc->dev);
 	}
-	return fc;
+	if (err)
+		goto error_bdi_destroy;
+	/*
+	 * For a single fuse filesystem use max 1% of dirty +
+	 * writeback threshold.
+	 *
+	 * This gives about 1M of write buffer for memory maps on a
+	 * machine with 1G and 10% dirty_ratio, which should be more
+	 * than enough.
+	 *
+	 * Privileged users can raise it by writing to
+	 *
+	 *    /sys/class/bdi/<bdi>/max_ratio
+	 */
+	bdi_set_max_ratio(&fc->bdi, 1);
+	fc->reqctr = 0;
+	fc->blocked = 1;
+	fc->attr_version = 1;
+	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+
+	return 0;
 
-error_bdi_destroy:
+ error_bdi_destroy:
 	bdi_destroy(&fc->bdi);
-error_kfree:
+ error_mutex_destroy:
 	mutex_destroy(&fc->inst_mutex);
-	kfree(fc);
-	return NULL;
+	return err;
 }
+EXPORT_SYMBOL_GPL(fuse_conn_init);
 
 void fuse_conn_put(struct fuse_conn *fc)
 {
@@ -828,10 +827,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (file->f_op != &fuse_dev_operations)
 		return -EINVAL;
 
-	fc = new_conn(sb);
+	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
 	if (!fc)
 		return -ENOMEM;
 
+	err = fuse_conn_init(fc, sb);
+	if (err) {
+		kfree(fc);
+		return err;
+	}
+
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
-- 
cgit v1.2.3


From 43901aabd7a043e62e24e9459dc4949b4cd69f07 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 26 Nov 2008 12:03:56 +0100
Subject: fuse: add fuse_conn->release()

Add fuse_conn->release() so that fuse_conn can be embedded in other
structures.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/fuse_i.h | 3 +++
 fs/fuse/inode.c  | 8 +++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index eb488d48b83..5e64b815a5a 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -465,6 +465,9 @@ struct fuse_conn {
 
 	/** Version counter for attribute changes */
 	u64 attr_version;
+
+	/** Called on final put */
+	void (*release)(struct fuse_conn *);
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6c9fa03aa36..47c96fdca1a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -533,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
 		bdi_destroy(&fc->bdi);
-		kfree(fc);
+		fc->release(fc);
 	}
 }
 
@@ -789,6 +789,11 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_request_send_background(fc, req);
 }
 
+static void fuse_free_conn(struct fuse_conn *fc)
+{
+	kfree(fc);
+}
+
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct fuse_conn *fc;
@@ -837,6 +842,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		return err;
 	}
 
+	fc->release = fuse_free_conn;
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
-- 
cgit v1.2.3


From ce397c0616de9bfb678569f1dfa655838a711594 Mon Sep 17 00:00:00 2001
From: Liu Hui <onlyflyer@gmail.com>
Date: Mon, 1 Dec 2008 20:31:40 -0500
Subject: Btrfs: Fix cow semantic in run_delalloc_nocow()

The file preallocation code reversed the logic to force nodatacow.
This fixes it.
---
 fs/btrfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7a8ad4292f7..b3d4078b69a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1114,10 +1114,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 0, nr_written);
+                                        page_started, 1, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-					 page_started, 1, nr_written);
+                                        page_started, 0, nr_written);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
 				     page_started, nr_written);
-- 
cgit v1.2.3


From 641f5219f2ea0dfd1c024c91021ddc83f50bf1ce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:10 -0500
Subject: Btrfs: sparse lock verification annotations for wait_on_state

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 87dba851793..d79ccdbfdd9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -577,6 +577,8 @@ EXPORT_SYMBOL(clear_extent_bit);
 
 static int wait_on_state(struct extent_io_tree *tree,
 			 struct extent_state *state)
+		__releases(tree->lock)
+		__acquires(tree->lock)
 {
 	DEFINE_WAIT(wait);
 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-- 
cgit v1.2.3


From 6e3ad88729103c4d19703311253fab8a49669fa8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Dec 2008 06:36:10 -0500
Subject: Btrfs: remove unneeded total_trans

Remove unneeded debugging sanity check.  It gets corrupted anyway when
multiple btrfs file systems are mounted, throwing bad warnings along the
way.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/btrfs/transaction.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c2c3b428196..c38f6a0e30b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,7 +28,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
@@ -39,8 +38,6 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 	WARN_ON(transaction->use_count == 0);
 	transaction->use_count--;
 	if (transaction->use_count == 0) {
-		WARN_ON(total_trans == 0);
-		total_trans--;
 		list_del_init(&transaction->list);
 		memset(transaction, 0, sizeof(*transaction));
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
@@ -57,7 +54,6 @@ static noinline int join_transaction(struct btrfs_root *root)
 	if (!cur_trans) {
 		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
 					     GFP_NOFS);
-		total_trans++;
 		BUG_ON(!cur_trans);
 		root->fs_info->generation++;
 		root->fs_info->last_alloc = 0;
-- 
cgit v1.2.3


From 1ffa4f426c002161b7dbd58b297f5d0680e7dd6a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 2 Dec 2008 09:53:09 -0500
Subject: Btrfs: remove unneeded btrfs_start_delalloc_inodes call

It is called by btrfs_sync_fs.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/btrfs/ioctl.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f3d68457e66..35f650e183e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1142,7 +1142,6 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_TRANS_END:
 		return btrfs_ioctl_trans_end(file);
 	case BTRFS_IOC_SYNC:
-		btrfs_start_delalloc_inodes(root);
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
 	}
-- 
cgit v1.2.3


From b2950863c61bc24cf0f63bc05947d9d50663c4c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 09:54:17 -0500
Subject: Btrfs: make things static and include the right headers

Shut up various sparse warnings about symbols that should be either
static or have their declarations in scope.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.c            |  2 +-
 fs/btrfs/disk-io.c          | 16 ++++++++--------
 fs/btrfs/extent-tree.c      | 12 ++++++------
 fs/btrfs/extent_io.c        | 35 +++++++++++++++--------------------
 fs/btrfs/free-space-cache.c |  6 ++++--
 fs/btrfs/inode-item.c       |  2 +-
 fs/btrfs/inode.c            | 26 +++++++++++++-------------
 fs/btrfs/ioctl.c            | 14 +++++++-------
 fs/btrfs/root-tree.c        |  2 ++
 fs/btrfs/super.c            |  2 +-
 fs/btrfs/tree-log.c         |  5 +++--
 fs/btrfs/volumes.c          | 12 ++++++------
 fs/btrfs/zlib.c             |  1 +
 13 files changed, 68 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 71ef0a2e2da..a83cbdf1d8c 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -217,7 +217,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
  * is used to finish the allocation.
  */
-int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb04665e500..8a2bcc7024f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -93,9 +93,9 @@ struct async_submit_bio {
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
  */
-struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 len,
-				    int create)
+static struct extent_map *btree_get_extent(struct inode *inode,
+		struct page *page, size_t page_offset, u64 start, u64 len,
+		int create)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_map *em;
@@ -295,7 +295,7 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
  * checksum a dirty tree block before IO.  This has extra checks to make
  * sure we only fill in the checksum field in the first page of a multi-page block
  */
-int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -365,7 +365,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
-int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	struct extent_io_tree *tree;
@@ -660,7 +660,7 @@ static int btree_writepages(struct address_space *mapping,
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
 
-int btree_readpage(struct file *file, struct page *page)
+static int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1200,7 +1200,7 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	}
 }
 
-void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 	struct inode *inode;
 	struct extent_map_tree *em_tree;
@@ -1842,7 +1842,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-int write_all_supers(struct btrfs_root *root)
+static int write_all_supers(struct btrfs_root *root)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a970472eab1..d1563852938 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,7 +74,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  * this adds the block group to the fs_info rb tree for the block group
  * cache
  */
-int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 				struct btrfs_block_group_cache *block_group)
 {
 	struct rb_node **p;
@@ -289,7 +289,7 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
+static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 						       btrfs_fs_info *info,
 							 u64 bytenr)
 {
@@ -3445,7 +3445,7 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
 	int ret;
@@ -5434,7 +5434,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-int __alloc_chunk_for_shrink(struct btrfs_root *root,
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		     struct btrfs_block_group_cache *shrink_block_group,
 		     int force)
 {
@@ -5703,8 +5703,8 @@ out:
 	return ret;
 }
 
-int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
-			   struct btrfs_key *key)
+static int find_first_block_group(struct btrfs_root *root,
+		struct btrfs_path *path, struct btrfs_key *key)
 {
 	int ret = 0;
 	struct btrfs_key found_key;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d79ccdbfdd9..c3dfe2a0ec8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -112,7 +112,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
-struct extent_state *alloc_extent_state(gfp_t mask)
+static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
 #ifdef LEAK_DEBUG
@@ -136,7 +136,7 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 }
 EXPORT_SYMBOL(alloc_extent_state);
 
-void free_extent_state(struct extent_state *state)
+static void free_extent_state(struct extent_state *state)
 {
 	if (!state)
 		return;
@@ -662,7 +662,7 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 		   int exclusive, u64 *failed_start, gfp_t mask)
 {
 	struct extent_state *state;
@@ -879,12 +879,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_new);
 
-int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_new);
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask)
@@ -894,27 +893,24 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			  gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_uptodate);
 
-int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			 gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
 			      0, NULL, mask);
 }
-EXPORT_SYMBOL(set_extent_writeback);
 
-int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			   gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_writeback);
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
@@ -994,7 +990,7 @@ EXPORT_SYMBOL(set_range_dirty);
 /*
  * helper function to set both pages and extents in the tree writeback
  */
-int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1010,7 +1006,6 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	set_extent_writeback(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(set_range_writeback);
 
 /*
  * find the first offset in the io tree with 'bits' set. zero is
@@ -1432,11 +1427,13 @@ out:
 	spin_unlock_irq(&tree->lock);
 	return total_bytes;
 }
+
+#if 0
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
  */
-int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1473,12 +1470,11 @@ failed:
 	}
 	return err;
 }
-EXPORT_SYMBOL(lock_range);
 
 /*
  * helper function to unlock both pages and extents in the tree.
  */
-int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1493,7 +1489,7 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 	unlock_extent(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(unlock_range);
+#endif
 
 /*
  * set the private field for a given byte offset in the tree.  If there isn't
@@ -1956,7 +1952,7 @@ void set_page_extent_mapped(struct page *page)
 }
 EXPORT_SYMBOL(set_page_extent_mapped);
 
-void set_page_extent_head(struct page *page, unsigned long len)
+static void set_page_extent_head(struct page *page, unsigned long len)
 {
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
@@ -2397,7 +2393,7 @@ update_nr_written:
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-int extent_write_cache_pages(struct extent_io_tree *tree,
+static int extent_write_cache_pages(struct extent_io_tree *tree,
 			     struct address_space *mapping,
 			     struct writeback_control *wbc,
 			     writepage_t writepage, void *data,
@@ -2502,7 +2498,6 @@ retry:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(extent_write_cache_pages);
 
 static noinline void flush_write_bio(void *data)
 {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4926c0f3c8..09462adfbe3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -443,7 +443,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	mutex_unlock(&block_group->alloc_mutex);
 }
 
-struct btrfs_free_space *btrfs_find_free_space_offset(struct
+#if 0
+static struct btrfs_free_space *btrfs_find_free_space_offset(struct
 						      btrfs_block_group_cache
 						      *block_group, u64 offset,
 						      u64 bytes)
@@ -458,7 +459,7 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 	return ret;
 }
 
-struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 						     btrfs_block_group_cache
 						     *block_group, u64 offset,
 						     u64 bytes)
@@ -472,6 +473,7 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 
 	return ret;
 }
+#endif
 
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index d93451c66ba..3d46fa1f29a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -20,7 +20,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int find_name_in_backref(struct btrfs_path *path, const char * name,
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
 			 int name_len, struct btrfs_inode_ref **ref_ret)
 {
 	struct extent_buffer *leaf;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3d4078b69a..bd58ba655a4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1130,7 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
 	unsigned long flags;
@@ -1151,7 +1151,7 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
 	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
@@ -1215,7 +1215,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1234,7 +1234,7 @@ int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1245,7 +1245,7 @@ int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation on write,
  * or reading the csums from the tree before a read
  */
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1313,7 +1313,7 @@ struct btrfs_writepage_fixup {
 	struct btrfs_work work;
 };
 
-void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
@@ -1372,7 +1372,7 @@ out_page:
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
@@ -1526,7 +1526,7 @@ nocow:
 	return 0;
 }
 
-int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
@@ -1548,7 +1548,7 @@ struct io_failure_record {
 	int last_mirror;
 };
 
-int btrfs_io_failed_hook(struct bio *failed_bio,
+static int btrfs_io_failed_hook(struct bio *failed_bio,
 			 struct page *page, u64 start, u64 end,
 			 struct extent_state *state)
 {
@@ -1642,7 +1642,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int btrfs_clean_io_failures(struct inode *inode, u64 start)
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
 {
 	u64 private;
 	u64 private_failure;
@@ -1675,7 +1675,7 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
  * if there's a match, we allow the bio to finish.  If not, we go through
  * the io_failure_record routines to find good copies
  */
-int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
@@ -4362,8 +4362,8 @@ out:
  * Invalidate a single dcache entry at the root of the filesystem.
  * Needed after creation of snapshot or subvolume.
  */
-void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
-				  int namelen)
+static void btrfs_invalidate_dcache_root(struct inode *dir,
+		char *name, int namelen)
 {
 	struct dentry *alias, *entry;
 	struct qstr qstr;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 35f650e183e..cc7c5161e26 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -354,7 +354,7 @@ out_unlock:
 }
 
 
-int btrfs_defrag_file(struct file *file)
+static int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -649,7 +649,7 @@ static int btrfs_ioctl_defrag(struct file *file)
 	return 0;
 }
 
-long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -671,7 +671,7 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -696,8 +696,8 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
-		       u64 olen, u64 destoff)
+static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+		u64 off, u64 olen, u64 destoff)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1035,7 +1035,7 @@ out_fput:
 	return ret;
 }
 
-long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
+static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
 {
 	struct btrfs_ioctl_clone_range_args args;
 
@@ -1051,7 +1051,7 @@ long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
  * basically own the machine, and have a very in depth understanding
  * of all the possible deadlocks and enospc problems.
  */
-long btrfs_ioctl_trans_start(struct file *file)
+static long btrfs_ioctl_trans_start(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index dbe20d4c6ea..f99335a999d 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -276,6 +276,7 @@ out:
 	return ret;
 }
 
+#if 0 /* this will get used when snapshot deletion is implemented */
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
 		       u64 root_id, u8 type, u64 ref_id)
@@ -299,6 +300,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 	btrfs_free_path(path);
 	return ret;
 }
+#endif
 
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 		   struct btrfs_path *path,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1975ea273dc..93a21c77064 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -647,7 +647,7 @@ static int btrfs_interface_init(void)
 	return misc_register(&btrfs_misc);
 }
 
-void btrfs_interface_exit(void)
+static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
 		printk("misc_deregister failed for control device");
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index be4fc30a30e..4fcfc8b1189 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -23,6 +23,7 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "compat.h"
+#include "tree-log.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -78,7 +79,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
  * tree of log tree roots.  This must be called with a tree log transaction
  * running (see start_log_trans).
  */
-int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root)
 {
 	struct btrfs_key key;
@@ -1934,7 +1935,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int wait_log_commit(struct btrfs_root *log)
+static int wait_log_commit(struct btrfs_root *log)
 {
 	DEFINE_WAIT(wait);
 	u64 transid = log->fs_info->tree_log_transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 724ead54529..769f2c5d9e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -238,7 +238,7 @@ done:
 	return 0;
 }
 
-void pending_bios_fn(struct btrfs_work *work)
+static void pending_bios_fn(struct btrfs_work *work)
 {
 	struct btrfs_device *device;
 
@@ -686,7 +686,7 @@ error:
 	return ret;
 }
 
-int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start)
 {
@@ -1393,7 +1393,7 @@ error:
 	goto out;
 }
 
-int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
 				 struct btrfs_device *device)
 {
 	int ret;
@@ -1497,7 +1497,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -1543,7 +1543,7 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-int btrfs_relocate_chunk(struct btrfs_root *root,
+static int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
 {
@@ -1884,7 +1884,7 @@ done:
 	return ret;
 }
 
-int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5b9f7002513..c4617cde6c7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
+#include "compression.h"
 
 /* Plan: call deflate() with avail_in == *sourcelen,
 	avail_out = *dstlen - 12 and flush == Z_FINISH.
-- 
cgit v1.2.3


From 6e430f94e508fee1aefd1dfec88da3c24ce64433 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:09 -0500
Subject: Btrfs: fix shadowed variable declarations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 -
 fs/btrfs/file.c    | 8 ++++----
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8a2bcc7024f..2464c099db5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1734,7 +1734,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_cleaner;
 
 	if (btrfs_super_log_root(disk_super) != 0) {
-		u32 blocksize;
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1e8c024c69c..1c9243560ea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -662,18 +662,18 @@ next_slot:
 		}
 
 		if (found_extent && !keep) {
-			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);
+			u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
 
-			if (disk_bytenr != 0) {
+			if (old_disk_bytenr != 0) {
 				inode_sub_bytes(inode,
 						le64_to_cpu(old.num_bytes));
 				ret = btrfs_free_extent(trans, root,
-						disk_bytenr,
+						old_disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
 						leaf_start, root_owner,
 						root_gen, key.objectid, 0);
 				BUG_ON(ret);
-				*hint_byte = disk_bytenr;
+				*hint_byte = old_disk_bytenr;
 			}
 		}
 
-- 
cgit v1.2.3


From 97288f2c71e46965a2010baea10548fcb5f3af73 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:09 -0500
Subject: Btrfs: correct fmode_t annotations

Make sure to propagate fmode_t properly and use the right constants for
it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/super.c   | 14 +++++++++-----
 fs/btrfs/volumes.c | 18 +++++++++---------
 fs/btrfs/volumes.h |  4 ++--
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 93a21c77064..09908f25fca 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -242,7 +242,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
  * All other options will be parsed on much later in the mount process and
  * only when we need to allocate a new super block.
  */
-static int btrfs_parse_early_options(const char *options, int flags,
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		void *holder, char **subvol_name,
 		struct btrfs_fs_devices **fs_devices)
 {
@@ -418,18 +418,22 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
-	error = btrfs_parse_early_options(data, flags, fs_type,
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	error = btrfs_parse_early_options(data, mode, fs_type,
 					  &subvol_name, &fs_devices);
 	if (error)
 		goto error;
 
-	error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
+	error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
 	if (error)
 		goto error_free_subvol_name;
 
-	error = btrfs_open_devices(fs_devices, flags, fs_type);
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
 	if (error)
 		goto error_free_subvol_name;
 
@@ -591,7 +595,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
-		ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
+		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
 					    &btrfs_fs_type, &fs_devices);
 		break;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 769f2c5d9e9..6c523b3360f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -394,7 +394,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 }
 
 int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 int flags, void *holder)
+			 fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -469,7 +469,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 error:
 		continue;
 	}
@@ -488,7 +488,7 @@ out:
 }
 
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+		       fmode_t flags, void *holder)
 {
 	int ret;
 
@@ -507,7 +507,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }
 
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
@@ -1008,7 +1008,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, FMODE_READ,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1078,7 +1078,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_exclusive(bdev, MS_RDONLY);
+			close_bdev_exclusive(bdev, FMODE_READ);
 
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
@@ -1121,7 +1121,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1132,7 +1132,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -2913,7 +2913,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9b41e4d3984..fcbdcb3ae13 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -135,8 +135,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		  int mirror_num, int async_submit);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder);
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+		       fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
-- 
cgit v1.2.3


From 4bcabaa30a63a156fc50026f972377dada66452c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:08 -0500
Subject: Btrfs: clean up btrfs_ioctl a little bit

Provide a void __user *argp pointer so that we can avoid duplicating
the cast for various sub-command calls.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ioctl.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc7c5161e26..d2d5a5a9b02 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1117,20 +1117,21 @@ long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
 	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
 	case BTRFS_IOC_SNAP_CREATE:
-		return btrfs_ioctl_snap_create(file, (void __user *)arg, 0);
+		return btrfs_ioctl_snap_create(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
-		return btrfs_ioctl_snap_create(file, (void __user *)arg, 1);
+		return btrfs_ioctl_snap_create(file, argp, 1);
 	case BTRFS_IOC_DEFRAG:
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
-		return btrfs_ioctl_resize(root, (void __user *)arg);
+		return btrfs_ioctl_resize(root, argp);
 	case BTRFS_IOC_ADD_DEV:
-		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+		return btrfs_ioctl_add_dev(root, argp);
 	case BTRFS_IOC_RM_DEV:
-		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
+		return btrfs_ioctl_rm_dev(root, argp);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
-- 
cgit v1.2.3


From 7a865e8ac3a8ead776ea2c8c74fa2b2d00a2c9cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 09:52:24 -0500
Subject: Btrfs: pass void __user * to btrfs_ioctl_clone_range

Cleans the code up a little and also avoids a sparse warning due to the
incorrect cast in the current version of the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d2d5a5a9b02..caea9eed9d6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1035,11 +1035,11 @@ out_fput:
 	return ret;
 }
 
-static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
+static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
 {
 	struct btrfs_ioctl_clone_range_args args;
 
-	if (copy_from_user(&args, (void *)argptr, sizeof(args)))
+	if (copy_from_user(&args, argp, sizeof(args)))
 		return -EFAULT;
 	return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
 				 args.src_length, args.dest_offset);
@@ -1137,7 +1137,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_CLONE:
 		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
 	case BTRFS_IOC_CLONE_RANGE:
-		return btrfs_ioctl_clone_range(file, arg);
+		return btrfs_ioctl_clone_range(file, argp);
 	case BTRFS_IOC_TRANS_START:
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
-- 
cgit v1.2.3


From f2b636e80d8206dd4012de6e973c2367259a7d22 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 2 Dec 2008 06:36:08 -0500
Subject: Btrfs: add support for compat flags to btrfs

This adds the necessary disk format for handling compatibility flags
in the future to handle disk format changes.  We have a compat_flags,
compat_ro_flags and incompat_flags set for the super block.  Compat
flags will hold the features that are compatible with older
versions of btrfs, compat_ro flags have features that are compatible
with older versions of btrfs if the fs is mounted read only, and
incompat_flags has features that are incompatible with older versions
of btrfs.  This also axes the compat_flags field for the inode and
just makes the flags field a 64bit field, and changes the root item
flags field to 64bit.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.h   | 28 +++++++++++++++++++++-------
 fs/btrfs/disk-io.c | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 166896dd44c..b5af1fc77c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -305,6 +305,9 @@ struct btrfs_super_block {
 	__le32 stripesize;
 	__le32 sys_chunk_array_size;
 	__le64 chunk_root_generation;
+	__le64 compat_flags;
+	__le64 compat_ro_flags;
+	__le64 incompat_flags;
 	u8 root_level;
 	u8 chunk_root_level;
 	u8 log_root_level;
@@ -313,6 +316,14 @@ struct btrfs_super_block {
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
+/*
+ * Compat flags that we support.  If any incompat flags are set other than the
+ * ones specified below then we will fail to mount
+ */
+#define BTRFS_FEATURE_COMPAT_SUPP	0x0
+#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
+#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0
+
 /*
  * A leaf is full of items. offset and size tell us where to find
  * the item in the leaf (relative to the start of the data area)
@@ -433,8 +444,7 @@ struct btrfs_inode_item {
 	__le32 gid;
 	__le32 mode;
 	__le64 rdev;
-	__le16 flags;
-	__le16 compat_flags;
+	__le64 flags;
 
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
@@ -462,7 +472,7 @@ struct btrfs_root_item {
 	__le64 byte_limit;
 	__le64 bytes_used;
 	__le64 last_snapshot;
-	__le32 flags;
+	__le64 flags;
 	__le32 refs;
 	struct btrfs_disk_key drop_progress;
 	u8 drop_level;
@@ -1116,9 +1126,7 @@ BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
 BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
-BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
-		   compat_flags, 16);
+BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
 
 static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
@@ -1468,7 +1476,7 @@ BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8);
 BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64);
 BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32);
-BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 32);
+BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64);
 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
@@ -1510,6 +1518,12 @@ BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
 			 root_dir_objectid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 			 num_devices, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
+			 compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
+			 compat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
+			 incompat_flags, 64);
 
 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2464c099db5..6ae9bdf98b6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1434,6 +1434,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 blocksize;
 	u32 stripesize;
 	u64 generation;
+	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
@@ -1586,6 +1587,26 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
+	features = btrfs_super_incompat_flags(disk_super) &
+		~BTRFS_FEATURE_INCOMPAT_SUPP;
+	if (features) {
+		printk(KERN_ERR "BTRFS: couldn't mount because of "
+		       "unsupported optional features (%Lx).\n",
+		       features);
+		err = -EINVAL;
+		goto fail_sb_buffer;
+	}
+
+	features = btrfs_super_compat_ro_flags(disk_super) &
+		~BTRFS_FEATURE_COMPAT_RO_SUPP;
+	if (!(sb->s_flags & MS_RDONLY) && features) {
+		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
+		       "unsupported option features (%Lx).\n",
+		       features);
+		err = -EINVAL;
+		goto fail_sb_buffer;
+	}
+
 	/*
 	 * we need to start all the end_io workers up front because the
 	 * queue work function gets called at interrupt time, and so it
-- 
cgit v1.2.3


From c6e2bac1a52ffc36dd10769b594dfa3994e95f77 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 2 Dec 2008 06:36:10 -0500
Subject: Btrfs: fix panic on error during mount

This needs to be applied on top of my previous patches, but is needed for more
than just my new stuff.  We're going to the wrong label when we have an error:
we try to stop the workers, but they are started below all of this code.  This
fixes it so we go to the right error label and do not panic when we fail one of
these cases.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/disk-io.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ae9bdf98b6..dfd5ba05ce4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1579,12 +1579,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
-		goto fail_sb_buffer;
+		goto fail_iput;
 
 	ret = btrfs_parse_options(tree_root, options);
 	if (ret) {
 		err = ret;
-		goto fail_sb_buffer;
+		goto fail_iput;
 	}
 
 	features = btrfs_super_incompat_flags(disk_super) &
@@ -1594,7 +1594,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		       "unsupported optional features (%Lx).\n",
 		       features);
 		err = -EINVAL;
-		goto fail_sb_buffer;
+		goto fail_iput;
 	}
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
@@ -1604,7 +1604,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		       "unsupported option features (%Lx).\n",
 		       features);
 		err = -EINVAL;
-		goto fail_sb_buffer;
+		goto fail_iput;
 	}
 
 	/*
-- 
cgit v1.2.3


From 607d432da0542e84ddcd358adfddac6f68500e3d Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 2 Dec 2008 07:17:45 -0500
Subject: Btrfs: add support for multiple csum algorithms

This patch gives us the space we will need in order to have different csum
algorithms at some point in the future.  We save the csum algorithm type
in the superblock, and use that instead of defines.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.h     | 19 +++++++++++++++++-
 fs/btrfs/disk-io.c   | 25 ++++++++++++++++++-----
 fs/btrfs/file-item.c | 56 ++++++++++++++++++++++++++++++----------------------
 fs/btrfs/ioctl.c     |  9 +++++----
 fs/btrfs/tree-log.c  | 10 ++++++----
 5 files changed, 81 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b5af1fc77c5..6d8350332b1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -109,8 +109,14 @@ struct btrfs_ordered_sum;
 
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
+
+/* csum types */
+#define BTRFS_CSUM_TYPE_CRC32	0
+
+static int btrfs_csum_sizes[] = { 4, 0 };
+
 /* four bytes for CRC32 */
-#define BTRFS_CRC32_SIZE 4
+//#define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
@@ -308,6 +314,7 @@ struct btrfs_super_block {
 	__le64 compat_flags;
 	__le64 compat_ro_flags;
 	__le64 incompat_flags;
+	__le16 csum_type;
 	u8 root_level;
 	u8 chunk_root_level;
 	u8 log_root_level;
@@ -1483,6 +1490,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
 
 /* struct btrfs_super_block */
+
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -1524,6 +1532,15 @@ BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
 			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
 			 incompat_flags, 64);
+BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
+			 csum_type, 16);
+
+static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
+{
+	int t = btrfs_super_csum_type(s);
+	BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+	return btrfs_csum_sizes[t];
+}
 
 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dfd5ba05ce4..3eb7c2576fe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -176,7 +176,9 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	char result[BTRFS_CRC32_SIZE];
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
+	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
 	unsigned long offset = BTRFS_CSUM_SIZE;
@@ -186,6 +188,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long map_len;
 	int err;
 	u32 crc = ~(u32)0;
+	unsigned long inline_result;
 
 	len = buf->len - offset;
 	while(len > 0) {
@@ -204,25 +207,37 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		offset += cur_len;
 		unmap_extent_buffer(buf, map_token, KM_USER0);
 	}
+	if (csum_size > sizeof(inline_result)) {
+		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+		if (!result)
+			return 1;
+	} else {
+		result = (char *)&inline_result;
+	}
+
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
 		/* FIXME, this is not good */
-		if (memcmp_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE)) {
+		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
 			u32 found = 0;
-			memcpy(&found, result, BTRFS_CRC32_SIZE);
+			memcpy(&found, result, csum_size);
 
-			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
+			read_extent_buffer(buf, &val, 0, csum_size);
 			printk("btrfs: %s checksum verify failed on %llu "
 			       "wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
 			       buf->start, val, found, btrfs_header_level(buf));
+			if (result != (char *)&inline_result)
+				kfree(result);
 			return 1;
 		}
 	} else {
-		write_extent_buffer(buf, result, 0, BTRFS_CRC32_SIZE);
+		write_extent_buffer(buf, result, 0, csum_size);
 	}
+	if (result != (char *)&inline_result)
+		kfree(result);
 	return 0;
 }
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f7637883140..234ed441736 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,9 +24,9 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r) ((((BTRFS_LEAF_DATA_SIZE(r) - \
-			       sizeof(struct btrfs_item) * 2) / \
-			       BTRFS_CRC32_SIZE) - 1))
+#define MAX_CSUM_ITEMS(r,size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+				   sizeof(struct btrfs_item) * 2) / \
+				  size) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -83,6 +83,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item;
 	struct extent_buffer *leaf;
 	u64 csum_offset = 0;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int csums_in_item;
 
 	file_key.objectid = objectid;
@@ -105,7 +107,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		csum_offset = (offset - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
-		csums_in_item /= BTRFS_CRC32_SIZE;
+		csums_in_item /= csum_size;
 
 		if (csum_offset >= csums_in_item) {
 			ret = -EFBIG;
@@ -114,7 +116,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	}
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
-					  csum_offset * BTRFS_CRC32_SIZE);
+					  csum_offset * csum_size);
 	return item;
 fail:
 	if (ret > 0)
@@ -150,6 +152,8 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u32 diff;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item = NULL;
@@ -195,7 +199,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			item_size = btrfs_item_size_nr(path->nodes[0],
 						       path->slots[0]);
 			item_last_offset = item_start_offset +
-				(item_size / BTRFS_CRC32_SIZE) *
+				(item_size / csum_size) *
 				root->sectorsize;
 			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 					      struct btrfs_csum_item);
@@ -206,11 +210,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		 */
 		diff = offset - item_start_offset;
 		diff = diff / root->sectorsize;
-		diff = diff * BTRFS_CRC32_SIZE;
+		diff = diff * csum_size;
 
 		read_extent_buffer(path->nodes[0], &sum,
 				   ((unsigned long)item) + diff,
-				   BTRFS_CRC32_SIZE);
+				   csum_size);
 found:
 		set_state_private(io_tree, offset, sum);
 		bio_index++;
@@ -383,6 +387,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	char *eb_token;
 	unsigned long map_len;
 	unsigned long map_start;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
@@ -408,7 +414,8 @@ again:
 		/* we found one, but it isn't big enough yet */
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if ((item_size / BTRFS_CRC32_SIZE) >= MAX_CSUM_ITEMS(root)) {
+		if ((item_size / csum_size) >=
+		    MAX_CSUM_ITEMS(root, csum_size)) {
 			/* already at max size, make a new one */
 			goto insert;
 		}
@@ -441,7 +448,7 @@ again:
 	 */
 	btrfs_release_path(root, path);
 	ret = btrfs_search_slot(trans, root, &file_key, path,
-				BTRFS_CRC32_SIZE, 1);
+				csum_size, 1);
 	if (ret < 0)
 		goto fail_unlock;
 	if (ret == 0) {
@@ -457,14 +464,14 @@ again:
 			root->fs_info->sb->s_blocksize_bits;
 	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
 	    found_key.objectid != objectid ||
-	    csum_offset >= MAX_CSUM_ITEMS(root)) {
+	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
 	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
-	    BTRFS_CRC32_SIZE) {
-		u32 diff = (csum_offset + 1) * BTRFS_CRC32_SIZE;
+	    csum_size) {
+		u32 diff = (csum_offset + 1) * csum_size;
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != BTRFS_CRC32_SIZE)
+		if (diff != csum_size)
 			goto insert;
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
@@ -479,10 +486,10 @@ insert:
 		tmp -= offset & ~((u64)root->sectorsize -1);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
-		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root));
-		ins_size = BTRFS_CRC32_SIZE * tmp;
+		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
+		ins_size = csum_size * tmp;
 	} else {
-		ins_size = BTRFS_CRC32_SIZE;
+		ins_size = csum_size;
 	}
 	ret = btrfs_insert_empty_item(trans, root, path, &file_key,
 				      ins_size);
@@ -497,7 +504,7 @@ csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	ret = 0;
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
-					  csum_offset * BTRFS_CRC32_SIZE);
+					  csum_offset * csum_size);
 found:
 	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
@@ -508,14 +515,14 @@ found:
 next_sector:
 
 	if (!eb_token ||
-	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
+	   (unsigned long)item + csum_size >= map_start + map_len) {
 		int err;
 
 		if (eb_token)
 			unmap_extent_buffer(leaf, eb_token, KM_USER1);
 		eb_token = NULL;
 		err = map_private_extent_buffer(leaf, (unsigned long)item,
-						BTRFS_CRC32_SIZE,
+						csum_size,
 						&eb_token, &eb_map,
 						&map_start, &map_len, KM_USER1);
 		if (err)
@@ -523,17 +530,17 @@ next_sector:
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       &sector_sum->sum, BTRFS_CRC32_SIZE);
+		       &sector_sum->sum, csum_size);
 	} else {
 		write_extent_buffer(leaf, &sector_sum->sum,
-				    (unsigned long)item, BTRFS_CRC32_SIZE);
+				    (unsigned long)item, csum_size);
 	}
 
 	total_bytes += root->sectorsize;
 	sector_sum++;
 	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
-						  BTRFS_CRC32_SIZE);
+						  csum_size);
 		if (item < item_end && offset + PAGE_CACHE_SIZE ==
 		    sector_sum->offset) {
 			    offset = sector_sum->offset;
@@ -577,7 +584,8 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	new_item_span = isize - key.offset;
 	blocks = (new_item_span + root->sectorsize - 1) >>
 		root->fs_info->sb->s_blocksize_bits;
-	new_item_size = blocks * BTRFS_CRC32_SIZE;
+	new_item_size = blocks *
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	if (new_item_size >= btrfs_item_size_nr(leaf, slot))
 		return 0;
 	ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index caea9eed9d6..b4da53d55c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,7 +714,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
-
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	/*
 	 * TODO:
 	 * - split compressed inline extents.  annoying: we need to
@@ -964,7 +965,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			int coff, clen;
 
 			size = btrfs_item_size_nr(leaf, slot);
-			coverslen = (size / BTRFS_CRC32_SIZE) <<
+			coverslen = (size / csum_size) <<
 				root->fs_info->sb->s_blocksize_bits;
 			printk("csums for %llu~%llu\n",
 			       key.offset, coverslen);
@@ -981,12 +982,12 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			if (off > key.offset)
 				coff = ((off - key.offset) >>
 					root->fs_info->sb->s_blocksize_bits) *
-					BTRFS_CRC32_SIZE;
+					csum_size;
 			clen = size - coff;
 			if (key.offset + coverslen > off+len)
 				clen -= ((key.offset+coverslen-off-len) >>
 					 root->fs_info->sb->s_blocksize_bits) *
-					BTRFS_CRC32_SIZE;
+					csum_size;
 			printk(" will dup %d~%d of %d\n",
 			       coff, clen, size);
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4fcfc8b1189..c766649ad45 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -929,13 +929,15 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	int ret;
 	u32 item_size = btrfs_item_size_nr(eb, slot);
 	u64 cur_offset;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
 	unsigned long file_bytes;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
 	struct inode *inode;
 	unsigned long ptr;
 
-	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
+	file_bytes = (item_size / csum_size) * root->sectorsize;
 	inode = read_one_inode(root, key->objectid);
 	if (!inode) {
 		return -EIO;
@@ -959,10 +961,10 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	while(item_size > 0) {
 		sector_sum->offset = cur_offset;
-		read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
+		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
-		item_size -= BTRFS_CRC32_SIZE;
-		ptr += BTRFS_CRC32_SIZE;
+		item_size -= csum_size;
+		ptr += csum_size;
 		cur_offset += root->sectorsize;
 	}
 
-- 
cgit v1.2.3


From c9f0523d88fd208ce094995a0ac63f7c04e56bab Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 2 Dec 2008 14:49:42 +0100
Subject: fuse: fix sparse warning in ioctl

Fix sparse warning:

  CHECK   fs/fuse/file.c
fs/fuse/file.c:1615:17: warning: incorrect type in assignment (different address spaces)
fs/fuse/file.c:1615:17:    expected void [noderef] <asn:1>*iov_base
fs/fuse/file.c:1615:17:    got void *<noident>

This was introduced by "fuse: implement ioctl support".

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 80b5fa80f5e..4d2f1339a88 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1612,7 +1612,7 @@ static long fuse_file_do_ioctl(struct file *file, unsigned int cmd,
 	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
 		struct iovec *iov = page_address(iov_page);
 
-		iov->iov_base = (void *)arg;
+		iov->iov_base = (void __user *)arg;
 		iov->iov_len = _IOC_SIZE(cmd);
 
 		if (_IOC_DIR(cmd) & _IOC_WRITE) {
-- 
cgit v1.2.3


From 5d9ec854bfb6f1e122b1d96b344164a71eac5be8 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Tue, 2 Dec 2008 14:49:42 +0100
Subject: fuse: clean up annotations of fc->lock

Makes the existing annotations match the more common one per line style
and adds a few missing annotations.

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c  | 17 ++++++++++++-----
 fs/fuse/file.c |  4 ++++
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 3c44ce359a0..69ebf2ecb08 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc)
  * Called with fc->lock, unlocks it
  */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
-	__releases(fc->lock)
+__releases(&fc->lock)
 {
 	void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 	req->end = NULL;
@@ -298,7 +298,8 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 
 static void wait_answer_interruptible(struct fuse_conn *fc,
 				      struct fuse_req *req)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	if (signal_pending(current))
 		return;
@@ -316,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 }
 
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	if (!fc->no_interrupt) {
 		/* Any signal may interrupt this */
@@ -668,6 +670,8 @@ static int request_pending(struct fuse_conn *fc)
 
 /* Wait until a request is available on the pending list */
 static void request_wait(struct fuse_conn *fc)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -695,7 +699,7 @@ static void request_wait(struct fuse_conn *fc)
  */
 static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
 			       const struct iovec *iov, unsigned long nr_segs)
-	__releases(fc->lock)
+__releases(&fc->lock)
 {
 	struct fuse_copy_state cs;
 	struct fuse_in_header ih;
@@ -1012,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
  * This function releases and reacquires fc->lock
  */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	while (!list_empty(head)) {
 		struct fuse_req *req;
@@ -1034,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
  * locked).
  */
 static void end_io_requests(struct fuse_conn *fc)
-	__releases(fc->lock) __acquires(fc->lock)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	while (!list_empty(&fc->io)) {
 		struct fuse_req *req =
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4d2f1339a88..1a057f02e7d 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1068,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
 /* Called under fc->lock, may release and reacquire it */
 static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
@@ -1105,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
  * Called with fc->lock
  */
 void fuse_flush_writepages(struct inode *inode)
+__releases(&fc->lock)
+__acquires(&fc->lock)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
-- 
cgit v1.2.3


From 4022abf4498186de2e893ce614333216f2add461 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 2 Dec 2008 09:57:03 -0500
Subject: Btrfs: delete unused function: btrfs_invalidate_dcache_root

Snapshot and subvolume creation no longer need this helper.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bd58ba655a4..09efc9473a3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4358,31 +4358,6 @@ out:
 	btrfs_btree_balance_dirty(root, nr);
 }
 
-/*
- * Invalidate a single dcache entry at the root of the filesystem.
- * Needed after creation of snapshot or subvolume.
- */
-static void btrfs_invalidate_dcache_root(struct inode *dir,
-		char *name, int namelen)
-{
-	struct dentry *alias, *entry;
-	struct qstr qstr;
-
-	alias = d_find_alias(dir);
-	if (alias) {
-		qstr.name = name;
-		qstr.len = namelen;
-		/* change me if btrfs ever gets a d_hash operation */
-		qstr.hash = full_name_hash(qstr.name, qstr.len);
-		entry = d_lookup(alias, &qstr);
-		dput(alias);
-		if (entry) {
-			d_invalidate(entry);
-			dput(entry);
-		}
-	}
-}
-
 /*
  * create a new subvolume directory/inode (helper for the ioctl).
  */
-- 
cgit v1.2.3


From 2a7108ad89e1ea9a30afbbece8b581a0532afd12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 2 Dec 2008 09:58:02 -0500
Subject: Btrfs: rev the disk format for the inode compat and csum selection
 changes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6d8350332b1..96f2ec7ad5b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -39,7 +39,7 @@ extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BFRfS_M"
+#define BTRFS_MAGIC "_BHRfS_M"
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-- 
cgit v1.2.3


From c99e905c945c462085c6d64646dc5af0c0a16815 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 2 Dec 2008 11:18:37 -0500
Subject: Btrfs: Fix sparse endian warnings in struct-funcs.c

The btrfs macros to access individual struct members on disk were
sending the same variable to functions that expected different types
of endianness.  This fix explicitly creates a variable of the correct
type instead of abusing a single variable for mixed purposes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/struct-funcs.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index cdedbe144d4..8d7f568009c 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -36,9 +36,14 @@
  * The extent buffer api is used to do all the kmapping and page
  * spanning work required to get extent buffers in highmem and have
  * a metadata blocksize different from the page size.
+ *
+ * The macro starts with a simple function prototype declaration so that
+ * sparse won't complain about it being static.
  */
 
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
+u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
+void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);	\
 u##bits btrfs_##name(struct extent_buffer *eb,				\
 				   type *s)				\
 {									\
@@ -59,14 +64,15 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 		int unmap_on_exit = (eb->map_token == NULL);		\
 		unsigned long map_start;				\
 		unsigned long map_len;					\
-		__le##bits res;						\
+		u##bits res;						\
 		err = map_extent_buffer(eb, offset,			\
 			        sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
-			read_eb_member(eb, s, type, member, &res);	\
-			return le##bits##_to_cpu(res);			\
+			__le##bits leres;				\
+			read_eb_member(eb, s, type, member, &leres);	\
+			return le##bits##_to_cpu(leres);		\
 		}							\
 		p = (type *)(kaddr + part_offset - map_start);		\
 		res = le##bits##_to_cpu(p->member);			\
@@ -101,8 +107,9 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
-			val = cpu_to_le##bits(val);			\
-			write_eb_member(eb, s, type, member, &val);	\
+			__le##bits val2;				\
+			val2 = cpu_to_le##bits(val);			\
+			write_eb_member(eb, s, type, member, &val2);	\
 			return;						\
 		}							\
 		p = (type *)(kaddr + part_offset - map_start);		\
-- 
cgit v1.2.3


From 062e4fee4400f283307cf8ac1b7931c939010229 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 26 Oct 2008 16:58:25 +0200
Subject: UBIFS: slight compression optimization

If data does not compress, it is better to leave it uncompressed
because we'll read it faster then. So do not compress data if we
save less than 64 bytes.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/compress.c    | 6 +++---
 fs/ubifs/ubifs-media.h | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17..6414d50780e 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
 	}
 
 	/*
-	 * Presently, we just require that compression results in less data,
-	 * rather than any defined minimum compression ratio or amount.
+	 * If the data compressed only slightly, it is better to leave it
+	 * uncompressed to improve read speed.
 	 */
-	if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
+	if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
 		goto no_compr;
 
 	return;
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a..b25fc36cf72 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
  */
 #define UBIFS_MIN_COMPR_LEN 128
 
+/*
+ * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
+ * node uncompressed, because it'll be read faster.
+ */
+#define UBIFS_MIN_COMPRESS_DIFF 64
+
 /* Root inode number */
 #define UBIFS_ROOT_INO 1
 
-- 
cgit v1.2.3


From a1dc080c27ec8ea7ca1c8a9b499362a71ebff792 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 1 Nov 2008 14:20:50 +0200
Subject: UBIFS: use bit-fields to store compression type

Save 4 bytes of RAM per 'struct inode' by storing the inode
compression type in a bit-field instead of using 'int'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 8 ++++++++
 fs/ubifs/ubifs.h | 6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b..21b4103271e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2020,6 +2020,14 @@ static int __init ubifs_init(void)
 	BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
 	BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
 
+	/*
+	 * We use 2 bit wide bit-fields to store compression type, which should
+	 * be amended if more compressors are added. The bit-fields are:
+	 * @compr_type in 'struct ubifs_inode' and @default_compr in
+	 * 'struct ubifs_info'.
+	 */
+	BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
+
 	/*
 	 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
 	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a0..4d76aba57ee 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -386,12 +386,12 @@ struct ubifs_inode {
 	unsigned int dirty:1;
 	unsigned int xattr:1;
 	unsigned int bulk_read:1;
+	unsigned int compr_type:2;
 	struct mutex ui_mutex;
 	spinlock_t ui_lock;
 	loff_t synced_i_size;
 	loff_t ui_size;
 	int flags;
-	int compr_type;
 	pgoff_t last_page_read;
 	pgoff_t read_in_a_row;
 	int data_len;
@@ -946,6 +946,7 @@ struct ubifs_mount_opts {
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
  * @bulk_read: enable bulk-reads
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
  *
  * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
  *             @calc_idx_sz
@@ -986,7 +987,6 @@ struct ubifs_mount_opts {
  * @main_lebs: count of LEBs in the main area
  * @main_first: first LEB of the main area
  * @main_bytes: main area size in bytes
- * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
  *
  * @key_hash_type: type of the key hash
  * @key_hash: direntry key hash function
@@ -1196,6 +1196,7 @@ struct ubifs_info {
 	unsigned int big_lpt:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
+	unsigned int default_compr:2;
 
 	struct mutex tnc_mutex;
 	struct ubifs_zbranch zroot;
@@ -1237,7 +1238,6 @@ struct ubifs_info {
 	int main_lebs;
 	int main_first;
 	long long main_bytes;
-	int default_compr;
 
 	uint8_t key_hash_type;
 	uint32_t (*key_hash)(const char *str, int len);
-- 
cgit v1.2.3


From 553dea4dd531562688ba01c641c7f8fc7abaaf8c Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 1 Nov 2008 14:57:49 +0200
Subject: UBIFS: introduce compression mount options

It is very handy to be able to change default UBIFS compressor
via mount options. Introduce -o compr=<name> mount option support.
Currently only "none", "lzo" and "zlib" compressors are supported.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/compress.c |  6 +++---
 fs/ubifs/sb.c       | 10 ++++++----
 fs/ubifs/super.c    | 44 +++++++++++++++++++++++++++++++++++++-------
 fs/ubifs/ubifs.h    | 12 ++++++++++--
 4 files changed, 56 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 6414d50780e..4afb3ea24d4 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
 /* Fake description object for the "none" compressor */
 static struct ubifs_compressor none_compr = {
 	.compr_type = UBIFS_COMPR_NONE,
-	.name = "no compression",
+	.name = "none",
 	.capi_name = "",
 };
 
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
 static struct ubifs_compressor lzo_compr = {
 	.compr_type = UBIFS_COMPR_LZO,
 	.comp_mutex = &lzo_mutex,
-	.name = "LZO",
+	.name = "lzo",
 	.capi_name = "lzo",
 };
 #else
 static struct ubifs_compressor lzo_compr = {
 	.compr_type = UBIFS_COMPR_LZO,
-	.name = "LZO",
+	.name = "lzo",
 };
 #endif
 
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5..c5da201ab54 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -179,8 +179,11 @@ static int create_default_filesystem(struct ubifs_info *c)
 	sup->fanout        = cpu_to_le32(DEFAULT_FANOUT);
 	sup->lsave_cnt     = cpu_to_le32(c->lsave_cnt);
 	sup->fmt_version   = cpu_to_le32(UBIFS_FORMAT_VERSION);
-	sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
 	sup->time_gran     = cpu_to_le32(DEFAULT_TIME_GRAN);
+	if (c->mount_opts.override_compr)
+		sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
+	else
+		sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
 
 	generate_random_uuid(sup->uuid);
 
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
 	c->jhead_cnt     = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
 	c->fanout        = le32_to_cpu(sup->fanout);
 	c->lsave_cnt     = le32_to_cpu(sup->lsave_cnt);
-	c->default_compr = le16_to_cpu(sup->default_compr);
 	c->rp_size       = le64_to_cpu(sup->rp_size);
 	c->rp_uid        = le32_to_cpu(sup->rp_uid);
 	c->rp_gid        = le32_to_cpu(sup->rp_gid);
 	sup_flags        = le32_to_cpu(sup->flags);
+	if (!c->mount_opts.override_compr)
+		c->default_compr = le16_to_cpu(sup->default_compr);
 
 	c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
-
 	memcpy(&c->uuid, &sup->uuid, 16);
-
 	c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
 
 	/* Automatically increase file system size to the maximum size */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 21b4103271e..fc81022cc26 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -417,6 +417,11 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 	else if (c->mount_opts.chk_data_crc == 1)
 		seq_printf(s, ",no_chk_data_crc");
 
+	if (c->mount_opts.override_compr) {
+		seq_printf(s, ",compr=");
+		seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type));
+	}
+
 	return 0;
 }
 
@@ -878,6 +883,7 @@ static int check_volume_empty(struct ubifs_info *c)
  * Opt_no_bulk_read: disable bulk-reads
  * Opt_chk_data_crc: check CRCs when reading data nodes
  * Opt_no_chk_data_crc: do not check CRCs when reading data nodes
+ * Opt_override_compr: override default compressor
  * Opt_err: just end of array marker
  */
 enum {
@@ -887,6 +893,7 @@ enum {
 	Opt_no_bulk_read,
 	Opt_chk_data_crc,
 	Opt_no_chk_data_crc,
+	Opt_override_compr,
 	Opt_err,
 };
 
@@ -897,6 +904,7 @@ static const match_table_t tokens = {
 	{Opt_no_bulk_read, "no_bulk_read"},
 	{Opt_chk_data_crc, "chk_data_crc"},
 	{Opt_no_chk_data_crc, "no_chk_data_crc"},
+	{Opt_override_compr, "compr=%s"},
 	{Opt_err, NULL},
 };
 
@@ -950,6 +958,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 			c->mount_opts.chk_data_crc = 1;
 			c->no_chk_data_crc = 1;
 			break;
+		case Opt_override_compr:
+		{
+			char *name = match_strdup(&args[0]);
+
+			if (!name)
+				return -ENOMEM;
+			if (!strcmp(name, "none"))
+				c->mount_opts.compr_type = UBIFS_COMPR_NONE;
+			else if (!strcmp(name, "lzo"))
+				c->mount_opts.compr_type = UBIFS_COMPR_LZO;
+			else if (!strcmp(name, "zlib"))
+				c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
+			else {
+				ubifs_err("unknown compressor \"%s\"", name);
+				kfree(name);
+				return -EINVAL;
+			}
+			kfree(name);
+			c->mount_opts.override_compr = 1;
+			c->default_compr = c->mount_opts.compr_type;
+			break;
+		}
 		default:
 			ubifs_err("unrecognized mount option \"%s\" "
 				  "or missing value", p);
@@ -1100,13 +1130,13 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_free;
 
 	/*
-	 * Make sure the compressor which is set as the default on in the
-	 * superblock was actually compiled in.
+	 * Make sure the compressor which is set as default in the superblock
+	 * or overridden by mount options is actually compiled in.
 	 */
 	if (!ubifs_compr_present(c->default_compr)) {
-		ubifs_warn("'%s' compressor is set by superblock, but not "
-			   "compiled in", ubifs_compr_name(c->default_compr));
-		c->default_compr = UBIFS_COMPR_NONE;
+		ubifs_err("'compressor \"%s\" is not compiled in",
+			  ubifs_compr_name(c->default_compr));
+		goto out_free;
 	}
 
 	dbg_failure_mode_registration(c);
@@ -2023,8 +2053,8 @@ static int __init ubifs_init(void)
 	/*
 	 * We use 2 bit wide bit-fields to store compression type, which should
 	 * be amended if more compressors are added. The bit-fields are:
-	 * @compr_type in 'struct ubifs_inode' and @default_compr in
-	 * 'struct ubifs_info'.
+	 * @compr_type in 'struct ubifs_inode', @default_compr in
+	 * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
 	 */
 	BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4d76aba57ee..16840e099ef 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -893,13 +893,21 @@ struct ubifs_orphan {
 /**
  * struct ubifs_mount_opts - UBIFS-specific mount options information.
  * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
- * @bulk_read: enable bulk-reads
- * @chk_data_crc: check CRCs when reading data nodes
+ * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
+ * @chk_data_crc: enable/disable CRC data checking when reading data nodes
+ *                (%0 default, %1 disable, %2 enable)
+ * @override_compr: override default compressor (%0 - do not override and use
+ *                  superblock compressor, %1 - override and use compressor
+ *                  specified in @compr_type)
+ * @compr_type: compressor type to override the superblock compressor with
+ *              (%UBIFS_COMPR_NONE, etc)
  */
 struct ubifs_mount_opts {
 	unsigned int unmount_mode:2;
 	unsigned int bulk_read:2;
 	unsigned int chk_data_crc:2;
+	unsigned int override_compr:1;
+	unsigned int compr_type:2;
 };
 
 /**
-- 
cgit v1.2.3


From 5dd7cbc083f3a91fa7454125fe992826701b67bc Mon Sep 17 00:00:00 2001
From: Kukkonen Mika <mika.kukkonen@nokia.com>
Date: Tue, 2 Dec 2008 11:32:49 +0200
Subject: UBIFS: avoid unnecessary checks

I have a habit of compiling kernel with
EXTRA_CFLAGS="-Wextra -Wno-unused -Wno-sign-compare -Wno-missing-field-initializers"
and so fs/ubifs/key.h gives lots (~10) of these every time:

CC      fs/ubifs/tnc_misc.o
In file included from fs/ubifs/ubifs.h:1725,
from fs/ubifs/tnc_misc.c:30:
fs/ubifs/key.h: In function 'key_r5_hash':
fs/ubifs/key.h:64: warning: comparison of unsigned expression >= 0 is always true
fs/ubifs/key.h: In function 'key_test_hash':
fs/ubifs/key.h:81: warning: comparison of unsigned expression >= 0 is always true

This patch fixes the warnings.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/key.h | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c..efb3430a258 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -37,6 +37,22 @@
 #ifndef __UBIFS_KEY_H__
 #define __UBIFS_KEY_H__
 
+/**
+ * key_mask_hash - mask a valid hash value.
+ * @hash: hash value to be masked
+ *
+ * We use hash values as offset in directories, so values %0 and %1 are
+ * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
+ * function makes sure the reserved values are not used.
+ */
+static inline uint32_t key_mask_hash(uint32_t hash)
+{
+	hash &= UBIFS_S_KEY_HASH_MASK;
+	if (unlikely(hash <= 2))
+		hash += 3;
+	return hash;
+}
+
 /**
  * key_r5_hash - R5 hash function (borrowed from reiserfs).
  * @s: direntry name
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
 		str++;
 	}
 
-	a &= UBIFS_S_KEY_HASH_MASK;
-
-	/*
-	 * We use hash values as offset in directories, so values %0 and %1 are
-	 * reserved for "." and "..". %2 is reserved for "end of readdir"
-	 * marker.
-	 */
-	if (unlikely(a >= 0 && a <= 2))
-		a += 3;
-	return a;
+	return key_mask_hash(a);
 }
 
 /**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
 
 	len = min_t(uint32_t, len, 4);
 	memcpy(&a, str, len);
-	a &= UBIFS_S_KEY_HASH_MASK;
-	if (unlikely(a >= 0 && a <= 2))
-		a += 3;
-	return a;
+	return key_mask_hash(a);
 }
 
 /**
-- 
cgit v1.2.3


From 17c2f9f85c896b48a5d74a9155d99ec5b241a0e6 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 17 Oct 2008 13:31:39 +0300
Subject: UBIFS: separate debugging fields out

Introduce a new data structure which contains all debugging
stuff inside. This is cleaner than having debugging stuff
directly in 'c'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/commit.c     | 25 +++++++++---------
 fs/ubifs/debug.c      | 71 +++++++++++++++++++++++++++++++++++++++------------
 fs/ubifs/debug.h      | 51 ++++++++++++++++++++++++++++++------
 fs/ubifs/lprops.c     |  2 +-
 fs/ubifs/lpt_commit.c | 55 +++++++++++++++++++--------------------
 fs/ubifs/orphan.c     |  2 +-
 fs/ubifs/super.c      | 22 +++++-----------
 fs/ubifs/tnc_commit.c |  7 ++---
 fs/ubifs/ubifs.h      | 34 +++---------------------
 9 files changed, 156 insertions(+), 113 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10..f3a7945527f 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 {
 	struct ubifs_idx_node *idx;
 	int lnum, offs, len, err = 0;
+	struct ubifs_debug_info *d = c->dbg;
 
-	c->old_zroot = *zroot;
-
-	lnum = c->old_zroot.lnum;
-	offs = c->old_zroot.offs;
-	len = c->old_zroot.len;
+	d->old_zroot = *zroot;
+	lnum = d->old_zroot.lnum;
+	offs = d->old_zroot.offs;
+	len = d->old_zroot.len;
 
 	idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
 	if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 	if (err)
 		goto out;
 
-	c->old_zroot_level = le16_to_cpu(idx->level);
-	c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+	d->old_zroot_level = le16_to_cpu(idx->level);
+	d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
 out:
 	kfree(idx);
 	return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 {
 	int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
 	int first = 1, iip;
+	struct ubifs_debug_info *d = c->dbg;
 	union ubifs_key lower_key, upper_key, l_key, u_key;
 	unsigned long long uninitialized_var(last_sqnum);
 	struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 	     UBIFS_IDX_NODE_SZ;
 
 	/* Start at the old zroot */
-	lnum = c->old_zroot.lnum;
-	offs = c->old_zroot.offs;
-	len = c->old_zroot.len;
+	lnum = d->old_zroot.lnum;
+	offs = d->old_zroot.offs;
+	len = d->old_zroot.len;
 	iip = 0;
 
 	/*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 		if (first) {
 			first = 0;
 			/* Check root level and sqnum */
-			if (le16_to_cpu(idx->level) != c->old_zroot_level) {
+			if (le16_to_cpu(idx->level) != d->old_zroot_level) {
 				err = 2;
 				goto out_dump;
 			}
-			if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
+			if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
 				err = 3;
 				goto out_dump;
 			}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda..0332a856a08 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -705,7 +705,7 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 
 	printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
 
-	sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
 	if (IS_ERR(sleb)) {
 		ubifs_err("scan error %d", (int)PTR_ERR(sleb));
 		return;
@@ -2097,7 +2097,7 @@ static int simple_rand(void)
 	return (next >> 16) & 32767;
 }
 
-void dbg_failure_mode_registration(struct ubifs_info *c)
+static void failure_mode_init(struct ubifs_info *c)
 {
 	struct failure_mode_info *fmi;
 
@@ -2112,7 +2112,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
 	spin_unlock(&fmi_lock);
 }
 
-void dbg_failure_mode_deregistration(struct ubifs_info *c)
+static void failure_mode_exit(struct ubifs_info *c)
 {
 	struct failure_mode_info *fmi, *tmp;
 
@@ -2146,42 +2146,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
 	struct ubifs_info *c = dbg_find_info(desc);
 
 	if (c && dbg_failure_mode)
-		return c->failure_mode;
+		return c->dbg->failure_mode;
 	return 0;
 }
 
 static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
 {
 	struct ubifs_info *c = dbg_find_info(desc);
+	struct ubifs_debug_info *d;
 
 	if (!c || !dbg_failure_mode)
 		return 0;
-	if (c->failure_mode)
+	d = c->dbg;
+	if (d->failure_mode)
 		return 1;
-	if (!c->fail_cnt) {
+	if (!d->fail_cnt) {
 		/* First call - decide delay to failure */
 		if (chance(1, 2)) {
 			unsigned int delay = 1 << (simple_rand() >> 11);
 
 			if (chance(1, 2)) {
-				c->fail_delay = 1;
-				c->fail_timeout = jiffies +
+				d->fail_delay = 1;
+				d->fail_timeout = jiffies +
 						  msecs_to_jiffies(delay);
 				dbg_rcvry("failing after %ums", delay);
 			} else {
-				c->fail_delay = 2;
-				c->fail_cnt_max = delay;
+				d->fail_delay = 2;
+				d->fail_cnt_max = delay;
 				dbg_rcvry("failing after %u calls", delay);
 			}
 		}
-		c->fail_cnt += 1;
+		d->fail_cnt += 1;
 	}
 	/* Determine if failure delay has expired */
-	if (c->fail_delay == 1) {
-		if (time_before(jiffies, c->fail_timeout))
+	if (d->fail_delay == 1) {
+		if (time_before(jiffies, d->fail_timeout))
 			return 0;
-	} else if (c->fail_delay == 2)
-		if (c->fail_cnt++ < c->fail_cnt_max)
+	} else if (d->fail_delay == 2)
+		if (d->fail_cnt++ < d->fail_cnt_max)
 			return 0;
 	if (lnum == UBIFS_SB_LNUM) {
 		if (write) {
@@ -2239,7 +2241,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
 		dbg_rcvry("failing in bud LEB %d commit not running", lnum);
 	}
 	ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
-	c->failure_mode = 1;
+	d->failure_mode = 1;
 	dump_stack();
 	return 1;
 }
@@ -2344,4 +2346,41 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
 	return 0;
 }
 
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function initializes debugging-related data for the file system.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+	c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+	if (!c->dbg)
+		return -ENOMEM;
+
+	c->dbg->buf = vmalloc(c->leb_size);
+	if (!c->dbg->buf)
+		goto out;
+
+	failure_mode_init(c);
+	return 0;
+
+out:
+	kfree(c->dbg);
+	return -ENOMEM;
+}
+
+/**
+ * ubifs_debugging_exit - free debugging data.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+	failure_mode_exit(c);
+	vfree(c->dbg->buf);
+	kfree(c->dbg);
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e..d6ea1362d56 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,43 @@
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
-#define UBIFS_DBG(op) op
+/**
+ * struct ubifs_debug_info - per-FS debugging information.
+ * @buf: a buffer of LEB size, used for various purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ * @chk_lpt_sz: used by LPT tree size checker
+ * @chk_lpt_sz2: used by LPT tree size checker
+ * @chk_lpt_wastage: used by LPT tree size checker
+ * @chk_lpt_lebs: used by LPT tree size checker
+ * @new_nhead_offs: used by LPT tree size checker
+ * @new_ihead_lnum: used by debugging to check ihead_lnum
+ * @new_ihead_offs: used by debugging to check ihead_offs
+ */
+struct ubifs_debug_info {
+	void *buf;
+	struct ubifs_zbranch old_zroot;
+	int old_zroot_level;
+	unsigned long long old_zroot_sqnum;
+	int failure_mode;
+	int fail_delay;
+	unsigned long fail_timeout;
+	unsigned int fail_cnt;
+	unsigned int fail_cnt_max;
+	long long chk_lpt_sz;
+	long long chk_lpt_sz2;
+	long long chk_lpt_wastage;
+	int chk_lpt_lebs;
+	int new_nhead_offs;
+	int new_ihead_lnum;
+	int new_ihead_offs;
+};
 
 #define ubifs_assert(expr) do {                                                \
 	if (unlikely(!(expr))) {                                               \
@@ -211,6 +247,9 @@ extern unsigned int ubifs_msg_flags;
 extern unsigned int ubifs_chk_flags;
 extern unsigned int ubifs_tst_flags;
 
+int ubifs_debugging_init(struct ubifs_info *c);
+void ubifs_debugging_exit(struct ubifs_info *c);
+
 /* Dump functions */
 
 const char *dbg_ntype(int type);
@@ -274,9 +313,6 @@ int dbg_force_in_the_gaps(void);
 
 #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
 
-void dbg_failure_mode_registration(struct ubifs_info *c);
-void dbg_failure_mode_deregistration(struct ubifs_info *c);
-
 #ifndef UBIFS_DBG_PRESERVE_UBI
 
 #define ubi_leb_read   dbg_leb_read
@@ -320,8 +356,6 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 
 #else /* !CONFIG_UBIFS_FS_DEBUG */
 
-#define UBIFS_DBG(op)
-
 /* Use "if (0)" to make compiler check arguments even if debugging is off */
 #define ubifs_assert(expr)  do {                                               \
 	if (0 && (expr))                                                       \
@@ -360,6 +394,9 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
 
+#define ubifs_debugging_init(c)               0
+#define ubifs_debugging_exit(c)               ({})
+
 #define dbg_ntype(type)                       ""
 #define dbg_cstate(cmt_state)                 ""
 #define dbg_get_key_dump(c, key)              ({})
@@ -396,8 +433,6 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define dbg_force_in_the_gaps_enabled              0
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
-#define dbg_failure_mode_registration(c)           ({})
-#define dbg_failure_mode_deregistration(c)         ({})
 
 #endif /* !CONFIG_UBIFS_FS_DEBUG */
 
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70..10ba663eb32 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c,
 		}
 	}
 
-	sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
 	if (IS_ERR(sleb)) {
 		/*
 		 * After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b4278..1aefab9f0b5 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1602,7 +1602,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 {
 	int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
 	int ret;
-	void *buf = c->dbg_buf;
+	void *buf = c->dbg->buf;
 
 	dbg_lp("LEB %d", lnum);
 	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1731,15 +1731,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
  */
 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 {
+	struct ubifs_debug_info *d = c->dbg;
 	long long chk_lpt_sz, lpt_sz;
 	int err = 0;
 
 	switch (action) {
 	case 0:
-		c->chk_lpt_sz = 0;
-		c->chk_lpt_sz2 = 0;
-		c->chk_lpt_lebs = 0;
-		c->chk_lpt_wastage = 0;
+		d->chk_lpt_sz = 0;
+		d->chk_lpt_sz2 = 0;
+		d->chk_lpt_lebs = 0;
+		d->chk_lpt_wastage = 0;
 		if (c->dirty_pn_cnt > c->pnode_cnt) {
 			dbg_err("dirty pnodes %d exceed max %d",
 				c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1753,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 		}
 		return err;
 	case 1:
-		c->chk_lpt_sz += len;
+		d->chk_lpt_sz += len;
 		return 0;
 	case 2:
-		c->chk_lpt_sz += len;
-		c->chk_lpt_wastage += len;
-		c->chk_lpt_lebs += 1;
+		d->chk_lpt_sz += len;
+		d->chk_lpt_wastage += len;
+		d->chk_lpt_lebs += 1;
 		return 0;
 	case 3:
 		chk_lpt_sz = c->leb_size;
-		chk_lpt_sz *= c->chk_lpt_lebs;
+		chk_lpt_sz *= d->chk_lpt_lebs;
 		chk_lpt_sz += len - c->nhead_offs;
-		if (c->chk_lpt_sz != chk_lpt_sz) {
+		if (d->chk_lpt_sz != chk_lpt_sz) {
 			dbg_err("LPT wrote %lld but space used was %lld",
-				c->chk_lpt_sz, chk_lpt_sz);
+				d->chk_lpt_sz, chk_lpt_sz);
 			err = -EINVAL;
 		}
-		if (c->chk_lpt_sz > c->lpt_sz) {
+		if (d->chk_lpt_sz > c->lpt_sz) {
 			dbg_err("LPT wrote %lld but lpt_sz is %lld",
-				c->chk_lpt_sz, c->lpt_sz);
+				d->chk_lpt_sz, c->lpt_sz);
 			err = -EINVAL;
 		}
-		if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+		if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
 			dbg_err("LPT layout size %lld but wrote %lld",
-				c->chk_lpt_sz, c->chk_lpt_sz2);
+				d->chk_lpt_sz, d->chk_lpt_sz2);
 			err = -EINVAL;
 		}
-		if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+		if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
 			dbg_err("LPT new nhead offs: expected %d was %d",
-				c->new_nhead_offs, len);
+				d->new_nhead_offs, len);
 			err = -EINVAL;
 		}
 		lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,22 +1789,22 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 		lpt_sz += c->ltab_sz;
 		if (c->big_lpt)
 			lpt_sz += c->lsave_sz;
-		if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+		if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
 			dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
-				c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+				d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
 			err = -EINVAL;
 		}
 		if (err)
 			dbg_dump_lpt_info(c);
-		c->chk_lpt_sz2 = c->chk_lpt_sz;
-		c->chk_lpt_sz = 0;
-		c->chk_lpt_wastage = 0;
-		c->chk_lpt_lebs = 0;
-		c->new_nhead_offs = len;
+		d->chk_lpt_sz2 = d->chk_lpt_sz;
+		d->chk_lpt_sz = 0;
+		d->chk_lpt_wastage = 0;
+		d->chk_lpt_lebs = 0;
+		d->new_nhead_offs = len;
 		return err;
 	case 4:
-		c->chk_lpt_sz += len;
-		c->chk_lpt_wastage += len;
+		d->chk_lpt_sz += len;
+		d->chk_lpt_wastage += len;
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d452..9e6f403f170 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
 	for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
 		struct ubifs_scan_leb *sleb;
 
-		sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+		sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
 		if (IS_ERR(sleb)) {
 			err = PTR_ERR(sleb);
 			break;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index fc81022cc26..ad44822059c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1069,11 +1069,9 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		return err;
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-	c->dbg_buf = vmalloc(c->leb_size);
-	if (!c->dbg_buf)
-		return -ENOMEM;
-#endif
+	err = ubifs_debugging_init(c);
+	if (err)
+		return err;
 
 	err = check_volume_empty(c);
 	if (err)
@@ -1139,18 +1137,16 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_free;
 	}
 
-	dbg_failure_mode_registration(c);
-
 	err = init_constants_late(c);
 	if (err)
-		goto out_dereg;
+		goto out_free;
 
 	sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
 	sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
 	c->cbuf = kmalloc(sz, GFP_NOFS);
 	if (!c->cbuf) {
 		err = -ENOMEM;
-		goto out_dereg;
+		goto out_free;
 	}
 
 	sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1350,14 +1346,12 @@ out_wbufs:
 	free_wbufs(c);
 out_cbuf:
 	kfree(c->cbuf);
-out_dereg:
-	dbg_failure_mode_deregistration(c);
 out_free:
 	kfree(c->bu.buf);
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
-	UBIFS_DBG(vfree(c->dbg_buf));
+	ubifs_debugging_exit(c);
 	return err;
 }
 
@@ -1394,8 +1388,7 @@ static void ubifs_umount(struct ubifs_info *c)
 	vfree(c->ileb_buf);
 	vfree(c->sbuf);
 	kfree(c->bottom_up_buf);
-	UBIFS_DBG(vfree(c->dbg_buf));
-	dbg_failure_mode_deregistration(c);
+	ubifs_debugging_exit(c);
 }
 
 /**
@@ -1879,7 +1872,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_iput;
 
 	mutex_unlock(&c->umount_mutex);
-
 	return 0;
 
 out_iput:
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d5..3c0af452887 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
 	}
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
-	c->new_ihead_lnum = lnum;
-	c->new_ihead_offs = buf_offs;
+	c->dbg->new_ihead_lnum = lnum;
+	c->dbg->new_ihead_offs = buf_offs;
 #endif
 
 	return 0;
@@ -1002,7 +1002,8 @@ static int write_index(struct ubifs_info *c)
 	}
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
-	if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
+	if (lnum != c->dbg->new_ihead_lnum ||
+	    buf_offs != c->dbg->new_ihead_offs) {
 		ubifs_err("inconsistent ihead");
 		return -EINVAL;
 	}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 16840e099ef..7e090a5e2bf 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -910,6 +910,8 @@ struct ubifs_mount_opts {
 	unsigned int compr_type:2;
 };
 
+struct ubifs_debug_info;
+
 /**
  * struct ubifs_info - UBIFS file-system description data structure
  * (per-superblock).
@@ -972,8 +974,6 @@ struct ubifs_mount_opts {
  * @ileb_nxt: next pre-allocated index LEBs
  * @old_idx: tree of index nodes obsoleted since the last commit start
  * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
  *
  * @mst_node: master node
  * @mst_offs: offset of valid master node
@@ -1157,15 +1157,7 @@ struct ubifs_mount_opts {
  * @always_chk_crc: always check CRCs (while mounting and remounting rw)
  * @mount_opts: UBIFS-specific mount options
  *
- * @dbg_buf: a buffer of LEB size used for debugging purposes
- * @old_zroot: old index root - used by 'dbg_check_old_index()'
- * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
- * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
- * @failure_mode: failure mode for recovery testing
- * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
- * @fail_timeout: time in jiffies when delay of failure mode expires
- * @fail_cnt: current number of calls to failure mode I/O functions
- * @fail_cnt_max: number of calls by which to delay failure mode
+ * @dbg: debugging-related information
  */
 struct ubifs_info {
 	struct super_block *vfs_sb;
@@ -1221,10 +1213,6 @@ struct ubifs_info {
 	int ileb_nxt;
 	struct rb_root old_idx;
 	int *bottom_up_buf;
-#ifdef CONFIG_UBIFS_FS_DEBUG
-	int new_ihead_lnum;
-	int new_ihead_offs;
-#endif
 
 	struct ubifs_mst_node *mst_node;
 	int mst_offs;
@@ -1399,21 +1387,7 @@ struct ubifs_info {
 	struct ubifs_mount_opts mount_opts;
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
-	void *dbg_buf;
-	struct ubifs_zbranch old_zroot;
-	int old_zroot_level;
-	unsigned long long old_zroot_sqnum;
-	int failure_mode;
-	int fail_delay;
-	unsigned long fail_timeout;
-	unsigned int fail_cnt;
-	unsigned int fail_cnt_max;
-	long long chk_lpt_sz;
-	long long chk_lpt_sz2;
-	long long chk_lpt_wastage;
-	int chk_lpt_lebs;
-	int new_nhead_lnum;
-	int new_nhead_offs;
+	struct ubifs_debug_info *dbg;
 #endif
 };
 
-- 
cgit v1.2.3


From 552ff3179d1e93a3e982357544c059f3e9a5516e Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 23 Oct 2008 11:49:28 +0300
Subject: UBIFS: add debugfs support

We need to have a possibility to see various UBIFS variables
and ask UBIFS to dump various information. Debugfs is what
we need.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++------
 fs/ubifs/debug.h |  27 ++++++++-
 fs/ubifs/super.c |  12 ++++
 fs/ubifs/ubifs.h |   1 +
 4 files changed, 193 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0332a856a08..56842772c80 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,7 @@
 #include "ubifs.h"
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/debugfs.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
@@ -988,22 +989,20 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 	err = 1;
 	key_read(c, &dent1->key, &key);
 	if (keys_cmp(c, &zbr1->key, &key)) {
-		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
-		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr1->key));
-			dbg_dump_node(c, dent1);
-			goto out_free;
+		ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, DBGKEY(&key));
+		ubifs_err("but it should have key %s according to tnc",
+			  DBGKEY(&zbr1->key)); dbg_dump_node(c, dent1);
+		goto out_free;
 	}
 
 	key_read(c, &dent2->key, &key);
 	if (keys_cmp(c, &zbr2->key, &key)) {
-		dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
-		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr2->key));
-			dbg_dump_node(c, dent2);
-			goto out_free;
+		ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, DBGKEY(&key));
+		ubifs_err("but it should have key %s according to tnc",
+			  DBGKEY(&zbr2->key)); dbg_dump_node(c, dent2);
+		goto out_free;
 	}
 
 	nlen1 = le16_to_cpu(dent1->nlen);
@@ -1015,14 +1014,14 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 		goto out_free;
 	}
 	if (cmp == 0 && nlen1 == nlen2)
-		dbg_err("2 xent/dent nodes with the same name");
+		ubifs_err("2 xent/dent nodes with the same name");
 	else
-		dbg_err("bad order of colliding key %s",
+		ubifs_err("bad order of colliding key %s",
 			DBGKEY(&key));
 
-	dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
 	dbg_dump_node(c, dent1);
-	dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+	ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
 	dbg_dump_node(c, dent2);
 
 out_free:
@@ -2103,7 +2102,7 @@ static void failure_mode_init(struct ubifs_info *c)
 
 	fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
 	if (!fmi) {
-		dbg_err("Failed to register failure mode - no memory");
+		ubifs_err("Failed to register failure mode - no memory");
 		return;
 	}
 	fmi->c = c;
@@ -2383,4 +2382,144 @@ void ubifs_debugging_exit(struct ubifs_info *c)
 	kfree(c->dbg);
 }
 
+/*
+ * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
+ * contain the stuff specific to particular file-system mounts.
+ */
+static struct dentry *debugfs_rootdir;
+
+/**
+ * dbg_debugfs_init - initialize debugfs file-system.
+ *
+ * UBIFS uses debugfs file-system to expose various debugging knobs to
+ * user-space. This function creates "ubifs" directory in the debugfs
+ * file-system. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int dbg_debugfs_init(void)
+{
+	debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
+	if (IS_ERR(debugfs_rootdir)) {
+		int err = PTR_ERR(debugfs_rootdir);
+		ubifs_err("cannot create \"ubifs\" debugfs directory, "
+			  "error %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ */
+void dbg_debugfs_exit(void)
+{
+	debugfs_remove(debugfs_rootdir);
+}
+
+static int open_debugfs_file(struct inode *inode, struct file *file)
+{
+	file->private_data = inode->i_private;
+	return 0;
+}
+
+static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct ubifs_info *c = file->private_data;
+	struct ubifs_debug_info *d = c->dbg;
+
+	if (file->f_path.dentry == d->dump_lprops)
+		dbg_dump_lprops(c);
+	else if (file->f_path.dentry == d->dump_budg) {
+		spin_lock(&c->space_lock);
+		dbg_dump_budg(c);
+		spin_unlock(&c->space_lock);
+	} else if (file->f_path.dentry == d->dump_tnc) {
+		mutex_lock(&c->tnc_mutex);
+		dbg_dump_tnc(c);
+		mutex_unlock(&c->tnc_mutex);
+	} else
+		return -EINVAL;
+	/* The written data itself is ignored; report all of it as consumed */
+	*ppos += count;
+	return count;
+}
+
+static const struct file_operations debugfs_fops = {
+	.open = open_debugfs_file,
+	.write = write_debugfs_file,
+	.owner = THIS_MODULE,
+};
+
+/**
+ * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates all debugfs files for this instance of UBIFS. Returns
+ * zero in case of success and a negative error code in case of failure.
+ *
+ * Note, the only reason we have not merged this function with the
+ * 'ubifs_debugging_init()' function is because it is better to initialize
+ * debugfs interfaces at the very end of the mount process, and remove them at
+ * the very beginning of the un-mount process.
+ */
+int dbg_debugfs_init_fs(struct ubifs_info *c)
+{
+	int err;
+	const char *fname;
+	struct dentry *dent;
+	struct ubifs_debug_info *d = c->dbg;
+
+	sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
+					      debugfs_rootdir);
+	if (IS_ERR(d->debugfs_dir)) {
+		err = PTR_ERR(d->debugfs_dir);
+		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+			  d->debugfs_dir_name, err);
+		goto out;
+	}
+
+	fname = "dump_lprops";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dump_lprops = dent;
+
+	fname = "dump_budg";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dump_budg = dent;
+
+	fname = "dump_tnc";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dump_tnc = dent;
+
+	return 0;
+
+out_remove:
+	err = PTR_ERR(dent);
+	ubifs_err("cannot create \"%s\" debugfs file, error %d\n",
+		  fname, err);
+	debugfs_remove_recursive(d->debugfs_dir);
+out:
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit_fs - remove all debugfs files.
+ * @c: UBIFS file-system description object
+ */
+void dbg_debugfs_exit_fs(struct ubifs_info *c)
+{
+	debugfs_remove_recursive(c->dbg->debugfs_dir);
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index d6ea1362d56..a6b70f8aac9 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -43,6 +43,13 @@
  * @new_nhead_offs: used by LPT tree size checker
  * @new_ihead_lnum: used by debugging to check ihead_lnum
  * @new_ihead_offs: used by debugging to check ihead_offs
+ *
+ * @debugfs_dir_name: name of debugfs directory containing this file-system's
+ *                    files
+ * @debugfs_dir: dentry object of the file-system debugfs directory
+ * @dump_lprops: "dump lprops" debugfs knob
+ * @dump_budg: "dump budgeting information" debugfs knob
+ * @dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
 	void *buf;
@@ -61,6 +68,12 @@ struct ubifs_debug_info {
 	int new_nhead_offs;
 	int new_ihead_lnum;
 	int new_ihead_offs;
+
+	char debugfs_dir_name[100];
+	struct dentry *debugfs_dir;
+	struct dentry *dump_lprops;
+	struct dentry *dump_budg;
+	struct dentry *dump_tnc;
 };
 
 #define ubifs_assert(expr) do {                                                \
@@ -251,7 +264,6 @@ int ubifs_debugging_init(struct ubifs_info *c);
 void ubifs_debugging_exit(struct ubifs_info *c);
 
 /* Dump functions */
-
 const char *dbg_ntype(int type);
 const char *dbg_cstate(int cmt_state);
 const char *dbg_get_key_dump(const struct ubifs_info *c,
@@ -274,7 +286,6 @@ void dbg_dump_tnc(struct ubifs_info *c);
 void dbg_dump_index(struct ubifs_info *c);
 
 /* Checking helper functions */
-
 typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
 				 struct ubifs_zbranch *zbr, void *priv);
 typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -354,6 +365,12 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 	return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
 }
 
+/* Debugfs-related stuff */
+int dbg_debugfs_init(void);
+void dbg_debugfs_exit(void);
+int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_exit_fs(struct ubifs_info *c);
+
 #else /* !CONFIG_UBIFS_FS_DEBUG */
 
 /* Use "if (0)" to make compiler check arguments even if debugging is off */
@@ -434,6 +451,10 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
 #define dbg_force_in_the_gaps()                    0
 #define dbg_failure_mode                           0
 
-#endif /* !CONFIG_UBIFS_FS_DEBUG */
+#define dbg_debugfs_init()                         0
+#define dbg_debugfs_exit()
+#define dbg_debugfs_init_fs(c)                     0
+#define dbg_debugfs_exit_fs(c)                     0
 
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
 #endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ad44822059c..2dbaa4fc2cb 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1258,6 +1258,10 @@ static int mount_ubifs(struct ubifs_info *c)
 		}
 	}
 
+	err = dbg_debugfs_init_fs(c);
+	if (err)
+		goto out_infos;
+
 	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
@@ -1369,6 +1373,7 @@ static void ubifs_umount(struct ubifs_info *c)
 	dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
 		c->vi.vol_id);
 
+	dbg_debugfs_exit_fs(c);
 	spin_lock(&ubifs_infos_lock);
 	list_del(&c->infos_list);
 	spin_unlock(&ubifs_infos_lock);
@@ -2078,12 +2083,18 @@ static int __init ubifs_init(void)
 	register_shrinker(&ubifs_shrinker_info);
 
 	err = ubifs_compressors_init();
+	if (err)
+		goto out_shrinker;
+
+	err = dbg_debugfs_init();
 	if (err)
 		goto out_compr;
 
 	return 0;
 
 out_compr:
+	ubifs_compressors_exit();
+out_shrinker:
 	unregister_shrinker(&ubifs_shrinker_info);
 	kmem_cache_destroy(ubifs_inode_slab);
 out_reg:
@@ -2098,6 +2109,7 @@ static void __exit ubifs_exit(void)
 	ubifs_assert(list_empty(&ubifs_infos));
 	ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
 
+	dbg_debugfs_exit();
 	ubifs_compressors_exit();
 	unregister_shrinker(&ubifs_shrinker_info);
 	kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 7e090a5e2bf..4cf28e85de7 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1158,6 +1158,7 @@ struct ubifs_debug_info;
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg: debugging-related information
+ * @dfs: debugfs support-related information
  */
 struct ubifs_info {
 	struct super_block *vfs_sb;
-- 
cgit v1.2.3


From 45e12d901fee57bccf90f6940155724954e1aac7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 31 Oct 2008 11:42:18 +0200
Subject: UBIFS: run debugging checks only if they are enabled

Do not forget to check whether lpt debugging is enabled before
running the check functions. This commit also makes some spelling
fixes.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt.c        | 3 +--
 fs/ubifs/lpt_commit.c | 9 +++++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b..93c181c742f 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,7 +36,7 @@
  * can be written into a single eraseblock. In that case, garbage collection
  * consists of just writing the whole table, which therefore makes all other
  * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
- * selected for garbage collection, which consists are marking the nodes in
+ * selected for garbage collection, which consists of marking the clean nodes in
  * that LEB as dirty, and then only the dirty nodes are written out. Also, in
  * the case of the big model, a table of LEB numbers is saved so that the entire
  * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
@@ -156,7 +156,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
 	}
 
 	c->check_lpt_free = c->big_lpt;
-
 	return 0;
 }
 
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 1aefab9f0b5..7bbf03518c7 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1604,6 +1604,9 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 	int ret;
 	void *buf = c->dbg->buf;
 
+	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+		return 0;
+
 	dbg_lp("LEB %d", lnum);
 	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
 	if (err) {
@@ -1704,6 +1707,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 	long long free = 0;
 	int i;
 
+	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+		return 0;
+
 	for (i = 0; i < c->lpt_lebs; i++) {
 		if (c->ltab[i].tgc || c->ltab[i].cmt)
 			continue;
@@ -1735,6 +1741,9 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 	long long chk_lpt_sz, lpt_sz;
 	int err = 0;
 
+	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+		return 0;
+
 	switch (action) {
 	case 0:
 		d->chk_lpt_sz = 0;
-- 
cgit v1.2.3


From 787845bdeadd368eedeace92d5bf53f5aa1450ba Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 31 Oct 2008 12:17:42 +0200
Subject: UBIFS: dump stack in LPT check functions

It is useful to know how we got to the checking function when
hunting the bugs.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 7bbf03518c7..c5c07f9cd22 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,7 @@ no_space:
 	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
 		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
 	dbg_dump_lpt_info(c);
+	dump_stack();
 	return err;
 }
 
@@ -548,6 +549,7 @@ no_space:
 	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
 	        "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
 	dbg_dump_lpt_info(c);
+	dump_stack();
 	return err;
 }
 
@@ -1722,6 +1724,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 		dbg_err("LPT space error: free %lld lpt_sz %lld",
 			free, c->lpt_sz);
 		dbg_dump_lpt_info(c);
+		dump_stack();
 		return -EINVAL;
 	}
 	return 0;
@@ -1803,8 +1806,10 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 				d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
 			err = -EINVAL;
 		}
-		if (err)
+		if (err) {
 			dbg_dump_lpt_info(c);
+			dump_stack();
+		}
 		d->chk_lpt_sz2 = d->chk_lpt_sz;
 		d->chk_lpt_sz = 0;
 		d->chk_lpt_wastage = 0;
-- 
cgit v1.2.3


From 2ba5f7ae8165b3f575dd3a7d8bb18f421fab8273 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 31 Oct 2008 17:32:30 +0200
Subject: UBIFS: introduce LPT dump function

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c      |  28 +++++++----
 fs/ubifs/debug.h      |  45 +++++++++--------
 fs/ubifs/lpt.c        |  27 ++++++-----
 fs/ubifs/lpt_commit.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ubifs/ubifs.h      |   3 ++
 5 files changed, 186 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 56842772c80..934db1855f0 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -646,7 +646,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
 	struct ubifs_lprops lp;
 	struct ubifs_lp_stats lst;
 
-	printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
+	printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
+	       current->pid);
 	ubifs_get_lp_stats(c, &lst);
 	dbg_dump_lstats(&lst);
 
@@ -657,6 +658,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
 
 		dbg_dump_lprop(c, &lp);
 	}
+	printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
+	       current->pid);
 }
 
 void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -664,6 +667,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
 	int i;
 
 	spin_lock(&dbg_lock);
+	printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
 	printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz);
 	printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz);
 	printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz);
@@ -704,8 +708,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 	if (dbg_failure_mode)
 		return;
 
-	printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
-
+	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+	       current->pid, lnum);
 	sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
 	if (IS_ERR(sleb)) {
 		ubifs_err("scan error %d", (int)PTR_ERR(sleb));
@@ -722,6 +726,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
 		dbg_dump_node(c, snod->node);
 	}
 
+	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+	       current->pid, lnum);
 	ubifs_scan_destroy(sleb);
 	return;
 }
@@ -769,7 +775,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 {
 	int i;
 
-	printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
+	printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
 	       current->pid, cat, heap->cnt);
 	for (i = 0; i < heap->cnt; i++) {
 		struct ubifs_lprops *lprops = heap->arr[i];
@@ -778,6 +784,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 		       "flags %d\n", i, lprops->lnum, lprops->hpos,
 		       lprops->free, lprops->dirty, lprops->flags);
 	}
+	printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
 }
 
 void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -785,7 +792,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 {
 	int i;
 
-	printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
+	printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
 	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
 	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
 	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -804,7 +811,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
 	int level;
 
 	printk(KERN_DEBUG "\n");
-	printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
+	printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
 	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
 	level = znode->level;
 	printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -816,8 +823,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
 		dbg_dump_znode(c, znode);
 		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
 	}
-
-	printk(KERN_DEBUG "\n");
+	printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
 }
 
 static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -992,7 +998,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 		ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
 			  zbr1->offs, DBGKEY(&key));
 		ubifs_err("but it should have key %s according to tnc",
-			  DBGKEY(&zbr1->key)); dbg_dump_node(c, dent1);
+			  DBGKEY(&zbr1->key));
+		dbg_dump_node(c, dent1);
 		goto out_free;
 	}
 
@@ -1001,7 +1008,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 		ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
 			  zbr1->offs, DBGKEY(&key));
 		ubifs_err("but it should have key %s according to tnc",
-			  DBGKEY(&zbr2->key)); dbg_dump_node(c, dent2);
+			  DBGKEY(&zbr2->key));
+		dbg_dump_node(c, dent2);
 		goto out_free;
 	}
 
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index a6b70f8aac9..9820d6999f7 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -270,6 +270,8 @@ const char *dbg_get_key_dump(const struct ubifs_info *c,
 			     const union ubifs_key *key);
 void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
 void dbg_dump_node(const struct ubifs_info *c, const void *node);
+void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
+		       int offs);
 void dbg_dump_budget_req(const struct ubifs_budget_req *req);
 void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
 void dbg_dump_budg(struct ubifs_info *c);
@@ -284,6 +286,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
 		    struct ubifs_nnode *parent, int iip);
 void dbg_dump_tnc(struct ubifs_info *c);
 void dbg_dump_index(struct ubifs_info *c);
+void dbg_dump_lpt_lebs(const struct ubifs_info *c);
 
 /* Checking helper functions */
 typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
@@ -411,26 +414,28 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 #define DBGKEY(key)  ((char *)(key))
 #define DBGKEY1(key) ((char *)(key))
 
-#define ubifs_debugging_init(c)               0
-#define ubifs_debugging_exit(c)               ({})
-
-#define dbg_ntype(type)                       ""
-#define dbg_cstate(cmt_state)                 ""
-#define dbg_get_key_dump(c, key)              ({})
-#define dbg_dump_inode(c, inode)              ({})
-#define dbg_dump_node(c, node)                ({})
-#define dbg_dump_budget_req(req)              ({})
-#define dbg_dump_lstats(lst)                  ({})
-#define dbg_dump_budg(c)                      ({})
-#define dbg_dump_lprop(c, lp)                 ({})
-#define dbg_dump_lprops(c)                    ({})
-#define dbg_dump_lpt_info(c)                  ({})
-#define dbg_dump_leb(c, lnum)                 ({})
-#define dbg_dump_znode(c, znode)              ({})
-#define dbg_dump_heap(c, heap, cat)           ({})
-#define dbg_dump_pnode(c, pnode, parent, iip) ({})
-#define dbg_dump_tnc(c)                       ({})
-#define dbg_dump_index(c)                     ({})
+#define ubifs_debugging_init(c)                0
+#define ubifs_debugging_exit(c)                ({})
+
+#define dbg_ntype(type)                        ""
+#define dbg_cstate(cmt_state)                  ""
+#define dbg_get_key_dump(c, key)               ({})
+#define dbg_dump_inode(c, inode)               ({})
+#define dbg_dump_node(c, node)                 ({})
+#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
+#define dbg_dump_budget_req(req)               ({})
+#define dbg_dump_lstats(lst)                   ({})
+#define dbg_dump_budg(c)                       ({})
+#define dbg_dump_lprop(c, lp)                  ({})
+#define dbg_dump_lprops(c)                     ({})
+#define dbg_dump_lpt_info(c)                   ({})
+#define dbg_dump_leb(c, lnum)                  ({})
+#define dbg_dump_znode(c, znode)               ({})
+#define dbg_dump_heap(c, heap, cat)            ({})
+#define dbg_dump_pnode(c, pnode, parent, iip)  ({})
+#define dbg_dump_tnc(c)                        ({})
+#define dbg_dump_index(c)                      ({})
+#define dbg_dump_lpt_lebs(c)                   ({})
 
 #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 93c181c742f..6d914160ec5 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -557,7 +557,7 @@ static int calc_nnode_num(int row, int col)
  * This function calculates and returns the nnode number based on the parent's
  * nnode number and the index in parent.
  */
-static int calc_nnode_num_from_parent(struct ubifs_info *c,
+static int calc_nnode_num_from_parent(const struct ubifs_info *c,
 				      struct ubifs_nnode *parent, int iip)
 {
 	int num, shft;
@@ -582,7 +582,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
  * This function calculates and returns the pnode number based on the parent's
  * nnode number and the index in parent.
  */
-static int calc_pnode_num_from_parent(struct ubifs_info *c,
+static int calc_pnode_num_from_parent(const struct ubifs_info *c,
 				      struct ubifs_nnode *parent, int iip)
 {
 	int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -965,7 +965,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int unpack_pnode(struct ubifs_info *c, void *buf,
+static int unpack_pnode(const struct ubifs_info *c, void *buf,
 			struct ubifs_pnode *pnode)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -995,15 +995,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
 }
 
 /**
- * unpack_nnode - unpack a nnode.
+ * ubifs_unpack_nnode - unpack a nnode.
  * @c: UBIFS file-system description object
  * @buf: buffer containing packed nnode to unpack
  * @nnode: nnode structure to fill
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int unpack_nnode(struct ubifs_info *c, void *buf,
-			struct ubifs_nnode *nnode)
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+		       struct ubifs_nnode *nnode)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
 	int i, pos = 0, err;
@@ -1035,7 +1035,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int unpack_ltab(struct ubifs_info *c, void *buf)
+static int unpack_ltab(const struct ubifs_info *c, void *buf)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
 	int i, pos = 0, err;
@@ -1067,7 +1067,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int unpack_lsave(struct ubifs_info *c, void *buf)
+static int unpack_lsave(const struct ubifs_info *c, void *buf)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
 	int i, pos = 0, err;
@@ -1095,7 +1095,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
+static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
 			  struct ubifs_nnode *parent, int iip)
 {
 	int i, lvl, max_offs;
@@ -1139,7 +1139,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
  *
  * This function returns %0 on success and a negative error code on failure.
  */
-static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
 			  struct ubifs_nnode *parent, int iip)
 {
 	int i;
@@ -1173,7 +1173,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
  * This function calculates the LEB numbers for the LEB properties it contains
  * based on the pnode number.
  */
-static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
+static void set_pnode_lnum(const struct ubifs_info *c,
+			   struct ubifs_pnode *pnode)
 {
 	int i, lnum;
 
@@ -1226,7 +1227,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
 		err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
 		if (err)
 			goto out;
-		err = unpack_nnode(c, buf, nnode);
+		err = ubifs_unpack_nnode(c, buf, nnode);
 		if (err)
 			goto out;
 	}
@@ -1815,7 +1816,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
 			       c->nnode_sz);
 		if (err)
 			return ERR_PTR(err);
-		err = unpack_nnode(c, buf, nnode);
+		err = ubifs_unpack_nnode(c, buf, nnode);
 		if (err)
 			return ERR_PTR(err);
 	}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index c5c07f9cd22..da60b5a0fab 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,7 @@ no_space:
 	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
 		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
 	dbg_dump_lpt_info(c);
+	dbg_dump_lpt_lebs(c);
 	dump_stack();
 	return err;
 }
@@ -549,6 +550,7 @@ no_space:
 	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
 	        "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
 	dbg_dump_lpt_info(c);
+	dbg_dump_lpt_lebs(c);
 	dump_stack();
 	return err;
 }
@@ -1027,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
  * @c: UBIFS file-system description object
  * @node_type: LPT node type
  */
-static int get_lpt_node_len(struct ubifs_info *c, int node_type)
+static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
 {
 	switch (node_type) {
 	case UBIFS_LPT_NNODE:
@@ -1048,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
  * @buf: buffer
  * @len: length of buffer
  */
-static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
+static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
 {
 	int offs, pad_len;
 
@@ -1065,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
  * @buf: buffer
  * @node_num: node number is returned here
  */
-static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
+static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
+			     int *node_num)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
 	int pos = 0, node_type;
@@ -1083,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
  *
  * This function returns %1 if the buffer contains a node or %0 if it does not.
  */
-static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
+static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
 {
 	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
 	int pos = 0, node_type, node_len;
@@ -1107,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
 	return 1;
 }
 
-
 /**
  * lpt_gc_lnum - garbage collect a LPT LEB.
  * @c: UBIFS file-system description object
@@ -1724,6 +1726,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
 		dbg_err("LPT space error: free %lld lpt_sz %lld",
 			free, c->lpt_sz);
 		dbg_dump_lpt_info(c);
+		dbg_dump_lpt_lebs(c);
 		dump_stack();
 		return -EINVAL;
 	}
@@ -1808,6 +1811,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 		}
 		if (err) {
 			dbg_dump_lpt_info(c);
+			dbg_dump_lpt_lebs(c);
 			dump_stack();
 		}
 		d->chk_lpt_sz2 = d->chk_lpt_sz;
@@ -1825,4 +1829,121 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
 	}
 }
 
+/**
+ * dump_lpt_leb - dump an LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to dump
+ *
+ * This function dumps an LEB from LPT area. Nodes in this area are very
+ * different to nodes in the main area (e.g., they do not have common headers,
+ * they do not have 8-byte alignments, etc), so we have a separate function to
+ * dump LPT area LEBs. Note, LPT has to be locked by the caller.
+ */
+static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
+{
+	int err, len = c->leb_size, node_type, node_num, node_len, offs;
+	void *buf = c->dbg->buf;
+
+	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+	       current->pid, lnum);
+	err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+	if (err) {
+		ubifs_err("cannot read LEB %d, error %d", lnum, err);
+		return;
+	}
+	while (1) {
+		offs = c->leb_size - len;
+		if (!is_a_node(c, buf, len)) {
+			int pad_len;
+
+			pad_len = get_pad_len(c, buf, len);
+			if (pad_len) {
+				printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
+				       lnum, offs, pad_len);
+				buf += pad_len;
+				len -= pad_len;
+				continue;
+			}
+			if (len)
+				printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
+				       lnum, offs, len);
+			break;
+		}
+
+		node_type = get_lpt_node_type(c, buf, &node_num);
+		switch (node_type) {
+		case UBIFS_LPT_PNODE:
+		{
+			node_len = c->pnode_sz;
+			if (c->big_lpt)
+				printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
+				       lnum, offs, node_num);
+			else
+				printk(KERN_DEBUG "LEB %d:%d, pnode\n",
+				       lnum, offs);
+			break;
+		}
+		case UBIFS_LPT_NNODE:
+		{
+			int i;
+			struct ubifs_nnode nnode;
+
+			node_len = c->nnode_sz;
+			if (c->big_lpt)
+				printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
+				       lnum, offs, node_num);
+			else
+				printk(KERN_DEBUG "LEB %d:%d, nnode, ",
+				       lnum, offs);
+			err = ubifs_unpack_nnode(c, buf, &nnode);
+			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+				printk("%d:%d", nnode.nbranch[i].lnum,
+				       nnode.nbranch[i].offs);
+				if (i != UBIFS_LPT_FANOUT - 1)
+					printk(", ");
+			}
+			printk("\n");
+			break;
+		}
+		case UBIFS_LPT_LTAB:
+			node_len = c->ltab_sz;
+			printk(KERN_DEBUG "LEB %d:%d, ltab\n",
+			       lnum, offs);
+			break;
+		case UBIFS_LPT_LSAVE:
+			node_len = c->lsave_sz;
+			printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
+			break;
+		default:
+			ubifs_err("LPT node type %d not recognized", node_type);
+			return;
+		}
+
+		buf += node_len;
+		len -= node_len;
+	}
+
+	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+	       current->pid, lnum);
+}
+
+/**
+ * dbg_dump_lpt_lebs - dump LPT lebs.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps all LPT LEBs. The caller has to make sure the LPT is
+ * locked.
+ */
+void dbg_dump_lpt_lebs(const struct ubifs_info *c)
+{
+	int i;
+
+	printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
+	       current->pid);
+	for (i = 0; i < c->lpt_lebs; i++)
+		dump_lpt_leb(c, i + c->lpt_first);
+	printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
+	       current->pid);
+}
+
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4cf28e85de7..e658b06fd45 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1622,6 +1622,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
 void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
 uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
 struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+/* Needed only in debugging code in lpt_commit.c */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+		       struct ubifs_nnode *nnode);
 
 /* lpt_commit.c */
 int ubifs_lpt_start_commit(struct ubifs_info *c);
-- 
cgit v1.2.3


From 995be04548f62c8e6b447410cd28b0666614b461 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 4 Dec 2008 17:04:18 +0300
Subject: UBIFS: fix section mismatch

This patch fixes the following section mismatch:

WARNING: fs/ubifs/ubifs.o(.init.text+0xec): Section mismatch in reference from the function init_module() to the function .exit.text:ubifs_compressors_exit()

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/compress.c | 2 +-
 fs/ubifs/ubifs.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 4afb3ea24d4..4c90ee2aef4 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -244,7 +244,7 @@ out_lzo:
 /**
  * ubifs_compressors_exit - de-initialize UBIFS compressors.
  */
-void __exit ubifs_compressors_exit(void)
+void ubifs_compressors_exit(void)
 {
 	compr_exit(&lzo_compr);
 	compr_exit(&zlib_compr);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e658b06fd45..055c6b52d2f 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1700,7 +1700,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 
 /* compressor.c */
 int __init ubifs_compressors_init(void);
-void __exit ubifs_compressors_exit(void);
+void ubifs_compressors_exit(void);
 void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
 		    int *compr_type);
 int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
-- 
cgit v1.2.3


From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:58:54 -0500
Subject: Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  | 124 ++++++++++++++++++++++++++++----
 fs/btrfs/ctree.h        |  30 +++++---
 fs/btrfs/disk-io.c      |  45 ++++++++++--
 fs/btrfs/extent_io.c    |   5 ++
 fs/btrfs/file-item.c    | 185 ++++++++++++++++++------------------------------
 fs/btrfs/inode.c        |  45 ++++++------
 fs/btrfs/ioctl.c        |  55 +-------------
 fs/btrfs/ordered-data.c |   7 +-
 fs/btrfs/ordered-data.h |  10 +--
 fs/btrfs/tree-log.c     | 121 +++++++++++++++++++++++++++----
 fs/btrfs/volumes.c      |   1 +
 11 files changed, 387 insertions(+), 241 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4febe2eb0b8..ad727413730 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -69,11 +69,27 @@ struct compressed_bio {
 
 	/* IO errors */
 	int errors;
+	int mirror_num;
 
 	/* for reads, this is the bio we are copying the data into */
 	struct bio *orig_bio;
+
+	/*
+	 * the start of a variable length array of checksums only
+	 * used by reads
+	 */
+	u32 sums;
 };
 
+static inline int compressed_bio_size(struct btrfs_root *root,
+				      unsigned long disk_size)
+{
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	return sizeof(struct compressed_bio) +
+		((disk_size + root->sectorsize - 1) / root->sectorsize) *
+		csum_size;
+}
+
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
 					u64 first_byte, gfp_t gfp_flags)
 {
@@ -96,6 +112,47 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev,
 	return bio;
 }
 
+static int check_compressed_csum(struct inode *inode,
+				 struct compressed_bio *cb,
+				 u64 disk_start)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *page;
+	unsigned long i;
+	char *kaddr;
+	u32 csum;
+	u32 *cb_sum = &cb->sums;
+
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	for (i = 0; i < cb->nr_pages; i++) {
+		page = cb->compressed_pages[i];
+		csum = ~(u32)0;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+		btrfs_csum_final(csum, (char *)&csum);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		if (csum != *cb_sum) {
+			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			       "wanted %u mirror %d\n", inode->i_ino,
+			       (unsigned long long)disk_start,
+			       csum, *cb_sum, cb->mirror_num);
+			ret = -EIO;
+			goto fail;
+		}
+		cb_sum++;
+
+	}
+	ret = 0;
+fail:
+	return ret;
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -124,16 +181,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&cb->pending_bios))
 		goto out;
 
+	inode = cb->inode;
+	ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+	if (ret)
+		goto csum_failed;
+
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	inode = cb->inode;
 	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
 					cb->orig_bio->bi_vcnt,
 					cb->compressed_len);
+csum_failed:
 	if (ret)
 		cb->errors = 1;
 
@@ -148,8 +210,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* do io completion on the original bio */
 	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
-	} else
+	} else {
+		int bio_index = 0;
+		struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+		/*
+		 * we have verified the checksum already, set page
+		 * checked so the end_io handlers know about it
+		 */
+		while(bio_index < cb->orig_bio->bi_vcnt) {
+			SetPageChecked(bvec->bv_page);
+			bvec++;
+			bio_index++;
+		}
 		bio_endio(cb->orig_bio, 0);
+	}
 
 	/* finally free the cb struct */
 	kfree(cb->compressed_pages);
@@ -277,12 +352,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	int ret;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
 	cb->start = start;
 	cb->len = len;
+	cb->mirror_num = 0;
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->orig_bio = NULL;
@@ -290,9 +366,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	ret = btrfs_csum_file_bytes(root, inode, start, len);
-	BUG_ON(ret);
-
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
@@ -325,6 +398,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
+			ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+			BUG_ON(ret);
+
 			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 			BUG_ON(ret);
 
@@ -348,6 +424,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
+	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+	BUG_ON(ret);
+
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
 
@@ -510,6 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret;
+	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
 	em_tree = &BTRFS_I(inode)->extent_tree;
@@ -521,15 +601,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				   PAGE_CACHE_SIZE);
 	spin_unlock(&em_tree->lock);
 
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	compressed_len = em->block_len;
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
+	cb->mirror_num = mirror_num;
+	sums = &cb->sums;
 
 	cb->start = em->orig_start;
-	compressed_len = em->block_len;
 	em_len = em->len;
 	em_start = em->start;
+
 	free_extent_map(em);
 	em = NULL;
 
@@ -551,11 +634,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
 
-	if (!btrfs_test_opt(root, NODATASUM) &&
-	    !btrfs_test_flag(inode, NODATASUM)) {
-		btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
-	}
-
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
 	cb->len = uncompressed_len;
@@ -568,6 +646,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		page = cb->compressed_pages[page_index];
 		page->mapping = inode->i_mapping;
+		page->index = em_start >> PAGE_CACHE_SHIFT;
+
 		if (comp_bio->bi_size)
 			ret = tree->ops->merge_bio_hook(page, 0,
 							PAGE_CACHE_SIZE,
@@ -591,7 +671,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			if (!btrfs_test_opt(root, NODATASUM) &&
+			    !btrfs_test_flag(inode, NODATASUM)) {
+				btrfs_lookup_bio_sums(root, inode, comp_bio,
+						      sums);
+			}
+			sums += (comp_bio->bi_size + root->sectorsize - 1) /
+				root->sectorsize;
+
+			ret = btrfs_map_bio(root, READ, comp_bio,
+					    mirror_num, 0);
 			BUG_ON(ret);
 
 			bio_put(comp_bio);
@@ -610,7 +699,12 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	if (!btrfs_test_opt(root, NODATASUM) &&
+	    !btrfs_test_flag(inode, NODATASUM)) {
+		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+	}
+
+	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
 
 	bio_put(comp_bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96f2ec7ad5b..242b961ae6d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -73,6 +73,9 @@ struct btrfs_ordered_sum;
 /* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -84,6 +87,13 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_RELOC_OBJECTID -8ULL
 #define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
 
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -634,6 +644,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
+	struct btrfs_root *csum_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -716,6 +727,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	/*
@@ -858,13 +870,12 @@ struct btrfs_root {
  * extent data is for file data
  */
 #define BTRFS_EXTENT_DATA_KEY	108
+
 /*
- * csum items have the checksums for data in the extents
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
  */
-#define BTRFS_CSUM_ITEM_KEY	120
-
-
-/* reserve 21-31 for other file/dir stuff */
+#define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
  * root items point to tree roots.  There are typically in the root
@@ -1917,7 +1928,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /* file-item.c */
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio);
+			  struct bio *bio, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -1929,17 +1940,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio);
+		       struct bio *bio, u64 file_start, int contig);
 int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
 			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow);
+					  u64 bytenr, int cow);
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3eb7c2576fe..61dc3b2c834 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	if (bio->bi_rw & (1 << BIO_RW))
+
+	if (bio->bi_rw & (1 << BIO_RW)) {
 		btrfs_queue_worker(&fs_info->endio_write_workers,
 				   &end_io_wq->work);
-	else
-		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	} else {
+		if (end_io_wq->metadata)
+			btrfs_queue_worker(&fs_info->endio_meta_workers,
+					   &end_io_wq->work);
+		else
+			btrfs_queue_worker(&fs_info->endio_workers,
+					   &end_io_wq->work);
+	}
 }
 
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->bdev)
+			continue;
+
 		bdi = blk_get_backing_dev_info(device->bdev);
 		if (bdi->unplug_io_fn) {
 			bdi->unplug_io_fn(bdi, page);
@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * blocksize <= pagesize, it is basically a noop
 	 */
 	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-		btrfs_queue_worker(&fs_info->endio_workers,
+		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
 		return;
 	}
@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
+	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+						 GFP_NOFS);
 	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_super_block *disk_super;
 
 	if (!extent_root || !tree_root || !fs_info ||
-	    !chunk_root || !dev_root) {
+	    !chunk_root || !dev_root || !csum_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->csum_root = csum_root;
 	fs_info->chunk_root = chunk_root;
 	fs_info->dev_root = dev_root;
 	fs_info->fs_devices = fs_devices;
@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
 			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+			   fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size);
 
@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_meta_workers,
+			    fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret)
 		goto fail_extent_root;
 
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
+	if (ret)
+		goto fail_extent_root;
+
+	csum_root->track_dirty = 1;
+
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = generation + 1;
@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (!fs_info->cleaner_kthread)
-		goto fail_extent_root;
+		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -1825,6 +1849,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
+fail_csum_root:
+	free_extent_buffer(csum_root->node);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1838,6 +1864,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
@@ -1853,6 +1880,7 @@ fail:
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
+	kfree(csum_root);
 	return ERR_PTR(err);
 }
 
@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root)
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
+	if (root->fs_info->csum_root->node);
+		free_extent_buffer(root->fs_info->csum_root->node);
+
 	btrfs_free_block_groups(root->fs_info);
 
 	del_fs_roots(fs_info);
@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->tree_root);
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c3dfe2a0ec8..7449ecf32c5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1732,6 +1732,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	int whole_page;
 	int ret;
 
+	if (err)
+		uptodate = 0;
+
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1761,6 +1764,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				if (err)
+					uptodate = 0;
 				continue;
 			}
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 234ed441736..a3ad2ce0011 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -74,8 +74,7 @@ out:
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow)
+					  u64 bytenr, int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -87,9 +86,9 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int csums_in_item;
 
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
@@ -100,11 +99,10 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 			goto fail;
 		path->slots[0]--;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-		    found_key.objectid != objectid) {
+		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
 			goto fail;
-		}
-		csum_offset = (offset - found_key.offset) >>
+
+		csum_offset = (bytenr - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
@@ -143,7 +141,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio)
+			  struct bio *bio, u32 *dst)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
@@ -151,6 +149,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	u64 offset;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
+	u64 disk_bytenr;
 	u32 diff;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
@@ -165,21 +164,22 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	while(bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		ret = btrfs_find_ordered_sum(inode, offset, &sum);
+		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
 
-		if (!item || offset < item_start_offset ||
-		    offset >= item_last_offset) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
 			u32 item_size;
 
 			if (item)
 				btrfs_release_path(root, path);
-			item = btrfs_lookup_csum(NULL, root, path,
-						 inode->i_ino, offset, 0);
+			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
 				ret = PTR_ERR(item);
 				if (ret == -ENOENT || ret == -EFBIG)
@@ -208,7 +208,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		 * this byte range must be able to fit inside
 		 * a single leaf so it will also fit inside a u32
 		 */
-		diff = offset - item_start_offset;
+		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
 
@@ -216,7 +216,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				   ((unsigned long)item) + diff,
 				   csum_size);
 found:
-		set_state_private(io_tree, offset, sum);
+		if (dst)
+			*dst++ = sum;
+		else
+			set_state_private(io_tree, offset, sum);
+		disk_bytenr += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -224,75 +228,8 @@ found:
 	return 0;
 }
 
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
-			  u64 start, unsigned long len)
-{
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	struct btrfs_ordered_extent *ordered;
-	char *data;
-	struct page *page;
-	unsigned long total_bytes = 0;
-	unsigned long this_sum_bytes = 0;
-
-	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
-	if (!sums)
-		return -ENOMEM;
-
-	sector_sum = sums->sums;
-	sums->file_offset = start;
-	sums->len = len;
-	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
-	BUG_ON(!ordered);
-
-	while(len > 0) {
-		if (start >= ordered->file_offset + ordered->len ||
-		    start < ordered->file_offset) {
-			sums->len = this_sum_bytes;
-			this_sum_bytes = 0;
-			btrfs_add_ordered_sum(inode, ordered, sums);
-			btrfs_put_ordered_extent(ordered);
-
-			sums = kzalloc(btrfs_ordered_sum_size(root, len),
-				       GFP_NOFS);
-			BUG_ON(!sums);
-			sector_sum = sums->sums;
-			sums->len = len;
-			sums->file_offset = start;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
-			BUG_ON(!ordered);
-		}
-
-		page = find_get_page(inode->i_mapping,
-				     start >> PAGE_CACHE_SHIFT);
-
-		data = kmap_atomic(page, KM_USER0);
-		sector_sum->sum = ~(u32)0;
-		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
-						  PAGE_CACHE_SIZE);
-		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(sector_sum->sum,
-				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(page);
-		page_cache_release(page);
-
-		sector_sum++;
-		total_bytes += PAGE_CACHE_SIZE;
-		this_sum_bytes += PAGE_CACHE_SIZE;
-		start += PAGE_CACHE_SIZE;
-
-		WARN_ON(len < PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-	}
-	btrfs_add_ordered_sum(inode, ordered, sums);
-	btrfs_put_ordered_extent(ordered);
-	return 0;
-}
-
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio)
+		       struct bio *bio, u64 file_start, int contig)
 {
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
@@ -303,6 +240,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	unsigned long total_bytes = 0;
 	unsigned long this_sum_bytes = 0;
 	u64 offset;
+	u64 disk_bytenr;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
@@ -310,16 +248,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		return -ENOMEM;
 
 	sector_sum = sums->sums;
-	sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+
+	if (contig)
+		offset = file_start;
+	else
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered);
+	sums->bytenr = ordered->start;
 
 	while(bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		if (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset) {
+		if (!contig)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+		if (!contig && (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset)) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
@@ -333,10 +280,9 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			BUG_ON(!sums);
 			sector_sum = sums->sums;
 			sums->len = bytes_left;
-			sums->file_offset = offset;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
+			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered);
+			sums->bytenr = ordered->start;
 		}
 
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
@@ -348,13 +294,14 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		kunmap_atomic(data, KM_USER0);
 		btrfs_csum_final(sector_sum->sum,
 				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(bvec->bv_page) +
-			bvec->bv_offset;
+		sector_sum->bytenr = disk_bytenr;
 
 		sector_sum++;
 		bio_index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
+		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bvec++;
 	}
 	this_sum_bytes = 0;
@@ -364,11 +311,10 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
 {
-	u64 objectid = inode->i_ino;
-	u64 offset;
+	u64 bytenr;
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
@@ -396,13 +342,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = sector_sum->offset;
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = sector_sum->bytenr;
+	bytenr = sector_sum->bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
+	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
 	if (!IS_ERR(item)) {
 		leaf = path->nodes[0];
 		ret = 0;
@@ -432,8 +377,8 @@ again:
 			slot = 0;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
-		if (found_key.objectid != objectid ||
-		    found_key.type != BTRFS_CSUM_ITEM_KEY) {
+		if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
 			found_next = 1;
 			goto insert;
 		}
@@ -460,10 +405,10 @@ again:
 	path->slots[0]--;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	csum_offset = (offset - found_key.offset) >>
+	csum_offset = (bytenr - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
-	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-	    found_key.objectid != objectid ||
+	if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
 	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
@@ -482,8 +427,18 @@ insert:
 	btrfs_release_path(root, path);
 	csum_offset = 0;
 	if (found_next) {
-		u64 tmp = min((u64)i_size_read(inode), next_offset);
-		tmp -= offset & ~((u64)root->sectorsize -1);
+		u64 tmp = total_bytes + root->sectorsize;
+		u64 next_sector = sector_sum->bytenr;
+		struct btrfs_sector_sum *next = sector_sum + 1;
+
+		while(tmp < sums->len) {
+			if (next_sector + root->sectorsize != next->bytenr)
+				break;
+			tmp += root->sectorsize;
+			next_sector = next->bytenr;
+			next++;
+		}
+		tmp = min(tmp, next_offset - file_key.offset);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
@@ -510,7 +465,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	cond_resched();
 next_sector:
 
@@ -541,9 +495,9 @@ next_sector:
 	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  csum_size);
-		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    sector_sum->offset) {
-			    offset = sector_sum->offset;
+		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+		    sector_sum->bytenr) {
+			bytenr = sector_sum->bytenr;
 			goto next_sector;
 		}
 	}
@@ -562,7 +516,6 @@ out:
 	return ret;
 
 fail_unlock:
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	goto out;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09efc9473a3..c03d847b8c4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	ret = btrfs_csum_one_bio(root, inode, bio);
+	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
 	BUG_ON(ret);
 	return 0;
 }
@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-
-		if (bio_flags & EXTENT_BIO_COMPRESSED)
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
-		else if (!skip_sum)
-			btrfs_lookup_bio_sums(root, inode, bio);
+		} else if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio, NULL);
 		goto mapit;
 	} else if (!skip_sum) {
 		/* we're doing a write, do the async checksumming */
@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	list_for_each(cur, list) {
 		sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
-				       inode, sum);
+		btrfs_csum_file_blocks(trans,
+		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
 	return 0;
 }
@@ -1545,6 +1544,7 @@ struct io_failure_record {
 	u64 start;
 	u64 len;
 	u64 logical;
+	unsigned long bio_flags;
 	int last_mirror;
 };
 
@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
-	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->last_mirror = 0;
+		failrec->bio_flags = 0;
 
 		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-			bio_flags = EXTENT_BIO_COMPRESSED;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+		}
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = failed_bio->bi_bdev;
 	bio->bi_size = 0;
+
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 	if (failed_bio->bi_rw & (1 << BIO_RW))
 		rw = WRITE;
@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						      failrec->last_mirror,
-						      bio_flags);
+						      failrec->bio_flags);
 	return 0;
 }
 
@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	u32 csum = ~(u32)0;
 	unsigned long flags;
 
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		goto good;
+	}
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
+
 	if (state && state->start == start) {
 		private = state->private;
 		ret = 0;
@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
-
+good:
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
 	 */
@@ -2243,6 +2251,7 @@ fail:
 	return err;
 }
 
+#if 0
 /*
  * when truncating bytes in a file, it is possible to avoid reading
  * the leaves that contain only checksum items.  This can be the
@@ -2410,6 +2419,8 @@ out:
 	return ret;
 }
 
+#endif
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	btrfs_init_path(path);
 
-	ret = drop_csum_leaves(trans, root, path, inode, new_size);
-	BUG_ON(ret);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
@@ -2509,16 +2517,11 @@ search_again:
 			}
 			item_end--;
 		}
-		if (found_type == BTRFS_CSUM_ITEM_KEY) {
-			ret = btrfs_csum_truncate(trans, root, path,
-						  new_size);
-			BUG_ON(ret);
-		}
 		if (item_end < new_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
-				found_type = BTRFS_CSUM_ITEM_KEY;
+				found_type = BTRFS_EXTENT_DATA_KEY;
 			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
 				found_type = BTRFS_XATTR_ITEM_KEY;
 			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b4da53d55c8..6228b69c2b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+
 	/*
 	 * TODO:
 	 * - split compressed inline extents.  annoying: we need to
@@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		slot = path->slots[0];
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
 		    key.objectid != src->i_ino)
 			break;
 
@@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-		if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			u32 size;
-			struct btrfs_key new_key;
-			u64 coverslen;
-			int coff, clen;
-
-			size = btrfs_item_size_nr(leaf, slot);
-			coverslen = (size / csum_size) <<
-				root->fs_info->sb->s_blocksize_bits;
-			printk("csums for %llu~%llu\n",
-			       key.offset, coverslen);
-			if (key.offset + coverslen < off ||
-			    key.offset >= off+len)
-				goto next;
-
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
-			btrfs_release_path(root, path);
-
-			coff = 0;
-			if (off > key.offset)
-				coff = ((off - key.offset) >>
-					root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			clen = size - coff;
-			if (key.offset + coverslen > off+len)
-				clen -= ((key.offset+coverslen-off-len) >>
-					 root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			printk(" will dup %d~%d of %d\n",
-			       coff, clen, size);
-
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = inode->i_ino;
-			new_key.offset = key.offset + destoff - off;
-
-			ret = btrfs_insert_empty_item(trans, root, path,
-						      &new_key, clen);
-			if (ret)
-				goto out;
-
-			leaf = path->nodes[0];
-			slot = path->slots[0];
-			write_extent_buffer(leaf, buf + coff,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    clen);
-			btrfs_mark_buffer_dirty(leaf);
-		}
-
 	next:
 		btrfs_release_path(root, path);
 		key.offset++;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 027ad6b3839..d9e232227da 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -610,7 +610,8 @@ out:
  * try to find a checksum.  This is used because we allow pages to
  * be reclaimed before their checksum is actually put into the btree
  */
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum)
 {
 	struct btrfs_ordered_sum *ordered_sum;
 	struct btrfs_sector_sum *sector_sums;
@@ -629,11 +630,11 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	mutex_lock(&tree->mutex);
 	list_for_each_prev(cur, &ordered->list) {
 		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		if (offset >= ordered_sum->file_offset) {
+		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
 			for (i = 0; i < num_sectors; i++) {
-				if (sector_sums[i].offset == offset) {
+				if (sector_sums[i].bytenr == disk_bytenr) {
 					*sum = sector_sums[i].sum;
 					ret = 0;
 					goto out;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 260bf95dfe0..ab66d5e8d6d 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,15 +33,17 @@ struct btrfs_ordered_inode_tree {
  * the ordered extent are on disk
  */
 struct btrfs_sector_sum {
-	u64 offset;
+	/* bytenr on disk */
+	u64 bytenr;
 	u32 sum;
 };
 
 struct btrfs_ordered_sum {
-	u64 file_offset;
+	/* bytenr is the start of this extent on disk */
+	u64 bytenr;
+
 	/*
 	 * this is the length in bytes covered by the sums array below.
-	 * But, the sums array may not be contiguous in the file.
 	 */
 	unsigned long len;
 	struct list_head list;
@@ -147,7 +149,7 @@ struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c766649ad45..08469ec0585 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -934,24 +934,17 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	unsigned long file_bytes;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
-	struct inode *inode;
 	unsigned long ptr;
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
-	inode = read_one_inode(root, key->objectid);
-	if (!inode) {
-		return -EIO;
-	}
-
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
 	if (!sums) {
-		iput(inode);
 		return -ENOMEM;
 	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
-	sums->file_offset = key->offset;
+	sums->bytenr = key->offset;
 
 	/*
 	 * copy all the sums into the ordered sum struct
@@ -960,7 +953,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	while(item_size > 0) {
-		sector_sum->offset = cur_offset;
+		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
 		item_size -= csum_size;
@@ -969,11 +962,9 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	}
 
 	/* let btrfs_csum_file_blocks add them into the file */
-	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
 	BUG_ON(ret);
 	kfree(sums);
-	iput(inode);
-
 	return 0;
 }
 /*
@@ -1670,7 +1661,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
 			BUG_ON(ret);
-		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
 			ret = replay_one_csum(wc->trans, root, path,
 					      eb, i, &key);
 			BUG_ON(ret);
@@ -2466,6 +2457,85 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
+				      struct list_head *list,
+				      struct btrfs_root *root,
+				      u64 disk_bytenr, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item = NULL;
+	u64 end = disk_bytenr + len;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u32 diff;
+	u32 sum;
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+
+	sector_sum = sums->sums;
+	sums->bytenr = disk_bytenr;
+	sums->len = len;
+	list_add_tail(&sums->list, list);
+
+	path = btrfs_alloc_path();
+	while(disk_bytenr < end) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(root, path);
+			item = btrfs_lookup_csum(NULL, root, path,
+						 disk_bytenr, 0);
+			if (IS_ERR(item)) {
+				ret = PTR_ERR(item);
+				if (ret == -ENOENT || ret == -EFBIG)
+					ret = 0;
+				sum = 0;
+				printk("log no csum found for byte %llu\n",
+				       (unsigned long long)disk_bytenr);
+				item = NULL;
+				btrfs_release_path(root, path);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / csum_size) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = disk_bytenr - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * csum_size;
+
+		read_extent_buffer(path->nodes[0], &sum,
+				   ((unsigned long)item) + diff,
+				   csum_size);
+found:
+		sector_sum->bytenr = disk_bytenr;
+		sector_sum->sum = sum;
+		disk_bytenr += root->sectorsize;
+		sector_sum++;
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *log,
 			       struct btrfs_path *dst_path,
@@ -2481,6 +2551,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	u32 *ins_sizes;
 	char *ins_data;
 	int i;
+	struct list_head ordered_sums;
+
+	INIT_LIST_HEAD(&ordered_sums);
 
 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
 			   nr * sizeof(u32), GFP_NOFS);
@@ -2535,6 +2608,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 								   extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
 								      extent);
+				u64 cs = btrfs_file_extent_offset(src, extent);
+				u64 cl = btrfs_file_extent_num_bytes(src,
+								     extent);;
 				/* ds == 0 is a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
@@ -2544,6 +2620,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   trans->transid,
 						   ins_keys[i].objectid);
 					BUG_ON(ret);
+					ret = copy_extent_csums(trans,
+						&ordered_sums,
+						log->fs_info->csum_root,
+						ds + cs, cl);
+					BUG_ON(ret);
 				}
 			}
 		}
@@ -2553,6 +2634,20 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
 	btrfs_release_path(log, dst_path);
 	kfree(ins_data);
+
+	/*
+	 * we have to do this after the loop above to avoid changing the
+	 * log tree while trying to change the log tree.
+	 */
+	while(!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		ret = btrfs_csum_file_blocks(trans, log, sums);
+		BUG_ON(ret);
+		list_del(&sums->list);
+		kfree(sums);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6c523b3360f..2049d179ccd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2771,6 +2771,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
+	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
 	return device;
 }
-- 
cgit v1.2.3


From a512bbf855ff0af474257475f2e6da7acd854f52 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 8 Dec 2008 16:46:26 -0500
Subject: Btrfs: superblock duplication

This patch implements superblock duplication. Superblocks
are stored at offsets 64K, 64M and 256G on every device.
The space used by superblocks is kept out of the allocator's
free space: a reverse mapping function finds the logical
addresses that correspond to superblocks. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c          | 208 ++++++++++++++++++++++++++++++--------------
 fs/btrfs/disk-io.h          |  17 +++-
 fs/btrfs/extent-tree.c      |  54 ++++++------
 fs/btrfs/free-space-cache.c |   1 -
 fs/btrfs/transaction.c      |   2 +-
 fs/btrfs/tree-log.c         |   3 +-
 fs/btrfs/volumes.c          | 107 ++++++++++++++++++-----
 fs/btrfs/volumes.h          |   6 +-
 8 files changed, 279 insertions(+), 119 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 61dc3b2c834..c72f4f3b912 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1595,8 +1595,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 
-	bh = __bread(fs_devices->latest_bdev,
-		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
 
@@ -1710,7 +1709,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root);
+	ret = btrfs_read_sys_array(tree_root, btrfs_super_bytenr(disk_super));
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
@@ -1905,19 +1904,147 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-static int write_all_supers(struct btrfs_root *root)
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+	struct buffer_head *bh;
+	struct buffer_head *latest = NULL;
+	struct btrfs_super_block *super;
+	int i;
+	u64 transid = 0;
+	u64 bytenr;
+
+	/* we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	for (i = 0; i < 1; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+			break;
+		bh = __bread(bdev, bytenr / 4096, 4096);
+		if (!bh)
+			continue;
+
+		super = (struct btrfs_super_block *)bh->b_data;
+		if (btrfs_super_bytenr(super) != bytenr ||
+		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
+			    sizeof(super->magic))) {
+			brelse(bh);
+			continue;
+		}
+
+		if (!latest || btrfs_super_generation(super) > transid) {
+			brelse(latest);
+			latest = bh;
+			transid = btrfs_super_generation(super);
+		} else {
+			brelse(bh);
+		}
+	}
+	return latest;
+}
+
+static int write_dev_supers(struct btrfs_device *device,
+			    struct btrfs_super_block *sb,
+			    int do_barriers, int wait, int max_mirrors)
+{
+	struct buffer_head *bh;
+	int i;
+	int ret;
+	int errors = 0;
+	u32 crc;
+	u64 bytenr;
+	int last_barrier = 0;
+
+	if (max_mirrors == 0)
+		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+	/* make sure only the last submit_bh does a barrier */
+	if (do_barriers) {
+		for (i = 0; i < max_mirrors; i++) {
+			bytenr = btrfs_sb_offset(i);
+			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+			    device->total_bytes)
+				break;
+			last_barrier = i;
+		}
+	}
+
+	for (i = 0; i < max_mirrors; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+			break;
+
+		if (wait) {
+			bh = __find_get_block(device->bdev, bytenr / 4096,
+					      BTRFS_SUPER_INFO_SIZE);
+			BUG_ON(!bh);
+			brelse(bh);
+			wait_on_buffer(bh);
+			if (buffer_uptodate(bh)) {
+				brelse(bh);
+				continue;
+			}
+		} else {
+			btrfs_set_super_bytenr(sb, bytenr);
+
+			crc = ~(u32)0;
+			crc = btrfs_csum_data(NULL, (char *)sb +
+					      BTRFS_CSUM_SIZE, crc,
+					      BTRFS_SUPER_INFO_SIZE -
+					      BTRFS_CSUM_SIZE);
+			btrfs_csum_final(crc, sb->csum);
+
+			bh = __getblk(device->bdev, bytenr / 4096,
+				      BTRFS_SUPER_INFO_SIZE);
+			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+			set_buffer_uptodate(bh);
+			get_bh(bh);
+			lock_buffer(bh);
+			bh->b_end_io = btrfs_end_buffer_write_sync;
+		}
+
+		if (i == last_barrier && do_barriers && device->barriers) {
+			ret = submit_bh(WRITE_BARRIER, bh);
+			if (ret == -EOPNOTSUPP) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       device->name);
+				set_buffer_uptodate(bh);
+				device->barriers = 0;
+				get_bh(bh);
+				lock_buffer(bh);
+				ret = submit_bh(WRITE, bh);
+			}
+		} else {
+			ret = submit_bh(WRITE, bh);
+		}
+
+		if (!ret && wait) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+		} else if (ret) {
+			errors++;
+		}
+		if (wait)
+			brelse(bh);
+	}
+	return errors < i ? 0 : -1;
+}
+
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
-	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
-	u32 crc;
 	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
@@ -1944,40 +2071,11 @@ static int write_all_supers(struct btrfs_root *root)
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
 		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-
-		crc = ~(u32)0;
-		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
-				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, sb->csum);
-
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
-			      BTRFS_SUPER_INFO_SIZE);
-
-		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
-		dev->pending_io = bh;
-
-		get_bh(bh);
-		set_buffer_uptodate(bh);
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_write_sync;
-
-		if (do_barriers && dev->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				dev->barriers = 0;
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
-			}
-		} else {
-			ret = submit_bh(WRITE, bh);
-		}
+		ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
 		if (ret)
 			total_errors++;
 	}
@@ -1985,8 +2083,8 @@ static int write_all_supers(struct btrfs_root *root)
 		printk("btrfs: %d errors while writing supers\n", total_errors);
 		BUG();
 	}
-	total_errors = 0;
 
+	total_errors = 0;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (!dev->bdev)
@@ -1994,29 +2092,9 @@ static int write_all_supers(struct btrfs_root *root)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		BUG_ON(!dev->pending_io);
-		bh = dev->pending_io;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(dev->pending_io)) {
-			if (do_barriers && dev->barriers) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				get_bh(bh);
-				lock_buffer(bh);
-				dev->barriers = 0;
-				ret = submit_bh(WRITE, bh);
-				BUG_ON(ret);
-				wait_on_buffer(bh);
-				if (!buffer_uptodate(bh))
-					total_errors++;
-			} else {
-				total_errors++;
-			}
-
-		}
-		dev->pending_io = NULL;
-		brelse(bh);
+		ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+		if (ret)
+			total_errors++;
 	}
 	if (total_errors > max_errors) {
 		printk("btrfs: %d errors while writing supers\n", total_errors);
@@ -2025,12 +2103,12 @@ static int write_all_supers(struct btrfs_root *root)
 	return 0;
 }
 
-int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root)
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, int max_mirrors)
 {
 	int ret;
 
-	ret = write_all_supers(root);
+	ret = write_all_supers(root, max_mirrors);
 	return ret;
 }
 
@@ -2116,7 +2194,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 
-	ret = write_ctree_super(NULL, root);
+	ret = write_ctree_super(NULL, root, 0);
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 717e94811e4..c0ff404c31b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,8 +19,20 @@
 #ifndef __DISKIO__
 #define __DISKIO__
 
-#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
 #define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX	 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+	u64 start = 16 * 1024;
+	if (mirror)
+		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+	return BTRFS_SUPER_INFO_OFFSET;
+}
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 
@@ -37,7 +49,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			      char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root);
+		      struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d1563852938..803647bc840 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -189,6 +189,29 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return 0;
 }
 
+static int remove_sb_from_cache(struct btrfs_root *root,
+				struct btrfs_block_group_cache *cache)
+{
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr, 0,
+				       &logical, &nr, &stripe_len);
+		BUG_ON(ret);
+		while (nr--) {
+			btrfs_remove_free_space(cache, logical[nr],
+						stripe_len);
+		}
+		kfree(logical);
+	}
+	return 0;
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -197,9 +220,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last = 0;
-	u64 first_free;
-	int found = 0;
+	u64 last = block_group->key.objectid;
 
 	if (!block_group)
 		return 0;
@@ -220,23 +241,13 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	first_free = max_t(u64, block_group->key.objectid,
-			   BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	key.objectid = block_group->key.objectid;
+	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
-	if (ret < 0)
-		goto err;
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid + key.offset > first_free)
-			first_free = key.objectid + key.offset;
-	}
+
 	while(1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -258,11 +269,6 @@ static int cache_block_group(struct btrfs_root *root,
 			break;
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-			if (!found) {
-				last = first_free;
-				found = 1;
-			}
-
 			add_new_free_space(block_group, root->fs_info, last,
 					   key.objectid);
 
@@ -272,13 +278,11 @@ next:
 		path->slots[0]++;
 	}
 
-	if (!found)
-		last = first_free;
-
 	add_new_free_space(block_group, root->fs_info, last,
 			   block_group->key.objectid +
 			   block_group->key.offset);
 
+	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
 	ret = 0;
 err:
@@ -1974,10 +1978,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (alloc) {
 			old_val += num_bytes;
 			cache->space_info->bytes_used += num_bytes;
-			if (cache->ro) {
+			if (cache->ro)
 				cache->space_info->bytes_readonly -= num_bytes;
-				WARN_ON(1);
-			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 09462adfbe3..2e69b9c3043 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,6 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			ret = -EINVAL;
 			goto out;
 		}
-
 		unlink_free_space(block_group, info);
 
 		if (info->bytes == bytes) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c38f6a0e30b..47cd5fcad2c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1038,7 +1038,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
-	write_ctree_super(trans, root);
+	write_ctree_super(trans, root, 0);
 
 	/*
 	 * the super is written, we can safely allow the tree-loggers
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 08469ec0585..d3f9c2c663c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1996,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
 		       btrfs_header_level(log->fs_info->log_root_tree->node));
 
-	write_ctree_super(trans, log->fs_info->tree_root);
+	write_ctree_super(trans, log->fs_info->tree_root, 2);
 	log->fs_info->tree_log_transid++;
 	log->fs_info->tree_log_batch = 0;
 	atomic_set(&log->fs_info->tree_log_commit, 0);
@@ -2006,7 +2006,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 out:
 	mutex_unlock(&log->fs_info->tree_log_mutex);
 	return 0;
-
 }
 
 /* * free all the extents used by the tree log.  This should be called
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2049d179ccd..a79b3cc09e9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -423,15 +423,11 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}
 		set_blocksize(bdev, 4096);
 
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh)
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic)))
-			goto error_brelse;
-
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		if (devid != device->devid)
 			goto error_brelse;
@@ -529,17 +525,12 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	ret = set_blocksize(bdev, 4096);
 	if (ret)
 		goto error_close;
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
 		ret = -EIO;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -EINVAL;
-		goto error_brelse;
-	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
@@ -553,7 +544,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_exclusive(bdev, flags);
@@ -1016,17 +1006,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 
 		set_blocksize(bdev, 4096);
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
 			ret = -EIO;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-			    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
@@ -2563,6 +2548,88 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				 mirror_num, NULL);
 }
 
+/*
+ * Reverse-map a physical byte offset to logical addresses inside one chunk.
+ *
+ * @map_tree:    mapping tree that holds the chunk's extent map
+ * @chunk_start: logical start of the chunk; must match an extent map exactly
+ * @physical:    byte offset on a device to translate
+ * @devid:       only consider stripes on this device, or 0 for any device
+ * @logical:     out: kzalloc'ed array of map->num_stripes entries, of which
+ *               the first *naddrs are filled in.  The caller must kfree() it.
+ * @naddrs:      out: number of distinct logical addresses found
+ * @stripe_len:  out: stripe length of the chunk
+ *
+ * Each returned address is chunk_start plus a whole number of stripe
+ * lengths.  Always returns 0; lookup or allocation failures BUG().
+ */
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 length;
+	u64 stripe_nr;
+	int i, j, nr = 0;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_start, 1);
+	spin_unlock(&em_tree->lock);
+
+	BUG_ON(!em || em->start != chunk_start);
+	map = (struct map_lookup *)em->bdev;
+
+	/* per-stripe physical extent used for the range check below */
+	length = em->len;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		do_div(length, map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		do_div(length, map->num_stripes);
+
+	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	BUG_ON(!buf);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (devid && map->stripes[i].dev->devid != devid)
+			continue;
+		/* skip stripes whose device range does not hold @physical */
+		if (map->stripes[i].physical > physical ||
+		    map->stripes[i].physical + length <= physical)
+			continue;
+
+		/* stripe index on this device, then within the whole chunk */
+		stripe_nr = physical - map->stripes[i].physical;
+		do_div(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			do_div(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		}
+		bytenr = chunk_start + stripe_nr * map->stripe_len;
+
+		/* record each logical address only once */
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr)
+				break;
+		}
+		if (j == nr)
+			buf[nr++] = bytenr;
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = map->stripe_len;
+
+	free_extent_map(em);
+	return 0;
+}
+
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page)
 {
@@ -3003,7 +3070,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3018,7 +3085,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+	sb = btrfs_find_create_tree_block(root, sb_bytenr,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fcbdcb3ae13..bdebe83c319 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -28,7 +28,6 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct buffer_head *pending_io;
 	struct bio *pending_bios;
 	struct bio *pending_bio_tail;
 	int running_pending;
@@ -125,7 +124,10 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
-int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type);
-- 
cgit v1.2.3


From 934d375bacf9ea8a37fbfff5f3cf1c093f324095 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:43:10 -0500
Subject: Btrfs: Use map_private_extent_buffer during generic_bin_search

It is possible that generic_bin_search will be called on a tree block
that has not been locked.  This happens because cache_block_group skips
locking on the tree blocks.

Since the tree block isn't locked, we aren't allowed to change
the extent_buffer->map_token field.  Using map_private_extent_buffer
avoids any changes to the internal extent buffer fields.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 5 ++++-
 fs/btrfs/extent_io.c | 1 +
 fs/btrfs/volumes.c   | 5 ++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a83cbdf1d8c..19c0dd33b1e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -813,7 +813,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 				unmap_extent_buffer(eb, map_token, KM_USER0);
 				map_token = NULL;
 			}
-			err = map_extent_buffer(eb, offset,
+
+			err = map_private_extent_buffer(eb, offset,
 						sizeof(struct btrfs_disk_key),
 						&map_token, &kaddr,
 						&map_start, &map_len, KM_USER0);
@@ -3585,6 +3586,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	int level;
 	int ret = 1;
 
+	WARN_ON(!path->keep_locks);
 again:
 	cur = btrfs_lock_root_node(root);
 	level = btrfs_header_level(cur);
@@ -3708,6 +3710,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 	int slot;
 	struct extent_buffer *c;
 
+	WARN_ON(!path->keep_locks);
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7449ecf32c5..607f5ff2791 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3471,6 +3471,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
+		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 				       map_start, map_len, km);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a79b3cc09e9..825364fae69 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2594,12 +2594,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			stripe_nr = stripe_nr * map->num_stripes + i;
 		}
 		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
 				break;
 		}
-		if (j == nr)
+		if (j == nr) {
+			WARN_ON(nr >= map->num_stripes);
 			buf[nr++] = bytenr;
+		}
 	}
 
 	for (i = 0; i > nr; i++) {
-- 
cgit v1.2.3


From c3027eb5523d6983f12628f3fe13d8a7576db701 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:40:21 -0500
Subject: Btrfs: Add inode sequence number for NFS and reserved space in a few
 structs

This adds a sequence number to the btrfs inode that is increased on
every update.  NFS will be able to use that to detect when an inode has
changed, without relying on inaccurate time fields.

While we're here, this also:

Puts reserved space into the super block and inode

Adds a log root transid to the super so we can pick the newest super
based on the fsync log as well as the main transaction ID.  For now
the log root transid is always zero, but that'll get fixed.

Adds a starting offset to the dev_item.  This will let us do better
alignment calculations if we know the start of a partition on the disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  6 +++---
 fs/btrfs/ctree.h       | 26 ++++++++++++++++++++++++++
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       |  4 +++-
 fs/btrfs/volumes.c     |  1 +
 5 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0b2e623cf42..1b9ec1ab1f6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -49,9 +49,6 @@ struct btrfs_inode {
 	 */
 	struct extent_io_tree io_failure_tree;
 
-	/* held while inserting checksums to avoid races */
-	struct mutex csum_mutex;
-
 	/* held while inesrting or deleting extents from files */
 	struct mutex extent_mutex;
 
@@ -79,6 +76,9 @@ struct btrfs_inode {
 	 */
 	u64 generation;
 
+	/* sequence number for NFS changes */
+	u64 sequence;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 242b961ae6d..f72b4381934 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -196,6 +196,12 @@ struct btrfs_dev_item {
 	/* expected generation for this device */
 	__le64 generation;
 
+	/*
+	 * starting byte of this partition on the device,
+	 * to allow for stripe alignment in the future
+	 */
+	__le64 start_offset;
+
 	/* grouping information for allocation decisions */
 	__le32 dev_group;
 
@@ -311,6 +317,9 @@ struct btrfs_super_block {
 	__le64 root;
 	__le64 chunk_root;
 	__le64 log_root;
+
+	/* this will help find the new super based on the log root */
+	__le64 log_root_transid;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -329,7 +338,11 @@ struct btrfs_super_block {
 	u8 chunk_root_level;
 	u8 log_root_level;
 	struct btrfs_dev_item dev_item;
+
 	char label[BTRFS_LABEL_SIZE];
+
+	/* future expansion */
+	__le64 reserved[32];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -463,6 +476,14 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le64 flags;
 
+	/* modification sequence number for NFS */
+	__le64 sequence;
+
+	/*
+	 * a little future expansion, for more than this we can
+	 * just grow the inode item and version it
+	 */
+	__le64 reserved[4];
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -1001,6 +1022,8 @@ BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
 BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
 BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+		   start_offset, 64);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
@@ -1135,6 +1158,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
@@ -1519,6 +1543,8 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
 			 chunk_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
 			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+			 log_root_transid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
 			 log_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1c9243560ea..b5a6a2b6f66 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1055,6 +1055,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
 	mutex_lock(&inode->i_mutex);
+	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03d847b8c4..932d8c0b2c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1963,6 +1963,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2043,6 +2044,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2945,6 +2947,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->i_default_acl = NULL;
 
 	bi->generation = 0;
+	bi->sequence = 0;
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
@@ -2959,7 +2962,6 @@ static noinline void init_btrfs_i(struct inode *inode)
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 825364fae69..4d210a731d4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -890,6 +890,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
+	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-- 
cgit v1.2.3


From 580afd76e451deb6772d0507de580fb1df14da6c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 19:15:39 -0500
Subject: Btrfs: Fix compressed checksum fsync log copies

The fsync logging code makes sure to only copy the relevant checksum for each
extent based on the file extent pointers it finds.

But for compressed extents, it needs to copy the checksum for the
entire extent.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c     | 3 ++-
 fs/btrfs/tree-log.c | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b5a6a2b6f66..71bfe3a6a44 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1228,7 +1228,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	root->fs_info->tree_log_batch++;
-	filemap_fdatawait(inode->i_mapping);
+	filemap_fdatawrite(inode->i_mapping);
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	root->fs_info->tree_log_batch++;
 
 	/*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d3f9c2c663c..6ac1b7f72e2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2610,6 +2610,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				u64 cs = btrfs_file_extent_offset(src, extent);
 				u64 cl = btrfs_file_extent_num_bytes(src,
 								     extent);;
+				if (btrfs_file_extent_compression(src,
+								  extent)) {
+					cs = 0;
+					cl = dl;
+				}
 				/* ds == 0 is a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
-- 
cgit v1.2.3


From 69423d99fc182a81f3c5db3eb5c140acc6fc64be Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Wed, 10 Dec 2008 13:37:21 +0000
Subject: [MTD] update internal API to support 64-bit device size

MTD internal API presently uses 32-bit values to represent
device size.  This patch updates them to 64-bits but leaves
the external API unchanged.  Extending the external API
is a separate issue for several reasons.  First, no one
needs it at the moment.  Secondly, whether the implementation
is done with IOCTLs, sysfs or both is still debated.  Thirdly
external API changes require the internal API to be accepted
first.

Note that although the MTD API will be able to support 64-bit
device sizes, existing drivers do not and are not required
to do so, although NAND base has been updated.

In general, changing from 32-bit to 64-bit values cause little
or no changes to the majority of the code with the following
exceptions:
    	- printk message formats
    	- division and modulus of 64-bit values
    	- NAND base support
	- 32-bit local variables used by mtdpart and mtdconcat
	- naughtily assuming one structure maps to another
	in MEMERASE ioctl

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/erase.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 259461b910a..c32b4a1ad6c 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
 	/* For NAND, if the failure did not occur at the device level for a
 	   specific physical page, don't bother updating the bad block table. */
-	if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) {
 		/* We had a device-level failure to erase.  Let's see if we've
 		   failed too many times. */
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
@@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr)
 	struct erase_priv_struct *priv = (void *)instr->priv;
 
 	if(instr->state != MTD_ERASE_DONE) {
-		printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state);
+		printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n",
+			(unsigned long long)instr->addr, instr->state);
 		jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr);
 	} else {
 		jffs2_erase_succeeded(priv->c, priv->jeb);
-- 
cgit v1.2.3


From 459931eca5f4b8c9ad259d07cc1ca49afed54804 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Dec 2008 09:10:46 -0500
Subject: Btrfs: Delete csum items when freeing extents

This finishes off the new checksumming code by removing csum items
for extents that are no longer in use.

The trick is doing it without racing because a single csum item may
hold csums for more than one extent.  Extra checks are added to
btrfs_csum_file_blocks to make sure that we are using the correct
csum item after dropping locks.

A new btrfs_split_item is added to split a single csum item so it
can be split without dropping the leaf lock.  This is used to
remove csum bytes from the middle of an item.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 131 ++++++++++++++++++++++++++--
 fs/btrfs/ctree.h       |  13 +++
 fs/btrfs/extent-tree.c |   6 +-
 fs/btrfs/file-item.c   | 226 +++++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 335 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19c0dd33b1e..c0c95cccbb5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1512,7 +1512,8 @@ cow_done:
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_header_nritems(b) >=
+			if ((p->search_for_split || ins_len > 0) &&
+			    btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 				int sret = split_node(trans, root, p, level);
 				BUG_ON(sret > 0);
@@ -1596,7 +1597,8 @@ cow_done:
 					goto done;
 				}
 			}
-			unlock_up(p, level, lowest_unlock);
+			if (!p->search_for_split)
+				unlock_up(p, level, lowest_unlock);
 			goto done;
 		}
 	}
@@ -2636,11 +2638,11 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 	struct btrfs_disk_key disk_key;
 
-	if (extend)
+	if (extend && data_size)
 		space_needed = data_size;
 
 	/* first try to make some room by pushing left and right */
-	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
+	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0) {
 			return wret;
@@ -2721,7 +2723,7 @@ again:
 	} else {
 		if (leaf_space_used(l, 0, mid + 1) + space_needed >
 			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (!extend && slot == 0) {
+			if (!extend && data_size && slot == 0) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
 				btrfs_set_header_nritems(right, 0);
 				wret = insert_ptr(trans, root, path,
@@ -2742,7 +2744,7 @@ again:
 				}
 				btrfs_mark_buffer_dirty(right);
 				return ret;
-			} else if (extend && slot == 0) {
+			} else if ((extend || !data_size) && slot == 0) {
 				mid = 1;
 			} else {
 				mid = slot;
@@ -2827,6 +2829,123 @@ again:
 	return ret;
 }
 
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation.  After
+ * the split, the path is pointing to the old item.  The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be small enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ *
+ * Returns 0 on success, -EAGAIN if the item can't be found again at the
+ * same size, and -ENOMEM if the copy buffer cannot be allocated.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *new_key,
+		     unsigned long split_offset)
+{
+	u32 item_size;
+	struct extent_buffer *leaf;
+	struct btrfs_key orig_key;
+	struct btrfs_item *item;
+	struct btrfs_item *new_item;
+	int ret = 0;
+	int slot;
+	u32 nritems;
+	u32 orig_offset;
+	struct btrfs_disk_key disk_key;
+	char *buf;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
+	if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
+		goto split;
+
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	btrfs_release_path(root, path);
+
+	path->search_for_split = 1;
+	path->keep_locks = 1;
+
+	ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+	path->search_for_split = 0;
+
+	/* if our item isn't there or got smaller, return now */
+	if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
+							path->slots[0])) {
+		path->keep_locks = 0;
+		return -EAGAIN;
+	}
+
+	ret = split_leaf(trans, root, &orig_key, path, 0, 0);
+	path->keep_locks = 0;
+	BUG_ON(ret);
+
+	/* the old leaf pointer went away with the path, re-read it first */
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
+	item = btrfs_item_nr(leaf, path->slots[0]);
+	orig_offset = btrfs_item_offset(leaf, item);
+	item_size = btrfs_item_size(leaf, item);
+
+	buf = kmalloc(item_size, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+			    path->slots[0]), item_size);
+
+	slot = path->slots[0] + 1;
+	nritems = btrfs_header_nritems(leaf);
+	if (slot != nritems) {
+		/* shift the items */
+		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+			      btrfs_item_nr_offset(slot),
+			      (nritems - slot) * sizeof(struct btrfs_item));
+	}
+
+	btrfs_cpu_key_to_disk(&disk_key, new_key);
+	btrfs_set_item_key(leaf, &disk_key, slot);
+
+	new_item = btrfs_item_nr(leaf, slot);
+	btrfs_set_item_offset(leaf, new_item, orig_offset);
+	btrfs_set_item_size(leaf, new_item, item_size - split_offset);
+
+	btrfs_set_item_offset(leaf, item,
+			      orig_offset + item_size - split_offset);
+	btrfs_set_item_size(leaf, item, split_offset);
+
+	btrfs_set_header_nritems(leaf, nritems + 1);
+
+	/* write the data for the start of the original item */
+	write_extent_buffer(leaf, buf,
+			    btrfs_item_ptr_offset(leaf, path->slots[0]),
+			    split_offset);
+
+	/* write the data for the new item */
+	write_extent_buffer(leaf, buf + split_offset,
+			    btrfs_item_ptr_offset(leaf, slot),
+			    item_size - split_offset);
+	btrfs_mark_buffer_dirty(leaf);
+
+	if (btrfs_leaf_free_space(root, leaf) < 0) {
+		btrfs_print_leaf(root, leaf);
+		BUG();
+	}
+	kfree(buf);
+	return 0;
+}
+
 /*
  * make the item pointed to by the path smaller.  new_size indicates
  * how small to make it, and from_end tells us if we just chop bytes
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f72b4381934..5b0c79d22c0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -409,6 +409,12 @@ struct btrfs_path {
 	int keep_locks;
 	int skip_locking;
 	int lowest_level;
+
+	/*
+	 * set by btrfs_split_item, tells search_slot to keep all locks
+	 * and to force calls to keep space in the nodes
+	 */
+	int search_for_split;
 };
 
 /*
@@ -1816,6 +1822,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_path *path,
 			u32 new_size, int from_end);
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_path *path,
+		     struct btrfs_key *new_key,
+		     unsigned long split_offset);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
@@ -1953,6 +1964,8 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 		       struct btrfs_key *location, int mod);
 
 /* file-item.c */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 803647bc840..cc74316dc42 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2484,7 +2484,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				mark_free = 1;
 			BUG_ON(ret < 0);
 		}
-
 		/* block accounting for super block */
 		spin_lock_irq(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2504,6 +2503,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 					 mark_free);
 		BUG_ON(ret);
 
+		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+			BUG_ON(ret);
+		}
+
 #ifdef BIO_RW_DISCARD
 		/* Tell the block device(s) that the sectors can be discarded */
 		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a3ad2ce0011..3ebef871ee6 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -310,6 +310,179 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	return 0;
 }
 
+/*
+ * helper function for csum removal, this expects the
+ * key to describe the csum pointed to by the path, and it expects
+ * the csum to overlap the range [bytenr, bytenr + len)
+ *
+ * The csum should not be entirely contained in the range and the
+ * range should not be entirely contained in the csum.
+ *
+ * This calls btrfs_truncate_item with the correct args based on the
+ * overlap, and fixes up the key as required.
+ */
+static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct btrfs_key *key,
+				      u64 bytenr, u64 len)
+{
+	struct extent_buffer *leaf;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u64 csum_end;
+	u64 end_byte = bytenr + len;
+	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+	int ret;
+
+	leaf = path->nodes[0];
+	csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+	csum_end <<= root->fs_info->sb->s_blocksize_bits;
+	csum_end += key->offset;
+
+	if (key->offset < bytenr && csum_end <= end_byte) {
+		/*
+		 *         [ bytenr - len ]
+		 *         [   ]
+		 *   [csum     ]
+		 *   A simple truncate off the end of the item
+		 */
+		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
+		new_size *= csum_size;
+		ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+		BUG_ON(ret);
+	} else if (key->offset >= bytenr && csum_end > end_byte &&
+		   end_byte > key->offset) {
+		/*
+		 *         [ bytenr - len ]
+		 *                 [ ]
+		 *                 [csum     ]
+		 * we need to truncate from the beginning of the csum
+		 */
+		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
+		new_size *= csum_size;
+
+		ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+		BUG_ON(ret);
+
+		key->offset = end_byte;
+		ret = btrfs_set_item_key_safe(trans, root, path, key);
+		BUG_ON(ret);
+	} else {
+		BUG();
+	}
+	return 0;
+}
+
+/*
+ * deletes the csum items from the csum tree for a given
+ * range of bytes.
+ */
+int btrfs_del_csums(struct btrfs_trans_handle *trans,
+		    struct btrfs_root *root, u64 bytenr, u64 len)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	u64 end_byte = bytenr + len;
+	u64 csum_end;
+	struct extent_buffer *leaf;
+	int ret;
+	u16 csum_size =
+		btrfs_super_csum_size(&root->fs_info->super_copy);
+	int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
+
+	root = root->fs_info->csum_root;
+
+	path = btrfs_alloc_path();
+
+	while(1) {
+		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+		key.offset = end_byte - 1;
+		key.type = BTRFS_EXTENT_CSUM_KEY;
+
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				goto out;
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    key.type != BTRFS_EXTENT_CSUM_KEY) {
+			break;
+		}
+
+		if (key.offset >= end_byte)
+			break;
+
+		csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
+		csum_end <<= blocksize_bits;
+		csum_end += key.offset;
+
+		/* this csum ends before we start, we're done */
+		if (csum_end <= bytenr)
+			break;
+
+		/* delete the entire item, it is inside our range */
+		if (key.offset >= bytenr && csum_end <= end_byte) {
+			ret = btrfs_del_item(trans, root, path);
+			BUG_ON(ret);
+		} else if (key.offset < bytenr && csum_end > end_byte) {
+			unsigned long offset;
+			unsigned long shift_len;
+			unsigned long item_offset;
+			/*
+			 *        [ bytenr - len ]
+			 *     [csum                ]
+			 *
+			 * Our bytes are in the middle of the csum,
+			 * we need to split this item and insert a new one.
+			 *
+			 * But we can't drop the path because the
+			 * csum could change, get removed, extended etc.
+			 *
+			 * The trick here is the max size of a csum item leaves
+			 * enough room in the tree block for a single
+			 * item header.  So, we split the item in place,
+			 * adding a new header pointing to the existing
+			 * bytes.  Then we loop around again and we have
+			 * a nicely formed csum item that we can neatly
+			 * truncate.
+			 */
+			offset = (bytenr - key.offset) >> blocksize_bits;
+			offset *= csum_size;
+
+			shift_len = (len >> blocksize_bits) * csum_size;
+
+			item_offset = btrfs_item_ptr_offset(leaf,
+							    path->slots[0]);
+
+			memset_extent_buffer(leaf, 0, item_offset + offset,
+					     shift_len);
+			key.offset = bytenr;
+
+			/*
+			 * btrfs_split_item returns -EAGAIN when the
+			 * item changed size or key
+			 */
+			ret = btrfs_split_item(trans, root, path, &key, offset);
+			BUG_ON(ret && ret != -EAGAIN);
+
+			key.offset = end_byte - 1;
+		} else {
+			ret = truncate_one_csum(trans, root, path,
+						&key, bytenr, len);
+			BUG_ON(ret);
+		}
+		btrfs_release_path(root, path);
+	}
+out:
+	btrfs_free_path(path);
+	return 0;
+}
+
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
@@ -396,28 +569,40 @@ again:
 				csum_size, 1);
 	if (ret < 0)
 		goto fail_unlock;
-	if (ret == 0) {
-		BUG();
-	}
-	if (path->slots[0] == 0) {
-		goto insert;
+
+	if (ret > 0) {
+		if (path->slots[0] == 0)
+			goto insert;
+		path->slots[0]--;
 	}
-	path->slots[0]--;
+
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 	csum_offset = (bytenr - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
+
 	if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
 	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
 	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
+
 	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
 	    csum_size) {
 		u32 diff = (csum_offset + 1) * csum_size;
+
+		/*
+		 * is the item big enough already?  we dropped our lock
+		 * before and need to recheck
+		 */
+		if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
+			goto csum;
+
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size)
+		if (diff != csum_size) {
 			goto insert;
+		}
+
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
 		goto csum;
@@ -518,30 +703,3 @@ out:
 fail_unlock:
 	goto out;
 }
-
-int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, struct btrfs_path *path,
-			u64 isize)
-{
-	struct btrfs_key key;
-	struct extent_buffer *leaf = path->nodes[0];
-	int slot = path->slots[0];
-	int ret;
-	u32 new_item_size;
-	u64 new_item_span;
-	u64 blocks;
-
-	btrfs_item_key_to_cpu(leaf, &key, slot);
-	if (isize <= key.offset)
-		return 0;
-	new_item_span = isize - key.offset;
-	blocks = (new_item_span + root->sectorsize - 1) >>
-		root->fs_info->sb->s_blocksize_bits;
-	new_item_size = blocks *
-		btrfs_super_csum_size(&root->fs_info->super_copy);
-	if (new_item_size >= btrfs_item_size_nr(leaf, slot))
-		return 0;
-	ret = btrfs_truncate_item(trans, root, path, new_item_size, 1);
-	BUG_ON(ret);
-	return ret;
-}
-- 
cgit v1.2.3


From 0bc4382ae901311fe53be5735026cbe3ea6f235f Mon Sep 17 00:00:00 2001
From: David Woodhouse <David.Woodhouse@intel.com>
Date: Wed, 10 Dec 2008 15:34:49 +0000
Subject: [JFFS2] Clean up fs/jffs2/compr_rubin.c

Triggered by a smaller cleanup from Jianjun Kong <jianjun@zeuux.org>

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/compr_rubin.c | 120 ++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index c73fa89b5f8..170d289ac78 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -22,9 +22,7 @@
 
 
 #define BIT_DIVIDER_MIPS 1043
-static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */
-
-#include <linux/errno.h>
+static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241};
 
 struct pushpull {
 	unsigned char *buf;
@@ -43,7 +41,9 @@ struct rubin_state {
 	int bits[8];
 };
 
-static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve)
+static inline void init_pushpull(struct pushpull *pp, char *buf,
+				 unsigned buflen, unsigned ofs,
+				 unsigned reserve)
 {
 	pp->buf = buf;
 	pp->buflen = buflen;
@@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen
 
 static inline int pushbit(struct pushpull *pp, int bit, int use_reserved)
 {
-	if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) {
+	if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve))
 		return -ENOSPC;
-	}
 
-	if (bit) {
-		pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7)));
-	}
-	else {
-		pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7)));
-	}
+	if (bit)
+		pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7)));
+	else
+		pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7)));
+
 	pp->ofs++;
 
 	return 0;
@@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits)
 	rs->p = (long) (2 * UPPER_BIT_RUBIN);
 	rs->bit_number = (long) 0;
 	rs->bit_divider = div;
+
 	for (c=0; c<8; c++)
 		rs->bits[c] = bits[c];
 }
@@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
 	long i0, i1;
 	int ret;
 
-	while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
+	while ((rs->q >= UPPER_BIT_RUBIN) ||
+	       ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) {
 		rs->bit_number++;
 
 		ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0);
@@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol)
 		rs->p <<= 1;
 	}
 	i0 = A * rs->p / (A + B);
-	if (i0 <= 0) {
+	if (i0 <= 0)
 		i0 = 1;
-	}
-	if (i0 >= rs->p) {
+
+	if (i0 >= rs->p)
 		i0 = rs->p - 1;
-	}
+
 	i1 = rs->p - i0;
 
 	if (symbol == 0)
@@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits)
 	/* behalve lower */
 	rs->rec_q = 0;
 
-	for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
+	for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE;
+	     rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp)))
 		;
 }
 
-static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q)
+static void __do_decode(struct rubin_state *rs, unsigned long p,
+			unsigned long q)
 {
 	register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN;
 	unsigned long rec_q;
@@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B)
 		__do_decode(rs, p, q);
 
 	i0 = A * rs->p / (A + B);
-	if (i0 <= 0) {
+	if (i0 <= 0)
 		i0 = 1;
-	}
-	if (i0 >= rs->p) {
+
+	if (i0 >= rs->p)
 		i0 = rs->p - 1;
-	}
 
 	threshold = rs->q + i0;
 	symbol = rs->rec_q >= threshold;
@@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte)
 	struct rubin_state rs_copy;
 	rs_copy = *rs;
 
-	for (i=0;i<8;i++) {
-		ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1);
+	for (i=0; i<8; i++) {
+		ret = encode(rs, rs->bit_divider-rs->bits[i],
+			     rs->bits[i], byte & 1);
 		if (ret) {
 			/* Failed. Restore old state */
 			*rs = rs_copy;
 			return ret;
 		}
-		byte=byte>>1;
+		byte >>= 1 ;
 	}
 	return 0;
 }
@@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs)
 	int i, result = 0, bit_divider = rs->bit_divider;
 
 	for (i = 0; i < 8; i++)
-		result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i;
+		result |= decode(rs, bit_divider - rs->bits[i],
+				 rs->bits[i]) << i;
 
 	return result;
 }
@@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs)
 
 
 static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
-		      unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen)
+			     unsigned char *cpage_out, uint32_t *sourcelen,
+			     uint32_t *dstlen)
 	{
 	int outpos = 0;
 	int pos=0;
@@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in,
 int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out,
 		   uint32_t *sourcelen, uint32_t *dstlen, void *model)
 {
-	return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+	return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+				 cpage_out, sourcelen, dstlen);
 }
 #endif
 static int jffs2_dynrubin_compress(unsigned char *data_in,
@@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
 		return -1;
 
 	memset(histo, 0, 256);
-	for (i=0; i<mysrclen; i++) {
+	for (i=0; i<mysrclen; i++)
 		histo[data_in[i]]++;
-	}
 	memset(bits, 0, sizeof(int)*8);
 	for (i=0; i<256; i++) {
 		if (i&128)
@@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
 		cpage_out[i] = bits[i];
 	}
 
-	ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen);
+	ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen,
+				&mydstlen);
 	if (ret)
 		return ret;
 
@@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in,
 	return 0;
 }
 
-static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in,
-			 unsigned char *page_out, uint32_t srclen, uint32_t destlen)
+static void rubin_do_decompress(int bit_divider, int *bits,
+				unsigned char *cdata_in, 
+				unsigned char *page_out, uint32_t srclen,
+				uint32_t destlen)
 {
 	int outpos = 0;
 	struct rubin_state rs;
@@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata
 	init_pushpull(&rs.pp, cdata_in, srclen, 0, 0);
 	init_decode(&rs, bit_divider, bits);
 
-	while (outpos < destlen) {
+	while (outpos < destlen)
 		page_out[outpos++] = in_byte(&rs);
-	}
 }
 
 
@@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in,
 				      uint32_t sourcelen, uint32_t dstlen,
 				      void *model)
 {
-	rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen);
+	rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in,
+			    cpage_out, sourcelen, dstlen);
 	return 0;
 }
 
@@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in,
 	for (c=0; c<8; c++)
 		bits[c] = data_in[c];
 
-	rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen);
+	rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8,
+			    dstlen);
 	return 0;
 }
 
 static struct jffs2_compressor jffs2_rubinmips_comp = {
-    .priority = JFFS2_RUBINMIPS_PRIORITY,
-    .name = "rubinmips",
-    .compr = JFFS2_COMPR_DYNRUBIN,
-    .compress = NULL, /*&jffs2_rubinmips_compress,*/
-    .decompress = &jffs2_rubinmips_decompress,
+	.priority = JFFS2_RUBINMIPS_PRIORITY,
+	.name = "rubinmips",
+	.compr = JFFS2_COMPR_DYNRUBIN,
+	.compress = NULL, /*&jffs2_rubinmips_compress,*/
+	.decompress = &jffs2_rubinmips_decompress,
 #ifdef JFFS2_RUBINMIPS_DISABLED
-    .disabled = 1,
+	.disabled = 1,
 #else
-    .disabled = 0,
+	.disabled = 0,
 #endif
 };
 
 int jffs2_rubinmips_init(void)
 {
-    return jffs2_register_compressor(&jffs2_rubinmips_comp);
+	return jffs2_register_compressor(&jffs2_rubinmips_comp);
 }
 
 void jffs2_rubinmips_exit(void)
 {
-    jffs2_unregister_compressor(&jffs2_rubinmips_comp);
+	jffs2_unregister_compressor(&jffs2_rubinmips_comp);
 }
 
 static struct jffs2_compressor jffs2_dynrubin_comp = {
-    .priority = JFFS2_DYNRUBIN_PRIORITY,
-    .name = "dynrubin",
-    .compr = JFFS2_COMPR_RUBINMIPS,
-    .compress = jffs2_dynrubin_compress,
-    .decompress = &jffs2_dynrubin_decompress,
+	.priority = JFFS2_DYNRUBIN_PRIORITY,
+	.name = "dynrubin",
+	.compr = JFFS2_COMPR_RUBINMIPS,
+	.compress = jffs2_dynrubin_compress,
+	.decompress = &jffs2_dynrubin_decompress,
 #ifdef JFFS2_DYNRUBIN_DISABLED
-    .disabled = 1,
+	.disabled = 1,
 #else
-    .disabled = 0,
+	.disabled = 0,
 #endif
 };
 
 int jffs2_dynrubin_init(void)
 {
-    return jffs2_register_compressor(&jffs2_dynrubin_comp);
+	return jffs2_register_compressor(&jffs2_dynrubin_comp);
 }
 
 void jffs2_dynrubin_exit(void)
 {
-    jffs2_unregister_compressor(&jffs2_dynrubin_comp);
+	jffs2_unregister_compressor(&jffs2_dynrubin_comp);
 }
-- 
cgit v1.2.3


From 0403e47ee26f26e960ee9038552bc89df4a1fb3d Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 10 Dec 2008 20:32:51 -0500
Subject: Btrfs: Add checking of csum tree in balancing code

This updates the space balancing code for the
new checksum format.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c     | 2 ++
 fs/btrfs/extent-tree.c | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c72f4f3b912..6f042de1ac4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1097,6 +1097,8 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->chunk_root;
 	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
 		return fs_info->dev_root;
+	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return fs_info->csum_root;
 
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cc74316dc42..673ff59c288 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -241,7 +241,7 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	key.objectid = last;
+	key.objectid = max_t(u64, last, BTRFS_SUPER_INFO_OFFSET);
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -4087,7 +4087,8 @@ static int is_cowonly_root(u64 root_objectid)
 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
-	    root_objectid == BTRFS_TREE_LOG_OBJECTID)
+	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
 		return 1;
 	return 0;
 }
@@ -5497,8 +5498,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, size);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NODATASUM |
-					  BTRFS_INODE_NOCOMPRESS);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root, path);
 out:
-- 
cgit v1.2.3


From cfc8ea87201dc9bb6aeb3fc80c61abee83e7cc06 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 11 Dec 2008 16:30:06 -0500
Subject: Btrfs: mnt_drop_write in ioctl_trans_end

Add missing mnt_drop_write to match the mnt_want_write in
btrfs_ioctl_trans_start.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/btrfs/ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6228b69c2b9..69c4a07f586 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1059,6 +1059,8 @@ long btrfs_ioctl_trans_end(struct file *file)
 	root->fs_info->open_ioctl_trans--;
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	mnt_drop_write(file->f_path.mnt);
+
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From d2fb3437e4d8d12c73c587615ad187d5288547ec Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 11 Dec 2008 16:30:39 -0500
Subject: Btrfs: fix leaking block group on balance

The block group structs are referenced in many different
places, and it's not safe to free while balancing.  So, those block
group structs were simply leaked instead.

This patch replaces the block group pointer in the inode with the starting byte
offset of the block group and adds reference counting to the block group
struct.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   8 ++-
 fs/btrfs/ctree.h       |  17 ++++---
 fs/btrfs/extent-tree.c | 132 ++++++++++++++++++++++---------------------------
 fs/btrfs/inode.c       |  43 +++++-----------
 fs/btrfs/ioctl.c       |   2 +-
 fs/btrfs/transaction.c |   2 +-
 fs/btrfs/transaction.h |   2 +-
 7 files changed, 88 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1b9ec1ab1f6..a8c9693b75a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -28,11 +28,6 @@ struct btrfs_inode {
 	/* which subvolume this inode belongs to */
 	struct btrfs_root *root;
 
-	/* the block group preferred for allocations.  This pointer is buggy
-	 * and needs to be replaced with a bytenr instead
-	 */
-	struct btrfs_block_group_cache *block_group;
-
 	/* key used to find this inode on disk.  This is used by the code
 	 * to read in roots of subvolumes
 	 */
@@ -115,6 +110,9 @@ struct btrfs_inode {
 	 */
 	u64 index_cnt;
 
+	/* the start of the block group preferred for allocations. */
+	u64 block_group;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5b0c79d22c0..8733081d97a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -653,6 +653,9 @@ struct btrfs_block_group_cache {
 
 	/* for block groups in the same raid type */
 	struct list_head list;
+
+	/* usage count */
+	atomic_t count;
 };
 
 struct btrfs_leaf_ref_tree {
@@ -1706,10 +1709,8 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr);
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner);
+u64 btrfs_find_block_group(struct btrfs_root *root,
+			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
 					     u32 blocksize, u64 parent,
@@ -1770,6 +1771,7 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 			    u64 owner_objectid);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -2019,10 +2021,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
-int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
-		struct btrfs_trans_handle *trans, u64 new_dirid,
-		struct btrfs_block_group_cache *block_group);
-
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root, struct dentry *dentry,
+			     u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 673ff59c288..1cc89246ee2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -53,10 +53,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root, int all);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root, int all);
-static struct btrfs_block_group_cache *
-__btrfs_find_block_group(struct btrfs_root *root,
-			 struct btrfs_block_group_cache *hint,
-			 u64 search_start, int data, int owner);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -142,6 +138,8 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 			break;
 		}
 	}
+	if (ret)
+		atomic_inc(&ret->count);
 	spin_unlock(&info->block_group_cache_lock);
 
 	return ret;
@@ -318,6 +316,12 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	return cache;
 }
 
+static inline void put_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (atomic_dec_and_test(&cache->count))
+		kfree(cache);
+}
+
 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
@@ -341,54 +345,16 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-static struct btrfs_block_group_cache *
-__btrfs_find_block_group(struct btrfs_root *root,
-			 struct btrfs_block_group_cache *hint,
-			 u64 search_start, int data, int owner)
+u64 btrfs_find_block_group(struct btrfs_root *root,
+			   u64 search_start, u64 search_hint, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct btrfs_block_group_cache *found_group = NULL;
-	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
-	u64 last = 0;
-	u64 free_check;
+	u64 last = max(search_hint, search_start);
+	u64 group_start = 0;
 	int full_search = 0;
-	int factor = 10;
+	int factor = 9;
 	int wrapped = 0;
-
-	if (data & BTRFS_BLOCK_GROUP_METADATA)
-		factor = 9;
-
-	if (search_start) {
-		struct btrfs_block_group_cache *shint;
-		shint = btrfs_lookup_first_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data)) {
-			spin_lock(&shint->lock);
-			used = btrfs_block_group_used(&shint->item);
-			if (used + shint->pinned + shint->reserved <
-			    div_factor(shint->key.offset, factor)) {
-				spin_unlock(&shint->lock);
-				return shint;
-			}
-			spin_unlock(&shint->lock);
-		}
-	}
-	if (hint && block_group_bits(hint, data)) {
-		spin_lock(&hint->lock);
-		used = btrfs_block_group_used(&hint->item);
-		if (used + hint->pinned + hint->reserved <
-		    div_factor(hint->key.offset, factor)) {
-			spin_unlock(&hint->lock);
-			return hint;
-		}
-		spin_unlock(&hint->lock);
-		last = hint->key.objectid + hint->key.offset;
-	} else {
-		if (hint)
-			last = max(hint->key.objectid, search_start);
-		else
-			last = search_start;
-	}
 again:
 	while (1) {
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -399,16 +365,18 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (block_group_bits(cache, data)) {
-			free_check = div_factor(cache->key.offset, factor);
+		if ((full_search || !cache->ro) &&
+		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 			if (used + cache->pinned + cache->reserved <
-			    free_check) {
-				found_group = cache;
+			    div_factor(cache->key.offset, factor)) {
+				group_start = cache->key.objectid;
 				spin_unlock(&cache->lock);
+				put_block_group(cache);
 				goto found;
 			}
 		}
 		spin_unlock(&cache->lock);
+		put_block_group(cache);
 		cond_resched();
 	}
 	if (!wrapped) {
@@ -423,18 +391,7 @@ again:
 		goto again;
 	}
 found:
-	return found_group;
-}
-
-struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
-						 struct btrfs_block_group_cache
-						 *hint, u64 search_start,
-						 int data, int owner)
-{
-
-	struct btrfs_block_group_cache *ret;
-	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
-	return ret;
+	return group_start;
 }
 
 /* simple helper to search for an existing extent at a given offset */
@@ -1809,6 +1766,19 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+{
+	struct btrfs_block_group_cache *block_group;
+	int readonly = 0;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+	if (!block_group || block_group->ro)
+		readonly = 1;
+	if (block_group)
+		put_block_group(block_group);
+	return readonly;
+}
+
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
@@ -1995,10 +1965,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 				int ret;
 				ret = btrfs_add_free_space(cache, bytenr,
 							   num_bytes);
-				if (ret)
-					return -1;
+				WARN_ON(ret);
 			}
 		}
+		put_block_group(cache);
 		total -= num_bytes;
 		bytenr += num_bytes;
 	}
@@ -2008,12 +1978,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 {
 	struct btrfs_block_group_cache *cache;
+	u64 bytenr;
 
 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
 	if (!cache)
 		return 0;
 
-	return cache->key.objectid;
+	bytenr = cache->key.objectid;
+	put_block_group(cache);
+
+	return bytenr;
 }
 
 int btrfs_update_pinned_extents(struct btrfs_root *root,
@@ -2055,6 +2029,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 			if (cache->cached)
 				btrfs_add_free_space(cache, bytenr, len);
 		}
+		put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2085,6 +2060,7 @@ static int update_reserved_extents(struct btrfs_root *root,
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
+		put_block_group(cache);
 		bytenr += len;
 		num -= len;
 	}
@@ -2724,6 +2700,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
 			BUG_ON(!cache);
 			btrfs_add_free_space(cache, bytenr, num_bytes);
+			put_block_group(cache);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
@@ -2928,6 +2905,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		}
 new_group:
 		mutex_unlock(&block_group->alloc_mutex);
+		put_block_group(block_group);
+		block_group = NULL;
 new_group_no_lock:
 		/* don't try to compare new allocations against the
 		 * last allocation any more
@@ -2997,6 +2976,8 @@ loop_check:
 
 		block_group = list_entry(cur, struct btrfs_block_group_cache,
 					 list);
+		atomic_inc(&block_group->count);
+
 		search_start = block_group->key.objectid;
 		cur = cur->next;
 	}
@@ -3004,7 +2985,7 @@ loop_check:
 	/* we found what we needed */
 	if (ins->objectid) {
 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
-			trans->block_group = block_group;
+			trans->block_group = block_group->key.objectid;
 
 		if (last_ptr)
 			*last_ptr = ins->objectid + ins->offset;
@@ -3015,6 +2996,8 @@ loop_check:
 		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
+	if (block_group)
+		put_block_group(block_group);
 
 	up_read(&space_info->groups_sem);
 	return ret;
@@ -3124,6 +3107,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
+	put_block_group(cache);
 	update_reserved_extents(root, start, len, 0);
 	return 0;
 }
@@ -3288,6 +3272,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	ret = btrfs_remove_free_space(block_group, ins->objectid,
 				      ins->offset);
 	BUG_ON(ret);
+	put_block_group(block_group);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
 	return ret;
@@ -5703,6 +5688,7 @@ next:
 	WARN_ON(block_group->reserved > 0);
 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
 	spin_unlock(&block_group->lock);
+	put_block_group(block_group);
 	ret = 0;
 out:
 	btrfs_free_path(path);
@@ -5763,6 +5749,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		down_write(&block_group->space_info->groups_sem);
 		list_del(&block_group->list);
 		up_write(&block_group->space_info->groups_sem);
+
+		WARN_ON(atomic_read(&block_group->count) != 1);
 		kfree(block_group);
 
 		spin_lock(&info->block_group_cache_lock);
@@ -5807,6 +5795,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			break;
 		}
 
+		atomic_set(&cache->count, 1);
 		spin_lock_init(&cache->lock);
 		mutex_init(&cache->alloc_mutex);
 		mutex_init(&cache->cache_mutex);
@@ -5861,11 +5850,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	mutex_init(&cache->alloc_mutex);
 	mutex_init(&cache->cache_mutex);
 	INIT_LIST_HEAD(&cache->list);
-	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
@@ -5926,10 +5916,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&block_group->space_info->lock);
 	block_group->space_info->full = 0;
 
-	/*
-	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
-	kfree(shrink_block_group);
-	*/
+	put_block_group(block_group);
+	put_block_group(block_group);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 932d8c0b2c0..0a28b770631 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -989,7 +989,6 @@ next_slot:
 
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			struct btrfs_block_group_cache *block_group;
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
@@ -1007,9 +1006,7 @@ next_slot:
 				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
 				goto out_check;
-			block_group = btrfs_lookup_block_group(root->fs_info,
-							       disk_bytenr);
-			if (!block_group || block_group->ro)
+			if (btrfs_extent_readonly(root, disk_bytenr))
 				goto out_check;
 			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
 			nocow = 1;
@@ -1969,16 +1966,11 @@ void btrfs_read_locked_inode(struct inode *inode)
 	rdev = btrfs_inode_rdev(leaf, inode_item);
 
 	BTRFS_I(inode)->index_cnt = (u64)-1;
+	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
-	BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
-						       alloc_group_block);
-	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-	if (!BTRFS_I(inode)->block_group) {
-		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
-						 NULL, 0,
-						 BTRFS_BLOCK_GROUP_METADATA, 0);
-	}
+	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+						alloc_group_block, 0);
 	btrfs_free_path(path);
 	inode_item = NULL;
 
@@ -2048,8 +2040,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-	btrfs_set_inode_block_group(leaf, item,
-				    BTRFS_I(inode)->block_group->key.objectid);
+	btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
 }
 
 /*
@@ -3358,14 +3349,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
 				     const char *name, int name_len,
-				     u64 ref_objectid,
-				     u64 objectid,
-				     struct btrfs_block_group_cache *group,
-				     int mode, u64 *index)
+				     u64 ref_objectid, u64 objectid,
+				     u64 alloc_hint, int mode, u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_block_group_cache *new_inode_group;
 	struct btrfs_key *location;
 	struct btrfs_path *path;
 	struct btrfs_inode_ref *ref;
@@ -3401,13 +3389,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	new_inode_group = btrfs_find_block_group(root, group, 0,
-				       BTRFS_BLOCK_GROUP_METADATA, owner);
-	if (!new_inode_group) {
-		printk("find_block group failed\n");
-		new_inode_group = group;
-	}
-	BTRFS_I(inode)->block_group = new_inode_group;
+	BTRFS_I(inode)->block_group =
+			btrfs_find_block_group(root, 0, alloc_hint, owner);
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4366,16 +4349,16 @@ out:
 /*
  * create a new subvolume directory/inode (helper for the ioctl).
  */
-int btrfs_create_subvol_root(struct btrfs_root *new_root, struct dentry *dentry,
-		struct btrfs_trans_handle *trans, u64 new_dirid,
-		struct btrfs_block_group_cache *block_group)
+int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *new_root, struct dentry *dentry,
+			     u64 new_dirid, u64 alloc_hint)
 {
 	struct inode *inode;
 	int error;
 	u64 index = 0;
 
 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, block_group, S_IFDIR | 0700, &index);
+				new_dirid, alloc_hint, S_IFDIR | 0700, &index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	inode->i_op = &btrfs_dir_inode_operations;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 69c4a07f586..5d67858ce99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -173,7 +173,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	trans = btrfs_start_transaction(new_root, 1);
 	BUG_ON(!trans);
 
-	ret = btrfs_create_subvol_root(new_root, dentry, trans, new_dirid,
+	ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
 				       BTRFS_I(dir)->block_group);
 	if (ret)
 		goto fail;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 47cd5fcad2c..4604178a43a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -182,7 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
 	h->blocks_used = 0;
-	h->block_group = NULL;
+	h->block_group = 0;
 	h->alloc_exclude_nr = 0;
 	h->alloc_exclude_start = 0;
 	root->fs_info->running_transaction->use_count++;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 202c8be6c05..ffe7f639732 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -41,7 +41,7 @@ struct btrfs_trans_handle {
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
 	struct btrfs_transaction *transaction;
-	struct btrfs_block_group_cache *block_group;
+	u64 block_group;
 	u64 alloc_exclude_start;
 	u64 alloc_exclude_nr;
 };
-- 
cgit v1.2.3


From e4404d6e8da678d852b7f767f665f8edf76c9e9f Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 12 Dec 2008 10:03:26 -0500
Subject: Btrfs: shared seed device

This patch makes it possible for a seed device to be shared
by multiple mounted file systems. The sharing is achieved
by cloning the seed device's btrfs_fs_devices structure.
Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c |  37 ++++----
 fs/btrfs/super.c       |   8 +-
 fs/btrfs/volumes.c     | 240 +++++++++++++++++++++++++++----------------------
 fs/btrfs/volumes.h     |   3 +-
 5 files changed, 156 insertions(+), 134 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f042de1ac4..541a8279ac7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1711,7 +1711,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root, btrfs_super_bytenr(disk_super));
+	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1cc89246ee2..171057a3267 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -218,7 +218,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last = block_group->key.objectid;
+	u64 last;
 
 	if (!block_group)
 		return 0;
@@ -239,7 +239,8 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	key.objectid = max_t(u64, last, BTRFS_SUPER_INFO_OFFSET);
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5335,8 +5336,20 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-		    pass >= 2) {
+		btrfs_record_root_in_trans(found_root);
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			/*
+			 * try to update data extent references while
+			 * keeping metadata shared between snapshots.
+			 */
+			if (pass == 1) {
+				ret = relocate_one_path(trans, found_root,
+						path, &first_key, ref_path,
+						group, reloc_inode);
+				if (ret < 0)
+					goto out;
+				continue;
+			}
 			/*
 			 * use fallback method to process the remaining
 			 * references.
@@ -5359,23 +5372,9 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 						path, extent_key,
 						&first_key, ref_path,
 						new_extents, nr_extents);
-			if (ret < 0)
-				goto out;
-			continue;
-		}
-
-		btrfs_record_root_in_trans(found_root);
-		if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		} else {
 			ret = relocate_tree_block(trans, found_root, path,
 						  &first_key, ref_path);
-		} else {
-			/*
-			 * try to update data extent references while
-			 * keeping metadata shared between snapshots.
-			 */
-			ret = relocate_one_path(trans, found_root, path,
-						&first_key, ref_path,
-						group, reloc_inode);
 		}
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09908f25fca..84c3b66564d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -58,14 +58,15 @@ static struct super_operations btrfs_super_ops;
 static void btrfs_put_super (struct super_block * sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs = root->fs_info;
 	int ret;
 
 	ret = close_ctree(root);
 	if (ret) {
 		printk("close ctree returns %d\n", ret);
 	}
-	btrfs_sysfs_del_super(fs);
+#if 0
+	btrfs_sysfs_del_super(root->fs_info);
+#endif
 	sb->s_fs_info = NULL;
 }
 
@@ -349,11 +350,12 @@ static int btrfs_fill_super(struct super_block * sb,
 		err = -ENOMEM;
 		goto fail_close;
 	}
-
+#if 0
 	/* this does the super kobj at the same time */
 	err = btrfs_sysfs_add_super(tree_root->fs_info);
 	if (err)
 		goto fail_close;
+#endif
 
 	sb->s_root = root_dentry;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4d210a731d4..6672adcec9f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -47,7 +47,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -74,34 +73,29 @@ static void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	WARN_ON(fs_devices->opened);
+	while (!list_empty(&fs_devices->devices)) {
+		device = list_entry(fs_devices->devices.next,
+				    struct btrfs_device, dev_list);
+		list_del(&device->dev_list);
+		kfree(device->name);
+		kfree(device);
+	}
+	kfree(fs_devices);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct btrfs_device *dev;
 
 	while (!list_empty(&fs_uuids)) {
 		fs_devices = list_entry(fs_uuids.next,
 					struct btrfs_fs_devices, list);
 		list_del(&fs_devices->list);
-		while(!list_empty(&fs_devices->devices)) {
-			dev = list_entry(fs_devices->devices.next,
-					 struct btrfs_device, dev_list);
-			if (dev->bdev) {
-				close_bdev_exclusive(dev->bdev, dev->mode);
-				fs_devices->open_devices--;
-			}
-			fs_devices->num_devices--;
-			if (dev->writeable)
-				fs_devices->rw_devices--;
-			list_del(&dev->dev_list);
-			list_del(&dev->dev_alloc_list);
-			kfree(dev->name);
-			kfree(dev);
-		}
-		WARN_ON(fs_devices->num_devices);
-		WARN_ON(fs_devices->open_devices);
-		WARN_ON(fs_devices->rw_devices);
-		kfree(fs_devices);
+		free_fs_devices(fs_devices);
 	}
 	return 0;
 }
@@ -304,12 +298,55 @@ static noinline int device_list_add(const char *path,
 	return 0;
 }
 
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	struct btrfs_device *orig_dev;
+
+	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!fs_devices)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&fs_devices->devices);
+	INIT_LIST_HEAD(&fs_devices->alloc_list);
+	INIT_LIST_HEAD(&fs_devices->list);
+	fs_devices->latest_devid = orig->latest_devid;
+	fs_devices->latest_trans = orig->latest_trans;
+	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			goto error;
+
+		device->name = kstrdup(orig_dev->name, GFP_NOFS);
+		if (!device->name)
+			goto error;
+
+		device->devid = orig_dev->devid;
+		device->work.func = pending_bios_fn;
+		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
+		INIT_LIST_HEAD(&device->dev_list);
+		INIT_LIST_HEAD(&device->dev_alloc_list);
+
+		list_add(&device->dev_list, &fs_devices->devices);
+		device->fs_devices = fs_devices;
+		fs_devices->num_devices++;
+	}
+	return fs_devices;
+error:
+	free_fs_devices(fs_devices);
+	return ERR_PTR(-ENOMEM);
+}
+
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
@@ -328,17 +365,14 @@ again:
 			device->writeable = 0;
 			fs_devices->rw_devices--;
 		}
-		if (!seed_devices) {
-			list_del_init(&device->dev_list);
-			fs_devices->num_devices--;
-			kfree(device->name);
-			kfree(device);
-		}
+		list_del_init(&device->dev_list);
+		fs_devices->num_devices--;
+		kfree(device->name);
+		kfree(device);
 	}
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
-		seed_devices = 1;
 		goto again;
 	}
 
@@ -348,10 +382,9 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-again:
+
 	if (--fs_devices->opened > 0)
 		return 0;
 
@@ -370,31 +403,38 @@ again:
 		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
+	WARN_ON(fs_devices->open_devices);
+	WARN_ON(fs_devices->rw_devices);
 	fs_devices->opened = 0;
 	fs_devices->seeding = 0;
-	fs_devices->sprouted = 0;
 
-	seed_devices = fs_devices->seed;
-	fs_devices->seed = NULL;
-	if (seed_devices) {
-		fs_devices = seed_devices;
-		goto again;
-	}
 	return 0;
 }
 
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
+	struct btrfs_fs_devices *seed_devices = NULL;
 	int ret;
 
 	mutex_lock(&uuid_mutex);
 	ret = __btrfs_close_devices(fs_devices);
+	if (!fs_devices->opened) {
+		seed_devices = fs_devices->seed;
+		fs_devices->seed = NULL;
+	}
 	mutex_unlock(&uuid_mutex);
+
+	while (seed_devices) {
+		fs_devices = seed_devices;
+		seed_devices = fs_devices->seed;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 fmode_t flags, void *holder)
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+				fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -490,12 +530,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 	mutex_lock(&uuid_mutex);
 	if (fs_devices->opened) {
-		if (fs_devices->sprouted) {
-			ret = -EBUSY;
-		} else {
-			fs_devices->opened++;
-			ret = 0;
-		}
+		fs_devices->opened++;
+		ret = 0;
 	} else {
 		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
@@ -1043,12 +1079,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_brelse;
 
 	device->in_fs_metadata = 0;
-	if (device->fs_devices == root->fs_info->fs_devices) {
-		list_del_init(&device->dev_list);
-		root->fs_info->fs_devices->num_devices--;
-		if (device->bdev)
-			device->fs_devices->open_devices--;
-	}
+	list_del_init(&device->dev_list);
+	device->fs_devices->num_devices--;
 
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
@@ -1057,34 +1089,27 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
+	if (device->bdev) {
+		close_bdev_exclusive(device->bdev, device->mode);
+		device->bdev = NULL;
+		device->fs_devices->open_devices--;
+	}
+
 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 
-	if (device->fs_devices != root->fs_info->fs_devices) {
-		BUG_ON(device->writeable);
-		brelse(bh);
-		if (bdev)
-			close_bdev_exclusive(bdev, FMODE_READ);
-
-		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
-			device->bdev = NULL;
-			device->fs_devices->open_devices--;
-		}
-		if (device->fs_devices->open_devices == 0) {
-			struct btrfs_fs_devices *fs_devices;
-			fs_devices = root->fs_info->fs_devices;
-			while (fs_devices) {
-				if (fs_devices->seed == device->fs_devices)
-					break;
-				fs_devices = fs_devices->seed;
-			}
-			fs_devices->seed = device->fs_devices->seed;
-			device->fs_devices->seed = NULL;
-			__btrfs_close_devices(device->fs_devices);
+	if (device->fs_devices->open_devices == 0) {
+		struct btrfs_fs_devices *fs_devices;
+		fs_devices = root->fs_info->fs_devices;
+		while (fs_devices) {
+			if (fs_devices->seed == device->fs_devices)
+				break;
+			fs_devices = fs_devices->seed;
 		}
-		ret = 0;
-		goto out;
+		fs_devices->seed = device->fs_devices->seed;
+		device->fs_devices->seed = NULL;
+		__btrfs_close_devices(device->fs_devices);
+		free_fs_devices(device->fs_devices);
 	}
 
 	/*
@@ -1099,20 +1124,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
 	}
-	brelse(bh);
 
-	if (device->bdev) {
-		/* one close for the device struct or super_block */
-		close_bdev_exclusive(device->bdev, device->mode);
-	}
-	if (bdev) {
-		/* one close for us */
-		close_bdev_exclusive(bdev, FMODE_READ);
-	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
-	goto out;
 
 error_brelse:
 	brelse(bh);
@@ -1133,34 +1148,41 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 	struct btrfs_device *device;
 	u64 super_flags;
 
 	BUG_ON(!mutex_is_locked(&uuid_mutex));
-	if (!fs_devices->seeding || fs_devices->opened != 1)
+	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!old_devices)
+	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!seed_devices)
 		return -ENOMEM;
 
-	memcpy(old_devices, fs_devices, sizeof(*old_devices));
-	old_devices->opened = 1;
-	old_devices->sprouted = 1;
-	INIT_LIST_HEAD(&old_devices->devices);
-	INIT_LIST_HEAD(&old_devices->alloc_list);
-	list_splice_init(&fs_devices->devices, &old_devices->devices);
-	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
-	list_for_each_entry(device, &old_devices->devices, dev_list) {
-		device->fs_devices = old_devices;
+	old_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(old_devices)) {
+		kfree(seed_devices);
+		return PTR_ERR(old_devices);
 	}
+
 	list_add(&old_devices->list, &fs_uuids);
 
+	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+	seed_devices->opened = 1;
+	INIT_LIST_HEAD(&seed_devices->devices);
+	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &seed_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+	list_for_each_entry(device, &seed_devices->devices, dev_list) {
+		device->fs_devices = seed_devices;
+	}
+
 	fs_devices->seeding = 0;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
-	fs_devices->seed = old_devices;
+	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2642,7 +2664,6 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 				 NULL, 0, page);
 }
 
-
 static void end_bio_multi_stripe(struct bio *bio, int err)
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
@@ -2840,6 +2861,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
+	device->fs_devices = fs_devices;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -2980,8 +3002,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (fs_devices->opened) {
-		ret = -EBUSY;
+
+	fs_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(fs_devices)) {
+		ret = PTR_ERR(fs_devices);
 		goto out;
 	}
 
@@ -2992,13 +3016,13 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	fs_devices->seed = root->fs_info->fs_devices->seed;
 	root->fs_info->fs_devices->seed = fs_devices;
-	fs_devices->sprouted = 1;
 out:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -3011,7 +3035,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-	int seed_devices = 0;
 	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
@@ -3025,14 +3048,13 @@ static int read_one_dev(struct btrfs_root *root,
 
 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
 		ret = open_seed_devices(root, fs_uuid);
-		if (ret)
+		if (ret && !btrfs_test_opt(root, DEGRADED))
 			return ret;
-		seed_devices = 1;
 	}
 
 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
 	if (!device || !device->bdev) {
-		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
 		if (!device) {
@@ -3074,7 +3096,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
+int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3089,7 +3111,7 @@ int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, sb_bytenr,
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bdebe83c319..86c44e9ae11 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -93,7 +93,6 @@ struct btrfs_fs_devices {
 
 	struct btrfs_fs_devices *seed;
 	int seeding;
-	int sprouted;
 
 	int opened;
 };
@@ -127,7 +126,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
-int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr);
+int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type);
-- 
cgit v1.2.3


From 17d217fe970d34720f4f1633dca73a6aa2f3d9d1 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 12 Dec 2008 10:03:38 -0500
Subject: Btrfs: fix nodatasum handling in balancing code

Checksums on data can be disabled by a mount option, so it's
possible that some data extents don't have checksums or have
invalid checksums. This causes trouble for data relocation.
This patch contains the following changes to make data
relocation work.

1) make the nodatasum/nodatacow mount options only affect new
files. Checksums and COW on data are only controlled by the
inode flags.

2) check for the existence of checksums in the nodatacow checker.
If checksums exist, force COW of the data extent. This ensures that
the checksum for a given block is either valid or does not exist.

3) update the data relocation code to properly handle the case
of missing checksums.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/compression.c |   9 ++--
 fs/btrfs/ctree.h       |   5 ++-
 fs/btrfs/extent-tree.c |  50 ++++++++++++++++++++--
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file-item.c   | 114 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/file.c        |   8 ----
 fs/btrfs/inode.c       |  74 ++++++++++++++++++++++++++------
 7 files changed, 226 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ad727413730..2436163d543 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -124,8 +124,7 @@ static int check_compressed_csum(struct inode *inode,
 	u32 csum;
 	u32 *cb_sum = &cb->sums;
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
+	if (btrfs_test_flag(inode, NODATASUM))
 		return 0;
 
 	for (i = 0; i < cb->nr_pages; i++) {
@@ -671,8 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			if (!btrfs_test_opt(root, NODATASUM) &&
-			    !btrfs_test_flag(inode, NODATASUM)) {
+			if (!btrfs_test_flag(inode, NODATASUM)) {
 				btrfs_lookup_bio_sums(root, inode, comp_bio,
 						      sums);
 			}
@@ -699,8 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_opt(root, NODATASUM) &&
-	    !btrfs_test_flag(inode, NODATASUM)) {
+	if (!btrfs_test_flag(inode, NODATASUM)) {
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
 	}
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8733081d97a..b89999de456 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1702,7 +1702,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 bytenr);
+			  struct btrfs_root *root, u64 objectid, u64 bytenr);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
@@ -1789,6 +1789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *buf, u64 orig_start);
 int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
@@ -1994,6 +1995,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
+			     u64 end, struct list_head *list);
 /* inode.c */
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 171057a3267..8004695d24d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1359,7 +1359,7 @@ out:
 }
 
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 bytenr)
+			  struct btrfs_root *root, u64 objectid, u64 bytenr)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct btrfs_path *path;
@@ -1418,8 +1418,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_extent_ref);
 		ref_root = btrfs_ref_root(leaf, ref_item);
-		if (ref_root != root->root_key.objectid &&
-		    ref_root != BTRFS_TREE_LOG_OBJECTID) {
+		if ((ref_root != root->root_key.objectid &&
+		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
+		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
 			ret = 1;
 			goto out;
 		}
@@ -5367,7 +5368,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 				if (ret)
 					goto out;
 			}
-			btrfs_record_root_in_trans(found_root);
 			ret = replace_one_extent(trans, found_root,
 						path, extent_key,
 						&first_key, ref_path,
@@ -5534,6 +5534,7 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	} else {
 		BUG_ON(1);
 	}
+	BTRFS_I(inode)->index_cnt = group->key.objectid;
 
 	err = btrfs_orphan_add(trans, inode);
 out:
@@ -5546,6 +5547,47 @@ out:
 	return inode;
 }
 
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct list_head list;
+	size_t offset;
+	int ret;
+	u64 disk_bytenr;
+
+	INIT_LIST_HEAD(&list);
+
+	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+	ret = btrfs_lookup_csums_range(root, disk_bytenr,
+				       disk_bytenr + len - 1, &list);
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del_init(&sums->list);
+
+		sector_sum = sums->sums;
+		sums->bytenr = ordered->start;
+
+		offset = 0;
+		while (offset < sums->len) {
+			sector_sum->bytenr += ordered->start - disk_bytenr;
+			sector_sum++;
+			offset += root->sectorsize;
+		}
+
+		btrfs_add_ordered_sum(inode, ordered, sums);
+	}
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 {
 	struct btrfs_trans_handle *trans;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2d5f67065b6..c5b483a7913 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,6 +16,7 @@
 #define EXTENT_ORDERED (1 << 9)
 #define EXTENT_ORDERED_METADATA (1 << 10)
 #define EXTENT_BOUNDARY (1 << 11)
+#define EXTENT_NODATASUM (1 << 12)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 3ebef871ee6..df0447632db 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -140,6 +140,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst)
 {
@@ -185,9 +186,16 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				if (ret == -ENOENT || ret == -EFBIG)
 					ret = 0;
 				sum = 0;
-				printk("no csum found for inode %lu start "
-				       "%llu\n", inode->i_ino,
-				       (unsigned long long)offset);
+				if (BTRFS_I(inode)->root->root_key.objectid ==
+				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+					set_extent_bits(io_tree, offset,
+						offset + bvec->bv_len - 1,
+						EXTENT_NODATASUM, GFP_NOFS);
+				} else {
+					printk("no csum found for inode %lu "
+					       "start %llu\n", inode->i_ino,
+					       (unsigned long long)offset);
+				}
 				item = NULL;
 				btrfs_release_path(root, path);
 				goto found;
@@ -228,6 +236,106 @@ found:
 	return 0;
 }
 
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+			     struct list_head *list)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_csum_item *item;
+	unsigned long offset;
+	int ret;
+	size_t size;
+	u64 csum_end;
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key.offset = start;
+	key.type = BTRFS_EXTENT_CSUM_KEY;
+
+	ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
+				&key, path, 0, 0);
+	if (ret < 0)
+		goto fail;
+	if (ret > 0 && path->slots[0] > 0) {
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
+		if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
+		    key.type == BTRFS_EXTENT_CSUM_KEY) {
+			offset = (start - key.offset) >>
+				 root->fs_info->sb->s_blocksize_bits;
+			if (offset * csum_size <
+			    btrfs_item_size_nr(leaf, path->slots[0] - 1))
+				path->slots[0]--;
+		}
+	}
+
+	while (start <= end) {
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root->fs_info->csum_root, path);
+			if (ret < 0)
+				goto fail;
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    key.type != BTRFS_EXTENT_CSUM_KEY)
+			break;
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.offset > end)
+			break;
+
+		if (key.offset > start)
+			start = key.offset;
+
+		size = btrfs_item_size_nr(leaf, path->slots[0]);
+		csum_end = key.offset + (size / csum_size) * root->sectorsize;
+
+		size = min(csum_end, end + 1) - start;
+		sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
+		BUG_ON(!sums);
+
+		sector_sum = sums->sums;
+		sums->bytenr = start;
+		sums->len = size;
+
+		offset = (start - key.offset) >>
+			 root->fs_info->sb->s_blocksize_bits;
+		offset *= csum_size;
+
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_csum_item);
+		while (size > 0) {
+			read_extent_buffer(path->nodes[0], &sector_sum->sum,
+					   ((unsigned long)item) + offset,
+					   csum_size);
+			sector_sum->bytenr = start;
+
+			size -= root->sectorsize;
+			start += root->sectorsize;
+			offset += csum_size;
+			sector_sum++;
+		}
+		list_add_tail(&sums->list, list);
+
+		path->slots[0]++;
+	}
+	ret = 0;
+fail:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig)
 {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 71bfe3a6a44..507081059d9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1059,14 +1059,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
-	/*
-	 * if this is a nodatasum mount, force summing off for the inode
-	 * all the time.  That way a later mount with summing on won't
-	 * get confused
-	 */
-	if (btrfs_test_opt(root, NODATASUM))
-		btrfs_set_flag(inode, NODATASUM);
-
 	/*
 	 * there are lots of better ways to do this, but this code
 	 * makes sure the first and last page in the file range are
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0a28b770631..e64a4fe19a6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -771,6 +771,13 @@ static noinline int cow_file_range(struct inode *inode,
 					       ram_size, cur_alloc_size, 0);
 		BUG_ON(ret);
 
+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, start,
+						      cur_alloc_size);
+			BUG_ON(ret);
+		}
+
 		if (disk_num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
 			       cur_alloc_size);
@@ -910,6 +917,26 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	return 0;
 }
 
+static int noinline csum_exist_in_range(struct btrfs_root *root,
+					u64 bytenr, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_ordered_sum *sums;
+	LIST_HEAD(list);
+
+	ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1,
+				       &list);
+	if (ret == 0 && list_empty(&list))
+		return 0;
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+	return 1;
+}
+
 /*
  * when nowcow writeback call back.  This checks for snapshots or COW copies
  * of the extents that exist in the file, and COWs the file as required.
@@ -971,6 +998,7 @@ next_slot:
 
 		nocow = 0;
 		disk_bytenr = 0;
+		num_bytes = 0;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 		if (found_key.objectid > inode->i_ino ||
@@ -996,19 +1024,29 @@ next_slot:
 				path->slots[0]++;
 				goto next_slot;
 			}
+			if (disk_bytenr == 0)
+				goto out_check;
 			if (btrfs_file_extent_compression(leaf, fi) ||
 			    btrfs_file_extent_encryption(leaf, fi) ||
 			    btrfs_file_extent_other_encoding(leaf, fi))
 				goto out_check;
-			if (disk_bytenr == 0)
-				goto out_check;
 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
 				goto out_check;
-			if (btrfs_cross_ref_exist(trans, root, disk_bytenr))
-				goto out_check;
 			if (btrfs_extent_readonly(root, disk_bytenr))
 				goto out_check;
+			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+						  disk_bytenr))
+				goto out_check;
 			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+			disk_bytenr += cur_offset - found_key.offset;
+			num_bytes = min(end + 1, extent_end) - cur_offset;
+			/*
+			 * force cow if csum exists in the range.
+			 * this ensure that csum for a given extent are
+			 * either valid or do not exist.
+			 */
+			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+				goto out_check;
 			nocow = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = found_key.offset +
@@ -1041,8 +1079,6 @@ out_check:
 			cow_start = (u64)-1;
 		}
 
-		disk_bytenr += cur_offset - found_key.offset;
-		num_bytes = min(end + 1, extent_end) - cur_offset;
 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			struct extent_map *em;
 			struct extent_map_tree *em_tree;
@@ -1105,11 +1141,9 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 			      u64 start, u64 end, int *page_started,
 			      unsigned long *nr_written)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
-	if (btrfs_test_opt(root, NODATACOW) ||
-	    btrfs_test_flag(inode, NODATACOW))
+	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 1, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
@@ -1252,8 +1286,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	skip_sum = btrfs_test_opt(root, NODATASUM) ||
-		btrfs_test_flag(inode, NODATASUM);
+	skip_sum = btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
@@ -1263,6 +1296,9 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			btrfs_lookup_bio_sums(root, inode, bio, NULL);
 		goto mapit;
 	} else if (!skip_sum) {
+		/* csum items have already been cloned */
+		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+			goto mapit;
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
@@ -1692,9 +1728,15 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ClearPageChecked(page);
 		goto good;
 	}
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
+	if (btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
+	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
+				  GFP_NOFS);
 		return 0;
+	}
 
 	if (state && state->start == start) {
 		private = state->private;
@@ -3391,6 +3433,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 1;
 	BTRFS_I(inode)->block_group =
 			btrfs_find_block_group(root, 0, alloc_hint, owner);
+	if ((mode & S_IFREG)) {
+		if (btrfs_test_opt(root, NODATASUM))
+			btrfs_set_flag(inode, NODATASUM);
+		if (btrfs_test_opt(root, NODATACOW))
+			btrfs_set_flag(inode, NODATACOW);
+	}
 
 	key[0].objectid = objectid;
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
-- 
cgit v1.2.3


From 42dc7babdcc99feadb04d461592ce5898a362550 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Dec 2008 11:44:56 -0500
Subject: Btrfs: Fix compressed writes on truncated pages

The compression code was using isize to limit the amount of data it
sent through zlib.  But, it wasn't properly limiting the looping to
just the pages inside i_size.  The end result was trying to compress
too many pages, including those that had not been set up and properly locked
down.  This made the compression code oops while calling find_get_page on a
page that didn't exist.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 2 +-
 fs/btrfs/inode.c | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c0c95cccbb5..f6f7a6af035 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2887,8 +2887,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
-	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 	leaf = path->nodes[0];
+	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
 split:
 	item = btrfs_item_nr(leaf, path->slots[0]);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e64a4fe19a6..5313a13a998 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -332,6 +332,7 @@ static noinline int compress_file_range(struct inode *inode,
 	u64 disk_num_bytes;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
+	u64 isize = i_size_read(inode);
 	int ret = 0;
 	struct page **pages = NULL;
 	unsigned long nr_pages;
@@ -345,12 +346,12 @@ static noinline int compress_file_range(struct inode *inode,
 
 	orig_start = start;
 
+	actual_end = min_t(u64, isize, end + 1);
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
-	actual_end = min_t(u64, i_size_read(inode), end + 1);
 	total_compressed = actual_end - start;
 
 	/* we want to make sure that amount of ram required to uncompress
@@ -488,7 +489,7 @@ again:
 		add_async_extent(async_cow, start, num_bytes,
 				 total_compressed, pages, nr_pages_ret);
 
-		if (start + num_bytes < end) {
+		if (start + num_bytes < end && start + num_bytes < actual_end) {
 			start += num_bytes;
 			pages = NULL;
 			cond_resched();
@@ -696,6 +697,7 @@ static noinline int cow_file_range(struct inode *inode,
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
+	u64 isize = i_size_read(inode);
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -705,7 +707,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
 
-	actual_end = min_t(u64, i_size_read(inode), end + 1);
+	actual_end = min_t(u64, isize, end + 1);
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-- 
cgit v1.2.3


From 75eff68ea6959efd999b125fc3dcf73931a3b30b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 15 Dec 2008 15:54:40 -0500
Subject: Btrfs: Don't use spin*lock_irq for the delalloc lock

The delalloc lock doesn't need to have irqs disabled, nobody that
changes the number of delalloc bytes in the FS is running with irqs off.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 12 ++++++------
 fs/btrfs/inode.c       | 34 ++++++++++++++++++++--------------
 2 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8004695d24d..fbd6a8f28b5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1047,11 +1047,11 @@ search:
 		end = pos;
 
 		/* update the free space counters */
-		spin_lock_irq(&info->delalloc_lock);
+		spin_lock(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - bytes_freed);
-		spin_unlock_irq(&info->delalloc_lock);
+		spin_unlock(&info->delalloc_lock);
 
 		root_used = btrfs_root_used(&extent_root->root_item);
 		btrfs_set_root_used(&extent_root->root_item,
@@ -2463,11 +2463,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			BUG_ON(ret < 0);
 		}
 		/* block accounting for super block */
-		spin_lock_irq(&info->delalloc_lock);
+		spin_lock(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - num_bytes);
-		spin_unlock_irq(&info->delalloc_lock);
+		spin_unlock(&info->delalloc_lock);
 
 		/* block accounting for root item */
 		root_used = btrfs_root_used(&root->root_item);
@@ -3151,10 +3151,10 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 		parent = ins->objectid;
 
 	/* block accounting for super block */
-	spin_lock_irq(&info->delalloc_lock);
+	spin_lock(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
-	spin_unlock_irq(&info->delalloc_lock);
+	spin_unlock(&info->delalloc_lock);
 
 	/* block accounting for root item */
 	root_used = btrfs_root_used(&root->root_item);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5313a13a998..0577e77e661 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -101,10 +101,9 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	u64 total;
 	u64 used;
 	u64 thresh;
-	unsigned long flags;
 	int ret = 0;
 
-	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	spin_lock(&root->fs_info->delalloc_lock);
 	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
 	if (for_del)
@@ -116,7 +115,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 
 	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
 		ret = -ENOSPC;
-	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+	spin_unlock(&root->fs_info->delalloc_lock);
 	return ret;
 }
 
@@ -1166,17 +1165,21 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
-	unsigned long flags;
+	/*
+	 * set_bit and clear bit hooks normally require _irqsave/restore
+	 * but in this case, we are only testeing for the DELALLOC
+	 * bit, which is only set or cleared with irqs on
+	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
 		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
 		}
-		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+		spin_unlock(&root->fs_info->delalloc_lock);
 	}
 	return 0;
 }
@@ -1187,11 +1190,15 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
+	/*
+	 * set_bit and clear bit hooks normally require _irqsave/restore
+	 * but in this case, we are only testeing for the DELALLOC
+	 * bit, which is only set or cleared with irqs on
+	 */
 	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		unsigned long flags;
 
-		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+		spin_lock(&root->fs_info->delalloc_lock);
 		if (end - start + 1 > root->fs_info->delalloc_bytes) {
 			printk("warning: delalloc account %Lu %Lu\n",
 			       end - start + 1, root->fs_info->delalloc_bytes);
@@ -1205,7 +1212,7 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
 		}
-		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+		spin_unlock(&root->fs_info->delalloc_lock);
 	}
 	return 0;
 }
@@ -4651,27 +4658,26 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
-	unsigned long flags;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
-	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	spin_lock(&root->fs_info->delalloc_lock);
 	while(!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
 		if (!inode)
 			list_del_init(&binode->delalloc_inodes);
-		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+		spin_unlock(&root->fs_info->delalloc_lock);
 		if (inode) {
 			filemap_flush(inode->i_mapping);
 			iput(inode);
 		}
 		cond_resched();
-		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+		spin_lock(&root->fs_info->delalloc_lock);
 	}
-	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+	spin_unlock(&root->fs_info->delalloc_lock);
 
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
-- 
cgit v1.2.3


From dcbdd4dcb9793b00b46ab023e9330922c8c7c54c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 16 Dec 2008 13:51:01 -0500
Subject: Btrfs: delete checksum items before marking blocks free

Btrfs maintains a cache of blocks available for allocation in ram.  The
code that frees extents was marking the extents free and then deleting
the checksum items.

This meant it was possible the extent would be reallocated before the
checksum item was actually deleted, leading to races and other
problems as the checksums were updated for the newly allocated extent.

The fix is to delete the checksum before marking the extent free.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 +++---
 fs/btrfs/file-item.c   | 4 ++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fbd6a8f28b5..9ef2a2be268 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2477,15 +2477,15 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				      num_to_del);
 		BUG_ON(ret);
 		btrfs_release_path(extent_root, path);
-		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-					 mark_free);
-		BUG_ON(ret);
 
 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
 		}
 
+		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+					 mark_free);
+		BUG_ON(ret);
 #ifdef BIO_RW_DISCARD
 		/* Tell the block device(s) that the sectors can be discarded */
 		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index df0447632db..7acadf3b742 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -537,6 +537,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		if (key.offset >= bytenr && csum_end <= end_byte) {
 			ret = btrfs_del_item(trans, root, path);
 			BUG_ON(ret);
+			if (key.offset == bytenr)
+				break;
 		} else if (key.offset < bytenr && csum_end > end_byte) {
 			unsigned long offset;
 			unsigned long shift_len;
@@ -583,6 +585,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			ret = truncate_one_csum(trans, root, path,
 						&key, bytenr, len);
 			BUG_ON(ret);
+			if (key.offset < bytenr)
+				break;
 		}
 		btrfs_release_path(root, path);
 	}
-- 
cgit v1.2.3


From 87b29b208c6c38f3446d2de6ece946e2459052cf Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 17 Dec 2008 10:21:48 -0500
Subject: Btrfs: properly check free space for tree balancing

btrfs_insert_empty_items takes the space needed by the btrfs_item
structure into account when calculating the required free space.

So the tree balancing code shouldn't add sizeof(struct btrfs_item)
to the size when checking the free space. This patch removes these
superfluous additions.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c     | 52 ++++++++++++++++++++++++++--------------------------
 fs/btrfs/file-item.c |  4 ++++
 fs/btrfs/tree-log.c  |  9 ++-------
 3 files changed, 32 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f6f7a6af035..7fad2e3ad6f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1587,8 +1587,8 @@ cow_done:
 				btrfs_tree_lock(b);
 		} else {
 			p->slots[level] = slot;
-			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
-			    sizeof(struct btrfs_item) + ins_len) {
+			if (ins_len > 0 &&
+			    btrfs_leaf_free_space(root, b) < ins_len) {
 				int sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
 				BUG_ON(sret > 0);
@@ -2231,7 +2231,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item))
+	if (free_space < data_size)
 		goto out_unlock;
 
 	/* cow and double check */
@@ -2241,7 +2241,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out_unlock;
 
 	free_space = btrfs_leaf_free_space(root, right);
-	if (free_space < data_size + sizeof(struct btrfs_item))
+	if (free_space < data_size)
 		goto out_unlock;
 
 	left_nritems = btrfs_header_nritems(left);
@@ -2254,7 +2254,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		nr = 1;
 
 	if (path->slots[0] >= left_nritems)
-		push_space += data_size + sizeof(*item);
+		push_space += data_size;
 
 	i = left_nritems - 1;
 	while (i >= nr) {
@@ -2271,7 +2271,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		if (path->slots[0] == i)
-			push_space += data_size + sizeof(*item);
+			push_space += data_size;
 
 		if (!left->map_token) {
 			map_extent_buffer(left, (unsigned long)item,
@@ -2427,7 +2427,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
+	if (free_space < data_size) {
 		ret = 1;
 		goto out;
 	}
@@ -2442,7 +2442,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	}
 
 	free_space = btrfs_leaf_free_space(root, left);
-	if (free_space < data_size + sizeof(struct btrfs_item)) {
+	if (free_space < data_size) {
 		ret = 1;
 		goto out;
 	}
@@ -2473,7 +2473,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		if (path->slots[0] == i)
-			push_space += data_size + sizeof(*item);
+			push_space += data_size;
 
 		this_item_size = btrfs_item_size(right, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
@@ -2510,7 +2510,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		     btrfs_item_offset_nr(right, push_items - 1),
 		     push_space);
 	old_left_nritems = btrfs_header_nritems(left);
-	BUG_ON(old_left_nritems < 0);
+	BUG_ON(old_left_nritems <= 0);
 
 	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
 	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
@@ -2628,7 +2628,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
-	int space_needed = data_size + sizeof(struct btrfs_item);
 	int data_copy_size;
 	int rt_data_off;
 	int i;
@@ -2638,9 +2637,6 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 	struct btrfs_disk_key disk_key;
 
-	if (extend && data_size)
-		space_needed = data_size;
-
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -2655,7 +2651,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		l = path->nodes[0];
 
 		/* did the pushes work? */
-		if (btrfs_leaf_free_space(root, l) >= space_needed)
+		if (btrfs_leaf_free_space(root, l) >= data_size)
 			return 0;
 	}
 
@@ -2694,7 +2690,7 @@ again:
 			    BTRFS_UUID_SIZE);
 	if (mid <= slot) {
 		if (nritems == 1 ||
-		    leaf_space_used(l, mid, nritems - mid) + space_needed >
+		    leaf_space_used(l, mid, nritems - mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (slot >= nritems) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
@@ -2716,12 +2712,12 @@ again:
 			mid = slot;
 			if (mid != nritems &&
 			    leaf_space_used(l, mid, nritems - mid) +
-			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 				double_split = 1;
 			}
 		}
 	} else {
-		if (leaf_space_used(l, 0, mid + 1) + space_needed >
+		if (leaf_space_used(l, 0, mid) + data_size >
 			BTRFS_LEAF_DATA_SIZE(root)) {
 			if (!extend && data_size && slot == 0) {
 				btrfs_cpu_key_to_disk(&disk_key, ins_key);
@@ -2750,7 +2746,7 @@ again:
 				mid = slot;
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
-				    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
 					double_split = 1;
 				}
 			}
@@ -2883,7 +2879,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 		return -EAGAIN;
 	}
 
-	ret = split_leaf(trans, root, &orig_key, path, 0, 0);
+	ret = split_leaf(trans, root, &orig_key, path,
+			 sizeof(struct btrfs_item), 1);
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
@@ -3169,14 +3166,17 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct btrfs_key found_key;
 
-	found_key.objectid = 0;
-	nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));
-
-	for (i = 0; i < nr; i++)
+	for (i = 0; i < nr; i++) {
+		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
+		    BTRFS_LEAF_DATA_SIZE(root)) {
+			break;
+			nr = i;
+		}
 		total_data += data_size[i];
+		total_size += data_size[i] + sizeof(struct btrfs_item);
+	}
+	BUG_ON(nr == 0);
 
-	total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
-	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
 	if (ret == 0)
 		return -EEXIST;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7acadf3b742..cc6e0b6de94 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -300,6 +300,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 
 		size = btrfs_item_size_nr(leaf, path->slots[0]);
 		csum_end = key.offset + (size / csum_size) * root->sectorsize;
+		if (csum_end <= start) {
+			path->slots[0]++;
+			continue;
+		}
 
 		size = min(csum_end, end + 1) - start;
 		sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6ac1b7f72e2..33eee256ee8 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -374,13 +374,8 @@ insert:
 		if (found_size > item_size) {
 			btrfs_truncate_item(trans, root, path, item_size, 1);
 		} else if (found_size < item_size) {
-			ret = btrfs_del_item(trans, root,
-					     path);
-			BUG_ON(ret);
-
-			btrfs_release_path(root, path);
-			ret = btrfs_insert_empty_item(trans,
-				  root, path, key, item_size);
+			ret = btrfs_extend_item(trans, root, path,
+						item_size - found_size);
 			BUG_ON(ret);
 		}
 	} else if (ret) {
-- 
cgit v1.2.3


From cad321ad529400c6ab24c501a67c3be720a0744c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 17 Dec 2008 14:51:42 -0500
Subject: Btrfs: shift all end_io work to thread pools

bio_end_io for reads without checksumming on and btree writes were
happening without using async thread pools.  This means the extent_io.c
code had to use spin_lock_irq and friends on the rb tree locks for
extent state.

There were some irq safe vs unsafe lock inversions between the delalloc
lock and the extent state locks.  This patch gets rid of them by moving
all end_io code into the thread pools.

To avoid contention and deadlocks between the data end_io processing and the
metadata end_io processing yet another thread pool is added to finish
off metadata writes.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h     |  1 +
 fs/btrfs/disk-io.c   | 39 ++++++++++++++++++++++++++-------------
 fs/btrfs/extent_io.c | 51 ++++++++++++++++++++++++---------------------------
 fs/btrfs/inode.c     | 12 ++++++------
 4 files changed, 57 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b89999de456..ccea0648e10 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -758,6 +758,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 541a8279ac7..04f8d7080b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -447,8 +447,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & (1 << BIO_RW)) {
-		btrfs_queue_worker(&fs_info->endio_write_workers,
-				   &end_io_wq->work);
+		if (end_io_wq->metadata)
+			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
+					   &end_io_wq->work);
+		else
+			btrfs_queue_worker(&fs_info->endio_write_workers,
+					   &end_io_wq->work);
 	} else {
 		if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
@@ -624,23 +628,24 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags)
 {
-	/*
-	 * kthread helpers are used to submit writes so that checksumming
-	 * can happen in parallel across all CPUs
-	 */
+	int ret;
+
+	ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+					  bio, 1);
+	BUG_ON(ret);
+
 	if (!(rw & (1 << BIO_RW))) {
-		int ret;
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
 		 */
-		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
-					  bio, 1);
-		BUG_ON(ret);
-
 		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
 				     mirror_num, 0);
 	}
+	/*
+	 * kthread helpers are used to submit writes so that checksumming
+	 * can happen in parallel across all CPUs
+	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_start,
@@ -1350,12 +1355,13 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio = end_io_wq->bio;
 	fs_info = end_io_wq->info;
 
-	/* metadata bios are special because the whole tree block must
+	/* metadata bio reads are special because the whole tree block must
 	 * be checksummed at once.  This makes sure the entire block is in
 	 * ram and up to date before trying to verify things.  For
 	 * blocksize <= pagesize, it is basically a noop
 	 */
-	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
+	if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata &&
+	    !bio_ready_for_csum(bio)) {
 		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
 		return;
@@ -1668,6 +1674,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
 			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_meta_write_workers,
+			   "endio-meta-write", fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size);
 
@@ -1677,6 +1685,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_write_workers.idle_thresh = 64;
+	fs_info->endio_meta_write_workers.idle_thresh = 64;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
@@ -1685,6 +1694,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_meta_workers,
 			    fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_meta_write_workers,
+			    fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
@@ -1866,6 +1877,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
@@ -2253,6 +2265,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 607f5ff2791..25ce2d18e5b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -477,7 +477,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
-	unsigned long flags;
 	int err;
 	int set = 0;
 
@@ -488,7 +487,7 @@ again:
 			return -ENOMEM;
 	}
 
-	spin_lock_irqsave(&tree->lock, flags);
+	spin_lock(&tree->lock);
 	/*
 	 * this search will find the extents that end after
 	 * our range starts
@@ -559,7 +558,7 @@ again:
 	goto search_again;
 
 out:
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock(&tree->lock);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -568,7 +567,7 @@ out:
 search_again:
 	if (start > end)
 		goto out;
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock(&tree->lock);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
@@ -582,9 +581,9 @@ static int wait_on_state(struct extent_io_tree *tree,
 {
 	DEFINE_WAIT(wait);
 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	schedule();
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 	finish_wait(&state->wq, &wait);
 	return 0;
 }
@@ -599,7 +598,7 @@ int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 	struct extent_state *state;
 	struct rb_node *node;
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 again:
 	while (1) {
 		/*
@@ -628,13 +627,13 @@ again:
 			break;
 
 		if (need_resched()) {
-			spin_unlock_irq(&tree->lock);
+			spin_unlock(&tree->lock);
 			cond_resched();
-			spin_lock_irq(&tree->lock);
+			spin_lock(&tree->lock);
 		}
 	}
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return 0;
 }
 EXPORT_SYMBOL(wait_extent_bit);
@@ -668,7 +667,6 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int b
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
-	unsigned long flags;
 	int err = 0;
 	int set;
 	u64 last_start;
@@ -680,7 +678,7 @@ again:
 			return -ENOMEM;
 	}
 
-	spin_lock_irqsave(&tree->lock, flags);
+	spin_lock(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -800,7 +798,7 @@ again:
 	goto search_again;
 
 out:
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock(&tree->lock);
 	if (prealloc)
 		free_extent_state(prealloc);
 
@@ -809,7 +807,7 @@ out:
 search_again:
 	if (start > end)
 		goto out;
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock(&tree->lock);
 	if (mask & __GFP_WAIT)
 		cond_resched();
 	goto again;
@@ -1021,7 +1019,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	struct extent_state *state;
 	int ret = 1;
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -1044,7 +1042,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			break;
 	}
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return ret;
 }
 EXPORT_SYMBOL(find_first_extent_bit);
@@ -1097,7 +1095,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 	u64 found = 0;
 	u64 total_bytes = 0;
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 
 	/*
 	 * this search will find all the extents that end after
@@ -1134,7 +1132,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 			break;
 	}
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return found;
 }
 
@@ -1391,7 +1389,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		return 0;
 	}
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
 		total_bytes = tree->dirty_bytes;
 		goto out;
@@ -1424,7 +1422,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 			break;
 	}
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return total_bytes;
 }
 
@@ -1501,7 +1499,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	struct extent_state *state;
 	int ret = 0;
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -1518,7 +1516,7 @@ int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 	}
 	state->private = private;
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return ret;
 }
 
@@ -1528,7 +1526,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	struct extent_state *state;
 	int ret = 0;
 
-	spin_lock_irq(&tree->lock);
+	spin_lock(&tree->lock);
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -1545,7 +1543,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 	}
 	*private = state->private;
 out:
-	spin_unlock_irq(&tree->lock);
+	spin_unlock(&tree->lock);
 	return ret;
 }
 
@@ -1561,9 +1559,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state = NULL;
 	struct rb_node *node;
 	int bitset = 0;
-	unsigned long flags;
 
-	spin_lock_irqsave(&tree->lock, flags);
+	spin_lock(&tree->lock);
 	node = tree_search(tree, start);
 	while (node && start <= end) {
 		state = rb_entry(node, struct extent_state, rb_node);
@@ -1594,7 +1591,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			break;
 		}
 	}
-	spin_unlock_irqrestore(&tree->lock, flags);
+	spin_unlock(&tree->lock);
 	return bitset;
 }
 EXPORT_SYMBOL(test_range_bit);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0577e77e661..068bad46338 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1282,8 +1282,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 /*
- * extent_io.c submission hook. This does the right thing for csum calculation on write,
- * or reading the csums from the tree before a read
+ * extent_io.c submission hook. This does the right thing for csum calculation
+ * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
@@ -1292,11 +1292,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 
+	skip_sum = btrfs_test_flag(inode, NODATASUM);
+
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	skip_sum = btrfs_test_flag(inode, NODATASUM);
-
 	if (!(rw & (1 << BIO_RW))) {
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
@@ -1648,13 +1648,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 			      failrec->logical, failrec->len);
 	failrec->last_mirror++;
 	if (!state) {
-		spin_lock_irq(&BTRFS_I(inode)->io_tree.lock);
+		spin_lock(&BTRFS_I(inode)->io_tree.lock);
 		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
 						    failrec->start,
 						    EXTENT_LOCKED);
 		if (state && state->start != failrec->start)
 			state = NULL;
-		spin_unlock_irq(&BTRFS_I(inode)->io_tree.lock);
+		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 	}
 	if (!state || failrec->last_mirror > num_copies) {
 		set_state_private(failure_tree, failrec->start, 0);
-- 
cgit v1.2.3


From b16281c30c841c6d999ff36c8d903f42a69315f2 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh Weinraub <yehudasa@gmail.com>
Date: Wed, 17 Dec 2008 10:21:26 -0500
Subject: Btrfs: fix return value from btrfs_listxattr when buffer size is too
 small

The return value was being overwritten.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
---
 fs/btrfs/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index adb4b32a9d5..4146f0710e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -226,7 +226,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 
 		if (!buffer || (name_len + 1) > size_left) {
 			ret = -ERANGE;
-			break;
+			goto err;
 		}
 
 		name_ptr = (unsigned long)(di + 1);
-- 
cgit v1.2.3


From ab67b7c1f780a8a321fe7ee49117775009350fb3 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 19 Dec 2008 10:58:39 -0500
Subject: Btrfs: Add missing mnt_drop_write in ioctl.c

This patch adds the missing mnt_drop_write to match
mnt_want_write in btrfs_ioctl_defrag and btrfs_ioctl_clone

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ioctl.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5d67858ce99..ab429fe0fa0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -646,6 +646,7 @@ static int btrfs_ioctl_defrag(struct file *file)
 		break;
 	}
 
+	mnt_drop_write(file->f_path.mnt);
 	return 0;
 }
 
@@ -730,8 +731,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		return ret;
 
 	src_file = fget(srcfd);
-	if (!src_file)
-		return -EBADF;
+	if (!src_file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
@@ -982,6 +985,8 @@ out_unlock:
 	btrfs_free_path(path);
 out_fput:
 	fput(src_file);
+out_drop_write:
+	mnt_drop_write(file->f_path.mnt);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 34bf63c4ddddd92bfba3387d134c37bf4426b2ce Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 19 Dec 2008 10:58:46 -0500
Subject: Btrfs: properly update block accounting for metadata

This adds the missing block accounting code to finish_current_insert and makes
block accounting for root item properly protected by the delalloc spin lock.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ef2a2be268..274bb91efa2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1051,11 +1051,11 @@ search:
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - bytes_freed);
-		spin_unlock(&info->delalloc_lock);
 
 		root_used = btrfs_root_used(&extent_root->root_item);
 		btrfs_set_root_used(&extent_root->root_item,
 				    root_used - bytes_freed);
+		spin_unlock(&info->delalloc_lock);
 
 		/* delete the items */
 		ret = btrfs_del_items(trans, extent_root, path,
@@ -2242,6 +2242,7 @@ again:
 				  extent_op->bytenr + extent_op->num_bytes - 1,
 				  EXTENT_WRITEBACK, GFP_NOFS);
 		if (extent_op->del) {
+			u64 used;
 			list_del_init(&extent_op->list);
 			unlock_extent(&info->extent_ins, extent_op->bytenr,
 				      extent_op->bytenr + extent_op->num_bytes
@@ -2253,6 +2254,15 @@ again:
 					     extent_op->num_bytes, 0);
 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
 
+			spin_lock(&info->delalloc_lock);
+			used = btrfs_super_bytes_used(&info->super_copy);
+			btrfs_set_super_bytes_used(&info->super_copy,
+					used - extent_op->num_bytes);
+			used = btrfs_root_used(&extent_root->root_item);
+			btrfs_set_root_used(&extent_root->root_item,
+					used - extent_op->num_bytes);
+			spin_unlock(&info->delalloc_lock);
+
 			ret = update_block_group(trans, extent_root,
 						 extent_op->bytenr,
 						 extent_op->num_bytes,
@@ -2467,12 +2477,12 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - num_bytes);
-		spin_unlock(&info->delalloc_lock);
 
 		/* block accounting for root item */
 		root_used = btrfs_root_used(&root->root_item);
 		btrfs_set_root_used(&root->root_item,
 					   root_used - num_bytes);
+		spin_unlock(&info->delalloc_lock);
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
@@ -3154,11 +3164,11 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	spin_lock(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
-	spin_unlock(&info->delalloc_lock);
 
 	/* block accounting for root item */
 	root_used = btrfs_root_used(&root->root_item);
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
+	spin_unlock(&info->delalloc_lock);
 
 	if (root == extent_root) {
 		struct pending_extent_op *extent_op;
-- 
cgit v1.2.3


From 1f80e4db0fcb3bdc2be51389baf558a9519682f0 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 19 Dec 2008 10:59:04 -0500
Subject: Btrfs: set EXTENT_BOUNDARY bit before marking extent delalloc.

There is a race in relocate_inode_pages; it happens when
find_delalloc_range finds the delalloc extent before the
boundary bit is set. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 274bb91efa2..fe0e59ab33c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3994,10 +3994,10 @@ again:
 		}
 		set_page_extent_mapped(page);
 
-		btrfs_set_extent_delalloc(inode, page_start, page_end);
 		if (i == first_index)
 			set_extent_bits(io_tree, page_start, page_end,
 					EXTENT_BOUNDARY, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
 
 		set_page_dirty(page);
 		total_dirty++;
@@ -4405,7 +4405,7 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 		path->slots[0]++;
 	}
 
-	WARN_ON(cur_pos + offset > last_byte);
+	BUG_ON(cur_pos + offset > last_byte);
 	if (cur_pos + offset < last_byte) {
 		ret = -ENOENT;
 		goto out;
@@ -5712,7 +5712,6 @@ next:
 	if (pass == 0) {
 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
-		WARN_ON(reloc_inode->i_mapping->nrpages);
 	}
 
 	if (total_found > 0) {
-- 
cgit v1.2.3


From b34b086c1c1d934c5314d46ba25ccfa9acc471ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 19 Dec 2008 15:43:22 -0500
Subject: Btrfs: Fix compile warning around num_online_cpus() in a min
 statement

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 04f8d7080b1..40a540f3116 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1528,7 +1528,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
 
-	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
+	fs_info->thread_pool_size = min_t(unsigned long,
+					  num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
 	spin_lock_init(&fs_info->ordered_extent_lock);
-- 
cgit v1.2.3


From 7bbe5b5aa6d1e38af6f1fc866efc0aa461d73f19 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 9 Dec 2008 11:02:51 -0500
Subject: UBIFS: use PAGE_CACHE_MASK correctly

It has high bits set, not low bits set as the UBIFS code
assumed.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d975..7f1de98e609 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -254,7 +254,7 @@ static int write_begin_slow(struct address_space *mapping,
 	}
 
 	if (!PageUptodate(page)) {
-		if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
 			SetPageChecked(page);
 		else {
 			err = do_readpage(page);
@@ -444,7 +444,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
 			/*
 			 * We change whole page so no need to load it. But we
 			 * have to set the @PG_checked flag to make the further
-- 
cgit v1.2.3


From 24fa9e9438b263600737c839b36543981d87d65b Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Wed, 17 Dec 2008 17:45:14 +0200
Subject: UBIFS: fix tnc dumping

debugfs tnc dumping was broken because of an obvious typo.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 934db1855f0..367d97520d9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2443,7 +2443,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 		spin_lock(&c->space_lock);
 		dbg_dump_budg(c);
 		spin_unlock(&c->space_lock);
-	} else if (file->f_path.dentry == d->dump_budg) {
+	} else if (file->f_path.dentry == d->dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
 		dbg_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
-- 
cgit v1.2.3


From 21a60258976227daaf7a4c35e96c3d77d4988b15 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 12 Dec 2008 11:13:17 -0500
Subject: UBIFS: improve budgeting dump

Dump available space calculated by budgeting subsystem.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c |  4 ++--
 fs/ubifs/debug.c  | 13 +++++++++++++
 fs/ubifs/ubifs.h  |  2 +-
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 1a4973e1066..d5a65037e17 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -713,8 +713,8 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
  * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
  * are able to write a file of size N. UBIFS attaches node headers to each data
  * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
+ * overhead, and UBIFS has to report sligtly less free space to meet the above
+ * expectetions.
  *
  * This function assumes free space is made up of uncompressed data nodes and
  * full index nodes (one per data node, tripled because we always allow enough
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 367d97520d9..6ecb01a99d1 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -597,7 +597,9 @@ void dbg_dump_budg(struct ubifs_info *c)
 	struct rb_node *rb;
 	struct ubifs_bud *bud;
 	struct ubifs_gced_idx_leb *idx_gc;
+	long long available, outstanding, free;
 
+	ubifs_assert(spin_is_locked(&c->space_lock));
 	spin_lock(&dbg_lock);
 	printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
 	       "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -630,6 +632,17 @@ void dbg_dump_budg(struct ubifs_info *c)
 		printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
 		       idx_gc->lnum, idx_gc->unmap);
 	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+
+	/* Print budgeting predictions */
+	available = ubifs_calc_available(c, c->min_idx_lebs);
+	outstanding = c->budg_data_growth + c->budg_dd_growth;
+	if (available > outstanding)
+		free = ubifs_reported_space(c, available - outstanding);
+	else
+		free = 0;
+	printk(KERN_DEBUG "Budgeting predictions:\n");
+	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
+	       available, outstanding, free);
 	spin_unlock(&dbg_lock);
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 055c6b52d2f..e61c08106b4 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -419,7 +419,7 @@ struct ubifs_unclean_leb {
  *
  * LPROPS_UNCAT: not categorized
  * LPROPS_DIRTY: dirty > 0, not index
- * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
+ * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
  * LPROPS_FREE: free > 0, not empty, not index
  * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
  * LPROPS_EMPTY: LEB is empty, not taken
-- 
cgit v1.2.3


From d3cf502b6ccee1c52890d42cd18cbc98b7526126 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 16 Dec 2008 17:52:35 +0200
Subject: UBIFS: various comment improvements and fixes

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lprops.c | 12 ++++++------
 fs/ubifs/ubifs.h  | 34 ++++++++++++++++++----------------
 2 files changed, 24 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 10ba663eb32..dfd2bcece27 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
  * @flags: new flags
  * @idx_gc_cnt: change to the count of idx_gc list
  *
- * This function changes LEB properties. This function does not change a LEB
- * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
+ * This function changes LEB properties (@free, @dirty or @flag). However, the
+ * property which has the %LPROPS_NC value is not changed. Returns a pointer to
+ * the updated LEB properties on success and a negative error code on failure.
  *
- * This function returns a pointer to the updated LEB properties on success
- * and a negative error code on failure. N.B. the LEB properties may have had to
- * be copied (due to COW) and consequently the pointer returned may not be the
- * same as the pointer passed.
+ * Note, the LEB properties may have had to be copied (due to COW) and
+ * consequently the pointer returned may not be the same as the pointer
+ * passed.
  */
 const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e61c08106b4..f8ef7c1def1 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -482,24 +482,26 @@ struct ubifs_lpt_lprops {
  * @empty_lebs: number of empty LEBs
  * @taken_empty_lebs: number of taken LEBs
  * @idx_lebs: number of indexing LEBs
- * @total_free: total free space in bytes
- * @total_dirty: total dirty space in bytes
- * @total_used: total used space in bytes (includes only data LEBs)
- * @total_dead: total dead space in bytes (includes only data LEBs)
- * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @total_free: total free space in bytes (includes all LEBs)
+ * @total_dirty: total dirty space in bytes (includes all LEBs)
+ * @total_used: total used space in bytes (does not include index LEBs)
+ * @total_dead: total dead space in bytes (does not include index LEBs)
+ * @total_dark: total dark space in bytes (does not include index LEBs)
  *
- * N.B. total_dirty and total_used are different to other total_* fields,
- * because they account _all_ LEBs, not just data LEBs.
+ * The @taken_empty_lebs field counts the LEBs that are in the transient state
+ * of having been "taken" for use but not yet written to. @taken_empty_lebs is
+ * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
+ * used by itself (in which case 'unused_lebs' would be a better name). In the
+ * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
+ * by GC, but unlike other empty LEBs that are "taken", it may not be written
+ * straight away (i.e. before the next commit start or unmount), so either
+ * @gc_lnum must be specially accounted for, or the current approach followed
+ * i.e. count it under @taken_empty_lebs.
  *
- * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
- * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
- * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
- * by itself (in which case 'unused_lebs' would be a better name). In the case
- * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
- * but unlike other empty LEBs that are 'taken', it may not be written straight
- * away (i.e. before the next commit start or unmount), so either gc_lnum must
- * be specially accounted for, or the current approach followed i.e. count it
- * under 'taken_empty_lebs'.
+ * @empty_lebs includes @taken_empty_lebs.
+ *
+ * @total_used, @total_dead and @total_dark fields do not account indexing
+ * LEBs.
  */
 struct ubifs_lp_stats {
 	int empty_lebs;
-- 
cgit v1.2.3


From af14a1ad792621942a03e4bd0e5f17b6e177e2e0 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 19 Dec 2008 19:26:29 +0200
Subject: UBIFS: fix available blocks count

Take into account that 2 eraseblocks are never available because
they are reserved for the index. This gives a more realistic count
of FS blocks.

To avoid future confusions like this, introduce a constant.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 9 ++-------
 fs/ubifs/super.c  | 5 +++--
 fs/ubifs/ubifs.h  | 8 ++++++++
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index d5a65037e17..e4234254700 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -280,13 +280,8 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 	 * extra LEB to compensate.
 	 */
 	ret += 1;
-	/*
-	 * At present the index needs at least 2 LEBs: one for the index head
-	 * and one for in-the-gaps method (which currently does not cater for
-	 * the index head and so excludes it from consideration).
-	 */
-	if (ret < 2)
-		ret = 2;
+	if (ret < MIN_INDEX_LEBS)
+		ret = MIN_INDEX_LEBS;
 	return ret;
 }
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 2dbaa4fc2cb..a6a7798d020 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -695,9 +695,10 @@ static int init_constants_late(struct ubifs_info *c)
 	 * necessary to report something for the 'statfs()' call.
 	 *
 	 * Subtract the LEB reserved for GC, the LEB which is reserved for
-	 * deletions, and assume only one journal head is available.
+	 * deletions, minimum LEBs for the index, and assume only one journal
+	 * head is available.
 	 */
-	tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+	tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
 	tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
 	tmp64 = ubifs_reported_space(c, tmp64);
 	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f8ef7c1def1..543e850022e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
 #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
 #define SQNUM_WATERMARK      0xFFFFFFFFFF000000ULL
 
+/*
+ * Minimum amount of LEBs reserved for the index. At present the index needs at
+ * least 2 LEBs: one for the index head and one for in-the-gaps method (which
+ * currently does not cater for the index head and so excludes it from
+ * consideration).
+ */
+#define MIN_INDEX_LEBS 2
+
 /* Minimum amount of data UBIFS writes to the flash */
 #define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
 
-- 
cgit v1.2.3


From 4d61db4f87b527734ac0cc830dda8fcc4e2add2f Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 18 Dec 2008 14:06:51 +0200
Subject: UBIFS: use nicer 64-bit math

Instead of using do_div(), use better primitives from
linux/math64.h.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 25 +++++++++++--------------
 fs/ubifs/debug.c  |  1 +
 fs/ubifs/lpt.c    | 15 ++++++---------
 fs/ubifs/sb.c     | 10 +++++-----
 fs/ubifs/super.c  | 12 ++++++------
 fs/ubifs/ubifs.h  |  2 +-
 6 files changed, 30 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index e4234254700..0bcb8031ca1 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,7 +32,7 @@
 
 #include "ubifs.h"
 #include <linux/writeback.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
 
 /*
  * When pessimistic budget calculations say that there is no enough space,
@@ -258,8 +258,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
  */
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 {
-	int ret;
-	uint64_t idx_size;
+	int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+	long long idx_size;
 
 	idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
 
@@ -271,18 +271,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
 	 * pair, nor similarly the two variables for the new index size, so we
 	 * have to do this costly 64-bit division on fast-path.
 	 */
-	if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
-		ret = idx_size + 1;
-	else
-		ret = idx_size;
+	idx_size += eff_leb_size - 1;
+	idx_lebs = div_u64(idx_size, eff_leb_size);
 	/*
 	 * The index head is not available for the in-the-gaps method, so add an
 	 * extra LEB to compensate.
 	 */
-	ret += 1;
-	if (ret < MIN_INDEX_LEBS)
-		ret = MIN_INDEX_LEBS;
-	return ret;
+	idx_lebs += 1;
+	if (idx_lebs < MIN_INDEX_LEBS)
+		idx_lebs = MIN_INDEX_LEBS;
+	return idx_lebs;
 }
 
 /**
@@ -718,7 +716,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
  * Note, the calculation is pessimistic, which means that most of the time
  * UBIFS reports less space than it actually has.
  */
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 {
 	int divisor, factor, f;
 
@@ -740,8 +738,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
 	divisor = UBIFS_MAX_DATA_NODE_SZ;
 	divisor += (c->max_idx_node_sz * 3) / (f - 1);
 	free *= factor;
-	do_div(free, divisor);
-	return free;
+	return div_u64(free, divisor);
 }
 
 /**
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 6ecb01a99d1..a2be11584ad 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -33,6 +33,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/debugfs.h>
+#include <linux/math64.h>
 
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 6d914160ec5..b2792e84d24 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -43,8 +43,9 @@
  * mounted.
  */
 
-#include <linux/crc16.h>
 #include "ubifs.h"
+#include <linux/crc16.h>
+#include <linux/math64.h>
 
 /**
  * do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
 int ubifs_calc_lpt_geom(struct ubifs_info *c)
 {
 	int lebs_needed;
-	uint64_t sz;
+	long long sz;
 
 	do_calc_lpt_geom(c);
 
 	/* Verify that lpt_lebs is big enough */
 	sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
-	sz += c->leb_size - 1;
-	do_div(sz, c->leb_size);
-	lebs_needed = sz;
+	lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
 	if (lebs_needed > c->lpt_lebs) {
 		ubifs_err("too few LPT LEBs");
 		return -EINVAL;
@@ -175,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
 			      int *big_lpt)
 {
 	int i, lebs_needed;
-	uint64_t sz;
+	long long sz;
 
 	/* Start by assuming the minimum number of LPT LEBs */
 	c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -202,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
 	/* Now check there are enough LPT LEBs */
 	for (i = 0; i < 64 ; i++) {
 		sz = c->lpt_sz * 4; /* Allow 4 times the size */
-		sz += c->leb_size - 1;
-		do_div(sz, c->leb_size);
-		lebs_needed = sz;
+		lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
 		if (lebs_needed > c->lpt_lebs) {
 			/* Not enough LPT LEBs so try again with more */
 			c->lpt_lebs = lebs_needed;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index c5da201ab54..e070c643d1b 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
 
 #include "ubifs.h"
 #include <linux/random.h>
+#include <linux/math64.h>
 
 /*
  * Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
 	int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
 	int min_leb_cnt = UBIFS_MIN_LEB_CNT;
-	uint64_t tmp64, main_bytes;
+	long long tmp64, main_bytes;
 	__le64 tmp_le64;
 
 	/* Some functions called from here depend on the @c->key_len filed */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
 	if (!sup)
 		return -ENOMEM;
 
-	tmp64 = (uint64_t)max_buds * c->leb_size;
+	tmp64 = (long long)max_buds * c->leb_size;
 	if (big_lpt)
 		sup_flags |= UBIFS_FLG_BIGLPT;
 
@@ -187,9 +188,8 @@ static int create_default_filesystem(struct ubifs_info *c)
 
 	generate_random_uuid(sup->uuid);
 
-	main_bytes = (uint64_t)main_lebs * c->leb_size;
-	tmp64 = main_bytes * DEFAULT_RP_PERCENT;
-	do_div(tmp64, 100);
+	main_bytes = (long long)main_lebs * c->leb_size;
+	tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
 	if (tmp64 > DEFAULT_MAX_RP_SIZE)
 		tmp64 = DEFAULT_MAX_RP_SIZE;
 	sup->rp_size = cpu_to_le64(tmp64);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a6a7798d020..c3cefc84137 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,7 @@
 #include <linux/parser.h>
 #include <linux/seq_file.h>
 #include <linux/mount.h>
+#include <linux/math64.h>
 #include "ubifs.h"
 
 /*
@@ -612,7 +613,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
 static int init_constants_late(struct ubifs_info *c)
 {
 	int tmp, err;
-	uint64_t tmp64;
+	long long tmp64;
 
 	c->main_bytes = (long long)c->main_lebs * c->leb_size;
 	c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -639,9 +640,8 @@ static int init_constants_late(struct ubifs_info *c)
 	 * Make sure that the log is large enough to fit reference nodes for
 	 * all buds plus one reserved LEB.
 	 */
-	tmp64 = c->max_bud_bytes;
-	tmp = do_div(tmp64, c->leb_size);
-	c->max_bud_cnt = tmp64 + !!tmp;
+	tmp64 = c->max_bud_bytes + c->leb_size - 1;
+	c->max_bud_cnt = div_u64(tmp64, c->leb_size);
 	tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
 	tmp /= c->leb_size;
 	tmp += 1;
@@ -677,7 +677,7 @@ static int init_constants_late(struct ubifs_info *c)
 	 * Consequently, if the journal is too small, UBIFS will treat it as
 	 * always full.
 	 */
-	tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
+	tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
 	if (c->bg_bud_bytes < tmp64)
 		c->bg_bud_bytes = tmp64;
 	if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -699,7 +699,7 @@ static int init_constants_late(struct ubifs_info *c)
 	 * head is available.
 	 */
 	tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
-	tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
+	tmp64 *= (long long)c->leb_size - c->leb_overhead;
 	tmp64 = ubifs_reported_space(c, tmp64);
 	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 543e850022e..a17dd794ae9 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1498,7 +1498,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 long long ubifs_get_free_space(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
+long long ubifs_reported_space(const struct ubifs_info *c, long long free);
 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
 
 /* find.c */
-- 
cgit v1.2.3


From 650ed50f4298e76007070b7ab9d640dfe7228ab3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 22 Dec 2008 11:09:04 +0200
Subject: UBIFS: re-calculate min_idx_lebs after the commit

When we commit, but before we try to write anything to the flash
media, @c->min_idx_lebs is inaccurate, because we do not re-calculate
it after the commit. Do not forget to do this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/tnc_commit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 3c0af452887..fde8d127c76 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
 	 * budgeting subsystem to assume the index is already committed,
 	 * even though it is not.
 	 */
+	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	c->old_idx_sz = c->calc_idx_sz;
 	c->budg_uncommitted_idx = 0;
+	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 	spin_unlock(&c->space_lock);
 	mutex_unlock(&c->tnc_mutex);
 
-- 
cgit v1.2.3


From c8f915913afdfe1a796e312e21658b8edcf20868 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 19 Dec 2008 16:11:13 +0200
Subject: UBIFS: avoid unnecessary calculations

Do not re-calculate min_idx_lebs in ubifs_get_free_space(), because
the value is already available in c->min_idx_lebs.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 0bcb8031ca1..44cff803171 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -763,7 +763,8 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 	long long available, outstanding, free;
 
 	spin_lock(&c->space_lock);
-	min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+	min_idx_lebs = c->min_idx_lebs;
+	ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
 
 	/*
-- 
cgit v1.2.3


From d6d7b702a3a1ca50f7ca2bebaa79c80425156bac Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 12 Nov 2008 16:49:48 -0600
Subject: dlm: fix up memory allocation flags

Use ls_allocation for memory allocations, which a cluster fs sets to
GFP_NOFS.  Use GFP_NOFS for allocations when no lockspace struct is
available.  Taking dlm locks needs to avoid calling back into the
cluster fs because write-out can require taking dlm locks.

Cc: Christine Caulfield <ccaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 1 +
 fs/dlm/memory.c   | 6 +++---
 fs/dlm/midcomms.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 3962262f991..1e720316300 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con)
 	con->sock->sk->sk_write_space = lowcomms_write_space;
 	con->sock->sk->sk_state_change = lowcomms_state_change;
 	con->sock->sk->sk_user_data = con;
+	con->sock->sk->sk_allocation = GFP_NOFS;
 	return 0;
 }
 
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index 54c14c6d06c..c1775b84eba 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
 {
 	char *p;
 
-	p = kzalloc(ls->ls_lvblen, GFP_KERNEL);
+	p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
 	return p;
 }
 
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
 
 	DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
 
-	r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL);
+	r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
 	return r;
 }
 
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
 
-	lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL);
+	lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
 	return lkb;
 }
 
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 07ac709f3ed..f3396c622ae 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base,
 		   ordinary messages). */
 
 		if (msglen > sizeof(__tmp) && p == &__tmp.p) {
-			p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
+			p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
 			if (p == NULL)
 				return ret;
 		}
-- 
cgit v1.2.3


From cd8e4679bdcf9b54564f2cda2389bd0f0457e12d Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Wed, 12 Nov 2008 16:28:43 -0600
Subject: dlm: trivial annotation of be16 value

fs/dlm/dir.c:419:14: warning: incorrect type in assignment (different base types)
fs/dlm/dir.c:419:14:    expected unsigned short [unsigned] [addressable] [assigned] [usertype] be_namelen
fs/dlm/dir.c:419:14:    got restricted __be16 [usertype] <noident>

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dir.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 85defeb64df..92969f879a1 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 	struct list_head *list;
 	struct dlm_rsb *r;
 	int offset = 0, dir_nodeid;
-	uint16_t be_namelen;
+	__be16 be_namelen;
 
 	down_read(&ls->ls_root_sem);
 
@@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 
 		if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) {
 			/* Write end-of-block record */
-			be_namelen = 0;
-			memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-			offset += sizeof(uint16_t);
+			be_namelen = cpu_to_be16(0);
+			memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+			offset += sizeof(__be16);
 			goto out;
 		}
 
 		be_namelen = cpu_to_be16(r->res_length);
-		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-		offset += sizeof(uint16_t);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
 		memcpy(outbuf + offset, r->res_name, r->res_length);
 		offset += r->res_length;
 	}
@@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
 
 	if ((list == &ls->ls_root_list) &&
 	    (offset + sizeof(uint16_t) <= outlen)) {
-		be_namelen = 0xFFFF;
-		memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t));
-		offset += sizeof(uint16_t);
+		be_namelen = cpu_to_be16(0xFFFF);
+		memcpy(outbuf + offset, &be_namelen, sizeof(__be16));
+		offset += sizeof(__be16);
 	}
 
  out:
-- 
cgit v1.2.3


From 1521848cbb42935a52d11305c054b14461ad061c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 12 Nov 2008 17:00:16 -0600
Subject: dlm: remove kmap/kunmap

The pages used in lowcomms are not highmem, so kmap is not necessary.

Cc: Christine Caulfield <ccaulfie@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 1e720316300..103a5ebd137 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -824,7 +824,6 @@ static void sctp_init_assoc(struct connection *con)
 	len = e->len;
 	offset = e->offset;
 	spin_unlock(&con->writequeue_lock);
-	kmap(e->page);
 
 	/* Send the first block off the write queue */
 	iov[0].iov_base = page_address(e->page)+offset;
@@ -855,7 +854,6 @@ static void sctp_init_assoc(struct connection *con)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
-			kunmap(e->page);
 			free_entry(e);
 		}
 		spin_unlock(&con->writequeue_lock);
@@ -1204,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 
 	if (e) {
 	got_one:
-		if (users == 0)
-			kmap(e->page);
 		*ppc = page_address(e->page) + offset;
 		return e;
 	}
@@ -1234,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh)
 	if (users)
 		goto out;
 	e->len = e->end - e->offset;
-	kunmap(e->page);
 	spin_unlock(&con->writequeue_lock);
 
 	if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
@@ -1273,7 +1268,6 @@ static void send_to_sock(struct connection *con)
 		offset = e->offset;
 		BUG_ON(len == 0 && e->users == 0);
 		spin_unlock(&con->writequeue_lock);
-		kmap(e->page);
 
 		ret = 0;
 		if (len) {
@@ -1295,7 +1289,6 @@ static void send_to_sock(struct connection *con)
 
 		if (e->len == 0 && e->users == 0) {
 			list_del(&e->list);
-			kunmap(e->page);
 			free_entry(e);
 			continue;
 		}
-- 
cgit v1.2.3


From d61e9aac96317a43c192f1faabfa95d4d675b7ce Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Dec 2008 09:31:02 -0600
Subject: dlm: replace schedule with cond_resched

This is a one-liner to use cond_resched() rather than schedule()
in the ast delivery loop. It should not be necessary to schedule
every time, so this will save some cpu time while continuing to
allow scheduling when required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 8bf31e3fbf0..30c11f3855b 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -101,7 +101,7 @@ static void process_asts(void)
 		   and may result in the lkb being freed */
 		dlm_put_lkb(lkb);
 
-		schedule();
+		cond_resched();
 	}
 }
 
-- 
cgit v1.2.3


From 03339696314fffb95dafb349b84243358e945ce6 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 8 Dec 2008 17:14:10 -0600
Subject: dlm: remove extra blocking callback check

Just before delivering a blocking callback (bast), the dlm_astd
thread checks again that the granted mode of the lkb actually
blocks the mode requested by the bast.  The idea behind this was
originally that the granted mode may have changed since the bast
was queued, making the callback now unnecessary.  Reasons for
removing this extra check are:
- dlm_astd doesn't lock the rsb before reading the lkb grmode, so
  it's not technically safe (this removes the long standing FIXME)
- after running some tests, it doesn't appear the check ever actually
  eliminates a bast
- delivering an unnecessary blocking callback isn't a bad thing and
  can happen anyway

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 30c11f3855b..09b167df790 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -89,13 +89,8 @@ static void process_asts(void)
 		if ((type & AST_COMP) && cast)
 			cast(lkb->lkb_astparam);
 
-		/* FIXME: Is it safe to look at lkb_grmode here
-		   without doing a lock_rsb() ?
-		   Look at other checks in v1 to avoid basts. */
-
 		if ((type & AST_BAST) && bast)
-			if (!dlm_modes_compat(lkb->lkb_grmode, bmode))
-				bast(lkb->lkb_astparam, bmode);
+			bast(lkb->lkb_astparam, bmode);
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
-- 
cgit v1.2.3


From fd22a51bcc0b7b76fc729b02316214fd979f9fe1 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 9 Dec 2008 11:55:46 -0600
Subject: dlm: improve bast mode handling

The lkb bastmode value is set in the context of processing the
lock, and read by the dlm_astd thread.  Because it's accessed
in these two separate contexts, the writing/reading ought to
be done under a lock.  This is simple to do by setting it and
reading it when the lkb is added to and removed from dlm_astd's
callback list which is properly locked.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c  | 14 ++++++++------
 fs/dlm/ast.h  |  4 ++--
 fs/dlm/lock.c |  8 +++-----
 fs/dlm/user.c |  4 +++-
 fs/dlm/user.h |  2 +-
 5 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 09b167df790..fbe840d0949 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
 	spin_unlock(&ast_queue_lock);
 }
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
 	if (lkb->lkb_flags & DLM_IFL_USER) {
-		dlm_user_add_ast(lkb, type);
+		dlm_user_add_ast(lkb, type, bastmode);
 		return;
 	}
 
@@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type)
 		list_add_tail(&lkb->lkb_astqueue, &ast_queue);
 	}
 	lkb->lkb_ast_type |= type;
+	if (bastmode)
+		lkb->lkb_bastmode = bastmode;
 	spin_unlock(&ast_queue_lock);
 
 	set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -59,7 +61,7 @@ static void process_asts(void)
 	struct dlm_lkb *lkb;
 	void (*cast) (void *astparam);
 	void (*bast) (void *astparam, int mode);
-	int type = 0, found, bmode;
+	int type = 0, found, bastmode;
 
 	for (;;) {
 		found = 0;
@@ -74,6 +76,7 @@ static void process_asts(void)
 			list_del(&lkb->lkb_astqueue);
 			type = lkb->lkb_ast_type;
 			lkb->lkb_ast_type = 0;
+			bastmode = lkb->lkb_bastmode;
 			found = 1;
 			break;
 		}
@@ -84,13 +87,12 @@ static void process_asts(void)
 
 		cast = lkb->lkb_astfn;
 		bast = lkb->lkb_bastfn;
-		bmode = lkb->lkb_bastmode;
 
 		if ((type & AST_COMP) && cast)
 			cast(lkb->lkb_astparam);
 
 		if ((type & AST_BAST) && bast)
-			bast(lkb->lkb_astparam, bmode);
+			bast(lkb->lkb_astparam, bastmode);
 
 		/* this removes the reference added by dlm_add_ast
 		   and may result in the lkb being freed */
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 6ee276c74c5..1b5fc5f428f 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -13,7 +13,7 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 void dlm_del_ast(struct dlm_lkb *lkb);
 
 void dlm_astd_wake(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 724ddac9153..7b758dadbdd 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 	lkb->lkb_lksb->sb_status = rv;
 	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
 
-	dlm_add_ast(lkb, AST_COMP);
+	dlm_add_ast(lkb, AST_COMP, 0);
 }
 
 static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -320,10 +320,8 @@ static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
 	if (is_master_copy(lkb))
 		send_bast(r, lkb, rqmode);
-	else {
-		lkb->lkb_bastmode = rqmode;
-		dlm_add_ast(lkb, AST_BAST);
-	}
+	else
+		dlm_add_ast(lkb, AST_BAST, rqmode);
 }
 
 /*
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index b3832c67194..065149e84f4 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
 /* we could possibly check if the cancel of an orphan has resulted in the lkb
    being removed and then remove that lkb from the orphans list and free it */
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
 {
 	struct dlm_ls *ls;
 	struct dlm_user_args *ua;
@@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
 
 	ast_type = lkb->lkb_ast_type;
 	lkb->lkb_ast_type |= type;
+	if (bastmode)
+		lkb->lkb_bastmode = bastmode;
 
 	if (!ast_type) {
 		kref_get(&lkb->lkb_ref);
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index 35eb6a13d61..1c968649228 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,7 @@
 #ifndef __USER_DOT_H__
 #define __USER_DOT_H__
 
-void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
+void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
 int dlm_user_init(void);
 void dlm_user_exit(void);
 int dlm_device_deregister(struct dlm_ls *ls);
-- 
cgit v1.2.3


From eeda418d8c2646f33f24e9ad33d86c239adc6de7 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 9 Dec 2008 14:12:21 -0600
Subject: dlm: change lock time stamping

Use ktime instead of jiffies for timestamping lkb's.  Also stamp the
time on every lkb whenever it's added to a resource queue, instead of
just stamping locks subject to timeouts.  This will allow us to use
timestamps more widely for debugging all locks.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c     | 14 +++++++-------
 fs/dlm/dlm_internal.h |  2 +-
 fs/dlm/lock.c         | 21 +++++++++++----------
 fs/dlm/netlink.c      |  1 -
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8fc24f4507a..19e4f9eb44e 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -162,21 +162,21 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
 
 static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
 {
-	unsigned int waiting = 0;
-	uint64_t xid = 0;
+	u64 xid = 0;
+	u64 us;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
 
-	if (lkb->lkb_timestamp)
-		waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp);
+	/* microseconds since lkb was added to current queue */
+	us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp));
 
-	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms
+	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
 	   r_nodeid r_len r_name */
 
-	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n",
+	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
 		   lkb->lkb_id,
 		   lkb->lkb_nodeid,
 		   lkb->lkb_remid,
@@ -187,7 +187,7 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
 		   lkb->lkb_status,
 		   lkb->lkb_grmode,
 		   lkb->lkb_rqmode,
-		   waiting,
+		   (unsigned long long)us,
 		   r->res_nodeid,
 		   r->res_length,
 		   r->res_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 868e4c9ef12..e69135c83d5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,7 +245,7 @@ struct dlm_lkb {
 	struct list_head	lkb_astqueue;	/* need ast to be sent */
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
-	unsigned long		lkb_timestamp;
+	ktime_t			lkb_timestamp;
 	unsigned long		lkb_timeout_cs;
 
 	char			*lkb_lvbptr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 7b758dadbdd..dfc57ae2704 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -742,6 +742,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
 
 	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
 
+	lkb->lkb_timestamp = ktime_get();
+
 	lkb->lkb_status = status;
 
 	switch (status) {
@@ -1011,10 +1013,8 @@ static void add_timeout(struct dlm_lkb *lkb)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 
-	if (is_master_copy(lkb)) {
-		lkb->lkb_timestamp = jiffies;
+	if (is_master_copy(lkb))
 		return;
-	}
 
 	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
 	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
@@ -1029,7 +1029,6 @@ static void add_timeout(struct dlm_lkb *lkb)
 	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
 	mutex_lock(&ls->ls_timeout_mutex);
 	hold_lkb(lkb);
-	lkb->lkb_timestamp = jiffies;
 	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
 	mutex_unlock(&ls->ls_timeout_mutex);
 }
@@ -1057,6 +1056,7 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 	struct dlm_rsb *r;
 	struct dlm_lkb *lkb;
 	int do_cancel, do_warn;
+	s64 wait_us;
 
 	for (;;) {
 		if (dlm_locking_stopped(ls))
@@ -1067,14 +1067,15 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 		mutex_lock(&ls->ls_timeout_mutex);
 		list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
 
+			wait_us = ktime_to_us(ktime_sub(ktime_get(),
+					      		lkb->lkb_timestamp));
+
 			if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
-			    time_after_eq(jiffies, lkb->lkb_timestamp +
-					  lkb->lkb_timeout_cs * HZ/100))
+			    wait_us >= (lkb->lkb_timeout_cs * 10000))
 				do_cancel = 1;
 
 			if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
-			    time_after_eq(jiffies, lkb->lkb_timestamp +
-				   	   dlm_config.ci_timewarn_cs * HZ/100))
+			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
 				do_warn = 1;
 
 			if (!do_cancel && !do_warn)
@@ -1120,12 +1121,12 @@ void dlm_scan_timeout(struct dlm_ls *ls)
 void dlm_adjust_timeouts(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb;
-	long adj = jiffies - ls->ls_recover_begin;
+	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
 
 	ls->ls_recover_begin = 0;
 	mutex_lock(&ls->ls_timeout_mutex);
 	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
-		lkb->lkb_timestamp += adj;
+		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
 	mutex_unlock(&ls->ls_timeout_mutex);
 }
 
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 18bda83cc89..46e582c8d60 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
 	data->status = lkb->lkb_status;
 	data->grmode = lkb->lkb_grmode;
 	data->rqmode = lkb->lkb_rqmode;
-	data->timestamp = lkb->lkb_timestamp;
 	if (lkb->lkb_ua)
 		data->xid = lkb->lkb_ua->xid;
 	if (r) {
-- 
cgit v1.2.3


From e3a84ad495d1fddb542e0922160f0194a1361950 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 9 Dec 2008 14:47:29 -0600
Subject: dlm: add time stamp of blocking callback

Record the time the latest blocking callback was queued for
a lock.  This will be used for debugging in combination with
lock queue timestamp changes in the previous patch.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h | 1 +
 fs/dlm/lock.c         | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e69135c83d5..0c488295192 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -245,6 +245,7 @@ struct dlm_lkb {
 	struct list_head	lkb_astqueue;	/* need ast to be sent */
 	struct list_head	lkb_ownqueue;	/* list of locks for a process */
 	struct list_head	lkb_time_list;
+	ktime_t			lkb_time_bast;	/* for debugging */
 	ktime_t			lkb_timestamp;
 	unsigned long		lkb_timeout_cs;
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index dfc57ae2704..6cfe65bbf4a 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -318,6 +318,8 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
 
 static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 {
+	lkb->lkb_time_bast = ktime_get();
+
 	if (is_master_copy(lkb))
 		send_bast(r, lkb, rqmode);
 	else
-- 
cgit v1.2.3


From d022509d1c54be4918e7fc8f1195ee8c392e9a57 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Tue, 16 Dec 2008 14:53:23 -0600
Subject: dlm: add new debugfs entry

The new debugfs entry dumps all rsb and lkb structures, and includes
a lot more information than has been available before.  This includes
the new timestamps added by a previous patch for debugging callback
issues.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c     | 296 +++++++++++++++++++++++++++++++++++++++++---------
 fs/dlm/dlm_internal.h |   1 +
 2 files changed, 247 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 19e4f9eb44e..2f107d1a6a4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -27,7 +27,7 @@ static struct dentry *dlm_root;
 
 struct rsb_iter {
 	int entry;
-	int locks;
+	int format;
 	int header;
 	struct dlm_ls *ls;
 	struct list_head *next;
@@ -60,8 +60,8 @@ static char *print_lockmode(int mode)
 	}
 }
 
-static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
-				struct dlm_rsb *res)
+static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *res)
 {
 	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
 
@@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	seq_printf(s, "\n");
 }
 
-static int print_resource(struct dlm_rsb *res, struct seq_file *s)
+static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
@@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
 	/* Print the locks attached to this resource */
 	seq_printf(s, "Granted Queue\n");
 	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	seq_printf(s, "Conversion Queue\n");
 	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	seq_printf(s, "Waiting Queue\n");
 	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
-		print_resource_lock(s, lkb, res);
+		print_format1_lock(s, lkb, res);
 
 	if (list_empty(&res->res_lookup))
 		goto out;
@@ -160,7 +160,8 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s)
 	return 0;
 }
 
-static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r)
+static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       struct dlm_rsb *r)
 {
 	u64 xid = 0;
 	u64 us;
@@ -193,20 +194,108 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *
 		   r->res_name);
 }
 
-static int print_locks(struct dlm_rsb *r, struct seq_file *s)
+static int print_format2(struct dlm_rsb *r, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 
 	lock_rsb(r);
 
 	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
 
 	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
 
 	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_lock(s, lkb, r);
+		print_format2_lock(s, lkb, r);
+
+	unlock_rsb(r);
+	return 0;
+}
+
+static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			       int rsb_lookup)
+{
+	u64 xid = 0;
+
+	if (lkb->lkb_flags & DLM_IFL_USER) {
+		if (lkb->lkb_ua)
+			xid = lkb->lkb_ua->xid;
+	}
+
+	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+		   lkb->lkb_id,
+		   lkb->lkb_nodeid,
+		   lkb->lkb_remid,
+		   lkb->lkb_ownpid,
+		   (unsigned long long)xid,
+		   lkb->lkb_exflags,
+		   lkb->lkb_flags,
+		   lkb->lkb_status,
+		   lkb->lkb_grmode,
+		   lkb->lkb_rqmode,
+		   lkb->lkb_highbast,
+		   rsb_lookup,
+		   lkb->lkb_wait_type,
+		   lkb->lkb_lvbseq,
+		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+		   (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+}
+
+static int print_format3(struct dlm_rsb *r, struct seq_file *s)
+{
+	struct dlm_lkb *lkb;
+	int i, lvblen = r->res_ls->ls_lvblen;
+	int print_name = 1;
+
+	lock_rsb(r);
+
+	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+		   r,
+		   r->res_nodeid,
+		   r->res_first_lkid,
+		   r->res_flags,
+		   !list_empty(&r->res_root_list),
+		   !list_empty(&r->res_recover_list),
+		   r->res_recover_locks_count,
+		   r->res_length);
+
+	for (i = 0; i < r->res_length; i++) {
+		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
+			print_name = 0;
+	}
+
+	seq_printf(s, "%s", print_name ? "str " : "hex");
+
+	for (i = 0; i < r->res_length; i++) {
+		if (print_name)
+			seq_printf(s, "%c", r->res_name[i]);
+		else
+			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
+	}
+	seq_printf(s, "\n");
+
+	if (!r->res_lvbptr)
+		goto do_locks;
+
+	seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen);
+
+	for (i = 0; i < lvblen; i++)
+		seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
+	seq_printf(s, "\n");
+
+ do_locks:
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
+		print_format3_lock(s, lkb, 0);
+
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
+		print_format3_lock(s, lkb, 1);
 
 	unlock_rsb(r);
 	return 0;
@@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
 				break;
 			}
 			read_unlock(&ls->ls_rsbtbl[i].lock);
-                }
+		}
 		ri->entry = i;
 
 		if (ri->entry >= ls->ls_rsbtbl_size)
@@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri)
 			read_unlock(&ls->ls_rsbtbl[i].lock);
 			dlm_put_rsb(old);
 			goto top;
-                }
+		}
 		ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
 		dlm_hold_rsb(ri->rsb);
 		read_unlock(&ls->ls_rsbtbl[i].lock);
@@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
 	ri->ls = ls;
 	ri->entry = 0;
 	ri->next = NULL;
+	ri->format = 1;
 
 	if (rsb_iter_next(ri)) {
 		rsb_iter_free(ri);
@@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
 {
 	struct rsb_iter *ri = iter_ptr;
 
-	if (ri->locks) {
+	switch (ri->format) {
+	case 1:
+		print_format1(ri->rsb, file);
+		break;
+	case 2:
 		if (ri->header) {
-			seq_printf(file, "id nodeid remid pid xid exflags flags "
-					 "sts grmode rqmode time_ms r_nodeid "
-					 "r_len r_name\n");
+			seq_printf(file, "id nodeid remid pid xid exflags "
+					 "flags sts grmode rqmode time_ms "
+					 "r_nodeid r_len r_name\n");
 			ri->header = 0;
 		}
-		print_locks(ri->rsb, file);
-	} else {
-		print_resource(ri->rsb, file);
+		print_format2(ri->rsb, file);
+		break;
+	case 3:
+		if (ri->header) {
+			seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+			ri->header = 0;
+		}
+		print_format3(ri->rsb, file);
+		break;
 	}
 
 	return 0;
@@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
 	ri->ls = ls;
 	ri->entry = 0;
 	ri->next = NULL;
-	ri->locks = 1;
+	ri->format = 2;
 
 	if (*pos == 0)
 		ri->header = 1;
@@ -447,6 +547,84 @@ static const struct file_operations locks_fops = {
 	.release = seq_release
 };
 
+/*
+ * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
+ * This can replace both formats 1 and 2 eventually.
+ */
+
+static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
+{
+	struct rsb_iter *ri;
+
+	ri = kzalloc(sizeof *ri, GFP_KERNEL);
+	if (!ri)
+		return NULL;
+
+	ri->ls = ls;
+	ri->entry = 0;
+	ri->next = NULL;
+	ri->format = 3;
+
+	if (*pos == 0)
+		ri->header = 1;
+
+	if (rsb_iter_next(ri)) {
+		rsb_iter_free(ri);
+		return NULL;
+	}
+
+	return ri;
+}
+
+static void *all_seq_start(struct seq_file *file, loff_t *pos)
+{
+	struct rsb_iter *ri;
+	loff_t n = *pos;
+
+	ri = all_iter_init(file->private, pos);
+	if (!ri)
+		return NULL;
+
+	while (n--) {
+		if (rsb_iter_next(ri)) {
+			rsb_iter_free(ri);
+			return NULL;
+		}
+	}
+
+	return ri;
+}
+
+static struct seq_operations all_seq_ops = {
+	.start = all_seq_start,
+	.next  = rsb_seq_next,
+	.stop  = rsb_seq_stop,
+	.show  = rsb_seq_show,
+};
+
+static int all_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int ret;
+
+	ret = seq_open(file, &all_seq_ops);
+	if (ret)
+		return ret;
+
+	seq = file->private_data;
+	seq->private = inode->i_private;
+
+	return 0;
+}
+
+static const struct file_operations all_fops = {
+	.owner   = THIS_MODULE,
+	.open    = all_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
 /*
  * dump lkb's on the ls_waiters list
  */
@@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = {
 	.read    = waiters_read
 };
 
+void dlm_delete_debug_file(struct dlm_ls *ls)
+{
+	if (ls->ls_debug_rsb_dentry)
+		debugfs_remove(ls->ls_debug_rsb_dentry);
+	if (ls->ls_debug_waiters_dentry)
+		debugfs_remove(ls->ls_debug_waiters_dentry);
+	if (ls->ls_debug_locks_dentry)
+		debugfs_remove(ls->ls_debug_locks_dentry);
+	if (ls->ls_debug_all_dentry)
+		debugfs_remove(ls->ls_debug_all_dentry);
+}
+
 int dlm_create_debug_file(struct dlm_ls *ls)
 {
 	char name[DLM_LOCKSPACE_LEN+8];
 
+	/* format 1 */
+
 	ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name,
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
 						      &rsb_fops);
 	if (!ls->ls_debug_rsb_dentry)
-		return -ENOMEM;
+		goto fail;
 
-	memset(name, 0, sizeof(name));
-	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
-
-	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
-							  S_IFREG | S_IRUGO,
-							  dlm_root,
-							  ls,
-							  &waiters_fops);
-	if (!ls->ls_debug_waiters_dentry) {
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-		return -ENOMEM;
-	}
+	/* format 2 */
 
 	memset(name, 0, sizeof(name));
 	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name);
@@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 							dlm_root,
 							ls,
 							&locks_fops);
-	if (!ls->ls_debug_locks_dentry) {
-		debugfs_remove(ls->ls_debug_waiters_dentry);
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-		return -ENOMEM;
-	}
+	if (!ls->ls_debug_locks_dentry)
+		goto fail;
+
+	/* format 3 */
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name);
+
+	ls->ls_debug_all_dentry = debugfs_create_file(name,
+						      S_IFREG | S_IRUGO,
+						      dlm_root,
+						      ls,
+						      &all_fops);
+	if (!ls->ls_debug_all_dentry)
+		goto fail;
+
+	memset(name, 0, sizeof(name));
+	snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name);
+
+	ls->ls_debug_waiters_dentry = debugfs_create_file(name,
+							  S_IFREG | S_IRUGO,
+							  dlm_root,
+							  ls,
+							  &waiters_fops);
+	if (!ls->ls_debug_waiters_dentry)
+		goto fail;
 
 	return 0;
-}
 
-void dlm_delete_debug_file(struct dlm_ls *ls)
-{
-	if (ls->ls_debug_rsb_dentry)
-		debugfs_remove(ls->ls_debug_rsb_dentry);
-	if (ls->ls_debug_waiters_dentry)
-		debugfs_remove(ls->ls_debug_waiters_dentry);
-	if (ls->ls_debug_locks_dentry)
-		debugfs_remove(ls->ls_debug_locks_dentry);
+ fail:
+	dlm_delete_debug_file(ls);
+	return -ENOMEM;
 }
 
 int __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 0c488295192..ef2f1e35396 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -482,6 +482,7 @@ struct dlm_ls {
 	struct dentry		*ls_debug_rsb_dentry; /* debugfs */
 	struct dentry		*ls_debug_waiters_dentry; /* debugfs */
 	struct dentry		*ls_debug_locks_dentry; /* debugfs */
+	struct dentry		*ls_debug_all_dentry; /* debugfs */
 
 	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
 	int			ls_uevent_result;
-- 
cgit v1.2.3


From 722d74219ea21223c74e5e894b0afcc5e4ca75a7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 23 Dec 2008 10:22:56 -0600
Subject: dlm: fs/dlm/ast.c: fix warning

fs/dlm/ast.c: In function 'dlm_astd':
fs/dlm/ast.c:64: warning: 'bastmode' may be used uninitialized in this function

Cleans code up.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 39 +++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index fbe840d0949..dc2ad6008b2 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -61,30 +61,23 @@ static void process_asts(void)
 	struct dlm_lkb *lkb;
 	void (*cast) (void *astparam);
 	void (*bast) (void *astparam, int mode);
-	int type = 0, found, bastmode;
-
-	for (;;) {
-		found = 0;
-		spin_lock(&ast_queue_lock);
-		list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
-			r = lkb->lkb_resource;
-			ls = r->res_ls;
-
-			if (dlm_locking_stopped(ls))
-				continue;
-
-			list_del(&lkb->lkb_astqueue);
-			type = lkb->lkb_ast_type;
-			lkb->lkb_ast_type = 0;
-			bastmode = lkb->lkb_bastmode;
-			found = 1;
-			break;
-		}
-		spin_unlock(&ast_queue_lock);
+	int type = 0, bastmode;
+
+repeat:
+	spin_lock(&ast_queue_lock);
+	list_for_each_entry(lkb, &ast_queue, lkb_astqueue) {
+		r = lkb->lkb_resource;
+		ls = r->res_ls;
 
-		if (!found)
-			break;
+		if (dlm_locking_stopped(ls))
+			continue;
 
+		list_del(&lkb->lkb_astqueue);
+		type = lkb->lkb_ast_type;
+		lkb->lkb_ast_type = 0;
+		bastmode = lkb->lkb_bastmode;
+
+		spin_unlock(&ast_queue_lock);
 		cast = lkb->lkb_astfn;
 		bast = lkb->lkb_bastfn;
 
@@ -99,7 +92,9 @@ static void process_asts(void)
 		dlm_put_lkb(lkb);
 
 		cond_resched();
+		goto repeat;
 	}
+	spin_unlock(&ast_queue_lock);
 }
 
 static inline int no_asts(void)
-- 
cgit v1.2.3


From 26ddd8d5cac8a563953d5febe8c6e40909f7bce1 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 26 Dec 2008 14:24:10 +0900
Subject: proc: remove ifdef CONFIG_SPARSE_IRQ from stat.c

Impact: cleanup

irq_desc can be NULL when CONFIG_SPARSE_IRQ=y only.
therefore, NULL checking can move into kstat_irqs_cpu() of SPARSE_IRQ version.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: "Yinghai Lu" <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/proc/stat.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 3bb1cf1e742..f75efa22df5 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/irqnr.h>
 #include <asm/cputime.h>
 
 #ifndef arch_irq_stat_cpu
@@ -45,10 +46,6 @@ static int show_stat(struct seq_file *p, void *v)
 		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
 		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
 		for_each_irq_nr(j) {
-#ifdef CONFIG_SPARSE_IRQ
-			if (!irq_to_desc(j))
-				continue;
-#endif
 			sum += kstat_irqs_cpu(j, i);
 		}
 		sum += arch_irq_stat_cpu(i);
@@ -95,12 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
 	/* sum again ? it could be updated? */
 	for_each_irq_nr(j) {
 		per_irq_sum = 0;
-#ifdef CONFIG_SPARSE_IRQ
-		if (!irq_to_desc(j)) {
-			seq_printf(p, " %u", per_irq_sum);
-			continue;
-		}
-#endif
 		for_each_possible_cpu(i)
 			per_irq_sum += kstat_irqs_cpu(j, i);
 
-- 
cgit v1.2.3


From cb78a0ce69fad2026825f957e24e2d9cda1ec9f1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 30 Dec 2008 09:05:14 +1030
Subject: bitmap: fix seq_bitmap and seq_cpumask to take const pointer

Impact: cleanup

seq_bitmap just calls bitmap_scnprintf on the bits: that arg can be const.
Similarly, seq_cpumask just calls seq_bitmap.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 fs/seq_file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c2..c99358a5217 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -462,7 +462,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 	return -1;
 }
 
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+				   unsigned int nr_bits)
 {
 	if (m->count < m->size) {
 		int len = bitmap_scnprintf(m->buf + m->count,
-- 
cgit v1.2.3


From 79807d075ab8d1ca3574f5f52421e0047c1f1256 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sat, 27 Dec 2008 19:18:00 +0200
Subject: UBIFS: fix constants initialization

The c->min_idx_lebs constant depends on c->old_idx_sz, which
is read from the master node. This means that we have to
initialize c->min_idx_lebs only after we have read the master
node.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c3cefc84137..13097830e8b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -602,7 +602,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
 }
 
 /*
- * init_constants_late - initialize UBIFS constants.
+ * init_constants_sb - initialize UBIFS constants.
  * @c: UBIFS file-system description object
  *
  * This is a helper function which initializes various UBIFS constants after
@@ -610,7 +610,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
  * makes sure they are all right. Returns zero in case of success and a
  * negative error code in case of failure.
  */
-static int init_constants_late(struct ubifs_info *c)
+static int init_constants_sb(struct ubifs_info *c)
 {
 	int tmp, err;
 	long long tmp64;
@@ -687,6 +687,21 @@ static int init_constants_late(struct ubifs_info *c)
 	if (err)
 		return err;
 
+	return 0;
+}
+
+/*
+ * init_constants_master - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the master node has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right.
+ */
+static void init_constants_master(struct ubifs_info *c)
+{
+	long long tmp64;
+
 	c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
 
 	/*
@@ -702,8 +717,6 @@ static int init_constants_late(struct ubifs_info *c)
 	tmp64 *= (long long)c->leb_size - c->leb_overhead;
 	tmp64 = ubifs_reported_space(c, tmp64);
 	c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
-
-	return 0;
 }
 
 /**
@@ -1138,7 +1151,7 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_free;
 	}
 
-	err = init_constants_late(c);
+	err = init_constants_sb(c);
 	if (err)
 		goto out_free;
 
@@ -1172,6 +1185,8 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (err)
 		goto out_master;
 
+	init_constants_master(c);
+
 	if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
 		ubifs_msg("recovery needed");
 		c->need_recovery = 1;
-- 
cgit v1.2.3


From 304d427cd99eb645b44b08d77e70ce308e6bcd8c Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 08:04:17 +0200
Subject: UBIFS: fix file-system synchronization

Argh. The ->sync_fs call is called _before_ all inodes are flushed.
This means we first sync write buffers and commit, then all
inodes are synced, and we end up with unflushed write buffers!

Fix this by forcing syncing of all inodes from 'ubifs_sync_fs()'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 13097830e8b..471301799c5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -35,6 +35,7 @@
 #include <linux/seq_file.h>
 #include <linux/mount.h>
 #include <linux/math64.h>
+#include <linux/writeback.h>
 #include "ubifs.h"
 
 /*
@@ -431,6 +432,23 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	struct ubifs_info *c = sb->s_fs_info;
 	int i, ret = 0, err;
 	long long bud_bytes;
+	struct writeback_control wbc = {
+		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.range_start = 0,
+		.range_end   = LLONG_MAX,
+		.nr_to_write = LONG_MAX,
+	};
+
+	/*
+	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
+	 * pages, so synchronize them first, then commit the journal. Strictly
+	 * speaking, it is not necessary to commit the journal here,
+	 * synchronizing write-buffers would be enough. But committing makes
+	 * UBIFS free space predictions much more accurate, so we want to let
+	 * the user be able to get more accurate results of 'statfs()' after
+	 * they synchronize the file system.
+	 */
+	generic_sync_sb_inodes(sb, &wbc);
 
 	if (c->jheads) {
 		for (i = 0; i < c->jhead_cnt; i++) {
-- 
cgit v1.2.3


From f10383006c26b33539820759b9dc8656497b02a4 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 08:16:32 +0200
Subject: UBIFS: always commit in sync_fs

Always run commit in sync_fs, because even if the journal seems
to be almost empty, there may be a deletion which removes a large
file, which affects the index greatly. And because we want
better free space predictions after 'sync_fs()', we have to
commit.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 471301799c5..ee8e7749eae 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -429,9 +429,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
 
 static int ubifs_sync_fs(struct super_block *sb, int wait)
 {
+	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
-	int i, ret = 0, err;
-	long long bud_bytes;
 	struct writeback_control wbc = {
 		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
 		.range_start = 0,
@@ -439,6 +438,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 		.nr_to_write = LONG_MAX,
 	};
 
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	/*
+	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
+	 * do this if it waits for an already running commit.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
 	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
@@ -450,30 +462,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 */
 	generic_sync_sb_inodes(sb, &wbc);
 
-	if (c->jheads) {
-		for (i = 0; i < c->jhead_cnt; i++) {
-			err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
-			if (err && !ret)
-				ret = err;
-		}
-
-		/* Commit the journal unless it has too little data */
-		spin_lock(&c->buds_lock);
-		bud_bytes = c->bud_bytes;
-		spin_unlock(&c->buds_lock);
-		if (bud_bytes > c->leb_size) {
-			err = ubifs_run_commit(c);
-			if (err)
-				return err;
-		}
-	}
+	err = ubifs_run_commit(c);
+	if (err)
+		return err;
 
 	/*
 	 * We ought to call sync for c->ubi but it does not have one. If it had
 	 * it would in turn call mtd->sync, however mtd operations are
 	 * synchronous anyway, so we don't lose any sleep here.
 	 */
-	return ret;
+	return err;
 }
 
 /**
-- 
cgit v1.2.3


From cb5c6a2b2be59b480a3746c5113cb3411c053bff Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 08:18:43 +0200
Subject: UBIFS: use ubi_sync

UBI now has (fake for now, though) synchronization call - use
it.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ee8e7749eae..a14703e0a9a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -466,12 +466,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	if (err)
 		return err;
 
-	/*
-	 * We ought to call sync for c->ubi but it does not have one. If it had
-	 * it would in turn call mtd->sync, however mtd operations are
-	 * synchronous anyway, so we don't lose any sleep here.
-	 */
-	return err;
+	return ubi_sync(c->vi.ubi_num);
 }
 
 /**
-- 
cgit v1.2.3


From 26d05777b0a23062a39e83c369c0a3583918f164 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 09:11:02 +0200
Subject: UBIFS: always commit on unmount

UBIFS commits on unmount to make the next mount faster. Currently,
it commits only if there is more than LEB size bytes in the
journal. This is not very good, because journal size may be
large (512KiB). And there may be few deletions in the journal
which do not take much journal space, but which do introduce
a lot of TNC changes and make mount slow.

Thus, just remove this condition and always commit.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a14703e0a9a..1c1bbe4135c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1570,20 +1570,24 @@ out:
  * @c: UBIFS file-system description object
  *
  * This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled. It also avoids
- * committing the journal if it contains too few data.
+ * the journal unless the "fast unmount" mode is enabled.
  */
 static void commit_on_unmount(struct ubifs_info *c)
 {
-	if (!c->fast_unmount) {
-		long long bud_bytes;
+	struct super_block *sb = c->vfs_sb;
+	long long bud_bytes;
 
-		spin_lock(&c->buds_lock);
-		bud_bytes = c->bud_bytes;
-		spin_unlock(&c->buds_lock);
-		if (bud_bytes > c->leb_size)
-			ubifs_run_commit(c);
-	}
+	/*
+	 * This function is called before the background thread is stopped, so
+	 * we may race with ongoing commit, which means we have to take
+	 * @c->buds_lock to access @c->bud_bytes.
+	 */
+	spin_lock(&c->buds_lock);
+	bud_bytes = c->bud_bytes;
+	spin_unlock(&c->buds_lock);
+
+	if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
+		ubifs_run_commit(c);
 }
 
 /**
@@ -2009,7 +2013,7 @@ static void ubifs_kill_sb(struct super_block *sb)
 	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
 	 * in order to be outside BKL.
 	 */
-	if (sb->s_root && !(sb->s_flags & MS_RDONLY))
+	if (sb->s_root)
 		commit_on_unmount(c);
 	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
-- 
cgit v1.2.3


From 6edbfafda682b30ad984964cc432da6fa1c8fab5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 30 Dec 2008 20:06:49 +0200
Subject: UBIFS: restore budg_uncommitted_idx

UBIFS stores uncommitted index size in c->budg_uncommitted_idx,
and this affects budgeting calculations. When mounting and
replaying, this variable is not updated, so we may end up
with "over-budgeting". This patch fixes the issue.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/replay.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c30..ce42a7b0ca5 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
 		/*
 		 * If the replay order was perfect the dirty space would now be
 		 * zero. The order is not perfect because the the journal heads
-		 * race with eachother. This is not a problem but is does mean
+		 * race with each other. This is not a problem but is does mean
 		 * that the dirty space may temporarily exceed c->leb_size
 		 * during the replay.
 		 */
@@ -656,7 +656,7 @@ out_dump:
  * @dirty: amount of dirty space from padding and deletion nodes
  *
  * This function inserts a reference node to the replay tree and returns zero
- * in case of success ort a negative error code in case of failure.
+ * in case of success or a negative error code in case of failure.
  */
 static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
 			   unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
 		 * This means that we reached end of log and now
 		 * look to the older log data, which was already
 		 * committed but the eraseblock was not erased (UBIFS
-		 * only unmaps it). So this basically means we have to
+		 * only un-maps it). So this basically means we have to
 		 * exit with "end of log" code.
 		 */
 		err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
 	if (err)
 		goto out;
 
+	/*
+	 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
+	 * to roughly estimate index growth. Things like @c->min_idx_lebs
+	 * depend on it. This means we have to initialize it to make sure
+	 * budgeting works properly.
+	 */
+	c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+	c->budg_uncommitted_idx *= c->max_idx_node_sz;
+
 	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
 	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
 		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
-- 
cgit v1.2.3


From 2edc2025c2583a18eafe5cdbc7deb36e320aaec5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 22 Dec 2008 11:21:03 +0200
Subject: UBIFS: do not lie about used blocks

Do not force UBIFS to return 0 used space when it is empty. It leads
to a situation when creating any file immediately produces tens of
used blocks, which looks very weird. It is better to be honest and
say that some blocks are used even if the FS is empty. And ext2
does the same.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 10 ----------
 1 file changed, 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 44cff803171..3715d011495 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -766,16 +766,6 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 	min_idx_lebs = c->min_idx_lebs;
 	ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-
-	/*
-	 * Force the amount available to the total size reported if the used
-	 * space is zero.
-	 */
-	if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
-		spin_unlock(&c->space_lock);
-		return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
-	}
-
 	available = ubifs_calc_available(c, min_idx_lebs);
 
 	/*
-- 
cgit v1.2.3


From 2acf80675800d5e6775990d1280cca5c2ffb30e6 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 9 Dec 2008 11:04:40 -0500
Subject: UBIFS: simplify make_free_space

The 'make_free_space()' function was too complex and this patch
simplifies it. It also fixes a bug - the freespace test failed
straight away on UBI volumes with 512 bytes LEB size.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c | 151 ++++++++++++++++++------------------------------------
 1 file changed, 49 insertions(+), 102 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 3715d011495..4d270f0a856 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -37,13 +37,10 @@
 /*
  * When pessimistic budget calculations say that there is no enough space,
  * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
- * or committing. The below constants define maximum number of times UBIFS
+ * or committing. The below constant defines maximum number of times UBIFS
  * repeats the operations.
  */
-#define MAX_SHRINK_RETRIES 8
-#define MAX_GC_RETRIES     4
-#define MAX_CMT_RETRIES    2
-#define MAX_NOSPC_RETRIES  1
+#define MAX_MKSPC_RETRIES 3
 
 /*
  * The below constant defines amount of dirty pages which should be written
@@ -51,30 +48,6 @@
  */
 #define NR_TO_WRITE 16
 
-/**
- * struct retries_info - information about re-tries while making free space.
- * @prev_liability: previous liability
- * @shrink_cnt: how many times the liability was shrinked
- * @shrink_retries: count of liability shrink re-tries (increased when
- *                  liability does not shrink)
- * @try_gc: GC should be tried first
- * @gc_retries: how many times GC was run
- * @cmt_retries: how many times commit has been done
- * @nospc_retries: how many times GC returned %-ENOSPC
- *
- * Since we consider budgeting to be the fast-path, and this structure has to
- * be allocated on stack and zeroed out, we make it smaller using bit-fields.
- */
-struct retries_info {
-	long long prev_liability;
-	unsigned int shrink_cnt;
-	unsigned int shrink_retries:5;
-	unsigned int try_gc:1;
-	unsigned int gc_retries:4;
-	unsigned int cmt_retries:3;
-	unsigned int nospc_retries:1;
-};
-
 /**
  * shrink_liability - write-back some dirty pages/inodes.
  * @c: UBIFS file-system description object
@@ -146,10 +119,26 @@ static int run_gc(struct ubifs_info *c)
 	return 0;
 }
 
+/**
+ * get_liability - calculate current liability.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns current UBIFS liability, i.e. the
+ * amount of bytes UBIFS has "promised" to write to the media.
+ */
+static long long get_liability(struct ubifs_info *c)
+{
+	long long liab;
+
+	spin_lock(&c->space_lock);
+	liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+	spin_unlock(&c->space_lock);
+	return liab;
+}
+
 /**
  * make_free_space - make more free space on the file-system.
  * @c: UBIFS file-system description object
- * @ri: information about previous invocations of this function
  *
  * This function is called when an operation cannot be budgeted because there
  * is supposedly no free space. But in most cases there is some free space:
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
  * Returns %-ENOSPC if it couldn't do more free space, and other negative error
  * codes on failures.
  */
-static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
+static int make_free_space(struct ubifs_info *c)
 {
-	int err;
-
-	/*
-	 * If we have some dirty pages and inodes (liability), try to write
-	 * them back unless this was tried too many times without effect
-	 * already.
-	 */
-	if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
-		long long liability;
-
-		spin_lock(&c->space_lock);
-		liability = c->budg_idx_growth + c->budg_data_growth +
-			    c->budg_dd_growth;
-		spin_unlock(&c->space_lock);
+	int err, retries = 0;
+	long long liab1, liab2;
 
-		if (ri->prev_liability >= liability) {
-			/* Liability does not shrink, next time try GC then */
-			ri->shrink_retries += 1;
-			if (ri->gc_retries < MAX_GC_RETRIES)
-				ri->try_gc = 1;
-			dbg_budg("liability did not shrink: retries %d of %d",
-				 ri->shrink_retries, MAX_SHRINK_RETRIES);
-		}
-
-		dbg_budg("force write-back (count %d)", ri->shrink_cnt);
-		shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
+	do {
+		liab1 = get_liability(c);
+		/*
+		 * We probably have some dirty pages or inodes (liability), try
+		 * to write them back.
+		 */
+		dbg_budg("liability %lld, run write-back", liab1);
+		shrink_liability(c, NR_TO_WRITE);
 
-		ri->prev_liability = liability;
-		ri->shrink_cnt += 1;
-		return -EAGAIN;
-	}
+		liab2 = get_liability(c);
+		if (liab2 < liab1)
+			return -EAGAIN;
 
-	/*
-	 * Try to run garbage collector unless it was already tried too many
-	 * times.
-	 */
-	if (ri->gc_retries < MAX_GC_RETRIES) {
-		ri->gc_retries += 1;
-		dbg_budg("run GC, retries %d of %d",
-			 ri->gc_retries, MAX_GC_RETRIES);
+		dbg_budg("new liability %lld (not shrinked)", liab2);
 
-		ri->try_gc = 0;
+		/* Liability did not shrink again, try GC */
+		dbg_budg("Run GC");
 		err = run_gc(c);
 		if (!err)
 			return -EAGAIN;
 
-		if (err == -EAGAIN) {
-			dbg_budg("GC asked to commit");
-			err = ubifs_run_commit(c);
-			if (err)
-				return err;
-			return -EAGAIN;
-		}
-
-		if (err != -ENOSPC)
+		if (err != -EAGAIN && err != -ENOSPC)
+			/* Some real error happened */
 			return err;
 
-		/*
-		 * GC could not make any progress. If this is the first time,
-		 * then it makes sense to try to commit, because it might make
-		 * some dirty space.
-		 */
-		dbg_budg("GC returned -ENOSPC, retries %d",
-			 ri->nospc_retries);
-		if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
-			return err;
-		ri->nospc_retries += 1;
-	}
-
-	/* Neither GC nor write-back helped, try to commit */
-	if (ri->cmt_retries < MAX_CMT_RETRIES) {
-		ri->cmt_retries += 1;
-		dbg_budg("run commit, retries %d of %d",
-			 ri->cmt_retries, MAX_CMT_RETRIES);
+		dbg_budg("Run commit (retries %d)", retries);
 		err = ubifs_run_commit(c);
 		if (err)
 			return err;
-		return -EAGAIN;
-	}
+	} while (retries++ < MAX_MKSPC_RETRIES);
+
 	return -ENOSPC;
 }
 
@@ -523,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
 int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 {
 	int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
-	int err, idx_growth, data_growth, dd_growth;
-	struct retries_info ri;
+	int err, idx_growth, data_growth, dd_growth, retried = 0;
 
 	ubifs_assert(req->new_page <= 1);
 	ubifs_assert(req->dirtied_page <= 1);
@@ -542,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
 	if (!data_growth && !dd_growth)
 		return 0;
 	idx_growth = calc_idx_growth(c, req);
-	memset(&ri, 0, sizeof(struct retries_info));
 
 again:
 	spin_lock(&c->space_lock);
@@ -580,12 +522,17 @@ again:
 		return err;
 	}
 
-	err = make_free_space(c, &ri);
+	err = make_free_space(c);
+	cond_resched();
 	if (err == -EAGAIN) {
 		dbg_budg("try again");
-		cond_resched();
 		goto again;
 	} else if (err == -ENOSPC) {
+		if (!retried) {
+			retried = 1;
+			dbg_budg("-ENOSPC, but anyway try once again");
+			goto again;
+		}
 		dbg_budg("FS is full, -ENOSPC");
 		c->nospace = 1;
 		if (can_use_rp(c) || c->rp_size == 0)
-- 
cgit v1.2.3


From 6a4a9b438fe43397f4652853838f284cddd629b5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 11:00:55 +0200
Subject: UBIFS: fix sparse warnings

fs/ubifs/compress.c:111:8: warning: incorrect type in argument 5 (different signedness)
fs/ubifs/compress.c:111:8:    expected unsigned int *dlen
fs/ubifs/compress.c:111:8:    got int *out_len
fs/ubifs/compress.c:175:10: warning: incorrect type in argument 5 (different signedness)
fs/ubifs/compress.c:175:10:    expected unsigned int *dlen
fs/ubifs/compress.c:175:10:    got int *out_len

Fix this by adding a cast to (unsigned int *). We guarantee that
our lengths are small and no overflow is possible.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/compress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 4c90ee2aef4..11e4132f314 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
 	if (compr->comp_mutex)
 		mutex_lock(compr->comp_mutex);
 	err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
-				   out_len);
+				   (unsigned int *)out_len);
 	if (compr->comp_mutex)
 		mutex_unlock(compr->comp_mutex);
 	if (unlikely(err)) {
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
 	if (compr->decomp_mutex)
 		mutex_lock(compr->decomp_mutex);
 	err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
-				     out_len);
+				     (unsigned int *)out_len);
 	if (compr->decomp_mutex)
 		mutex_unlock(compr->decomp_mutex);
 	if (err)
-- 
cgit v1.2.3


From f92b982680e4b4149c559789a54e1e9db190752a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Sun, 28 Dec 2008 11:34:26 +0200
Subject: UBIFS: fix checkpatch.pl warnings

These are mostly long lines and wrong indentation warning
fixes. But also there are two volatile variables and
checkpatch.pl complains about them:

WARNING: Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt
+       volatile int gc_seq;

WARNING: Use of volatile is usually wrong: see Documentation/volatile-considered-harmful.txt
+       volatile int gced_lnum;

Well, we anyway use smp_wmb() for c->gc_seq and c->gced_lnum, so
these 'volatile' modifiers can be just dropped.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c      |  3 ++-
 fs/ubifs/file.c       |  4 ++--
 fs/ubifs/journal.c    |  2 +-
 fs/ubifs/lpt_commit.c |  2 +-
 fs/ubifs/tnc.c        | 31 +++++++++++++++----------------
 fs/ubifs/ubifs.h      |  8 ++++----
 6 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index a2be11584ad..350fedecc83 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -703,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
 	printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
 	printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
 	       c->nhead_lnum, c->nhead_offs);
-	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
+	       c->ltab_lnum, c->ltab_offs);
 	if (c->big_lpt)
 		printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
 		       c->lsave_lnum, c->lsave_offs);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 7f1de98e609..fe82d2464d4 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
 		return err;
 	}
 
-	ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum);
-
+	ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+		     ubifs_inode(inode)->creat_sqnum);
 	len = le32_to_cpu(dn->size);
 	if (len <= 0 || len > UBIFS_BLOCK_SIZE)
 		goto dump;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908e..3b0fa704d55 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
 	data_key_init(c, &key, inum, blk);
 
 	bit = old_size & (UBIFS_BLOCK_SIZE - 1);
-	blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
+	blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
 	data_key_init(c, &to_key, inum, blk);
 
 	err = ubifs_tnc_remove_range(c, &key, &to_key);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index da60b5a0fab..b8a06079423 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -548,7 +548,7 @@ static int write_cnodes(struct ubifs_info *c)
 no_space:
 	ubifs_err("LPT out of space mismatch");
 	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
-	        "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+		"%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
 	dbg_dump_lpt_info(c);
 	dbg_dump_lpt_lebs(c);
 	dump_stack();
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a14..f7e36f54552 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
 			if (found) {
 				/* Ensure the znode is dirtied */
 				if (znode->cnext || !ubifs_zn_dirty(znode)) {
-					    znode = dirty_cow_bottom_up(c,
-									znode);
-					    if (IS_ERR(znode)) {
-						    err = PTR_ERR(znode);
-						    goto out_unlock;
-					    }
+					znode = dirty_cow_bottom_up(c, znode);
+					if (IS_ERR(znode)) {
+						err = PTR_ERR(znode);
+						goto out_unlock;
+					}
 				}
 				zbr = &znode->zbranch[n];
 				lnc_free(zbr);
@@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
 
 		/* Ensure the znode is dirtied */
 		if (znode->cnext || !ubifs_zn_dirty(znode)) {
-			    znode = dirty_cow_bottom_up(c, znode);
-			    if (IS_ERR(znode)) {
-				    err = PTR_ERR(znode);
-				    goto out_unlock;
-			    }
+			znode = dirty_cow_bottom_up(c, znode);
+			if (IS_ERR(znode)) {
+				err = PTR_ERR(znode);
+				goto out_unlock;
+			}
 		}
 
 		if (found == 1) {
@@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
 
 		/* Ensure the znode is dirtied */
 		if (znode->cnext || !ubifs_zn_dirty(znode)) {
-			    znode = dirty_cow_bottom_up(c, znode);
-			    if (IS_ERR(znode)) {
-				    err = PTR_ERR(znode);
-				    goto out_unlock;
-			    }
+			znode = dirty_cow_bottom_up(c, znode);
+			if (IS_ERR(znode)) {
+				err = PTR_ERR(znode);
+				goto out_unlock;
+			}
 		}
 
 		/* Remove all keys in range except the first */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index a17dd794ae9..3275c89a358 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -481,8 +481,8 @@ struct ubifs_lprops {
 struct ubifs_lpt_lprops {
 	int free;
 	int dirty;
-	unsigned tgc : 1;
-	unsigned cmt : 1;
+	unsigned tgc:1;
+	unsigned cmt:1;
 };
 
 /**
@@ -1322,8 +1322,8 @@ struct ubifs_info {
 	void *sbuf;
 	struct list_head idx_gc;
 	int idx_gc_cnt;
-	volatile int gc_seq;
-	volatile int gced_lnum;
+	int gc_seq;
+	int gced_lnum;
 
 	struct list_head infos_list;
 	struct mutex umount_mutex;
-- 
cgit v1.2.3


From a9f2fc0e251e71a51deb8059b181c375a4a5e979 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 23 Dec 2008 14:39:14 +0200
Subject: UBIFS: fix writing uncompressed files

UBIFS does not disable compression if ui->flags is non-zero, e.g.
if the file has the "sync" flag. This is because of a typo which
is fixed by this patch. The patch also adds a couple of useful
debugging prints.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/ioctl.c   | 2 ++
 fs/ubifs/journal.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe969..6db7a6be6c9 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC_GETFLAGS:
 		flags = ubifs2ioctl(ubifs_inode(inode)->flags);
 
+		dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
 		return put_user(flags, (int __user *) arg);
 
 	case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		err = mnt_want_write(file->f_path.mnt);
 		if (err)
 			return err;
+		dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
 		err = setflags(inode, flags);
 		mnt_drop_write(file->f_path.mnt);
 		return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 3b0fa704d55..10ae25b7d1d 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 	data->size = cpu_to_le32(len);
 	zero_data_node_unused(data);
 
-	if (!(ui->flags && UBIFS_COMPR_FL))
+	if (!(ui->flags & UBIFS_COMPR_FL))
 		/* Compression is disabled for this inode */
 		compr_type = UBIFS_COMPR_NONE;
 	else
-- 
cgit v1.2.3


From 57a450e95932f7798677885b8a01443aca72fdc7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 30 Dec 2008 16:23:34 +0200
Subject: UBIFS: allow mounting when short of space

It is fine if there is no free space - we should still allow mounting
this FS. This patch relaxes the free space requirements and adds info
dumps.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1c1bbe4135c..2c91d6fa4e0 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1073,6 +1073,30 @@ again:
 	}
 }
 
+/**
+ * check_free_space - check if there is enough free space to mount.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free space to be mounted in
+ * read/write mode. UBIFS must always have some free space to allow deletions.
+ */
+static int check_free_space(struct ubifs_info *c)
+{
+	ubifs_assert(c->dark_wm > 0);
+	if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
+		ubifs_err("insufficient free space to mount in read/write mode");
+		dbg_dump_budg(c);
+		dbg_dump_lprops(c);
+		/*
+		 * We return %-EINVAL instead of %-ENOSPC because it seems to
+		 * be the closest error code mentioned in the mount function
+		 * documentation.
+		 */
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /**
  * mount_ubifs - mount UBIFS file-system.
  * @c: UBIFS file-system description object
@@ -1154,7 +1178,7 @@ static int mount_ubifs(struct ubifs_info *c)
 
 	/*
 	 * Make sure the compressor which is set as default in the superblock
-	 * or overriden by mount options is actually compiled in.
+	 * or overridden by mount options is actually compiled in.
 	 */
 	if (!ubifs_compr_present(c->default_compr)) {
 		ubifs_err("'compressor \"%s\" is not compiled in",
@@ -1236,12 +1260,9 @@ static int mount_ubifs(struct ubifs_info *c)
 	if (!mounted_read_only) {
 		int lnum;
 
-		/* Check for enough free space */
-		if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
-			ubifs_err("insufficient available space");
-			err = -EINVAL;
+		err = check_free_space(c);
+		if (err)
 			goto out_orphans;
-		}
 
 		/* Check for enough log space */
 		lnum = c->lhead_lnum + 1;
@@ -1442,12 +1463,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
 
-	/* Check for enough free space */
-	if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
-		ubifs_err("insufficient available space");
-		err = -EINVAL;
+	err = check_free_space(c);
+	if (err)
 		goto out;
-	}
 
 	if (c->old_leb_cnt != c->leb_cnt) {
 		struct ubifs_sb_node *sup;
-- 
cgit v1.2.3


From 80736d41f895bc472b2433a1c27fa6d4afe6ca35 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 30 Dec 2008 17:44:02 +0200
Subject: UBIFS: fix numerous spelling mistakes

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c     | 14 +++++++-------
 fs/ubifs/lpt_commit.c |  8 ++++----
 fs/ubifs/ubifs.h      |  1 -
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4d270f0a856..31870d8dab8 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -652,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
  * user-space. User-space application tend to expect that if the file-system
  * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
  * are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS has to report sligtly less free space to meet the above
- * expectetions.
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the above
+ * expectations.
  *
  * This function assumes free space is made up of uncompressed data nodes and
  * full index nodes (one per data node, tripled because we always allow enough
@@ -677,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 	 * of data nodes, f - fanout. Because effective UBIFS fanout is twice
 	 * as less than maximum fanout, we assume that each data node
 	 * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
-	 * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+	 * Note, the multiplier 3 is because UBIFS reserves thrice as more space
 	 * for the index.
 	 */
 	f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -695,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * This function calculates amount of free space to report to user-space.
  *
  * Because UBIFS may introduce substantial overhead (the index, node headers,
- * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * alignment, wastage at the end of eraseblocks, etc), it cannot report real
  * amount of free flash space it has (well, because not all dirty space is
- * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectetion about what free space is. Users seem to
+ * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would bread user expectations about what free space is. Users seem to
  * accustomed to assume that if the file-system reports N bytes of free space,
  * they would be able to fit a file of N bytes to the FS. This almost works for
  * traditional file-systems, because they have way less overhead than UBIFS.
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index b8a06079423..96ca9570717 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -753,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
  * LPT trivial garbage collection is where a LPT LEB contains only dirty and
  * free space and so may be reused as soon as the next commit is completed.
  * This function is called after the commit is completed (master node has been
- * written) and unmaps LPT LEBs that were marked for trivial GC.
+ * written) and un-maps LPT LEBs that were marked for trivial GC.
  */
 static int lpt_tgc_end(struct ubifs_info *c)
 {
@@ -1467,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
 #ifdef CONFIG_UBIFS_FS_DEBUG
 
 /**
- * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
+ * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
  * @buf: buffer
  * @len: buffer length
  */
@@ -1492,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
 	struct ubifs_nnode *nnode;
 	int hght;
 
-	/* Entire tree is in memory so first_nnode / next_nnode are ok */
+	/* Entire tree is in memory so first_nnode / next_nnode are OK */
 	nnode = first_nnode(c, &hght);
 	for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
 		struct ubifs_nbranch *branch;
@@ -1837,7 +1837,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
  * This function dumps an LEB from LPT area. Nodes in this area are very
  * different to nodes in the main area (e.g., they do not have common headers,
  * they do not have 8-byte alignments, etc), so we have a separate function to
- * dump LPT area LEBs. Note, LPT has to be locked by the coller.
+ * dump LPT area LEBs. Note, LPT has to be locked by the caller.
  */
 static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
 {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3275c89a358..fc2a4cc66d0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1168,7 +1168,6 @@ struct ubifs_debug_info;
  * @mount_opts: UBIFS-specific mount options
  *
  * @dbg: debugging-related information
- * @dfs: debugfs support-related information
  */
 struct ubifs_info {
 	struct super_block *vfs_sb;
-- 
cgit v1.2.3


From 5d38b3ac78e0e0e420fba134716fc3d20e6b978a Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 30 Dec 2008 17:58:42 +0200
Subject: UBIFS: print debugging messages properly

We cannot use ubifs_err() macro with DBGKEY() and DBGKEY1(),
because this is racy and holding dbg_lock is needed. Use
dbg_err() instead, which does have the lock held.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 350fedecc83..792c5a16c18 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1010,20 +1010,20 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 	err = 1;
 	key_read(c, &dent1->key, &key);
 	if (keys_cmp(c, &zbr1->key, &key)) {
-		ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
-			  zbr1->offs, DBGKEY(&key));
-		ubifs_err("but it should have key %s according to tnc",
-			  DBGKEY(&zbr1->key));
+		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			zbr1->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr1->key));
 		dbg_dump_node(c, dent1);
 		goto out_free;
 	}
 
 	key_read(c, &dent2->key, &key);
 	if (keys_cmp(c, &zbr2->key, &key)) {
-		ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
-			  zbr1->offs, DBGKEY(&key));
-		ubifs_err("but it should have key %s according to tnc",
-			  DBGKEY(&zbr2->key));
+		dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+			zbr1->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr2->key));
 		dbg_dump_node(c, dent2);
 		goto out_free;
 	}
@@ -1037,9 +1037,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
 		goto out_free;
 	}
 	if (cmp == 0 && nlen1 == nlen2)
-		ubifs_err("2 xent/dent nodes with the same name");
+		dbg_err("2 xent/dent nodes with the same name");
 	else
-		ubifs_err("bad order of colliding key %s",
+		dbg_err("bad order of colliding key %s",
 			DBGKEY(&key));
 
 	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
-- 
cgit v1.2.3


From 8e5033adc78ff4fbeab7052134e7af1f6ff04187 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 30 Dec 2008 18:37:45 +0200
Subject: UBIFS: add more useful debugging prints

Print node sizes and maximum node sizes.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 2c91d6fa4e0..0d7564b95f8 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1361,8 +1361,20 @@ static int mount_ubifs(struct ubifs_info *c)
 	dbg_msg("tree fanout:         %d", c->fanout);
 	dbg_msg("reserved GC LEB:     %d", c->gc_lnum);
 	dbg_msg("first main LEB:      %d", c->main_first);
+	dbg_msg("max. znode size      %d", c->max_znode_sz);
+	dbg_msg("max. index node size %d", c->max_idx_node_sz);
+	dbg_msg("node sizes:          data %zu, inode %zu, dentry %zu",
+		UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
+	dbg_msg("node sizes:          trun %zu, sb %zu, master %zu",
+		UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
+	dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
+		UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
+	dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu",
+	        UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+		UBIFS_MAX_DENT_NODE_SZ);
 	dbg_msg("dead watermark:      %d", c->dead_wm);
 	dbg_msg("dark watermark:      %d", c->dark_wm);
+	dbg_msg("LEB overhead:        %d", c->leb_overhead);
 	x = (long long)c->main_lebs * c->dark_wm;
 	dbg_msg("max. dark space:     %lld (%lld KiB, %lld MiB)",
 		x, x >> 10, x >> 20);
-- 
cgit v1.2.3


From e3a2a0d4e5ace731e60e2eff4fb7056ecb34adc1 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 2 Dec 2008 11:16:03 +0100
Subject: anon_inodes: use fops->owner for module refcount

There is an imbalance for anonymous inodes. If the fops->owner field is set,
the module reference count of owner is decreased on release.
("filp_close" --> "__fput" ---> "fops_put")

On the other hand, anon_inode_getfd does not increase the module reference
count of owner. This causes two problems:

- if owner is set, the module refcount goes negative
- if owner is not set, the module can be unloaded while code is running

This patch changes anon_inode_getfd to be symmetric regarding fops->owner
handling.

I have checked all existing users of anon_inode_getfd. No one sets fops->owner,
that's why nobody has seen the module refcount go negative. The refcounting was
tested with a patched and unpatched KVM module (see patch 2/2). I also did an
epoll_open/close test.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 fs/anon_inodes.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b01..3bbdb9d0237 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	if (IS_ERR(anon_inode_inode))
 		return -ENODEV;
 
+	if (fops->owner && !try_module_get(fops->owner))
+		return -ENOENT;
+
 	error = get_unused_fd_flags(flags);
 	if (error < 0)
-		return error;
+		goto err_module;
 	fd = error;
 
 	/*
@@ -128,6 +131,8 @@ err_dput:
 	dput(dentry);
 err_put_unused_fd:
 	put_unused_fd(fd);
+err_module:
+	module_put(fops->owner);
 	return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);
-- 
cgit v1.2.3


From be6d3e56a6b9b3a4ee44a0685e39e595073c6f0d Mon Sep 17 00:00:00 2001
From: Kentaro Takeda <takedakn@nttdata.co.jp>
Date: Wed, 17 Dec 2008 13:24:15 +0900
Subject: introduce new LSM hooks where vfsmount is available.

Add new LSM hooks for path-based checks.  Call them on directory-modifying
operations at the points where we still know the vfsmount involved.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 36 ++++++++++++++++++++++++++++++++++++
 fs/open.c  |  5 +++++
 2 files changed, 41 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index af3783fff1d..ab441af4196 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1556,6 +1556,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 * Refuse to truncate files with mandatory locks held on them.
 		 */
 		error = locks_verify_locked(inode);
+		if (!error)
+			error = security_path_truncate(&nd->path, 0,
+					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
 
@@ -1586,7 +1589,11 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
 
 	if (!IS_POSIXACL(dir->d_inode))
 		mode &= ~current->fs->umask;
+	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
+	if (error)
+		goto out_unlock;
 	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
@@ -1999,6 +2006,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mknod(&nd.path, dentry, mode, dev);
+	if (error)
+		goto out_drop_write;
 	switch (mode & S_IFMT) {
 		case 0: case S_IFREG:
 			error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2021,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
 			error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
 			break;
 	}
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2070,7 +2081,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_mkdir(&nd.path, dentry, mode);
+	if (error)
+		goto out_drop_write;
 	error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2180,7 +2195,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto exit3;
+	error = security_path_rmdir(&nd.path, dentry);
+	if (error)
+		goto exit4;
 	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit4:
 	mnt_drop_write(nd.path.mnt);
 exit3:
 	dput(dentry);
@@ -2265,7 +2284,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
 		error = mnt_want_write(nd.path.mnt);
 		if (error)
 			goto exit2;
+		error = security_path_unlink(&nd.path, dentry);
+		if (error)
+			goto exit3;
 		error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+exit3:
 		mnt_drop_write(nd.path.mnt);
 	exit2:
 		dput(dentry);
@@ -2346,7 +2369,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_symlink(&nd.path, dentry, from);
+	if (error)
+		goto out_drop_write;
 	error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(dentry);
@@ -2443,7 +2470,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(nd.path.mnt);
 	if (error)
 		goto out_dput;
+	error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+	if (error)
+		goto out_drop_write;
 	error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+out_drop_write:
 	mnt_drop_write(nd.path.mnt);
 out_dput:
 	dput(new_dentry);
@@ -2679,8 +2710,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
 	error = mnt_want_write(oldnd.path.mnt);
 	if (error)
 		goto exit5;
+	error = security_path_rename(&oldnd.path, old_dentry,
+				     &newnd.path, new_dentry);
+	if (error)
+		goto exit6;
 	error = vfs_rename(old_dir->d_inode, old_dentry,
 				   new_dir->d_inode, new_dentry);
+exit6:
 	mnt_drop_write(oldnd.path.mnt);
 exit5:
 	dput(new_dentry);
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766..1cd7d40e999 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 		goto put_write_and_out;
 
 	error = locks_verify_truncate(inode, NULL, length);
+	if (!error)
+		error = security_path_truncate(&path, length, 0);
 	if (!error) {
 		DQUOT_INIT(inode);
 		error = do_truncate(path.dentry, length, 0, NULL);
@@ -328,6 +330,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 		goto out_putf;
 
 	error = locks_verify_truncate(inode, file, length);
+	if (!error)
+		error = security_path_truncate(&file->f_path, length,
+					       ATTR_MTIME|ATTR_CTIME);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 out_putf:
-- 
cgit v1.2.3


From e2b689d82c0394e5239a3557a217f19e2f47f1be Mon Sep 17 00:00:00 2001
From: Richard Kennedy <richard@rsk.demon.co.uk>
Date: Thu, 4 Dec 2008 11:17:47 +0000
Subject: fs: reorder struct inotify_device on 64bits to remove padding

Reorder struct inotify_device to remove 8 bytes of padding on 64bit
builds, reducing its size to 128 bytes. It is therefore allocated from a
smaller slab and uses one fewer cacheline.

Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>

----
Hi,
patch against 2.6.28-rc7.
built & tested on AMDX2 desktop.

I've not been able to send this to the listed inotify maintainers, I
just get mail failures. So I guessed filesystem was the best home for
it, hope that's ok.

regards
Richard
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index e2425bbd871..400f8064a54 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
 	struct mutex		ev_mutex;	/* protects event queue */
 	struct mutex		up_mutex;	/* synchronizes watch updates */
 	struct list_head 	events;		/* list of queued events */
-	atomic_t		count;		/* reference count */
 	struct user_struct	*user;		/* user who opened this dev */
 	struct inotify_handle	*ih;		/* inotify handle */
 	struct fasync_struct    *fa;            /* async notification */
+	atomic_t		count;		/* reference count */
 	unsigned int		queue_size;	/* size of the queue (bytes) */
 	unsigned int		event_count;	/* number of pending events */
 	unsigned int		max_events;	/* maximum number of events */
-- 
cgit v1.2.3


From c2452f32786159ed85f0e4b21fec09258f822fc8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Mon, 1 Dec 2008 09:33:43 +0100
Subject: shrink struct dentry

struct dentry is one of the most critical structures in the kernel. So it's
sad to see it going neglected.

With CONFIG_PROFILING turned on (which is probably the common case at least
for distros and kernel developers), sizeof(struct dentry) == 208 here
(64-bit). This gives 19 objects per slab.

I packed d_mounted into a hole, and took another 4 bytes off the inline
name length to take the padding out from the end of the structure. This
shrinks it to 200 bytes. I could have gone the other way and increased the
length to 40, but I'm aiming for a magic number, read on...

I then got rid of the d_cookie pointer. This shrinks it to 192 bytes. Rant:
why was this ever a good idea? The cookie system should increase its hash
size or use a tree or something if lookups are a problem. Also the "fast
dcookie lookups" in oprofile should be moved into the dcookie code -- how
can oprofile possibly care about the dcookie_mutex? It gets dropped after
get_dcookie() returns so it can't be providing any sort of protection.

At 192 bytes, 21 objects fit into a 4K page, saving about 3MB on my system
with ~140 000 entries allocated. 192 is also a multiple of 64, so we get
nice cacheline alignment on 64 and 32 byte line systems -- any given dentry
will now require 3 cachelines to touch all fields whereas previously it
would require 4.

I know the inline name size was chosen quite carefully, however with the
reduction in cacheline footprint, it should actually be just about as fast
to do a name lookup for a 36 character name as it was before the patch (and
faster for other sizes). The memory footprint savings for names which are
<= 32 or > 36 bytes long should more than make up for the memory cost for
33-36 byte names.

Performance is a feature...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c   |  4 ----
 fs/dcookies.c | 28 +++++++++++++++++++---------
 2 files changed, 19 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e6..fd244c7a7cc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
 #include <linux/bootmem.h>
 #include "internal.h"
 
-
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
 	dentry->d_op = NULL;
 	dentry->d_fsdata = NULL;
 	dentry->d_mounted = 0;
-#ifdef CONFIG_PROFILING
-	dentry->d_cookie = NULL;
-#endif
 	INIT_HLIST_NODE(&dentry->d_hash);
 	INIT_LIST_HEAD(&dentry->d_lru);
 	INIT_LIST_HEAD(&dentry->d_subdirs);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619..180e9fec4ad 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
 {
 	struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
 							GFP_KERNEL);
+	struct dentry *d;
 	if (!dcs)
 		return NULL;
 
-	path->dentry->d_cookie = dcs;
+	d = path->dentry;
+	spin_lock(&d->d_lock);
+	d->d_flags |= DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	dcs->path = *path;
 	path_get(path);
 	hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
 		goto out;
 	}
 
-	dcs = path->dentry->d_cookie;
-
-	if (!dcs)
+	if (path->dentry->d_flags & DCACHE_COOKIE) {
+		dcs = find_dcookie((unsigned long)path->dentry);
+	} else {
 		dcs = alloc_dcookie(path);
-
-	if (!dcs) {
-		err = -ENOMEM;
-		goto out;
+		if (!dcs) {
+			err = -ENOMEM;
+			goto out;
+		}
 	}
 
 	*cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
 
 static void free_dcookie(struct dcookie_struct * dcs)
 {
-	dcs->path.dentry->d_cookie = NULL;
+	struct dentry *d = dcs->path.dentry;
+
+	spin_lock(&d->d_lock);
+	d->d_flags &= ~DCACHE_COOKIE;
+	spin_unlock(&d->d_lock);
+
 	path_put(&dcs->path);
 	kmem_cache_free(dcookie_cache, dcs);
 }
-- 
cgit v1.2.3


From 5cc4a0341a1295ea56b2e62eb70d96d8fdb94ded Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 1 Dec 2008 14:34:51 -0800
Subject: fs/namespace.c: drop code after return

The extra semicolon serves no purpose.

Signed-off-by: Julia Lawall <julia@diku.dk>
Reviewed-by: Richard Genoud <richard.genoud@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7c..a40685d800a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 	if (!new_ns->root) {
 		up_write(&namespace_sem);
 		kfree(new_ns);
-		return ERR_PTR(-ENOMEM);;
+		return ERR_PTR(-ENOMEM);
 	}
 	spin_lock(&vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-- 
cgit v1.2.3


From a17d5232de7b53d34229de79ec22f4bb04adb7e4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:10 +0000
Subject: eCryptfs: check readlink result was not an error before using it

The result from readlink is being used to index into the link name
buffer without checking whether it is a valid length. If readlink
returns an error this will fault or cause memory corruption.

Cc: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: ecryptfs-devel@lists.launchpad.net
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Michael Halcrow <mhalcrow@us.ibm.com>
Acked-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ecryptfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c..5e78fc17988 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -673,10 +673,11 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
 			"dentry->d_name.name = [%s]\n", dentry->d_name.name);
 	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
-	buf[rc] = '\0';
 	set_fs(old_fs);
 	if (rc < 0)
 		goto out_free;
+	else
+		buf[rc] = '\0';
 	rc = 0;
 	nd_set_link(nd, buf);
 	goto out;
-- 
cgit v1.2.3


From ebd09abbd9699f328165aee50a070403fbf55a37 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:12 +0000
Subject: vfs: ensure page symlinks are NUL-terminated

On-disk data corruption could cause a page link to have its i_size set
to PAGE_SIZE (or a multiple thereof) and its contents all non-NUL.
NUL-terminate the link name to ensure this doesn't cause further
problems for the kernel.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ab441af4196..9ed5e2818f8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2786,13 +2786,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
-	struct page * page;
+	char *kaddr;
+	struct page *page;
 	struct address_space *mapping = dentry->d_inode->i_mapping;
 	page = read_mapping_page(mapping, 0, NULL);
 	if (IS_ERR(page))
 		return (char*)page;
 	*ppage = page;
-	return kmap(page);
+	kaddr = kmap(page);
+	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+	return kaddr;
 }
 
 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-- 
cgit v1.2.3


From 8d6d0c4da2dbbe0a69fea3692146af39f139f8b4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:13 +0000
Subject: ext2: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/inode.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e265..02b39a5deb7 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
 #include <linux/buffer_head.h>
 #include <linux/mpage.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "ext2.h"
 #include "acl.h"
 #include "xip.h"
@@ -1286,9 +1287,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		else
 			inode->i_mapping->a_ops = &ext2_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext2_inode_is_fast_symlink(inode))
+		if (ext2_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext2_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext2_symlink_inode_operations;
 			if (test_opt(inode->i_sb, NOBH))
 				inode->i_mapping->a_ops = &ext2_nobh_aops;
-- 
cgit v1.2.3


From b5ed3112b5f74c8ec1c7aa03a76c596635e85197 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:14 +0000
Subject: ext3: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Stephen Tweedie <sct@redhat.com>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/inode.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad8997..c4bdccf976b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include <linux/fiemap.h>
+#include <linux/namei.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext3_dir_inode_operations;
 		inode->i_fop = &ext3_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext3_inode_is_fast_symlink(inode))
+		if (ext3_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext3_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext3_symlink_inode_operations;
 			ext3_set_aops(inode);
 		}
-- 
cgit v1.2.3


From e83c1397cafc4e44f868289db5e417463c0d09a4 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:15 +0000
Subject: ext4: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: adilger@sun.com
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/inode.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33c..7c3325e0b00 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include <linux/mpage.h>
+#include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
 #include "ext4_jbd2.h"
@@ -4164,9 +4165,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext4_dir_inode_operations;
 		inode->i_fop = &ext4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
-		if (ext4_inode_is_fast_symlink(inode))
+		if (ext4_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
-		else {
+			nd_terminate_link(ei->i_data, inode->i_size,
+				sizeof(ei->i_data) - 1);
+		} else {
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
-- 
cgit v1.2.3


From 21acaf8e8da00235be59a3e489d5fa2a8721cafc Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:16 +0000
Subject: sysv: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa4..3d81bf58dae 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
 #include <linux/init.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
+#include <linux/namei.h>
 #include <asm/byteorder.h>
 #include "sysv.h"
 
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
 		if (inode->i_blocks) {
 			inode->i_op = &sysv_symlink_inode_operations;
 			inode->i_mapping->a_ops = &sysv_aops;
-		} else
+		} else {
 			inode->i_op = &sysv_fast_symlink_inode_operations;
+			nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+				sizeof(SYSV_I(inode)->i_data) - 1);
+		}
 	} else
 		init_special_inode(inode, inode->i_mode, rdev);
 }
-- 
cgit v1.2.3


From a63d0ff31a136bdf52350c4e6c2929eaf47ea2b2 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:17 +0000
Subject: freevxfs: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/freevxfs/vxfs_inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f..03a6ea5e99f 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 		if (!VXFS_ISIMMED(vip)) {
 			ip->i_op = &page_symlink_inode_operations;
 			ip->i_mapping->a_ops = &vxfs_aops;
-		} else
+		} else {
 			ip->i_op = &vxfs_immed_symlink_iops;
+			vip->vii_immed.vi_immed[ip->i_size] = '\0';
+		}
 	} else
 		init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
 
-- 
cgit v1.2.3


From 7df5fa06de89a4ac311957e0cb9c1d87552b4325 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Fri, 19 Dec 2008 20:47:18 +0000
Subject: befs: ensure fast symlinks are NUL-terminated

Ensure fast symlink targets are NUL-terminated, even if corrupted
on-disk.

Cc: Sergey S. Kostyliov <rathamahata@php4.ru>
Signed-off-by: Duane Griffin <duaneg@dghda.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/befs/linuxvfs.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b..d06cb023ad0 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_size = 0;
 		inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
 		strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
-			BEFS_SYMLINK_LEN);
+			BEFS_SYMLINK_LEN - 1);
+		befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
 	} else {
 		int num_blks;
 
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
 			kfree(link);
 			befs_error(sb, "Failed to read entire long symlink");
 			link = ERR_PTR(-EIO);
+		} else {
+			link[len - 1] = '\0';
 		}
 	} else {
 		link = befs_ino->i_data.symlink;
-- 
cgit v1.2.3


From dc711ca35f9d95a1eec02118e0c298b5e3068315 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Nov 2008 15:03:50 -0500
Subject: fix switch_names() breakage in short-to-short case

We want ->name.len to match the resulting name on *both*
source and target

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index fd244c7a7cc..eeafc14c2a1 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1616,8 +1616,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			 */
 			memcpy(dentry->d_iname, target->d_name.name,
 					target->d_name.len + 1);
+			dentry->d_name.len = target->d_name.len;
+			return;
 		}
 	}
+	do_switch(dentry->d_name.len, target->d_name.len);
 }
 
 /*
@@ -1677,7 +1680,6 @@ already_unhashed:
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
-	do_switch(dentry->d_name.len, target->d_name.len);
 	do_switch(dentry->d_name.hash, target->d_name.hash);
 
 	/* ... and switch the parents */
@@ -1787,7 +1789,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	struct dentry *dparent, *aparent;
 
 	switch_names(dentry, anon);
-	do_switch(dentry->d_name.len, anon->d_name.len);
 	do_switch(dentry->d_name.hash, anon->d_name.hash);
 
 	dparent = dentry->d_parent;
-- 
cgit v1.2.3


From be42c4c433c2c0d3f1583c08908fead00d36d222 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 1 Dec 2008 14:34:58 -0800
Subject: correct wrong function name of d_put in kernel document and source
 comment

There is no function named d_put(); it should be dput().

Impact: fix document and comment, no functionality changed

Signed-off-by: Zhao Lei <zhaolei@cn.fuijtsu.com>
Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index eeafc14c2a1..c231a639c2a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,7 +1332,7 @@ err_out:
  *
  * Searches the children of the parent dentry for the name in question. If
  * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use d_put to free the entry when it has
+ * is returned. The caller must use dput to free the entry when it has
  * finished using it. %NULL is returned on failure.
  *
  * __d_lookup is dcache_lock free. The hash list is protected using RCU.
-- 
cgit v1.2.3


From 52afeefb9dac9287429642189996426a2bfd6a25 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Mon, 1 Dec 2008 14:35:00 -0800
Subject: expand some comments (d_path / seq_path)

Explain that you really need to use the return value of d_path rather than
the buffer you passed into it.

Also fix the comment for seq_path(), the function arguments changed
recently but the comment hadn't been updated in sync.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c   |  8 ++++++--
 fs/seq_file.c | 10 ++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c231a639c2a..bdb3f50248a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1908,7 +1908,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * Convert a dentry into an ASCII path name. If the entry has been deleted
  * the string " (deleted)" is appended. Note that this is ambiguous.
  *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
  *
  * "buflen" should be positive. Caller holds the dcache_lock.
  *
@@ -1984,7 +1985,10 @@ Elong:
  * Convert a dentry into an ASCII path name. If the entry has been deleted
  * the string " (deleted)" is appended. Note that this is ambiguous.
  *
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
  *
  * "buflen" should be positive.
  */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c2..99d8b8cfc9b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc)
 }
 EXPORT_SYMBOL(mangle_path);
 
-/*
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
  */
 int seq_path(struct seq_file *m, struct path *path, char *esc)
 {
-- 
cgit v1.2.3


From 66f221875dc10813aa2f06c83ad60d0eb1356406 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 15:04:29 +0100
Subject: remove incorrect comment in inode_permission

We now pass on all MAY_ flags to the filesystems permission routines,
so remove the comment stating the contrary.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9ed5e2818f8..631cfdd45c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -247,7 +247,6 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	/* Ordinary permission routines do not understand MAY_APPEND. */
 	if (inode->i_op && inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
-- 
cgit v1.2.3


From b4091d5f6fde28ab762e1094a1a26d81f3badfa5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Nov 2008 15:07:21 +0100
Subject: kill walk_init_root

walk_init_root is a tiny helper that is marked __always_inline, has just
one caller and an unused argument.  Just merge it into the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 631cfdd45c6..d4d0b59ed2c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -526,18 +526,6 @@ out_unlock:
 	return result;
 }
 
-/* SMP-safe */
-static __always_inline void
-walk_init_root(const char *name, struct nameidata *nd)
-{
-	struct fs_struct *fs = current->fs;
-
-	read_lock(&fs->lock);
-	nd->path = fs->root;
-	path_get(&fs->root);
-	read_unlock(&fs->lock);
-}
-
 /*
  * Wrapper to retry pathname resolution whenever the underlying
  * file system returns an ESTALE.
@@ -575,9 +563,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
+		struct fs_struct *fs = current->fs;
+
 		path_put(&nd->path);
-		walk_init_root(link, nd);
+
+		read_lock(&fs->lock);
+		nd->path = fs->root;
+		path_get(&fs->root);
+		read_unlock(&fs->lock);
 	}
+
 	res = link_path_walk(link, nd);
 	if (nd->depth || res || nd->last_type!=LAST_NORM)
 		return res;
-- 
cgit v1.2.3


From 3fb64190aa3c23c10e6e9fd0124ac030115c99bf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:58:10 +0200
Subject: pass a struct path * to may_open

No need for the nameidata in may_open - a struct path is enough.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c  | 14 +++++++-------
 fs/nfsctl.c |  5 +++--
 2 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d4d0b59ed2c..5cc0dc95a7a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,9 +1487,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct nameidata *nd, int acc_mode, int flag)
+int may_open(struct path *path, int acc_mode, int flag)
 {
-	struct dentry *dentry = nd->path.dentry;
+	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
 	int error;
 
@@ -1510,13 +1510,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 	    	flag &= ~O_TRUNC;
 	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
-		if (nd->path.mnt->mnt_flags & MNT_NODEV)
+		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
 
 		flag &= ~O_TRUNC;
 	}
 
-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
 	/*
@@ -1551,7 +1551,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
 		 */
 		error = locks_verify_locked(inode);
 		if (!error)
-			error = security_path_truncate(&nd->path, 0,
+			error = security_path_truncate(path, 0,
 					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
 			DQUOT_INIT(inode);
@@ -1594,7 +1594,7 @@ out_unlock:
 	if (error)
 		return error;
 	/* Don't check for write permission, don't truncate */
-	return may_open(nd, 0, flag & ~O_TRUNC);
+	return may_open(&nd->path, 0, flag & ~O_TRUNC);
 }
 
 /*
@@ -1780,7 +1780,7 @@ ok:
 		if (error)
 			goto exit;
 	}
-	error = may_open(&nd, acc_mode, flag);
+	error = may_open(&nd.path, acc_mode, flag);
 	if (error) {
 		if (will_write)
 			mnt_drop_write(nd.path.mnt);
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6f..b27451909df 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
 		return ERR_PTR(error);
 
 	if (flags == O_RDWR)
-		error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
+		error = may_open(&nd.path, MAY_READ|MAY_WRITE,
+					   FMODE_READ|FMODE_WRITE);
 	else
-		error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
+		error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
 
 	if (!error)
 		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
-- 
cgit v1.2.3


From cb23beb55100171646e69e248fb45f10db6e99a4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 24 Oct 2008 09:59:29 +0200
Subject: kill vfs_permission

With all the nameidata removal there's no point anymore for this helper.
Of the three callers left two will go away with the next lookup series
anyway.

Also add proper kerneldoc to inode_permission as this is the main
permission check routine now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c  |  5 +++--
 fs/namei.c | 31 +++++++++++++------------------
 2 files changed, 16 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 02d2e120542..dfbf7009fbe 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -127,7 +127,8 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
-	error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+	error = inode_permission(nd.path.dentry->d_inode,
+				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
 
@@ -680,7 +681,7 @@ struct file *open_exec(const char *name)
 	if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
 		goto out_path_put;
 
-	err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
 
diff --git a/fs/namei.c b/fs/namei.c
index 5cc0dc95a7a..3f88e043d45 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
 	return -EACCES;
 }
 
+/**
+ * inode_permission  -  check for access rights to a given inode
+ * @inode:	inode to check permission on
+ * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Used to check for read/write/execute permissions on an inode.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
 int inode_permission(struct inode *inode, int mask)
 {
 	int retval;
@@ -263,21 +273,6 @@ int inode_permission(struct inode *inode, int mask)
 			mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
 }
 
-/**
- * vfs_permission  -  check for access rights to a given path
- * @nd:		lookup result that describes the path
- * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on a path.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
- */
-int vfs_permission(struct nameidata *nd, int mask)
-{
-	return inode_permission(nd->path.dentry->d_inode, mask);
-}
-
 /**
  * file_permission  -  check for additional access rights to a given file
  * @file:	file to check access rights for
@@ -288,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
  *
  * Note:
  *	Do not use this function in new code.  All access checks should
- *	be done using vfs_permission().
+ *	be done using inode_permission().
  */
 int file_permission(struct file *file, int mask)
 {
@@ -853,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
  		if (err)
 			break;
 
@@ -2882,7 +2878,6 @@ EXPORT_SYMBOL(path_lookup);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(vfs_permission);
 EXPORT_SYMBOL(file_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
-- 
cgit v1.2.3


From 18d8fda7c3c9439be04d7ea2e82da2513b121acb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:35:37 -0500
Subject: take init_fs to saner place

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3f88e043d45..e203691b9d1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2893,3 +2893,10 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.count		= ATOMIC_INIT(1),
+	.lock		= RW_LOCK_UNLOCKED,
+	.umask		= 0022,
+};
-- 
cgit v1.2.3


From 1239f26c05899f1f3c541b41e719c59d58038786 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 10 Dec 2008 18:37:28 -0500
Subject: make INIT_FS use the __RW_LOCK_UNLOCKED initialization

[AV: rediffed on top of unification of init_fs]
Initialization of init_fs still uses the deprecated RW_LOCK_UNLOCKED macro.
This patch updates it to use the __RW_LOCK_UNLOCKED(lock) macro.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e203691b9d1..dd5c9f0bf82 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2897,6 +2897,6 @@ EXPORT_SYMBOL(generic_readlink);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.count		= ATOMIC_INIT(1),
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
-- 
cgit v1.2.3


From b6b3fdead251d432f32f2cfce2a893ab8a658110 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 10 Dec 2008 09:35:45 -0800
Subject: filp_cachep can be static in fs/file_table.c

Instead of creating the "filp" kmem_cache in vfs_caches_init(),
we can do it a little bit later in files_init(), so that filp_cachep
is static to fs/file_table.c

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c     |  6 ------
 fs/file_table.c | 10 +++++++++-
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index bdb3f50248a..e88c23b85a3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2314,9 +2314,6 @@ static void __init dcache_init(void)
 /* SLAB cache for __getname() consumers */
 struct kmem_cache *names_cachep __read_mostly;
 
-/* SLAB cache for file structures */
-struct kmem_cache *filp_cachep __read_mostly;
-
 EXPORT_SYMBOL(d_genocide);
 
 void __init vfs_caches_init_early(void)
@@ -2338,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages)
 	names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
-	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-
 	dcache_init();
 	inode_init();
 	files_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc3ea7..bbeeac6efa1 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
 /* public. Not pretty! */
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
+
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
 static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
-	/* One file with associated inode and dcache is very roughly 1K. 
+
+	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+	/*
+	 * One file with associated inode and dcache is very roughly 1K.
 	 * Per default don't use more than 10% of our memory for files. 
 	 */ 
 
-- 
cgit v1.2.3


From 6badd79bd002788aaec27b50a74ab69ef65ab8ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 26 Dec 2008 00:57:40 -0500
Subject: kill ->dir_notify()

Remove the hopelessly misguided ->dir_notify().  The only instance (cifs)
has been broken by design from the very beginning; the objects it creates
are never destroyed, keep references to struct file they can outlive, nothing
that could possibly evict them exists on close(2) path *and* no locking
whatsoever is done to prevent races with close(), should the previous, er,
deficiencies someday be dealt with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bad_inode.c   |   6 ---
 fs/cifs/Makefile |   2 +-
 fs/cifs/cifsfs.c |   7 ----
 fs/cifs/cifsfs.h |   1 -
 fs/cifs/fcntl.c  | 118 -------------------------------------------------------
 fs/dnotify.c     |   3 --
 6 files changed, 1 insertion(+), 136 deletions(-)
 delete mode 100644 fs/cifs/fcntl.c

(limited to 'fs')

diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1..a05287a23f6 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
 	return -EIO;
 }
 
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
-	return -EIO;
-}
-
 static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
 	.check_flags	= bad_file_check_flags,
-	.dir_notify	= bad_file_dir_notify,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346f..9948c0030e8 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
-	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+	  md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
 	  readdir.o ioctl.o sess.o export.o cifsacl.o
 
 cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75..13ea53251dc 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 #endif /* CONFIG_CIFS_POSIX */
 	.llseek = cifs_llseek,
 #ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
 	.setlease = cifs_setlease,
 #endif /* CONFIG_CIFS_EXPERIMENTAL */
 };
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
 	.readdir = cifs_readdir,
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
-	.dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
 	.unlocked_ioctl  = cifs_ioctl,
 	.llseek = generic_file_llseek,
 };
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74..7ac481841f8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
 extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
-extern int cifs_dir_notify(struct file *, unsigned long arg);
 
 /* Functions related to dir entries */
 extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b..00000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- *   fs/cifs/fcntl.c
- *
- *   vfs operations that deal with the file control API
- *
- *   Copyright (C) International Business Machines  Corp., 2003,2004
- *   Author(s): Steve French (sfrench@us.ibm.com)
- *
- *   This library is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU Lesser General Public License as published
- *   by the Free Software Foundation; either version 2.1 of the License, or
- *   (at your option) any later version.
- *
- *   This library is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
- *   the GNU Lesser General Public License for more details.
- *
- *   You should have received a copy of the GNU Lesser General Public License
- *   along with this library; if not, write to the Free Software
- *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "cifsfs.h"
-
-static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
-{
-	__u32 cifs_ntfy_flags = 0;
-
-	/* No way on Linux VFS to ask to monitor xattr
-	changes (and no stream support either */
-	if (fcntl_notify_flags & DN_ACCESS)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
-	if (fcntl_notify_flags & DN_MODIFY) {
-		/* What does this mean on directories? */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
-			FILE_NOTIFY_CHANGE_SIZE;
-	}
-	if (fcntl_notify_flags & DN_CREATE) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
-			FILE_NOTIFY_CHANGE_LAST_WRITE;
-	}
-	if (fcntl_notify_flags & DN_DELETE)
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
-	if (fcntl_notify_flags & DN_RENAME) {
-		/* BB review this - checking various server behaviors */
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
-			FILE_NOTIFY_CHANGE_FILE_NAME;
-	}
-	if (fcntl_notify_flags & DN_ATTRIB) {
-		cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
-			FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	}
-/*	if (fcntl_notify_flags & DN_MULTISHOT) {
-		cifs_ntfy_flags |= ;
-	} */ /* BB fixme - not sure how to handle this with CIFS yet */
-
-	return cifs_ntfy_flags;
-}
-
-int cifs_dir_notify(struct file *file, unsigned long arg)
-{
-	int xid;
-	int rc = -EINVAL;
-	int oplock = 0;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
-	char *full_path = NULL;
-	__u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
-	__u16 netfid;
-
-	if (experimEnabled == 0)
-		return 0;
-
-	xid = GetXid();
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
-
-	full_path = build_path_from_dentry(file->f_path.dentry);
-
-	if (full_path == NULL) {
-		rc = -ENOMEM;
-	} else {
-		cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
-		rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
-			GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
-			&netfid, &oplock, NULL, cifs_sb->local_nls,
-			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
-		/* BB fixme - add this handle to a notify handle list */
-		if (rc) {
-			cFYI(1, ("Could not open directory for notify"));
-		} else {
-			filter = convert_to_cifs_notify_flags(arg);
-			if (filter != 0) {
-				rc = CIFSSMBNotify(xid, pTcon,
-					0 /* no subdirs */, netfid,
-					filter, file, arg & DN_MULTISHOT,
-					cifs_sb->local_nls);
-			} else {
-				rc = -EINVAL;
-			}
-			/* BB add code to close file eventually (at unmount
-			it would close automatically but may be a way
-			to do it easily when inode freed or when
-			notify info is cleared/changed */
-			cFYI(1, ("notify rc %d", rc));
-		}
-	}
-
-	FreeXid(xid);
-	return rc;
-}
diff --git a/fs/dnotify.c b/fs/dnotify.c
index 676073b8dda..b0aa2cde80b 100644
--- a/fs/dnotify.c
+++ b/fs/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	dn->dn_next = inode->i_dnotify;
 	inode->i_dnotify = dn;
 	spin_unlock(&inode->i_lock);
-
-	if (filp->f_op && filp->f_op->dir_notify)
-		return filp->f_op->dir_notify(filp, arg);
 	return 0;
 
 out_free:
-- 
cgit v1.2.3


From c2acf7b90821785fe812cc0aa05148e5a1f84204 Mon Sep 17 00:00:00 2001
From: Denis ChengRq <crquan@gmail.com>
Date: Mon, 1 Dec 2008 14:34:56 -0800
Subject: fs/block_dev.c: __read_mostly improvement and sb_is_blkdev_sb
 utilization

- iget5_locked in bdget really needs blockdev_superblock, instead of
  bd_mnt, so bd_mnt could be just a local variable;

- blockdev_superblock really needs __read_mostly, while the local variable
  bd_mnt does not;

- make use of sb_is_blkdev_sb in bd_forget, instead of direct reference
  to blockdev_superblock.

Signed-off-by: Denis ChengRq <crquan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c7..349a26c1000 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -326,12 +326,13 @@ static struct file_system_type bd_type = {
 	.kill_sb	= kill_anon_super,
 };
 
-static struct vfsmount *bd_mnt __read_mostly;
-struct super_block *blockdev_superblock;
+struct super_block *blockdev_superblock __read_mostly;
 
 void __init bdev_cache_init(void)
 {
 	int err;
+	struct vfsmount *bd_mnt;
+
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 				SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +374,7 @@ struct block_device *bdget(dev_t dev)
 	struct block_device *bdev;
 	struct inode *inode;
 
-	inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
+	inode = iget5_locked(blockdev_superblock, hash(dev),
 			bdev_test, bdev_set, &dev);
 
 	if (!inode)
@@ -463,7 +464,7 @@ void bd_forget(struct inode *inode)
 
 	spin_lock(&bdev_lock);
 	if (inode->i_bdev) {
-		if (inode->i_sb != blockdev_superblock)
+		if (!sb_is_blkdev_sb(inode->i_sb))
 			bdev = inode->i_bdev;
 		__bd_forget(inode);
 	}
-- 
cgit v1.2.3


From 272eb01485dda98e3b8910c7c1a53d597616b0a0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 17 Dec 2008 13:59:41 -0500
Subject: filesystem notification: create fs/notify to contain all fs
 notification

Creating a generic filesystem notification interface, fsnotify, which will be
used by inotify, dnotify, and eventually fanotify is really starting to
clutter the fs directory.  This patch simply moves inotify and dnotify into
fs/notify/inotify and fs/notify/dnotify respectively to make both current fs/
and future notification tidier.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig                       |  39 +-
 fs/Makefile                      |   5 +-
 fs/dnotify.c                     | 191 --------
 fs/inotify.c                     | 913 ---------------------------------------
 fs/inotify_user.c                | 778 ---------------------------------
 fs/notify/Kconfig                |   2 +
 fs/notify/Makefile               |   2 +
 fs/notify/dnotify/Kconfig        |  10 +
 fs/notify/dnotify/Makefile       |   1 +
 fs/notify/dnotify/dnotify.c      | 191 ++++++++
 fs/notify/inotify/Kconfig        |  27 ++
 fs/notify/inotify/Makefile       |   2 +
 fs/notify/inotify/inotify.c      | 913 +++++++++++++++++++++++++++++++++++++++
 fs/notify/inotify/inotify_user.c | 778 +++++++++++++++++++++++++++++++++
 14 files changed, 1928 insertions(+), 1924 deletions(-)
 delete mode 100644 fs/dnotify.c
 delete mode 100644 fs/inotify.c
 delete mode 100644 fs/inotify_user.c
 create mode 100644 fs/notify/Kconfig
 create mode 100644 fs/notify/Makefile
 create mode 100644 fs/notify/dnotify/Kconfig
 create mode 100644 fs/notify/dnotify/Makefile
 create mode 100644 fs/notify/dnotify/dnotify.c
 create mode 100644 fs/notify/inotify/Kconfig
 create mode 100644 fs/notify/inotify/Makefile
 create mode 100644 fs/notify/inotify/inotify.c
 create mode 100644 fs/notify/inotify/inotify_user.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca..ff0e8198020 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -270,44 +270,7 @@ config OCFS2_COMPAT_JBD
 
 endif # BLOCK
 
-config DNOTIFY
-	bool "Dnotify support"
-	default y
-	help
-	  Dnotify is a directory-based per-fd file change notification system
-	  that uses signals to communicate events to user-space.  There exist
-	  superior alternatives, but some applications may still rely on
-	  dnotify.
-
-	  If unsure, say Y.
-
-config INOTIFY
-	bool "Inotify file change notification support"
-	default y
-	---help---
-	  Say Y here to enable inotify support.  Inotify is a file change
-	  notification system and a replacement for dnotify.  Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say Y.
-
-config INOTIFY_USER
-	bool "Inotify support for userspace"
-	depends on INOTIFY
-	default y
-	---help---
-	  Say Y here to enable inotify support for userspace, including the
-	  associated system calls.  Inotify allows monitoring of both files and
-	  directories via a single open fd.  Events are read from the file
-	  descriptor, which is also select()- and poll()-able.
-
-	  For more information, see <file:Documentation/filesystems/inotify.txt>
-
-	  If unsure, say Y.
+source "fs/notify/Kconfig"
 
 config QUOTA
 	bool "Quota support"
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c..e6f423d1d22 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y +=	no-block.o
 endif
 
 obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
 obj-$(CONFIG_SIGNALFD)		+= signalfd.o
@@ -57,8 +56,6 @@ obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
-obj-$(CONFIG_DNOTIFY)		+= dnotify.o
-
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
 obj-$(CONFIG_SYSFS)		+= sysfs/
diff --git a/fs/dnotify.c b/fs/dnotify.c
deleted file mode 100644
index b0aa2cde80b..00000000000
--- a/fs/dnotify.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Directory notifications for Linux.
- *
- * Copyright (C) 2000,2001,2002 Stephen Rothwell
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/dnotify.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/fdtable.h>
-
-int dir_notify_enable __read_mostly = 1;
-
-static struct kmem_cache *dn_cache __read_mostly;
-
-static void redo_inode_mask(struct inode *inode)
-{
-	unsigned long new_mask;
-	struct dnotify_struct *dn;
-
-	new_mask = 0;
-	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
-		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
-	inode->i_dnotify_mask = new_mask;
-}
-
-void dnotify_flush(struct file *filp, fl_owner_t id)
-{
-	struct dnotify_struct *dn;
-	struct dnotify_struct **prev;
-	struct inode *inode;
-
-	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
-			*prev = dn->dn_next;
-			redo_inode_mask(inode);
-			kmem_cache_free(dn_cache, dn);
-			break;
-		}
-		prev = &dn->dn_next;
-	}
-	spin_unlock(&inode->i_lock);
-}
-
-int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
-{
-	struct dnotify_struct *dn;
-	struct dnotify_struct *odn;
-	struct dnotify_struct **prev;
-	struct inode *inode;
-	fl_owner_t id = current->files;
-	struct file *f;
-	int error = 0;
-
-	if ((arg & ~DN_MULTISHOT) == 0) {
-		dnotify_flush(filp, id);
-		return 0;
-	}
-	if (!dir_notify_enable)
-		return -EINVAL;
-	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return -ENOTDIR;
-	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
-	if (dn == NULL)
-		return -ENOMEM;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((odn = *prev) != NULL) {
-		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
-			odn->dn_fd = fd;
-			odn->dn_mask |= arg;
-			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-			goto out_free;
-		}
-		prev = &odn->dn_next;
-	}
-
-	rcu_read_lock();
-	f = fcheck(fd);
-	rcu_read_unlock();
-	/* we'd lost the race with close(), sod off silently */
-	/* note that inode->i_lock prevents reordering problems
-	 * between accesses to descriptor table and ->i_dnotify */
-	if (f != filp)
-		goto out_free;
-
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	if (error)
-		goto out_free;
-
-	dn->dn_mask = arg;
-	dn->dn_fd = fd;
-	dn->dn_filp = filp;
-	dn->dn_owner = id;
-	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-	dn->dn_next = inode->i_dnotify;
-	inode->i_dnotify = dn;
-	spin_unlock(&inode->i_lock);
-	return 0;
-
-out_free:
-	spin_unlock(&inode->i_lock);
-	kmem_cache_free(dn_cache, dn);
-	return error;
-}
-
-void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	struct dnotify_struct *	dn;
-	struct dnotify_struct **prev;
-	struct fown_struct *	fown;
-	int			changed = 0;
-
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_mask & event) == 0) {
-			prev = &dn->dn_next;
-			continue;
-		}
-		fown = &dn->dn_filp->f_owner;
-		send_sigio(fown, dn->dn_fd, POLL_MSG);
-		if (dn->dn_mask & DN_MULTISHOT)
-			prev = &dn->dn_next;
-		else {
-			*prev = dn->dn_next;
-			changed = 1;
-			kmem_cache_free(dn_cache, dn);
-		}
-	}
-	if (changed)
-		redo_inode_mask(inode);
-	spin_unlock(&inode->i_lock);
-}
-
-EXPORT_SYMBOL(__inode_dir_notify);
-
-/*
- * This is hopelessly wrong, but unfixable without API changes.  At
- * least it doesn't oops the kernel...
- *
- * To safely access ->d_parent we need to keep d_move away from it.  Use the
- * dentry's d_lock for this.
- */
-void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-	struct dentry *parent;
-
-	if (!dir_notify_enable)
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		__inode_dir_notify(parent->d_inode, event);
-		dput(parent);
-	} else {
-		spin_unlock(&dentry->d_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(dnotify_parent);
-
-static int __init dnotify_init(void)
-{
-	dn_cache = kmem_cache_create("dnotify_cache",
-		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
-	return 0;
-}
-
-module_init(dnotify_init)
diff --git a/fs/inotify.c b/fs/inotify.c
deleted file mode 100644
index dae3f28f30d..00000000000
--- a/fs/inotify.c
+++ /dev/null
@@ -1,913 +0,0 @@
-/*
- * fs/inotify.c - inode-based file event notifications
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/spinlock.h>
-#include <linux/idr.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/writeback.h>
-#include <linux/inotify.h>
-
-static atomic_t inotify_cookie;
-
-/*
- * Lock ordering:
- *
- * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
- * iprune_mutex (synchronize shrink_icache_memory())
- * 	inode_lock (protects the super_block->s_inodes list)
- * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
- * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
- *
- * The inode->inotify_mutex and inotify_handle->mutex and held during execution
- * of a caller's event handler.  Thus, the caller must not hold any locks
- * taken in their event handler while calling any of the published inotify
- * interfaces.
- */
-
-/*
- * Lifetimes of the three main data structures--inotify_handle, inode, and
- * inotify_watch--are managed by reference count.
- *
- * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
- * Additional references can bump the count via get_inotify_handle() and drop
- * the count via put_inotify_handle().
- *
- * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
- * to remove_watch_no_event().  Additional references can bump the count via
- * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
- * is reponsible for the final put after receiving IN_IGNORED, or when using
- * IN_ONESHOT after receiving the first event.  Inotify does the final put if
- * inotify_destroy() is called.
- *
- * inode: Pinned so long as the inode is associated with a watch, from
- * inotify_add_watch() to the final put_inotify_watch().
- */
-
-/*
- * struct inotify_handle - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_handle {
-	struct idr		idr;		/* idr mapping wd -> watch */
-	struct mutex		mutex;		/* protects this bad boy */
-	struct list_head	watches;	/* list of watches */
-	atomic_t		count;		/* reference count */
-	u32			last_wd;	/* the last wd allocated */
-	const struct inotify_operations *in_ops; /* inotify caller operations */
-};
-
-static inline void get_inotify_handle(struct inotify_handle *ih)
-{
-	atomic_inc(&ih->count);
-}
-
-static inline void put_inotify_handle(struct inotify_handle *ih)
-{
-	if (atomic_dec_and_test(&ih->count)) {
-		idr_destroy(&ih->idr);
-		kfree(ih);
-	}
-}
-
-/**
- * get_inotify_watch - grab a reference to an inotify_watch
- * @watch: watch to grab
- */
-void get_inotify_watch(struct inotify_watch *watch)
-{
-	atomic_inc(&watch->count);
-}
-EXPORT_SYMBOL_GPL(get_inotify_watch);
-
-int pin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
-		atomic_inc(&watch->count);
-		return 1;
-	}
-	spin_unlock(&sb_lock);
-	return 0;
-}
-
-/**
- * put_inotify_watch - decrements the ref count on a given watch.  cleans up
- * watch references if the count reaches zero.  inotify_watch is freed by
- * inotify callers via the destroy_watch() op.
- * @watch: watch to release
- */
-void put_inotify_watch(struct inotify_watch *watch)
-{
-	if (atomic_dec_and_test(&watch->count)) {
-		struct inotify_handle *ih = watch->ih;
-
-		iput(watch->inode);
-		ih->in_ops->destroy_watch(watch);
-		put_inotify_handle(ih);
-	}
-}
-EXPORT_SYMBOL_GPL(put_inotify_watch);
-
-void unpin_inotify_watch(struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	deactivate_super(sb);
-}
-
-/*
- * inotify_handle_get_wd - returns the next WD for use by the given handle
- *
- * Callers must hold ih->mutex.  This function can sleep.
- */
-static int inotify_handle_get_wd(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	int ret;
-
-	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
-			return -ENOSPC;
-		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
-	} while (ret == -EAGAIN);
-
-	if (likely(!ret))
-		ih->last_wd = watch->wd;
-
-	return ret;
-}
-
-/*
- * inotify_inode_watched - returns nonzero if there are watches on this inode
- * and zero otherwise.  We call this lockless, we do not care if we race.
- */
-static inline int inotify_inode_watched(struct inode *inode)
-{
-	return !list_empty(&inode->inotify_watches);
-}
-
-/*
- * Get child dentry flag into synch with parent inode.
- * Flag should always be clear for negative dentrys.
- */
-static void set_dentry_child_flags(struct inode *inode, int watched)
-{
-	struct dentry *alias;
-
-	spin_lock(&dcache_lock);
-	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
-		struct dentry *child;
-
-		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
-			if (!child->d_inode)
-				continue;
-
-			spin_lock(&child->d_lock);
-			if (watched)
-				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-			else
-				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
-			spin_unlock(&child->d_lock);
-		}
-	}
-	spin_unlock(&dcache_lock);
-}
-
-/*
- * inotify_find_handle - find the watch associated with the given inode and
- * handle
- *
- * Callers must hold inode->inotify_mutex.
- */
-static struct inotify_watch *inode_find_handle(struct inode *inode,
-					       struct inotify_handle *ih)
-{
-	struct inotify_watch *watch;
-
-	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
-		if (watch->ih == ih)
-			return watch;
-	}
-
-	return NULL;
-}
-
-/*
- * remove_watch_no_event - remove watch without the IN_IGNORED event.
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-static void remove_watch_no_event(struct inotify_watch *watch,
-				  struct inotify_handle *ih)
-{
-	list_del(&watch->i_list);
-	list_del(&watch->h_list);
-
-	if (!inotify_inode_watched(watch->inode))
-		set_dentry_child_flags(watch->inode, 0);
-
-	idr_remove(&ih->idr, watch->wd);
-}
-
-/**
- * inotify_remove_watch_locked - Remove a watch from both the handle and the
- * inode.  Sends the IN_IGNORED event signifying that the inode is no longer
- * watched.  May be invoked from a caller's event handler.
- * @ih: inotify handle associated with watch
- * @watch: watch to remove
- *
- * Callers must hold both inode->inotify_mutex and ih->mutex.
- */
-void inotify_remove_watch_locked(struct inotify_handle *ih,
-				 struct inotify_watch *watch)
-{
-	remove_watch_no_event(watch, ih);
-	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
-}
-EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
-
-/* Kernel API for producing events */
-
-/*
- * inotify_d_instantiate - instantiate dcache entry for inode
- */
-void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
-{
-	struct dentry *parent;
-
-	if (!inode)
-		return;
-
-	spin_lock(&entry->d_lock);
-	parent = entry->d_parent;
-	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	spin_unlock(&entry->d_lock);
-}
-
-/*
- * inotify_d_move - dcache entry has been moved
- */
-void inotify_d_move(struct dentry *entry)
-{
-	struct dentry *parent;
-
-	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
-		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
-	else
-		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
-}
-
-/**
- * inotify_inode_queue_event - queue an event to all watches on this inode
- * @inode: inode event is originating from
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- * @n_inode: inode associated with name
- */
-void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
-			       const char *name, struct inode *n_inode)
-{
-	struct inotify_watch *watch, *next;
-
-	if (!inotify_inode_watched(inode))
-		return;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		u32 watch_mask = watch->mask;
-		if (watch_mask & mask) {
-			struct inotify_handle *ih= watch->ih;
-			mutex_lock(&ih->mutex);
-			if (watch_mask & IN_ONESHOT)
-				remove_watch_no_event(watch, ih);
-			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
-						 name, n_inode);
-			mutex_unlock(&ih->mutex);
-		}
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
-
-/**
- * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
- * @dentry: the dentry in question, we queue against this dentry's parent
- * @mask: event mask describing this event
- * @cookie: cookie for synchronization, or zero
- * @name: filename, if any
- */
-void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
-				       u32 cookie, const char *name)
-{
-	struct dentry *parent;
-	struct inode *inode;
-
-	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))
-		return;
-
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	inode = parent->d_inode;
-
-	if (inotify_inode_watched(inode)) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		inotify_inode_queue_event(inode, mask, cookie, name,
-					  dentry->d_inode);
-		dput(parent);
-	} else
-		spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
-
-/**
- * inotify_get_cookie - return a unique cookie for use in synchronizing events.
- */
-u32 inotify_get_cookie(void)
-{
-	return atomic_inc_return(&inotify_cookie);
-}
-EXPORT_SYMBOL_GPL(inotify_get_cookie);
-
-/**
- * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @list: list of inodes being unmounted (sb->s_inodes)
- *
- * Called with inode_lock held, protecting the unmounting super block's list
- * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
- * We temporarily drop inode_lock, however, and CAN block.
- */
-void inotify_unmount_inodes(struct list_head *list)
-{
-	struct inode *inode, *next_i, *need_iput = NULL;
-
-	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
-		struct inotify_watch *watch, *next_w;
-		struct inode *need_iput_tmp;
-		struct list_head *watches;
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count))
-			continue;
-
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
-		 * I_WILL_FREE which is fine because by that point the inode
-		 * cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
-			continue;
-
-		need_iput_tmp = need_iput;
-		need_iput = NULL;
-		/* In case inotify_remove_watch_locked() drops a reference. */
-		if (inode != need_iput_tmp)
-			__iget(inode);
-		else
-			need_iput_tmp = NULL;
-		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-				atomic_read(&next_i->i_count) &&
-				!(next_i->i_state & (I_CLEAR | I_FREEING |
-					I_WILL_FREE))) {
-			__iget(next_i);
-			need_iput = next_i;
-		}
-
-		/*
-		 * We can safely drop inode_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.  Finally,
-		 * iprune_mutex keeps shrink_icache_memory() away.
-		 */
-		spin_unlock(&inode_lock);
-
-		if (need_iput_tmp)
-			iput(need_iput_tmp);
-
-		/* for each watch, send IN_UNMOUNT and then remove it */
-		mutex_lock(&inode->inotify_mutex);
-		watches = &inode->inotify_watches;
-		list_for_each_entry_safe(watch, next_w, watches, i_list) {
-			struct inotify_handle *ih= watch->ih;
-			get_inotify_watch(watch);
-			mutex_lock(&ih->mutex);
-			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
-						 NULL, NULL);
-			inotify_remove_watch_locked(ih, watch);
-			mutex_unlock(&ih->mutex);
-			put_inotify_watch(watch);
-		}
-		mutex_unlock(&inode->inotify_mutex);
-		iput(inode);		
-
-		spin_lock(&inode_lock);
-	}
-}
-EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
-
-/**
- * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
- * @inode: inode that is about to be removed
- */
-void inotify_inode_is_dead(struct inode *inode)
-{
-	struct inotify_watch *watch, *next;
-
-	mutex_lock(&inode->inotify_mutex);
-	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
-		struct inotify_handle *ih = watch->ih;
-		mutex_lock(&ih->mutex);
-		inotify_remove_watch_locked(ih, watch);
-		mutex_unlock(&ih->mutex);
-	}
-	mutex_unlock(&inode->inotify_mutex);
-}
-EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
-
-/* Kernel Consumer API */
-
-/**
- * inotify_init - allocate and initialize an inotify instance
- * @ops: caller's inotify operations
- */
-struct inotify_handle *inotify_init(const struct inotify_operations *ops)
-{
-	struct inotify_handle *ih;
-
-	ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL);
-	if (unlikely(!ih))
-		return ERR_PTR(-ENOMEM);
-
-	idr_init(&ih->idr);
-	INIT_LIST_HEAD(&ih->watches);
-	mutex_init(&ih->mutex);
-	ih->last_wd = 0;
-	ih->in_ops = ops;
-	atomic_set(&ih->count, 0);
-	get_inotify_handle(ih);
-
-	return ih;
-}
-EXPORT_SYMBOL_GPL(inotify_init);
-
-/**
- * inotify_init_watch - initialize an inotify watch
- * @watch: watch to initialize
- */
-void inotify_init_watch(struct inotify_watch *watch)
-{
-	INIT_LIST_HEAD(&watch->h_list);
-	INIT_LIST_HEAD(&watch->i_list);
-	atomic_set(&watch->count, 0);
-	get_inotify_watch(watch); /* initial get */
-}
-EXPORT_SYMBOL_GPL(inotify_init_watch);
-
-/*
- * Watch removals suck violently.  To kick the watch out we need (in this
- * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
- * a hold on inode; however, for all other cases we need to make damn sure
- * we don't race with umount.  We can *NOT* just grab a reference to a
- * watch - inotify_unmount_inodes() will happily sail past it and we'll end
- * with reference to inode potentially outliving its superblock.  Ideally
- * we just want to grab an active reference to superblock if we can; that
- * will make sure we won't go into inotify_umount_inodes() until we are
- * done.  Cleanup is just deactivate_super().  However, that leaves a messy
- * case - what if we *are* racing with umount() and active references to
- * superblock can't be acquired anymore?  We can bump ->s_count, grab
- * ->s_umount, which will almost certainly wait until the superblock is shut
- * down and the watch in question is pining for fjords.  That's fine, but
- * there is a problem - we might have hit the window between ->s_active
- * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
- * is past the point of no return and is heading for shutdown) and the
- * moment when deactivate_super() acquires ->s_umount.  We could just do
- * drop_super() yield() and retry, but that's rather antisocial and this
- * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
- * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
- * that we won't race with inotify_umount_inodes().  So we could grab a
- * reference to watch and do the rest as above, just with drop_super() instead
- * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
- * could grab ->s_umount.  So the watch could've been gone already.
- *
- * That still can be dealt with - we need to save watch->wd, do idr_find()
- * and compare its result with our pointer.  If they match, we either have
- * the damn thing still alive or we'd lost not one but two races at once,
- * the watch had been killed and a new one got created with the same ->wd
- * at the same address.  That couldn't have happened in inotify_destroy(),
- * but inotify_rm_wd() could run into that.  Still, "new one got created"
- * is not a problem - we have every right to kill it or leave it alone,
- * whatever's more convenient.
- *
- * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
- * "grab it and kill it" check.  If it's been our original watch, we are
- * fine, if it's a newcomer - nevermind, just pretend that we'd won the
- * race and kill the fscker anyway; we are safe since we know that its
- * superblock won't be going away.
- *
- * And yes, this is far beyond mere "not very pretty"; so's the entire
- * concept of inotify to start with.
- */
-
-/**
- * pin_to_kill - pin the watch down for removal
- * @ih: inotify handle
- * @watch: watch to kill
- *
- * Called with ih->mutex held, drops it.  Possible return values:
- * 0 - nothing to do, it has died
- * 1 - remove it, drop the reference and deactivate_super()
- * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
- * that variant, since it involved a lot of PITA, but that's the best that
- * could've been done.
- */
-static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	s32 wd = watch->wd;
-
-	spin_lock(&sb_lock);
-	if (sb->s_count >= S_BIAS) {
-		atomic_inc(&sb->s_active);
-		spin_unlock(&sb_lock);
-		get_inotify_watch(watch);
-		mutex_unlock(&ih->mutex);
-		return 1;	/* the best outcome */
-	}
-	sb->s_count++;
-	spin_unlock(&sb_lock);
-	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
-	down_read(&sb->s_umount);
-	if (likely(!sb->s_root)) {
-		/* fs is already shut down; the watch is dead */
-		drop_super(sb);
-		return 0;
-	}
-	/* raced with the final deactivate_super() */
-	mutex_lock(&ih->mutex);
-	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
-		/* the watch is dead */
-		mutex_unlock(&ih->mutex);
-		drop_super(sb);
-		return 0;
-	}
-	/* still alive or freed and reused with the same sb and wd; kill */
-	get_inotify_watch(watch);
-	mutex_unlock(&ih->mutex);
-	return 2;
-}
-
-static void unpin_and_kill(struct inotify_watch *watch, int how)
-{
-	struct super_block *sb = watch->inode->i_sb;
-	put_inotify_watch(watch);
-	switch (how) {
-	case 1:
-		deactivate_super(sb);
-		break;
-	case 2:
-		drop_super(sb);
-	}
-}
-
-/**
- * inotify_destroy - clean up and destroy an inotify instance
- * @ih: inotify handle
- */
-void inotify_destroy(struct inotify_handle *ih)
-{
-	/*
-	 * Destroy all of the watches for this handle. Unfortunately, not very
-	 * pretty.  We cannot do a simple iteration over the list, because we
-	 * do not know the inode until we iterate to the watch.  But we need to
-	 * hold inode->inotify_mutex before ih->mutex.  The following works.
-	 *
-	 * AV: it had to become even uglier to start working ;-/
-	 */
-	while (1) {
-		struct inotify_watch *watch;
-		struct list_head *watches;
-		struct super_block *sb;
-		struct inode *inode;
-		int how;
-
-		mutex_lock(&ih->mutex);
-		watches = &ih->watches;
-		if (list_empty(watches)) {
-			mutex_unlock(&ih->mutex);
-			break;
-		}
-		watch = list_first_entry(watches, struct inotify_watch, h_list);
-		sb = watch->inode->i_sb;
-		how = pin_to_kill(ih, watch);
-		if (!how)
-			continue;
-
-		inode = watch->inode;
-		mutex_lock(&inode->inotify_mutex);
-		mutex_lock(&ih->mutex);
-
-		/* make sure we didn't race with another list removal */
-		if (likely(idr_find(&ih->idr, watch->wd))) {
-			remove_watch_no_event(watch, ih);
-			put_inotify_watch(watch);
-		}
-
-		mutex_unlock(&ih->mutex);
-		mutex_unlock(&inode->inotify_mutex);
-		unpin_and_kill(watch, how);
-	}
-
-	/* free this handle: the put matching the get in inotify_init() */
-	put_inotify_handle(ih);
-}
-EXPORT_SYMBOL_GPL(inotify_destroy);
-
-/**
- * inotify_find_watch - find an existing watch for an (ih,inode) pair
- * @ih: inotify handle
- * @inode: inode to watch
- * @watchp: pointer to existing inotify_watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
-		       struct inotify_watch **watchp)
-{
-	struct inotify_watch *old;
-	int ret = -ENOENT;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	old = inode_find_handle(inode, ih);
-	if (unlikely(old)) {
-		get_inotify_watch(old); /* caller must put watch */
-		*watchp = old;
-		ret = old->wd;
-	}
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_watch);
-
-/**
- * inotify_find_update_watch - find and update the mask of an existing watch
- * @ih: inotify handle
- * @inode: inode's watch to update
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- */
-s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
-			      u32 mask)
-{
-	struct inotify_watch *old;
-	int mask_add = 0;
-	int ret;
-
-	if (mask & IN_MASK_ADD)
-		mask_add = 1;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/*
-	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
-	 * are already watching.  We just update the mask and return its wd.
-	 */
-	old = inode_find_handle(inode, ih);
-	if (unlikely(!old)) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	if (mask_add)
-		old->mask |= mask;
-	else
-		old->mask = mask;
-	ret = old->wd;
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_find_update_watch);
-
-/**
- * inotify_add_watch - add a watch to an inotify instance
- * @ih: inotify handle
- * @watch: caller allocated watch structure
- * @inode: inode to watch
- * @mask: mask of events to watch
- *
- * Caller must pin given inode (via nameidata).
- * Caller must ensure it only calls inotify_add_watch() once per watch.
- * Calls inotify_handle_get_wd() so may sleep.
- */
-s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
-		      struct inode *inode, u32 mask)
-{
-	int ret = 0;
-	int newly_watched;
-
-	/* don't allow invalid bits: we don't want flags set */
-	mask &= IN_ALL_EVENTS | IN_ONESHOT;
-	if (unlikely(!mask))
-		return -EINVAL;
-	watch->mask = mask;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, watch);
-	if (unlikely(ret))
-		goto out;
-	ret = watch->wd;
-
-	/* save a reference to handle and bump the count to make it official */
-	get_inotify_handle(ih);
-	watch->ih = ih;
-
-	/*
-	 * Save a reference to the inode and bump the ref count to make it
-	 * official.  We hold a reference to nameidata, which makes this safe.
-	 */
-	watch->inode = igrab(inode);
-
-	/* Add the watch to the handle's and the inode's list */
-	newly_watched = !inotify_inode_watched(inode);
-	list_add(&watch->h_list, &ih->watches);
-	list_add(&watch->i_list, &inode->inotify_watches);
-	/*
-	 * Set child flags _after_ adding the watch, so there is no race
-	 * windows where newly instantiated children could miss their parent's
-	 * watched flag.
-	 */
-	if (newly_watched)
-		set_dentry_child_flags(inode, 1);
-
-out:
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(inotify_add_watch);
-
-/**
- * inotify_clone_watch - put the watch next to existing one
- * @old: already installed watch
- * @new: new watch
- *
- * Caller must hold the inotify_mutex of inode we are dealing with;
- * it is expected to remove the old watch before unlocking the inode.
- */
-s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
-{
-	struct inotify_handle *ih = old->ih;
-	int ret = 0;
-
-	new->mask = old->mask;
-	new->ih = ih;
-
-	mutex_lock(&ih->mutex);
-
-	/* Initialize a new watch */
-	ret = inotify_handle_get_wd(ih, new);
-	if (unlikely(ret))
-		goto out;
-	ret = new->wd;
-
-	get_inotify_handle(ih);
-
-	new->inode = igrab(old->inode);
-
-	list_add(&new->h_list, &ih->watches);
-	list_add(&new->i_list, &old->inode->inotify_watches);
-out:
-	mutex_unlock(&ih->mutex);
-	return ret;
-}
-
-void inotify_evict_watch(struct inotify_watch *watch)
-{
-	get_inotify_watch(watch);
-	mutex_lock(&watch->ih->mutex);
-	inotify_remove_watch_locked(watch->ih, watch);
-	mutex_unlock(&watch->ih->mutex);
-}
-
-/**
- * inotify_rm_wd - remove a watch from an inotify instance
- * @ih: inotify handle
- * @wd: watch descriptor to remove
- *
- * Can sleep.
- */
-int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
-{
-	struct inotify_watch *watch;
-	struct super_block *sb;
-	struct inode *inode;
-	int how;
-
-	mutex_lock(&ih->mutex);
-	watch = idr_find(&ih->idr, wd);
-	if (unlikely(!watch)) {
-		mutex_unlock(&ih->mutex);
-		return -EINVAL;
-	}
-	sb = watch->inode->i_sb;
-	how = pin_to_kill(ih, watch);
-	if (!how)
-		return 0;
-
-	inode = watch->inode;
-
-	mutex_lock(&inode->inotify_mutex);
-	mutex_lock(&ih->mutex);
-
-	/* make sure that we did not race */
-	if (likely(idr_find(&ih->idr, wd) == watch))
-		inotify_remove_watch_locked(ih, watch);
-
-	mutex_unlock(&ih->mutex);
-	mutex_unlock(&inode->inotify_mutex);
-	unpin_and_kill(watch, how);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(inotify_rm_wd);
-
-/**
- * inotify_rm_watch - remove a watch from an inotify instance
- * @ih: inotify handle
- * @watch: watch to remove
- *
- * Can sleep.
- */
-int inotify_rm_watch(struct inotify_handle *ih,
-		     struct inotify_watch *watch)
-{
-	return inotify_rm_wd(ih, watch->wd);
-}
-EXPORT_SYMBOL_GPL(inotify_rm_watch);
-
-/*
- * inotify_setup - core initialization function
- */
-static int __init inotify_setup(void)
-{
-	atomic_set(&inotify_cookie, 0);
-
-	return 0;
-}
-
-module_init(inotify_setup);
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
deleted file mode 100644
index 400f8064a54..00000000000
--- a/fs/inotify_user.c
+++ /dev/null
@@ -1,778 +0,0 @@
-/*
- * fs/inotify_user.c - inotify support for userspace
- *
- * Authors:
- *	John McCutchan	<ttb@tentacle.dhs.org>
- *	Robert Love	<rml@novell.com>
- *
- * Copyright (C) 2005 John McCutchan
- * Copyright 2006 Hewlett-Packard Development Company, L.P.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/inotify.h>
-#include <linux/syscalls.h>
-#include <linux/magic.h>
-
-#include <asm/ioctls.h>
-
-static struct kmem_cache *watch_cachep __read_mostly;
-static struct kmem_cache *event_cachep __read_mostly;
-
-static struct vfsmount *inotify_mnt __read_mostly;
-
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
-static int inotify_max_user_watches __read_mostly;
-static int inotify_max_queued_events __read_mostly;
-
-/*
- * Lock ordering:
- *
- * inotify_dev->up_mutex (ensures we don't re-add the same watch)
- * 	inode->inotify_mutex (protects inode's watch list)
- * 		inotify_handle->mutex (protects inotify_handle's watch list)
- * 			inotify_dev->ev_mutex (protects device's event queue)
- */
-
-/*
- * Lifetimes of the main data structures:
- *
- * inotify_device: Lifetime is managed by reference count, from
- * sys_inotify_init() until release.  Additional references can bump the count
- * via get_inotify_dev() and drop the count via put_inotify_dev().
- *
- * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
- * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
- * first event, or to inotify_destroy().
- */
-
-/*
- * struct inotify_device - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
-	struct mutex		ev_mutex;	/* protects event queue */
-	struct mutex		up_mutex;	/* synchronizes watch updates */
-	struct list_head 	events;		/* list of queued events */
-	struct user_struct	*user;		/* user who opened this dev */
-	struct inotify_handle	*ih;		/* inotify handle */
-	struct fasync_struct    *fa;            /* async notification */
-	atomic_t		count;		/* reference count */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
-};
-
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->ev_mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_user_watch - our version of an inotify_watch, we add
- * a reference to the associated inotify_device.
- */
-struct inotify_user_watch {
-	struct inotify_device	*dev;	/* associated device */
-	struct inotify_watch	wdata;	/* inotify watch data */
-};
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/sysctl.h>
-
-static int zero;
-
-ctl_table inotify_table[] = {
-	{
-		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
-		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero
-	},
-	{ .ctl_name = 0 }
-};
-#endif /* CONFIG_SYSCTL */
-
-static inline void get_inotify_dev(struct inotify_device *dev)
-{
-	atomic_inc(&dev->count);
-}
-
-static inline void put_inotify_dev(struct inotify_device *dev)
-{
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		kfree(dev);
-	}
-}
-
-/*
- * free_inotify_user_watch - cleans up the watch and its references
- */
-static void free_inotify_user_watch(struct inotify_watch *w)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	atomic_dec(&dev->user->inotify_watches);
-	put_inotify_dev(dev);
-	kmem_cache_free(watch_cachep, watch);
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_KERNEL);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_get_last_event - return the last event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_last_event(struct inotify_device *dev)
-{
-	if (list_empty(&dev->events))
-		return NULL;
-	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_queue_event - event handler registered with core inotify, adds
- * a new event to the given device
- *
- * Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name,
-				    struct inode *ignored)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-	struct inotify_kernel_event *kevent, *last;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	mutex_lock(&dev->ev_mutex);
-
-	/* we can safely put the watch as we don't reference it while
-	 * generating the event
-	 */
-	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
-		put_inotify_watch(w); /* final put */
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_last_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			goto out;
-		if (name && lastname && !strcmp(lastname, name))
-			goto out;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		goto out;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		goto out;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-	kill_fasync(&dev->fa, SIGIO, POLL_IN);
-
-out:
-	mutex_unlock(&dev->ev_mutex);
-}
-
-/*
- * remove_kevent - cleans up the given kevent
- *
- * Caller must hold dev->ev_mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-}
-
-/*
- * free_kevent - frees the given kevent.
- */
-static void free_kevent(struct inotify_kernel_event *kevent)
-{
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->ev_mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
-		free_kevent(kevent);
-	}
-}
-
-/*
- * find_inode - resolve a user-given path to a specific inode
- */
-static int find_inode(const char __user *dirname, struct path *path,
-		      unsigned flags)
-{
-	int error;
-
-	error = user_path_at(AT_FDCWD, dirname, flags, path);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
-	if (error)
-		path_put(path);
-	return error;
-}
-
-/*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->up_mutex.
- */
-static int create_watch(struct inotify_device *dev, struct inode *inode,
-			u32 mask)
-{
-	struct inotify_user_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return -ENOSPC;
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return -ENOMEM;
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	inotify_init_watch(&watch->wdata);
-	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
-	if (ret < 0)
-		free_inotify_user_watch(&watch->wdata);
-
-	return ret;
-}
-
-/* Device Interface */
-
-static unsigned int inotify_poll(struct file *file, poll_table *wait)
-{
-	struct inotify_device *dev = file->private_data;
-	int ret = 0;
-
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->ev_mutex);
-	if (!list_empty(&dev->events))
-		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->ev_mutex);
-
-	return ret;
-}
-
-static ssize_t inotify_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *pos)
-{
-	size_t event_size = sizeof (struct inotify_event);
-	struct inotify_device *dev;
-	char __user *start;
-	int ret;
-	DEFINE_WAIT(wait);
-
-	start = buf;
-	dev = file->private_data;
-
-	while (1) {
-
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-
-		mutex_lock(&dev->ev_mutex);
-		if (!list_empty(&dev->events)) {
-			ret = 0;
-			break;
-		}
-		mutex_unlock(&dev->ev_mutex);
-
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
-			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count) {
-			if (ret == 0 && count > 0) {
-				/*
-				 * could not get a single event because we
-				 * didn't have enough buffer space.
-				 */
-				ret = -EINVAL;
-			}
-			break;
-		}
-		remove_kevent(dev, kevent);
-
-		/*
-		 * Must perform the copy_to_user outside the mutex in order
-		 * to avoid a lock order reversal with mmap_sem.
-		 */
-		mutex_unlock(&dev->ev_mutex);
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
-			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
-
-		free_kevent(kevent);
-
-		mutex_lock(&dev->ev_mutex);
-	}
-	mutex_unlock(&dev->ev_mutex);
-
-	return ret;
-}
-
-static int inotify_fasync(int fd, struct file *file, int on)
-{
-	struct inotify_device *dev = file->private_data;
-
-	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
-}
-
-static int inotify_release(struct inode *ignored, struct file *file)
-{
-	struct inotify_device *dev = file->private_data;
-
-	inotify_destroy(dev->ih);
-
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->ev_mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->ev_mutex);
-
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
-
-	return 0;
-}
-
-static long inotify_ioctl(struct file *file, unsigned int cmd,
-			  unsigned long arg)
-{
-	struct inotify_device *dev;
-	void __user *p;
-	int ret = -ENOTTY;
-
-	dev = file->private_data;
-	p = (void __user *) arg;
-
-	switch (cmd) {
-	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
-		break;
-	}
-
-	return ret;
-}
-
-static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.fasync         = inotify_fasync,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
-	.compat_ioctl	= inotify_ioctl,
-};
-
-static const struct inotify_operations inotify_user_ops = {
-	.handle_event	= inotify_dev_queue_event,
-	.destroy_watch	= free_inotify_user_watch,
-};
-
-asmlinkage long sys_inotify_init1(int flags)
-{
-	struct inotify_device *dev;
-	struct inotify_handle *ih;
-	struct user_struct *user;
-	struct file *filp;
-	int fd, ret;
-
-	/* Check the IN_* constants for consistency.  */
-	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
-	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
-
-	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
-		return -EINVAL;
-
-	fd = get_unused_fd_flags(flags & O_CLOEXEC);
-	if (fd < 0)
-		return fd;
-
-	filp = get_empty_filp();
-	if (!filp) {
-		ret = -ENFILE;
-		goto out_put_fd;
-	}
-
-	user = get_current_user();
-	if (unlikely(atomic_read(&user->inotify_devs) >=
-			inotify_max_user_instances)) {
-		ret = -EMFILE;
-		goto out_free_uid;
-	}
-
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
-		goto out_free_uid;
-	}
-
-	ih = inotify_init(&inotify_user_ops);
-	if (IS_ERR(ih)) {
-		ret = PTR_ERR(ih);
-		goto out_free_dev;
-	}
-	dev->ih = ih;
-	dev->fa = NULL;
-
-	filp->f_op = &inotify_fops;
-	filp->f_path.mnt = mntget(inotify_mnt);
-	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
-	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
-	filp->f_mode = FMODE_READ;
-	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = dev;
-
-	INIT_LIST_HEAD(&dev->events);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->ev_mutex);
-	mutex_init(&dev->up_mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
-	atomic_inc(&user->inotify_devs);
-	fd_install(fd, filp);
-
-	return fd;
-out_free_dev:
-	kfree(dev);
-out_free_uid:
-	free_uid(user);
-	put_filp(filp);
-out_put_fd:
-	put_unused_fd(fd);
-	return ret;
-}
-
-asmlinkage long sys_inotify_init(void)
-{
-	return sys_inotify_init1(0);
-}
-
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
-{
-	struct inode *inode;
-	struct inotify_device *dev;
-	struct path path;
-	struct file *filp;
-	int ret, fput_needed;
-	unsigned flags = 0;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
-
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto fput_and_out;
-	}
-
-	if (!(mask & IN_DONT_FOLLOW))
-		flags |= LOOKUP_FOLLOW;
-	if (mask & IN_ONLYDIR)
-		flags |= LOOKUP_DIRECTORY;
-
-	ret = find_inode(pathname, &path, flags);
-	if (unlikely(ret))
-		goto fput_and_out;
-
-	/* inode held in place by reference to path; dev by fget on fd */
-	inode = path.dentry->d_inode;
-	dev = filp->private_data;
-
-	mutex_lock(&dev->up_mutex);
-	ret = inotify_find_update_watch(dev->ih, inode, mask);
-	if (ret == -ENOENT)
-		ret = create_watch(dev, inode, mask);
-	mutex_unlock(&dev->up_mutex);
-
-	path_put(&path);
-fput_and_out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
-
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
-{
-	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
-		return -EBADF;
-
-	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	dev = filp->private_data;
-
-	/* we free our watch data when we get IN_IGNORED */
-	ret = inotify_rm_wd(dev->ih, wd);
-
-out:
-	fput_light(filp, fput_needed);
-	return ret;
-}
-
-static int
-inotify_get_sb(struct file_system_type *fs_type, int flags,
-	       const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	return get_sb_pseudo(fs_type, "inotify", NULL,
-			INOTIFYFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
-};
-
-/*
- * inotify_user_setup - Our initialization function.  Note that we cannnot return
- * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
- * must result in panic().
- */
-static int __init inotify_user_setup(void)
-{
-	int ret;
-
-	ret = register_filesystem(&inotify_fs_type);
-	if (unlikely(ret))
-		panic("inotify: register_filesystem returned %d!\n", ret);
-
-	inotify_mnt = kern_mount(&inotify_fs_type);
-	if (IS_ERR(inotify_mnt))
-		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
-
-	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
-
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_user_watch),
-					 0, SLAB_PANIC, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL);
-
-	return 0;
-}
-
-module_init(inotify_user_setup);
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 00000000000..50914d7303c
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 00000000000..5a95b6010ce
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y			+= dnotify/
+obj-y			+= inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 00000000000..26adf5dfa64
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+	bool "Dnotify support"
+	default y
+	help
+	  Dnotify is a directory-based per-fd file change notification system
+	  that uses signals to communicate events to user-space.  There exist
+	  superior alternatives, but some applications may still rely on
+	  dnotify.
+
+	  If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 00000000000..f145251dcad
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY)		+= dnotify.o
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
new file mode 100644
index 00000000000..b0aa2cde80b
--- /dev/null
+++ b/fs/notify/dnotify/dnotify.c
@@ -0,0 +1,191 @@
+/*
+ * Directory notifications for Linux.
+ *
+ * Copyright (C) 2000,2001,2002 Stephen Rothwell
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/dnotify.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/fdtable.h>
+
+int dir_notify_enable __read_mostly = 1;
+
+static struct kmem_cache *dn_cache __read_mostly;
+
+static void redo_inode_mask(struct inode *inode)
+{
+	unsigned long new_mask;
+	struct dnotify_struct *dn;
+
+	new_mask = 0;
+	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
+		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
+	inode->i_dnotify_mask = new_mask;
+}
+
+void dnotify_flush(struct file *filp, fl_owner_t id)
+{
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct inode *inode;
+
+	inode = filp->f_path.dentry->d_inode;
+	if (!S_ISDIR(inode->i_mode))
+		return;
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
+			*prev = dn->dn_next;
+			redo_inode_mask(inode);
+			kmem_cache_free(dn_cache, dn);
+			break;
+		}
+		prev = &dn->dn_next;
+	}
+	spin_unlock(&inode->i_lock);
+}
+
+int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+{
+	struct dnotify_struct *dn;
+	struct dnotify_struct *odn;
+	struct dnotify_struct **prev;
+	struct inode *inode;
+	fl_owner_t id = current->files;
+	struct file *f;
+	int error = 0;
+
+	if ((arg & ~DN_MULTISHOT) == 0) {
+		dnotify_flush(filp, id);
+		return 0;
+	}
+	if (!dir_notify_enable)
+		return -EINVAL;
+	inode = filp->f_path.dentry->d_inode;
+	if (!S_ISDIR(inode->i_mode))
+		return -ENOTDIR;
+	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
+	if (dn == NULL)
+		return -ENOMEM;
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((odn = *prev) != NULL) {
+		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+			odn->dn_fd = fd;
+			odn->dn_mask |= arg;
+			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
+			goto out_free;
+		}
+		prev = &odn->dn_next;
+	}
+
+	rcu_read_lock();
+	f = fcheck(fd);
+	rcu_read_unlock();
+	/* we'd lost the race with close(), sod off silently */
+	/* note that inode->i_lock prevents reordering problems
+	 * between accesses to descriptor table and ->i_dnotify */
+	if (f != filp)
+		goto out_free;
+
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+	if (error)
+		goto out_free;
+
+	dn->dn_mask = arg;
+	dn->dn_fd = fd;
+	dn->dn_filp = filp;
+	dn->dn_owner = id;
+	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
+	dn->dn_next = inode->i_dnotify;
+	inode->i_dnotify = dn;
+	spin_unlock(&inode->i_lock);
+	return 0;
+
+out_free:
+	spin_unlock(&inode->i_lock);
+	kmem_cache_free(dn_cache, dn);
+	return error;
+}
+
+void __inode_dir_notify(struct inode *inode, unsigned long event)
+{
+	struct dnotify_struct *	dn;
+	struct dnotify_struct **prev;
+	struct fown_struct *	fown;
+	int			changed = 0;
+
+	spin_lock(&inode->i_lock);
+	prev = &inode->i_dnotify;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_mask & event) == 0) {
+			prev = &dn->dn_next;
+			continue;
+		}
+		fown = &dn->dn_filp->f_owner;
+		send_sigio(fown, dn->dn_fd, POLL_MSG);
+		if (dn->dn_mask & DN_MULTISHOT)
+			prev = &dn->dn_next;
+		else {
+			*prev = dn->dn_next;
+			changed = 1;
+			kmem_cache_free(dn_cache, dn);
+		}
+	}
+	if (changed)
+		redo_inode_mask(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+EXPORT_SYMBOL(__inode_dir_notify);
+
+/*
+ * This is hopelessly wrong, but unfixable without API changes.  At
+ * least it doesn't oops the kernel...
+ *
+ * To safely access ->d_parent we need to keep d_move away from it.  Use the
+ * dentry's d_lock for this.
+ */
+void dnotify_parent(struct dentry *dentry, unsigned long event)
+{
+	struct dentry *parent;
+
+	if (!dir_notify_enable)
+		return;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	if (parent->d_inode->i_dnotify_mask & event) {
+		dget(parent);
+		spin_unlock(&dentry->d_lock);
+		__inode_dir_notify(parent->d_inode, event);
+		dput(parent);
+	} else {
+		spin_unlock(&dentry->d_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(dnotify_parent);
+
+static int __init dnotify_init(void)
+{
+	dn_cache = kmem_cache_create("dnotify_cache",
+		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
+	return 0;
+}
+
+module_init(dnotify_init)
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 00000000000..44679284102
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support.  Inotify is a file change
+	  notification system and a replacement for dnotify.  Inotify fixes
+	  numerous shortcomings in dnotify and introduces several new features
+	  including multiple file events, one-shot support, and unmount
+	  notification.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
+
+config INOTIFY_USER
+	bool "Inotify support for userspace"
+	depends on INOTIFY
+	default y
+	---help---
+	  Say Y here to enable inotify support for userspace, including the
+	  associated system calls.  Inotify allows monitoring of both files and
+	  directories via a single open fd.  Events are read from the file
+	  descriptor, which is also select()- and poll()-able.
+
+	  For more information, see <file:Documentation/filesystems/inotify.txt>
+
+	  If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 00000000000..e290f3bb9d8
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY)		+= inotify.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
new file mode 100644
index 00000000000..dae3f28f30d
--- /dev/null
+++ b/fs/notify/inotify/inotify.c
@@ -0,0 +1,913 @@
+/*
+ * fs/notify/inotify/inotify.c - inode-based file event notifications
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Kernel API added by: Amy Griffis <amy.griffis@hp.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/writeback.h>
+#include <linux/inotify.h>
+
+static atomic_t inotify_cookie;
+
+/*
+ * Lock ordering:
+ *
+ * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
+ * iprune_mutex (synchronize shrink_icache_memory())
+ * 	inode_lock (protects the super_block->s_inodes list)
+ * 	inode->inotify_mutex (protects inode->inotify_watches and watches->i_list)
+ * 		inotify_handle->mutex (protects inotify_handle and watches->h_list)
+ *
+ * The inode->inotify_mutex and inotify_handle->mutex are held during execution
+ * of a caller's event handler.  Thus, the caller must not hold any locks
+ * taken in their event handler while calling any of the published inotify
+ * interfaces.
+ */
+
+/*
+ * Lifetimes of the three main data structures--inotify_handle, inode, and
+ * inotify_watch--are managed by reference count.
+ *
+ * inotify_handle: Lifetime is from inotify_init() to inotify_destroy().
+ * Additional references can bump the count via get_inotify_handle() and drop
+ * the count via put_inotify_handle().
+ *
+ * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch()
+ * to remove_watch_no_event().  Additional references can bump the count via
+ * get_inotify_watch() and drop the count via put_inotify_watch().  The caller
+ * is responsible for the final put after receiving IN_IGNORED, or when using
+ * IN_ONESHOT after receiving the first event.  Inotify does the final put if
+ * inotify_destroy() is called.
+ *
+ * inode: Pinned so long as the inode is associated with a watch, from
+ * inotify_add_watch() to the final put_inotify_watch().
+ */
+
+/*
+ * struct inotify_handle - represents an inotify instance
+ *
+ * This structure is protected by the mutex 'mutex'.
+ */
+struct inotify_handle {
+	struct idr		idr;		/* idr mapping wd -> watch */
+	struct mutex		mutex;		/* protects this structure */
+	struct list_head	watches;	/* list of watches, linked via h_list */
+	atomic_t		count;		/* reference count */
+	u32			last_wd;	/* the last wd allocated */
+	const struct inotify_operations *in_ops; /* inotify caller operations */
+};
+
+static inline void get_inotify_handle(struct inotify_handle *ih)
+{
+	atomic_inc(&ih->count);	/* take a reference; dropped by put_inotify_handle() */
+}
+
+static inline void put_inotify_handle(struct inotify_handle *ih)
+{
+	if (atomic_dec_and_test(&ih->count)) {	/* that was the last reference */
+		idr_destroy(&ih->idr);		/* tear down the wd -> watch idr */
+		kfree(ih);
+	}
+}
+
+/**
+ * get_inotify_watch - take an additional reference on an inotify_watch
+ * @watch: watch whose reference count is incremented
+ */
+void get_inotify_watch(struct inotify_watch *watch)
+{
+	atomic_inc(&watch->count);	/* dropped again by put_inotify_watch() */
+}
+EXPORT_SYMBOL_GPL(get_inotify_watch);
+
+int pin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	spin_lock(&sb_lock);	/* s_count is examined under sb_lock */
+	if (sb->s_count >= S_BIAS) {	/* sb is not yet heading for shutdown */
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		atomic_inc(&watch->count);	/* dropped by unpin_inotify_watch() */
+		return 1;	/* pinned: s_active and watch references held */
+	}
+	spin_unlock(&sb_lock);
+	return 0;	/* not pinned; no references were taken */
+}
+
+/**
+ * put_inotify_watch - decrements the ref count on a given watch.  When the
+ * count reaches zero the watch's inode and handle references are dropped and
+ * the watch itself is freed by the inotify caller via the destroy_watch() op.
+ * @watch: watch to release
+ */
+void put_inotify_watch(struct inotify_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		struct inotify_handle *ih = watch->ih;	/* saved before destroy_watch() */
+
+		iput(watch->inode);
+		ih->in_ops->destroy_watch(watch);
+		put_inotify_handle(ih);
+	}
+}
+EXPORT_SYMBOL_GPL(put_inotify_watch);
+
+void unpin_inotify_watch(struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;	/* read before the put below */
+	put_inotify_watch(watch);	/* undo the watch ref from pin_inotify_watch() */
+	deactivate_super(sb);		/* undo the s_active ref from pin_inotify_watch() */
+}
+
+/*
+ * inotify_handle_get_wd - allocate the next wd for @watch on the given handle
+ *
+ * Callers must hold ih->mutex.  This function can sleep.
+ */
+static int inotify_handle_get_wd(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
+{
+	int ret;
+
+	do {
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+			return -ENOSPC;	/* idr_pre_get() failed */
+		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
+	} while (ret == -EAGAIN);	/* retry after another idr_pre_get() */
+
+	if (likely(!ret))
+		ih->last_wd = watch->wd;	/* next allocation starts above this wd */
+
+	return ret;	/* 0 with watch->wd recorded in last_wd, else the idr error */
+}
+
+/*
+ * inotify_inode_watched - returns nonzero if there are watches on this inode
+ * and zero otherwise.  This is called locklessly; we do not care if we race.
+ */
+static inline int inotify_inode_watched(struct inode *inode)
+{
+	return !list_empty(&inode->inotify_watches);
+}
+
+/*
+ * Sync the DCACHE_INOTIFY_PARENT_WATCHED flag of @inode's child dentries with
+ * @watched.  Negative dentries are skipped; their flag should always be clear.
+ */
+static void set_dentry_child_flags(struct inode *inode, int watched)
+{
+	struct dentry *alias;
+
+	spin_lock(&dcache_lock);
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode)	/* negative dentry */
+				continue;
+
+			spin_lock(&child->d_lock);	/* d_flags is updated under d_lock */
+			if (watched)
+				child->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+			else
+				child->d_flags &=~DCACHE_INOTIFY_PARENT_WATCHED;
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/*
+ * inode_find_handle - find the watch associated with the given inode and
+ * handle
+ *
+ * Callers must hold inode->inotify_mutex.
+ */
+static struct inotify_watch *inode_find_handle(struct inode *inode,
+					       struct inotify_handle *ih)
+{
+	struct inotify_watch *watch;
+
+	list_for_each_entry(watch, &inode->inotify_watches, i_list) {
+		if (watch->ih == ih)
+			return watch;
+	}
+
+	return NULL;	/* this handle has no watch on the inode */
+}
+
+/*
+ * remove_watch_no_event - remove watch without the IN_IGNORED event.
+ * Unlinks the watch only; no reference on it is dropped here.
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
+ */
+static void remove_watch_no_event(struct inotify_watch *watch,
+				  struct inotify_handle *ih)
+{
+	list_del(&watch->i_list);	/* off the inode's watch list */
+	list_del(&watch->h_list);	/* off the handle's watch list */
+
+	if (!inotify_inode_watched(watch->inode))	/* that was the last watch */
+		set_dentry_child_flags(watch->inode, 0);
+
+	idr_remove(&ih->idr, watch->wd);	/* wd no longer maps to this watch */
+}
+
+/**
+ * inotify_remove_watch_locked - Remove a watch from both the handle and the
+ * inode, then send the IN_IGNORED event signifying that the inode is no longer
+ * watched.  May be invoked from a caller's event handler.
+ * @ih: inotify handle associated with watch
+ * @watch: watch to remove
+ *
+ * Callers must hold both inode->inotify_mutex and ih->mutex.
+ */
+void inotify_remove_watch_locked(struct inotify_handle *ih,
+				 struct inotify_watch *watch)
+{
+	remove_watch_no_event(watch, ih);	/* unlink first ... */
+	ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(inotify_remove_watch_locked);
+
+/* Kernel API for producing events */
+
+/*
+ * inotify_d_instantiate - flag a new positive dentry whose parent is watched
+ */
+void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
+{
+	struct dentry *parent;
+
+	if (!inode)	/* negative dentry: the flag is left untouched */
+		return;
+
+	spin_lock(&entry->d_lock);	/* keeps d_move() away from d_parent */
+	parent = entry->d_parent;
+	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	spin_unlock(&entry->d_lock);
+}
+
+/*
+ * inotify_d_move - dcache entry has been moved; resync its parent-watched flag
+ */
+void inotify_d_move(struct dentry *entry)
+{
+	struct dentry *parent;
+
+	parent = entry->d_parent;	/* NOTE(review): no d_lock taken here - confirm the caller holds it */
+	if (inotify_inode_watched(parent->d_inode))
+		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
+	else
+		entry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED;
+}
+
+/**
+ * inotify_inode_queue_event - queue an event to all watches on this inode
+ * @inode: inode event is originating from
+ * @mask: event mask describing this event
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ * @n_inode: inode associated with name
+ */
+void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
+			       const char *name, struct inode *n_inode)
+{
+	struct inotify_watch *watch, *next;
+
+	if (!inotify_inode_watched(inode))	/* lockless check, no watches seen */
+		return;
+
+	mutex_lock(&inode->inotify_mutex);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		u32 watch_mask = watch->mask;
+		if (watch_mask & mask) {	/* this watch wants the event */
+			struct inotify_handle *ih= watch->ih;
+			mutex_lock(&ih->mutex);
+			if (watch_mask & IN_ONESHOT)	/* unlink before delivery */
+				remove_watch_no_event(watch, ih);
+			ih->in_ops->handle_event(watch, watch->wd, mask, cookie,
+						 name, n_inode);
+			mutex_unlock(&ih->mutex);
+		}
+	}
+	mutex_unlock(&inode->inotify_mutex);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
+
+/**
+ * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
+ * @dentry: the dentry in question, we queue against this dentry's parent
+ * @mask: event mask describing this event
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ */
+void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
+				       u32 cookie, const char *name)
+{
+	struct dentry *parent;
+	struct inode *inode;
+
+	if (!(dentry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED))	/* unlocked test */
+		return;
+
+	spin_lock(&dentry->d_lock);	/* keeps d_move() away from d_parent */
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+
+	if (inotify_inode_watched(inode)) {
+		dget(parent);	/* hold the parent once d_lock is dropped */
+		spin_unlock(&dentry->d_lock);
+		inotify_inode_queue_event(inode, mask, cookie, name,
+					  dentry->d_inode);
+		dput(parent);
+	} else
+		spin_unlock(&dentry->d_lock);
+}
+EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
+
+/**
+ * inotify_get_cookie - return the next value of the global cookie counter.
+ */
+u32 inotify_get_cookie(void)
+{
+	return atomic_inc_return(&inotify_cookie);	/* used to synchronize events */
+}
+EXPORT_SYMBOL_GPL(inotify_get_cookie);
+
+/**
+ * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
+ * We temporarily drop inode_lock, however, and CAN block.
+ */
+void inotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inotify_watch *watch, *next_w;
+		struct inode *need_iput_tmp;
+		struct list_head *watches;
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
+		 * I_WILL_FREE which is fine because by that point the inode
+		 * cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
+			continue;
+
+		need_iput_tmp = need_iput;	/* reference taken on the previous pass */
+		need_iput = NULL;
+		/* In case inotify_remove_watch_locked() drops a reference. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;	/* already pinned as last pass's next_i */
+		/* In case the dropping of a reference would nuke next_i. */
+		if ((&next_i->i_sb_list != list) &&
+				atomic_read(&next_i->i_count) &&
+				!(next_i->i_state & (I_CLEAR | I_FREEING |
+					I_WILL_FREE))) {
+			__iget(next_i);
+			need_iput = next_i;
+		}
+
+		/*
+		 * We can safely drop inode_lock here because we hold
+		 * references on both inode and next_i.  Also no new inodes
+		 * will be added since the umount has begun.  Finally,
+		 * iprune_mutex keeps shrink_icache_memory() away.
+		 */
+		spin_unlock(&inode_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* for each watch, send IN_UNMOUNT and then remove it */
+		mutex_lock(&inode->inotify_mutex);
+		watches = &inode->inotify_watches;
+		list_for_each_entry_safe(watch, next_w, watches, i_list) {
+			struct inotify_handle *ih= watch->ih;
+			get_inotify_watch(watch);	/* keep watch across the handlers */
+			mutex_lock(&ih->mutex);
+			ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0,
+						 NULL, NULL);
+			inotify_remove_watch_locked(ih, watch);
+			mutex_unlock(&ih->mutex);
+			put_inotify_watch(watch);
+		}
+		mutex_unlock(&inode->inotify_mutex);
+		iput(inode);		/* drop the reference that pinned inode while unlocked */
+
+		spin_lock(&inode_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
+
+/**
+ * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
+ * @inode: inode that is about to be removed
+ */
+void inotify_inode_is_dead(struct inode *inode)
+{
+	struct inotify_watch *watch, *next;
+
+	mutex_lock(&inode->inotify_mutex);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		struct inotify_handle *ih = watch->ih;
+		mutex_lock(&ih->mutex);	/* inode mutex first, then handle mutex */
+		inotify_remove_watch_locked(ih, watch);	/* unlinks, sends IN_IGNORED */
+		mutex_unlock(&ih->mutex);
+	}
+	mutex_unlock(&inode->inotify_mutex);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
+
+/* Kernel Consumer API */
+
+/**
+ * inotify_init - allocate and initialize an inotify instance
+ * @ops: caller's inotify operations
+ */
+struct inotify_handle *inotify_init(const struct inotify_operations *ops)
+{
+	struct inotify_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
+
+	if (unlikely(!handle))
+		return ERR_PTR(-ENOMEM);
+
+	handle->in_ops = ops;
+	handle->last_wd = 0;
+	idr_init(&handle->idr);
+	INIT_LIST_HEAD(&handle->watches);
+	mutex_init(&handle->mutex);
+	/* start at zero, then take the reference inotify_destroy() will put */
+	atomic_set(&handle->count, 0);
+	get_inotify_handle(handle);
+
+	return handle;
+}
+EXPORT_SYMBOL_GPL(inotify_init);
+
+/**
+ * inotify_init_watch - initialize an inotify watch
+ * @watch: caller-allocated watch to initialize
+ */
+void inotify_init_watch(struct inotify_watch *watch)
+{
+	INIT_LIST_HEAD(&watch->h_list);	/* link used on the handle's watch list */
+	INIT_LIST_HEAD(&watch->i_list);	/* link used on the inode's watch list */
+	atomic_set(&watch->count, 0);
+	get_inotify_watch(watch); /* initial get */
+}
+EXPORT_SYMBOL_GPL(inotify_init_watch);
+
+/*
+ * Watch removals suck violently.  To kick the watch out we need (in this
+ * order) inode->inotify_mutex and ih->mutex.  That's fine if we have
+ * a hold on inode; however, for all other cases we need to make damn sure
+ * we don't race with umount.  We can *NOT* just grab a reference to a
+ * watch - inotify_unmount_inodes() will happily sail past it and we'll end
+ * with reference to inode potentially outliving its superblock.  Ideally
+ * we just want to grab an active reference to superblock if we can; that
+ * will make sure we won't go into inotify_unmount_inodes() until we are
+ * done.  Cleanup is just deactivate_super().  However, that leaves a messy
+ * case - what if we *are* racing with umount() and active references to
+ * superblock can't be acquired anymore?  We can bump ->s_count, grab
+ * ->s_umount, which will almost certainly wait until the superblock is shut
+ * down and the watch in question is pining for fjords.  That's fine, but
+ * there is a problem - we might have hit the window between ->s_active
+ * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
+ * is past the point of no return and is heading for shutdown) and the
+ * moment when deactivate_super() acquires ->s_umount.  We could just do
+ * drop_super() yield() and retry, but that's rather antisocial and this
+ * stuff is luser-triggerable.  OTOH, having grabbed ->s_umount and having
+ * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
+ * that we won't race with inotify_unmount_inodes().  So we could grab a
+ * reference to watch and do the rest as above, just with drop_super() instead
+ * of deactivate_super(), right?  Wrong.  We had to drop ih->mutex before we
+ * could grab ->s_umount.  So the watch could've been gone already.
+ *
+ * That still can be dealt with - we need to save watch->wd, do idr_find()
+ * and compare its result with our pointer.  If they match, we either have
+ * the damn thing still alive or we'd lost not one but two races at once,
+ * the watch had been killed and a new one got created with the same ->wd
+ * at the same address.  That couldn't have happened in inotify_destroy(),
+ * but inotify_rm_wd() could run into that.  Still, "new one got created"
+ * is not a problem - we have every right to kill it or leave it alone,
+ * whatever's more convenient.
+ *
+ * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
+ * "grab it and kill it" check.  If it's been our original watch, we are
+ * fine, if it's a newcomer - nevermind, just pretend that we'd won the
+ * race and kill the fscker anyway; we are safe since we know that its
+ * superblock won't be going away.
+ *
+ * And yes, this is far beyond mere "not very pretty"; so's the entire
+ * concept of inotify to start with.
+ */
+
+/**
+ * pin_to_kill - pin the watch down for removal
+ * @ih: inotify handle
+ * @watch: watch to kill
+ *
+ * Called with ih->mutex held, drops it.  Possible return values:
+ * 0 - nothing to do, it has died
+ * 1 - remove it, drop the reference and deactivate_super()
+ * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
+ * that variant, since it involved a lot of PITA, but that's the best that
+ * could've been done.
+ */
+static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
+{
+	struct super_block *sb = watch->inode->i_sb;
+	s32 wd = watch->wd;	/* saved: the watch may die once ih->mutex is dropped */
+
+	spin_lock(&sb_lock);
+	if (sb->s_count >= S_BIAS) {	/* sb is not yet heading for shutdown */
+		atomic_inc(&sb->s_active);
+		spin_unlock(&sb_lock);
+		get_inotify_watch(watch);
+		mutex_unlock(&ih->mutex);
+		return 1;	/* the best outcome */
+	}
+	sb->s_count++;	/* passive reference, released via drop_super() */
+	spin_unlock(&sb_lock);
+	mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
+	down_read(&sb->s_umount);
+	if (likely(!sb->s_root)) {
+		/* fs is already shut down; the watch is dead */
+		drop_super(sb);
+		return 0;
+	}
+	/* raced with the final deactivate_super() */
+	mutex_lock(&ih->mutex);
+	if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
+		/* the watch is dead */
+		mutex_unlock(&ih->mutex);
+		drop_super(sb);
+		return 0;
+	}
+	/* still alive or freed and reused with the same sb and wd; kill */
+	get_inotify_watch(watch);
+	mutex_unlock(&ih->mutex);
+	return 2;	/* caller finishes with drop_super() via unpin_and_kill() */
+}
+
+static void unpin_and_kill(struct inotify_watch *watch, int how)
+{
+	struct super_block *sb = watch->inode->i_sb;	/* read before the put below */
+	put_inotify_watch(watch);	/* drop the reference taken by pin_to_kill() */
+	switch (how) {	/* how is pin_to_kill()'s return value */
+	case 1:
+		deactivate_super(sb);	/* pinned through s_active */
+		break;
+	case 2:
+		drop_super(sb);		/* pinned through s_count and s_umount */
+	}
+}
+
+/**
+ * inotify_destroy - clean up and destroy an inotify instance
+ * @ih: inotify handle
+ */
+void inotify_destroy(struct inotify_handle *ih)
+{
+	/*
+	 * Destroy all of the watches for this handle. Unfortunately, not very
+	 * pretty.  We cannot do a simple iteration over the list, because we
+	 * do not know the inode until we iterate to the watch.  But we need to
+	 * hold inode->inotify_mutex before ih->mutex.  The following works.
+	 *
+	 * AV: it had to become even uglier to start working ;-/
+	 */
+	while (1) {
+		struct inotify_watch *watch;
+		struct list_head *watches;
+		struct inode *inode;
+		int how;
+
+		mutex_lock(&ih->mutex);
+		watches = &ih->watches;
+		if (list_empty(watches)) {
+			mutex_unlock(&ih->mutex);
+			break;
+		}
+		watch = list_first_entry(watches, struct inotify_watch, h_list);
+		/* pin_to_kill() drops ih->mutex; 0 means the watch already died */
+		how = pin_to_kill(ih, watch);
+		if (!how)
+			continue;
+
+		/* retake the locks in order: inode->inotify_mutex, then ih->mutex */
+		inode = watch->inode;
+		mutex_lock(&inode->inotify_mutex);
+		mutex_lock(&ih->mutex);
+
+		/* make sure we didn't race with another list removal */
+		if (likely(idr_find(&ih->idr, watch->wd))) {
+			remove_watch_no_event(watch, ih);
+			put_inotify_watch(watch);
+		}
+
+		mutex_unlock(&ih->mutex);
+		mutex_unlock(&inode->inotify_mutex);
+		unpin_and_kill(watch, how);
+	}
+
+	/* free this handle: the put matching the get in inotify_init() */
+	put_inotify_handle(ih);
+}
+EXPORT_SYMBOL_GPL(inotify_destroy);
+
+/**
+ * inotify_find_watch - find an existing watch for an (ih,inode) pair
+ * @ih: inotify handle
+ * @inode: inode to watch
+ * @watchp: set to the existing inotify_watch, if one is found
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+		       struct inotify_watch **watchp)
+{
+	struct inotify_watch *old;
+	int ret = -ENOENT;	/* returned when no watch is found */
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	old = inode_find_handle(inode, ih);
+	if (unlikely(old)) {
+		get_inotify_watch(old); /* caller must put watch */
+		*watchp = old;
+		ret = old->wd;	/* found: return its watch descriptor */
+	}
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_watch);
+
+/**
+ * inotify_find_update_watch - find and update the mask of an existing watch
+ * @ih: inotify handle
+ * @inode: inode's watch to update
+ * @mask: mask of events to watch; IN_MASK_ADD ORs it into the current mask
+ *
+ * Caller must pin given inode (via nameidata).
+ */
+s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode,
+			      u32 mask)
+{
+	struct inotify_watch *old;
+	int mask_add = 0;
+	int ret;
+
+	if (mask & IN_MASK_ADD)	/* remembered now; the bit is masked off below */
+		mask_add = 1;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/*
+	 * Handle the case of re-adding a watch on an (inode,ih) pair that we
+	 * are already watching.  We just update the mask and return its wd.
+	 */
+	old = inode_find_handle(inode, ih);
+	if (unlikely(!old)) {
+		ret = -ENOENT;	/* nothing to update */
+		goto out;
+	}
+
+	if (mask_add)
+		old->mask |= mask;
+	else
+		old->mask = mask;
+	ret = old->wd;
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_find_update_watch);
+
+/**
+ * inotify_add_watch - add a watch to an inotify instance
+ * @ih: inotify handle
+ * @watch: caller allocated watch structure
+ * @inode: inode to watch
+ * @mask: mask of events to watch
+ *
+ * Caller must pin given inode (via nameidata).
+ * Caller must ensure it only calls inotify_add_watch() once per watch.
+ * Calls inotify_handle_get_wd() so may sleep.
+ */
+s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch,
+		      struct inode *inode, u32 mask)
+{
+	int ret = 0;
+	int newly_watched;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS | IN_ONESHOT;
+	if (unlikely(!mask))
+		return -EINVAL;
+	watch->mask = mask;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, watch);
+	if (unlikely(ret))
+		goto out;	/* no wd allocated; nothing to undo */
+	ret = watch->wd;	/* success returns the new watch descriptor */
+
+	/* save a reference to handle and bump the count to make it official */
+	get_inotify_handle(ih);
+	watch->ih = ih;
+
+	/*
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
+	 */
+	watch->inode = igrab(inode);	/* NOTE(review): result is not checked */
+
+	/* Add the watch to the handle's and the inode's list */
+	newly_watched = !inotify_inode_watched(inode);
+	list_add(&watch->h_list, &ih->watches);
+	list_add(&watch->i_list, &inode->inotify_watches);
+	/*
+	 * Set child flags _after_ adding the watch, so there is no race
+	 * windows where newly instantiated children could miss their parent's
+	 * watched flag.
+	 */
+	if (newly_watched)
+		set_dentry_child_flags(inode, 1);
+
+out:
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(inotify_add_watch);
+
+/**
+ * inotify_clone_watch - put the watch next to existing one
+ * @old: already installed watch
+ * @new: new watch
+ *
+ * Caller must hold the inotify_mutex of inode we are dealing with;
+ * it is expected to remove the old watch before unlocking the inode.
+ */
+s32 inotify_clone_watch(struct inotify_watch *old, struct inotify_watch *new)
+{
+	struct inotify_handle *ih = old->ih;
+	int ret = 0;
+
+	new->mask = old->mask;
+	new->ih = ih;	/* NOTE(review): stays set even if wd allocation fails below */
+
+	mutex_lock(&ih->mutex);
+
+	/* Initialize a new watch */
+	ret = inotify_handle_get_wd(ih, new);
+	if (unlikely(ret))
+		goto out;
+	ret = new->wd;	/* success returns the new watch descriptor */
+
+	get_inotify_handle(ih);	/* the new watch holds its own handle reference */
+
+	new->inode = igrab(old->inode);
+
+	list_add(&new->h_list, &ih->watches);
+	list_add(&new->i_list, &old->inode->inotify_watches);
+out:
+	mutex_unlock(&ih->mutex);
+	return ret;
+}
+
+void inotify_evict_watch(struct inotify_watch *watch)
+{
+	get_inotify_watch(watch);	/* no matching put in this function */
+	mutex_lock(&watch->ih->mutex);	/* presumably inode->inotify_mutex is already held - confirm */
+	inotify_remove_watch_locked(watch->ih, watch);	/* unlink and send IN_IGNORED */
+	mutex_unlock(&watch->ih->mutex);
+}
+
+/**
+ * inotify_rm_wd - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @wd: watch descriptor to remove
+ *
+ * Can sleep.  Returns -EINVAL if @wd is not in the handle's idr, otherwise 0.
+ */
+int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
+{
+	struct inotify_watch *watch;
+	struct inode *inode;
+	int how;
+
+	mutex_lock(&ih->mutex);
+	watch = idr_find(&ih->idr, wd);
+	if (unlikely(!watch)) {
+		mutex_unlock(&ih->mutex);
+		return -EINVAL;
+	}
+	/* pin_to_kill() drops ih->mutex; 0 means the watch has already died */
+	how = pin_to_kill(ih, watch);
+	if (!how)
+		return 0;
+
+	/* retake the locks in order: inode->inotify_mutex, then ih->mutex */
+	inode = watch->inode;
+
+	mutex_lock(&inode->inotify_mutex);
+	mutex_lock(&ih->mutex);
+
+	/* make sure that we did not race */
+	if (likely(idr_find(&ih->idr, wd) == watch))
+		inotify_remove_watch_locked(ih, watch);
+
+	mutex_unlock(&ih->mutex);
+	mutex_unlock(&inode->inotify_mutex);
+	unpin_and_kill(watch, how);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(inotify_rm_wd);
+
+/**
+ * inotify_rm_watch - remove a watch from an inotify instance
+ * @ih: inotify handle
+ * @watch: watch to remove
+ *
+ * Can sleep.  Thin wrapper: removal is done by wd through inotify_rm_wd().
+ */
+int inotify_rm_watch(struct inotify_handle *ih,
+		     struct inotify_watch *watch)
+{
+	return inotify_rm_wd(ih, watch->wd);
+}
+EXPORT_SYMBOL_GPL(inotify_rm_watch);
+
+/*
+ * inotify_setup - core initialization function; resets the cookie counter
+ */
+static int __init inotify_setup(void)
+{
+	atomic_set(&inotify_cookie, 0);	/* counter behind inotify_get_cookie() */
+
+	return 0;
+}
+
+module_init(inotify_setup);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
new file mode 100644
index 00000000000..400f8064a54
--- /dev/null
+++ b/fs/notify/inotify/inotify_user.c
@@ -0,0 +1,778 @@
+/*
+ * fs/notify/inotify/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/inotify.h>
+#include <linux/syscalls.h>
+#include <linux/magic.h>
+
+#include <asm/ioctls.h>
+
+static struct kmem_cache *watch_cachep __read_mostly;
+static struct kmem_cache *event_cachep __read_mostly;
+
+static struct vfsmount *inotify_mnt __read_mostly;
+
+/* these are configurable via /proc/sys/fs/inotify/ */
+static int inotify_max_user_instances __read_mostly;
+static int inotify_max_user_watches __read_mostly;
+static int inotify_max_queued_events __read_mostly;
+
+/*
+ * Lock ordering:
+ *
+ * inotify_dev->up_mutex (ensures we don't re-add the same watch)
+ * 	inode->inotify_mutex (protects inode's watch list)
+ * 		inotify_handle->mutex (protects inotify_handle's watch list)
+ * 			inotify_dev->ev_mutex (protects device's event queue)
+ */
+
+/*
+ * Lifetimes of the main data structures:
+ *
+ * inotify_device: Lifetime is managed by reference count, from
+ * sys_inotify_init() until release.  Additional references can bump the count
+ * via get_inotify_dev() and drop the count via put_inotify_dev().
+ *
+ * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
+ * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
+ * first event, or to inotify_destroy().
+ */
+
+/*
+ * struct inotify_device - represents an inotify instance
+ *
+ * The event queue is protected by 'ev_mutex'; watch updates by 'up_mutex'.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* wait queue for i/o */
+	struct mutex		ev_mutex;	/* protects event queue */
+	struct mutex		up_mutex;	/* synchronizes watch updates */
+	struct list_head 	events;		/* list of queued events */
+	struct user_struct	*user;		/* user who opened this dev */
+	struct inotify_handle	*ih;		/* inotify handle */
+	struct fasync_struct    *fa;            /* async notification */
+	atomic_t		count;		/* reference count */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked and all events that can fit in the
+ * buffer are returned.
+ *
+ * Protected by dev->ev_mutex of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* filename, if any */
+};
+
+/*
+ * struct inotify_user_watch - our version of an inotify_watch, we add
+ * a reference to the associated inotify_device.
+ */
+struct inotify_user_watch {
+	struct inotify_device	*dev;	/* associated device */
+	struct inotify_watch	wdata;	/* inotify watch data */
+};
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (atomic_dec_and_test(&dev->count)) {
+		atomic_dec(&dev->user->inotify_devs);
+		free_uid(dev->user);
+		kfree(dev);
+	}
+}
+
+/*
+ * free_inotify_user_watch - cleans up the watch and its references
+ */
+static void free_inotify_user_watch(struct inotify_watch *w)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	atomic_dec(&dev->user->inotify_watches);
+	put_inotify_dev(dev);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - create a new kernel event with the given parameters
+ *
+ * This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	struct inotify_kernel_event *kevent;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* we hand this out to user-space, so zero it just in case */
+	memset(&kevent->event, 0, sizeof(struct inotify_event));
+
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	if (name) {
+		size_t len, rem, event_size = sizeof(struct inotify_event);
+
+		/*
+		 * We need to pad the filename so as to properly align an
+		 * array of inotify_event structures.  Because the structure is
+		 * small and the common case is a small filename, we just round
+		 * up to the next multiple of the structure's sizeof.  This is
+		 * simple and safe for all architectures.
+		 */
+		len = strlen(name) + 1;
+		rem = event_size - len;
+		if (len > event_size) {
+			rem = event_size - (len % event_size);
+			if (len % event_size == 0)
+				rem = 0;
+		}
+
+		kevent->name = kmalloc(len + rem, GFP_KERNEL);
+		if (unlikely(!kevent->name)) {
+			kmem_cache_free(event_cachep, kevent);
+			return NULL;
+		}
+		memcpy(kevent->name, name, len);
+		if (rem)
+			memset(kevent->name + len, 0, rem);
+		kevent->event.len = len + rem;
+	} else {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+	}
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the next event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_get_last_event - return the last event in the given dev's queue
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_last_event(struct inotify_device *dev)
+{
+	if (list_empty(&dev->events))
+		return NULL;
+	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - event handler registered with core inotify, adds
+ * a new event to the given device
+ *
+ * Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
+				    u32 cookie, const char *name,
+				    struct inode *ignored)
+{
+	struct inotify_user_watch *watch;
+	struct inotify_device *dev;
+	struct inotify_kernel_event *kevent, *last;
+
+	watch = container_of(w, struct inotify_user_watch, wdata);
+	dev = watch->dev;
+
+	mutex_lock(&dev->ev_mutex);
+
+	/* we can safely put the watch as we don't reference it while
+	 * generating the event
+	 */
+	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
+		put_inotify_watch(w); /* final put */
+
+	/* coalescing: drop this event if it is a dupe of the previous */
+	last = inotify_dev_get_last_event(dev);
+	if (last && last->event.mask == mask && last->event.wd == wd &&
+			last->event.cookie == cookie) {
+		const char *lastname = last->name;
+
+		if (!name && !lastname)
+			goto out;
+		if (name && lastname && !strcmp(lastname, name))
+			goto out;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		goto out;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		goto out;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+	kill_fasync(&dev->fa, SIGIO, POLL_IN);
+
+out:
+	mutex_unlock(&dev->ev_mutex);
+}
+
+/*
+ * remove_kevent - cleans up the given kevent
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	list_del(&kevent->list);
+
+	dev->event_count--;
+	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
+
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy an event on the given device
+ *
+ * Caller must hold dev->ev_mutex.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (!list_empty(&dev->events)) {
+		struct inotify_kernel_event *kevent;
+		kevent = inotify_dev_get_event(dev);
+		remove_kevent(dev, kevent);
+		free_kevent(kevent);
+	}
+}
+
+/*
+ * find_inode - resolve a user-given path to a specific inode
+ */
+static int find_inode(const char __user *dirname, struct path *path,
+		      unsigned flags)
+{
+	int error;
+
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error)
+		path_put(path);
+	return error;
+}
+
+/*
+ * create_watch - creates a watch on the given device.
+ *
+ * Callers must hold dev->up_mutex.
+ */
+static int create_watch(struct inotify_device *dev, struct inode *inode,
+			u32 mask)
+{
+	struct inotify_user_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >=
+			inotify_max_user_watches)
+		return -ENOSPC;
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return -ENOMEM;
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	inotify_init_watch(&watch->wdata);
+	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
+	if (ret < 0)
+		free_inotify_user_watch(&watch->wdata);
+
+	return ret;
+}
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int ret = 0;
+
+	poll_wait(file, &dev->wq, wait);
+	mutex_lock(&dev->ev_mutex);
+	if (!list_empty(&dev->events))
+		ret = POLLIN | POLLRDNORM;
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		mutex_lock(&dev->ev_mutex);
+		if (!list_empty(&dev->events)) {
+			ret = 0;
+			break;
+		}
+		mutex_unlock(&dev->ev_mutex);
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	while (1) {
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count) {
+			if (ret == 0 && count > 0) {
+				/*
+				 * could not get a single event because we
+				 * didn't have enough buffer space.
+				 */
+				ret = -EINVAL;
+			}
+			break;
+		}
+		remove_kevent(dev, kevent);
+
+		/*
+		 * Must perform the copy_to_user outside the mutex in order
+		 * to avoid a lock order reversal with mmap_sem.
+		 */
+		mutex_unlock(&dev->ev_mutex);
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		free_kevent(kevent);
+
+		mutex_lock(&dev->ev_mutex);
+	}
+	mutex_unlock(&dev->ev_mutex);
+
+	return ret;
+}
+
+static int inotify_fasync(int fd, struct file *file, int on)
+{
+	struct inotify_device *dev = file->private_data;
+
+	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	inotify_destroy(dev->ih);
+
+	/* destroy all of the events on this device */
+	mutex_lock(&dev->ev_mutex);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	mutex_unlock(&dev->ev_mutex);
+
+	/* free this device: the put matching the get in sys_inotify_init1() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev;
+	void __user *p;
+	int ret = -ENOTTY;
+
+	dev = file->private_data;
+	p = (void __user *) arg;
+
+	switch (cmd) {
+	case FIONREAD:
+		ret = put_user(dev->queue_size, (int __user *) p);
+		break;
+	}
+
+	return ret;
+}
+
+static const struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.fasync         = inotify_fasync,
+	.release        = inotify_release,
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,
+};
+
+static const struct inotify_operations inotify_user_ops = {
+	.handle_event	= inotify_dev_queue_event,
+	.destroy_watch	= free_inotify_user_watch,
+};
+
+asmlinkage long sys_inotify_init1(int flags)
+{
+	struct inotify_device *dev;
+	struct inotify_handle *ih;
+	struct user_struct *user;
+	struct file *filp;
+	int fd, ret;
+
+	/* Check the IN_* constants for consistency.  */
+	BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+	BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+	if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+		return -EINVAL;
+
+	fd = get_unused_fd_flags(flags & O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	filp = get_empty_filp();
+	if (!filp) {
+		ret = -ENFILE;
+		goto out_put_fd;
+	}
+
+	user = get_current_user();
+	if (unlikely(atomic_read(&user->inotify_devs) >=
+			inotify_max_user_instances)) {
+		ret = -EMFILE;
+		goto out_free_uid;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_free_uid;
+	}
+
+	ih = inotify_init(&inotify_user_ops);
+	if (IS_ERR(ih)) {
+		ret = PTR_ERR(ih);
+		goto out_free_dev;
+	}
+	dev->ih = ih;
+	dev->fa = NULL;
+
+	filp->f_op = &inotify_fops;
+	filp->f_path.mnt = mntget(inotify_mnt);
+	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+	filp->private_data = dev;
+
+	INIT_LIST_HEAD(&dev->events);
+	init_waitqueue_head(&dev->wq);
+	mutex_init(&dev->ev_mutex);
+	mutex_init(&dev->up_mutex);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+	fd_install(fd, filp);
+
+	return fd;
+out_free_dev:
+	kfree(dev);
+out_free_uid:
+	free_uid(user);
+	put_filp(filp);
+out_put_fd:
+	put_unused_fd(fd);
+	return ret;
+}
+
+asmlinkage long sys_inotify_init(void)
+{
+	return sys_inotify_init1(0);
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
+{
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct path path;
+	struct file *filp;
+	int ret, fput_needed;
+	unsigned flags = 0;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto fput_and_out;
+	}
+
+	if (!(mask & IN_DONT_FOLLOW))
+		flags |= LOOKUP_FOLLOW;
+	if (mask & IN_ONLYDIR)
+		flags |= LOOKUP_DIRECTORY;
+
+	ret = find_inode(pathname, &path, flags);
+	if (unlikely(ret))
+		goto fput_and_out;
+
+	/* inode held in place by reference to path; dev by fget on fd */
+	inode = path.dentry->d_inode;
+	dev = filp->private_data;
+
+	mutex_lock(&dev->up_mutex);
+	ret = inotify_find_update_watch(dev->ih, inode, mask);
+	if (ret == -ENOENT)
+		ret = create_watch(dev, inode, mask);
+	mutex_unlock(&dev->up_mutex);
+
+	path_put(&path);
+fput_and_out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	struct inotify_device *dev;
+	int ret, fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(!filp))
+		return -EBADF;
+
+	/* verify that this is indeed an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	dev = filp->private_data;
+
+	/* we free our watch data when we get IN_IGNORED */
+	ret = inotify_rm_wd(dev->ih, wd);
+
+out:
+	fput_light(filp, fput_needed);
+	return ret;
+}
+
+static int
+inotify_get_sb(struct file_system_type *fs_type, int flags,
+	       const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "inotify", NULL,
+			INOTIFYFS_SUPER_MAGIC, mnt);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_user_setup - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * must result in panic().
+ */
+static int __init inotify_user_setup(void)
+{
+	int ret;
+
+	ret = register_filesystem(&inotify_fs_type);
+	if (unlikely(ret))
+		panic("inotify: register_filesystem returned %d!\n", ret);
+
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 16384;
+	inotify_max_user_instances = 128;
+	inotify_max_user_watches = 8192;
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_user_watch),
+					 0, SLAB_PANIC, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL);
+
+	return 0;
+}
+
+module_init(inotify_user_setup);
-- 
cgit v1.2.3


From 261bca86ed4f7f391d1938167624e78da61dcc6b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 01:48:21 -0500
Subject: nfsd/create race fixes, infrastructure

new helpers - insert_inode_locked() and insert_inode_locked4().
Hash new inode, making sure that there's no such inode in icache
already.  If there is and it does not end up unhashed (as would
happen if we have nfsd trying to resolve a bogus fhandle), fail.
Otherwise insert our inode into hash and succeed.

In either case have i_state set to new+locked; cleanup ends up
being simpler with such calling conventions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 098a2443196..7de1cda9248 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1032,6 +1032,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
 
 EXPORT_SYMBOL(iget_locked);
 
+int insert_inode_locked(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	ino_t ino = inode->i_ino;
+	struct hlist_head *head = inode_hashtable + hash(sb, ino);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode_fast(sb, head, ino);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked);
+
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct super_block *sb = inode->i_sb;
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *old;
+
+	inode->i_state |= I_LOCK|I_NEW;
+
+	while (1) {
+		spin_lock(&inode_lock);
+		old = find_inode(sb, head, test, data);
+		if (likely(!old)) {
+			hlist_add_head(&inode->i_hash, head);
+			spin_unlock(&inode_lock);
+			return 0;
+		}
+		__iget(old);
+		spin_unlock(&inode_lock);
+		wait_on_inode(old);
+		if (unlikely(!hlist_unhashed(&old->i_hash))) {
+			iput(old);
+			return -EBUSY;
+		}
+		iput(old);
+	}
+}
+
+EXPORT_SYMBOL(insert_inode_locked4);
+
 /**
  *	__insert_inode_hash - hash an inode
  *	@inode: unhashed inode
-- 
cgit v1.2.3


From 41080b5a240113328c607f22b849f653373db0ce Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 01:52:35 -0500
Subject: nfsd race fixes: ext2

* make ext2_new_inode() put the inode into icache in locked state
* do not unlock until the inode is fully set up; otherwise nfsd
might pick it in half-baked state.
* make sure that ext2_new_inode() does *not* lead to two inodes with the
same inumber hashed at the same time; otherwise a bogus fhandle coming
from nfsd might race with inode creation:

nfsd: iget_locked() creates inode
nfsd: try to read from disk, block on that.
ext2_new_inode(): allocate inode with that inumber
ext2_new_inode(): insert it into icache, set it up and dirty
ext2_write_inode(): get the relevant part of inode table in cache,
set the entry for our inode (and start writing to disk)
nfsd: get CPU again, look into inode table, see nice and sane on-disk
inode, set the in-core inode from it

oops - we have two in-core inodes with the same inumber live in icache,
both used for IO.  Welcome to fs corruption...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/ialloc.c |  6 +++++-
 fs/ext2/namei.c  | 15 ++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add62587..c454d5db28a 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -585,7 +585,10 @@ got:
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 
 	if (DQUOT_ALLOC_INODE(inode)) {
 		err = -EDQUOT;
@@ -612,6 +615,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	return ERR_PTR(err);
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec1..90ea17998a7 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 	int err = ext2_add_link(dentry, inode);
 	if (!err) {
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -170,6 +172,7 @@ out:
 
 out_fail:
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput (inode);
 	goto out;
 }
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
+	int err;
 
 	if (inode->i_nlink >= EXT2_LINK_MAX)
 		return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
 	inode_inc_link_count(inode);
 	atomic_inc(&inode->i_count);
 
-	return ext2_add_nondir(dentry, inode);
+	err = ext2_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
 }
 
 static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		goto out_fail;
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out:
 	return err;
 
 out_fail:
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 out_dir:
 	inode_dec_link_count(dir);
-- 
cgit v1.2.3


From c38012daa7ad902a39a4213ba2b3fe50e81157ea Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:02:50 -0500
Subject: nfsd race fixes: ext3

ext3 analog of the previous patch

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/ialloc.c |  6 +++++-
 fs/ext3/namei.c  | 15 ++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed789..5655fbcbd11 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -579,7 +579,10 @@ got:
 	ext3_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
 		handle->h_sync = 1;
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +630,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	brelse(bitmap_bh);
 	return ERR_PTR(err);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0..297ea8dfac7 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1652,9 +1652,11 @@ static int ext3_add_nondir(handle_t *handle,
 	if (!err) {
 		ext3_mark_inode_dirty(handle, inode);
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	drop_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -1765,6 +1767,7 @@ retry:
 	dir_block = ext3_bread (handle, inode, 0, 1, &err);
 	if (!dir_block) {
 		drop_nlink(inode); /* is this nlink == 0? */
+		unlock_new_inode(inode);
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1792,6 +1795,7 @@ retry:
 	err = ext3_add_entry (handle, dentry, inode);
 	if (err) {
 		inode->i_nlink = 0;
+		unlock_new_inode(inode);
 		ext3_mark_inode_dirty(handle, inode);
 		iput (inode);
 		goto out_stop;
@@ -1800,6 +1804,7 @@ retry:
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out_stop:
 	ext3_journal_stop(handle);
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2174,6 +2179,7 @@ retry:
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			drop_nlink(inode);
+			unlock_new_inode(inode);
 			ext3_mark_inode_dirty(handle, inode);
 			iput (inode);
 			goto out_stop;
@@ -2221,7 +2227,14 @@ retry:
 	inc_nlink(inode);
 	atomic_inc(&inode->i_count);
 
-	err = ext3_add_nondir(handle, dentry, inode);
+	err = ext3_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext3_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
 	ext3_journal_stop(handle);
 	if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-- 
cgit v1.2.3


From 6b38e842bb832a3dbeb17e382404aef3c40ac5f9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:03:31 -0500
Subject: nfsd race fixes: ext4

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/ialloc.c |  6 +++++-
 fs/ext4/namei.c  | 14 +++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace..6e6052879aa 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -826,7 +826,10 @@ got:
 	ext4_set_inode_flags(inode);
 	if (IS_DIRSYNC(inode))
 		handle->h_sync = 1;
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EINVAL;
+		goto fail_drop;
+	}
 	spin_lock(&sbi->s_next_gen_lock);
 	inode->i_generation = sbi->s_next_generation++;
 	spin_unlock(&sbi->s_next_gen_lock);
@@ -881,6 +884,7 @@ fail_drop:
 	DQUOT_DROP(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
+	unlock_new_inode(inode);
 	iput(inode);
 	brelse(bitmap_bh);
 	return ERR_PTR(err);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb79298..da98a9012fa 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1693,9 +1693,11 @@ static int ext4_add_nondir(handle_t *handle,
 	if (!err) {
 		ext4_mark_inode_dirty(handle, inode);
 		d_instantiate(dentry, inode);
+		unlock_new_inode(inode);
 		return 0;
 	}
 	drop_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -1830,6 +1832,7 @@ retry:
 	if (err) {
 out_clear_inode:
 		clear_nlink(inode);
+		unlock_new_inode(inode);
 		ext4_mark_inode_dirty(handle, inode);
 		iput(inode);
 		goto out_stop;
@@ -1838,6 +1841,7 @@ out_clear_inode:
 	ext4_update_dx_flag(dir);
 	ext4_mark_inode_dirty(handle, dir);
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 out_stop:
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -2212,6 +2216,7 @@ retry:
 				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 		if (err) {
 			clear_nlink(inode);
+			unlock_new_inode(inode);
 			ext4_mark_inode_dirty(handle, inode);
 			iput(inode);
 			goto out_stop;
@@ -2262,7 +2267,14 @@ retry:
 	ext4_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
-	err = ext4_add_nondir(handle, dentry, inode);
+	err = ext4_add_entry(handle, dentry, inode);
+	if (!err) {
+		ext4_mark_inode_dirty(handle, inode);
+		d_instantiate(dentry, inode);
+	} else {
+		drop_nlink(inode);
+		iput(inode);
+	}
 	ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
-- 
cgit v1.2.3


From c1eaa26b671299b3ec01d40c6c71ee19a4f81517 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 30 Dec 2008 02:03:58 -0500
Subject: nfsd race fixes: reiserfs

... and the same for reiserfs.  The difference here is that we need
insert_inode_locked4() to match iget5_locked().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/inode.c | 15 ++++++++++-----
 fs/reiserfs/namei.c |  8 ++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449..145c2d3e5e0 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		       struct inode *inode)
 {
 	struct super_block *sb;
+	struct reiserfs_iget_args args;
 	INITIALIZE_PATH(path_to_key);
 	struct cpu_key key;
 	struct item_head ih;
@@ -1780,6 +1781,14 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		err = -ENOMEM;
 		goto out_bad_inode;
 	}
+	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+	if (insert_inode_locked4(inode, args.objectid,
+			     reiserfs_find_actor, &args) < 0) {
+		err = -EINVAL;
+		goto out_bad_inode;
+	}
 	if (old_format_only(sb))
 		/* not a perfect generation count, as object ids can be reused, but 
 		 ** this is as good as reiserfs can do right now.
@@ -1859,13 +1868,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	} else {
 		inode2sd(&sd, inode, inode->i_size);
 	}
-	// these do not go to on-disk stat data
-	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-
 	// store in in-core inode the key of stat data and version all
 	// object items will have (directory items will have old offset
 	// format, other new objects will consist of new items)
-	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
 		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
 	else
@@ -1929,7 +1934,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		reiserfs_mark_inode_private(inode);
 	}
 
-	insert_inode_hash(inode);
 	reiserfs_update_sd(th, inode);
 	reiserfs_check_path(&path_to_key);
 
@@ -1956,6 +1960,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
       out_inserted_sd:
 	inode->i_nlink = 0;
 	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
 
 	/* If we were inheriting an ACL, we need to release the lock so that
 	 * iput doesn't deadlock in reiserfs_delete_xattrs. The locking
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed84..738967f6c8e 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	reiserfs_update_inode_transaction(dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
       out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 		err = journal_end(&th, dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	reiserfs_update_sd(&th, dir);
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, dir->i_sb, jbegin_count);
       out_failed:
 	if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
 		err = journal_end(&th, parent_dir->i_sb, jbegin_count);
 		if (err)
 			retval = err;
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out_failed;
 	}
 
 	d_instantiate(dentry, inode);
+	unlock_new_inode(inode);
 	retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
       out_failed:
 	reiserfs_write_unlock(parent_dir->i_sb);
-- 
cgit v1.2.3


From 1f3403fa640f9f7b135dee79f2d39d01c8ad4a08 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Tue, 30 Dec 2008 22:08:37 -0600
Subject: nfsd race fixes: jfs

jfs version of Al Viro's nfsd race patches

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/jfs_inode.c | 29 +++++++++++++++++++++--------
 fs/jfs/namei.c     | 24 ++++++++++++++++--------
 2 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c53..d4d142c2edd 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	inode = new_inode(sb);
 	if (!inode) {
 		jfs_warn("ialloc: new_inode returned NULL!");
-		return ERR_PTR(-ENOMEM);
+		rc = -ENOMEM;
+		goto fail;
 	}
 
 	jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 		jfs_warn("ialloc: diAlloc returned %d!", rc);
 		if (rc == -EIO)
 			make_bad_inode(inode);
-		iput(inode);
-		return ERR_PTR(rc);
+		goto fail_put;
+	}
+
+	if (insert_inode_locked(inode) < 0) {
+		rc = -EINVAL;
+		goto fail_unlock;
 	}
 
 	inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	 * Allocate inode to quota.
 	 */
 	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
-		inode->i_flags |= S_NOQUOTA;
-		inode->i_nlink = 0;
-		iput(inode);
-		return ERR_PTR(-EDQUOT);
+		rc = -EDQUOT;
+		goto fail_drop;
 	}
 
 	inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	jfs_info("ialloc returns inode = 0x%p\n", inode);
 
 	return inode;
+
+fail_drop:
+	DQUOT_DROP(inode);
+	inode->i_flags |= S_NOQUOTA;
+fail_unlock:
+	inode->i_nlink = 0;
+	unlock_new_inode(inode);
+fail_put:
+	iput(inode);
+fail:
+	return ERR_PTR(rc);
 }
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa..b4de56b851e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	ip->i_fop = &jfs_file_operations;
 	ip->i_mapping->a_ops = &jfs_aops;
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	ip->i_op = &jfs_dir_inode_operations;
 	ip->i_fop = &jfs_dir_operations;
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	/* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		goto out3;
 	}
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out2:
 	free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	jfs_ip->dev = new_encode_dev(rdev);
 	init_special_inode(ip, ip->i_mode, rdev);
 
-	insert_inode_hash(ip);
 	mark_inode_dirty(ip);
 
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (rc) {
 		free_ea_wmap(ip);
 		ip->i_nlink = 0;
+		unlock_new_inode(ip);
 		iput(ip);
-	} else
+	} else {
 		d_instantiate(dentry, ip);
+		unlock_new_inode(ip);
+	}
 
       out1:
 	free_UCSname(&dname);
-- 
cgit v1.2.3


From e623ddb4e940b266adc77ba1cc28a3554aa90e79 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 18 Dec 2008 14:49:36 +0900
Subject: [SCSI] block: fix bio_add_page misuse with rq_map_data

This fixes bio_add_page misuse in bio_copy_user_iov with rq_map_data,
which only sg uses now.

rq_map_data carries page frames for bio_add_pc_page. bio_copy_user_iov
uses bio_add_pc_page with a larger size than PAGE_SIZE. It's clearly
wrong.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/bio.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 711cee10360..356e7423b92 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -815,28 +815,30 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 
 	ret = 0;
 	i = 0;
+	if (map_data)
+		nr_pages = 1 << map_data->page_order;
 	while (len) {
-		unsigned int bytes;
-
-		if (map_data)
-			bytes = 1U << (PAGE_SHIFT + map_data->page_order);
-		else
-			bytes = PAGE_SIZE;
+		unsigned int bytes = PAGE_SIZE;
 
 		if (bytes > len)
 			bytes = len;
 
 		if (map_data) {
-			if (i == map_data->nr_entries) {
+			if (i == map_data->nr_entries * nr_pages) {
 				ret = -ENOMEM;
 				break;
 			}
-			page = map_data->pages[i++];
-		} else
+
+			page = map_data->pages[i / nr_pages];
+			page += (i % nr_pages);
+
+			i++;
+		} else {
 			page = alloc_page(q->bounce_gfp | gfp_mask);
-		if (!page) {
-			ret = -ENOMEM;
-			break;
+			if (!page) {
+				ret = -ENOMEM;
+				break;
+			}
 		}
 
 		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
-- 
cgit v1.2.3


From 56c451f4b583ccdf80c9e676179c9cb49de86745 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 18 Dec 2008 14:49:37 +0900
Subject: [SCSI] block: fix the partial mappings with struct rq_map_data

This fixes bio_copy_user_iov to properly handle the partial mappings
with struct rq_map_data (which only sg uses for now but st and osst
will shortly). It adds the offset member to struct rq_map_data and
changes blk_rq_map_user to update it so that bio_copy_user_iov can add
an appropriate page frame via bio_add_pc_page().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/bio.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 356e7423b92..13be075806b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	int i, ret;
 	int nr_pages = 0;
 	unsigned int len = 0;
+	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
 
 	for (i = 0; i < iov_count; i++) {
 		unsigned long uaddr;
@@ -814,12 +815,16 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	bio->bi_rw |= (!write_to_vm << BIO_RW);
 
 	ret = 0;
-	i = 0;
-	if (map_data)
+
+	if (map_data) {
 		nr_pages = 1 << map_data->page_order;
+		i = map_data->offset / PAGE_SIZE;
+	}
 	while (len) {
 		unsigned int bytes = PAGE_SIZE;
 
+		bytes -= offset;
+
 		if (bytes > len)
 			bytes = len;
 
@@ -841,10 +846,11 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 			}
 		}
 
-		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
 			break;
 
 		len -= bytes;
+		offset = 0;
 	}
 
 	if (ret)
-- 
cgit v1.2.3


From 97ae77a1cd332c7b011d71315c8faabce6840c72 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Thu, 18 Dec 2008 14:49:38 +0900
Subject: [SCSI] block: make blk_rq_map_user take a NULL user-space buffer for
 WRITE

The commit 818827669d85b84241696ffef2de485db46b0b5e (block: make
blk_rq_map_user take a NULL user-space buffer) extended
blk_rq_map_user to accept a NULL user-space buffer with a READ
command. It was necessary to convert sg to use the block layer mapping
API.

This patch extends blk_rq_map_user again for a WRITE command. It is
necessary to convert st and osst drivers to use the block layer
mapping API.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 13be075806b..062299acbcc 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -859,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
 	/*
 	 * success
 	 */
-	if (!write_to_vm) {
+	if (!write_to_vm && (!map_data || !map_data->null_mapped)) {
 		ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
 		if (ret)
 			goto cleanup;
-- 
cgit v1.2.3


From 59e55e6cf86eb472e8373831c4234252916c53ef Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:41:11 +0000
Subject: Remove devpts_root global

Remove the 'devpts_root' global variable and find the root dentry using
the super_block. The super-block can be found from the device inode, using
the new wrapper, pts_sb_from_inode().

Changelog: This patch is based on an earlier patchset from Serge Hallyn
	   and Matt Helsley.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e1..f96e10a109f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -34,7 +34,6 @@ static DEFINE_IDA(allocated_ptys);
 static DEFINE_MUTEX(allocated_ptys_lock);
 
 static struct vfsmount *devpts_mnt;
-static struct dentry *devpts_root;
 
 static struct {
 	int setuid;
@@ -56,6 +55,14 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
+static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+{
+	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+		return inode->i_sb;
+
+	return devpts_mnt->mnt_sb;
+}
+
 static int devpts_remount(struct super_block *sb, int *flags, char *data)
 {
 	char *p;
@@ -142,7 +149,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	inode->i_fop = &simple_dir_operations;
 	inode->i_nlink = 2;
 
-	devpts_root = s->s_root = d_alloc_root(inode);
+	s->s_root = d_alloc_root(inode);
 	if (s->s_root)
 		return 0;
 	
@@ -211,7 +218,9 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	struct tty_driver *driver = tty->driver;
 	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
-	struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct inode *inode = new_inode(sb);
+	struct dentry *root = sb->s_root;
 	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
@@ -231,15 +240,15 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 
 	sprintf(s, "%d", number);
 
-	mutex_lock(&devpts_root->d_inode->i_mutex);
+	mutex_lock(&root->d_inode->i_mutex);
 
-	dentry = d_alloc_name(devpts_root, s);
+	dentry = d_alloc_name(root, s);
 	if (!IS_ERR(dentry)) {
 		d_add(dentry, inode);
-		fsnotify_create(devpts_root->d_inode, dentry);
+		fsnotify_create(root->d_inode, dentry);
 	}
 
-	mutex_unlock(&devpts_root->d_inode->i_mutex);
+	mutex_unlock(&root->d_inode->i_mutex);
 
 	return 0;
 }
@@ -256,11 +265,13 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
 void devpts_pty_kill(struct tty_struct *tty)
 {
 	struct inode *inode = tty->driver_data;
+	struct super_block *sb = pts_sb_from_inode(inode);
+	struct dentry *root = sb->s_root;
 	struct dentry *dentry;
 
 	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-	mutex_lock(&devpts_root->d_inode->i_mutex);
+	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_find_alias(inode);
 	if (dentry && !IS_ERR(dentry)) {
@@ -269,7 +280,7 @@ void devpts_pty_kill(struct tty_struct *tty)
 		dput(dentry);
 	}
 
-	mutex_unlock(&devpts_root->d_inode->i_mutex);
+	mutex_unlock(&root->d_inode->i_mutex);
 }
 
 static int __init init_devpts_fs(void)
-- 
cgit v1.2.3


From e76b7c01e598d2d14ddfdb6ae5c6afe45245d0de Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:41:21 +0000
Subject: Per-mount allocated_ptys

To enable multiple mounts of devpts, 'allocated_ptys' must be a per-mount
variable rather than a global variable.  Move 'allocated_ptys' into the
super_block's s_fs_info.

Changelog[v2]:
	Define and use DEVPTS_SB() wrapper.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 48 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index f96e10a109f..49d879d911b 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -30,7 +30,6 @@
 #define PTMX_MINOR	2
 
 extern int pty_limit;			/* Config limit on Unix98 ptys */
-static DEFINE_IDA(allocated_ptys);
 static DEFINE_MUTEX(allocated_ptys_lock);
 
 static struct vfsmount *devpts_mnt;
@@ -55,6 +54,15 @@ static const match_table_t tokens = {
 	{Opt_err, NULL}
 };
 
+struct pts_fs_info {
+	struct ida allocated_ptys;
+};
+
+static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
 static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 {
 	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
@@ -126,6 +134,19 @@ static const struct super_operations devpts_sops = {
 	.show_options	= devpts_show_options,
 };
 
+static void *new_pts_fs_info(void)
+{
+	struct pts_fs_info *fsi;
+
+	fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+	if (!fsi)
+		return NULL;
+
+	ida_init(&fsi->allocated_ptys);
+
+	return fsi;
+}
+
 static int
 devpts_fill_super(struct super_block *s, void *data, int silent)
 {
@@ -137,9 +158,13 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_op = &devpts_sops;
 	s->s_time_gran = 1;
 
+	s->s_fs_info = new_pts_fs_info();
+	if (!s->s_fs_info)
+		goto fail;
+
 	inode = new_inode(s);
 	if (!inode)
-		goto fail;
+		goto free_fsi;
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_blocks = 0;
@@ -155,6 +180,9 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	
 	printk("devpts: get root dentry failed\n");
 	iput(inode);
+
+free_fsi:
+	kfree(s->s_fs_info);
 fail:
 	return -ENOMEM;
 }
@@ -165,11 +193,19 @@ static int devpts_get_sb(struct file_system_type *fs_type,
 	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
 }
 
+static void devpts_kill_sb(struct super_block *sb)
+{
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+
+	kfree(fsi);
+	kill_anon_super(sb);
+}
+
 static struct file_system_type devpts_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "devpts",
 	.get_sb		= devpts_get_sb,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= devpts_kill_sb,
 };
 
 /*
@@ -179,16 +215,18 @@ static struct file_system_type devpts_fs_type = {
 
 int devpts_new_index(struct inode *ptmx_inode)
 {
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	int index;
 	int ida_ret;
 
 retry:
-	if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) {
+	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) {
 		return -ENOMEM;
 	}
 
 	mutex_lock(&allocated_ptys_lock);
-	ida_ret = ida_get_new(&allocated_ptys, &index);
+	ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
 	if (ida_ret < 0) {
 		mutex_unlock(&allocated_ptys_lock);
 		if (ida_ret == -EAGAIN)
@@ -197,7 +235,7 @@ retry:
 	}
 
 	if (index >= pty_limit) {
-		ida_remove(&allocated_ptys, index);
+		ida_remove(&fsi->allocated_ptys, index);
 		mutex_unlock(&allocated_ptys_lock);
 		return -EIO;
 	}
@@ -207,8 +245,11 @@ retry:
 
 void devpts_kill_index(struct inode *ptmx_inode, int idx)
 {
+	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+
 	mutex_lock(&allocated_ptys_lock);
-	ida_remove(&allocated_ptys, idx);
+	ida_remove(&fsi->allocated_ptys, idx);
 	mutex_unlock(&allocated_ptys_lock);
 }
 
-- 
cgit v1.2.3


From 31af0abbdafb66ad8e27e3df878faec2ebe1132e Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:41:33 +0000
Subject: Per-mount 'config' object

With support for multiple mounts of devpts, the 'config' structure really
represents per-mount options rather than config parameters. Rename 'config'
structure to 'pts_mount_opts' and store it in the super-block.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 49d879d911b..b793e6e3c21 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -34,13 +34,13 @@ static DEFINE_MUTEX(allocated_ptys_lock);
 
 static struct vfsmount *devpts_mnt;
 
-static struct {
+struct pts_mount_opts {
 	int setuid;
 	int setgid;
 	uid_t   uid;
 	gid_t   gid;
 	umode_t mode;
-} config = {.mode = DEVPTS_DEFAULT_MODE};
+};
 
 enum {
 	Opt_uid, Opt_gid, Opt_mode,
@@ -56,6 +56,7 @@ static const match_table_t tokens = {
 
 struct pts_fs_info {
 	struct ida allocated_ptys;
+	struct pts_mount_opts mount_opts;
 };
 
 static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
@@ -74,12 +75,14 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 static int devpts_remount(struct super_block *sb, int *flags, char *data)
 {
 	char *p;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
 
-	config.setuid  = 0;
-	config.setgid  = 0;
-	config.uid     = 0;
-	config.gid     = 0;
-	config.mode    = DEVPTS_DEFAULT_MODE;
+	opts->setuid  = 0;
+	opts->setgid  = 0;
+	opts->uid     = 0;
+	opts->gid     = 0;
+	opts->mode    = DEVPTS_DEFAULT_MODE;
 
 	while ((p = strsep(&data, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
@@ -94,19 +97,19 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 		case Opt_uid:
 			if (match_int(&args[0], &option))
 				return -EINVAL;
-			config.uid = option;
-			config.setuid = 1;
+			opts->uid = option;
+			opts->setuid = 1;
 			break;
 		case Opt_gid:
 			if (match_int(&args[0], &option))
 				return -EINVAL;
-			config.gid = option;
-			config.setgid = 1;
+			opts->gid = option;
+			opts->setgid = 1;
 			break;
 		case Opt_mode:
 			if (match_octal(&args[0], &option))
 				return -EINVAL;
-			config.mode = option & S_IALLUGO;
+			opts->mode = option & S_IALLUGO;
 			break;
 		default:
 			printk(KERN_ERR "devpts: called with bogus options\n");
@@ -119,11 +122,14 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 
 static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
-	if (config.setuid)
-		seq_printf(seq, ",uid=%u", config.uid);
-	if (config.setgid)
-		seq_printf(seq, ",gid=%u", config.gid);
-	seq_printf(seq, ",mode=%03o", config.mode);
+	struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+
+	if (opts->setuid)
+		seq_printf(seq, ",uid=%u", opts->uid);
+	if (opts->setgid)
+		seq_printf(seq, ",gid=%u", opts->gid);
+	seq_printf(seq, ",mode=%03o", opts->mode);
 
 	return 0;
 }
@@ -143,6 +149,7 @@ static void *new_pts_fs_info(void)
 		return NULL;
 
 	ida_init(&fsi->allocated_ptys);
+	fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
 
 	return fsi;
 }
@@ -262,6 +269,8 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
 	struct inode *inode = new_inode(sb);
 	struct dentry *root = sb->s_root;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
 	char s[12];
 
 	/* We're supposed to be given the slave end of a pty */
@@ -275,7 +284,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	inode->i_uid = config.setuid ? config.uid : current_fsuid();
 	inode->i_gid = config.setgid ? config.gid : current_fsgid();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	init_special_inode(inode, S_IFCHR|config.mode, device);
+	init_special_inode(inode, S_IFCHR|opts->mode, device);
 	inode->i_private = tty;
 	tty->driver_data = inode;
 
-- 
cgit v1.2.3


From 53af8ee4094d80ddaac7efefb572b1c22ae49367 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:41:47 +0000
Subject: Extract option parsing to new function

Move code to parse mount options into a separate function so it can
(later) be shared between mount and remount operations.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index b793e6e3c21..00530e82673 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -72,11 +72,9 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 	return devpts_mnt->mnt_sb;
 }
 
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 {
 	char *p;
-	struct pts_fs_info *fsi = DEVPTS_SB(sb);
-	struct pts_mount_opts *opts = &fsi->mount_opts;
 
 	opts->setuid  = 0;
 	opts->setgid  = 0;
@@ -120,6 +118,14 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
+static int devpts_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+
+	return parse_mount_options(data, opts);
+}
+
 static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
 	struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
-- 
cgit v1.2.3


From 1f8f1e296583f9f832c2fe7b5a219675b74bf43e Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:42:02 +0000
Subject: Define mknod_ptmx()

/dev/ptmx is closely tied to the devpts filesystem. An open of /dev/ptmx,
allocates the next pty index and the associated device shows up in the
devpts fs as /dev/pts/n.

With multiple instances of devpts filesystem, during an open of /dev/ptmx
we would be unable to determine which instance of the devpts is being
accessed.

So we move the 'ptmx' node into /dev/pts and use the inode of the 'ptmx'
node to identify the superblock and hence the devpts instance.  This patch
adds ability for the kernel to internally create the [ptmx, c, 5:2] device
when mounting devpts filesystem.  Since the ptmx node in devpts is new and
may surprise some userspace scripts, the default permissions for the new
node are 0000.  These permissions can be changed either using chmod or by
remounting with the new '-o ptmxmode=0666' mount option.

Changelog[v5]:
	- [Serge Hallyn bugfix]: Letting new_inode() assign inode number to
	  ptmx can collide with hand-assigning inode numbers to ptys. So,
	  hand-assign specific inode number to ptmx node also.
	- [Serge Hallyn]: Maybe safer to grab root dentry mutex while creating
	  ptmx node
	- [Bugfix with Serge Hallyn] Replace lookup_one_len() in mknod_ptmx()
	  with d_alloc_name() (lookup during ->get_sb() locks up system). To
	  simplify patchset, fold the ptmx_dentry patch into this.

Changelog[v4]:
	- Change default permissions of pts/ptmx node to 0000.
	- Move code for ptmxmode under #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES.

Changelog[v3]:
	- Rename ptmx_mode to ptmxmode (for consistency with 'newinstance')

Changelog[v2]:
	- [H. Peter Anvin] Remove mknod() system call support and create the
	  ptmx node internally.

Changelog[v1]:
	- Earlier version of this patch enabled creating /dev/pts/tty as
	  well. As pointed out by Al Viro and H. Peter Anvin, that is not
	  really necessary.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 110 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 00530e82673..8ee9dc2f9e4 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,6 +27,13 @@
 #define DEVPTS_SUPER_MAGIC 0x1cd1
 
 #define DEVPTS_DEFAULT_MODE 0600
+/*
+ * ptmx is a new node in /dev/pts and will be unused in legacy (single-
+ * instance) mode. To prevent surprises in user space, set permissions of
+ * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
+ * permissions.
+ */
+#define DEVPTS_DEFAULT_PTMX_MODE 0000
 #define PTMX_MINOR	2
 
 extern int pty_limit;			/* Config limit on Unix98 ptys */
@@ -40,10 +47,11 @@ struct pts_mount_opts {
 	uid_t   uid;
 	gid_t   gid;
 	umode_t mode;
+	umode_t ptmxmode;
 };
 
 enum {
-	Opt_uid, Opt_gid, Opt_mode,
+	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode,
 	Opt_err
 };
 
@@ -51,12 +59,16 @@ static const match_table_t tokens = {
 	{Opt_uid, "uid=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_mode, "mode=%o"},
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	{Opt_ptmxmode, "ptmxmode=%o"},
+#endif
 	{Opt_err, NULL}
 };
 
 struct pts_fs_info {
 	struct ida allocated_ptys;
 	struct pts_mount_opts mount_opts;
+	struct dentry *ptmx_dentry;
 };
 
 static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
@@ -81,6 +93,7 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 	opts->uid     = 0;
 	opts->gid     = 0;
 	opts->mode    = DEVPTS_DEFAULT_MODE;
+	opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
 
 	while ((p = strsep(&data, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
@@ -109,6 +122,13 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 				return -EINVAL;
 			opts->mode = option & S_IALLUGO;
 			break;
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+		case Opt_ptmxmode:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->ptmxmode = option & S_IALLUGO;
+			break;
+#endif
 		default:
 			printk(KERN_ERR "devpts: called with bogus options\n");
 			return -EINVAL;
@@ -118,12 +138,93 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 	return 0;
 }
 
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int mknod_ptmx(struct super_block *sb)
+{
+	int mode;
+	int rc = -ENOMEM;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct dentry *root = sb->s_root;
+	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_mount_opts *opts = &fsi->mount_opts;
+
+	mutex_lock(&root->d_inode->i_mutex);
+
+	/* If we have already created ptmx node, return */
+	if (fsi->ptmx_dentry) {
+		rc = 0;
+		goto out;
+	}
+
+	dentry = d_alloc_name(root, "ptmx");
+	if (!dentry) {
+		printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
+		goto out;
+	}
+
+	/*
+	 * Create a new 'ptmx' node in this mount of devpts.
+	 */
+	inode = new_inode(sb);
+	if (!inode) {
+		printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
+		dput(dentry);
+		goto out;
+	}
+
+	inode->i_ino = 2;
+	inode->i_uid = inode->i_gid = 0;
+	inode->i_blocks = 0;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+	mode = S_IFCHR|opts->ptmxmode;
+	init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+
+	d_add(dentry, inode);
+
+	fsi->ptmx_dentry = dentry;
+	rc = 0;
+
+	printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
+			inode->i_ino);
+out:
+	mutex_unlock(&root->d_inode->i_mutex);
+	return rc;
+}
+
+static void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+	struct inode *inode;
+	if (fsi->ptmx_dentry) {
+		inode = fsi->ptmx_dentry->d_inode;
+		inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
+	}
+}
+#else
+static inline void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+       return;
+}
+#endif
+
 static int devpts_remount(struct super_block *sb, int *flags, char *data)
 {
+	int err;
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
-	return parse_mount_options(data, opts);
+	err = parse_mount_options(data, opts);
+
+	/*
+	 * parse_mount_options() restores options to default values
+	 * before parsing and may have changed ptmxmode. So, update the
+	 * mode in the inode too. Bogus options don't fail the remount,
+	 * so do this even on error return.
+	 */
+	update_ptmx_mode(fsi);
+
+	return err;
 }
 
 static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
@@ -136,6 +237,9 @@ static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (opts->setgid)
 		seq_printf(seq, ",gid=%u", opts->gid);
 	seq_printf(seq, ",mode=%03o", opts->mode);
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+	seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
+#endif
 
 	return 0;
 }
@@ -156,6 +260,7 @@ static void *new_pts_fs_info(void)
 
 	ida_init(&fsi->allocated_ptys);
 	fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+	fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
 
 	return fsi;
 }
@@ -163,7 +268,7 @@ static void *new_pts_fs_info(void)
 static int
 devpts_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct inode * inode;
+	struct inode *inode;
 
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
@@ -190,7 +295,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	s->s_root = d_alloc_root(inode);
 	if (s->s_root)
 		return 0;
-	
+
 	printk("devpts: get root dentry failed\n");
 	iput(inode);
 
@@ -211,7 +316,7 @@ static void devpts_kill_sb(struct super_block *sb)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 
 	kfree(fsi);
-	kill_anon_super(sb);
+	kill_litter_super(sb);
 }
 
 static struct file_system_type devpts_fs_type = {
-- 
cgit v1.2.3


From d4076ac55bf8755ce6c5706478631c1726cf0179 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:42:19 +0000
Subject: Define get_init_pts_sb()

See comments in the function header for details. The new interface will
be used in a follow-on patch.

Changelog [v2]:
	[Dave Hansen] Replace get_sb_ref() in fs/super.c with get_init_pts_sb()
	and make the new interface private to devpts

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 8ee9dc2f9e4..2d0eb2cf99e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -305,10 +305,63 @@ fail:
 	return -ENOMEM;
 }
 
+static int compare_init_pts_sb(struct super_block *s, void *p)
+{
+	if (devpts_mnt)
+		return devpts_mnt->mnt_sb == s;
+
+	return 0;
+}
+
+/*
+ * get_init_pts_sb()
+ *
+ *     This interface is needed to support multiple namespace semantics in
+ *     devpts while preserving backward compatibility of the current 'single-
+ *     namespace' semantics. i.e all mounts of devpts without the 'newinstance'
+ *     mount option should bind to the initial kernel mount, like
+ *     get_sb_single().
+ *
+ *     Mounts with 'newinstance' option create a new private namespace.
+ *
+ *     But for single-mount semantics, devpts cannot use get_sb_single(),
+ *     because get_sb_single()/sget() find and use the super-block from
+ *     the most recent mount of devpts. But that recent mount may be a
+ *     'newinstance' mount and get_sb_single() would pick the newinstance
+ *     super-block instead of the initial super-block.
+ *
+ *     This interface is identical to get_sb_single() except that it
+ *     consistently selects the 'single-namespace' superblock even in the
+ *     presence of the private namespace (i.e 'newinstance') super-blocks.
+ */
+static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+        struct super_block *s;
+        int error;
+
+        s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
+        if (IS_ERR(s))
+                return PTR_ERR(s);
+
+        if (!s->s_root) {
+                s->s_flags = flags;
+                error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+                if (error) {
+                        up_write(&s->s_umount);
+                        deactivate_super(s);
+                        return error;
+                }
+                s->s_flags |= MS_ACTIVE;
+        }
+        do_remount_sb(s, flags, data, 0);
+        return simple_set_mnt(mnt, s);
+}
+
 static int devpts_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+	return get_init_pts_sb(fs_type, flags, data, mnt);
 }
 
 static void devpts_kill_sb(struct super_block *sb)
-- 
cgit v1.2.3


From 2a1b2dc0c83bbfc24d72cafd5e69810a149b44e4 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Fri, 2 Jan 2009 13:42:27 +0000
Subject: Enable multiple instances of devpts

To support containers, allow multiple instances of devpts filesystem, such
that indices of ptys allocated in one instance are independent of ptys
allocated in other instances of devpts.

But to preserve backward compatibility, enable this support for multiple
instances only if:

	- CONFIG_DEVPTS_MULTIPLE_INSTANCES is set to Y, and
	- '-o newinstance' mount option is specified while mounting devpts

To use multi-instance mount, a container startup script could:

	$ ns_exec -cm /bin/bash
	$ umount /dev/pts
	$ mount -t devpts -o newinstance lxcpts /dev/pts
	$ mount -o bind /dev/pts/ptmx /dev/ptmx
	$ /usr/sbin/sshd -p 1234

where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs
/bin/bash in the child process. A pty created by the sshd is not visible in
the original mount of /dev/pts.

USER-SPACE-IMPACT:
	- See Documentation/fs/devpts.txt (included in next patch) for user-
	  space impact in multi-instance and mixed-mode operation.
TODO:
	- Update mount(8), pts(4) man pages. Highlight impact of not
	  redirecting /dev/ptmx to /dev/pts/ptmx after a multi-instance mount.

Changelog[v6]:
	- [Dave Hansen] Use new get_init_pts_sb() interface
	- [Serge Hallyn] Don't bother displaying 'newinstance' in show_options
	- [Serge Hallyn] Use macros (PARSE_REMOUNT/PARSE_MOUNT) instead of 0/1.
	- [Serge Hallyn] Check error return from get_sb_single() (now
	  get_init_pts_sb())
	- devpts_pty_kill(): don't dput error dentries

Changelog[v5]:
	- Move get_sb_ref() definition to earlier patch
	- Move usage info to Documentation/filesystems/devpts.txt (next patch)
	- Make ptmx node even in init_pts_ns, now that default mode is 0000
	  (defined in earlier patch, enabled here).
	- Cache ptmx dentry and use to update mode during remount
	  (defined in earlier patch, enabled here).
	- Bugfix: explicitly ignore newinstance on remount (if newinstance was
	  specified on remount of initial mount, it would be ignored but
	  /proc/mounts would imply that the option was set)

Changelog[v4]:

	- Update patch description to address H. Peter Anvin's comments
	- Consolidate multi-instance mode code under new config token,
	  CONFIG_DEVPTS_MULTIPLE_INSTANCES.
	- Move usage-details from patch description to
	  Documentation/fs/devpts.txt

Changelog[v3]:
	- Rename new mount option to 'newinstance'
	- Create ptmx nodes only in 'newinstance' mounts
	- Bugfix: parse_mount_options() modifies @data but since we need to
	  parse the @data twice (once in devpts_get_sb() and once during
	  do_remount_sb()), parse a local copy of @data in devpts_get_sb().
	  (restructured code in devpts_get_sb() to fix this)

Changelog[v2]:
	- Support both single-mount and multiple-mount semantics and
	  provide '-onewmnt' option to select the semantics.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 163 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 2d0eb2cf99e..b4a89fa2167 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -48,10 +48,11 @@ struct pts_mount_opts {
 	gid_t   gid;
 	umode_t mode;
 	umode_t ptmxmode;
+	int newinstance;
 };
 
 enum {
-	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode,
+	Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
 	Opt_err
 };
 
@@ -61,6 +62,7 @@ static const match_table_t tokens = {
 	{Opt_mode, "mode=%o"},
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
 	{Opt_ptmxmode, "ptmxmode=%o"},
+	{Opt_newinstance, "newinstance"},
 #endif
 	{Opt_err, NULL}
 };
@@ -78,13 +80,17 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
 
 static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 {
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
 	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
 		return inode->i_sb;
-
+#endif
 	return devpts_mnt->mnt_sb;
 }
 
-static int parse_mount_options(char *data, struct pts_mount_opts *opts)
+#define PARSE_MOUNT	0
+#define PARSE_REMOUNT	1
+
+static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
 {
 	char *p;
 
@@ -95,6 +101,10 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 	opts->mode    = DEVPTS_DEFAULT_MODE;
 	opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
 
+	/* newinstance makes sense only on initial mount */
+	if (op == PARSE_MOUNT)
+		opts->newinstance = 0;
+
 	while ((p = strsep(&data, ",")) != NULL) {
 		substring_t args[MAX_OPT_ARGS];
 		int token;
@@ -128,6 +138,11 @@ static int parse_mount_options(char *data, struct pts_mount_opts *opts)
 				return -EINVAL;
 			opts->ptmxmode = option & S_IALLUGO;
 			break;
+		case Opt_newinstance:
+			/* newinstance makes sense only on initial mount */
+			if (op == PARSE_MOUNT)
+				opts->newinstance = 1;
+			break;
 #endif
 		default:
 			printk(KERN_ERR "devpts: called with bogus options\n");
@@ -214,7 +229,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
 	struct pts_fs_info *fsi = DEVPTS_SB(sb);
 	struct pts_mount_opts *opts = &fsi->mount_opts;
 
-	err = parse_mount_options(data, opts);
+	err = parse_mount_options(data, PARSE_REMOUNT, opts);
 
 	/*
 	 * parse_mount_options() restores options to default values
@@ -309,8 +324,100 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 {
 	if (devpts_mnt)
 		return devpts_mnt->mnt_sb == s;
+	return 0;
+}
+
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+/*
+ * Safely parse the mount options in @data and update @opts.
+ *
+ * devpts ends up parsing options two times during mount, due to the
+ * two modes of operation it supports. The first parse occurs in
+ * devpts_get_sb() when determining the mode (single-instance or
+ * multi-instance mode). The second parse happens in devpts_remount()
+ * or new_pts_mount() depending on the mode.
+ *
+ * Parsing of options modifies the @data making subsequent parsing
+ * incorrect. So make a local copy of @data and parse it.
+ *
+ * Return: 0 On success, -errno on error
+ */
+static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
+{
+	int rc;
+	void *datacp;
+
+	if (!data)
+		return 0;
+
+	/* Use kstrdup() ?  */
+	datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!datacp)
+		return -ENOMEM;
+
+	memcpy(datacp, data, PAGE_SIZE);
+	rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
+	kfree(datacp);
+
+	return rc;
+}
+
+/*
+ * Mount a new (private) instance of devpts.  PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
+ */
+static int new_pts_mount(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+	int err;
+	struct pts_fs_info *fsi;
+	struct pts_mount_opts *opts;
+
+	printk(KERN_NOTICE "devpts: newinstance mount\n");
+
+	err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
+	if (err)
+		return err;
+
+	fsi = DEVPTS_SB(mnt->mnt_sb);
+	opts = &fsi->mount_opts;
+
+	err = parse_mount_options(data, PARSE_MOUNT, opts);
+	if (err)
+		goto fail;
+
+	err = mknod_ptmx(mnt->mnt_sb);
+	if (err)
+		goto fail;
 
 	return 0;
+
+fail:
+	dput(mnt->mnt_sb->s_root);
+	deactivate_super(mnt->mnt_sb);
+	return err;
+}
+
+/*
+ * Check if 'newinstance' mount option was specified in @data.
+ *
+ * Return: -errno  	on error (eg: invalid mount options specified)
+ * 	 : 1 		if 'newinstance' mount option was specified
+ * 	 : 0 		if 'newinstance' mount option was NOT specified
+ */
+static int is_new_instance_mount(void *data)
+{
+	int rc;
+	struct pts_mount_opts opts;
+
+	if (!data)
+		return 0;
+
+	rc = safe_parse_mount_options(data, &opts);
+	if (!rc)
+		rc = opts.newinstance;
+
+	return rc;
 }
 
 /*
@@ -358,11 +465,54 @@ static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
         return simple_set_mnt(mnt, s);
 }
 
+/*
+ * Mount or remount the initial kernel mount of devpts. This type of
+ * mount maintains the legacy, single-instance semantics, while the
+ * kernel still allows multiple-instances.
+ */
+static int init_pts_mount(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+	int err;
+
+	err = get_init_pts_sb(fs_type, flags, data, mnt);
+	if (err)
+		 return err;
+
+	err = mknod_ptmx(mnt->mnt_sb);
+	if (err) {
+		dput(mnt->mnt_sb->s_root);
+		deactivate_super(mnt->mnt_sb);
+	}
+
+	return err;
+}
+
 static int devpts_get_sb(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_init_pts_sb(fs_type, flags, data, mnt);
+	int new;
+
+	new = is_new_instance_mount(data);
+	if (new < 0)
+		return new;
+
+	if (new)
+		return new_pts_mount(fs_type, flags, data, mnt);
+
+	return init_pts_mount(fs_type, flags, data, mnt);
 }
+#else
+/*
+ * This supports only the legacy single-instance semantics (no
+ * multiple-instance semantics)
+ */
+static int devpts_get_sb(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
+}
+#endif
 
 static void devpts_kill_sb(struct super_block *sb)
 {
@@ -488,12 +638,18 @@ void devpts_pty_kill(struct tty_struct *tty)
 	mutex_lock(&root->d_inode->i_mutex);
 
 	dentry = d_find_alias(inode);
-	if (dentry && !IS_ERR(dentry)) {
+	if (IS_ERR(dentry))
+		goto out;
+
+	if (dentry) {
 		inode->i_nlink--;
 		d_delete(dentry);
-		dput(dentry);
+		dput(dentry);	// d_alloc_name() in devpts_pty_new()
 	}
 
+	dput(dentry);		// d_find_alias above
+
+out:
 	mutex_unlock(&root->d_inode->i_mutex);
 }
 
-- 
cgit v1.2.3


From 835aa440f1c3fe16a622015bc1b52dffedf6d90e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Fri, 2 Jan 2009 13:42:48 +0000
Subject: devpts: Coding style clean up

Just nail the oddments now while this code is being touched

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 53 ++++++++++++++++++++++++++---------------------------
 1 file changed, 26 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index b4a89fa2167..b02c24313d5 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -311,7 +311,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 	if (s->s_root)
 		return 0;
 
-	printk("devpts: get root dentry failed\n");
+	printk(KERN_ERR "devpts: get root dentry failed\n");
 	iput(inode);
 
 free_fsi:
@@ -444,25 +444,25 @@ static int is_new_instance_mount(void *data)
 static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
 		void *data, struct vfsmount *mnt)
 {
-        struct super_block *s;
-        int error;
-
-        s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
-        if (IS_ERR(s))
-                return PTR_ERR(s);
-
-        if (!s->s_root) {
-                s->s_flags = flags;
-                error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-                if (error) {
-                        up_write(&s->s_umount);
-                        deactivate_super(s);
-                        return error;
-                }
-                s->s_flags |= MS_ACTIVE;
-        }
-        do_remount_sb(s, flags, data, 0);
-        return simple_set_mnt(mnt, s);
+	struct super_block *s;
+	int error;
+
+	s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
+	if (IS_ERR(s))
+		return PTR_ERR(s);
+
+	if (!s->s_root) {
+		s->s_flags = flags;
+		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			up_write(&s->s_umount);
+			deactivate_super(s);
+			return error;
+		}
+		s->s_flags |= MS_ACTIVE;
+	}
+	do_remount_sb(s, flags, data, 0);
+	return simple_set_mnt(mnt, s);
 }
 
 /*
@@ -477,7 +477,7 @@ static int init_pts_mount(struct file_system_type *fs_type, int flags,
 
 	err = get_init_pts_sb(fs_type, flags, data, mnt);
 	if (err)
-		 return err;
+		return err;
 
 	err = mknod_ptmx(mnt->mnt_sb);
 	if (err) {
@@ -542,9 +542,8 @@ int devpts_new_index(struct inode *ptmx_inode)
 	int ida_ret;
 
 retry:
-	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) {
+	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
 		return -ENOMEM;
-	}
 
 	mutex_lock(&allocated_ptys_lock);
 	ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
@@ -576,7 +575,8 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
 
 int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 {
-	int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
+	/* tty layer puts index from devpts_new_index() in here */
+	int number = tty->index;
 	struct tty_driver *driver = tty->driver;
 	dev_t device = MKDEV(driver->major, driver->minor_start+number);
 	struct dentry *dentry;
@@ -644,11 +644,10 @@ void devpts_pty_kill(struct tty_struct *tty)
 	if (dentry) {
 		inode->i_nlink--;
 		d_delete(dentry);
-		dput(dentry);	// d_alloc_name() in devpts_pty_new()
+		dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 	}
 
-	dput(dentry);		// d_find_alias above
-
+	dput(dentry);		/* d_find_alias above */
 out:
 	mutex_unlock(&root->d_inode->i_mutex);
 }
-- 
cgit v1.2.3


From 8c056e5b148498192832678cf2957760945e8c71 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 2 Jan 2009 13:44:12 +0000
Subject: devpts: fix unused function warning

fs/devpts/inode.c:324: warning: 'compare_init_pts_sb' defined but not used

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index b02c24313d5..3f309f181de 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -320,6 +320,7 @@ fail:
 	return -ENOMEM;
 }
 
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
 static int compare_init_pts_sb(struct super_block *s, void *p)
 {
 	if (devpts_mnt)
@@ -327,7 +328,6 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 	return 0;
 }
 
-#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
 /*
  * Safely parse the mount options in @data and update @opts.
  *
-- 
cgit v1.2.3


From d0eafc7db8f170d534a16b5f04617e98ae2025de Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 2 Jan 2009 13:44:49 +0000
Subject: CRED: Wrap task credential accesses in the devpts filesystem

Wrap access to task credentials so that they can be separated more easily from
the task_struct during the introduction of COW creds.

Change most current->(|e|s|fs)[ug]id to current_(|e|s|fs)[ug]id().

Change some task->e?[ug]id to task_e?[ug]id().  In some places it makes more
sense to use RCU directly rather than a convenient wrapper; these will be
addressed by later patches.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 3f309f181de..fff96e152c0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -594,9 +594,9 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
 	if (!inode)
 		return -ENOMEM;
 
-	inode->i_ino = number+2;
-	inode->i_uid = config.setuid ? config.uid : current_fsuid();
-	inode->i_gid = config.setgid ? config.gid : current_fsgid();
+	inode->i_ino = number + 3;
+	inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
+	inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	init_special_inode(inode, S_IFCHR|opts->mode, device);
 	inode->i_private = tty;
-- 
cgit v1.2.3


From 5d1b1b3f492f8696ea18950a454a141381b0f926 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 22:19:52 -0500
Subject: ext4: fix BUG when calling ext4_error with locked block group

The mballoc code likes to call ext4_error while it is holding locked
block groups.  This can cause a scheduling in atomic context BUG.  We
can't just unlock the block group and relock it after/if ext4_error
returns since that might result in race conditions in the case where
the filesystem is set to continue after finding errors.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/mballoc.c | 30 +++++++++++++++---------------
 fs/ext4/mballoc.h | 47 -----------------------------------------------
 fs/ext4/super.c   | 45 +++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 105 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8152b5603f0..f0b1db6acf8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1126,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
+				const char *, const char *, ...)
+	__attribute__ ((format (printf, 4, 5)));
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 					__u32 compat);
@@ -1249,6 +1252,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 	return ;
 }
 
+struct ext4_group_info {
+	unsigned long   bb_state;
+	struct rb_root  bb_free_root;
+	unsigned short  bb_first_free;
+	unsigned short  bb_free;
+	unsigned short  bb_fragments;
+	struct          list_head bb_prealloc_list;
+#ifdef DOUBLE_CHECK
+	void            *bb_bitmap;
+#endif
+	struct rw_semaphore alloc_sem;
+	unsigned short  bb_counters[];
+};
+
+#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
+#define EXT4_GROUP_INFO_LOCKED_BIT	1
+
+#define EXT4_MB_GRP_NEED_INIT(grp)	\
+	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline void ext4_unlock_group(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+}
+
+static inline int ext4_is_group_locked(struct super_block *sb,
+					ext4_group_t group)
+{
+	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+
+	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
+						&(grinfo->bb_state));
+}
+
 /*
  * Inodes and files operations
  */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0bf4c4c06b1..cda69632eea 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -457,8 +457,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 			blocknr += first + i;
 			blocknr +=
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
-			ext4_error(sb, __func__, "double-free of inode"
+			ext4_grp_locked_error(sb, e4b->bd_group,
+				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr,
 				   first + i, e4b->bd_group);
@@ -702,7 +702,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 	grp->bb_fragments = fragments;
 
 	if (free != grp->bb_free) {
-		ext4_error(sb, __func__,
+		ext4_grp_locked_error(sb, group,  __func__,
 			"EXT4-fs: group %u: %u blocks in bitmap, %u in gd",
 			group, free, grp->bb_free);
 		/*
@@ -1095,8 +1095,6 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 
 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			  int first, int count)
-__releases(bitlock)
-__acquires(bitlock)
 {
 	int block = 0;
 	int max = 0;
@@ -1135,12 +1133,11 @@ __acquires(bitlock)
 			blocknr += block;
 			blocknr +=
 			    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-			ext4_unlock_group(sb, e4b->bd_group);
-			ext4_error(sb, __func__, "double-free of inode"
+			ext4_grp_locked_error(sb, e4b->bd_group,
+				   __func__, "double-free of inode"
 				   " %lu's block %llu(bit %u in group %u)",
 				   inode ? inode->i_ino : 0, blocknr, block,
 				   e4b->bd_group);
-			ext4_lock_group(sb, e4b->bd_group);
 		}
 		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
 		e4b->bd_info->bb_counters[order]++;
@@ -1623,7 +1620,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 			 * free blocks even though group info says we
 			 * we have free blocks
 			 */
-			ext4_error(sb, __func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					__func__, "%d free blocks as per "
 					"group info. But bitmap says 0",
 					free);
 			break;
@@ -1632,7 +1630,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
 		mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
 		BUG_ON(ex.fe_len <= 0);
 		if (free < ex.fe_len) {
-			ext4_error(sb, __func__, "%d free blocks as per "
+			ext4_grp_locked_error(sb, e4b->bd_group,
+					__func__, "%d free blocks as per "
 					"group info. But got %d blocks",
 					free, ex.fe_len);
 			/*
@@ -3822,8 +3821,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			pa, (unsigned long) pa->pa_lstart,
 			(unsigned long) pa->pa_pstart,
 			(unsigned long) pa->pa_len);
-		ext4_error(sb, __func__, "free %u, pa_free %u",
-						free, pa->pa_free);
+		ext4_grp_locked_error(sb, group,
+					__func__, "free %u, pa_free %u",
+					free, pa->pa_free);
 		/*
 		 * pa is already deleted so we use the value obtained
 		 * from the bitmap and continue.
@@ -4633,9 +4633,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		else if (block >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			ext4_error(sb, __func__,
-			    "Double free of blocks %d (%d %d)",
-			    block, entry->start_blk, entry->count);
+			ext4_grp_locked_error(sb, e4b->bd_group, __func__,
+					"Double free of blocks %d (%d %d)",
+					block, entry->start_blk, entry->count);
 			return 0;
 		}
 	}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 997f78fff12..95d4c7f29a8 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -118,27 +118,6 @@ struct ext4_free_data {
 	tid_t	t_tid;
 };
 
-struct ext4_group_info {
-	unsigned long	bb_state;
-	struct rb_root  bb_free_root;
-	unsigned short	bb_first_free;
-	unsigned short	bb_free;
-	unsigned short	bb_fragments;
-	struct		list_head bb_prealloc_list;
-#ifdef DOUBLE_CHECK
-	void		*bb_bitmap;
-#endif
-	struct rw_semaphore alloc_sem;
-	unsigned short	bb_counters[];
-};
-
-#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
-#define EXT4_GROUP_INFO_LOCKED_BIT	1
-
-#define EXT4_MB_GRP_NEED_INIT(grp)	\
-	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
-
-
 struct ext4_prealloc_space {
 	struct list_head	pa_inode_list;
 	struct list_head	pa_group_list;
@@ -264,32 +243,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
-
-
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline void ext4_unlock_group(struct super_block *sb,
-					ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
-}
-
-static inline int ext4_is_group_locked(struct super_block *sb,
-					ext4_group_t group)
-{
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-						&(grinfo->bb_state));
-}
-
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
 {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9dd1170bfe..2415e2b0970 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -366,6 +366,44 @@ void ext4_warning(struct super_block *sb, const char *function,
 	va_end(args);
 }
 
+void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
+				const char *function, const char *fmt, ...)
+__releases(bitlock)
+__acquires(bitlock)
+{
+	va_list args;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	va_start(args, fmt);
+	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+
+	if (test_opt(sb, ERRORS_CONT)) {
+		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+		ext4_commit_super(sb, es, 0);
+		return;
+	}
+	ext4_unlock_group(sb, grp);
+	ext4_handle_error(sb);
+	/*
+	 * We only get here in the ERRORS_RO case; relocking the group
+	 * may be dangerous, but nothing bad will happen since the
+	 * filesystem will have already been marked read/only and the
+	 * journal has been aborted.  We return 1 as a hint to callers
+	 * who might want to use the return value from
+	 * ext4_grp_locked_error() to distinguish between the
+	 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
+	 * aggressively from the ext4 function in question, with a
+	 * more appropriate error code.
+	 */
+	ext4_lock_group(sb, grp);
+	return;
+}
+
+
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -2868,8 +2906,11 @@ static void ext4_commit_super(struct super_block *sb,
 		set_buffer_uptodate(sbh);
 	}
 	es->s_wtime = cpu_to_le32(get_seconds());
-	ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
-	es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+					&EXT4_SB(sb)->s_freeblocks_counter));
+	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
+					&EXT4_SB(sb)->s_freeinodes_counter));
+
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
-- 
cgit v1.2.3


From e8134b27e351e813414da3b95aa8eac6d3908088 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:38:26 -0500
Subject: ext4: Fix race between read_block_bitmap() and mark_diskspace_used()

We need to make sure we update the block bitmap and clear
EXT4_BG_BLOCK_UNINIT flag with sb_bgl_lock held, since
ext4_read_block_bitmap() looks at EXT4_BG_BLOCK_UNINIT to decide
whether to initialize the block bitmap each time it is called
(introduced by commit c806e68f), and this can race with block
allocations in ext4_mb_mark_diskspace_used().

ext4_read_block_bitmap does:

spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
	ext4_init_block_bitmap(sb, bh, block_group, desc);

Now on the block allocation side we do

mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
			ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
....
spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
	gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);

i.e., on allocation we update the bitmap and only then take the sb_bgl_lock
and clear the EXT4_BG_BLOCK_UNINIT flag. What can happen is that a
parallel ext4_read_block_bitmap() zeroes out the bitmap in between
the above mb_set_bits() and spin_lock(sb_bgl_lock(...)).

The race results in the following user-visible errors:
EXT4-fs error (device sdb1): ext4_mb_release_inode_pa: free 100, pa_free 105
EXT4-fs error (device sdb1): mb_free_blocks: double-free of inode 0's block ..

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cda69632eea..d559a03f3eb 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1070,7 +1070,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		mb_clear_bit_atomic(lock, cur, bm);
+		if (lock)
+			mb_clear_bit_atomic(lock, cur, bm);
+		else
+			mb_clear_bit(cur, bm);
 		cur++;
 	}
 }
@@ -1088,7 +1091,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		mb_set_bit_atomic(lock, cur, bm);
+		if (lock)
+			mb_set_bit_atomic(lock, cur, bm);
+		else
+			mb_set_bit(cur, bm);
 		cur++;
 	}
 }
@@ -3035,10 +3041,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		}
 	}
 #endif
-	mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
-				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
-
 	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+	mb_set_bits(NULL, bitmap_bh->b_data,
+				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		gdp->bg_free_blocks_count =
-- 
cgit v1.2.3


From 560671a0d3c9ad2d647fa6d09375a262e1f19c4f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 22:20:24 -0500
Subject: ext4: Use high 16 bits of the block group descriptor's free counts
 fields

Rename the fields holding the low 16 bits with a _lo suffix and add
helpers to access the values. Also rename bg_itable_unused_hi
to bg_pad as in e2fsprogs.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 13 +++++----
 fs/ext4/ext4.h    | 26 +++++++++++++----
 fs/ext4/ialloc.c  | 83 +++++++++++++++++++++++++++++--------------------------
 fs/ext4/inode.c   |  2 +-
 fs/ext4/mballoc.c | 15 +++++-----
 fs/ext4/resize.c  |  4 +--
 fs/ext4/super.c   | 68 +++++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 149 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 404d81cc915..902bf66c8df 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -102,9 +102,9 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 			ext4_error(sb, __func__,
 				  "Checksum bad for group %u", block_group);
-			gdp->bg_free_blocks_count = 0;
-			gdp->bg_free_inodes_count = 0;
-			gdp->bg_itable_unused = 0;
+			ext4_free_blks_set(sb, gdp, 0);
+			ext4_free_inodes_set(sb, gdp, 0);
+			ext4_itable_unused_set(sb, gdp, 0);
 			memset(bh->b_data, 0xff, sb->s_blocksize);
 			return 0;
 		}
@@ -372,7 +372,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	struct ext4_group_desc *desc;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int err = 0, ret;
+	int err = 0, ret, blk_free_count;
 	ext4_grpblk_t blocks_freed;
 	struct ext4_group_info *grp;
 
@@ -444,7 +444,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 		}
 	}
 	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
+	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
@@ -685,7 +686,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		desc_count += ext4_free_blks_count(sb, gdp);
 	}
 
 	return desc_count;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f0b1db6acf8..ec862f4ca89 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -156,12 +156,12 @@ struct ext4_group_desc
 	__le32	bg_block_bitmap_lo;	/* Blocks bitmap block */
 	__le32	bg_inode_bitmap_lo;	/* Inodes bitmap block */
 	__le32	bg_inode_table_lo;	/* Inodes table block */
-	__le16	bg_free_blocks_count;	/* Free blocks count */
-	__le16	bg_free_inodes_count;	/* Free inodes count */
-	__le16	bg_used_dirs_count;	/* Directories count */
+	__le16	bg_free_blocks_count_lo;/* Free blocks count */
+	__le16	bg_free_inodes_count_lo;/* Free inodes count */
+	__le16	bg_used_dirs_count_lo;	/* Directories count */
 	__le16	bg_flags;		/* EXT4_BG_flags (INODE_UNINIT, etc) */
 	__u32	bg_reserved[2];		/* Likely block/inode bitmap checksum */
-	__le16  bg_itable_unused;	/* Unused inodes count */
+	__le16  bg_itable_unused_lo;	/* Unused inodes count */
 	__le16  bg_checksum;		/* crc16(sb_uuid+group+desc) */
 	__le32	bg_block_bitmap_hi;	/* Blocks bitmap block MSB */
 	__le32	bg_inode_bitmap_hi;	/* Inodes bitmap block MSB */
@@ -169,7 +169,7 @@ struct ext4_group_desc
 	__le16	bg_free_blocks_count_hi;/* Free blocks count MSB */
 	__le16	bg_free_inodes_count_hi;/* Free inodes count MSB */
 	__le16	bg_used_dirs_count_hi;	/* Directories count MSB */
-	__le16	bg_itable_unused_hi;	/* Unused inodes count MSB */
+	__le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
 	__u32	bg_reserved2[3];
 };
 
@@ -1142,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 				      struct ext4_group_desc *bg);
 extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 				     struct ext4_group_desc *bg);
+extern __u32 ext4_free_blks_count(struct super_block *sb,
+				struct ext4_group_desc *bg);
+extern __u32 ext4_free_inodes_count(struct super_block *sb,
+				 struct ext4_group_desc *bg);
+extern __u32 ext4_used_dirs_count(struct super_block *sb,
+				struct ext4_group_desc *bg);
+extern __u32 ext4_itable_unused_count(struct super_block *sb,
+				   struct ext4_group_desc *bg);
 extern void ext4_block_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_bitmap_set(struct super_block *sb,
 				  struct ext4_group_desc *bg, ext4_fsblk_t blk);
 extern void ext4_inode_table_set(struct super_block *sb,
 				 struct ext4_group_desc *bg, ext4_fsblk_t blk);
+extern void ext4_free_blks_set(struct super_block *sb,
+			       struct ext4_group_desc *bg, __u32 count);
+extern void ext4_free_inodes_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_used_dirs_set(struct super_block *sb,
+				struct ext4_group_desc *bg, __u32 count);
+extern void ext4_itable_unused_set(struct super_block *sb,
+				   struct ext4_group_desc *bg, __u32 count);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index cac3617ec78..11c4f6f5bd6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -76,9 +76,9 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
 		ext4_error(sb, __func__, "Checksum bad for group %u",
 			   block_group);
-		gdp->bg_free_blocks_count = 0;
-		gdp->bg_free_inodes_count = 0;
-		gdp->bg_itable_unused = 0;
+		ext4_free_blks_set(sb, gdp, 0);
+		ext4_free_inodes_set(sb, gdp, 0);
+		ext4_itable_unused_set(sb, gdp, 0);
 		memset(bh->b_data, 0xff, sb->s_blocksize);
 		return 0;
 	}
@@ -168,7 +168,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int fatal = 0, err;
+	int fatal = 0, err, count;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
@@ -236,9 +236,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 		if (gdp) {
 			spin_lock(sb_bgl_lock(sbi, block_group));
-			le16_add_cpu(&gdp->bg_free_inodes_count, 1);
-			if (is_directory)
-				le16_add_cpu(&gdp->bg_used_dirs_count, -1);
+			count = ext4_free_inodes_count(sb, gdp) + 1;
+			ext4_free_inodes_set(sb, gdp, count);
+			if (is_directory) {
+				count = ext4_used_dirs_count(sb, gdp) - 1;
+				ext4_used_dirs_set(sb, gdp, count);
+			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
 			spin_unlock(sb_bgl_lock(sbi, block_group));
@@ -291,13 +294,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
 
 	for (group = 0; group < ngroups; group++) {
 		desc = ext4_get_group_desc(sb, group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
+		if (!desc || !ext4_free_inodes_count(sb, desc))
 			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+		if (ext4_free_inodes_count(sb, desc) < avefreei)
 			continue;
 		if (!best_desc ||
-		    (le16_to_cpu(desc->bg_free_blocks_count) >
-		     le16_to_cpu(best_desc->bg_free_blocks_count))) {
+		    (ext4_free_blks_count(sb, desc) >
+		     ext4_free_blks_count(sb, best_desc))) {
 			*best_group = group;
 			best_desc = desc;
 			ret = 0;
@@ -369,7 +372,7 @@ found_flexbg:
 	for (i = best_flex * flex_size; i < ngroups &&
 		     i < (best_flex + 1) * flex_size; i++) {
 		desc = ext4_get_group_desc(sb, i, &bh);
-		if (le16_to_cpu(desc->bg_free_inodes_count)) {
+		if (ext4_free_inodes_count(sb, desc)) {
 			*best_group = i;
 			goto out;
 		}
@@ -443,17 +446,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		for (i = 0; i < ngroups; i++) {
 			grp = (parent_group + i) % ngroups;
 			desc = ext4_get_group_desc(sb, grp, NULL);
-			if (!desc || !desc->bg_free_inodes_count)
+			if (!desc || !ext4_free_inodes_count(sb, desc))
 				continue;
-			if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir)
+			if (ext4_used_dirs_count(sb, desc) >= best_ndir)
 				continue;
-			if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
+			if (ext4_free_inodes_count(sb, desc) < avefreei)
 				continue;
-			if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb)
+			if (ext4_free_blks_count(sb, desc) < avefreeb)
 				continue;
 			*group = grp;
 			ret = 0;
-			best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
+			best_ndir = ext4_used_dirs_count(sb, desc);
 		}
 		if (ret == 0)
 			return ret;
@@ -479,13 +482,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	for (i = 0; i < ngroups; i++) {
 		*group = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (!desc || !desc->bg_free_inodes_count)
+		if (!desc || !ext4_free_inodes_count(sb, desc))
 			continue;
-		if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
+		if (ext4_used_dirs_count(sb, desc) >= max_dirs)
 			continue;
-		if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes)
+		if (ext4_free_inodes_count(sb, desc) < min_inodes)
 			continue;
-		if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks)
+		if (ext4_free_blks_count(sb, desc) < min_blocks)
 			continue;
 		return 0;
 	}
@@ -494,8 +497,8 @@ fallback:
 	for (i = 0; i < ngroups; i++) {
 		*group = (parent_group + i) % ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && desc->bg_free_inodes_count &&
-			le16_to_cpu(desc->bg_free_inodes_count) >= avefreei)
+		if (desc && ext4_free_inodes_count(sb, desc) &&
+			ext4_free_inodes_count(sb, desc) >= avefreei)
 			return 0;
 	}
 
@@ -524,8 +527,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 	 */
 	*group = parent_group;
 	desc = ext4_get_group_desc(sb, *group, NULL);
-	if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-			le16_to_cpu(desc->bg_free_blocks_count))
+	if (desc && ext4_free_inodes_count(sb, desc) &&
+			ext4_free_blks_count(sb, desc))
 		return 0;
 
 	/*
@@ -548,8 +551,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		if (*group >= ngroups)
 			*group -= ngroups;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count) &&
-				le16_to_cpu(desc->bg_free_blocks_count))
+		if (desc && ext4_free_inodes_count(sb, desc) &&
+				ext4_free_blks_count(sb, desc))
 			return 0;
 	}
 
@@ -562,7 +565,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 		if (++*group >= ngroups)
 			*group = 0;
 		desc = ext4_get_group_desc(sb, *group, NULL);
-		if (desc && le16_to_cpu(desc->bg_free_inodes_count))
+		if (desc && ext4_free_inodes_count(sb, desc))
 			return 0;
 	}
 
@@ -591,7 +594,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
-	int ret2, err = 0;
+	int ret2, err = 0, count;
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
@@ -718,7 +721,7 @@ got:
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
-			gdp->bg_free_blocks_count = cpu_to_le16(free);
+			ext4_free_blks_set(sb, gdp, free);
 			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 								gdp);
 		}
@@ -753,7 +756,7 @@ got:
 			free = 0;
 		} else {
 			free = EXT4_INODES_PER_GROUP(sb) -
-				le16_to_cpu(gdp->bg_itable_unused);
+				ext4_itable_unused_count(sb, gdp);
 		}
 
 		/*
@@ -763,13 +766,15 @@ got:
 		 *
 		 */
 		if (ino > free)
-			gdp->bg_itable_unused =
-				cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
 	}
 
-	le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+	count = ext4_free_inodes_count(sb, gdp) - 1;
+	ext4_free_inodes_set(sb, gdp, count);
 	if (S_ISDIR(mode)) {
-		le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+		count = ext4_used_dirs_count(sb, gdp) + 1;
+		ext4_used_dirs_set(sb, gdp, count);
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group));
@@ -987,7 +992,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		desc_count += ext4_free_inodes_count(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_inode_bitmap(sb, i);
 		if (!bitmap_bh)
@@ -995,7 +1000,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 
 		x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
 		printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
-			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+			i, ext4_free_inodes_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
@@ -1009,7 +1014,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+		desc_count += ext4_free_inodes_count(sb, gdp);
 		cond_resched();
 	}
 	return desc_count;
@@ -1026,7 +1031,7 @@ unsigned long ext4_count_dirs(struct super_block * sb)
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		count += le16_to_cpu(gdp->bg_used_dirs_count);
+		count += ext4_used_dirs_count(sb, gdp);
 	}
 	return count;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bcd5ffa76c0..56142accf5c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4014,7 +4014,7 @@ make_io:
 			num = EXT4_INODES_PER_GROUP(sb);
 			if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
 				       EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
-				num -= le16_to_cpu(gdp->bg_itable_unused);
+				num -= ext4_itable_unused_count(sb, gdp);
 			table += num / inodes_per_block;
 			if (end > table)
 				end = table;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d559a03f3eb..3809a9348f2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2515,7 +2515,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			ext4_free_blocks_after_init(sb, group, desc);
 	} else {
 		meta_group_info[i]->bb_free =
-			le16_to_cpu(desc->bg_free_blocks_count);
+			ext4_free_blks_count(sb, desc);
 	}
 
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
@@ -3046,12 +3046,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
-		gdp->bg_free_blocks_count =
-			cpu_to_le16(ext4_free_blocks_after_init(sb,
-						ac->ac_b_ex.fe_group,
-						gdp));
+		ext4_free_blks_set(sb, gdp,
+					ext4_free_blocks_after_init(sb,
+					ac->ac_b_ex.fe_group, gdp));
 	}
-	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+	len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
+	ext4_free_blks_set(sb, gdp, len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
 	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
@@ -4823,7 +4823,8 @@ do_more:
 	}
 
 	spin_lock(sb_bgl_lock(sbi, block_group));
-	le16_add_cpu(&gdp->bg_free_blocks_count, count);
+	ret = ext4_free_blks_count(sb, gdp) + count;
+	ext4_free_blks_set(sb, gdp, ret);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 92034d2c8a7..2d24f423fd8 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -866,8 +866,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
-	gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
-	gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
+	ext4_free_blks_set(sb, gdp, input->free_blocks_count);
+	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
 	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2415e2b0970..a3321bf2231 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,6 +93,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
+__u32 ext4_free_blks_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+}
+
+__u32 ext4_free_inodes_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+}
+
+__u32 ext4_used_dirs_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+}
+
+__u32 ext4_itable_unused_count(struct super_block *sb,
+			      struct ext4_group_desc *bg)
+{
+	return le16_to_cpu(bg->bg_itable_unused_lo) |
+		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
+		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+}
+
 void ext4_block_bitmap_set(struct super_block *sb,
 			   struct ext4_group_desc *bg, ext4_fsblk_t blk)
 {
@@ -117,6 +149,38 @@ void ext4_inode_table_set(struct super_block *sb,
 		bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 }
 
+void ext4_free_blks_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_free_inodes_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_used_dirs_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
+}
+
+void ext4_itable_unused_set(struct super_block *sb,
+			  struct ext4_group_desc *bg, __u32 count)
+{
+	bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
+	if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
+		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+}
+
 /*
  * Wrappers for jbd2_journal_start/end.
  *
@@ -1561,9 +1625,9 @@ static int ext4_fill_flex_info(struct super_block *sb)
 
 		flex_group = ext4_flex_group(sbi, i);
 		sbi->s_flex_groups[flex_group].free_inodes +=
-			le16_to_cpu(gdp->bg_free_inodes_count);
+			ext4_free_inodes_count(sb, gdp);
 		sbi->s_flex_groups[flex_group].free_blocks +=
-			le16_to_cpu(gdp->bg_free_blocks_count);
+			ext4_free_blks_count(sb, gdp);
 	}
 
 	return 1;
-- 
cgit v1.2.3


From 3300beda523136f9f87821e4fba85c5c9e319645 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 3 Jan 2009 22:33:39 -0500
Subject: ext4: code cleanup

Rename some variables.  As part of the cleanup, we also release locks
in the reverse order in which they were acquired.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  2 +-
 fs/ext4/ialloc.c  | 65 ++++++++++++++++++++++++++++++-------------------------
 fs/ext4/mballoc.c |  2 +-
 3 files changed, 37 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 902bf66c8df..1b26b68aa42 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -329,8 +329,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 11c4f6f5bd6..b47427a21f1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -124,8 +124,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
@@ -585,8 +585,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 {
 	struct super_block *sb;
-	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *bh2;
+	struct buffer_head *inode_bitmap_bh = NULL;
+	struct buffer_head *group_desc_bh;
 	ext4_group_t group = 0;
 	unsigned long ino = 0;
 	struct inode *inode;
@@ -634,41 +634,44 @@ got_group:
 	for (i = 0; i < sbi->s_groups_count; i++) {
 		err = -EIO;
 
-		gdp = ext4_get_group_desc(sb, group, &bh2);
+		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
 		if (!gdp)
 			goto fail;
 
-		brelse(bitmap_bh);
-		bitmap_bh = ext4_read_inode_bitmap(sb, group);
-		if (!bitmap_bh)
+		brelse(inode_bitmap_bh);
+		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
+		if (!inode_bitmap_bh)
 			goto fail;
 
 		ino = 0;
 
 repeat_in_this_group:
 		ino = ext4_find_next_zero_bit((unsigned long *)
-				bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino);
+					      inode_bitmap_bh->b_data,
+					      EXT4_INODES_PER_GROUP(sb), ino);
+
 		if (ino < EXT4_INODES_PER_GROUP(sb)) {
 
-			BUFFER_TRACE(bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle, bitmap_bh);
+			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle,
+							    inode_bitmap_bh);
 			if (err)
 				goto fail;
 
 			if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
-						ino, bitmap_bh->b_data)) {
+						ino, inode_bitmap_bh->b_data)) {
 				/* we won it */
-				BUFFER_TRACE(bitmap_bh,
+				BUFFER_TRACE(inode_bitmap_bh,
 					"call ext4_handle_dirty_metadata");
 				err = ext4_handle_dirty_metadata(handle,
-								inode,
-								bitmap_bh);
+								 inode,
+							inode_bitmap_bh);
 				if (err)
 					goto fail;
 				goto got;
 			}
 			/* we lost it */
-			ext4_handle_release_buffer(handle, bitmap_bh);
+			ext4_handle_release_buffer(handle, inode_bitmap_bh);
 
 			if (++ino < EXT4_INODES_PER_GROUP(sb))
 				goto repeat_in_this_group;
@@ -699,19 +702,21 @@ got:
 		goto fail;
 	}
 
-	BUFFER_TRACE(bh2, "get_write_access");
-	err = ext4_journal_get_write_access(handle, bh2);
-	if (err) goto fail;
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err)
+		goto fail;
 
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-		struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
+		struct buffer_head *block_bitmap_bh;
 
-		BUFFER_TRACE(block_bh, "get block bitmap access");
-		err = ext4_journal_get_write_access(handle, block_bh);
+		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
-			brelse(block_bh);
+			brelse(block_bitmap_bh);
 			goto fail;
 		}
 
@@ -719,8 +724,8 @@ got:
 		spin_lock(sb_bgl_lock(sbi, group));
 		/* recheck and clear flag under lock if we still need to */
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			free = ext4_free_blocks_after_init(sb, group, gdp);
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 			ext4_free_blks_set(sb, gdp, free);
 			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 								gdp);
@@ -729,12 +734,12 @@ got:
 
 		/* Don't need to dirty bitmap block if we didn't change it */
 		if (free) {
-			BUFFER_TRACE(block_bh, "dirty block bitmap");
+			BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
 			err = ext4_handle_dirty_metadata(handle,
-							NULL, block_bh);
+							NULL, block_bitmap_bh);
 		}
 
-		brelse(block_bh);
+		brelse(block_bitmap_bh);
 		if (err)
 			goto fail;
 	}
@@ -778,8 +783,8 @@ got:
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group));
-	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, bh2);
+	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err) goto fail;
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
@@ -881,7 +886,7 @@ out:
 	iput(inode);
 	ret = ERR_PTR(err);
 really_out:
-	brelse(bitmap_bh);
+	brelse(inode_bitmap_bh);
 	return ret;
 
 fail_free_drop:
@@ -893,7 +898,7 @@ fail_drop:
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
 	iput(inode);
-	brelse(bitmap_bh);
+	brelse(inode_bitmap_bh);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3809a9348f2..aac33590ac6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -804,8 +804,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
 			set_buffer_uptodate(bh[i]);
-			unlock_buffer(bh[i]);
 			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+			unlock_buffer(bh[i]);
 			continue;
 		}
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
-- 
cgit v1.2.3


From 393418676a7602e1d7d3f6e560159c65c8cbd50e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:38:14 -0500
Subject: ext4: Fix the race between read_inode_bitmap() and ext4_new_inode()

We need to make sure we update the inode bitmap and clear
EXT4_BG_INODE_UNINIT flag with sb_bgl_lock held, since
ext4_read_inode_bitmap() looks at EXT4_BG_INODE_UNINIT to decide
whether to initialize the inode bitmap each time it is called.
(introduced by commit c806e68f.)

ext4_read_inode_bitmap does:

spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
	ext4_init_inode_bitmap(sb, bh, block_group, desc);

and ext4_new_inode does
if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
                   ino, inode_bitmap_bh->b_data))
		   ......
		   ...
spin_lock(sb_bgl_lock(sbi, group));

gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
i.e., on allocation we update the bitmap, then we take the sb_bgl_lock
and clear the EXT4_BG_INODE_UNINIT flag. What can happen is that a
parallel ext4_read_inode_bitmap can zero out the bitmap in between
the above ext4_set_bit_atomic and spin_lock(sb_bgl_lock..)

The race results in the following user-visible errors:
EXT4-fs error (device sdb1): ext4_free_inode: bit already cleared for inode 168449
EXT4-fs warning (device sdb1): ext4_unlink: Deleting nonexistent file ...
EXT4-fs warning (device sdb1): ext4_rmdir: empty directory has too many links ...
# ls -al /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71
ls: /mnt/tmp/f/p369/d3/d6/d39/db2/dee/d10f/d3f/l71: Stale NFS file handle

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ialloc.c | 146 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 60 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b47427a21f1..d4e544f30be 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -572,6 +572,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 	return -1;
 }
 
+/*
+ * claim the inode from the inode bitmap. If the group
+ * is uninit we need to take the groups's sb_bgl_lock
+ * and clear the uninit flag. The inode bitmap update
+ * and group desc uninit flag clear should be done
+ * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * doesn't race with the ext4_claim_inode
+ */
+static int ext4_claim_inode(struct super_block *sb,
+			struct buffer_head *inode_bitmap_bh,
+			unsigned long ino, ext4_group_t group, int mode)
+{
+	int free = 0, retval = 0, count;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+	spin_lock(sb_bgl_lock(sbi, group));
+	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
+		/* not a free inode */
+		retval = 1;
+		goto err_ret;
+	}
+	ino++;
+	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+			ino > EXT4_INODES_PER_GROUP(sb)) {
+		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_error(sb, __func__,
+			   "reserved inode or inode > inodes count - "
+			   "block_group = %u, inode=%lu", group,
+			   ino + group * EXT4_INODES_PER_GROUP(sb));
+		return 1;
+	}
+	/* If we didn't allocate from within the initialized part of the inode
+	 * table then we need to initialize up to this inode. */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			/* When marking the block group with
+			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
+			 * on the value of bg_itable_unused even though
+			 * mke2fs could have initialized the same for us.
+			 * Instead we calculated the value below
+			 */
+
+			free = 0;
+		} else {
+			free = EXT4_INODES_PER_GROUP(sb) -
+				ext4_itable_unused_count(sb, gdp);
+		}
+
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. if it is greater
+		 * we need to  update the bg_itable_unused count
+		 *
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+	}
+	count = ext4_free_inodes_count(sb, gdp) - 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (S_ISDIR(mode)) {
+		count = ext4_used_dirs_count(sb, gdp) + 1;
+		ext4_used_dirs_set(sb, gdp, count);
+	}
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+err_ret:
+	spin_unlock(sb_bgl_lock(sbi, group));
+	return retval;
+}
+
 /*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
@@ -594,7 +667,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
-	int ret2, err = 0, count;
+	int ret2, err = 0;
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
@@ -658,8 +731,13 @@ repeat_in_this_group:
 			if (err)
 				goto fail;
 
-			if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
-						ino, inode_bitmap_bh->b_data)) {
+			BUFFER_TRACE(group_desc_bh, "get_write_access");
+			err = ext4_journal_get_write_access(handle,
+								group_desc_bh);
+			if (err)
+				goto fail;
+			if (!ext4_claim_inode(sb, inode_bitmap_bh,
+						ino, group, mode)) {
 				/* we won it */
 				BUFFER_TRACE(inode_bitmap_bh,
 					"call ext4_handle_dirty_metadata");
@@ -668,10 +746,13 @@ repeat_in_this_group:
 							inode_bitmap_bh);
 				if (err)
 					goto fail;
+				/* zero bit is inode number 1*/
+				ino++;
 				goto got;
 			}
 			/* we lost it */
 			ext4_handle_release_buffer(handle, inode_bitmap_bh);
+			ext4_handle_release_buffer(handle, group_desc_bh);
 
 			if (++ino < EXT4_INODES_PER_GROUP(sb))
 				goto repeat_in_this_group;
@@ -691,22 +772,6 @@ repeat_in_this_group:
 	goto out;
 
 got:
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-	    ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_error(sb, __func__,
-			   "reserved inode or inode > inodes count - "
-			   "block_group = %u, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		err = -EIO;
-		goto fail;
-	}
-
-	BUFFER_TRACE(group_desc_bh, "get_write_access");
-	err = ext4_journal_get_write_access(handle, group_desc_bh);
-	if (err)
-		goto fail;
-
 	/* We may have to initialize the block bitmap if it isn't already */
 	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -743,49 +808,10 @@ got:
 		if (err)
 			goto fail;
 	}
-
-	spin_lock(sb_bgl_lock(sbi, group));
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				ext4_itable_unused_count(sb, gdp);
-		}
-
-		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to  update the bg_itable_unused count
-		 *
-		 */
-		if (ino > free)
-			ext4_itable_unused_set(sb, gdp,
-					(EXT4_INODES_PER_GROUP(sb) - ino));
-	}
-
-	count = ext4_free_inodes_count(sb, gdp) - 1;
-	ext4_free_inodes_set(sb, gdp, count);
-	if (S_ISDIR(mode)) {
-		count = ext4_used_dirs_count(sb, gdp) + 1;
-		ext4_used_dirs_set(sb, gdp, count);
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, group));
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
-	if (err) goto fail;
+	if (err)
+		goto fail;
 
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
-- 
cgit v1.2.3


From 2ccb5fb9f113dae969d1ae9b6c10e80fa34f8cd3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:49:55 -0500
Subject: ext4: Use new buffer_head flag to check uninit group bitmaps
 initialization

For uninit block group, the on-disk bitmap is not initialized. That
implies we cannot depend on the uptodate flag on the bitmap
buffer_head to find bitmap validity.  Use a new buffer_head flag which
would be set after we properly initialize the bitmap.  This also
prevents (re-)initializing the uninit group bitmap every time we call
ext4_read_block_bitmap().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/balloc.c  | 25 +++++++++++++++++++++++--
 fs/ext4/ext4.h    | 18 ++++++++++++++++++
 fs/ext4/ialloc.c  | 24 ++++++++++++++++++++++--
 fs/ext4/mballoc.c | 24 ++++++++++++++++++++++--
 4 files changed, 85 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1b26b68aa42..6bba06b09dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -320,20 +320,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (buffer_uptodate(bh) &&
-	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+
+	if (bitmap_uptodate(bh))
 		return bh;
 
 	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	}
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	/*
+	 * submit the buffer_head for read. We can
+	 * safely mark the bitmap as uptodate now.
+	 * We do it here so the bitmap uptodate bit
+	 * get set with buffer lock held.
+	 */
+	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
 		ext4_error(sb, __func__,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ec862f4ca89..695b45cc34e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -19,6 +19,7 @@
 #include <linux/types.h>
 #include <linux/blkdev.h>
 #include <linux/magic.h>
+#include <linux/jbd2.h>
 #include "ext4_i.h"
 
 /*
@@ -1352,6 +1353,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
+/*
+ * Add new method to test wether block and inode bitmaps are properly
+ * initialized. With uninit_bg reading the block from disk is not enough
+ * to mark the bitmap uptodate. We need to also zero-out the bitmap
+ */
+#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
+
+static inline int bitmap_uptodate(struct buffer_head *bh)
+{
+	return (buffer_uptodate(bh) &&
+			test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
+}
+static inline void set_bitmap_uptodate(struct buffer_head *bh)
+{
+	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
+}
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index d4e544f30be..7b12aedc531 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -115,20 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 			    block_group, bitmap_blk);
 		return NULL;
 	}
-	if (buffer_uptodate(bh) &&
-	    !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+	if (bitmap_uptodate(bh))
 		return bh;
 
 	lock_buffer(bh);
+	if (bitmap_uptodate(bh)) {
+		unlock_buffer(bh);
+		return bh;
+	}
 	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
+		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 		unlock_buffer(bh);
 		return bh;
 	}
 	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	if (buffer_uptodate(bh)) {
+		/*
+		 * if not uninit if bh is uptodate,
+		 * bitmap is also uptodate
+		 */
+		set_bitmap_uptodate(bh);
+		unlock_buffer(bh);
+		return bh;
+	}
+	/*
+	 * submit the buffer_head for read. We can
+	 * safely mark the bitmap as uptodate now.
+	 * We do it here so the bitmap uptodate bit
+	 * get set with buffer lock held.
+	 */
+	set_bitmap_uptodate(bh);
 	if (bh_submit_read(bh) < 0) {
 		put_bh(bh);
 		ext4_error(sb, __func__,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index aac33590ac6..18a52d39d09 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -794,22 +794,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (bh[i] == NULL)
 			goto out;
 
-		if (buffer_uptodate(bh[i]) &&
-		    !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+		if (bitmap_uptodate(bh[i]))
 			continue;
 
 		lock_buffer(bh[i]);
+		if (bitmap_uptodate(bh[i])) {
+			unlock_buffer(bh[i]);
+			continue;
+		}
 		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
+			set_bitmap_uptodate(bh[i]);
 			set_buffer_uptodate(bh[i]);
 			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
 			unlock_buffer(bh[i]);
 			continue;
 		}
 		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		if (buffer_uptodate(bh[i])) {
+			/*
+			 * if not uninit if bh is uptodate,
+			 * bitmap is also uptodate
+			 */
+			set_bitmap_uptodate(bh[i]);
+			unlock_buffer(bh[i]);
+			continue;
+		}
 		get_bh(bh[i]);
+		/*
+		 * submit the buffer_head for read. We can
+		 * safely mark the bitmap as uptodate now.
+		 * We do it here so the bitmap uptodate bit
+		 * get set with buffer lock held.
+		 */
+		set_bitmap_uptodate(bh[i]);
 		bh[i]->b_end_io = end_buffer_read_sync;
 		submit_bh(READ, bh[i]);
 		mb_debug("read bitmap for group %u\n", first_group + i);
-- 
cgit v1.2.3


From 648f5879f5892dddd3ba71cd0d285599f40f2512 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:46:04 -0500
Subject: ext4: mark the blocks/inode bitmap beyond end of group as used

We need to mark the block/inode bitmap beyond the end of the group
with '1'.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ialloc.c  | 2 +-
 fs/ext4/mballoc.c | 4 ++--
 fs/ext4/resize.c  | 6 ++----
 3 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7b12aedc531..e3aa3fa3860 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
 	}
 
 	memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
 
 	return EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 18a52d39d09..7d7f6f91d55 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3038,8 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	    in_range(block + len - 1, ext4_inode_table(sb, gdp),
 		     EXT4_SB(sb)->s_itb_per_group)) {
 		ext4_error(sb, __func__,
-			   "Allocating block in system zone - block = %llu",
-			   block);
+			   "Allocating block %llu in system zone of %d group\n",
+			   block, ac->ac_b_ex.fe_group);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 2d24f423fd8..c328be5d688 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if ((err = extend_or_restart_transaction(handle, 2, bh)))
 		goto exit_bh;
 
-	mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
-			bh->b_data);
+	mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 	brelse(bh);
-
 	/* Mark unused entries in inode bitmap used */
 	ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
 		   input->inode_bitmap, input->inode_bitmap - start);
@@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 		goto exit_journal;
 	}
 
-	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
+	mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 			bh->b_data);
 	ext4_handle_dirty_metadata(handle, NULL, bh);
 exit_bh:
-- 
cgit v1.2.3


From 8556e8f3b6c4c11601ce1e9ea8090a6d8bd5daae Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:46:55 -0500
Subject: ext4: Don't allow new groups to be added during block allocation

After we mark the blocks in the buddy cache as allocated,
we need to ensure that we don't reinit the buddy cache until
the block bitmap is updated.  This commit achieves this by holding
the group_info alloc_semaphore until ext4_mb_release_context is called.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 16 +++++++++++++---
 fs/ext4/mballoc.h |  5 +++++
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7d7f6f91d55..0c7e247f714 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1052,7 +1052,8 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
 	/* Done with the buddy cache */
-	up_read(e4b->alloc_semp);
+	if (e4b->alloc_semp)
+		up_read(e4b->alloc_semp);
 }
 
 
@@ -1371,7 +1372,9 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	get_page(ac->ac_bitmap_page);
 	ac->ac_buddy_page = e4b->bd_buddy_page;
 	get_page(ac->ac_buddy_page);
-
+	/* on allocation we use ac to track the held semaphore */
+	ac->alloc_semp =  e4b->alloc_semp;
+	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
 	if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
 		spin_lock(&sbi->s_md_lock);
@@ -4289,6 +4292,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ac->ac_pa = NULL;
 	ac->ac_bitmap_page = NULL;
 	ac->ac_buddy_page = NULL;
+	ac->alloc_semp = NULL;
 	ac->ac_lg = NULL;
 
 	/* we have to define context: we'll we work with a file or
@@ -4469,6 +4473,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 		}
 		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
+	if (ac->alloc_semp)
+		up_read(ac->alloc_semp);
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
@@ -4569,10 +4575,14 @@ repeat:
 				ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
 			ext4_mb_new_preallocation(ac);
 	}
-
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
 		if (*errp ==  -EAGAIN) {
+			/*
+			 * drop the reference that we took
+			 * in ext4_mb_use_best_found
+			 */
+			ext4_mb_release_context(ac);
 			ac->ac_b_ex.fe_group = 0;
 			ac->ac_b_ex.fe_start = 0;
 			ac->ac_b_ex.fe_len = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 95d4c7f29a8..10a2921baf1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -195,6 +195,11 @@ struct ext4_allocation_context {
 	__u8 ac_op;		/* operation, for history only */
 	struct page *ac_bitmap_page;
 	struct page *ac_buddy_page;
+	/*
+	 * pointer to the held semaphore upon successful
+	 * block allocation
+	 */
+	struct rw_semaphore *alloc_semp;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
 };
-- 
cgit v1.2.3


From 29eaf024980e07cc01f31ae4ea5d68c917f4b7da Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:48:56 -0500
Subject: ext4: Init the complete page while building buddy cache

We need to init the complete page during buddy cache init
by setting the contents to '1'.  Otherwise we can see the
following errors after doing an online resize of the
filesystem:

EXT4-fs error (device sdb1): ext4_mb_mark_diskspace_used:
	Allocating block 1040385 in system zone of 127 group

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0c7e247f714..fd2294de404 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -846,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	err = 0;
 	first_block = page->index * blocks_per_page;
+	/* init the page  */
+	memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
 		struct ext4_group_info *grinfo;
@@ -872,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug("put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
-			memset(data, 0xff, blocksize);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
-- 
cgit v1.2.3


From 0087d9fb3f29f59e8d42c8b058376d80e5adde4c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 5 Jan 2009 21:49:12 -0500
Subject: ext4: Fix s_dirty_blocks_counter if block allocation failed with
 nodelalloc

With the nodelalloc option we need to update the dirty block counter on
block allocation failure. This is needed because we increment the
dirty block counter early in the block allocation phase. Without
this patch, s_dirty_blocks_counter becomes wrong, so the filesystem's
free block count decreases incorrectly.

Tested-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/mballoc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fd2294de404..05d9f81956c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4541,7 +4541,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	}
 	if (ar->len == 0) {
 		*errp = -EDQUOT;
-		return 0;
+		goto out3;
 	}
 	inquota = ar->len;
 
@@ -4614,6 +4614,13 @@ out2:
 out1:
 	if (ar->len < inquota)
 		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+out3:
+	if (!ar->len) {
+		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+			/* release all the reserved blocks if non delalloc */
+			percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+						reserv_blks);
+	}
 
 	return block;
 }
-- 
cgit v1.2.3


From 87d8fe1ee6b8d2f95076142d58c440dba4e7bdc2 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Jan 2009 09:47:09 -0500
Subject: add releasepage hooks to block devices which can be used by file
 systems

Implement blkdev_releasepage() to release the buffer_heads and pages
after we release private data belonging to a mounted filesystem.

Cc: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/block_dev.c | 15 +++++++++++++++
 fs/super.c     |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 349a26c1000..1dd07e66e98 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1220,6 +1220,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
+/*
+ * Try to release a page associated with block device when the system
+ * is under memory pressure.
+ */
+static int blkdev_releasepage(struct page *page, gfp_t wait)
+{
+	struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super;
+
+	if (super && super->s_op->bdev_try_to_free_page)
+		return super->s_op->bdev_try_to_free_page(super, page, wait);
+
+	return try_to_free_buffers(page);
+}
+
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.writepage	= blkdev_writepage,
@@ -1227,6 +1241,7 @@ static const struct address_space_operations def_blk_aops = {
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
 	.writepages	= generic_writepages,
+	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
 };
 
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a9..d5fd4498548 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -800,6 +800,7 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		}
 
 		s->s_flags |= MS_ACTIVE;
+		bdev->bd_super = s;
 	}
 
 	return simple_set_mnt(mnt, s);
@@ -819,6 +820,7 @@ void kill_block_super(struct super_block *sb)
 	struct block_device *bdev = sb->s_bdev;
 	fmode_t mode = sb->s_mode;
 
+	bdev->bd_super = 0;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
 	close_bdev_exclusive(bdev, mode);
-- 
cgit v1.2.3


From fe30af971d896c144ef4708f97cf9d3186303c42 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 3 Jan 2009 07:16:13 +0000
Subject: remove the rudiment of a.out for sparc

it's been used only in sunos compat

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c | 65 +-------------------------------------------------------
 1 file changed, 1 insertion(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a6..8a3b32f5b78 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -99,88 +99,53 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
 #       define START_DATA(u)	(u.start_data)
 #elif defined(__arm__)
 #	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
-#elif defined(__sparc__)
-#       define START_DATA(u)    (u.u_tsize)
 #elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
 #       define START_DATA(u)	(u.u_tsize << PAGE_SHIFT)
 #endif
-#ifdef __sparc__
-#       define START_STACK(u)   ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
-#else
 #       define START_STACK(u)   (u.start_stack)
-#endif
 
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 	has_dumped = 1;
 	current->flags |= PF_DUMPCORE;
        	strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
-#ifndef __sparc__
 	dump.u_ar0 = offsetof(struct user, regs);
-#endif
 	dump.signal = signr;
 	aout_dump_thread(regs, &dump);
 
 /* If the size of the dump file exceeds the rlimit, then see what would happen
    if we wrote the stack, but not the data area.  */
-#ifdef __sparc__
-	if ((dump.u_dsize + dump.u_ssize) > limit)
-		dump.u_dsize = 0;
-#else
 	if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
 		dump.u_dsize = 0;
-#endif
 
 /* Make sure we have enough room to write the stack and data areas. */
-#ifdef __sparc__
-	if (dump.u_ssize > limit)
-		dump.u_ssize = 0;
-#else
 	if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
 		dump.u_ssize = 0;
-#endif
 
 /* make sure we actually have a data and stack area to dump */
 	set_fs(USER_DS);
-#ifdef __sparc__
-	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
-		dump.u_dsize = 0;
-	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
-		dump.u_ssize = 0;
-#else
 	if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
 		dump.u_dsize = 0;
 	if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
 		dump.u_ssize = 0;
-#endif
 
 	set_fs(KERNEL_DS);
 /* struct user */
 	DUMP_WRITE(&dump,sizeof(dump));
 /* Now dump all of the user data.  Include malloced stuff as well */
-#ifndef __sparc__
 	DUMP_SEEK(PAGE_SIZE);
-#endif
 /* now we start writing out the user space info */
 	set_fs(USER_DS);
 /* Dump the data area */
 	if (dump.u_dsize != 0) {
 		dump_start = START_DATA(dump);
-#ifdef __sparc__
-		dump_size = dump.u_dsize;
-#else
 		dump_size = dump.u_dsize << PAGE_SHIFT;
-#endif
 		DUMP_WRITE(dump_start,dump_size);
 	}
 /* Now prepare to dump the stack area */
 	if (dump.u_ssize != 0) {
 		dump_start = START_STACK(dump);
-#ifdef __sparc__
-		dump_size = dump.u_ssize;
-#else
 		dump_size = dump.u_ssize << PAGE_SHIFT;
-#endif
 		DUMP_WRITE(dump_start,dump_size);
 	}
 /* Finally dump the task struct.  Not be used by gdb, but could be useful */
@@ -205,11 +170,6 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
 	int envc = bprm->envc;
 
 	sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __sparc__
-	/* This imposes the proper stack alignment for a new process. */
-	sp = (void __user *) (((unsigned long) sp) & ~7);
-	if ((envc+argc+3)&1) --sp;
-#endif
 #ifdef __alpha__
 /* whee.. test-programs are so much fun. */
 	put_user(0, --sp);
@@ -302,11 +262,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* OK, This is the point of no return */
 #if defined(__alpha__)
 	SET_AOUT_PERSONALITY(bprm, ex);
-#elif defined(__sparc__)
-	set_personality(PER_SUNOS);
-#if !defined(__sparc_v9__)
-	memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
-#endif
 #else
 	set_personality(PER_LINUX);
 #endif
@@ -322,24 +277,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	install_exec_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
-#ifdef __sparc__
-	if (N_MAGIC(ex) == NMAGIC) {
-		loff_t pos = fd_offset;
-		/* Fuck me plenty... */
-		/* <AOL></AOL> */
-		down_write(&current->mm->mmap_sem);	
-		error = do_brk(N_TXTADDR(ex), ex.a_text);
-		up_write(&current->mm->mmap_sem);
-		bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
-			  ex.a_text, &pos);
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(N_DATADDR(ex), ex.a_data);
-		up_write(&current->mm->mmap_sem);
-		bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
-			  ex.a_data, &pos);
-		goto beyond_if;
-	}
-#endif
 
 	if (N_MAGIC(ex) == OMAGIC) {
 		unsigned long text_addr, map_size;
@@ -347,7 +284,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 		text_addr = N_TXTADDR(ex);
 
-#if defined(__alpha__) || defined(__sparc__)
+#ifdef __alpha__
 		pos = fd_offset;
 		map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
 #else
-- 
cgit v1.2.3


From 17580d7f2f632ff8c9786d609508c35c9f56e1f3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 3 Jan 2009 07:16:23 +0000
Subject: sanitize ifdefs in binfmt_aout

They are actually alpha vs.  i386/arm/m68k i.e. ecoff vs. aout.

In the only place where we actually tried to handle arm and i386/m68k in
different ways (START_DATA() in coredump handling), the arm variant
works for all of them (i386 and m68k have u.start_code set to 0).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 8a3b32f5b78..b639dcf7c77 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,12 +95,10 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
 	int has_dumped = 0;
 	unsigned long dump_start, dump_size;
 	struct user dump;
-#if defined(__alpha__)
+#ifdef __alpha__
 #       define START_DATA(u)	(u.start_data)
-#elif defined(__arm__)
+#else
 #	define START_DATA(u)	((u.u_tsize << PAGE_SHIFT) + u.start_code)
-#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
-#       define START_DATA(u)	(u.u_tsize << PAGE_SHIFT)
 #endif
 #       define START_STACK(u)   (u.start_stack)
 
@@ -176,18 +174,18 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
 	put_user(0, --sp);
 	if (bprm->loader) {
 		put_user(0, --sp);
-		put_user(0x3eb, --sp);
+		put_user(1003, --sp);
 		put_user(bprm->loader, --sp);
-		put_user(0x3ea, --sp);
+		put_user(1002, --sp);
 	}
 	put_user(bprm->exec, --sp);
-	put_user(0x3e9, --sp);
+	put_user(1001, --sp);
 #endif
 	sp -= envc+1;
 	envp = (char __user * __user *) sp;
 	sp -= argc+1;
 	argv = (char __user * __user *) sp;
-#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__)
+#ifndef __alpha__
 	put_user((unsigned long) envp,--sp);
 	put_user((unsigned long) argv,--sp);
 #endif
@@ -260,7 +258,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		return retval;
 
 	/* OK, This is the point of no return */
-#if defined(__alpha__)
+#ifdef __alpha__
 	SET_AOUT_PERSONALITY(bprm, ex);
 #else
 	set_personality(PER_LINUX);
-- 
cgit v1.2.3


From 3bfacef412b4bc993a8992217e50f1245f2fd3a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 3 Jan 2009 07:16:33 +0000
Subject: get rid of special-casing the /sbin/loader on alpha

... just make it a binfmt handler like #! one.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index dfbf7009fbe..3ef9cf9b187 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -57,11 +57,6 @@
 #include <asm/tlb.h>
 #include "internal.h"
 
-#ifdef __alpha__
-/* for /sbin/loader handling in search_binary_handler() */
-#include <linux/a.out.h>
-#endif
-
 int core_uses_pid;
 char core_pattern[CORENAME_MAX_SIZE] = "core";
 int suid_dumpable = 0;
@@ -1172,41 +1167,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	unsigned int depth = bprm->recursion_depth;
 	int try,retval;
 	struct linux_binfmt *fmt;
-#ifdef __alpha__
-	/* handle /sbin/loader.. */
-	{
-	    struct exec * eh = (struct exec *) bprm->buf;
 
-	    if (!bprm->loader && eh->fh.f_magic == 0x183 &&
-		(eh->fh.f_flags & 0x3000) == 0x3000)
-	    {
-		struct file * file;
-		unsigned long loader;
-
-		allow_write_access(bprm->file);
-		fput(bprm->file);
-		bprm->file = NULL;
-
-		loader = bprm->vma->vm_end - sizeof(void *);
-
-		file = open_exec("/sbin/loader");
-		retval = PTR_ERR(file);
-		if (IS_ERR(file))
-			return retval;
-
-		/* Remember if the application is TASO.  */
-		bprm->taso = eh->ah.entry < 0x100000000UL;
-
-		bprm->file = file;
-		bprm->loader = loader;
-		retval = prepare_binprm(bprm);
-		if (retval<0)
-			return retval;
-		/* should call search_binary_handler recursively here,
-		   but it does not matter */
-	    }
-	}
-#endif
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
-- 
cgit v1.2.3


From 6b082b531228c43d454c082fc0f969da1695b060 Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Mon, 5 Jan 2009 22:38:14 -0500
Subject: ext3: provide function to release metadata pages under memory
 pressure

Pages in the page cache belonging to ext3 data files are released via
the ext3_releasepage() function specified in the ext3 inode's
address_space_ops.  However, metadata blocks (such as indirect blocks,
directory blocks, etc) are managed via the block device
address_space_ops, and they can not be released by
try_to_free_buffers() if they have a journal head attached to them.

To address this, we supply a bdev_try_to_free_page() function which calls
the journal_try_to_free_buffers() function to free the metadata, and which
is called by the block device's blkdev_releasepage() function.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ext3/super.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 541d5e4f7f6..6900ff05e3a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -682,6 +682,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid,
 				    ext3_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
+{
+	journal_t *journal = EXT3_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return journal_try_to_free_buffers(journal, page, 
+						   wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -746,6 +766,7 @@ static const struct super_operations ext3_sops = {
 	.quota_read	= ext3_quota_read,
 	.quota_write	= ext3_quota_write,
 #endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext3_export_ops = {
-- 
cgit v1.2.3


From c39a7f84d7845aa95d1c7c168f38215aedcc13c2 Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Mon, 5 Jan 2009 22:38:48 -0500
Subject: ext4: provide function to release metadata pages under memory
 pressure

Pages in the page cache belonging to ext4 data files are released via
the ext4_releasepage() function specified in the ext4 inode's
address_space_ops.  However, metadata blocks (such as indirect blocks,
directory blocks, etc) are managed via the block device
address_space_ops, and they can not be released by
try_to_free_buffers() if they have a journal head attached to them.

To address this, we supply a bdev_try_to_free_page() function which calls
the jbd2_journal_try_to_free_buffers() function to free the metadata, and
which is called by the block device's blkdev_releasepage() function.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/ext4/super.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a3321bf2231..e5ab520724d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -907,6 +907,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
 				    ext4_nfs_get_inode);
 }
 
+/*
+ * Try to release metadata pages (indirect blocks, directories) which are
+ * mapped via the block device.  Since these pages could have journal heads
+ * which would prevent try_to_free_buffers() from freeing them, we must use
+ * jbd2 layer's try_to_free_buffers() function to release them.
+ */
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+{
+	journal_t *journal = EXT4_SB(sb)->s_journal;
+
+	WARN_ON(PageChecked(page));
+	if (!page_has_buffers(page))
+		return 0;
+	if (journal)
+		return jbd2_journal_try_to_free_buffers(journal, page,
+							wait & ~__GFP_WAIT);
+	return try_to_free_buffers(page);
+}
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -971,6 +990,7 @@ static const struct super_operations ext4_sops = {
 	.quota_read	= ext4_quota_read,
 	.quota_write	= ext4_quota_write,
 #endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
 static const struct export_operations ext4_export_ops = {
-- 
cgit v1.2.3


From c31910672376dfb8d020e32afa7249763bcd924a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 6 Jan 2009 11:14:25 -0500
Subject: ext4: Remove code to create the journal inode

This code has been obsolete for quite some time, since the supported
method for adding a journal inode is to use tune2fs (or to create a
new filesystem with a journal via mke2fs or mkfs.ext4).

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c   | 68 ++++------------------------------------------------
 fs/jbd2/journal.c | 72 -------------------------------------------------------
 2 files changed, 4 insertions(+), 136 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5ab520724d..8036392b212 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
-			       unsigned int);
 static void ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
@@ -1006,7 +1004,7 @@ enum {
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
-	Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
+	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
@@ -1048,7 +1046,6 @@ static const match_table_t tokens = {
 	{Opt_min_batch_time, "min_batch_time=%u"},
 	{Opt_max_batch_time, "max_batch_time=%u"},
 	{Opt_journal_update, "journal=update"},
-	{Opt_journal_inum, "journal=%u"},
 	{Opt_journal_dev, "journal_dev=%u"},
 	{Opt_journal_checksum, "journal_checksum"},
 	{Opt_journal_async_commit, "journal_async_commit"},
@@ -1102,7 +1099,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 }
 
 static int parse_options(char *options, struct super_block *sb,
-			 unsigned int *inum, unsigned long *journal_devnum,
+			 unsigned long *journal_devnum,
 			 ext4_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1226,16 +1223,6 @@ static int parse_options(char *options, struct super_block *sb,
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
-		case Opt_journal_inum:
-			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
-				return 0;
-			}
-			if (match_int(&args[0], &option))
-				return 0;
-			*inum = option;
-			break;
 		case Opt_journal_dev:
 			if (is_remount) {
 				printk(KERN_ERR "EXT4-fs: cannot specify "
@@ -2035,7 +2022,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_fsblk_t sb_block = get_sb_block(&data);
 	ext4_fsblk_t logical_sb_block;
 	unsigned long offset = 0;
-	unsigned int journal_inum = 0;
 	unsigned long journal_devnum = 0;
 	unsigned long def_mount_opts;
 	struct inode *root;
@@ -2155,8 +2141,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
 
-	if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
-			   NULL, 0))
+	if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2460,9 +2445,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				goto failed_mount4;
 			}
 		}
-	} else if (journal_inum) {
-		if (ext4_create_journal(sb, es, journal_inum))
-			goto failed_mount3;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		printk(KERN_ERR "EXT4-fs: required journal recovery "
@@ -2926,48 +2908,6 @@ static int ext4_load_journal(struct super_block *sb,
 	return 0;
 }
 
-static int ext4_create_journal(struct super_block *sb,
-			       struct ext4_super_block *es,
-			       unsigned int journal_inum)
-{
-	journal_t *journal;
-	int err;
-
-	if (sb->s_flags & MS_RDONLY) {
-		printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to "
-				"create journal.\n");
-		return -EROFS;
-	}
-
-	journal = ext4_get_journal(sb, journal_inum);
-	if (!journal)
-		return -EINVAL;
-
-	printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n",
-	       journal_inum);
-
-	err = jbd2_journal_create(journal);
-	if (err) {
-		printk(KERN_ERR "EXT4-fs: error creating journal.\n");
-		jbd2_journal_destroy(journal);
-		return -EIO;
-	}
-
-	EXT4_SB(sb)->s_journal = journal;
-
-	ext4_update_dynamic_rev(sb);
-	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-	EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL);
-
-	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;
-
-	/* Make sure we flush the recovery flag to disk. */
-	ext4_commit_super(sb, es, 1);
-
-	return 0;
-}
-
 static void ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync)
 {
@@ -3209,7 +3149,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) {
+	if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 34ef9805720..b10d7283ba5 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -66,7 +66,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format);
 EXPORT_SYMBOL(jbd2_journal_check_used_features);
 EXPORT_SYMBOL(jbd2_journal_check_available_features);
 EXPORT_SYMBOL(jbd2_journal_set_features);
-EXPORT_SYMBOL(jbd2_journal_create);
 EXPORT_SYMBOL(jbd2_journal_load);
 EXPORT_SYMBOL(jbd2_journal_destroy);
 EXPORT_SYMBOL(jbd2_journal_abort);
@@ -1162,77 +1161,6 @@ static int journal_reset(journal_t *journal)
 	return jbd2_journal_start_thread(journal);
 }
 
-/**
- * int jbd2_journal_create() - Initialise the new journal file
- * @journal: Journal to create. This structure must have been initialised
- *
- * Given a journal_t structure which tells us which disk blocks we can
- * use, create a new journal superblock and initialise all of the
- * journal fields from scratch.
- **/
-int jbd2_journal_create(journal_t *journal)
-{
-	unsigned long long blocknr;
-	struct buffer_head *bh;
-	journal_superblock_t *sb;
-	int i, err;
-
-	if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) {
-		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
-			journal->j_maxlen);
-		journal_fail_superblock(journal);
-		return -EINVAL;
-	}
-
-	if (journal->j_inode == NULL) {
-		/*
-		 * We don't know what block to start at!
-		 */
-		printk(KERN_EMERG
-		       "%s: creation of journal on external device!\n",
-		       __func__);
-		BUG();
-	}
-
-	/* Zero out the entire journal on disk.  We cannot afford to
-	   have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */
-	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
-	for (i = 0; i < journal->j_maxlen; i++) {
-		err = jbd2_journal_bmap(journal, i, &blocknr);
-		if (err)
-			return err;
-		bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-		lock_buffer(bh);
-		memset (bh->b_data, 0, journal->j_blocksize);
-		BUFFER_TRACE(bh, "marking dirty");
-		mark_buffer_dirty(bh);
-		BUFFER_TRACE(bh, "marking uptodate");
-		set_buffer_uptodate(bh);
-		unlock_buffer(bh);
-		__brelse(bh);
-	}
-
-	sync_blockdev(journal->j_dev);
-	jbd_debug(1, "JBD: journal cleared.\n");
-
-	/* OK, fill in the initial static fields in the new superblock */
-	sb = journal->j_superblock;
-
-	sb->s_header.h_magic	 = cpu_to_be32(JBD2_MAGIC_NUMBER);
-	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-
-	sb->s_blocksize	= cpu_to_be32(journal->j_blocksize);
-	sb->s_maxlen	= cpu_to_be32(journal->j_maxlen);
-	sb->s_first	= cpu_to_be32(1);
-
-	journal->j_transaction_sequence = 1;
-
-	journal->j_flags &= ~JBD2_ABORT;
-	journal->j_format_version = 2;
-
-	return journal_reset(journal);
-}
-
 /**
  * void jbd2_journal_update_superblock() - Update journal sb on disk.
  * @journal: The journal to update.
-- 
cgit v1.2.3


From ba80b1019aa722b24506db1ee755e0bb2f513022 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Jan 2009 20:03:21 -0500
Subject: ext4: Add markers for better debuggability

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c  |  9 +++++++++
 fs/ext4/inode.c   | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/mballoc.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 116 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e3aa3fa3860..369c34c6429 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -210,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 
 	ino = inode->i_ino;
 	ext4_debug("freeing inode %lu\n", ino);
+	trace_mark(ext4_free_inode,
+		   "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu",
+		   sb->s_id, inode->i_ino, inode->i_mode,
+		   (unsigned long) inode->i_uid, (unsigned long) inode->i_gid,
+		   (unsigned long long) inode->i_blocks);
 
 	/*
 	 * Note: we must free any quota before locking the superblock,
@@ -698,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		return ERR_PTR(-EPERM);
 
 	sb = dir->i_sb;
+	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
+		   dir->i_ino, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
@@ -925,6 +932,8 @@ got:
 	}
 
 	ext4_debug("allocating inode %lu\n", inode->i_ino);
+	trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d",
+		   sb->s_id, inode->i_ino, dir->i_ino, mode);
 	goto really_out;
 fail:
 	ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 56142accf5c..4cac8da4e0c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1351,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
  	pgoff_t index;
 	unsigned from, to;
 
+	trace_mark(ext4_write_begin,
+		   "dev %s ino %lu pos %llu len %u flags %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, flags);
  	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1422,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file,
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
 
+	trace_mark(ext4_ordered_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
@@ -1460,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file,
 	int ret = 0, ret2;
 	loff_t new_i_size;
 
+	trace_mark(ext4_writeback_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	new_i_size = pos + copied;
 	if (new_i_size > EXT4_I(inode)->i_disksize) {
 		ext4_update_i_disksize(inode, new_i_size);
@@ -1495,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file,
 	unsigned from, to;
 	loff_t new_i_size;
 
+	trace_mark(ext4_journalled_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
@@ -2311,6 +2327,9 @@ static int ext4_da_writepage(struct page *page,
 	struct buffer_head *page_bufs;
 	struct inode *inode = page->mapping->host;
 
+	trace_mark(ext4_da_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2421,6 +2440,20 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
+	trace_mark(ext4_da_writepages,
+		   "dev %s ino %lu nr_t_write %ld "
+		   "pages_skipped %ld range_start %llu "
+		   "range_end %llu nonblocking %d "
+		   "for_kupdate %d for_reclaim %d "
+		   "for_writepages %d range_cyclic %d",
+		   inode->i_sb->s_id, inode->i_ino,
+		   wbc->nr_to_write, wbc->pages_skipped,
+		   (unsigned long long) wbc->range_start,
+		   (unsigned long long) wbc->range_end,
+		   wbc->nonblocking, wbc->for_kupdate,
+		   wbc->for_reclaim, wbc->for_writepages,
+		   wbc->range_cyclic);
+
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
@@ -2539,6 +2572,14 @@ out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
+	trace_mark(ext4_da_writepage_result,
+		   "dev %s ino %lu ret %d pages_written %d "
+		   "pages_skipped %ld congestion %d "
+		   "more_io %d no_nrwrite_index_update %d",
+		   inode->i_sb->s_id, inode->i_ino, ret,
+		   pages_written, wbc->pages_skipped,
+		   wbc->encountered_congestion, wbc->more_io,
+		   wbc->no_nrwrite_index_update);
 	return ret;
 }
 
@@ -2590,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 					len, flags, pagep, fsdata);
 	}
 	*fsdata = (void *)0;
+
+	trace_mark(ext4_da_write_begin,
+		   "dev %s ino %lu pos %llu len %u flags %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, flags);
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
@@ -2679,6 +2725,10 @@ static int ext4_da_write_end(struct file *file,
 		}
 	}
 
+	trace_mark(ext4_da_write_end,
+		   "dev %s ino %lu pos %llu len %u copied %u",
+		   inode->i_sb->s_id, inode->i_ino,
+		   (unsigned long long) pos, len, copied);
 	start = pos & (PAGE_CACHE_SIZE - 1);
 	end = start + copied - 1;
 
@@ -2892,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page,
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
+	trace_mark(ext4_normal_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
@@ -2977,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page,
 	loff_t size = i_size_read(inode);
 	loff_t len;
 
+	trace_mark(ext4_journalled_writepage,
+		   "dev %s ino %lu page_index %lu",
+		   inode->i_sb->s_id, inode->i_ino, page->index);
 	J_ASSERT(PageLocked(page));
 	if (page->index == size >> PAGE_CACHE_SHIFT)
 		len = size & ~PAGE_CACHE_MASK;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 05d9f81956c..918aec0c8a1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2878,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
 			+ entry->start_blk
 			+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
-			   (unsigned long long) discard_block, entry->count);
+		trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u",
+			   sb->s_id, (unsigned long long) discard_block,
+			   entry->count);
 		sb_issue_discard(sb, discard_block, entry->count);
 
 		kmem_cache_free(ext4_free_ext_cachep, entry);
@@ -3697,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 
 	mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
 			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_mark(ext4_mb_new_inode_pa,
+		   "dev %s ino %lu pstart %llu len %u lstart %u",
+		   sb->s_id, ac->ac_inode->i_ino,
+		   pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
 	ext4_mb_use_inode_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3754,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	pa->pa_linear = 1;
 
 	mb_debug("new group pa %p: %llu/%u for %u\n", pa,
-			pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+		 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
+	trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u",
+		   sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart);
 
 	ext4_mb_use_group_pa(ac, pa);
 	atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
@@ -3807,12 +3814,14 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	unsigned int next;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
+	unsigned long long grp_blk_start;
 	sector_t start;
 	int err = 0;
 	int free = 0;
 
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
+	grp_blk_start = pa->pa_pstart - bit;
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
@@ -3842,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			ext4_mb_store_history(ac);
 		}
 
+		trace_mark(ext4_mb_release_inode_pa,
+			   "dev %s ino %lu block %llu count %u",
+			   sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit,
+			   next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
 	}
@@ -3875,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	if (ac)
 		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 
+	trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d",
+		   sb->s_id, pa->pa_pstart, pa->pa_len);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
@@ -4040,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode)
 	}
 
 	mb_debug("discard preallocation for inode %lu\n", inode->i_ino);
+	trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id,
+		   inode->i_ino);
 
 	INIT_LIST_HEAD(&list);
 
@@ -4492,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 	int ret;
 	int freed = 0;
 
+	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
+		   sb->s_id, needed);
 	for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
 		freed += ret;
@@ -4520,6 +4539,18 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	sb = ar->inode->i_sb;
 	sbi = EXT4_SB(sb);
 
+	trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu "
+		   "lblk %llu goal %llu lleft %llu lright %llu "
+		   "pleft %llu pright %llu ",
+		   sb->s_id, ar->flags, ar->len,
+		   ar->inode ? ar->inode->i_ino : 0,
+		   (unsigned long long) ar->logical,
+		   (unsigned long long) ar->goal,
+		   (unsigned long long) ar->lleft,
+		   (unsigned long long) ar->lright,
+		   (unsigned long long) ar->pleft,
+		   (unsigned long long) ar->pright);
+
 	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
 		/*
 		 * With delalloc we already reserved the blocks
@@ -4622,6 +4653,19 @@ out3:
 						reserv_blks);
 	}
 
+	trace_mark(ext4_allocate_blocks,
+		   "dev %s block %llu flags %u len %u ino %lu "
+		   "logical %llu goal %llu lleft %llu lright %llu "
+		   "pleft %llu pright %llu ",
+		   sb->s_id, (unsigned long long) block,
+		   ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0,
+		   (unsigned long long) ar->logical,
+		   (unsigned long long) ar->goal,
+		   (unsigned long long) ar->lleft,
+		   (unsigned long long) ar->lright,
+		   (unsigned long long) ar->pleft,
+		   (unsigned long long) ar->pright);
+
 	return block;
 }
 
@@ -4755,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	ext4_debug("freeing block %lu\n", block);
+	trace_mark(ext4_free_blocks,
+		   "dev %s block %llu count %lu metadata %d ino %lu",
+		   sb->s_id, (unsigned long long) block, count, metadata,
+		   inode ? inode->i_ino : 0);
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
-- 
cgit v1.2.3


From 4a9bf99b205448ec1f0cbdee1776a29f9c503ce4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 3 Jan 2009 22:56:44 -0500
Subject: jbd2: Add pid and journal device name to the "kjournald2 starting"
 message

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b10d7283ba5..fe20e40ee7c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -131,8 +131,9 @@ static int kjournald2(void *arg)
 	journal->j_task = current;
 	wake_up(&journal->j_wait_done_commit);
 
-	printk(KERN_INFO "kjournald2 starting.  Commit interval %ld seconds\n",
-			journal->j_commit_interval / HZ);
+	printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
+	       "commit interval %ld seconds\n", current->pid,
+	       journal->j_devname, journal->j_commit_interval / HZ);
 
 	/*
 	 * And now, wait forever for commit wakeup events.
-- 
cgit v1.2.3


From 157cf649a735a2f7e8dba0ed08e6e38b6c30d886 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Dec 2008 04:57:47 -0500
Subject: sanitize audit_fd_pair()

* no allocations
* return void

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b..891697112f6 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
 		goto err_fdr;
 	fdw = error;
 
-	error = audit_fd_pair(fdr, fdw);
-	if (error < 0)
-		goto err_fdw;
-
+	audit_fd_pair(fdr, fdw);
 	fd_install(fdr, fr);
 	fd_install(fdw, fw);
 	fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
 
 	return 0;
 
- err_fdw:
-	put_unused_fd(fdw);
  err_fdr:
 	put_unused_fd(fdr);
  err_read_pipe:
-- 
cgit v1.2.3


From c644f0e4b56f9a2fc066cd0d75a18074d130e4a3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 4 Jan 2009 12:00:48 -0800
Subject: fs: introduce bgl_lock_ptr()

As suggested by Andreas Dilger, introduce a bgl_lock_ptr() helper in
<linux/blockgroup_lock.h> and add separate sb_bgl_lock() helpers to
filesystem specific header files to break the hidden dependency to
struct ext[234]_sb_info.

Also, while at it, convert the macros to static inlines to try to make up
for all the times I broke Andrew Morton's tree.

Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4_sb.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df..b21f16713db 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -146,4 +146,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 };
 
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
 #endif	/* _EXT4_SB */
-- 
cgit v1.2.3


From 54566b2c1594c2326a645a3551f9d989f7ba3c5e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sun, 4 Jan 2009 12:00:53 -0800
Subject: fs: symlink write_begin allocation context fix

With the write_begin/write_end aops, page_symlink was broken because it
could no longer pass a GFP_NOFS type mask into the point where the
allocations happened.  They are done in write_begin, which would always
assume that the filesystem can be entered from reclaim.  This bug could
cause filesystem deadlocks.

The funny thing with having a gfp_t mask there is that it doesn't really
allow the caller to arbitrarily tinker with the context in which it can be
called.  It couldn't ever be GFP_ATOMIC, for example, because it needs to
take the page lock.  The only thing any callers care about is __GFP_FS
anyway, so turn that into a single flag.

Add a new flag for write_begin, AOP_FLAG_NOFS.  Filesystems can now act on
this flag in their write_begin function.  Change __grab_cache_page to
accept a nofs argument as well, to honour that flag (while we're there,
change the name to grab_cache_page_write_begin which is more instructive
and does away with random leading underscores).

This is really a more flexible way to go in the end anyway -- if a
filesystem happens to want any extra allocations aside from the pagecache
ones in its write_begin function, it may now use GFP_KERNEL (rather than
GFP_NOFS) for common case allocations (eg.  ocfs2_alloc_write_ctxt, for a
random example).

[kosaki.motohiro@jp.fujitsu.com: fix ubifs]
[kosaki.motohiro@jp.fujitsu.com: fix fuse]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: <stable@kernel.org>		[2.6.28.x]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Cleaned up the calling convention: just pass in the AOP flags
  untouched to the grab_cache_page_write_begin() function.  That
  just simplifies everybody, and may even allow future expansion of the
  logic.   - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/affs/file.c          |  2 +-
 fs/afs/write.c          |  2 +-
 fs/buffer.c             |  4 ++--
 fs/cifs/file.c          |  2 +-
 fs/ecryptfs/mmap.c      |  2 +-
 fs/ext3/inode.c         |  2 +-
 fs/ext3/namei.c         |  3 +--
 fs/ext4/inode.c         |  4 ++--
 fs/ext4/namei.c         |  3 +--
 fs/fuse/file.c          |  4 ++--
 fs/gfs2/ops_address.c   |  2 +-
 fs/hostfs/hostfs_kern.c |  2 +-
 fs/jffs2/file.c         |  2 +-
 fs/libfs.c              |  2 +-
 fs/namei.c              | 13 +++++++++----
 fs/nfs/file.c           |  2 +-
 fs/reiserfs/inode.c     |  2 +-
 fs/smbfs/file.c         |  2 +-
 fs/ubifs/file.c         |  9 +++++----
 19 files changed, 34 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6..9246cb4aa01 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35f..3fb36d43362 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		kfree(candidate);
 		return -ENOMEM;
diff --git a/fs/buffer.c b/fs/buffer.c
index 776ae091d3b..a13f09b696f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1996,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 	page = *pagep;
 	if (page == NULL) {
 		ownpage = 1;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, flags);
 		if (!page) {
 			status = -ENOMEM;
 			goto out;
@@ -2502,7 +2502,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6..12bb656fbe7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 
 	cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		rc = -ENOMEM;
 		goto out;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac..46cec2b6979 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
 	loff_t prev_page_end_size;
 	int rc = 0;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c4bdccf976b..5fa453b49a6 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1161,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
 	to = from + len;
 
 retry:
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 297ea8dfac7..1dd2abe6313 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2175,8 +2175,7 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = __page_symlink(inode, symname, l,
-				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		err = __page_symlink(inode, symname, l, 1);
 		if (err) {
 			drop_nlink(inode);
 			unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c3325e0b00..6702a49992a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1346,7 +1346,7 @@ retry:
 		goto out;
 	}
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
 		ret = -ENOMEM;
@@ -2550,7 +2550,7 @@ retry:
 		goto out;
 	}
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
 		ret = -ENOMEM;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index da98a9012fa..9fd2a5e1be4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2212,8 +2212,7 @@ retry:
 		 * We have a transaction open.  All is sweetness.  It also sets
 		 * i_size in generic_commit_write().
 		 */
-		err = __page_symlink(inode, symname, l,
-				mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+		err = __page_symlink(inode, symname, l, 1);
 		if (err) {
 			clear_nlink(inode);
 			unlock_new_inode(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b8..4c9ee701126 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -646,7 +646,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
@@ -779,7 +779,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 			break;
 
 		err = -ENOMEM;
-		page = __grab_cache_page(mapping, index);
+		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			break;
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c..15f710f2d4d 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -675,7 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		goto out_trans_fail;
 
 	error = -ENOMEM;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	*pagep = page;
 	if (unlikely(!page))
 		goto out_endtrans;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac17..5c538e0ec14 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c85..5edc2bf2058 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
 	int ret = 0;
 
-	pg = __grab_cache_page(mapping, index);
+	pg = grab_cache_page_write_begin(mapping, index, flags);
 	if (!pg)
 		return -ENOMEM;
 	*pagep = pg;
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a832190..bdaec17fa38 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -360,7 +360,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/fs/namei.c b/fs/namei.c
index dd5c9f0bf82..df2d3df4f04 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2817,18 +2817,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 	}
 }
 
-int __page_symlink(struct inode *inode, const char *symname, int len,
-		gfp_t gfp_mask)
+/*
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
 	int err;
 	char *kaddr;
+	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+	if (nofs)
+		flags |= AOP_FLAG_NOFS;
 
 retry:
 	err = pagecache_write_begin(NULL, mapping, 0, len-1,
-				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+				flags, &page, &fsdata);
 	if (err)
 		goto fail;
 
@@ -2852,7 +2857,7 @@ fail:
 int page_symlink(struct inode *inode, const char *symname, int len)
 {
 	return __page_symlink(inode, symname, len,
-			mapping_gfp_mask(inode->i_mapping));
+			!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
 }
 
 const struct inode_operations page_symlink_inode_operations = {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f0..90f292b520d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 		file->f_path.dentry->d_name.name,
 		mapping->host->i_ino, len, (long long) pos);
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 145c2d3e5e0..ed04f47007f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2561,7 +2561,7 @@ static int reiserfs_write_begin(struct file *file,
 	}
 
 	index = pos >> PAGE_CACHE_SHIFT;
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a555..92d5e8ffb63 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	*pagep = __grab_cache_page(mapping, index);
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index fe82d2464d4..bf37374567f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
 }
 
 static int write_begin_slow(struct address_space *mapping,
-			    loff_t pos, unsigned len, struct page **pagep)
+			    loff_t pos, unsigned len, struct page **pagep,
+			    unsigned flags)
 {
 	struct inode *inode = mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,7 +248,7 @@ static int write_begin_slow(struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (unlikely(!page)) {
 		ubifs_release_budget(c, &req);
 		return -ENOMEM;
@@ -438,7 +439,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		return -EROFS;
 
 	/* Try out the fast-path part first */
-	page = __grab_cache_page(mapping, index);
+	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (unlikely(!page))
 		return -ENOMEM;
 
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		unlock_page(page);
 		page_cache_release(page);
 
-		return write_begin_slow(mapping, pos, len, pagep);
+		return write_begin_slow(mapping, pos, len, pagep, flags);
 	}
 
 	/*
-- 
cgit v1.2.3


From 40a1984d22294ab202f616e432bb8d3481897675 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 4 Jan 2009 19:55:57 -0500
Subject: jbd2: Submit writes to the journal using WRITE_SYNC

Since we will be waiting for the write of the commit record to the
journal to complete in journal_submit_commit_record(), submit it using
WRITE_SYNC.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/commit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 0ad84162c42..073124a29b8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE, bh);
+	ret = submit_bh(WRITE_SYNC, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 	}
 	*cbh = bh;
 	return ret;
-- 
cgit v1.2.3


From e9079cce201784632aed4b1a3121ee38c1ced0b6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 14 Oct 2008 14:43:29 +0100
Subject: GFS2: Support for FIEMAP ioctl

This patch implements the FIEMAP ioctl for GFS2. We can use the generic
code (aside from a lock order issue, solved as per Ted Tso's suggestion)
for which I've introduced a new variant of the generic function. We also
have one exception to deal with, namely stuffed files, so we do that
"by hand", setting all the required flags.

This has been tested with a modified version (I could only find an old one)
of Eric's test program, and appears to work correctly.

This patch does not currently support FIEMAP of xattrs, but the plan is to add
that feature at some future point.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Eric Sandeen <sandeen@redhat.com>
---
 fs/gfs2/ops_inode.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ioctl.c          | 44 ++++++++++++++++++++++++++++++++++----------
 2 files changed, 80 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d232991b904..1e24b65e1d2 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -19,6 +19,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/fiemap.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -1212,6 +1213,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er);
 }
 
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		       u64 start, u64 len)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+	if (ret)
+		return ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+	if (ret)
+		goto out;
+
+	if (gfs2_is_stuffed(ip)) {
+		u64 phys = ip->i_no_addr << inode->i_blkbits;
+		u64 size = i_size_read(inode);
+		u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+			    FIEMAP_EXTENT_DATA_INLINE;
+		phys += sizeof(struct gfs2_dinode);
+		phys += start;
+		if (start + len > size)
+			len = size - start;
+		if (start < size)
+			ret = fiemap_fill_next_extent(fieinfo, start, phys,
+						      len, flags);
+		if (ret == 1)
+			ret = 0;
+	} else {
+		ret = __generic_block_fiemap(inode, fieinfo, start, len,
+					     gfs2_block_map);
+	}
+
+	gfs2_glock_dq_uninit(&gh);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 const struct inode_operations gfs2_file_iops = {
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
@@ -1220,6 +1263,7 @@ const struct inode_operations gfs2_file_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
 const struct inode_operations gfs2_dir_iops = {
@@ -1239,6 +1283,7 @@ const struct inode_operations gfs2_dir_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
 const struct inode_operations gfs2_symlink_iops = {
@@ -1251,5 +1296,6 @@ const struct inode_operations gfs2_symlink_iops = {
 	.getxattr = gfs2_getxattr,
 	.listxattr = gfs2_listxattr,
 	.removexattr = gfs2_removexattr,
+	.fiemap = gfs2_fiemap,
 };
 
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 43e8b2c0664..cc3f1aa1cf7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
 #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
 
-/*
+/**
+ * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
  * @inode - the inode to map
  * @arg - the pointer to userspace where we copy everything to
  * @get_block - the fs's get_block function
@@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
  *
  * If it is possible to have data blocks beyond a hole past @inode->i_size, then
  * please do not use this function, it will stop at the first unmapped block
- * beyond i_size
+ * beyond i_size.
+ *
+ * If you use this function directly, you need to do your own locking. Use
+ * generic_block_fiemap if you want the locking done for you.
  */
-int generic_block_fiemap(struct inode *inode,
-			 struct fiemap_extent_info *fieinfo, u64 start,
-			 u64 len, get_block_t *get_block)
+
+int __generic_block_fiemap(struct inode *inode,
+			   struct fiemap_extent_info *fieinfo, u64 start,
+			   u64 len, get_block_t *get_block)
 {
 	struct buffer_head tmp;
 	unsigned int start_blk;
@@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode,
 
 	start_blk = logical_to_blk(inode, start);
 
-	/* guard against change */
-	mutex_lock(&inode->i_mutex);
-
 	length = (long long)min_t(u64, len, i_size_read(inode));
 	map_len = length;
 
@@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode,
 		cond_resched();
 	} while (1);
 
-	mutex_unlock(&inode->i_mutex);
-
 	/* if ret is 1 then we just hit the end of the extent array */
 	if (ret == 1)
 		ret = 0;
 
 	return ret;
 }
+EXPORT_SYMBOL(__generic_block_fiemap);
+
+/**
+ * generic_block_fiemap - FIEMAP for block based inodes
+ * @inode: The inode to map
+ * @fieinfo: The mapping information
+ * @start: The initial block to map
+ * @len: The length of the extent to attempt to map
+ * @get_block: The block mapping function for the fs
+ *
+ * Calls __generic_block_fiemap to map the inode, after taking
+ * the inode's mutex lock.
+ */
+
+int generic_block_fiemap(struct inode *inode,
+			 struct fiemap_extent_info *fieinfo, u64 start,
+			 u64 len, get_block_t *get_block)
+{
+	int ret;
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
 EXPORT_SYMBOL(generic_block_fiemap);
 
 #endif  /*  CONFIG_BLOCK  */
-- 
cgit v1.2.3


From b276058371f5c2ad92f9f27373a72b219ed580ed Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 14 Oct 2008 16:05:55 +0100
Subject: GFS2: Rationalise header files

Move the contents of some headers which contained very
little into more sensible places, and remove the original
header files. This should make it easier to find things.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c      |  1 -
 fs/gfs2/inode.h      | 11 +++++++++++
 fs/gfs2/main.c       |  2 +-
 fs/gfs2/ops_dentry.c |  2 +-
 fs/gfs2/ops_dentry.h | 17 -----------------
 fs/gfs2/ops_export.c |  3 +--
 fs/gfs2/ops_file.c   |  1 -
 fs/gfs2/ops_fstype.c |  3 ---
 fs/gfs2/ops_fstype.h | 19 -------------------
 fs/gfs2/ops_inode.c  |  3 +--
 fs/gfs2/ops_inode.h  | 25 -------------------------
 fs/gfs2/ops_super.c  |  1 -
 fs/gfs2/ops_super.h  | 17 -----------------
 fs/gfs2/super.h      |  8 ++++++++
 14 files changed, 23 insertions(+), 90 deletions(-)
 delete mode 100644 fs/gfs2/ops_dentry.h
 delete mode 100644 fs/gfs2/ops_fstype.h
 delete mode 100644 fs/gfs2/ops_inode.h
 delete mode 100644 fs/gfs2/ops_super.h

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d57616840e8..bf4676d7acd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -32,7 +32,6 @@
 #include "log.h"
 #include "meta_io.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2d43f69610a..c3577906f0a 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,6 +10,7 @@
 #ifndef __INODE_DOT_H__
 #define __INODE_DOT_H__
 
+#include <linux/fs.h>
 #include "util.h"
 
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
 void gfs2_dinode_print(const struct gfs2_inode *ip);
 
+extern const struct inode_operations gfs2_file_iops;
+extern const struct inode_operations gfs2_dir_iops;
+extern const struct inode_operations gfs2_symlink_iops;
+extern const struct file_operations gfs2_file_fops;
+extern const struct file_operations gfs2_dir_fops;
+extern const struct file_operations gfs2_file_fops_nolock;
+extern const struct file_operations gfs2_dir_fops_nolock;
+
+extern void gfs2_set_inode_flags(struct inode *inode);
+
 #endif /* __INODE_DOT_H__ */
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index bb2cc303ac2..3eea03c7853 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -19,7 +19,7 @@
 
 #include "gfs2.h"
 #include "incore.h"
-#include "ops_fstype.h"
+#include "super.h"
 #include "sys.h"
 #include "util.h"
 #include "glock.h"
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 4a5e676b442..c2ad36330ca 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -19,7 +19,7 @@
 #include "incore.h"
 #include "dir.h"
 #include "glock.h"
-#include "ops_dentry.h"
+#include "super.h"
 #include "util.h"
 #include "inode.h"
 
diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h
deleted file mode 100644
index 5caa3db4d3f..00000000000
--- a/fs/gfs2/ops_dentry.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_DENTRY_DOT_H__
-#define __OPS_DENTRY_DOT_H__
-
-#include <linux/dcache.h>
-
-extern struct dentry_operations gfs2_dops;
-
-#endif /* __OPS_DENTRY_DOT_H__ */
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index bbb8c36403a..3a9b9b43834 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -22,8 +22,7 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "ops_dentry.h"
-#include "ops_fstype.h"
+#include "super.h"
 #include "rgrp.h"
 #include "util.h"
 
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3a747f8e218..fcfaaefc92f 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -39,7 +39,6 @@
 #include "util.h"
 #include "eaops.h"
 #include "ops_address.h"
-#include "ops_inode.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b117fcf2c4f..ca463a450eb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -27,9 +27,6 @@
 #include "glops.h"
 #include "inode.h"
 #include "mount.h"
-#include "ops_fstype.h"
-#include "ops_dentry.h"
-#include "ops_super.h"
 #include "recovery.h"
 #include "rgrp.h"
 #include "super.h"
diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h
deleted file mode 100644
index da849051183..00000000000
--- a/fs/gfs2/ops_fstype.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_FSTYPE_DOT_H__
-#define __OPS_FSTYPE_DOT_H__
-
-#include <linux/fs.h>
-
-extern struct file_system_type gfs2_fs_type;
-extern struct file_system_type gfs2meta_fs_type;
-extern const struct export_operations gfs2_export_ops;
-
-#endif /* __OPS_FSTYPE_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1e24b65e1d2..98440fef01c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -32,12 +32,11 @@
 #include "glock.h"
 #include "inode.h"
 #include "meta_io.h"
-#include "ops_dentry.h"
-#include "ops_inode.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
+#include "super.h"
 
 /**
  * gfs2_create - Create a file
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
deleted file mode 100644
index 14b4b797622..00000000000
--- a/fs/gfs2/ops_inode.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_INODE_DOT_H__
-#define __OPS_INODE_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct inode_operations gfs2_file_iops;
-extern const struct inode_operations gfs2_dir_iops;
-extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
-
-extern void gfs2_set_inode_flags(struct inode *inode);
-
-#endif /* __OPS_INODE_DOT_H__ */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index d5355d9b592..9c7678db08f 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -28,7 +28,6 @@
 #include "inode.h"
 #include "log.h"
 #include "mount.h"
-#include "ops_super.h"
 #include "quota.h"
 #include "recovery.h"
 #include "rgrp.h"
diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h
deleted file mode 100644
index 442a274c627..00000000000
--- a/fs/gfs2/ops_super.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_SUPER_DOT_H__
-#define __OPS_SUPER_DOT_H__
-
-#include <linux/fs.h>
-
-extern const struct super_operations gfs2_super_ops;
-
-#endif /* __OPS_SUPER_DOT_H__ */
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 50a4c9b1215..1848dad3ecb 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -10,6 +10,8 @@
 #ifndef __SUPER_DOT_H__
 #define __SUPER_DOT_H__
 
+#include <linux/fs.h>
+#include <linux/dcache.h>
 #include "incore.h"
 
 void gfs2_lm_unmount(struct gfs2_sbd *sdp);
@@ -46,5 +48,11 @@ int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
 int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
 
+extern struct file_system_type gfs2_fs_type;
+extern struct file_system_type gfs2meta_fs_type;
+extern const struct export_operations gfs2_export_ops;
+extern const struct super_operations gfs2_super_ops;
+extern struct dentry_operations gfs2_dops;
+
 #endif /* __SUPER_DOT_H__ */
 
-- 
cgit v1.2.3


From 1bb7322fd0d5abdce396de51cbc5dbc489523018 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 15 Oct 2008 09:46:39 +0100
Subject: GFS2: Fix up jdata writepage/delete_inode

There is a bug in writepage and delete_inode which allows jdata files to
invalidate pages from the address space without being in a transaction at
the time. This causes problems in case the pages are in the journal. This
patch fixes that case and prevents the resulting oops.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 16 +++++++---------
 fs/gfs2/ops_super.c   |  7 ++++---
 2 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 15f710f2d4d..574b222feef 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc
 {
 	struct inode *inode = page->mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	int error;
+	int ret;
 	int done_trans = 0;
 
-	error = gfs2_writepage_common(page, wbc);
-	if (error <= 0)
-		return error;
-
 	if (PageChecked(page)) {
 		if (wbc->sync_mode != WB_SYNC_ALL)
 			goto out_ignore;
-		error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
-		if (error)
+		ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0);
+		if (ret)
 			goto out_ignore;
 		done_trans = 1;
 	}
-	error = __gfs2_jdata_writepage(page, wbc);
+	ret = gfs2_writepage_common(page, wbc);
+	if (ret > 0)
+		ret = __gfs2_jdata_writepage(page, wbc);
 	if (done_trans)
 		gfs2_trans_end(sdp);
-	return error;
+	return ret;
 
 out_ignore:
 	redirty_page_for_writepage(wbc, page);
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 9c7678db08f..2cb744ba3b7 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -493,7 +493,7 @@ static void gfs2_delete_inode(struct inode *inode)
 	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
 	error = gfs2_glock_nq(&ip->i_iopen_gh);
 	if (error)
-		goto out_uninit;
+		goto out_truncate;
 
 	if (S_ISDIR(inode->i_mode) &&
 	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
@@ -518,6 +518,7 @@ static void gfs2_delete_inode(struct inode *inode)
 	if (error)
 		goto out_unlock;
 
+out_truncate:
 	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 	if (error)
 		goto out_unlock;
@@ -526,8 +527,8 @@ static void gfs2_delete_inode(struct inode *inode)
 	gfs2_trans_end(sdp);
 
 out_unlock:
-	gfs2_glock_dq(&ip->i_iopen_gh);
-out_uninit:
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
 	if (error && error != GLR_TRYFAILED)
-- 
cgit v1.2.3


From 55ba474daed9763b2f6fe26ad762ee373554d65e Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 24 Oct 2008 11:31:12 -0700
Subject: GFS2: sparse annotation of gl->gl_spin

fs/gfs2/glock.c:308:5: warning: context problem in 'do_promote': '_spin_unlock' expected different context
fs/gfs2/glock.c:308:5:    context '*gl+28': wanted >= 1, got 0
fs/gfs2/glock.c:529:2: warning: context problem in 'do_xmote': '_spin_unlock' expected different context
fs/gfs2/glock.c:529:2:    context '*gl+28': wanted >= 1, got 0
fs/gfs2/glock.c:925:3: warning: context problem in 'add_to_queue': '_spin_unlock' expected different context
fs/gfs2/glock.c:925:3:    context '*gl+28': wanted >= 1, got 0

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c962283d4e7..27cb9cca9c0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -293,6 +293,8 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
  */
 
 static int do_promote(struct gfs2_glock *gl)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh, *tmp;
@@ -511,6 +513,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
  */
 
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -576,6 +580,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
  */
 
 static void run_queue(struct gfs2_glock *gl, const int nonblock)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	struct gfs2_holder *gh = NULL;
 
@@ -877,6 +883,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
  */
 
 static inline void add_to_queue(struct gfs2_holder *gh)
+__releases(&gl->gl_spin)
+__acquires(&gl->gl_spin)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-- 
cgit v1.2.3


From bcf0b5b348a1f49c2c878ffdb78e68c930baabb8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 3 Nov 2008 13:39:46 +0000
Subject: GFS2: Move generation number into "proper" part of inode

This moves the generation number from the gfs2_dinode_host
into the gfs2_inode structure. Eventually the plan is to get
rid of the gfs2_dinode_host structure completely.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 2 +-
 fs/gfs2/inode.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f566ec1b4e8..4ff1d7ecd98 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -235,7 +235,6 @@ enum {
 
 struct gfs2_dinode_host {
 	u64 di_size;		/* number of bytes in file */
-	u64 di_generation;	/* generation number for NFS */
 	u32 di_flags;		/* GFS2_DIF_... */
 	/* These only apply to directories  */
 	u32 di_entries;		/* The number of entries in the directory */
@@ -246,6 +245,7 @@ struct gfs2_inode {
 	struct inode i_inode;
 	u64 i_no_addr;
 	u64 i_no_formal_ino;
+	u64 i_generation;
 	unsigned long i_flags;		/* GIF_... */
 
 	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index bf4676d7acd..9d97f699c81 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -286,7 +286,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
 
 	ip->i_goal = be64_to_cpu(str->di_goal_meta);
-	di->di_generation = be64_to_cpu(str->di_generation);
+	ip->i_generation = be64_to_cpu(str->di_generation);
 
 	di->di_flags = be32_to_cpu(str->di_flags);
 	gfs2_set_inode_flags(&ip->i_inode);
@@ -1263,7 +1263,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 	str->di_goal_meta = cpu_to_be64(ip->i_goal);
 	str->di_goal_data = cpu_to_be64(ip->i_goal);
-	str->di_generation = cpu_to_be64(di->di_generation);
+	str->di_generation = cpu_to_be64(ip->i_generation);
 
 	str->di_flags = cpu_to_be32(di->di_flags);
 	str->di_height = cpu_to_be16(ip->i_height);
-- 
cgit v1.2.3


From ad6203f2b46c2217f74b2e88299640eef5889e72 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 3 Nov 2008 13:59:19 +0000
Subject: GFS2: Move "entries" into "proper" inode

This moves the directory entry count into the proper inode.
Potentially we could get this to share the space used by
something else in the future, but this is one more step
on the way to removing the gfs2_dinode_host structure.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.c       | 20 ++++++++++----------
 fs/gfs2/incore.h    |  3 +--
 fs/gfs2/inode.c     | 10 +++++-----
 fs/gfs2/ops_inode.c | 14 +++++++-------
 4 files changed, 23 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index eed040d8ba3..830cf48184e 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode)
 		return -ENOSPC;
 	bn = bh->b_blocknr;
 
-	gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16));
-	leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries);
+	gfs2_assert(sdp, dip->i_entries < (1 << 16));
+	leaf->lf_entries = cpu_to_be16(dip->i_entries);
 
 	/*  Copy dirents  */
 
@@ -1426,7 +1426,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 	int copied = 0;
 	int error;
 
-	if (!dip->i_di.di_entries)
+	if (!dip->i_entries)
 		return 0;
 
 	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
@@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 			error = PTR_ERR(dent);
 			goto out;
 		}
-		if (dip->i_di.di_entries != g.offset) {
+		if (dip->i_entries != g.offset) {
 			fs_warn(sdp, "Number of entries corrupt in dir %llu, "
-				"ip->i_di.di_entries (%u) != g.offset (%u)\n",
+				"ip->i_entries (%u) != g.offset (%u)\n",
 				(unsigned long long)dip->i_no_addr,
-				dip->i_di.di_entries,
+				dip->i_entries,
 				g.offset);
 			error = -EIO;
 			goto out;
 		}
 		error = do_filldir_main(dip, offset, opaque, filldir, darr,
-					dip->i_di.di_entries, &copied);
+					dip->i_entries, &copied);
 out:
 		kfree(darr);
 	}
@@ -1621,7 +1621,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			if (error)
 				break;
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			ip->i_di.di_entries++;
+			ip->i_entries++;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 			gfs2_dinode_out(ip, bh->b_data);
 			brelse(bh);
@@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	if (error)
 		return error;
 
-	if (!dip->i_di.di_entries)
+	if (!dip->i_entries)
 		gfs2_consist_inode(dip);
 	gfs2_trans_add_bh(dip->i_gl, bh, 1);
-	dip->i_di.di_entries--;
+	dip->i_entries--;
 	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(dip, bh->b_data);
 	brelse(bh);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4ff1d7ecd98..15ca3a75cf1 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -236,8 +236,6 @@ enum {
 struct gfs2_dinode_host {
 	u64 di_size;		/* number of bytes in file */
 	u32 di_flags;		/* GFS2_DIF_... */
-	/* These only apply to directories  */
-	u32 di_entries;		/* The number of entries in the directory */
 	u64 di_eattr;		/* extended attribute block number */
 };
 
@@ -256,6 +254,7 @@ struct gfs2_inode {
 	struct gfs2_alloc *i_alloc;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
+	u32 i_entries;
 	u8 i_height;
 	u8 i_depth;
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9d97f699c81..015d4c00708 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -299,7 +299,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
 		goto corrupt;
 	ip->i_depth = (u8)depth;
-	di->di_entries = be32_to_cpu(str->di_entries);
+	ip->i_entries = be32_to_cpu(str->di_entries);
 
 	di->di_eattr = be64_to_cpu(str->di_eattr);
 	if (S_ISREG(ip->i_inode.i_mode))
@@ -689,7 +689,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
 		return error;
 	}
 
-	if (dip->i_di.di_entries == (u32)-1)
+	if (dip->i_entries == (u32)-1)
 		return -EFBIG;
 	if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1)
 		return -EMLINK;
@@ -1067,7 +1067,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
 	struct qstr dotname;
 	int error;
 
-	if (ip->i_di.di_entries != 2) {
+	if (ip->i_entries != 2) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		return -EIO;
@@ -1271,7 +1271,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
 					     GFS2_FORMAT_DE : 0);
 	str->di_depth = cpu_to_be16(ip->i_depth);
-	str->di_entries = cpu_to_be32(di->di_entries);
+	str->di_entries = cpu_to_be32(ip->i_entries);
 
 	str->di_eattr = cpu_to_be64(di->di_eattr);
 	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
@@ -1295,7 +1295,7 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
-	printk(KERN_INFO "  di_entries = %u\n", di->di_entries);
+	printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
 	printk(KERN_INFO "  di_eattr = %llu\n",
 	       (unsigned long long)di->di_eattr);
 }
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 98440fef01c..48468f48d7b 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (!dip->i_inode.i_nlink)
 		goto out_gunlock;
 	error = -EFBIG;
-	if (dip->i_di.di_entries == (u32)-1)
+	if (dip->i_entries == (u32)-1)
 		goto out_gunlock;
 	error = -EPERM;
 	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -427,7 +427,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip->i_inode.i_nlink = 2;
 	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
 	ip->i_di.di_flags |= GFS2_DIF_JDATA;
-	ip->i_di.di_entries = 2;
+	ip->i_entries = 2;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
@@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		goto out_gunlock;
 
-	if (ip->i_di.di_entries < 2) {
+	if (ip->i_entries < 2) {
 		if (gfs2_consist_inode(ip))
 			gfs2_dinode_print(ip);
 		error = -EIO;
 		goto out_gunlock;
 	}
-	if (ip->i_di.di_entries > 2) {
+	if (ip->i_entries > 2) {
 		error = -ENOTEMPTY;
 		goto out_gunlock;
 	}
@@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock;
 
 		if (S_ISDIR(nip->i_inode.i_mode)) {
-			if (nip->i_di.di_entries < 2) {
+			if (nip->i_entries < 2) {
 				if (gfs2_consist_inode(nip))
 					gfs2_dinode_print(nip);
 				error = -EIO;
 				goto out_gunlock;
 			}
-			if (nip->i_di.di_entries > 2) {
+			if (nip->i_entries > 2) {
 				error = -ENOTEMPTY;
 				goto out_gunlock;
 			}
@@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 				error = -EINVAL;
 				goto out_gunlock;
 			}
-			if (ndip->i_di.di_entries == (u32)-1) {
+			if (ndip->i_entries == (u32)-1) {
 				error = -EFBIG;
 				goto out_gunlock;
 			}
-- 
cgit v1.2.3


From 3767ac21f471fe669a7d9f6abef682ddac8fc3d8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 3 Nov 2008 14:28:42 +0000
Subject: GFS2: Move di_eattr into "proper" inode

This moves the di_eattr field out of gfs2_dinode_host and
into the inode proper.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/acl.c       |  2 +-
 fs/gfs2/eattr.c     | 26 +++++++++++++-------------
 fs/gfs2/incore.h    |  2 +-
 fs/gfs2/inode.c     |  8 ++++----
 fs/gfs2/ops_super.c |  2 +-
 5 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3e9bd46f27e..e335dceb6a4 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
 	struct gfs2_ea_location el_this;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return 0;
 
 	memset(&er, 0, sizeof(struct gfs2_ea_request));
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index e3f76f451b0..1c1e06136aa 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -114,7 +114,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
 	if (error)
 		return error;
 
@@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	if (error)
 		return error;
 
-	if (ip->i_di.di_eattr) {
+	if (ip->i_eattr) {
 		struct ea_list ei = { .ei_er = er, .ei_size = 0 };
 
 		error = ea_foreach(ip, ea_list_i, &ei);
@@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return -ENODATA;
 
 	error = gfs2_ea_find(ip, er, &el);
@@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	ip->i_di.di_eattr = bh->b_blocknr;
+	ip->i_eattr = bh->b_blocknr;
 	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
 
 	brelse(bh);
@@ -938,7 +938,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT,
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
 				       &indbh);
 		if (error)
 			return error;
@@ -972,8 +972,8 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		gfs2_buffer_clear_tail(indbh, mh_size);
 
 		eablk = (__be64 *)(indbh->b_data + mh_size);
-		*eablk = cpu_to_be64(ip->i_di.di_eattr);
-		ip->i_di.di_eattr = blk;
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
 		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 
@@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr) {
+	if (!ip->i_eattr) {
 		if (er->er_flags & XATTR_REPLACE)
 			return -ENODATA;
 		return ea_init(ip, er);
@@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 	struct gfs2_ea_location el;
 	int error;
 
-	if (!ip->i_di.di_eattr)
+	if (!ip->i_eattr)
 		return -ENODATA;
 
 	error = gfs2_ea_find(ip, er, &el);
@@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
 	if (error)
 		return error;
 
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	struct buffer_head *dibh;
 	int error;
 
-	rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr);
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr);
 	if (!rgd) {
 		gfs2_consist_inode(ip);
 		return -EIO;
@@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	if (error)
 		goto out_gunlock;
 
-	gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
+	gfs2_free_meta(ip, ip->i_eattr, 1);
 
-	ip->i_di.di_eattr = 0;
+	ip->i_eattr = 0;
 	gfs2_add_inode_blocks(&ip->i_inode, -1);
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 15ca3a75cf1..fb2fd4adaae 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -236,7 +236,6 @@ enum {
 struct gfs2_dinode_host {
 	u64 di_size;		/* number of bytes in file */
 	u32 di_flags;		/* GFS2_DIF_... */
-	u64 di_eattr;		/* extended attribute block number */
 };
 
 struct gfs2_inode {
@@ -244,6 +243,7 @@ struct gfs2_inode {
 	u64 i_no_addr;
 	u64 i_no_formal_ino;
 	u64 i_generation;
+	u64 i_eattr;
 	unsigned long i_flags;		/* GIF_... */
 
 	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 015d4c00708..91735b8cecd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -301,7 +301,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_depth = (u8)depth;
 	ip->i_entries = be32_to_cpu(str->di_entries);
 
-	di->di_eattr = be64_to_cpu(str->di_eattr);
+	ip->i_eattr = be64_to_cpu(str->di_eattr);
 	if (S_ISREG(ip->i_inode.i_mode))
 		gfs2_set_aops(&ip->i_inode);
 
@@ -1273,7 +1273,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_depth = cpu_to_be16(ip->i_depth);
 	str->di_entries = cpu_to_be32(ip->i_entries);
 
-	str->di_eattr = cpu_to_be64(di->di_eattr);
+	str->di_eattr = cpu_to_be64(ip->i_eattr);
 	str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
 	str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
 	str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
@@ -1296,7 +1296,7 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
 	printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
-	printk(KERN_INFO "  di_eattr = %llu\n",
-	       (unsigned long long)di->di_eattr);
+	printk(KERN_INFO "  i_eattr = %llu\n",
+	       (unsigned long long)ip->i_eattr);
 }
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 2cb744ba3b7..aee6cbaf58d 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -502,7 +502,7 @@ static void gfs2_delete_inode(struct inode *inode)
 			goto out_unlock;
 	}
 
-	if (ip->i_di.di_eattr) {
+	if (ip->i_eattr) {
 		error = gfs2_ea_dealloc(ip);
 		if (error)
 			goto out_unlock;
-- 
cgit v1.2.3


From c9e98886776386f1f7828d9685e78cd341849867 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 4 Nov 2008 09:47:33 +0000
Subject: GFS2: Move i_size from gfs2_dinode_host and rename it to i_disksize

This patch moves the di_size field out of gfs2_dinode_host into gfs2_inode
and, following the ext3 convention, renames it i_disksize.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        | 26 +++++++++++++-------------
 fs/gfs2/dir.c         | 26 +++++++++++++-------------
 fs/gfs2/incore.h      |  2 +-
 fs/gfs2/inode.c       | 13 +++++++------
 fs/gfs2/ops_address.c | 10 +++++-----
 fs/gfs2/ops_file.c    |  2 +-
 fs/gfs2/ops_fstype.c  |  2 +-
 fs/gfs2/ops_inode.c   | 10 +++++-----
 fs/gfs2/quota.c       |  6 +++---
 fs/gfs2/rgrp.c        |  6 +++---
 fs/gfs2/super.c       |  8 ++++----
 11 files changed, 56 insertions(+), 55 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index bec76b1c2bb..b43aee75d3c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 		void *kaddr = kmap(page);
 
 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-		       ip->i_di.di_size);
-		memset(kaddr + ip->i_di.di_size, 0,
-		       PAGE_CACHE_SIZE - ip->i_di.di_size);
+		       ip->i_disksize);
+		memset(kaddr + ip->i_disksize, 0,
+		       PAGE_CACHE_SIZE - ip->i_disksize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	if (error)
 		goto out;
 
-	if (ip->i_di.di_size) {
+	if (ip->i_disksize) {
 		/* Get a free block, fill it with the stuffed data,
 		   and write it out to disk */
 
@@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 	di = (struct gfs2_dinode *)dibh->b_data;
 	gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 
-	if (ip->i_di.di_size) {
+	if (ip->i_disksize) {
 		*(__be64 *)(di + 1) = cpu_to_be64(block);
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 		di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
 		}
 	}
 
-	ip->i_di.di_size = size;
+	ip->i_disksize = size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
@@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		goto out;
 
 	if (gfs2_is_stuffed(ip)) {
-		ip->i_di.di_size = size;
+		ip->i_disksize = size;
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
@@ -1045,7 +1045,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 			error = gfs2_block_truncate_page(ip->i_inode.i_mapping);
 
 		if (!error) {
-			ip->i_di.di_size = size;
+			ip->i_disksize = size;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -1114,7 +1114,7 @@ static int trunc_end(struct gfs2_inode *ip)
 	if (error)
 		goto out;
 
-	if (!ip->i_di.di_size) {
+	if (!ip->i_disksize) {
 		ip->i_height = 0;
 		ip->i_goal = ip->i_no_addr;
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode)))
 		return -EINVAL;
 
-	if (size > ip->i_di.di_size)
+	if (size > ip->i_disksize)
 		error = do_grow(ip, size);
-	else if (size < ip->i_di.di_size)
+	else if (size < ip->i_disksize)
 		error = do_shrink(ip, size);
 	else
 		/* update time stamps */
@@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size)
 int gfs2_truncatei_resume(struct gfs2_inode *ip)
 {
 	int error;
-	error = trunc_dealloc(ip, ip->i_di.di_size);
+	error = trunc_dealloc(ip, ip->i_disksize);
 	if (!error)
 		error = trunc_end(ip);
 	return error;
@@ -1298,7 +1298,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 		lblock_stop = offset + len + bsize - 1;
 		do_div(lblock_stop, bsize);
 	} else {
-		u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
+		u64 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
 		lblock = offset >> shift;
 		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
 		if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 830cf48184e..d8d82324054 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
-	if (ip->i_di.di_size < offset + size)
-		ip->i_di.di_size = offset + size;
+	if (ip->i_disksize < offset + size)
+		ip->i_disksize = offset + size;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 	gfs2_dinode_out(ip, dibh->b_data);
 
@@ -226,8 +226,8 @@ out:
 	if (error)
 		return error;
 
-	if (ip->i_di.di_size < offset + copied)
-		ip->i_di.di_size = offset + copied;
+	if (ip->i_disksize < offset + copied)
+		ip->i_disksize = offset + copied;
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
 	int copied = 0;
 	int error = 0;
 
-	if (offset >= ip->i_di.di_size)
+	if (offset >= ip->i_disksize)
 		return 0;
 
-	if (offset + size > ip->i_di.di_size)
-		size = ip->i_di.di_size - offset;
+	if (offset + size > ip->i_disksize)
+		size = ip->i_disksize - offset;
 
 	if (!size)
 		return 0;
@@ -760,7 +760,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
 		u64 ln;
-		if (hsize * sizeof(u64) != ip->i_di.di_size) {
+		if (hsize * sizeof(u64) != ip->i_disksize) {
 			gfs2_consist_inode(ip);
 			return ERR_PTR(-EIO);
 		}
@@ -905,7 +905,7 @@ static int dir_make_exhash(struct inode *inode)
 	for (x = sdp->sd_hash_ptrs; x--; lp++)
 		*lp = cpu_to_be64(bn);
 
-	dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
+	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
 	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
 
@@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
 
 	buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
 
-	for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
+	for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) {
 		error = gfs2_dir_read_data(dip, (char *)buf,
 					    block * sdp->sd_hash_bsize,
 					    sdp->sd_hash_bsize, 1);
@@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
 	unsigned depth = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
@@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
 	int error = 0;
 
 	hsize = 1 << dip->i_depth;
-	if (hsize * sizeof(u64) != dip->i_di.di_size) {
+	if (hsize * sizeof(u64) != dip->i_disksize) {
 		gfs2_consist_inode(dip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fb2fd4adaae..4596cd254be 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -234,7 +234,6 @@ enum {
 };
 
 struct gfs2_dinode_host {
-	u64 di_size;		/* number of bytes in file */
 	u32 di_flags;		/* GFS2_DIF_... */
 };
 
@@ -244,6 +243,7 @@ struct gfs2_inode {
 	u64 i_no_formal_ino;
 	u64 i_generation;
 	u64 i_eattr;
+	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
 
 	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 91735b8cecd..baf8b24b2de 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -273,8 +273,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	 * to do that.
 	 */
 	ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
-	di->di_size = be64_to_cpu(str->di_size);
-	i_size_write(&ip->i_inode, di->di_size);
+	ip->i_disksize = be64_to_cpu(str->di_size);
+	i_size_write(&ip->i_inode, ip->i_disksize);
 	gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
 	atime.tv_sec = be64_to_cpu(str->di_atime);
 	atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1167,7 +1167,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 		return error;
 	}
 
-	if (!ip->i_di.di_size) {
+	if (!ip->i_disksize) {
 		gfs2_consist_inode(ip);
 		error = -EIO;
 		goto out;
@@ -1177,7 +1177,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
 	if (error)
 		goto out;
 
-	x = ip->i_di.di_size + 1;
+	x = ip->i_disksize + 1;
 	if (x > *len) {
 		*buf = kmalloc(x, GFP_NOFS);
 		if (!*buf) {
@@ -1255,7 +1255,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
 	str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
 	str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
-	str->di_size = cpu_to_be64(di->di_size);
+	str->di_size = cpu_to_be64(ip->i_disksize);
 	str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 	str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
 	str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1287,7 +1287,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
 	       (unsigned long long)ip->i_no_addr);
-	printk(KERN_INFO "  di_size = %llu\n", (unsigned long long)di->di_size);
+	printk(KERN_INFO "  i_disksize = %llu\n",
+	       (unsigned long long)ip->i_disksize);
 	printk(KERN_INFO "  blocks = %llu\n",
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 574b222feef..0df560f4269 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -451,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode),
-	       ip->i_di.di_size);
-	memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size);
+	       ip->i_disksize);
+	memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize);
 	kunmap_atomic(kaddr, KM_USER0);
 	flush_dcache_page(page);
 	brelse(dibh);
@@ -780,7 +780,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 
 	if (inode->i_size < to) {
 		i_size_write(inode, to);
-		ip->i_di.di_size = inode->i_size;
+		ip->i_disksize = inode->i_size;
 		di->di_size = cpu_to_be64(inode->i_size);
 		mark_inode_dirty(inode);
 	}
@@ -845,9 +845,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
 
-	if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) {
+	if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
 		di = (struct gfs2_dinode *)dibh->b_data;
-		ip->i_di.di_size = inode->i_size;
+		ip->i_disksize = inode->i_size;
 		di->di_size = cpu_to_be64(inode->i_size);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index fcfaaefc92f..d7e649ed62f 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -478,7 +478,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
 			goto fail;
 
 		if (!(file->f_flags & O_LARGEFILE) &&
-		    ip->i_di.di_size > MAX_NON_LFS) {
+		    ip->i_disksize > MAX_NON_LFS) {
 			error = -EOVERFLOW;
 			goto fail_gunlock;
 		}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ca463a450eb..dd83e832235 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -617,7 +617,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 
 	prev_db = 0;
 
-	for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) {
+	for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) {
 		bh.b_state = 0;
 		bh.b_blocknr = 0;
 		bh.b_size = 1 << ip->i_inode.i_blkbits;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 48468f48d7b..b932d72b5f5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
 
 	ip = ghs[1].gh_gl->gl_object;
 
-	ip->i_di.di_size = size;
+	ip->i_disksize = size;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 
@@ -425,7 +425,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	ip = ghs[1].gh_gl->gl_object;
 
 	ip->i_inode.i_nlink = 2;
-	ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
+	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
 	ip->i_di.di_flags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
@@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 
-	if (attr->ia_size != ip->i_di.di_size) {
+	if (attr->ia_size != ip->i_disksize) {
 		error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
 		if (error)
 			return error;
@@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
 	}
 
 	error = gfs2_truncatei(ip, attr->ia_size);
-	if (error && (inode->i_size != ip->i_di.di_size))
-		i_size_write(inode, ip->i_di.di_size);
+	if (error && (inode->i_size != ip->i_disksize))
+		i_size_write(inode, ip->i_disksize);
 
 	return error;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3e073f5144f..188d0a277fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1100,15 +1100,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
-	unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+	unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
 
-	if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) ||
-	    ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) {
+	if (!ip->i_disksize || ip->i_disksize > (64 << 20) ||
+	    ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2d90fb25350..bdad0dffc6b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -501,7 +501,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
 	for (rgrps = 0;; rgrps++) {
 		loff_t pos = rgrps * sizeof(struct gfs2_rindex);
 
-		if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size)
+		if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize)
 			break;
 		error = gfs2_internal_read(ip, &ra_state, buf, &pos,
 					   sizeof(struct gfs2_rindex));
@@ -590,7 +590,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct inode *inode = &ip->i_inode;
 	struct file_ra_state ra_state;
-	u64 rgrp_count = ip->i_di.di_size;
+	u64 rgrp_count = ip->i_disksize;
 	int error;
 
 	if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) {
@@ -634,7 +634,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
 	for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
 		/* Ignore partials */
 		if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
-		    ip->i_di.di_size)
+		    ip->i_disksize)
 			break;
 		error = read_rindex_entry(ip, &ra_state);
 		if (error) {
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c3ba3d9d0aa..f5cef2ad7ae 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -206,14 +206,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	int ar;
 	int error;
 
-	if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) ||
-	    (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) {
+	if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) ||
+	    (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
 		gfs2_consist_inode(ip);
 		return -EIO;
 	}
-	jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift;
+	jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
 
-	error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar);
+	error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar);
 	if (!error && ar) {
 		gfs2_consist_inode(ip);
 		error = -EIO;
-- 
cgit v1.2.3


From 383f01fbf4a701b73f5e35ea805ed1700b4b4db9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 4 Nov 2008 10:05:22 +0000
Subject: GFS2: Banish struct gfs2_dinode_host

The final field in gfs2_dinode_host was the di_flags field. That's
renamed to i_diskflags in order to avoid confusion with the existing
inode flags, and moved into the inode proper at a suitable location
to avoid creating a "hole".

At that point struct gfs2_dinode_host is no longer needed and as
promised (quite some time ago!) it can now be removed completely.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c       |  4 ++--
 fs/gfs2/dir.c        | 16 ++++++++--------
 fs/gfs2/eattr.c      | 14 +++++++-------
 fs/gfs2/glops.c      |  2 +-
 fs/gfs2/incore.h     |  7 +------
 fs/gfs2/inode.c      | 16 ++++++----------
 fs/gfs2/inode.h      |  2 +-
 fs/gfs2/ops_export.c |  2 +-
 fs/gfs2/ops_file.c   | 17 ++++++++---------
 fs/gfs2/ops_inode.c  |  2 +-
 fs/gfs2/ops_super.c  |  2 +-
 fs/gfs2/quota.c      |  2 +-
 12 files changed, 38 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index b43aee75d3c..789f28cfdc2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1047,7 +1047,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 		if (!error) {
 			ip->i_disksize = size;
 			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-			ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG;
+			ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
 			gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 			gfs2_dinode_out(ip, dibh->b_data);
 		}
@@ -1120,7 +1120,7 @@ static int trunc_end(struct gfs2_inode *ip)
 		gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 	}
 	ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
-	ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG;
+	ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
 
 	gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 	gfs2_dinode_out(ip, dibh->b_data);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index d8d82324054..b7c8e5c7079 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -36,7 +36,7 @@
  * the block.  In leaves, they begin at offset sizeof(struct gfs2_leaf) from the
  * beginning of the leaf block. The dirents reside in leaves when
  *
- * dip->i_di.di_flags & GFS2_DIF_EXHASH is true
+ * dip->i_diskflags & GFS2_DIF_EXHASH is true
  *
  * Otherwise, the dirents are "linear", within a single stuffed dinode block.
  *
@@ -755,7 +755,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int error;
 
-	if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf;
 		unsigned hsize = 1 << ip->i_depth;
 		unsigned index;
@@ -907,7 +907,7 @@ static int dir_make_exhash(struct inode *inode)
 
 	dip->i_disksize = sdp->sd_sb.sb_bsize / 2;
 	gfs2_add_inode_blocks(&dip->i_inode, 1);
-	dip->i_di.di_flags |= GFS2_DIF_EXHASH;
+	dip->i_diskflags |= GFS2_DIF_EXHASH;
 
 	for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
 	dip->i_depth = y;
@@ -1429,7 +1429,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
 	if (!dip->i_entries)
 		return 0;
 
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH)
+	if (dip->i_diskflags & GFS2_DIF_EXHASH)
 		return dir_e_read(inode, offset, opaque, filldir);
 
 	if (!gfs2_is_stuffed(dip)) {
@@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
 			dent->de_type = cpu_to_be16(type);
-			if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
+			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
 				be16_add_cpu(&leaf->lf_entries, 1);
 			}
@@ -1628,7 +1628,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
 			error = 0;
 			break;
 		}
-		if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+		if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) {
 			error = dir_make_exhash(inode);
 			if (error)
 				break;
@@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
 	}
 
 	dirent_del(dip, bh, prev, dent);
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
 		struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
 		u16 entries = be16_to_cpu(leaf->lf_entries);
 		if (!entries)
@@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 	gfs2_inum_out(nip, dent);
 	dent->de_type = cpu_to_be16(new_type);
 
-	if (dip->i_di.di_flags & GFS2_DIF_EXHASH) {
+	if (dip->i_diskflags & GFS2_DIF_EXHASH) {
 		brelse(bh);
 		error = gfs2_meta_inode_buffer(dip, &bh);
 		if (error)
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 1c1e06136aa..0d1c76d906a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -118,7 +118,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	if (error)
 		return error;
 
-	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) {
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
 		error = ea_foreach_i(ip, bh, ea_call, data);
 		goto out;
 	}
@@ -935,7 +935,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	int error;
 	int mh_size = sizeof(struct gfs2_meta_header);
 
-	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
 		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
@@ -974,7 +974,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 		eablk = (__be64 *)(indbh->b_data + mh_size);
 		*eablk = cpu_to_be64(ip->i_eattr);
 		ip->i_eattr = blk;
-		ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
 		gfs2_add_inode_blocks(&ip->i_inode, 1);
 
 		eablk++;
@@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (error)
 		return error;
 
-	if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT))
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
 		blks++;
 	if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
 		blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
@@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
 		return error;
 
 	if (el.el_ea) {
-		if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
 			brelse(el.el_bh);
 			return -EPERM;
 		}
@@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	if (bstart)
 		gfs2_free_meta(ip, bstart, blen);
 
-	ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT;
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
 	if (!error) {
@@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
 	if (error)
 		goto out_rindex;
 
-	if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) {
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		error = ea_dealloc_indirect(ip);
 		if (error)
 			goto out_rindex;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c6c318c2a0f..848d64c8b62 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -239,7 +239,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
 			return error;
 	}
 
-	if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) &&
+	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
 	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
 	    (gh->gh_state == LM_ST_EXCLUSIVE))
 		error = gfs2_truncatei_resume(ip);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4596cd254be..6f67e753f88 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -233,9 +233,6 @@ enum {
 	GIF_USER                = 4, /* user inode, not metadata addr space */
 };
 
-struct gfs2_dinode_host {
-	u32 di_flags;		/* GFS2_DIF_... */
-};
 
 struct gfs2_inode {
 	struct inode i_inode;
@@ -245,9 +242,6 @@ struct gfs2_inode {
 	u64 i_eattr;
 	loff_t i_disksize;
 	unsigned long i_flags;		/* GIF_... */
-
-	struct gfs2_dinode_host i_di; /* To be replaced by ref to block */
-
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
@@ -255,6 +249,7 @@ struct gfs2_inode {
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
 	u32 i_entries;
+	u32 i_diskflags;
 	u8 i_height;
 	u8 i_depth;
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index baf8b24b2de..97d3ce65e26 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -247,7 +247,6 @@ fail:
 
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 {
-	struct gfs2_dinode_host *di = &ip->i_di;
 	const struct gfs2_dinode *str = buf;
 	struct timespec atime;
 	u16 height, depth;
@@ -288,7 +287,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
 	ip->i_goal = be64_to_cpu(str->di_goal_meta);
 	ip->i_generation = be64_to_cpu(str->di_generation);
 
-	di->di_flags = be32_to_cpu(str->di_flags);
+	ip->i_diskflags = be32_to_cpu(str->di_flags);
 	gfs2_set_inode_flags(&ip->i_inode);
 	height = be16_to_cpu(str->di_height);
 	if (unlikely(height > GFS2_MAX_META_HEIGHT))
@@ -789,11 +788,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
 	di->di_flags = 0;
 
 	if (S_ISREG(mode)) {
-		if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
+		if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
 		    gfs2_tune_get(sdp, gt_new_files_jdata))
 			di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
 	} else if (S_ISDIR(mode)) {
-		di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
+		di->di_flags |= cpu_to_be32(dip->i_diskflags &
 					    GFS2_DIF_INHERIT_JDATA);
 	}
 
@@ -1241,7 +1240,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 
 void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 {
-	const struct gfs2_dinode_host *di = &ip->i_di;
 	struct gfs2_dinode *str = buf;
 
 	str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -1265,10 +1263,10 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 	str->di_goal_data = cpu_to_be64(ip->i_goal);
 	str->di_generation = cpu_to_be64(ip->i_generation);
 
-	str->di_flags = cpu_to_be32(di->di_flags);
+	str->di_flags = cpu_to_be32(ip->i_diskflags);
 	str->di_height = cpu_to_be16(ip->i_height);
 	str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
-					     !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
+					     !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
 					     GFS2_FORMAT_DE : 0);
 	str->di_depth = cpu_to_be16(ip->i_depth);
 	str->di_entries = cpu_to_be32(ip->i_entries);
@@ -1281,8 +1279,6 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
 
 void gfs2_dinode_print(const struct gfs2_inode *ip)
 {
-	const struct gfs2_dinode_host *di = &ip->i_di;
-
 	printk(KERN_INFO "  no_formal_ino = %llu\n",
 	       (unsigned long long)ip->i_no_formal_ino);
 	printk(KERN_INFO "  no_addr = %llu\n",
@@ -1293,7 +1289,7 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
 	       (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
 	printk(KERN_INFO "  i_goal = %llu\n",
 	       (unsigned long long)ip->i_goal);
-	printk(KERN_INFO "  di_flags = 0x%.8X\n", di->di_flags);
+	printk(KERN_INFO "  i_diskflags = 0x%.8X\n", ip->i_diskflags);
 	printk(KERN_INFO "  i_height = %u\n", ip->i_height);
 	printk(KERN_INFO "  i_depth = %u\n", ip->i_depth);
 	printk(KERN_INFO "  i_entries = %u\n", ip->i_entries);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c3577906f0a..d5329364cdf 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -20,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 
 static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
 {
-	return ip->i_di.di_flags & GFS2_DIF_JDATA;
+	return ip->i_diskflags & GFS2_DIF_JDATA;
 }
 
 static inline int gfs2_is_writeback(const struct gfs2_inode *ip)
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 3a9b9b43834..7fdeb14ddd1 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -213,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
 	}
 
 	error = -EIO;
-	if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) {
+	if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) {
 		iput(inode);
 		goto fail;
 	}
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index d7e649ed62f..a6b7a733fd4 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -157,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 	if (error)
 		return error;
 
-	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
-	if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+	fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags);
+	if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA)
 		fsflags |= FS_JOURNAL_DATA_FL;
 	if (put_user(fsflags, ptr))
 		error = -EFAULT;
@@ -171,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
 void gfs2_set_inode_flags(struct inode *inode)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_dinode_host *di = &ip->i_di;
 	unsigned int flags = inode->i_flags;
 
 	flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
-	if (di->di_flags & GFS2_DIF_IMMUTABLE)
+	if (ip->i_diskflags & GFS2_DIF_IMMUTABLE)
 		flags |= S_IMMUTABLE;
-	if (di->di_flags & GFS2_DIF_APPENDONLY)
+	if (ip->i_diskflags & GFS2_DIF_APPENDONLY)
 		flags |= S_APPEND;
-	if (di->di_flags & GFS2_DIF_NOATIME)
+	if (ip->i_diskflags & GFS2_DIF_NOATIME)
 		flags |= S_NOATIME;
-	if (di->di_flags & GFS2_DIF_SYNC)
+	if (ip->i_diskflags & GFS2_DIF_SYNC)
 		flags |= S_SYNC;
 	inode->i_flags = flags;
 }
@@ -220,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_drop_write;
 
-	flags = ip->i_di.di_flags;
+	flags = ip->i_diskflags;
 	new_flags = (flags & ~mask) | (reqflags & mask);
 	if ((new_flags ^ flags) == 0)
 		goto out;
@@ -259,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	if (error)
 		goto out_trans_end;
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
-	ip->i_di.di_flags = new_flags;
+	ip->i_diskflags = new_flags;
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	gfs2_set_inode_flags(inode);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index b932d72b5f5..49877546beb 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -426,7 +426,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	ip->i_inode.i_nlink = 2;
 	ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode);
-	ip->i_di.di_flags |= GFS2_DIF_JDATA;
+	ip->i_diskflags |= GFS2_DIF_JDATA;
 	ip->i_entries = 2;
 
 	error = gfs2_meta_inode_buffer(ip, &dibh);
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index aee6cbaf58d..ad36af254fe 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -496,7 +496,7 @@ static void gfs2_delete_inode(struct inode *inode)
 		goto out_truncate;
 
 	if (S_ISDIR(inode->i_mode) &&
-	    (ip->i_di.di_flags & GFS2_DIF_EXHASH)) {
+	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
 		error = gfs2_dir_exhash_dealloc(ip);
 		if (error)
 			goto out_unlock;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 188d0a277fa..228a4659618 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1013,7 +1013,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 
 	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
 		return;
-	if (ip->i_di.di_flags & GFS2_DIF_SYSTEM)
+	if (ip->i_diskflags & GFS2_DIF_SYSTEM)
 		return;
 
 	for (x = 0; x < al->al_qd_num; x++) {
-- 
cgit v1.2.3


From d8b71f7381769177998acb2f59ddc73465a60fe0 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 4 Nov 2008 10:19:03 +0000
Subject: GFS2: Move rg_igeneration into struct gfs2_rgrpd

This moves one of the fields of struct gfs2_rgrpd_host into
the struct gfs2_rgrpd with the eventual aim of removing
the struct rgrpd_host completely.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 2 +-
 fs/gfs2/rgrp.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 6f67e753f88..869ac83297e 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -71,7 +71,6 @@ struct gfs2_bitmap {
 struct gfs2_rgrp_host {
 	u32 rg_free;
 	u32 rg_dinodes;
-	u64 rg_igeneration;
 };
 
 struct gfs2_rgrpd {
@@ -84,6 +83,7 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	struct gfs2_rgrp_host rd_rg;
+	u64 rd_igeneration;
 	struct gfs2_bitmap *rd_bits;
 	unsigned int rd_bh_count;
 	struct mutex rd_mutex;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bdad0dffc6b..8e93d62991c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -702,7 +702,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
 	rg->rg_free = be32_to_cpu(str->rg_free);
 	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
-	rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
+	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
@@ -717,7 +717,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 	str->rg_free = cpu_to_be32(rg->rg_free);
 	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
 	str->__pad = cpu_to_be32(0);
-	str->rg_igeneration = cpu_to_be64(rg->rg_igeneration);
+	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
 }
 
@@ -1448,7 +1448,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
 	rgd->rd_rg.rg_free--;
 	rgd->rd_rg.rg_dinodes++;
-	*generation = rgd->rd_rg.rg_igeneration++;
+	*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
 
-- 
cgit v1.2.3


From cfc8b54922db7b647b6d88914dc7ef8c63b6671d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 4 Nov 2008 10:25:13 +0000
Subject: GFS2: Move rg_free from gfs2_rgrpd_host to gfs2_rgrpd

The second of three fields which need to move, in order
to remove the struct gfs2_rgrpd_host.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  2 +-
 fs/gfs2/rgrp.c   | 28 ++++++++++++++--------------
 fs/gfs2/super.c  |  2 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 869ac83297e..f8d97736251 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -69,7 +69,6 @@ struct gfs2_bitmap {
 };
 
 struct gfs2_rgrp_host {
-	u32 rg_free;
 	u32 rg_dinodes;
 };
 
@@ -82,6 +81,7 @@ struct gfs2_rgrpd {
 	u32 rd_length;			/* length of rgrp header in fs blocks */
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
+	u32 rd_free;
 	struct gfs2_rgrp_host rd_rg;
 	u64 rd_igeneration;
 	struct gfs2_bitmap *rd_bits;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8e93d62991c..bab9cfab34c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -269,15 +269,15 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 						  bi->bi_len, x);
 	}
 
-	if (count[0] != rgd->rd_rg.rg_free) {
+	if (count[0] != rgd->rd_free) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "free data mismatch:  %u != %u\n",
-			       count[0], rgd->rd_rg.rg_free);
+			       count[0], rgd->rd_free);
 		return;
 	}
 
 	tmp = rgd->rd_data -
-		rgd->rd_rg.rg_free -
+		rgd->rd_free -
 		rgd->rd_rg.rg_dinodes;
 	if (count[1] + count[2] != tmp) {
 		if (gfs2_consist_rgrpd(rgd))
@@ -700,7 +700,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 		rgd->rd_flags |= GFS2_RDF_NOALLOC;
 	else
 		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
-	rg->rg_free = be32_to_cpu(str->rg_free);
+	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
 }
@@ -714,7 +714,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
 		rg_flags |= GFS2_RGF_NOALLOC;
 	str->rg_flags = cpu_to_be32(rg_flags);
-	str->rg_free = cpu_to_be32(rg->rg_free);
+	str->rg_free = cpu_to_be32(rgd->rd_free);
 	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
 	str->__pad = cpu_to_be32(0);
 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
@@ -776,7 +776,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	rgd->rd_free_clone = rgd->rd_free;
 	rgd->rd_bh_count++;
 	spin_unlock(&sdp->sd_rindex_spin);
 
@@ -850,7 +850,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-	rgd->rd_free_clone = rgd->rd_rg.rg_free;
+	rgd->rd_free_clone = rgd->rd_free;
 	spin_unlock(&sdp->sd_rindex_spin);
 }
 
@@ -1403,8 +1403,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	block = rgd->rd_data0 + blk;
 	ip->i_goal = block;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
-	rgd->rd_rg.rg_free -= *n;
+	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+	rgd->rd_free -= *n;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1445,8 +1445,8 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	block = rgd->rd_data0 + blk;
 
-	gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
-	rgd->rd_rg.rg_free--;
+	gfs2_assert_withdraw(sdp, rgd->rd_free);
+	rgd->rd_free--;
 	rgd->rd_rg.rg_dinodes++;
 	*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1481,7 +1481,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	if (!rgd)
 		return;
 
-	rgd->rd_rg.rg_free += blen;
+	rgd->rd_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1509,7 +1509,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
 	if (!rgd)
 		return;
 
-	rgd->rd_rg.rg_free += blen;
+	rgd->rd_free += blen;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1549,7 +1549,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 	if (!rgd->rd_rg.rg_dinodes)
 		gfs2_consist_rgrpd(rgd);
 	rgd->rd_rg.rg_dinodes--;
-	rgd->rd_rg.rg_free++;
+	rgd->rd_free++;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f5cef2ad7ae..e76907691ad 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -468,7 +468,7 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
 {
 	gfs2_rgrp_verify(rgd);
 	sc->sc_total += rgd->rd_data;
-	sc->sc_free += rgd->rd_rg.rg_free;
+	sc->sc_free += rgd->rd_free;
 	sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
 	return 0;
 }
-- 
cgit v1.2.3


From 73f749483ed18f3b5759909cc4187b1741f54b10 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 4 Nov 2008 10:32:57 +0000
Subject: GFS2: Banish struct gfs2_rgrpd_host

This patch moves the final field so that we can get rid
of struct gfs2_rgrpd_host, as promised some time ago. Also
by rearranging the fields slightly, we are able to reduce
the size of the gfs2_rgrpd structure at the same time.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 12 ++++--------
 fs/gfs2/rgrp.c   | 20 ++++++++------------
 fs/gfs2/super.c  |  2 +-
 3 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index f8d97736251..9e3b613d0ba 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -68,10 +68,6 @@ struct gfs2_bitmap {
 	u32 bi_len;
 };
 
-struct gfs2_rgrp_host {
-	u32 rg_dinodes;
-};
-
 struct gfs2_rgrpd {
 	struct list_head rd_list;	/* Link with superblock */
 	struct list_head rd_list_mru;
@@ -82,15 +78,15 @@ struct gfs2_rgrpd {
 	u32 rd_data;			/* num of data blocks in rgrp */
 	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
 	u32 rd_free;
-	struct gfs2_rgrp_host rd_rg;
+	u32 rd_free_clone;
+	u32 rd_dinodes;
 	u64 rd_igeneration;
 	struct gfs2_bitmap *rd_bits;
-	unsigned int rd_bh_count;
 	struct mutex rd_mutex;
-	u32 rd_free_clone;
 	struct gfs2_log_element rd_le;
-	u32 rd_last_alloc;
 	struct gfs2_sbd *rd_sbd;
+	unsigned int rd_bh_count;
+	u32 rd_last_alloc;
 	unsigned char rd_flags;
 #define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
 #define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index bab9cfab34c..8b01c635d92 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -276,9 +276,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 		return;
 	}
 
-	tmp = rgd->rd_data -
-		rgd->rd_free -
-		rgd->rd_rg.rg_dinodes;
+	tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes;
 	if (count[1] + count[2] != tmp) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used data mismatch:  %u != %u\n",
@@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 		return;
 	}
 
-	if (count[3] != rgd->rd_rg.rg_dinodes) {
+	if (count[3] != rgd->rd_dinodes) {
 		if (gfs2_consist_rgrpd(rgd))
 			fs_err(sdp, "used metadata mismatch:  %u != %u\n",
-			       count[3], rgd->rd_rg.rg_dinodes);
+			       count[3], rgd->rd_dinodes);
 		return;
 	}
 
@@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
 static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 {
 	const struct gfs2_rgrp *str = buf;
-	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
 	u32 rg_flags;
 
 	rg_flags = be32_to_cpu(str->rg_flags);
@@ -701,21 +698,20 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 	else
 		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
 	rgd->rd_free = be32_to_cpu(str->rg_free);
-	rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
+	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
 }
 
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
-	struct gfs2_rgrp_host *rg = &rgd->rd_rg;
 	u32 rg_flags = 0;
 
 	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
 		rg_flags |= GFS2_RGF_NOALLOC;
 	str->rg_flags = cpu_to_be32(rg_flags);
 	str->rg_free = cpu_to_be32(rgd->rd_free);
-	str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
+	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
 	str->__pad = cpu_to_be32(0);
 	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
 	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
@@ -1447,7 +1443,7 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
 
 	gfs2_assert_withdraw(sdp, rgd->rd_free);
 	rgd->rd_free--;
-	rgd->rd_rg.rg_dinodes++;
+	rgd->rd_dinodes++;
 	*generation = rgd->rd_igeneration++;
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
 	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -1546,9 +1542,9 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
 		return;
 	gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
 
-	if (!rgd->rd_rg.rg_dinodes)
+	if (!rgd->rd_dinodes)
 		gfs2_consist_rgrpd(rgd);
-	rgd->rd_rg.rg_dinodes--;
+	rgd->rd_dinodes--;
 	rgd->rd_free++;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e76907691ad..b85877062a4 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -469,7 +469,7 @@ static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
 	gfs2_rgrp_verify(rgd);
 	sc->sc_total += rgd->rd_data;
 	sc->sc_free += rgd->rd_free;
-	sc->sc_dinodes += rgd->rd_rg.rg_dinodes;
+	sc->sc_dinodes += rgd->rd_dinodes;
 	return 0;
 }
 
-- 
cgit v1.2.3


From fa75cedc3da5923b8ea3877be9d5bc09b02e3860 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 10 Nov 2008 10:10:12 +0000
Subject: GFS2: Add more detail to debugfs glock dumps

Although the glock dumps print quite a lot of information about
the glocks themselves, there are more things which can be
usefully added to the dump relating to the objects themselves.

This patch adds a few more fields to the inode and resource
group lines, which should be useful for debugging.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 848d64c8b62..68ee66552d1 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
-		  IF2DT(ip->i_inode.i_mode), ip->i_flags);
+		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
+		  (unsigned int)ip->i_diskflags,
+		  (unsigned long long)ip->i_inode.i_size,
+		  (unsigned long long)ip->i_disksize);
 	return 0;
 }
 
@@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 	const struct gfs2_rgrpd *rgd = gl->gl_object;
 	if (rgd == NULL)
 		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 37b2c8377c98acb60cf4d0126e385ef2153bded9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 17 Nov 2008 14:25:37 +0000
Subject: GFS2: Clean up & move gfs2_quotad

This patch is a clean up of gfs2_quotad prior to giving it an
extra job to do in addition to the current portfolio of updating
the quota and statfs information from time to time.

As a result it has been moved into quota.c allowing one of the
functions it calls to be made static. Also the clean up allows
the two existing functions to have separate timeouts and also
to coexist with its future role of dealing with the "truncate in
progress" inode flag.

The (pointless) setting of gfs2_quotad_secs is removed since we
arrange to only wake up quotad when one of the two timers expires.

In addition the struct gfs2_quota_data is moved into a slab cache,
mainly for easier debugging. It should also be possible to use
a shrinker in the future, rather than the current scheme of scanning
the quota data entries from time to time.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/daemon.c     | 53 ------------------------------------
 fs/gfs2/incore.h     |  4 +--
 fs/gfs2/main.c       | 10 +++++++
 fs/gfs2/ops_fstype.c |  5 +---
 fs/gfs2/quota.c      | 76 +++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/gfs2/quota.h      |  1 -
 fs/gfs2/sys.c        |  2 --
 fs/gfs2/util.c       |  1 +
 fs/gfs2/util.h       |  1 +
 9 files changed, 84 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index e51991947d2..5668aa77b95 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -23,7 +23,6 @@
 #include "daemon.h"
 #include "glock.h"
 #include "log.h"
-#include "quota.h"
 #include "recovery.h"
 #include "super.h"
 #include "util.h"
@@ -82,55 +81,3 @@ int gfs2_recoverd(void *data)
 	return 0;
 }
 
-/**
- * gfs2_quotad - Write cached quota changes into the quota file
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_quotad(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-	int error;
-
-	while (!kthread_should_stop()) {
-		/* Update the master statfs file */
-
-		t = sdp->sd_statfs_sync_time +
-		    gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			error = gfs2_statfs_sync(sdp);
-			if (error &&
-			    error != -EROFS &&
-			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-				fs_err(sdp, "quotad: (1) error=%d\n", error);
-			sdp->sd_statfs_sync_time = jiffies;
-		}
-
-		/* Update quota file */
-
-		t = sdp->sd_quota_sync_time +
-		    gfs2_tune_get(sdp, gt_quota_quantum) * HZ;
-
-		if (time_after_eq(jiffies, t)) {
-			error = gfs2_quota_sync(sdp);
-			if (error &&
-			    error != -EROFS &&
-			    !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-				fs_err(sdp, "quotad: (2) error=%d\n", error);
-			sdp->sd_quota_sync_time = jiffies;
-		}
-
-		gfs2_quota_scan(sdp);
-
-		t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9e3b613d0ba..cfebc179357 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -402,7 +402,6 @@ struct gfs2_tune {
 
 	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
-	unsigned int gt_quotad_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
@@ -509,7 +508,6 @@ struct gfs2_sbd {
 	spinlock_t sd_statfs_spin;
 	struct gfs2_statfs_change_host sd_statfs_master;
 	struct gfs2_statfs_change_host sd_statfs_local;
-	unsigned long sd_statfs_sync_time;
 
 	/* Resource group stuff */
 
@@ -551,13 +549,13 @@ struct gfs2_sbd {
 	atomic_t sd_quota_count;
 	spinlock_t sd_quota_spin;
 	struct mutex sd_quota_mutex;
+	wait_queue_head_t sd_quota_wait;
 
 	unsigned int sd_quota_slots;
 	unsigned int sd_quota_chunks;
 	unsigned char **sd_quota_bitmap;
 
 	u64 sd_quota_sync_gen;
-	unsigned long sd_quota_sync_time;
 
 	/* Log stuff */
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 3eea03c7853..e3f6f1844a2 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -93,6 +93,12 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_rgrpd_cachep)
 		goto fail;
 
+	gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad",
+					       sizeof(struct gfs2_quota_data),
+					       0, 0, NULL);
+	if (!gfs2_quotad_cachep)
+		goto fail;
+
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
 		goto fail;
@@ -112,6 +118,9 @@ fail_unregister:
 fail:
 	gfs2_glock_exit();
 
+	if (gfs2_quotad_cachep)
+		kmem_cache_destroy(gfs2_quotad_cachep);
+
 	if (gfs2_rgrpd_cachep)
 		kmem_cache_destroy(gfs2_rgrpd_cachep);
 
@@ -140,6 +149,7 @@ static void __exit exit_gfs2_fs(void)
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
 
+	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
 	kmem_cache_destroy(gfs2_inode_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index dd83e832235..5d137063b67 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -60,7 +60,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_log_flush_secs = 60;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
-	gt->gt_quotad_secs = 5;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
@@ -107,6 +106,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
 	spin_lock_init(&sdp->sd_quota_spin);
 	mutex_init(&sdp->sd_quota_mutex);
+	init_waitqueue_head(&sdp->sd_quota_wait);
 
 	spin_lock_init(&sdp->sd_log_lock);
 
@@ -970,9 +970,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 	}
 	sdp->sd_logd_process = p;
 
-	sdp->sd_statfs_sync_time = jiffies;
-	sdp->sd_quota_sync_time = jiffies;
-
 	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
 	error = IS_ERR(p);
 	if (error) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 228a4659618..0cfe44f0b6a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -46,6 +46,8 @@
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	struct gfs2_quota_data *qd;
 	int error;
 
-	qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
+	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
 	if (!qd)
 		return -ENOMEM;
 
@@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	return 0;
 
 fail:
-	kfree(qd);
+	kmem_cache_free(gfs2_quotad_cachep, qd);
 	return error;
 }
 
@@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 		if (qd || !create) {
 			if (new_qd) {
 				gfs2_lvb_unhold(new_qd->qd_gl);
-				kfree(new_qd);
+				kmem_cache_free(gfs2_quotad_cachep, new_qd);
 			}
 			*qdp = qd;
 			return 0;
@@ -1195,7 +1197,7 @@ fail:
 	return error;
 }
 
-void gfs2_quota_scan(struct gfs2_sbd *sdp)
+static void gfs2_quota_scan(struct gfs2_sbd *sdp)
 {
 	struct gfs2_quota_data *qd, *safe;
 	LIST_HEAD(dead);
@@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp)
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
 		gfs2_lvb_unhold(qd->qd_gl);
-		kfree(qd);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
 	}
 }
 
@@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
 		gfs2_lvb_unhold(qd->qd_gl);
-		kfree(qd);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
 
 		spin_lock(&sdp->sd_quota_spin);
 	}
@@ -1272,3 +1274,65 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	}
 }
 
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+	if (error == 0 || error == -EROFS)
+		return;
+	if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+}
+
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+			       int (*fxn)(struct gfs2_sbd *sdp),
+			       unsigned long t, unsigned long *timeo,
+			       unsigned int *new_timeo)
+{
+	if (t >= *timeo) {
+		int error = fxn(sdp);
+		quotad_error(sdp, msg, error);
+		*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+	} else {
+		*timeo -= t;
+	}
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+	DEFINE_WAIT(wait);
+
+	while (!kthread_should_stop()) {
+
+		/* Update the master statfs file */
+		quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+				   &statfs_timeo, &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		/* FIXME: This should be turned into a shrinker */
+		gfs2_quota_scan(sdp);
+
+		if (freezing(current))
+			refrigerator();
+		t = min(quotad_timeo, statfs_timeo);
+
+		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
+		t -= schedule_timeout(t);
+		finish_wait(&sdp->sd_quota_wait, &wait);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 3b7f4b0e5df..1d08aeef07e 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -29,7 +29,6 @@ int gfs2_quota_sync(struct gfs2_sbd *sdp);
 int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 
 int gfs2_quota_init(struct gfs2_sbd *sdp);
-void gfs2_quota_scan(struct gfs2_sbd *sdp);
 void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
 
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7e1879f1a02..59e36fd8090 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -408,7 +408,6 @@ TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
 TUNE_ATTR_DAEMON(logd_secs, logd_process);
-TUNE_ATTR_DAEMON(quotad_secs, quotad_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
@@ -426,7 +425,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_recoverd_secs.attr,
 	&tune_attr_logd_secs.attr,
-	&tune_attr_quotad_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index d31e355c61f..374f50e9549 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly;
 struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
+struct kmem_cache *gfs2_quotad_cachep __read_mostly;
 
 void gfs2_assert_i(struct gfs2_sbd *sdp)
 {
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 7f48576289c..33e96b0ce9a 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep;
 extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
 
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
 					   unsigned int *p)
-- 
cgit v1.2.3


From 813e0c46c9e2a0c6f0b6e774faac82afd7a2e812 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 18 Nov 2008 13:38:48 +0000
Subject: GFS2: Fix "truncate in progress" hang

Following on from the recent clean up of gfs2_quotad, this patch moves
the processing of "truncate in progress" inodes from the glock workqueue
into gfs2_quotad. This fixes a hang due to the "truncate in progress"
processing requiring glocks in order to complete.

It might seem odd to use gfs2_quotad for this particular item, but
we have to use a pre-existing thread since creating a thread implies
a GFP_KERNEL memory allocation which is not allowed from the glock
workqueue context. Of the existing threads, gfs2_logd and gfs2_recoverd
may deadlock if used for this operation. gfs2_scand and gfs2_glockd are
both scheduled for removal at some (hopefully not too distant) future
point. That leaves only gfs2_quotad whose workload is generally fairly
light and is easily adapted for this extra task.

Also, as a result of this change, it opens the way for a future patch to
make the reading of the inode's information asynchronous with respect to
the glock workqueue, which is another improvement that has been on the list
for some time now.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      | 33 +++++++++++++++++++++++++++++----
 fs/gfs2/glock.h      |  1 +
 fs/gfs2/glops.c      | 11 +++++++++--
 fs/gfs2/incore.h     |  3 +++
 fs/gfs2/main.c       |  1 +
 fs/gfs2/ops_fstype.c |  2 ++
 fs/gfs2/quota.c      | 31 ++++++++++++++++++++++++++++++-
 7 files changed, 75 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 27cb9cca9c0..4ddf3bd55dd 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -40,6 +40,7 @@
 #include "quota.h"
 #include "super.h"
 #include "util.h"
+#include "bmap.h"
 
 struct gfs2_gl_hash_bucket {
         struct hlist_head hb_list;
@@ -289,7 +290,8 @@ static void gfs2_holder_wake(struct gfs2_holder *gh)
  * do_promote - promote as many requests as possible on the current queue
  * @gl: The glock
  * 
- * Returns: true if there is a blocked holder at the head of the list
+ * Returns: 1 if there is a blocked holder at the head of the list, or 2
+ *          if a type specific operation is underway.
  */
 
 static int do_promote(struct gfs2_glock *gl)
@@ -312,6 +314,8 @@ restart:
 				ret = glops->go_lock(gh);
 				spin_lock(&gl->gl_spin);
 				if (ret) {
+					if (ret == 1)
+						return 2;
 					gh->gh_error = ret;
 					list_del_init(&gh->gh_list);
 					gfs2_holder_wake(gh);
@@ -416,6 +420,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_holder *gh;
 	unsigned state = ret & LM_OUT_ST_MASK;
+	int rv;
 
 	spin_lock(&gl->gl_spin);
 	state_change(gl, state);
@@ -470,7 +475,6 @@ retry:
 		gfs2_demote_wake(gl);
 	if (state != LM_ST_UNLOCKED) {
 		if (glops->go_xmote_bh) {
-			int rv;
 			spin_unlock(&gl->gl_spin);
 			rv = glops->go_xmote_bh(gl, gh);
 			if (rv == -EAGAIN)
@@ -481,10 +485,13 @@ retry:
 				goto out;
 			}
 		}
-		do_promote(gl);
+		rv = do_promote(gl);
+		if (rv == 2)
+			goto out_locked;
 	}
 out:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
+out_locked:
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_put(gl);
 }
@@ -584,6 +591,7 @@ __releases(&gl->gl_spin)
 __acquires(&gl->gl_spin)
 {
 	struct gfs2_holder *gh = NULL;
+	int ret;
 
 	if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
 		return;
@@ -602,8 +610,11 @@ __acquires(&gl->gl_spin)
 	} else {
 		if (test_bit(GLF_DEMOTE, &gl->gl_flags))
 			gfs2_demote_wake(gl);
-		if (do_promote(gl) == 0)
+		ret = do_promote(gl);
+		if (ret == 0)
 			goto out;
+		if (ret == 2)
+			return;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
@@ -1556,6 +1567,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 	}
 }
 
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
+{
+	struct gfs2_glock *gl = ip->i_gl;
+	int ret;
+
+	ret = gfs2_truncatei_resume(ip);
+	gfs2_assert_withdraw(gl->gl_sbd, ret == 0);
+
+	spin_lock(&gl->gl_spin);
+	clear_bit(GLF_LOCK, &gl->gl_flags);
+	run_queue(gl, 1);
+	spin_unlock(&gl->gl_spin);
+}
+
 static const char *state2str(unsigned state)
 {
 	switch(state) {
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 695c6b19361..13a64ee6523 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -132,6 +132,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 68ee66552d1..8ebff8ebae2 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -227,6 +227,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl)
 static int inode_go_lock(struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl = gh->gh_gl;
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_inode *ip = gl->gl_object;
 	int error = 0;
 
@@ -241,8 +242,14 @@ static int inode_go_lock(struct gfs2_holder *gh)
 
 	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
 	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
-	    (gh->gh_state == LM_ST_EXCLUSIVE))
-		error = gfs2_truncatei_resume(ip);
+	    (gh->gh_state == LM_ST_EXCLUSIVE)) {
+		spin_lock(&sdp->sd_trunc_lock);
+		if (list_empty(&ip->i_trunc_list))
+			list_add(&sdp->sd_trunc_list, &ip->i_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		wake_up(&sdp->sd_quota_wait);
+		return 1;
+	}
 
 	return error;
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index cfebc179357..dd7d0f8f357 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -244,6 +244,7 @@ struct gfs2_inode {
 	struct gfs2_alloc *i_alloc;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
+	struct list_head i_trunc_list;
 	u32 i_entries;
 	u32 i_diskflags;
 	u8 i_height;
@@ -550,6 +551,8 @@ struct gfs2_sbd {
 	spinlock_t sd_quota_spin;
 	struct mutex sd_quota_mutex;
 	wait_queue_head_t sd_quota_wait;
+	struct list_head sd_trunc_list;
+	spinlock_t sd_trunc_lock;
 
 	unsigned int sd_quota_slots;
 	unsigned int sd_quota_chunks;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index e3f6f1844a2..cf39295ccb9 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo)
 
 	inode_init_once(&ip->i_inode);
 	init_rwsem(&ip->i_rw_mutex);
+	INIT_LIST_HEAD(&ip->i_trunc_list);
 	ip->i_alloc = NULL;
 }
 
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 5d137063b67..a9a83804eea 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -107,6 +107,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	spin_lock_init(&sdp->sd_quota_spin);
 	mutex_init(&sdp->sd_quota_mutex);
 	init_waitqueue_head(&sdp->sd_quota_wait);
+	INIT_LIST_HEAD(&sdp->sd_trunc_list);
+	spin_lock_init(&sdp->sd_trunc_lock);
 
 	spin_lock_init(&sdp->sd_log_lock);
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0cfe44f0b6a..b08d09696b3 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1296,6 +1296,25 @@ static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
 	}
 }
 
+static void quotad_check_trunc_list(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip;
+
+	while(1) {
+		ip = NULL;
+		spin_lock(&sdp->sd_trunc_lock);
+		if (!list_empty(&sdp->sd_trunc_list)) {
+			ip = list_entry(sdp->sd_trunc_list.next,
+					struct gfs2_inode, i_trunc_list);
+			list_del_init(&ip->i_trunc_list);
+		}
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (ip == NULL)
+			return;
+		gfs2_glock_finish_truncate(ip);
+	}
+}
+
 /**
  * gfs2_quotad - Write cached quota changes into the quota file
  * @sdp: Pointer to GFS2 superblock
@@ -1310,6 +1329,7 @@ int gfs2_quotad(void *data)
 	unsigned long quotad_timeo = 0;
 	unsigned long t = 0;
 	DEFINE_WAIT(wait);
+	int empty;
 
 	while (!kthread_should_stop()) {
 
@@ -1324,12 +1344,21 @@ int gfs2_quotad(void *data)
 		/* FIXME: This should be turned into a shrinker */
 		gfs2_quota_scan(sdp);
 
+		/* Check for & recover partially truncated inodes */
+		quotad_check_trunc_list(sdp);
+
 		if (freezing(current))
 			refrigerator();
 		t = min(quotad_timeo, statfs_timeo);
 
 		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE);
-		t -= schedule_timeout(t);
+		spin_lock(&sdp->sd_trunc_lock);
+		empty = list_empty(&sdp->sd_trunc_list);
+		spin_unlock(&sdp->sd_trunc_lock);
+		if (empty)
+			t -= schedule_timeout(t);
+		else
+			t = 0;
 		finish_wait(&sdp->sd_quota_wait, &wait);
 	}
 
-- 
cgit v1.2.3


From 9ac1b4d9b6f885ccd7d8f56bceb609003a920ff7 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 19 Nov 2008 10:08:22 +0000
Subject: GFS2: Move gfs2_recoverd into recovery.c

By moving gfs2_recoverd, we can make an additional function static
and it also leaves only (the already scheduled for removal) gfs2_glockd
in daemon.c.

At the same time the declaration of gfs2_quotad is moved to quota.h
to reflect the new location of gfs2_quotad in a previous patch. Also
the recovery.h and quota.h headers are cleaned up.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/daemon.c     | 22 ----------------------
 fs/gfs2/daemon.h     |  2 --
 fs/gfs2/ops_fstype.c |  1 +
 fs/gfs2/quota.h      | 23 ++++++++++++-----------
 fs/gfs2/recovery.c   | 26 +++++++++++++++++++++++++-
 fs/gfs2/recovery.h   | 14 +++++++-------
 6 files changed, 45 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
index 5668aa77b95..2662df0d5b9 100644
--- a/fs/gfs2/daemon.c
+++ b/fs/gfs2/daemon.c
@@ -59,25 +59,3 @@ int gfs2_glockd(void *data)
 	return 0;
 }
 
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_recoverd(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-
-	while (!kthread_should_stop()) {
-		gfs2_check_journals(sdp);
-		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
index 4be084fb6a6..5258954a234 100644
--- a/fs/gfs2/daemon.h
+++ b/fs/gfs2/daemon.h
@@ -11,7 +11,5 @@
 #define __DAEMON_DOT_H__
 
 int gfs2_glockd(void *data);
-int gfs2_recoverd(void *data);
-int gfs2_quotad(void *data);
 
 #endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a9a83804eea..d159e7e7272 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -33,6 +33,7 @@
 #include "sys.h"
 #include "util.h"
 #include "log.h"
+#include "quota.h"
 
 #define DO 0
 #define UNDO 1
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 1d08aeef07e..cec9032be97 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -15,21 +15,22 @@ struct gfs2_sbd;
 
 #define NO_QUOTA_CHANGE ((u32)-1)
 
-int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unhold(struct gfs2_inode *ip);
+extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unhold(struct gfs2_inode *ip);
 
-int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_unlock(struct gfs2_inode *ip);
+extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_unlock(struct gfs2_inode *ip);
 
-int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
-void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
-		       u32 uid, u32 gid);
+extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid);
+extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
+			      u32 uid, u32 gid);
 
-int gfs2_quota_sync(struct gfs2_sbd *sdp);
-int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
+extern int gfs2_quota_sync(struct gfs2_sbd *sdp);
+extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id);
 
-int gfs2_quota_init(struct gfs2_sbd *sdp);
-void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quota_init(struct gfs2_sbd *sdp);
+extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
+extern int gfs2_quotad(void *data);
 
 static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 {
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index d5e91f4f6a0..b56ba3db777 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -14,6 +14,8 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/lm_interface.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -589,7 +591,7 @@ fail:
  *
  */
 
-void gfs2_check_journals(struct gfs2_sbd *sdp)
+static void gfs2_check_journals(struct gfs2_sbd *sdp)
 {
 	struct gfs2_jdesc *jd;
 
@@ -603,3 +605,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp)
 	}
 }
 
+/**
+ * gfs2_recoverd - Recover dead machine's journals
+ * @sdp: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_recoverd(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	unsigned long t;
+
+	while (!kthread_should_stop()) {
+		gfs2_check_journals(sdp);
+		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
+		if (freezing(current))
+			refrigerator();
+		schedule_timeout_interruptible(t);
+	}
+
+	return 0;
+}
+
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index f7235e61c72..a8218ea15b5 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
 	        *blk = 0;
 }
 
-int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
+extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
 			   struct buffer_head **bh);
 
-int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
+extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 
-int gfs2_find_jhead(struct gfs2_jdesc *jd,
+extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 		    struct gfs2_log_header_host *head);
-int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-void gfs2_check_journals(struct gfs2_sbd *sdp);
+extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
+extern int gfs2_recoverd(void *data);
 
 #endif /* __RECOVERY_DOT_H__ */
 
-- 
cgit v1.2.3


From 97cc1025b1a91c52e84f12478dcf0f853abc6564 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 20 Nov 2008 13:39:47 +0000
Subject: GFS2: Kill two daemons with one patch

This patch removes the two daemons, gfs2_scand and gfs2_glockd
and replaces them with a shrinker which is called from the VM.

The net result is that GFS2 responds better when there is memory
pressure, since it shrinks the glock cache at the same rate
as the VFS shrinks the dcache and icache. There are no longer
any time based criteria for shrinking glocks, they are kept
until such time as the VM asks for more memory and then we
demote just as many glocks as required.

There are potential future changes to this code, including the
possibility of sorting the glocks which are to be written back
into inode number order, to get a better I/O ordering. It would
be very useful to have an elevator based workqueue implementation
for this, as that would automatically deal with the read I/O cases
at the same time.

This patch is my answer to Andrew Morton's remark, made during
the initial review of GFS2, asking why GFS2 needs so many kernel
threads, the answer being that it doesn't :-) This patch is a
net loss of about 200 lines of code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile     |   2 +-
 fs/gfs2/daemon.c     |  61 -------------
 fs/gfs2/daemon.h     |  15 ----
 fs/gfs2/glock.c      | 248 ++++++++++++++++++++++-----------------------------
 fs/gfs2/glock.h      |   1 -
 fs/gfs2/glops.c      |  32 ++++---
 fs/gfs2/incore.h     |  16 +---
 fs/gfs2/inode.c      |   1 -
 fs/gfs2/main.c       |   2 +-
 fs/gfs2/mount.c      |  21 +----
 fs/gfs2/ops_fstype.c |  25 ------
 fs/gfs2/ops_super.c  |   5 --
 fs/gfs2/sys.c        |  42 +--------
 13 files changed, 130 insertions(+), 341 deletions(-)
 delete mode 100644 fs/gfs2/daemon.c
 delete mode 100644 fs/gfs2/daemon.h

(limited to 'fs')

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index ec65851ec80..c1b4ec6a965 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
-gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
+gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c
deleted file mode 100644
index 2662df0d5b9..00000000000
--- a/fs/gfs2/daemon.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
-#include <linux/freezer.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "daemon.h"
-#include "glock.h"
-#include "log.h"
-#include "recovery.h"
-#include "super.h"
-#include "util.h"
-
-/* This uses schedule_timeout() instead of msleep() because it's good for
-   the daemons to wake up more often than the timeout when unmounting so
-   the user's unmount doesn't sit there forever.
-
-   The kthread functions used to start these daemons block and flush signals. */
-
-/**
- * gfs2_glockd - Reclaim unused glock structures
- * @sdp: Pointer to GFS2 superblock
- *
- * One or more of these daemons run, reclaiming glocks on sd_reclaim_list.
- * Number of daemons can be set by user, with num_glockd mount option.
- */
-
-int gfs2_glockd(void *data)
-{
-	struct gfs2_sbd *sdp = data;
-
-	while (!kthread_should_stop()) {
-		while (atomic_read(&sdp->sd_reclaim_count))
-			gfs2_reclaim_glock(sdp);
-
-		wait_event_interruptible(sdp->sd_reclaim_wq,
-					 (atomic_read(&sdp->sd_reclaim_count) ||
-					 kthread_should_stop()));
-		if (freezing(current))
-			refrigerator();
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h
deleted file mode 100644
index 5258954a234..00000000000
--- a/fs/gfs2/daemon.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __DAEMON_DOT_H__
-#define __DAEMON_DOT_H__
-
-int gfs2_glockd(void *data);
-
-#endif /* __DAEMON_DOT_H__ */
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4ddf3bd55dd..07ffc8123d7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -62,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int
 
 static DECLARE_RWSEM(gfs2_umount_flush_sem);
 static struct dentry *gfs2_root;
-static struct task_struct *scand_process;
-static unsigned int scand_secs = 5;
 static struct workqueue_struct *glock_workqueue;
+static LIST_HEAD(lru_list);
+static atomic_t lru_count = ATOMIC_INIT(0);
+static spinlock_t lru_lock = SPIN_LOCK_UNLOCKED;
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -174,6 +175,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
 	atomic_inc(&gl->gl_ref);
 }
 
+/**
+ * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
+ * @gl: the glock
+ *
+ */
+
+static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
+{
+	spin_lock(&lru_lock);
+	if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) {
+		list_add_tail(&gl->gl_lru, &lru_list);
+		atomic_inc(&lru_count);
+	}
+	spin_unlock(&lru_lock);
+}
+
 /**
  * gfs2_glock_put() - Decrement reference count on glock
  * @gl: The glock to put
@@ -188,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 	if (atomic_dec_and_test(&gl->gl_ref)) {
 		hlist_del(&gl->gl_list);
 		write_unlock(gl_lock_addr(gl->gl_hash));
+		spin_lock(&lru_lock);
+		if (!list_empty(&gl->gl_lru)) {
+			list_del_init(&gl->gl_lru);
+			atomic_dec(&lru_count);
+		}
+		spin_unlock(&lru_lock);
 		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
 		goto out;
 	}
 	write_unlock(gl_lock_addr(gl->gl_hash));
+	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
+	if (atomic_read(&gl->gl_ref) == 2)
+		gfs2_glock_schedule_for_reclaim(gl);
 out:
 	return rv;
 }
@@ -837,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl)
  */
 
 static void handle_callback(struct gfs2_glock *gl, unsigned int state,
-			    int remote, unsigned long delay)
+			    unsigned long delay)
 {
 	int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
 
@@ -845,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
 	if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
 		gl->gl_demote_state = state;
 		gl->gl_demote_time = jiffies;
-		if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
-		    gl->gl_object)
-			gfs2_glock_schedule_for_reclaim(gl);
 	} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
 			gl->gl_demote_state != state) {
 		gl->gl_demote_state = LM_ST_UNLOCKED;
@@ -1017,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 
 	spin_lock(&gl->gl_spin);
 	if (gh->gh_flags & GL_NOCACHE)
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0);
 
 	list_del_init(&gh->gh_list);
 	if (find_first_holder(gl) == NULL) {
@@ -1288,7 +1311,7 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 		delay = gl->gl_ops->go_min_hold_time;
 
 	spin_lock(&gl->gl_spin);
-	handle_callback(gl, state, 1, delay);
+	handle_callback(gl, state, delay);
 	spin_unlock(&gl->gl_spin);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
 		gfs2_glock_put(gl);
@@ -1357,80 +1380,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
  * Returns: 1 if it's ok
  */
 
-static int demote_ok(struct gfs2_glock *gl)
+static int demote_ok(const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
-	int demote = 1;
-
-	if (test_bit(GLF_STICKY, &gl->gl_flags))
-		demote = 0;
-	else if (glops->go_demote_ok)
-		demote = glops->go_demote_ok(gl);
-
-	return demote;
-}
 
-/**
- * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
- * @gl: the glock
- *
- */
-
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (list_empty(&gl->gl_reclaim)) {
-		gfs2_glock_hold(gl);
-		list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
-		atomic_inc(&sdp->sd_reclaim_count);
-		spin_unlock(&sdp->sd_reclaim_lock);
-		wake_up(&sdp->sd_reclaim_wq);
-	} else
-		spin_unlock(&sdp->sd_reclaim_lock);
+	if (gl->gl_state == LM_ST_UNLOCKED)
+		return 0;
+	if (!list_empty(&gl->gl_holders))
+		return 0;
+	if (glops->go_demote_ok)
+		return glops->go_demote_ok(gl);
+	return 1;
 }
 
-/**
- * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list
- * @sdp: the filesystem
- *
- * Called from gfs2_glockd() glock reclaim daemon, or when promoting a
- * different glock and we notice that there are a lot of glocks in the
- * reclaim list.
- *
- */
 
-void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
+static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
 {
 	struct gfs2_glock *gl;
-	int done_callback = 0;
+	int may_demote;
+	int nr_skipped = 0;
+	int got_ref = 0;
+	LIST_HEAD(skipped);
 
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (list_empty(&sdp->sd_reclaim_list)) {
-		spin_unlock(&sdp->sd_reclaim_lock);
-		return;
-	}
-	gl = list_entry(sdp->sd_reclaim_list.next,
-			struct gfs2_glock, gl_reclaim);
-	list_del_init(&gl->gl_reclaim);
-	spin_unlock(&sdp->sd_reclaim_lock);
+	if (nr == 0)
+		goto out;
 
-	atomic_dec(&sdp->sd_reclaim_count);
-	atomic_inc(&sdp->sd_reclaimed);
+	if (!(gfp_mask & __GFP_FS))
+		return -1;
 
-	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL &&
-	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
-		done_callback = 1;
+	spin_lock(&lru_lock);
+	while(nr && !list_empty(&lru_list)) {
+		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
+
+		/* Test for being demotable */
+		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+			gfs2_glock_hold(gl);
+			got_ref = 1;
+			spin_unlock(&lru_lock);
+			spin_lock(&gl->gl_spin);
+			may_demote = demote_ok(gl);
+			spin_unlock(&gl->gl_spin);
+			clear_bit(GLF_LOCK, &gl->gl_flags);
+			if (may_demote) {
+				handle_callback(gl, LM_ST_UNLOCKED, 0);
+				nr--;
+				if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+					gfs2_glock_put(gl);
+			}
+			spin_lock(&lru_lock);
+			if (may_demote)
+				continue;
+		}
+		if (list_empty(&gl->gl_lru) &&
+		    (atomic_read(&gl->gl_ref) <= (2 + got_ref))) {
+			nr_skipped++;
+			list_add(&gl->gl_lru, &skipped);
+		}
+		if (got_ref) {
+			spin_unlock(&lru_lock);
+			gfs2_glock_put(gl);
+			spin_lock(&lru_lock);
+			got_ref = 0;
+		}
 	}
-	spin_unlock(&gl->gl_spin);
-	if (!done_callback ||
-	    queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-		gfs2_glock_put(gl);
+	list_splice(&skipped, &lru_list);
+	atomic_add(nr_skipped, &lru_count);
+	spin_unlock(&lru_lock);
+out:
+	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
 }
 
+static struct shrinker glock_shrinker = {
+	.shrink = gfs2_shrink_glock_memory,
+	.seeks = DEFAULT_SEEKS,
+};
+
 /**
  * examine_bucket - Call a function for glock in a hash bucket
  * @examiner: the function
@@ -1475,26 +1501,6 @@ out:
 	return has_entries;
 }
 
-/**
- * scan_glock - look at a glock and see if we can reclaim it
- * @gl: the glock to look at
- *
- */
-
-static void scan_glock(struct gfs2_glock *gl)
-{
-	if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
-		return;
-	if (test_bit(GLF_LOCK, &gl->gl_flags))
-		return;
-
-	spin_lock(&gl->gl_spin);
-	if (find_first_holder(gl) == NULL &&
-	    gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
-		gfs2_glock_schedule_for_reclaim(gl);
-	spin_unlock(&gl->gl_spin);
-}
-
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -1503,23 +1509,16 @@ static void scan_glock(struct gfs2_glock *gl)
 
 static void clear_glock(struct gfs2_glock *gl)
 {
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int released;
-
-	spin_lock(&sdp->sd_reclaim_lock);
-	if (!list_empty(&gl->gl_reclaim)) {
-		list_del_init(&gl->gl_reclaim);
-		atomic_dec(&sdp->sd_reclaim_count);
-		spin_unlock(&sdp->sd_reclaim_lock);
-		released = gfs2_glock_put(gl);
-		gfs2_assert(sdp, !released);
-	} else {
-		spin_unlock(&sdp->sd_reclaim_lock);
+	spin_lock(&lru_lock);
+	if (!list_empty(&gl->gl_lru)) {
+		list_del_init(&gl->gl_lru);
+		atomic_dec(&lru_count);
 	}
+	spin_unlock(&lru_lock);
 
 	spin_lock(&gl->gl_spin);
 	if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
-		handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
@@ -1656,8 +1655,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 	char *p = buf;
 	if (test_bit(GLF_LOCK, gflags))
 		*p++ = 'l';
-	if (test_bit(GLF_STICKY, gflags))
-		*p++ = 's';
 	if (test_bit(GLF_DEMOTE, gflags))
 		*p++ = 'D';
 	if (test_bit(GLF_PENDING_DEMOTE, gflags))
@@ -1776,34 +1773,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
 	return error;
 }
 
-/**
- * gfs2_scand - Look for cached glocks and inodes to toss from memory
- * @sdp: Pointer to GFS2 superblock
- *
- * One of these daemons runs, finding candidates to add to sd_reclaim_list.
- * See gfs2_glockd()
- */
-
-static int gfs2_scand(void *data)
-{
-	unsigned x;
-	unsigned delay;
-
-	while (!kthread_should_stop()) {
-		for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
-			examine_bucket(scan_glock, NULL, x);
-		if (freezing(current))
-			refrigerator();
-		delay = scand_secs;
-		if (delay < 1)
-			delay = 1;
-		schedule_timeout_interruptible(delay * HZ);
-	}
-
-	return 0;
-}
-
-
 
 int __init gfs2_glock_init(void)
 {
@@ -1817,28 +1786,21 @@ int __init gfs2_glock_init(void)
 	}
 #endif
 
-	scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand");
-	if (IS_ERR(scand_process))
-		return PTR_ERR(scand_process);
-
 	glock_workqueue = create_workqueue("glock_workqueue");
-	if (IS_ERR(glock_workqueue)) {
-		kthread_stop(scand_process);
+	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
-	}
+
+	register_shrinker(&glock_shrinker);
 
 	return 0;
 }
 
 void gfs2_glock_exit(void)
 {
+	unregister_shrinker(&glock_shrinker);
 	destroy_workqueue(glock_workqueue);
-	kthread_stop(scand_process);
 }
 
-module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-
 static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
 	struct gfs2_glock *gl;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 13a64ee6523..543ec7ecfbd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -129,7 +129,6 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8ebff8ebae2..8522d3aa64f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
  * Returns: 1 if it's ok
  */
 
-static int inode_go_demote_ok(struct gfs2_glock *gl)
+static int inode_go_demote_ok(const struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	int demote = 0;
-
-	if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages)
-		demote = 1;
-	else if (!sdp->sd_args.ar_localcaching &&
-		 time_after_eq(jiffies, gl->gl_stamp +
-			       gfs2_tune_get(sdp, gt_demote_secs) * HZ))
-		demote = 1;
-
-	return demote;
+	if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
+		return 0;
+	return 1;
 }
 
 /**
@@ -284,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
  * Returns: 1 if it's ok
  */
 
-static int rgrp_go_demote_ok(struct gfs2_glock *gl)
+static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
 {
 	return !gl->gl_aspace->i_mapping->nrpages;
 }
@@ -385,6 +378,18 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 	return 0;
 }
 
+/**
+ * trans_go_demote_ok
+ * @gl: the glock
+ *
+ * Always returns 0
+ */
+
+static int trans_go_demote_ok(const struct gfs2_glock *gl)
+{
+	return 0;
+}
+
 /**
  * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
  * @gl: the glock
@@ -392,7 +397,7 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
  * Returns: 1 if it's ok
  */
 
-static int quota_go_demote_ok(struct gfs2_glock *gl)
+static int quota_go_demote_ok(const struct gfs2_glock *gl)
 {
 	return !atomic_read(&gl->gl_lvb_count);
 }
@@ -426,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 const struct gfs2_glock_operations gfs2_trans_glops = {
 	.go_xmote_th = trans_go_sync,
 	.go_xmote_bh = trans_go_xmote_bh,
+	.go_demote_ok = trans_go_demote_ok,
 	.go_type = LM_TYPE_NONDISK,
 };
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dd7d0f8f357..608849d0002 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -125,7 +125,7 @@ struct gfs2_glock_operations {
 	void (*go_xmote_th) (struct gfs2_glock *gl);
 	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
 	void (*go_inval) (struct gfs2_glock *gl, int flags);
-	int (*go_demote_ok) (struct gfs2_glock *gl);
+	int (*go_demote_ok) (const struct gfs2_glock *gl);
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
 	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
@@ -155,7 +155,6 @@ struct gfs2_holder {
 
 enum {
 	GLF_LOCK			= 1,
-	GLF_STICKY			= 2,
 	GLF_DEMOTE			= 3,
 	GLF_PENDING_DEMOTE		= 4,
 	GLF_DEMOTE_IN_PROGRESS		= 5,
@@ -190,7 +189,7 @@ struct gfs2_glock {
 	unsigned long gl_tchange;
 	void *gl_object;
 
-	struct list_head gl_reclaim;
+	struct list_head gl_lru;
 
 	struct gfs2_sbd *gl_sbd;
 
@@ -397,7 +396,6 @@ struct gfs2_args {
 struct gfs2_tune {
 	spinlock_t gt_spin;
 
-	unsigned int gt_demote_secs; /* Cache retention for unheld glock */
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
 
@@ -478,10 +476,6 @@ struct gfs2_sbd {
 	/* Lock Stuff */
 
 	struct lm_lockstruct sd_lockstruct;
-	struct list_head sd_reclaim_list;
-	spinlock_t sd_reclaim_lock;
-	wait_queue_head_t sd_reclaim_wq;
-	atomic_t sd_reclaim_count;
 	struct gfs2_holder sd_live_gh;
 	struct gfs2_glock *sd_rename_gl;
 	struct gfs2_glock *sd_trans_gl;
@@ -541,8 +535,6 @@ struct gfs2_sbd {
 	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
-	struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX];
-	unsigned int sd_glockd_num;
 
 	/* Quota stuff */
 
@@ -615,10 +607,6 @@ struct gfs2_sbd {
 	struct mutex sd_freeze_lock;
 	unsigned int sd_freeze_count;
 
-	/* Counters */
-
-	atomic_t sd_reclaimed;
-
 	char sd_fsname[GFS2_FSNAME_LEN];
 	char sd_table_name[GFS2_FSNAME_LEN];
 	char sd_proto_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 97d3ce65e26..3b87c188da4 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -386,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
 	gfs2_free_di(rgd, ip);
 
 	gfs2_trans_end(sdp);
-	clear_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 
 out_rg_gunlock:
 	gfs2_glock_dq_uninit(&al->al_rgd_gh);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index cf39295ccb9..7cacfde3219 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -43,7 +43,7 @@ static void gfs2_init_glock_once(void *foo)
 	INIT_LIST_HEAD(&gl->gl_holders);
 	gl->gl_lvb = NULL;
 	atomic_set(&gl->gl_lvb_count, 0);
-	INIT_LIST_HEAD(&gl->gl_reclaim);
+	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
 	atomic_set(&gl->gl_ail_count, 0);
 }
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f96eb90a2cf..8c0f16e301f 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -32,7 +32,6 @@ enum {
 	Opt_debug,
 	Opt_nodebug,
 	Opt_upgrade,
-	Opt_num_glockd,
 	Opt_acl,
 	Opt_noacl,
 	Opt_quota_off,
@@ -57,7 +56,6 @@ static const match_table_t tokens = {
 	{Opt_debug, "debug"},
 	{Opt_nodebug, "nodebug"},
 	{Opt_upgrade, "upgrade"},
-	{Opt_num_glockd, "num_glockd=%d"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
 	{Opt_quota_off, "quota=off"},
@@ -96,7 +94,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 		spin_unlock(&gfs2_sys_margs_lock);
 
 		/*  Set some defaults  */
-		args->ar_num_glockd = GFS2_GLOCKD_DEFAULT;
 		args->ar_quota = GFS2_QUOTA_DEFAULT;
 		args->ar_data = GFS2_DATA_DEFAULT;
 	}
@@ -105,7 +102,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	   process them */
 
 	for (options = data; (o = strsep(&options, ",")); ) {
-		int token, option;
+		int token;
 		substring_t tmp[MAX_OPT_ARGS];
 
 		if (!*o)
@@ -196,22 +193,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 				goto cant_remount;
 			args->ar_upgrade = 1;
 			break;
-		case Opt_num_glockd:
-			if ((error = match_int(&tmp[0], &option))) {
-				fs_info(sdp, "problem getting num_glockd\n");
-				goto out_error;
-			}
-
-			if (remount && option != args->ar_num_glockd)
-				goto cant_remount;
-			if (!option || option > GFS2_GLOCKD_MAX) {
-				fs_info(sdp, "0 < num_glockd <= %u  (not %u)\n",
-				        GFS2_GLOCKD_MAX, option);
-				error = -EINVAL;
-				goto out_error;
-			}
-			args->ar_num_glockd = option;
-			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
 			sdp->sd_vfs->s_flags |= MS_POSIXACL;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d159e7e7272..fc300eafda8 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -22,7 +22,6 @@
 #include "gfs2.h"
 #include "incore.h"
 #include "bmap.h"
-#include "daemon.h"
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
@@ -56,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 {
 	spin_lock_init(&gt->gt_spin);
 
-	gt->gt_demote_secs = 300;
 	gt->gt_incore_log_blocks = 1024;
 	gt->gt_log_flush_secs = 60;
 	gt->gt_recoverd_secs = 60;
@@ -88,10 +86,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
 	gfs2_tune_init(&sdp->sd_tune);
 
-	INIT_LIST_HEAD(&sdp->sd_reclaim_list);
-	spin_lock_init(&sdp->sd_reclaim_lock);
-	init_waitqueue_head(&sdp->sd_reclaim_wq);
-
 	mutex_init(&sdp->sd_inum_mutex);
 	spin_lock_init(&sdp->sd_statfs_spin);
 
@@ -443,24 +437,11 @@ out:
 static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 			int undo)
 {
-	struct task_struct *p;
 	int error = 0;
 
 	if (undo)
 		goto fail_trans;
 
-	for (sdp->sd_glockd_num = 0;
-	     sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd;
-	     sdp->sd_glockd_num++) {
-		p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd");
-		error = IS_ERR(p);
-		if (error) {
-			fs_err(sdp, "can't start glockd thread: %d\n", error);
-			goto fail;
-		}
-		sdp->sd_glockd_process[sdp->sd_glockd_num] = p;
-	}
-
 	error = gfs2_glock_nq_num(sdp,
 				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
 				  LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE,
@@ -493,7 +474,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 		fs_err(sdp, "can't create transaction glock: %d\n", error);
 		goto fail_rename;
 	}
-	set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags);
 
 	return 0;
 
@@ -506,9 +486,6 @@ fail_live:
 fail_mount:
 	gfs2_glock_dq_uninit(mount_gh);
 fail:
-	while (sdp->sd_glockd_num--)
-		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
-
 	return error;
 }
 
@@ -681,7 +658,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		return PTR_ERR(sdp->sd_jindex);
 	}
 	ip = GFS2_I(sdp->sd_jindex);
-	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 
 	/* Load in the journal index special file */
 
@@ -832,7 +808,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 		goto fail_statfs;
 	}
 	ip = GFS2_I(sdp->sd_rindex);
-	set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
 	sdp->sd_rindex_uptodate = 0;
 
 	/* Read in the quota inode */
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index ad36af254fe..29f8a5c0b45 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -142,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb)
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
 	kthread_stop(sdp->sd_recoverd_process);
-	while (sdp->sd_glockd_num--)
-		kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
@@ -369,7 +367,6 @@ static void gfs2_clear_inode(struct inode *inode)
 	 */
 	if (test_bit(GIF_USER, &ip->i_flags)) {
 		ip->i_gl->gl_object = NULL;
-		gfs2_glock_schedule_for_reclaim(ip->i_gl);
 		gfs2_glock_put(ip->i_gl);
 		ip->i_gl = NULL;
 		if (ip->i_iopen_gh.gh_gl) {
@@ -422,8 +419,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		seq_printf(s, ",debug");
 	if (args->ar_upgrade)
 		seq_printf(s, ",upgrade");
-	if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT)
-		seq_printf(s, ",num_glockd=%u", args->ar_num_glockd);
 	if (args->ar_posix_acl)
 		seq_printf(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 59e36fd8090..67ba5b7b759 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -263,7 +263,6 @@ ARGS_ATTR(localcaching,    "%d\n");
 ARGS_ATTR(localflocks,     "%d\n");
 ARGS_ATTR(debug,           "%d\n");
 ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(num_glockd,      "%u\n");
 ARGS_ATTR(posix_acl,       "%d\n");
 ARGS_ATTR(quota,           "%u\n");
 ARGS_ATTR(suiddir,         "%d\n");
@@ -279,7 +278,6 @@ static struct attribute *args_attrs[] = {
 	&args_attr_localflocks.attr,
 	&args_attr_debug.attr,
 	&args_attr_upgrade.attr,
-	&args_attr_num_glockd.attr,
 	&args_attr_posix_acl.attr,
 	&args_attr_quota.attr,
 	&args_attr_suiddir.attr,
@@ -287,30 +285,6 @@ static struct attribute *args_attrs[] = {
 	NULL,
 };
 
-/*
- * display counters from superblock
- */
-
-struct counters_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
-#define COUNTERS_ATTR(name, fmt)                                            \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt,                                \
-			(unsigned int)atomic_read(&sdp->sd_##name));        \
-}                                                                           \
-static struct counters_attr counters_attr_##name = __ATTR_RO(name)
-
-COUNTERS_ATTR(reclaimed,        "%u\n");
-
-static struct attribute *counters_attrs[] = {
-	&counters_attr_reclaimed.attr,
-	NULL,
-};
-
 /*
  * get and set struct gfs2_tune fields
  */
@@ -393,7 +367,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-TUNE_ATTR(demote_secs, 0);
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -411,7 +384,6 @@ TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
-	&tune_attr_demote_secs.attr,
 	&tune_attr_incore_log_blocks.attr,
 	&tune_attr_log_flush_secs.attr,
 	&tune_attr_quota_warn_period.attr,
@@ -435,11 +407,6 @@ static struct attribute_group lockstruct_group = {
 	.attrs = lockstruct_attrs,
 };
 
-static struct attribute_group counters_group = {
-	.name = "counters",
-	.attrs = counters_attrs,
-};
-
 static struct attribute_group args_group = {
 	.name = "args",
 	.attrs = args_attrs,
@@ -464,13 +431,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_reg;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &counters_group);
-	if (error)
-		goto fail_lockstruct;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
 	if (error)
-		goto fail_counters;
+		goto fail_lockstruct;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
@@ -481,8 +444,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_counters:
-	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 fail_lockstruct:
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
@@ -496,7 +457,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &counters_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	kobject_put(&sdp->sd_kobj);
 }
-- 
cgit v1.2.3


From fdd1062ebaa422c5684f97fa91da06f91167d76b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Nov 2008 10:26:38 +0000
Subject: GFS2: Send some sensible sysfs stuff

We ought to inform the user of the locktable and lockproto for each
uevent we generate.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/sysfs.c | 16 +++++++++++++++-
 fs/gfs2/sys.c               | 16 +++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index 4ec571c3d8a..9b7edcf7bd4 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls)
 	kobject_put(&ls->kobj);
 }
 
+static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
+		       struct kobj_uevent_env *env)
+{
+        struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
+        add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
+        add_uevent_var(env, "LOCKPROTO=lock_dlm");
+        return 0;
+}
+
+static struct kset_uevent_ops gdlm_uevent_ops = {
+	.uevent = gdlm_uevent,
+};
+
+
 int gdlm_sysfs_init(void)
 {
-	gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj);
+	gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
 	if (!gdlm_kset) {
 		printk(KERN_WARNING "%s: can not create kset\n", __func__);
 		return -ENOMEM;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 67ba5b7b759..298bcb6c271 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -461,11 +461,25 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	kobject_put(&sdp->sd_kobj);
 }
 
+static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
+		       struct kobj_uevent_env *env)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
+	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	return 0;
+}
+
+static struct kset_uevent_ops gfs2_uevent_ops = {
+	.uevent = gfs2_uevent,
+};
+
+
 int gfs2_sys_init(void)
 {
 	gfs2_sys_margs = NULL;
 	spin_lock_init(&gfs2_sys_margs_lock);
-	gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj);
+	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
 	if (!gfs2_kset)
 		return -ENOMEM;
 	return 0;
-- 
cgit v1.2.3


From b52896813c2f16bcc5c5b67bb3c3f75bc084439b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Nov 2008 12:49:26 +0000
Subject: GFS2: Fix bug in gfs2_lock_fs_check_clean()

gfs2_lock_fs_check_clean() should not be calling gfs2_jindex_hold()
since it doesn't work like rindex hold, despite the comment. That
allows gfs2_jindex_hold() to be moved into ops_fstype.c where it
can be made static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.h        |  1 +
 fs/gfs2/ops_fstype.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/super.c      | 74 ----------------------------------------------------
 fs/gfs2/super.h      |  1 -
 4 files changed, 68 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 8a468cac932..4f919440c3b 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -11,6 +11,7 @@
 #define __DIR_DOT_H__
 
 #include <linux/dcache.h>
+#include <linux/crc32.h>
 
 struct inode;
 struct gfs2_inode;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fc300eafda8..4cae60f4a17 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -33,6 +33,7 @@
 #include "util.h"
 #include "log.h"
 #include "quota.h"
+#include "dir.h"
 
 #define DO 0
 #define UNDO 1
@@ -638,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
 					sdp->sd_lockstruct.ls_lockspace);
 }
 
+/**
+ * gfs2_jindex_hold - Grab a lock on the jindex
+ * @sdp: The GFS2 superblock
+ * @ji_gh: the holder for the jindex glock
+ *
+ * Returns: errno
+ */
+
+static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
+{
+	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
+	struct qstr name;
+	char buf[20];
+	struct gfs2_jdesc *jd;
+	int error;
+
+	name.name = buf;
+
+	mutex_lock(&sdp->sd_jindex_mutex);
+
+	for (;;) {
+		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
+		if (error)
+			break;
+
+		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
+		name.hash = gfs2_disk_hash(name.name, name.len);
+
+		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
+		if (error == -ENOENT) {
+			error = 0;
+			break;
+		}
+
+		gfs2_glock_dq_uninit(ji_gh);
+
+		if (error)
+			break;
+
+		error = -ENOMEM;
+		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
+		if (!jd)
+			break;
+
+		INIT_LIST_HEAD(&jd->extent_list);
+		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
+		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
+			if (!jd->jd_inode)
+				error = -ENOENT;
+			else
+				error = PTR_ERR(jd->jd_inode);
+			kfree(jd);
+			break;
+		}
+
+		spin_lock(&sdp->sd_jindex_spin);
+		jd->jd_jid = sdp->sd_journals++;
+		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
+		spin_unlock(&sdp->sd_jindex_spin);
+	}
+
+	mutex_unlock(&sdp->sd_jindex_mutex);
+
+	return error;
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b85877062a4..3dd9f5788cb 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,76 +33,6 @@
 #include "trans.h"
 #include "util.h"
 
-/**
- * gfs2_jindex_hold - Grab a lock on the jindex
- * @sdp: The GFS2 superblock
- * @ji_gh: the holder for the jindex glock
- *
- * This is very similar to the gfs2_rindex_hold() function, except that
- * in general we hold the jindex lock for longer periods of time and
- * we grab it far less frequently (in general) then the rgrp lock.
- *
- * Returns: errno
- */
-
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
-{
-	struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex);
-	struct qstr name;
-	char buf[20];
-	struct gfs2_jdesc *jd;
-	int error;
-
-	name.name = buf;
-
-	mutex_lock(&sdp->sd_jindex_mutex);
-
-	for (;;) {
-		error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh);
-		if (error)
-			break;
-
-		name.len = sprintf(buf, "journal%u", sdp->sd_journals);
-		name.hash = gfs2_disk_hash(name.name, name.len);
-
-		error = gfs2_dir_check(sdp->sd_jindex, &name, NULL);
-		if (error == -ENOENT) {
-			error = 0;
-			break;
-		}
-
-		gfs2_glock_dq_uninit(ji_gh);
-
-		if (error)
-			break;
-
-		error = -ENOMEM;
-		jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL);
-		if (!jd)
-			break;
-
-		INIT_LIST_HEAD(&jd->extent_list);
-		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
-		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
-			if (!jd->jd_inode)
-				error = -ENOENT;
-			else
-				error = PTR_ERR(jd->jd_inode);
-			kfree(jd);
-			break;
-		}
-
-		spin_lock(&sdp->sd_jindex_spin);
-		jd->jd_jid = sdp->sd_journals++;
-		list_add_tail(&jd->jd_list, &sdp->sd_jindex_list);
-		spin_unlock(&sdp->sd_jindex_spin);
-	}
-
-	mutex_unlock(&sdp->sd_jindex_mutex);
-
-	return error;
-}
-
 /**
  * gfs2_jindex_free - Clear all the journal index information
  * @sdp: The GFS2 superblock
@@ -580,10 +510,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 	struct gfs2_log_header_host lh;
 	int error;
 
-	error = gfs2_jindex_hold(sdp, &ji_gh);
-	if (error)
-		return error;
-
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
 		if (!lfcc) {
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 1848dad3ecb..c6254596713 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh);
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-- 
cgit v1.2.3


From 2bfb6449b7a1f29a2a63e1d869103b5811c3b69f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Nov 2008 13:30:49 +0000
Subject: GFS2: Move four functions from super.c

The functions which are being moved can all be marked
static in their new locations, since they only have
a single caller each. Their new locations are more
logical than before and some of the functions are
small enough that the compiler might well inline them.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c     |  14 +++++
 fs/gfs2/ops_super.c | 131 +++++++++++++++++++++++++++++++++++++++++
 fs/gfs2/recovery.c  |  22 +++++++
 fs/gfs2/super.c     | 164 ----------------------------------------------------
 fs/gfs2/super.h     |   4 --
 5 files changed, 167 insertions(+), 168 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 07ffc8123d7..6e298b07011 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1317,6 +1317,20 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 		gfs2_glock_put(gl);
 }
 
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_jid != jid)
+			continue;
+		jd->jd_dirty = 1;
+		break;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+}
+
 /**
  * gfs2_glock_cb - Callback used by locking module
  * @sdp: Pointer to the superblock
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 29f8a5c0b45..08837a72863 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -256,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb)
 	gfs2_unfreeze_fs(sb->s_fs_info);
 }
 
+/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change_host *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_data;
+	sc->sc_free += rgd->rd_free;
+	sc->sc_dinodes += rgd->rd_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_holder ri_gh;
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto out;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+	kfree(gha);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
 /**
  * gfs2_statfs - Gather and return stats about the filesystem
  * @sb: The superblock
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index b56ba3db777..efd09c3d2b2 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -585,6 +585,28 @@ fail:
 	return error;
 }
 
+static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
+{
+	struct gfs2_jdesc *jd;
+	int found = 0;
+
+	spin_lock(&sdp->sd_jindex_spin);
+
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_dirty) {
+			jd->jd_dirty = 0;
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	if (!found)
+		jd = NULL;
+
+	return jd;
+}
+
 /**
  * gfs2_check_journals - Recover any dirty journals
  * @sdp: the filesystem
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 3dd9f5788cb..141b781f2fc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -96,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid)
 	return jd;
 }
 
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-	struct gfs2_jdesc *jd;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	jd = jdesc_find_i(&sdp->sd_jindex_list, jid);
-	if (jd)
-		jd->jd_dirty = 1;
-	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
-	struct gfs2_jdesc *jd;
-	int found = 0;
-
-	spin_lock(&sdp->sd_jindex_spin);
-
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_dirty) {
-			jd->jd_dirty = 0;
-			found = 1;
-			break;
-		}
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	if (!found)
-		jd = NULL;
-
-	return jd;
-}
-
 int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 {
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
@@ -353,137 +320,6 @@ out:
 	return error;
 }
 
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-
-	spin_lock(&sdp->sd_statfs_spin);
-
-	*sc = *m_sc;
-	sc->sc_total += l_sc->sc_total;
-	sc->sc_free += l_sc->sc_free;
-	sc->sc_dinodes += l_sc->sc_dinodes;
-
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	if (sc->sc_free < 0)
-		sc->sc_free = 0;
-	if (sc->sc_free > sc->sc_total)
-		sc->sc_free = sc->sc_total;
-	if (sc->sc_dinodes < 0)
-		sc->sc_dinodes = 0;
-
-	return 0;
-}
-
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-			    struct gfs2_statfs_change_host *sc)
-{
-	gfs2_rgrp_verify(rgd);
-	sc->sc_total += rgd->rd_data;
-	sc->sc_free += rgd->rd_free;
-	sc->sc_dinodes += rgd->rd_dinodes;
-	return 0;
-}
-
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_holder ri_gh;
-	struct gfs2_rgrpd *rgd_next;
-	struct gfs2_holder *gha, *gh;
-	unsigned int slots = 64;
-	unsigned int x;
-	int done;
-	int error = 0, err;
-
-	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-	if (!gha)
-		return -ENOMEM;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto out;
-
-	rgd_next = gfs2_rgrpd_get_first(sdp);
-
-	for (;;) {
-		done = 1;
-
-		for (x = 0; x < slots; x++) {
-			gh = gha + x;
-
-			if (gh->gh_gl && gfs2_glock_poll(gh)) {
-				err = gfs2_glock_wait(gh);
-				if (err) {
-					gfs2_holder_uninit(gh);
-					error = err;
-				} else {
-					if (!error)
-						error = statfs_slow_fill(
-							gh->gh_gl->gl_object, sc);
-					gfs2_glock_dq_uninit(gh);
-				}
-			}
-
-			if (gh->gh_gl)
-				done = 0;
-			else if (rgd_next && !error) {
-				error = gfs2_glock_nq_init(rgd_next->rd_gl,
-							   LM_ST_SHARED,
-							   GL_ASYNC,
-							   gh);
-				rgd_next = gfs2_rgrpd_get_next(rgd_next);
-				done = 0;
-			}
-
-			if (signal_pending(current))
-				error = -ERESTARTSYS;
-		}
-
-		if (done)
-			break;
-
-		yield();
-	}
-
-	gfs2_glock_dq_uninit(&ri_gh);
-
-out:
-	kfree(gha);
-	return error;
-}
-
 struct lfcc {
 	struct list_head list;
 	struct gfs2_holder gh;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index c6254596713..f6b8b00ad88 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -28,8 +28,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid);
-struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp);
 int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
 int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
@@ -41,8 +39,6 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
 			s64 total, s64 free, s64 dinodes);
 int gfs2_statfs_sync(struct gfs2_sbd *sdp);
-int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
-int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc);
 
 int gfs2_freeze_fs(struct gfs2_sbd *sdp);
 void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
-- 
cgit v1.2.3


From 2e204703a1161e9bae38ba0d3d0df04a679e6f4f Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 26 Nov 2008 14:01:26 +0000
Subject: GFS2: Remove ancient, unused code

Remove code that used to have something to do with initrd
but has been unused for a long time.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/mount.c | 8 --------
 fs/gfs2/sys.c   | 6 ------
 fs/gfs2/sys.h   | 4 ----
 3 files changed, 18 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 8c0f16e301f..3cb0a44ba02 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -85,14 +85,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 	int error = 0;
 
 	if (!remount) {
-		/*  If someone preloaded options, use those instead  */
-		spin_lock(&gfs2_sys_margs_lock);
-		if (gfs2_sys_margs) {
-			data = gfs2_sys_margs;
-			gfs2_sys_margs = NULL;
-		}
-		spin_unlock(&gfs2_sys_margs_lock);
-
 		/*  Set some defaults  */
 		args->ar_quota = GFS2_QUOTA_DEFAULT;
 		args->ar_data = GFS2_DATA_DEFAULT;
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 298bcb6c271..26c1fa777a9 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,9 +26,6 @@
 #include "quota.h"
 #include "util.h"
 
-char *gfs2_sys_margs;
-spinlock_t gfs2_sys_margs_lock;
-
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -477,8 +474,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
 
 int gfs2_sys_init(void)
 {
-	gfs2_sys_margs = NULL;
-	spin_lock_init(&gfs2_sys_margs_lock);
 	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
 	if (!gfs2_kset)
 		return -ENOMEM;
@@ -487,7 +482,6 @@ int gfs2_sys_init(void)
 
 void gfs2_sys_uninit(void)
 {
-	kfree(gfs2_sys_margs);
 	kset_unregister(gfs2_kset);
 }
 
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index 1ca8cdac530..e94560e836d 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -13,10 +13,6 @@
 #include <linux/spinlock.h>
 struct gfs2_sbd;
 
-/* Allow args to be passed to GFS2 when using an initial ram disk */
-extern char *gfs2_sys_margs;
-extern spinlock_t gfs2_sys_margs_lock;
-
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp);
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
 
-- 
cgit v1.2.3


From 3af165ac4d099385b12e3e75a9ee3ffd02da33e0 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 27 Nov 2008 08:27:28 +0000
Subject: GFS2: Fix use-after-free bug on umount

There was a use-after-free with the GFS2 super block during
umount. This patch moves almost all of the umount code from
->put_super into ->kill_sb, the only bit that cannot be moved
being the glock hash clearing which has to remain as ->put_super
due to umount ordering requirements. As a result it's now obvious
that the kfree is the final operation, whereas before it was
hidden in ->put_super.

Also gfs2_jindex_free is then only referenced from a single file
so that's moved and marked static too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      |  3 +-
 fs/gfs2/glock.h      |  2 +-
 fs/gfs2/ops_fstype.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++------
 fs/gfs2/ops_super.c  | 68 ++----------------------------------
 fs/gfs2/super.c      | 34 ------------------
 fs/gfs2/super.h      |  3 +-
 6 files changed, 94 insertions(+), 114 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6e298b07011..5eae62e7f77 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1547,8 +1547,9 @@ static void clear_glock(struct gfs2_glock *gl)
  * Called when unmounting the filesystem.
  */
 
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
+void gfs2_gl_hash_clear(struct super_block *sb)
 {
+	struct gfs2_sbd *sdp = sb->s_fs_info;
 	unsigned long t;
 	unsigned int x;
 	int cont;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 543ec7ecfbd..ce54f338cff 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -130,7 +130,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
+void gfs2_gl_hash_clear(struct super_block *sb);
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 
 int __init gfs2_glock_init(void);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4cae60f4a17..2e735bece6b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -705,6 +705,40 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 	return error;
 }
 
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ */
+
+static void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+	struct list_head list, *head;
+	struct gfs2_jdesc *jd;
+	struct gfs2_journal_extent *jext;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_add(&list, &sdp->sd_jindex_list);
+	list_del_init(&sdp->sd_jindex_list);
+	sdp->sd_journals = 0;
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	while (!list_empty(&list)) {
+		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+		head = &jd->extent_list;
+		while (!list_empty(head)) {
+			jext = list_entry(head->next,
+					  struct gfs2_journal_extent,
+					  extent_list);
+			list_del(&jext->extent_list);
+			kfree(jext);
+		}
+		list_del(&jd->jd_list);
+		iput(jd->jd_inode);
+		kfree(jd);
+	}
+}
+
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
@@ -1203,7 +1237,7 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-	gfs2_gl_hash_clear(sdp);
+	gfs2_gl_hash_clear(sb);
 	gfs2_lm_unmount(sdp);
 	while (invalidate_inodes(sb))
 		yield();
@@ -1263,17 +1297,61 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	if (sdp) {
-		gfs2_meta_syncfs(sdp);
-		dput(sdp->sd_root_dir);
-		dput(sdp->sd_master_dir);
-		sdp->sd_root_dir = NULL;
-		sdp->sd_master_dir = NULL;
+
+	if (sdp == NULL) {
+		kill_block_super(sb);
+		return;
 	}
-	shrink_dcache_sb(sb);
+	gfs2_meta_syncfs(sdp);
+	dput(sdp->sd_root_dir);
+	dput(sdp->sd_master_dir);
+	sdp->sd_root_dir = NULL;
+	sdp->sd_master_dir = NULL;
+
+	/*  Unfreeze the filesystem, if we need to  */
+	mutex_lock(&sdp->sd_freeze_lock);
+	if (sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+	kthread_stop(sdp->sd_recoverd_process);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		int error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+
+	/* At this point, we're through modifying the disk */
+	gfs2_jindex_free(sdp);
+	gfs2_clear_rgrpd(sdp);
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_inum_inode);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_trans_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_ir_inode);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
 	kill_block_super(sb);
-	if (sdp)
-		gfs2_delete_debugfs_file(sdp);
+	gfs2_lm_unmount(sdp);
+	gfs2_sys_fs_del(sdp);
+	gfs2_delete_debugfs_file(sdp);
+	kfree(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 08837a72863..bd08a0a8d9b 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -95,7 +95,7 @@ do_flush:
  * Returns: errno
  */
 
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 {
 	struct gfs2_holder t_gh;
 	int error;
@@ -121,70 +121,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	return error;
 }
 
-/**
- * gfs2_put_super - Unmount the filesystem
- * @sb: The VFS superblock
- *
- */
-
-static void gfs2_put_super(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error;
-
-	/*  Unfreeze the filesystem, if we need to  */
-
-	mutex_lock(&sdp->sd_freeze_lock);
-	if (sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-	mutex_unlock(&sdp->sd_freeze_lock);
-
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-	kthread_stop(sdp->sd_recoverd_process);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		error = gfs2_make_fs_ro(sdp);
-		if (error)
-			gfs2_io_error(sdp);
-	}
-	/*  At this point, we're through modifying the disk  */
-
-	/*  Release stuff  */
-
-	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
-	iput(sdp->sd_statfs_inode);
-	iput(sdp->sd_rindex);
-	iput(sdp->sd_quota_inode);
-
-	gfs2_glock_put(sdp->sd_rename_gl);
-	gfs2_glock_put(sdp->sd_trans_gl);
-
-	if (!sdp->sd_args.ar_spectator) {
-		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
-		iput(sdp->sd_sc_inode);
-		iput(sdp->sd_qc_inode);
-	}
-
-	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
-	gfs2_clear_rgrpd(sdp);
-	gfs2_jindex_free(sdp);
-	/*  Take apart glock structures and buffer lists  */
-	gfs2_gl_hash_clear(sdp);
-	/*  Unmount the locking protocol  */
-	gfs2_lm_unmount(sdp);
-
-	/*  At this point, we're through participating in the lockspace  */
-	gfs2_sys_fs_del(sdp);
-	kfree(sdp);
-}
-
 /**
  * gfs2_write_super
  * @sb: the superblock
@@ -686,7 +622,7 @@ const struct super_operations gfs2_super_ops = {
 	.destroy_inode		= gfs2_destroy_inode,
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
-	.put_super		= gfs2_put_super,
+	.put_super		= gfs2_gl_hash_clear,
 	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.write_super_lockfs 	= gfs2_write_super_lockfs,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 141b781f2fc..f14658b2020 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,40 +33,6 @@
 #include "trans.h"
 #include "util.h"
 
-/**
- * gfs2_jindex_free - Clear all the journal index information
- * @sdp: The GFS2 superblock
- *
- */
-
-void gfs2_jindex_free(struct gfs2_sbd *sdp)
-{
-	struct list_head list, *head;
-	struct gfs2_jdesc *jd;
-	struct gfs2_journal_extent *jext;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	list_add(&list, &sdp->sd_jindex_list);
-	list_del_init(&sdp->sd_jindex_list);
-	sdp->sd_journals = 0;
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	while (!list_empty(&list)) {
-		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
-		head = &jd->extent_list;
-		while (!list_empty(head)) {
-			jext = list_entry(head->next,
-					  struct gfs2_journal_extent,
-					  extent_list);
-			list_del(&jext->extent_list);
-			kfree(jext);
-		}
-		list_del(&jd->jd_list);
-		iput(jd->jd_inode);
-		kfree(jd);
-	}
-}
-
 static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
 {
 	struct gfs2_jdesc *jd;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index f6b8b00ad88..4d2492b3e7e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,8 +25,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
-void gfs2_jindex_free(struct gfs2_sbd *sdp);
-
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
@@ -34,6 +32,7 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 			      struct gfs2_inode **ipp);
 
 int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 
 int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
-- 
cgit v1.2.3


From 9a776db7371b9c77a8f4f0d2ac6374d78ac7db7d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 27 Nov 2008 09:42:51 +0000
Subject: GFS2: Send useful information with uevent messages

In order to distinguish between two differing uevent messages
and to avoid using the (racy) method of reading status from
sysfs in future, this adds some status information to our
uevent messages.

Btw, before anybody says "sysfs isn't racy", I'm aware of that,
but the way that GFS2 was using it (send an ambiguous uevent and
then expect the receiver to read sysfs to find out the status
of the reported operation) was.

The additional benefit of using the new interface is that it
should be possible for a node to recover multiple journals
at the same time, since there is no longer any confusion as
to which journal the status belongs to.

At some future stage, when all the userland programs have been
converted, I intend to remove the old interface.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/locking/dlm/mount.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 0c4cbe6c828..1aa7eb6a022 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -194,17 +194,25 @@ out:
 static void gdlm_recovery_done(void *lockspace, unsigned int jid,
                                unsigned int message)
 {
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
 	struct gdlm_ls *ls = lockspace;
 	ls->recover_jid_done = jid;
 	ls->recover_jid_status = message;
-	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+	sprintf(env_jid, "JID=%d", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 
 static void gdlm_others_may_mount(void *lockspace)
 {
+	char *message = "FIRSTMOUNT=Done";
+	char *envp[] = { message, NULL };
 	struct gdlm_ls *ls = lockspace;
 	ls->first_done = 1;
-	kobject_uevent(&ls->kobj, KOBJ_CHANGE);
+	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
 }
 
 /* Userspace gets the offline uevent, blocks new gfs locks on
-- 
cgit v1.2.3


From 7ed122e42c72b3e4531f8b4a9f72159e8303ac15 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Dec 2008 10:28:10 +0000
Subject: GFS2: Streamline alloc calculations for writes

This patch removes some unused code, and makes the calculation
of the number of blocks required conditional in order to reduce
the number of times this (potentially expensive) calculation
is done.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c        | 49 +++++++------------------------------------------
 fs/gfs2/bmap.h        | 34 ++++++++++++++++++++++++++++++----
 fs/gfs2/ops_address.c |  6 ++++--
 fs/gfs2/ops_file.c    |  2 +-
 4 files changed, 42 insertions(+), 49 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 789f28cfdc2..11ffc56f1f8 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1230,35 +1230,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
 	return trunc_dealloc(ip, 0);
 }
 
-/**
- * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
- * @ip: the file
- * @len: the number of bytes to be written to the file
- * @data_blocks: returns the number of data blocks required
- * @ind_blocks: returns the number of indirect blocks required
- *
- */
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-			    unsigned int *data_blocks, unsigned int *ind_blocks)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int tmp;
-
-	if (gfs2_is_dir(ip)) {
-		*data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2;
-		*ind_blocks = 3 * (sdp->sd_max_jheight - 1);
-	} else {
-		*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
-		*ind_blocks = 3 * (sdp->sd_max_height - 1);
-	}
-
-	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
-		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
-		*ind_blocks += tmp;
-	}
-}
-
 /**
  * gfs2_write_alloc_required - figure out if a write will require an allocation
  * @ip: the file being written to
@@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 	struct buffer_head bh;
 	unsigned int shift;
 	u64 lblock, lblock_stop, size;
+	u64 end_of_file;
 
 	*alloc_required = 0;
 
@@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 
 	*alloc_required = 1;
 	shift = sdp->sd_sb.sb_bsize_shift;
-	if (gfs2_is_dir(ip)) {
-		unsigned int bsize = sdp->sd_jbsize;
-		lblock = offset;
-		do_div(lblock, bsize);
-		lblock_stop = offset + len + bsize - 1;
-		do_div(lblock_stop, bsize);
-	} else {
-		u64 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
-		lblock = offset >> shift;
-		lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
-		if (lblock_stop > end_of_file)
-			return 0;
-	}
+	BUG_ON(gfs2_is_dir(ip));
+	end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift;
+	lblock = offset >> shift;
+	lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
+	if (lblock_stop > end_of_file)
+		return 0;
 
 	size = (lblock_stop - lblock) << shift;
 	do {
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e6cde2943b..c983177e05a 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -10,10 +10,40 @@
 #ifndef __BMAP_DOT_H__
 #define __BMAP_DOT_H__
 
+#include "inode.h"
+
 struct inode;
 struct gfs2_inode;
 struct page;
 
+
+/**
+ * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file
+ * @ip: the file
+ * @len: the number of bytes to be written to the file
+ * @data_blocks: returns the number of data blocks required
+ * @ind_blocks: returns the number of indirect blocks required
+ *
+ */
+
+static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
+					  unsigned int len,
+					  unsigned int *data_blocks,
+					  unsigned int *ind_blocks)
+{
+	const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	unsigned int tmp;
+
+	BUG_ON(gfs2_is_dir(ip));
+	*data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3;
+	*ind_blocks = 3 * (sdp->sd_max_height - 1);
+
+	for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) {
+		tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
+		*ind_blocks += tmp;
+	}
+}
+
 int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
 int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create);
 int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen);
@@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 int gfs2_truncatei(struct gfs2_inode *ip, u64 size);
 int gfs2_truncatei_resume(struct gfs2_inode *ip);
 int gfs2_file_dealloc(struct gfs2_inode *ip);
-
-void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len,
-			    unsigned int *data_blocks,
-			    unsigned int *ind_blocks);
 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
 			      unsigned int len, int *alloc_required);
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 0df560f4269..6e4ea36c660 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -625,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct gfs2_inode *ip = GFS2_I(mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
-	unsigned int data_blocks, ind_blocks, rblocks;
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
 	int alloc_required;
 	int error = 0;
 	struct gfs2_alloc *al;
@@ -639,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	if (unlikely(error))
 		goto out_uninit;
 
-	gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
 	error = gfs2_write_alloc_required(ip, pos, len, &alloc_required);
 	if (error)
 		goto out_unlock;
 
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
+
 	if (alloc_required) {
 		al = gfs2_alloc_get(ip);
 		if (!al) {
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index a6b7a733fd4..289c5f54ba5 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -355,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 		goto out;
 
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
-	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
 		goto out_unlock;
@@ -367,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	ret = gfs2_quota_lock_check(ip);
 	if (ret)
 		goto out_alloc_put;
+	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
 	al->al_requested = data_blocks + ind_blocks;
 	ret = gfs2_inplace_reserve(ip);
 	if (ret)
-- 
cgit v1.2.3


From fefc03bfedeff2002f14e848ecb7c0cd77ee0b15 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 19 Dec 2008 15:32:06 +0000
Subject: Revert "GFS2: Fix use-after-free bug on umount"

This reverts commit 78802499912f1ba31ce83a94c55b5a980f250a43.

The original patch is causing problems in relation to order of
operations at umount in relation to jdata files. I need to fix
this a different way.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c      |  3 +-
 fs/gfs2/glock.h      |  2 +-
 fs/gfs2/ops_fstype.c | 98 ++++++----------------------------------------------
 fs/gfs2/ops_super.c  | 68 ++++++++++++++++++++++++++++++++++--
 fs/gfs2/super.c      | 34 ++++++++++++++++++
 fs/gfs2/super.h      |  3 +-
 6 files changed, 114 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 5eae62e7f77..6e298b07011 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1547,9 +1547,8 @@ static void clear_glock(struct gfs2_glock *gl)
  * Called when unmounting the filesystem.
  */
 
-void gfs2_gl_hash_clear(struct super_block *sb)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
-	struct gfs2_sbd *sdp = sb->s_fs_info;
 	unsigned long t;
 	unsigned int x;
 	int cont;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index ce54f338cff..543ec7ecfbd 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -130,7 +130,7 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
 void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct super_block *sb);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
 
 int __init gfs2_glock_init(void);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2e735bece6b..4cae60f4a17 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -705,40 +705,6 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 	return error;
 }
 
-/**
- * gfs2_jindex_free - Clear all the journal index information
- * @sdp: The GFS2 superblock
- *
- */
-
-static void gfs2_jindex_free(struct gfs2_sbd *sdp)
-{
-	struct list_head list, *head;
-	struct gfs2_jdesc *jd;
-	struct gfs2_journal_extent *jext;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	list_add(&list, &sdp->sd_jindex_list);
-	list_del_init(&sdp->sd_jindex_list);
-	sdp->sd_journals = 0;
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	while (!list_empty(&list)) {
-		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
-		head = &jd->extent_list;
-		while (!list_empty(head)) {
-			jext = list_entry(head->next,
-					  struct gfs2_journal_extent,
-					  extent_list);
-			list_del(&jext->extent_list);
-			kfree(jext);
-		}
-		list_del(&jd->jd_list);
-		iput(jd->jd_inode);
-		kfree(jd);
-	}
-}
-
 static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
@@ -1237,7 +1203,7 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
-	gfs2_gl_hash_clear(sb);
+	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
 	while (invalidate_inodes(sb))
 		yield();
@@ -1297,61 +1263,17 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-
-	if (sdp == NULL) {
-		kill_block_super(sb);
-		return;
-	}
-	gfs2_meta_syncfs(sdp);
-	dput(sdp->sd_root_dir);
-	dput(sdp->sd_master_dir);
-	sdp->sd_root_dir = NULL;
-	sdp->sd_master_dir = NULL;
-
-	/*  Unfreeze the filesystem, if we need to  */
-	mutex_lock(&sdp->sd_freeze_lock);
-	if (sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-	mutex_unlock(&sdp->sd_freeze_lock);
-
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-	kthread_stop(sdp->sd_recoverd_process);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		int error = gfs2_make_fs_ro(sdp);
-		if (error)
-			gfs2_io_error(sdp);
-	}
-
-	/* At this point, we're through modifying the disk */
-	gfs2_jindex_free(sdp);
-	gfs2_clear_rgrpd(sdp);
-	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
-	iput(sdp->sd_statfs_inode);
-	iput(sdp->sd_rindex);
-	iput(sdp->sd_quota_inode);
-
-	gfs2_glock_put(sdp->sd_rename_gl);
-	gfs2_glock_put(sdp->sd_trans_gl);
-
-	if (!sdp->sd_args.ar_spectator) {
-		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
-		iput(sdp->sd_sc_inode);
-		iput(sdp->sd_qc_inode);
+	if (sdp) {
+		gfs2_meta_syncfs(sdp);
+		dput(sdp->sd_root_dir);
+		dput(sdp->sd_master_dir);
+		sdp->sd_root_dir = NULL;
+		sdp->sd_master_dir = NULL;
 	}
-	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	shrink_dcache_sb(sb);
 	kill_block_super(sb);
-	gfs2_lm_unmount(sdp);
-	gfs2_sys_fs_del(sdp);
-	gfs2_delete_debugfs_file(sdp);
-	kfree(sdp);
+	if (sdp)
+		gfs2_delete_debugfs_file(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index bd08a0a8d9b..08837a72863 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -95,7 +95,7 @@ do_flush:
  * Returns: errno
  */
 
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 {
 	struct gfs2_holder t_gh;
 	int error;
@@ -121,6 +121,70 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	return error;
 }
 
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	/*  Unfreeze the filesystem, if we need to  */
+
+	mutex_lock(&sdp->sd_freeze_lock);
+	if (sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+	kthread_stop(sdp->sd_recoverd_process);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+	/*  At this point, we're through modifying the disk  */
+
+	/*  Release stuff  */
+
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_inum_inode);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_trans_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_ir_inode);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	gfs2_clear_rgrpd(sdp);
+	gfs2_jindex_free(sdp);
+	/*  Take apart glock structures and buffer lists  */
+	gfs2_gl_hash_clear(sdp);
+	/*  Unmount the locking protocol  */
+	gfs2_lm_unmount(sdp);
+
+	/*  At this point, we're through participating in the lockspace  */
+	gfs2_sys_fs_del(sdp);
+	kfree(sdp);
+}
+
 /**
  * gfs2_write_super
  * @sb: the superblock
@@ -622,7 +686,7 @@ const struct super_operations gfs2_super_ops = {
 	.destroy_inode		= gfs2_destroy_inode,
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
-	.put_super		= gfs2_gl_hash_clear,
+	.put_super		= gfs2_put_super,
 	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.write_super_lockfs 	= gfs2_write_super_lockfs,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f14658b2020..141b781f2fc 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,6 +33,40 @@
 #include "trans.h"
 #include "util.h"
 
+/**
+ * gfs2_jindex_free - Clear all the journal index information
+ * @sdp: The GFS2 superblock
+ *
+ */
+
+void gfs2_jindex_free(struct gfs2_sbd *sdp)
+{
+	struct list_head list, *head;
+	struct gfs2_jdesc *jd;
+	struct gfs2_journal_extent *jext;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_add(&list, &sdp->sd_jindex_list);
+	list_del_init(&sdp->sd_jindex_list);
+	sdp->sd_journals = 0;
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	while (!list_empty(&list)) {
+		jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
+		head = &jd->extent_list;
+		while (!list_empty(head)) {
+			jext = list_entry(head->next,
+					  struct gfs2_journal_extent,
+					  extent_list);
+			list_del(&jext->extent_list);
+			kfree(jext);
+		}
+		list_del(&jd->jd_list);
+		iput(jd->jd_inode);
+		kfree(jd);
+	}
+}
+
 static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid)
 {
 	struct gfs2_jdesc *jd;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 4d2492b3e7e..f6b8b00ad88 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,6 +25,8 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 	return x;
 }
 
+void gfs2_jindex_free(struct gfs2_sbd *sdp);
+
 struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
 int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
@@ -32,7 +34,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
 			      struct gfs2_inode **ipp);
 
 int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
 
 int gfs2_statfs_init(struct gfs2_sbd *sdp);
 void gfs2_statfs_change(struct gfs2_sbd *sdp,
-- 
cgit v1.2.3


From 88a19ad066c1aab2f9713beb670525fcc06e1c09 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 19 Dec 2008 15:43:05 +0000
Subject: GFS2: Fix use-after-free bug on umount (try #2)

This should solve the issue with the previous attempt at fixing this.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 20 ++++++++++++--------
 fs/gfs2/ops_super.c  |  1 -
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4cae60f4a17..f91eebdde58 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1263,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 static void gfs2_kill_sb(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
-	if (sdp) {
-		gfs2_meta_syncfs(sdp);
-		dput(sdp->sd_root_dir);
-		dput(sdp->sd_master_dir);
-		sdp->sd_root_dir = NULL;
-		sdp->sd_master_dir = NULL;
+
+	if (sdp == NULL) {
+		kill_block_super(sb);
+		return;
 	}
+
+	gfs2_meta_syncfs(sdp);
+	dput(sdp->sd_root_dir);
+	dput(sdp->sd_master_dir);
+	sdp->sd_root_dir = NULL;
+	sdp->sd_master_dir = NULL;
 	shrink_dcache_sb(sb);
 	kill_block_super(sb);
-	if (sdp)
-		gfs2_delete_debugfs_file(sdp);
+	gfs2_delete_debugfs_file(sdp);
+	kfree(sdp);
 }
 
 struct file_system_type gfs2_fs_type = {
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 08837a72863..777783deddc 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -182,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb)
 
 	/*  At this point, we're through participating in the lockspace  */
 	gfs2_sys_fs_del(sdp);
-	kfree(sdp);
 }
 
 /**
-- 
cgit v1.2.3


From eb8374e71f941a1b3c2ed6ea19dc809e7124dc5d Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 25 Dec 2008 15:35:27 +0100
Subject: GFS2: Use DEFINE_SPINLOCK

SPIN_LOCK_UNLOCKED is deprecated.  The following makes the change suggested
in Documentation/spinlocks.txt

The semantic patch that makes this change is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@@
declarer name DEFINE_SPINLOCK;
identifier xxx_lock;
@@

- spinlock_t xxx_lock = SPIN_LOCK_UNLOCKED;
+ DEFINE_SPINLOCK(xxx_lock);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6e298b07011..6b983aef785 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -65,7 +65,7 @@ static struct dentry *gfs2_root;
 static struct workqueue_struct *glock_workqueue;
 static LIST_HEAD(lru_list);
 static atomic_t lru_count = ATOMIC_INIT(0);
-static spinlock_t lru_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(lru_lock);
 
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
-- 
cgit v1.2.3


From 6545b246a2c815a8fcd07d58240effb6ec3481b1 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:22 +0000
Subject: Squashfs: inode operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/inode.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 346 insertions(+)
 create mode 100644 fs/squashfs/inode.c

(limited to 'fs')

diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
new file mode 100644
index 00000000000..7a63398bb85
--- /dev/null
+++ b/fs/squashfs/inode.c
@@ -0,0 +1,346 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * inode.c
+ */
+
+/*
+ * This file implements code to create and read inodes from disk.
+ *
+ * Inodes in Squashfs are identified by a 48-bit inode which encodes the
+ * location of the compressed metadata block containing the inode, and the byte
+ * offset into that block where the inode is placed (<block, offset>).
+ *
+ * To maximise compression there are different inodes for each file type
+ * (regular file, directory, device, etc.), the inode contents and length
+ * varying with the type.
+ *
+ * To further maximise compression, two types of regular file inode and
+ * directory inode are defined: inodes optimised for frequently occurring
+ * regular files and directories, and extended types where extra
+ * information has to be stored.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Initialise VFS inode with the base inode information common to all
+ * Squashfs inode types.  Sqsh_ino contains the unswapped base inode
+ * off disk.  Returns 0, or the error from squashfs_get_id().
+ */
+static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
+				struct squashfs_base_inode *sqsh_ino)
+{
+	int err;
+
+	err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid);
+	if (err)
+		return err;
+
+	err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid);
+	if (err)
+		return err;
+
+	inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+	inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
+	inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
+	inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+	inode->i_mode = le16_to_cpu(sqsh_ino->mode);
+	inode->i_size = 0;
+
+	return err;
+}
+
+/* Look up inode ino_number; if new, read it from on-disk location ino. */
+struct inode *squashfs_iget(struct super_block *sb, long long ino,
+				unsigned int ino_number)
+{
+	struct inode *inode = iget_locked(sb, ino_number);
+	int err;
+
+	TRACE("Entered squashfs_iget\n");
+
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+
+	err = squashfs_read_inode(inode, ino);
+	if (err) {
+		iget_failed(inode);
+		return ERR_PTR(err);
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+}
+
+
+/*
+ * Initialise VFS inode by reading inode from inode table (compressed
+ * metadata).  The format and amount of data read depends on type.
+ */
+int squashfs_read_inode(struct inode *inode, long long ino)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+	int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
+	union squashfs_inode squashfs_ino;
+	struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+
+	TRACE("Entered squashfs_read_inode\n");
+
+	/*
+	 * Read inode base common to all inode types.
+	 */
+	err = squashfs_read_metadata(sb, sqshb_ino, &block,
+				&offset, sizeof(*sqshb_ino));
+	if (err < 0)
+		goto failed_read;
+
+	err = squashfs_new_inode(sb, inode, sqshb_ino);
+	if (err)
+		goto failed_read;
+
+	block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table;
+	offset = SQUASHFS_INODE_OFFSET(ino);
+
+	type = le16_to_cpu(sqshb_ino->inode_type);
+	switch (type) {
+	case SQUASHFS_REG_TYPE: {
+		unsigned int frag_offset, frag_size, frag;
+		u64 frag_blk;
+		struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+							sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		frag = le32_to_cpu(sqsh_ino->fragment);
+		if (frag != SQUASHFS_INVALID_FRAG) {
+			frag_offset = le32_to_cpu(sqsh_ino->offset);
+			/* check via signed err: frag_size is unsigned */
+			err = squashfs_frag_lookup(sb, frag, &frag_blk);
+			if (err < 0)
+				goto failed_read;
+			frag_size = err;
+		} else {
+			frag_blk = SQUASHFS_INVALID_BLK;
+			frag_size = 0;
+			frag_offset = 0;
+		}
+
+		inode->i_nlink = 1;
+		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mode |= S_IFREG;
+		inode->i_blocks = ((inode->i_size - 1) >> 9) + 1;
+		squashfs_i(inode)->fragment_block = frag_blk;
+		squashfs_i(inode)->fragment_size = frag_size;
+		squashfs_i(inode)->fragment_offset = frag_offset;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->block_list_start = block;
+		squashfs_i(inode)->offset = offset;
+		inode->i_data.a_ops = &squashfs_aops;
+
+		TRACE("File inode %x:%x, start_block %llx, block_list_start "
+			"%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+			offset, squashfs_i(inode)->start, block, offset);
+		break;
+	}
+	case SQUASHFS_LREG_TYPE: {
+		unsigned int frag_offset, frag_size, frag;
+		u64 frag_blk;
+		struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+							sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		frag = le32_to_cpu(sqsh_ino->fragment);
+		if (frag != SQUASHFS_INVALID_FRAG) {
+			frag_offset = le32_to_cpu(sqsh_ino->offset);
+			/* check via signed err: frag_size is unsigned */
+			err = squashfs_frag_lookup(sb, frag, &frag_blk);
+			if (err < 0)
+				goto failed_read;
+			frag_size = err;
+		} else {
+			frag_blk = SQUASHFS_INVALID_BLK;
+			frag_size = 0;
+			frag_offset = 0;
+		}
+
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+		inode->i_fop = &generic_ro_fops;
+		inode->i_mode |= S_IFREG;
+		inode->i_blocks = ((inode->i_size -
+				le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1;
+
+		squashfs_i(inode)->fragment_block = frag_blk;
+		squashfs_i(inode)->fragment_size = frag_size;
+		squashfs_i(inode)->fragment_offset = frag_offset;
+		squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->block_list_start = block;
+		squashfs_i(inode)->offset = offset;
+		inode->i_data.a_ops = &squashfs_aops;
+
+		TRACE("File inode %x:%x, start_block %llx, block_list_start "
+			"%llx, offset %x\n", SQUASHFS_INODE_BLK(ino),
+			offset, squashfs_i(inode)->start, block, offset);
+		break;
+	}
+	case SQUASHFS_DIR_TYPE: {
+		struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		inode->i_size = le16_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_dir_inode_ops;
+		inode->i_fop = &squashfs_dir_ops;
+		inode->i_mode |= S_IFDIR;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+		squashfs_i(inode)->dir_idx_cnt = 0;
+		squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+		TRACE("Directory inode %x:%x, start_block %llx, offset %x\n",
+				SQUASHFS_INODE_BLK(ino), offset,
+				squashfs_i(inode)->start,
+				le16_to_cpu(sqsh_ino->offset));
+		break;
+	}
+	case SQUASHFS_LDIR_TYPE: {
+		struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		inode->i_size = le32_to_cpu(sqsh_ino->file_size);
+		inode->i_op = &squashfs_dir_inode_ops;
+		inode->i_fop = &squashfs_dir_ops;
+		inode->i_mode |= S_IFDIR;
+		squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block);
+		squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset);
+		squashfs_i(inode)->dir_idx_start = block;
+		squashfs_i(inode)->dir_idx_offset = offset;
+		squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count);
+		squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode);
+
+		TRACE("Long directory inode %x:%x, start_block %llx, offset "
+				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
+				squashfs_i(inode)->start,
+				le16_to_cpu(sqsh_ino->offset));
+		break;
+	}
+	case SQUASHFS_SYMLINK_TYPE:
+	case SQUASHFS_LSYMLINK_TYPE: {
+		struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
+		inode->i_op = &page_symlink_inode_operations;
+		inode->i_data.a_ops = &squashfs_symlink_aops;
+		inode->i_mode |= S_IFLNK;
+		squashfs_i(inode)->start = block;
+		squashfs_i(inode)->offset = offset;
+
+		TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
+				"%x\n", SQUASHFS_INODE_BLK(ino), offset,
+				block, offset);
+		break;
+	}
+	case SQUASHFS_BLKDEV_TYPE:
+	case SQUASHFS_CHRDEV_TYPE:
+	case SQUASHFS_LBLKDEV_TYPE:
+	case SQUASHFS_LCHRDEV_TYPE: {
+		struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
+		unsigned int rdev;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_CHRDEV_TYPE)
+			inode->i_mode |= S_IFCHR;
+		else
+			inode->i_mode |= S_IFBLK;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		rdev = le32_to_cpu(sqsh_ino->rdev);
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+		TRACE("Device inode %x:%x, rdev %x\n",
+				SQUASHFS_INODE_BLK(ino), offset, rdev);
+		break;
+	}
+	case SQUASHFS_FIFO_TYPE:
+	case SQUASHFS_SOCKET_TYPE:
+	case SQUASHFS_LFIFO_TYPE:
+	case SQUASHFS_LSOCKET_TYPE: {
+		struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
+
+		err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+				sizeof(*sqsh_ino));
+		if (err < 0)
+			goto failed_read;
+
+		if (type == SQUASHFS_FIFO_TYPE)
+			inode->i_mode |= S_IFIFO;
+		else
+			inode->i_mode |= S_IFSOCK;
+		inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+		init_special_inode(inode, inode->i_mode, 0);
+		break;
+	}
+	default:
+		ERROR("Unknown inode type %d in squashfs_iget!\n", type);
+		return -EINVAL;
+	}
+
+	return 0;
+
+failed_read:
+	ERROR("Unable to read inode 0x%llx\n", ino);
+	return err;
+}
-- 
cgit v1.2.3


From c88da2c979369e6bf8d2c0c80fad2f90c35e64ce Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:23 +0000
Subject: Squashfs: directory lookup operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/namei.c | 242 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 fs/squashfs/namei.c

(limited to 'fs')

diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
new file mode 100644
index 00000000000..9e398653b22
--- /dev/null
+++ b/fs/squashfs/namei.c
@@ -0,0 +1,242 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * namei.c
+ */
+
+/*
+ * This file implements code to do filename lookup in directories.
+ *
+ * Like inodes, directories are packed into compressed metadata blocks, stored
+ * in a directory table.  Directories are accessed using the start address of
+ * the metablock containing the directory and the offset into the
+ * decompressed block (<block, offset>).
+ *
+ * Directories are organised in a slightly complex way, and are not simply
+ * a list of file names.  The organisation takes advantage of the
+ * fact that (in most cases) the inodes of the files will be in the same
+ * compressed metadata block, and therefore, can share the start block.
+ * Directories are therefore organised in a two level list, a directory
+ * header containing the shared start block value, and a sequence of directory
+ * entries, each of which share the shared start block.  A new directory header
+ * is written once/if the inode start block changes.  The directory
+ * header/directory entry list is repeated as many times as necessary.
+ *
+ * Directories are sorted, and can contain a directory index to speed up
+ * file lookup.  Directory indexes store one entry per metablock, each entry
+ * storing the index/filename mapping to the first directory header
+ * in each metadata block.  Directories are sorted in alphabetical order,
+ * and at lookup the index is scanned linearly looking for the first filename
+ * alphabetically larger than the filename being looked up.  At this point the
+ * location of the metadata block the filename is in has been found.
+ * The general idea of the index is to ensure only one metadata block needs to
+ * be decompressed to do a lookup irrespective of the length of the directory.
+ * This scheme has the advantage that it doesn't require extra memory overhead
+ * and doesn't require much extra storage on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/dcache.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Lookup name in the directory index, returning the location of the metadata
+ * block containing it, and the directory index this represents.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ */
+static int get_dir_index_using_name(struct super_block *sb,
+			u64 *next_block, int *next_offset, u64 index_start,
+			int index_offset, int i_count, const char *name,
+			int len)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int i, size, length = 0, err;
+	struct squashfs_dir_index *index;
+	char *str;
+
+	TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count);
+
+	index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL);
+	if (index == NULL) {
+		ERROR("Failed to allocate squashfs_dir_index\n");
+		goto out;
+	}
+
+	str = &index->name[SQUASHFS_NAME_LEN + 1];
+	strncpy(str, name, len);
+	str[len] = '\0';
+
+	for (i = 0; i < i_count; i++) {
+		err = squashfs_read_metadata(sb, index, &index_start,
+					&index_offset, sizeof(*index));
+		if (err < 0)
+			break;
+
+		size = le32_to_cpu(index->size) + 1;
+		if (size < 1 || size > SQUASHFS_NAME_LEN)
+			break;	/* corrupt: won't fit in index->name */
+		err = squashfs_read_metadata(sb, index->name, &index_start,
+					&index_offset, size);
+		if (err < 0)
+			break;
+
+		index->name[size] = '\0';
+
+		if (strcmp(index->name, str) > 0)
+			break;
+
+		length = le32_to_cpu(index->index);
+		*next_block = le32_to_cpu(index->start_block) +
+					msblk->directory_table;
+	}
+
+	*next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+	kfree(index);
+
+out:
+	/*
+	 * Return index (f_pos) of the looked up metadata block.  Translate
+	 * from internal f_pos to external f_pos which is offset by 3 because
+	 * we invent "." and ".." entries which are not actually stored in the
+	 * directory.
+	 */
+	return length + 3;
+}
+
+
+static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
+				 struct nameidata *nd)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	struct inode *inode = NULL;
+	struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info;
+	struct squashfs_dir_header dirh;
+	struct squashfs_dir_entry *dire;
+	u64 block = squashfs_i(dir)->start + msblk->directory_table;
+	int offset = squashfs_i(dir)->offset;
+	int err, length = 0, dir_count, size;
+
+	TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset);
+
+	dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+	if (dire == NULL) {
+		ERROR("Failed to allocate squashfs_dir_entry\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (len > SQUASHFS_NAME_LEN) {
+		err = -ENAMETOOLONG;
+		goto failed;
+	}
+
+	length = get_dir_index_using_name(dir->i_sb, &block, &offset,
+				squashfs_i(dir)->dir_idx_start,
+				squashfs_i(dir)->dir_idx_offset,
+				squashfs_i(dir)->dir_idx_cnt, name, len);
+
+	while (length < i_size_read(dir)) {
+		/* Read directory header. */
+		err = squashfs_read_metadata(dir->i_sb, &dirh, &block,
+				&offset, sizeof(dirh));
+		if (err < 0)
+			goto read_failure;
+
+		length += sizeof(dirh);
+
+		dir_count = le32_to_cpu(dirh.count) + 1;
+		while (dir_count--) {
+			/* Read directory entry. */
+			err = squashfs_read_metadata(dir->i_sb, dire, &block,
+					&offset, sizeof(*dire));
+			if (err < 0)
+				goto read_failure;
+
+			size = le16_to_cpu(dire->size) + 1;
+			if (size > SQUASHFS_NAME_LEN) {
+				err = -EIO;	/* name would overrun dire */
+				goto read_failure;
+			}
+
+			err = squashfs_read_metadata(dir->i_sb, dire->name,
+					&block, &offset, size);
+			if (err < 0)
+				goto read_failure;
+
+			length += sizeof(*dire) + size;
+
+			if (name[0] < dire->name[0])
+				goto exit_lookup;
+
+			if (len == size && !strncmp(name, dire->name, len)) {
+				unsigned int blk, off, ino_num;
+				long long ino;
+				blk = le32_to_cpu(dirh.start_block);
+				off = le16_to_cpu(dire->offset);
+				ino_num = le32_to_cpu(dirh.inode_number) +
+					(short) le16_to_cpu(dire->inode_number);
+				ino = SQUASHFS_MKINODE(blk, off);
+
+				TRACE("calling squashfs_iget for directory "
+					"entry %s, inode  %x:%x, %d\n", name,
+					blk, off, ino_num);
+
+				inode = squashfs_iget(dir->i_sb, ino, ino_num);
+				if (IS_ERR(inode)) {
+					err = PTR_ERR(inode);
+					goto failed;
+				}
+
+				goto exit_lookup;
+			}
+		}
+	}
+
+exit_lookup:
+	kfree(dire);
+	if (inode)
+		return d_splice_alias(inode, dentry);
+	d_add(dentry, inode);
+	return ERR_PTR(0);
+
+read_failure:
+	ERROR("Unable to read directory block [%llx:%x]\n",
+		squashfs_i(dir)->start + msblk->directory_table,
+		squashfs_i(dir)->offset);
+failed:
+	kfree(dire);
+	return ERR_PTR(err);
+}
+
+/* Directory inode operations: only lookup is provided here. */
+const struct inode_operations squashfs_dir_inode_ops = {
+	.lookup = squashfs_lookup
+};
-- 
cgit v1.2.3


From 07972dde75c321162d076a925e3464ba259e73d7 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:23 +0000
Subject: Squashfs: directory readdir operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/dir.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)
 create mode 100644 fs/squashfs/dir.c

(limited to 'fs')

diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
new file mode 100644
index 00000000000..566b0eaed86
--- /dev/null
+++ b/fs/squashfs/dir.c
@@ -0,0 +1,235 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * dir.c
+ */
+
+/*
+ * This file implements code to read directories from disk.
+ *
+ * See namei.c for a description of directory organisation on disk.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * DT_* values handed to filldir, indexed by the type field of an on-disk
+ * directory entry.  Entry 1 (DT_DIR) is also used by squashfs_readdir() for
+ * the invented "." and ".." entries.
+ */
+static const unsigned char squashfs_filetype_table[] = {
+	DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK
+};
+
+/*
+ * Lookup offset (f_pos) in the directory index, returning the
+ * metadata block containing it.
+ *
+ * If we get an error reading the index then return the part of the index
+ * (if any) we have managed to read - the index isn't essential, just
+ * quicker.
+ *
+ * Up to i_count index entries are read, starting at metadata location
+ * <index_start, index_offset>.  The walk stops at the first entry whose
+ * index is greater than the internal f_pos, or at the first read error.
+ * For every entry passed over, *next_block is set to that entry's
+ * start_block plus the start of the directory table.  Before returning,
+ * *next_offset is advanced by the internal position reached, modulo
+ * SQUASHFS_METADATA_SIZE.
+ *
+ * Returns the external position of the last index entry passed over (3 if
+ * none was), or f_pos itself when f_pos is below 3.
+ */
+static int get_dir_index_using_offset(struct super_block *sb,
+	u64 *next_block, int *next_offset, u64 index_start, int index_offset,
+	int i_count, u64 f_pos)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int err, i, index, length = 0;
+	struct squashfs_dir_index dir_index;
+
+	TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n",
+					i_count, f_pos);
+
+	/*
+	 * Translate from external f_pos to the internal f_pos.  This
+	 * is offset by 3 because we invent "." and ".." entries which are
+	 * not actually stored in the directory.
+	 */
+	if (f_pos < 3)
+		return f_pos;
+	f_pos -= 3;
+
+	for (i = 0; i < i_count; i++) {
+		err = squashfs_read_metadata(sb, &dir_index, &index_start,
+				&index_offset, sizeof(dir_index));
+		if (err < 0)
+			break;
+
+		index = le32_to_cpu(dir_index.index);
+		if (index > f_pos)
+			/*
+			 * Found the index we're looking for.
+			 */
+			break;
+
+		/*
+		 * Step over the size + 1 bytes that follow the fixed-size
+		 * index entry; no buffer is supplied, so nothing is kept
+		 * (presumably this is the entry's name - confirm against
+		 * the on-disk format).
+		 */
+		err = squashfs_read_metadata(sb, NULL, &index_start,
+				&index_offset, le32_to_cpu(dir_index.size) + 1);
+		if (err < 0)
+			break;
+
+		length = index;
+		*next_block = le32_to_cpu(dir_index.start_block) +
+					msblk->directory_table;
+	}
+
+	*next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE;
+
+	/*
+	 * Translate back from internal f_pos to external f_pos.
+	 */
+	return length + 3;
+}
+
+
+/*
+ * Emit the entries of a directory through filldir, resuming at file->f_pos.
+ *
+ * Positions 0 to 2 are used for the invented "." and ".." entries; from 3
+ * onwards f_pos is the byte offset into the on-disk directory plus 3.  The
+ * directory index is consulted first so that reading can resume near f_pos
+ * rather than at the start of the directory.
+ *
+ * Entry name sizes and types come straight from disk, so both are checked
+ * before they are used to fill the name buffer or to index
+ * squashfs_filetype_table; an out of range value is handled like a read
+ * failure.
+ *
+ * Always returns 0.  Allocation and read failures are logged and end the
+ * listing early.
+ */
+static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	struct inode *inode = file->f_dentry->d_inode;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	u64 block = squashfs_i(inode)->start + msblk->directory_table;
+	int offset = squashfs_i(inode)->offset, length = 0, dir_count, size,
+				type, err;
+	unsigned int inode_number;
+	struct squashfs_dir_header dirh;
+	struct squashfs_dir_entry *dire;
+
+	TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset);
+
+	/* Entry header followed by room for the longest name plus a NUL. */
+	dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL);
+	if (dire == NULL) {
+		ERROR("Failed to allocate squashfs_dir_entry\n");
+		goto finish;
+	}
+
+	/*
+	 * Return "." and  ".." entries as the first two filenames in the
+	 * directory.  To maximise compression these two entries are not
+	 * stored in the directory, and so we invent them here.
+	 *
+	 * It also means that the external f_pos is offset by 3 from the
+	 * on-disk directory f_pos.
+	 */
+	while (file->f_pos < 3) {
+		char *name;
+		int i_ino;
+
+		if (file->f_pos == 0) {
+			name = ".";
+			size = 1;
+			i_ino = inode->i_ino;
+		} else {
+			name = "..";
+			size = 2;
+			i_ino = squashfs_i(inode)->parent;
+		}
+
+		TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n",
+				dirent, name, size, file->f_pos, i_ino,
+				squashfs_filetype_table[1]);
+
+		if (filldir(dirent, name, size, file->f_pos, i_ino,
+				squashfs_filetype_table[1]) < 0) {
+			TRACE("Filldir returned less than 0\n");
+			goto finish;
+		}
+
+		/* "." moves f_pos to 1, ".." then moves it to 3. */
+		file->f_pos += size;
+	}
+
+	length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
+				squashfs_i(inode)->dir_idx_start,
+				squashfs_i(inode)->dir_idx_offset,
+				squashfs_i(inode)->dir_idx_cnt,
+				file->f_pos);
+
+	while (length < i_size_read(inode)) {
+		/*
+		 * Read directory header
+		 */
+		err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
+					&offset, sizeof(dirh));
+		if (err < 0)
+			goto failed_read;
+
+		length += sizeof(dirh);
+
+		dir_count = le32_to_cpu(dirh.count) + 1;
+		while (dir_count--) {
+			/*
+			 * Read directory entry.
+			 */
+			err = squashfs_read_metadata(inode->i_sb, dire, &block,
+					&offset, sizeof(*dire));
+			if (err < 0)
+				goto failed_read;
+
+			size = le16_to_cpu(dire->size) + 1;
+
+			/*
+			 * The name area holds SQUASHFS_NAME_LEN + 1 bytes and
+			 * must also take the terminating NUL written below.
+			 * A larger size can only come from a corrupted
+			 * directory and would overrun the allocation.
+			 */
+			if (size > SQUASHFS_NAME_LEN)
+				goto failed_read;
+
+			err = squashfs_read_metadata(inode->i_sb, dire->name,
+					&block, &offset, size);
+			if (err < 0)
+				goto failed_read;
+
+			length += sizeof(*dire) + size;
+
+			/* Entry lies before the resume position: skip it. */
+			if (file->f_pos >= length)
+				continue;
+
+			dire->name[size] = '\0';
+			inode_number = le32_to_cpu(dirh.inode_number) +
+				((short) le16_to_cpu(dire->inode_number));
+			type = le16_to_cpu(dire->type);
+
+			/*
+			 * The type indexes squashfs_filetype_table (one byte
+			 * per element), so reject anything past its end.
+			 */
+			if (type >= (int) sizeof(squashfs_filetype_table))
+				goto failed_read;
+
+			TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)"
+					"\n", dirent, dire->name, size,
+					file->f_pos,
+					le32_to_cpu(dirh.start_block),
+					le16_to_cpu(dire->offset),
+					inode_number,
+					squashfs_filetype_table[type]);
+
+			if (filldir(dirent, dire->name, size, file->f_pos,
+					inode_number,
+					squashfs_filetype_table[type]) < 0) {
+				TRACE("Filldir returned less than 0\n");
+				goto finish;
+			}
+
+			file->f_pos = length;
+		}
+	}
+
+finish:
+	kfree(dire);
+	return 0;
+
+failed_read:
+	ERROR("Unable to read directory block [%llx:%x]\n", block, offset);
+	kfree(dire);
+	return 0;
+}
+
+
+/*
+ * File operations table: directory listing goes through
+ * squashfs_readdir(), and read is wired to generic_read_dir.
+ */
+const struct file_operations squashfs_dir_ops = {
+	.read = generic_read_dir,
+	.readdir = squashfs_readdir
+};
-- 
cgit v1.2.3


From 1701aecb6849cc69bd54890532bcf92eedb00b74 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:24 +0000
Subject: Squashfs: regular file operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/file.c | 502 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 502 insertions(+)
 create mode 100644 fs/squashfs/file.c

(limited to 'fs')

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
new file mode 100644
index 00000000000..717767d831d
--- /dev/null
+++ b/fs/squashfs/file.c
@@ -0,0 +1,502 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * file.c
+ */
+
+/*
+ * This file contains code for handling regular files.  A regular file
+ * consists of a sequence of contiguous compressed blocks, and/or a
+ * compressed fragment block (tail-end packed block).   The compressed size
+ * of each datablock is stored in a block list contained within the
+ * file inode (itself stored in one or more compressed metadata blocks).
+ *
+ * To speed up access to datablocks when reading 'large' files (256 Mbytes or
+ * larger), the code implements an index cache that caches the mapping from
+ * block index to datablock location on disk.
+ *
+ * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+ * retaining a simple and space-efficient block list on disk.  The cache
+ * is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+ * Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+ * The index cache is designed to be memory efficient, and by default uses
+ * 16 KiB.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Search the index cache for an unlocked slot belonging to this inode whose
+ * starting offset lies within [offset, index].  When several slots qualify,
+ * the lower bound is raised to each match in turn, so a later slot only
+ * replaces an earlier one if it starts at the same or a higher offset.
+ *
+ * The chosen slot is returned locked.  NULL is returned if the cache has not
+ * been allocated yet or no slot matches.
+ */
+static struct meta_index *locate_meta_index(struct inode *inode, int offset,
+				int index)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	struct meta_index *found = NULL;
+	int slot;
+
+	mutex_lock(&msblk->meta_index_mutex);
+
+	TRACE("locate_meta_index: index %d, offset %d\n", index, offset);
+
+	if (msblk->meta_index != NULL) {
+		for (slot = 0; slot < SQUASHFS_META_SLOTS; slot++) {
+			struct meta_index *cand = &msblk->meta_index[slot];
+
+			/* Wrong inode, or slot in use by somebody else. */
+			if (cand->inode_number != inode->i_ino || cand->locked)
+				continue;
+
+			/* Slot starts outside the wanted range. */
+			if (cand->offset < offset || cand->offset > index)
+				continue;
+
+			TRACE("locate_meta_index: entry %d, offset %d\n", slot,
+					cand->offset);
+			found = cand;
+			offset = found->offset;
+		}
+
+		if (found)
+			found->locked = 1;
+	}
+
+	mutex_unlock(&msblk->meta_index_mutex);
+
+	return found;
+}
+
+
+/*
+ * Claim an unlocked cache slot for this inode and reset it so that it
+ * starts at index 'offset' with skip factor 'skip' and holds no entries.
+ *
+ * The slot is returned locked.  NULL is returned if the cache cannot be
+ * allocated or if every slot is currently locked.
+ */
+static struct meta_index *empty_meta_index(struct inode *inode, int offset,
+				int skip)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	struct meta_index *slot = NULL;
+	int tries;
+
+	mutex_lock(&msblk->meta_index_mutex);
+
+	TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip);
+
+	if (msblk->meta_index == NULL) {
+		/*
+		 * The cache is created on first use rather than at mount
+		 * time, so that it only costs memory once a 'large' file
+		 * has actually been read.
+		 */
+		int n;
+
+		msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS,
+			sizeof(*(msblk->meta_index)), GFP_KERNEL);
+		if (msblk->meta_index == NULL) {
+			ERROR("Failed to allocate meta_index\n");
+			goto unlock;
+		}
+		for (n = 0; n < SQUASHFS_META_SLOTS; n++) {
+			msblk->meta_index[n].inode_number = 0;
+			msblk->meta_index[n].locked = 0;
+		}
+		msblk->next_meta_index = 0;
+	}
+
+	/*
+	 * Starting at next_meta_index, step round the slots until an
+	 * unlocked one is found or all of them have been tried.
+	 */
+	tries = SQUASHFS_META_SLOTS;
+	while (tries && msblk->meta_index[msblk->next_meta_index].locked) {
+		msblk->next_meta_index = (msblk->next_meta_index + 1) %
+			SQUASHFS_META_SLOTS;
+		tries--;
+	}
+
+	if (tries == 0) {
+		TRACE("empty_meta_index: failed!\n");
+		goto unlock;
+	}
+
+	TRACE("empty_meta_index: returned meta entry %d, %p\n",
+			msblk->next_meta_index,
+			&msblk->meta_index[msblk->next_meta_index]);
+
+	slot = &msblk->meta_index[msblk->next_meta_index];
+	msblk->next_meta_index = (msblk->next_meta_index + 1) %
+			SQUASHFS_META_SLOTS;
+
+	slot->inode_number = inode->i_ino;
+	slot->offset = offset;
+	slot->skip = skip;
+	slot->entries = 0;
+	slot->locked = 1;
+
+unlock:
+	mutex_unlock(&msblk->meta_index_mutex);
+	return slot;
+}
+
+
+/*
+ * Clear the locked flag of a cache slot, under the meta index mutex.
+ */
+static void release_meta_index(struct inode *inode, struct meta_index *meta)
+{
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+
+	mutex_lock(&msblk->meta_index_mutex);
+	meta->locked = 0;
+	mutex_unlock(&msblk->meta_index_mutex);
+}
+
+
+/*
+ * Consume the next n entries of a block list, starting at metadata
+ * location <start_block, offset>, which is advanced as entries are read.
+ *
+ * Entries are fetched in batches of at most one page.  Returns the sum of
+ * SQUASHFS_COMPRESSED_SIZE_BLOCK() over the entries read, or a negative
+ * error code if allocation or a metadata read fails.
+ */
+static long long read_indexes(struct super_block *sb, int n,
+				u64 *start_block, int *offset)
+{
+	long long total = 0;
+	__le32 *blist;
+	int batch, k, err;
+
+	blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+	if (blist == NULL) {
+		ERROR("read_indexes: Failed to allocate block_list\n");
+		return -ENOMEM;
+	}
+
+	for (; n; n -= batch) {
+		/* Each entry is four bytes, hence the shifts by two. */
+		batch = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+
+		err = squashfs_read_metadata(sb, blist, start_block,
+				offset, batch << 2);
+		if (err < 0) {
+			ERROR("read_indexes: reading block [%llx:%x]\n",
+				*start_block, *offset);
+			total = err;
+			break;
+		}
+
+		for (k = 0; k < batch; k++) {
+			int size = le32_to_cpu(blist[k]);
+			total += SQUASHFS_COMPRESSED_SIZE_BLOCK(size);
+		}
+	}
+
+	kfree(blist);
+	return total;
+}
+
+
+/*
+ * Each cache index slot has SQUASHFS_META_ENTRIES, each of which
+ * can cache one index -> datablock/blocklist-block mapping.  We wish
+ * to distribute these over the length of the file, entry[0] maps index x,
+ * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on.
+ * The larger the file, the greater the skip factor.  The skip factor is
+ * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure
+ * the number of metadata blocks that need to be read fits into the cache.
+ * If the skip factor is limited in this way then the file will use multiple
+ * slots.
+ *
+ * blocks is the length of the file in datablocks (fill_meta_index() passes
+ * the file size shifted right by block_log).  The result is
+ * blocks / ((SQUASHFS_META_ENTRIES + 1) * SQUASHFS_META_INDEXES) + 1,
+ * capped at SQUASHFS_CACHED_BLKS - 1.
+ */
+static inline int calculate_skip(int blocks)
+{
+	int skip = blocks / ((SQUASHFS_META_ENTRIES + 1)
+		 * SQUASHFS_META_INDEXES);
+	return min(SQUASHFS_CACHED_BLKS - 1, skip + 1);
+}
+
+
+/*
+ * Search and grow the index cache for the specified inode, returning the
+ * on-disk locations of the datablock and block list metadata block
+ * <index_block, index_offset> for index (scaled to nearest cache index).
+ *
+ * The search starts from the inode's own block list position and datablock
+ * start, and on each pass either reuses a cached slot or claims an empty
+ * one, extending it by reading the block list.
+ *
+ * On success the return value is the block index that the returned
+ * locations correspond to.  This can be lower than the requested index,
+ * both because the cache works in steps of SQUASHFS_META_INDEXES * skip
+ * blocks and because the walk stops early when no cache slot can be
+ * obtained.  If reading the block list fails the negative error code from
+ * read_indexes() is returned and the output parameters are left untouched.
+ */
+static int fill_meta_index(struct inode *inode, int index,
+		u64 *index_block, int *index_offset, u64 *data_block)
+{
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int skip = calculate_skip(i_size_read(inode) >> msblk->block_log);
+	int offset = 0;
+	struct meta_index *meta;
+	struct meta_entry *meta_entry;
+	u64 cur_index_block = squashfs_i(inode)->block_list_start;
+	int cur_offset = squashfs_i(inode)->offset;
+	u64 cur_data_block = squashfs_i(inode)->start;
+	int err, i;
+
+	/*
+	 * Scale index to cache index (cache slot entry)
+	 */
+	index /= SQUASHFS_META_INDEXES * skip;
+
+	while (offset < index) {
+		meta = locate_meta_index(inode, offset + 1, index);
+
+		if (meta == NULL) {
+			/*
+			 * Nothing cached beyond offset: claim a fresh slot
+			 * starting at offset + 1, or give up and return
+			 * what has been found so far.
+			 */
+			meta = empty_meta_index(inode, offset + 1, skip);
+			if (meta == NULL)
+				goto all_done;
+		} else {
+			/*
+			 * Jump to the cached entry for index, or to the
+			 * last entry in the slot if index lies beyond it,
+			 * and continue from the locations stored there.
+			 */
+			offset = index < meta->offset + meta->entries ? index :
+				meta->offset + meta->entries - 1;
+			meta_entry = &meta->meta_entry[offset - meta->offset];
+			cur_index_block = meta_entry->index_block +
+				msblk->inode_table;
+			cur_offset = meta_entry->offset;
+			cur_data_block = meta_entry->data_block;
+			TRACE("get_meta_index: offset %d, meta->offset %d, "
+				"meta->entries %d\n", offset, meta->offset,
+				meta->entries);
+			TRACE("get_meta_index: index_block 0x%llx, offset 0x%x"
+				" data_block 0x%llx\n", cur_index_block,
+				cur_offset, cur_data_block);
+		}
+
+		/*
+		 * If necessary grow cache slot by reading block list.  Cache
+		 * slot is extended up to index or to the end of the slot, in
+		 * which case further slots will be used.
+		 */
+		for (i = meta->offset + meta->entries; i <= index &&
+				i < meta->offset + SQUASHFS_META_ENTRIES; i++) {
+			int blocks = skip * SQUASHFS_META_INDEXES;
+			long long res = read_indexes(inode->i_sb, blocks,
+					&cur_index_block, &cur_offset);
+
+			if (res < 0) {
+				if (meta->entries == 0)
+					/*
+					 * Don't leave an empty slot on read
+					 * error allocated to this inode...
+					 */
+					meta->inode_number = 0;
+				err = res;
+				goto failed;
+			}
+
+			/*
+			 * res is the total size of the datablocks stepped
+			 * over; record where the next one starts.  The
+			 * index block is stored relative to the inode table.
+			 */
+			cur_data_block += res;
+			meta_entry = &meta->meta_entry[i - meta->offset];
+			meta_entry->index_block = cur_index_block -
+				msblk->inode_table;
+			meta_entry->offset = cur_offset;
+			meta_entry->data_block = cur_data_block;
+			meta->entries++;
+			offset++;
+		}
+
+		TRACE("get_meta_index: meta->offset %d, meta->entries %d\n",
+				meta->offset, meta->entries);
+
+		release_meta_index(inode, meta);
+	}
+
+all_done:
+	*index_block = cur_index_block;
+	*index_offset = cur_offset;
+	*data_block = cur_data_block;
+
+	/*
+	 * Scale cache index (cache slot entry) to index
+	 */
+	return offset * SQUASHFS_META_INDEXES * skip;
+
+failed:
+	release_meta_index(inode, meta);
+	return err;
+}
+
+
+/*
+ * Work out where datablock 'index' of a file lives on disk.
+ *
+ * fill_meta_index() supplies a mapping for a block at or before index,
+ * together with the position in the block list it corresponds to.  Any
+ * block list entries between that mapping and index are then read and their
+ * sizes added to *block.  Finally the block list entry for index itself is
+ * read.
+ *
+ * Returns that block list entry converted to CPU byte order, or a negative
+ * error code.  *block receives the on-disk start of the datablock.
+ */
+static int read_blocklist(struct inode *inode, int index, u64 *block)
+{
+	u64 list_block;
+	int list_offset;
+	__le32 raw_size;
+	int mapped, err;
+
+	mapped = fill_meta_index(inode, index, &list_block, &list_offset,
+			block);
+
+	TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset"
+		       " 0x%x, block 0x%llx\n", mapped, index, list_block,
+			list_offset, *block);
+
+	if (mapped < 0)
+		return mapped;
+
+	/*
+	 * The cache works at a coarser granularity than single blocks, so
+	 * the mapping returned is usually for an earlier block.  Walk the
+	 * block list over the blocks in between.
+	 */
+	if (mapped < index) {
+		long long skipped = read_indexes(inode->i_sb, index - mapped,
+				&list_block, &list_offset);
+
+		if (skipped < 0)
+			return (int) skipped;
+		*block += skipped;
+	}
+
+	/*
+	 * The block list is now positioned at the entry for index.
+	 */
+	err = squashfs_read_metadata(inode->i_sb, &raw_size, &list_block,
+			&list_offset, sizeof(raw_size));
+	if (err < 0)
+		return err;
+
+	return le32_to_cpu(raw_size);
+}
+
+
+/*
+ * Fill 'page' from the datablock or fragment that backs it.
+ *
+ * The page lies either in a regular datablock, whose location and size are
+ * found with read_blocklist(), or - when it is past the last complete block
+ * of a file that has a fragment - in the fragment block.  A block list entry
+ * of zero is treated as a hole and read as zeros.  As well as 'page', the
+ * other pages covered by the same block are filled if they can be grabbed
+ * from the page cache without waiting.
+ *
+ * Always returns 0.  A page beyond the end of the file is zero filled and
+ * marked up to date.  On a read failure the page error flag is set and the
+ * page is zero filled but not marked up to date.  'page' is unlocked on
+ * every path.
+ */
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+	int bytes, i, offset = 0, sparse = 0;
+	struct squashfs_cache_entry *buffer = NULL;
+	void *pageaddr;
+
+	/* Low bits of a page index giving its position within a datablock. */
+	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	/* Datablock that contains the requested page. */
+	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+	/* First and last page index covered by that datablock. */
+	int start_index = page->index & ~mask;
+	int end_index = start_index | mask;
+	/* Number of complete datablocks in the file. */
+	int file_end = i_size_read(inode) >> msblk->block_log;
+
+	TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+				page->index, squashfs_i(inode)->start);
+
+	/* Page lies entirely beyond the end of the file. */
+	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT))
+		goto out;
+
+	if (index < file_end || squashfs_i(inode)->fragment_block ==
+					SQUASHFS_INVALID_BLK) {
+		/*
+		 * Reading a datablock from disk.  Need to read block list
+		 * to get location and block size.
+		 */
+		u64 block = 0;
+		int bsize = read_blocklist(inode, index, &block);
+		if (bsize < 0)
+			goto error_out;
+
+		if (bsize == 0) { /* hole */
+			bytes = index == file_end ?
+				(i_size_read(inode) & (msblk->block_size - 1)) :
+				 msblk->block_size;
+			sparse = 1;
+		} else {
+			/*
+			 * Read and decompress datablock.
+			 */
+			buffer = squashfs_get_datablock(inode->i_sb,
+								block, bsize);
+			if (buffer->error) {
+				ERROR("Unable to read page, block %llx, size %x"
+					"\n", block, bsize);
+				squashfs_cache_put(buffer);
+				goto error_out;
+			}
+			bytes = buffer->length;
+		}
+	} else {
+		/*
+		 * Datablock is stored inside a fragment (tail-end packed
+		 * block).
+		 */
+		buffer = squashfs_get_fragment(inode->i_sb,
+				squashfs_i(inode)->fragment_block,
+				squashfs_i(inode)->fragment_size);
+
+		if (buffer->error) {
+			ERROR("Unable to read page, block %llx, size %x\n",
+				squashfs_i(inode)->fragment_block,
+				squashfs_i(inode)->fragment_size);
+			squashfs_cache_put(buffer);
+			goto error_out;
+		}
+		bytes = i_size_read(inode) & (msblk->block_size - 1);
+		offset = squashfs_i(inode)->fragment_offset;
+	}
+
+	/*
+	 * Loop copying datablock into pages.  As the datablock likely covers
+	 * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
+	 * grab the pages from the page cache, except for the page that we've
+	 * been called to fill.
+	 */
+	for (i = start_index; i <= end_index && bytes > 0; i++,
+			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+		struct page *push_page;
+		/* For a hole nothing is copied and the page is all zeros. */
+		int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+
+		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
+
+		push_page = (i == page->index) ? page :
+			grab_cache_page_nowait(page->mapping, i);
+
+		/* Could not get the neighbouring page: leave it unfilled. */
+		if (!push_page)
+			continue;
+
+		if (PageUptodate(push_page))
+			goto skip_page;
+
+		pageaddr = kmap_atomic(push_page, KM_USER0);
+		squashfs_copy_data(pageaddr, buffer, offset, avail);
+		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		kunmap_atomic(pageaddr, KM_USER0);
+		flush_dcache_page(push_page);
+		SetPageUptodate(push_page);
+skip_page:
+		unlock_page(push_page);
+		/* Drop the reference on pages grabbed above. */
+		if (i != page->index)
+			page_cache_release(push_page);
+	}
+
+	/* A hole never obtained a cache entry, so there is none to put. */
+	if (!sparse)
+		squashfs_cache_put(buffer);
+
+	return 0;
+
+error_out:
+	SetPageError(page);
+out:
+	pageaddr = kmap_atomic(page, KM_USER0);
+	memset(pageaddr, 0, PAGE_CACHE_SIZE);
+	kunmap_atomic(pageaddr, KM_USER0);
+	flush_dcache_page(page);
+	if (!PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+
+	return 0;
+}
+
+
+/*
+ * Address space operations table; pages are filled by squashfs_readpage().
+ */
+const struct address_space_operations squashfs_aops = {
+	.readpage = squashfs_readpage
+};
-- 
cgit v1.2.3


From 1dc4bba39dd29c6d6f77ca7bf63cd3adeb6fc162 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:24 +0000
Subject: Squashfs: symlink operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/symlink.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100644 fs/squashfs/symlink.c

(limited to 'fs')

diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
new file mode 100644
index 00000000000..83d87880aac
--- /dev/null
+++ b/fs/squashfs/symlink.c
@@ -0,0 +1,118 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * symlink.c
+ */
+
+/*
+ * This file implements code to handle symbolic links.
+ *
+ * The data contents of symbolic links are stored inside the symbolic
+ * link inode within the inode table.  This allows the normally small symbolic
+ * link to be compressed as part of the inode table, achieving much greater
+ * compression than if the symbolic link was compressed individually.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Read one page of a symbolic link's data into the page cache.
+ *
+ * The data is read from metadata starting at the inode's <start, offset>
+ * position.  'index' is the byte offset of this page within the link data
+ * and 'length' the number of link bytes that belong in the page (at most
+ * PAGE_CACHE_SIZE).  Once the last of those bytes has been copied, the
+ * remainder of the page is zero filled.
+ *
+ * Always returns 0.  On success the page is marked up to date; on failure
+ * the page error flag is set instead.  The page is unlocked either way.
+ */
+static int squashfs_symlink_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int index = page->index << PAGE_CACHE_SHIFT;
+	u64 block = squashfs_i(inode)->start;
+	int offset = squashfs_i(inode)->offset;
+	int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+	int bytes, copied;
+	void *pageaddr;
+	struct squashfs_cache_entry *entry;
+
+	TRACE("Entered squashfs_symlink_readpage, page index %ld, start block "
+			"%llx, offset %x\n", page->index, block, offset);
+
+	/*
+	 * Skip index bytes into symlink metadata.
+	 */
+	if (index) {
+		bytes = squashfs_read_metadata(sb, NULL, &block, &offset,
+								index);
+		if (bytes < 0) {
+			ERROR("Unable to read symlink [%llx:%x]\n",
+				squashfs_i(inode)->start,
+				squashfs_i(inode)->offset);
+			goto error_out;
+		}
+	}
+
+	/*
+	 * Read length bytes from symlink metadata.  Squashfs_read_metadata
+	 * is not used here because it can sleep and we want to use
+	 * kmap_atomic to map the page.  Instead call the underlying
+	 * squashfs_cache_get routine.  As length bytes may overlap metadata
+	 * blocks, we may need to call squashfs_cache_get multiple times.
+	 */
+	for (bytes = 0; bytes < length; offset = 0, bytes += copied) {
+		entry = squashfs_cache_get(sb, msblk->block_cache, block, 0);
+		if (entry->error) {
+			ERROR("Unable to read symlink [%llx:%x]\n",
+				squashfs_i(inode)->start,
+				squashfs_i(inode)->offset);
+			squashfs_cache_put(entry);
+			goto error_out;
+		}
+
+		pageaddr = kmap_atomic(page, KM_USER0);
+		copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
+								length - bytes);
+		/*
+		 * Everything still wanted was copied: clear the tail of the
+		 * page.  Otherwise carry on in the following metadata block,
+		 * from its start (offset is reset by the loop).
+		 */
+		if (copied == length - bytes)
+			memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+		else
+			block = entry->next_index;
+		kunmap_atomic(pageaddr, KM_USER0);
+		squashfs_cache_put(entry);
+	}
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+
+error_out:
+	SetPageError(page);
+	unlock_page(page);
+	return 0;
+}
+
+
+/*
+ * Address space operations table; pages are filled by
+ * squashfs_symlink_readpage().
+ */
+const struct address_space_operations squashfs_symlink_aops = {
+	.readpage = squashfs_symlink_readpage
+};
-- 
cgit v1.2.3


From 0aa666190509ffab81c202c5095a166be23961ac Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:25 +0000
Subject: Squashfs: super block operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/super.c | 440 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 440 insertions(+)
 create mode 100644 fs/squashfs/super.c

(limited to 'fs')

diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
new file mode 100644
index 00000000000..a0466d7467b
--- /dev/null
+++ b/fs/squashfs/super.c
@@ -0,0 +1,440 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * super.c
+ */
+
+/*
+ * This file implements code to read the superblock, read and initialise
+ * in-memory structures at mount time, and all the VFS glue code to register
+ * the filesystem.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Forward declarations: both objects are defined at the bottom of this file
+ * but are referenced earlier (squashfs_super_ops by squashfs_fill_super,
+ * squashfs_fs_type by init_squashfs_fs/exit_squashfs_fs).
+ */
+static struct file_system_type squashfs_fs_type;
+static struct super_operations squashfs_super_ops;
+
+/*
+ * Decide whether a filesystem with the given on-disk version and compression
+ * type can be mounted by this driver.
+ *
+ * Returns 0 if it can, -EINVAL otherwise.  Version mismatches are logged;
+ * an unsupported compression type is rejected without a message.
+ */
+static int supported_squashfs_filesystem(short major, short minor, short comp)
+{
+	int too_old = major < SQUASHFS_MAJOR;
+	int too_new = major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR;
+
+	if (too_old) {
+		ERROR("Major/Minor mismatch, older Squashfs %d.%d "
+			"filesystems are unsupported\n", major, minor);
+		return -EINVAL;
+	}
+
+	if (too_new) {
+		ERROR("Major/Minor mismatch, trying to mount newer "
+			"%d.%d filesystem\n", major, minor);
+		ERROR("Please update your kernel\n");
+		return -EINVAL;
+	}
+
+	return comp == ZLIB_COMPRESSION ? 0 : -EINVAL;
+}
+
+
+/*
+ * Mount-time setup: read and validate the on-disk superblock, then allocate
+ * the in-memory structures (zlib workspace, caches, index tables) and the
+ * root dentry.
+ *
+ * Returns 0 on success.  On failure every allocation made here is released,
+ * sb->s_fs_info is reset to NULL and a negative errno is returned.
+ */
+static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct squashfs_sb_info *msblk;
+	struct squashfs_super_block *sblk = NULL;
+	char b[BDEVNAME_SIZE];
+	struct inode *root;
+	long long root_inode;
+	unsigned short flags;
+	unsigned int fragments;
+	u64 lookup_table_start;
+	int err;
+
+	TRACE("Entered squashfs_fill_superblock\n");
+
+	sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
+	if (sb->s_fs_info == NULL) {
+		ERROR("Failed to allocate squashfs_sb_info\n");
+		return -ENOMEM;
+	}
+	msblk = sb->s_fs_info;
+
+	msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(),
+		GFP_KERNEL);
+	if (msblk->stream.workspace == NULL) {
+		ERROR("Failed to allocate zlib workspace\n");
+		goto failure;
+	}
+
+	sblk = kzalloc(sizeof(*sblk), GFP_KERNEL);
+	if (sblk == NULL) {
+		ERROR("Failed to allocate squashfs_super_block\n");
+		goto failure;
+	}
+
+	msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE);
+	/*
+	 * sb_min_blocksize returns 0 if the block size could not be set;
+	 * ffz(~0) below would then be undefined, so fail the mount instead.
+	 */
+	if (msblk->devblksize == 0) {
+		ERROR("unable to set blocksize\n");
+		err = -EINVAL;
+		goto failed_mount;
+	}
+	msblk->devblksize_log2 = ffz(~msblk->devblksize);
+
+	mutex_init(&msblk->read_data_mutex);
+	mutex_init(&msblk->meta_index_mutex);
+
+	/*
+	 * msblk->bytes_used is checked in squashfs_read_table to ensure reads
+	 * are not beyond filesystem end.  But as we're using
+	 * squashfs_read_table here to read the superblock (including the value
+	 * of bytes_used) we need to set it to an initial sensible dummy value
+	 */
+	msblk->bytes_used = sizeof(*sblk);
+	err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk));
+
+	if (err < 0) {
+		ERROR("unable to read squashfs_super_block\n");
+		goto failed_mount;
+	}
+
+	/* Check it is a SQUASHFS superblock */
+	sb->s_magic = le32_to_cpu(sblk->s_magic);
+	if (sb->s_magic != SQUASHFS_MAGIC) {
+		if (!silent)
+			ERROR("Can't find a SQUASHFS superblock on %s\n",
+						bdevname(sb->s_bdev, b));
+		err = -EINVAL;
+		goto failed_mount;
+	}
+
+	/* Check the MAJOR & MINOR versions and compression type */
+	err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major),
+			le16_to_cpu(sblk->s_minor),
+			le16_to_cpu(sblk->compression));
+	if (err < 0)
+		goto failed_mount;
+
+	/* Every sanity check below fails the mount with -EINVAL */
+	err = -EINVAL;
+
+	/*
+	 * Check if there's xattrs in the filesystem.  These are not
+	 * supported in this version, so warn that they will be ignored.
+	 */
+	if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
+		ERROR("Xattrs in filesystem, these will be ignored\n");
+
+	/* Check the filesystem does not extend beyond the end of the
+	   block device */
+	msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
+	if (msblk->bytes_used < 0 || msblk->bytes_used >
+			i_size_read(sb->s_bdev->bd_inode))
+		goto failed_mount;
+
+	/* Check block size for sanity */
+	msblk->block_size = le32_to_cpu(sblk->block_size);
+	if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE)
+		goto failed_mount;
+
+	msblk->block_log = le16_to_cpu(sblk->block_log);
+	if (msblk->block_log > SQUASHFS_FILE_MAX_LOG)
+		goto failed_mount;
+
+	/*
+	 * block_size and block_log describe the same quantity (statfs reports
+	 * one and shifts by the other); a mismatch means the superblock is
+	 * corrupted.  block_log has already been bounded above.
+	 */
+	if (msblk->block_size != (1 << msblk->block_log))
+		goto failed_mount;
+
+	/* Check the root inode for sanity */
+	root_inode = le64_to_cpu(sblk->root_inode);
+	if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE)
+		goto failed_mount;
+
+	msblk->inode_table = le64_to_cpu(sblk->inode_table_start);
+	msblk->directory_table = le64_to_cpu(sblk->directory_table_start);
+	msblk->inodes = le32_to_cpu(sblk->inodes);
+	flags = le16_to_cpu(sblk->flags);
+
+	TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+	TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
+				? "un" : "");
+	TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
+				? "un" : "");
+	TRACE("Filesystem size %lld bytes\n", msblk->bytes_used);
+	TRACE("Block size %d\n", msblk->block_size);
+	TRACE("Number of inodes %d\n", msblk->inodes);
+	TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments));
+	TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids));
+	TRACE("sblk->inode_table_start %llx\n", msblk->inode_table);
+	TRACE("sblk->directory_table_start %llx\n", msblk->directory_table);
+	TRACE("sblk->fragment_table_start %llx\n",
+		(u64) le64_to_cpu(sblk->fragment_table_start));
+	TRACE("sblk->id_table_start %llx\n",
+		(u64) le64_to_cpu(sblk->id_table_start));
+
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_flags |= MS_RDONLY;
+	sb->s_op = &squashfs_super_ops;
+
+	/* From here on the default failure is an allocation failure */
+	err = -ENOMEM;
+
+	msblk->block_cache = squashfs_cache_init("metadata",
+			SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
+	if (msblk->block_cache == NULL)
+		goto failed_mount;
+
+	/* Allocate read_page block */
+	msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+	if (msblk->read_page == NULL) {
+		ERROR("Failed to allocate read_page block\n");
+		goto failed_mount;
+	}
+
+	/* Allocate and read id index table */
+	msblk->id_table = squashfs_read_id_index_table(sb,
+		le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
+	if (IS_ERR(msblk->id_table)) {
+		err = PTR_ERR(msblk->id_table);
+		/* Reset so the kfree in failed_mount sees NULL, not ERR_PTR */
+		msblk->id_table = NULL;
+		goto failed_mount;
+	}
+
+	/* No fragments: skip the fragment cache and fragment index table */
+	fragments = le32_to_cpu(sblk->fragments);
+	if (fragments == 0)
+		goto allocate_lookup_table;
+
+	msblk->fragment_cache = squashfs_cache_init("fragment",
+		SQUASHFS_CACHED_FRAGMENTS, msblk->block_size);
+	if (msblk->fragment_cache == NULL) {
+		err = -ENOMEM;
+		goto failed_mount;
+	}
+
+	/* Allocate and read fragment index table */
+	msblk->fragment_index = squashfs_read_fragment_index_table(sb,
+		le64_to_cpu(sblk->fragment_table_start), fragments);
+	if (IS_ERR(msblk->fragment_index)) {
+		err = PTR_ERR(msblk->fragment_index);
+		msblk->fragment_index = NULL;
+		goto failed_mount;
+	}
+
+allocate_lookup_table:
+	/* No inode lookup table: leave s_export_op unset */
+	lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
+	if (lookup_table_start == SQUASHFS_INVALID_BLK)
+		goto allocate_root;
+
+	/* Allocate and read inode lookup table */
+	msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
+		lookup_table_start, msblk->inodes);
+	if (IS_ERR(msblk->inode_lookup_table)) {
+		err = PTR_ERR(msblk->inode_lookup_table);
+		msblk->inode_lookup_table = NULL;
+		goto failed_mount;
+	}
+
+	sb->s_export_op = &squashfs_export_ops;
+
+allocate_root:
+	root = new_inode(sb);
+	if (!root) {
+		err = -ENOMEM;
+		goto failed_mount;
+	}
+
+	err = squashfs_read_inode(root, root_inode);
+	if (err) {
+		iget_failed(root);
+		goto failed_mount;
+	}
+	insert_inode_hash(root);
+
+	sb->s_root = d_alloc_root(root);
+	if (sb->s_root == NULL) {
+		ERROR("Root inode create failed\n");
+		err = -ENOMEM;
+		iput(root);
+		goto failed_mount;
+	}
+
+	TRACE("Leaving squashfs_fill_super\n");
+	/* The raw on-disk superblock is only needed during mount */
+	kfree(sblk);
+	return 0;
+
+failed_mount:
+	/*
+	 * msblk was kzalloc'ed, so members not yet set up are NULL, which
+	 * squashfs_cache_delete and kfree both accept.
+	 */
+	squashfs_cache_delete(msblk->block_cache);
+	squashfs_cache_delete(msblk->fragment_cache);
+	squashfs_cache_delete(msblk->read_page);
+	kfree(msblk->inode_lookup_table);
+	kfree(msblk->fragment_index);
+	kfree(msblk->id_table);
+	kfree(msblk->stream.workspace);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+	kfree(sblk);
+	return err;
+
+failure:
+	/* Early allocation failures: sblk is still NULL on this path */
+	kfree(msblk->stream.workspace);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+	return -ENOMEM;
+}
+
+
+/*
+ * Fill in filesystem statistics for statfs(2).
+ *
+ * The block count is bytes_used rounded up to whole filesystem blocks.  All
+ * free counts are reported as zero.  Always returns 0.
+ */
+static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct squashfs_sb_info *sbi = dentry->d_sb->s_fs_info;
+
+	TRACE("Entered squashfs_statfs\n");
+
+	/* Identity and geometry */
+	buf->f_type = SQUASHFS_MAGIC;
+	buf->f_namelen = SQUASHFS_NAME_LEN;
+	buf->f_bsize = sbi->block_size;
+	buf->f_blocks = ((sbi->bytes_used - 1) >> sbi->block_log) + 1;
+	buf->f_files = sbi->inodes;
+
+	/* Nothing is ever free */
+	buf->f_bavail = 0;
+	buf->f_bfree = 0;
+	buf->f_ffree = 0;
+
+	return 0;
+}
+
+
+/*
+ * Remount handler: whatever flags were requested, force MS_RDONLY to be
+ * set.  sb and data are unused.  Always returns 0.
+ */
+static int squashfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags = *flags | MS_RDONLY;
+
+	return 0;
+}
+
+
+/*
+ * Release everything hanging off sb->s_fs_info at unmount: the three caches,
+ * the id, fragment and inode lookup index tables, the meta index and the
+ * zlib workspace.  Does nothing if s_fs_info is already NULL.
+ */
+static void squashfs_put_super(struct super_block *sb)
+{
+	if (sb->s_fs_info) {
+		struct squashfs_sb_info *sbi = sb->s_fs_info;
+		squashfs_cache_delete(sbi->block_cache);
+		squashfs_cache_delete(sbi->fragment_cache);
+		squashfs_cache_delete(sbi->read_page);
+		kfree(sbi->id_table);
+		kfree(sbi->fragment_index);
+		/*
+		 * Allocated by squashfs_fill_super when the filesystem has
+		 * an inode lookup table; otherwise still NULL from kzalloc,
+		 * which kfree accepts.
+		 */
+		kfree(sbi->inode_lookup_table);
+		kfree(sbi->meta_index);
+		kfree(sbi->stream.workspace);
+		kfree(sb->s_fs_info);
+		sb->s_fs_info = NULL;
+	}
+}
+
+
+/*
+ * .get_sb handler: hand the mount request to get_sb_bdev, with
+ * squashfs_fill_super as the superblock fill callback, and return its
+ * result.
+ */
+static int squashfs_get_sb(struct file_system_type *fs_type, int flags,
+				const char *dev_name, void *data,
+				struct vfsmount *mnt)
+{
+	int err;
+
+	err = get_sb_bdev(fs_type, flags, dev_name, data,
+				squashfs_fill_super, mnt);
+	return err;
+}
+
+
+/*
+ * Slab cache for struct squashfs_inode_info.  Created by init_inodecache,
+ * destroyed by destroy_inodecache, and used by squashfs_alloc_inode and
+ * squashfs_destroy_inode.
+ */
+static struct kmem_cache *squashfs_inode_cachep;
+
+
+/*
+ * Constructor passed to kmem_cache_create for the inode cache: foo points at
+ * a struct squashfs_inode_info whose embedded VFS inode gets its one-time
+ * initialisation here.
+ */
+static void init_once(void *foo)
+{
+	struct squashfs_inode_info *info = foo;
+	struct inode *vfs_inode = &info->vfs_inode;
+
+	inode_init_once(vfs_inode);
+}
+
+
+/*
+ * Create the slab cache used for squashfs inodes.
+ *
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+static int __init init_inodecache(void)
+{
+	squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
+		sizeof(struct squashfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+	if (squashfs_inode_cachep == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+
+/* Destroy the inode slab cache created by init_inodecache. */
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(squashfs_inode_cachep);
+}
+
+
+/*
+ * Module initialisation: create the inode cache, then register the
+ * filesystem type.  If registration fails the inode cache is destroyed
+ * again.
+ *
+ * Returns 0 on success, otherwise the error from the step that failed.
+ */
+static int __init init_squashfs_fs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&squashfs_fs_type);
+	if (err)
+		goto out_cache;
+
+	printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+		"Phillip Lougher\n");
+
+	return 0;
+
+out_cache:
+	destroy_inodecache();
+out:
+	return err;
+}
+
+
+/*
+ * Module exit: undo init_squashfs_fs in reverse order, unregistering the
+ * filesystem type before destroying the inode cache.
+ */
+static void __exit exit_squashfs_fs(void)
+{
+	unregister_filesystem(&squashfs_fs_type);
+	destroy_inodecache();
+}
+
+
+/*
+ * .alloc_inode handler: allocate a squashfs_inode_info from the inode slab
+ * cache and return the VFS inode embedded in it, or NULL if the allocation
+ * failed.
+ */
+static struct inode *squashfs_alloc_inode(struct super_block *sb)
+{
+	struct squashfs_inode_info *info;
+
+	info = kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+	if (info == NULL)
+		return NULL;
+
+	return &info->vfs_inode;
+}
+
+
+/*
+ * .destroy_inode handler: return the squashfs_inode_info containing inode
+ * to the inode slab cache.
+ */
+static void squashfs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode));
+}
+
+
+/* Filesystem type registered by init_squashfs_fs. */
+static struct file_system_type squashfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "squashfs",
+	.get_sb		= squashfs_get_sb,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV,
+};
+
+/* Superblock operations, installed as sb->s_op by squashfs_fill_super. */
+static struct super_operations squashfs_super_ops = {
+	.alloc_inode	= squashfs_alloc_inode,
+	.destroy_inode	= squashfs_destroy_inode,
+	.statfs		= squashfs_statfs,
+	.put_super	= squashfs_put_super,
+	.remount_fs	= squashfs_remount,
+};
+
+/* Module entry/exit hooks and module metadata. */
+module_init(init_squashfs_fs);
+module_exit(exit_squashfs_fs);
+MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem");
+MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 122601408d20c77704268f1dea9f9ce4abf997c2 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:25 +0000
Subject: Squashfs: export operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/export.c | 155 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 fs/squashfs/export.c

(limited to 'fs')

diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
new file mode 100644
index 00000000000..69e971d5ddc
--- /dev/null
+++ b/fs/squashfs/export.c
@@ -0,0 +1,155 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * export.c
+ */
+
+/*
+ * This file implements code to make Squashfs filesystems exportable (NFS etc.)
+ *
+ * The export code uses an inode lookup table to map inode numbers passed in
+ * filehandles to an inode location on disk.  This table is stored compressed
+ * into metadata blocks.  A second index table is used to locate these.  This
+ * second index table for speed of access (and because it is small) is read at
+ * mount time and cached in memory.
+ *
+ * The inode lookup table is used only by the export code, inode disk
+ * locations are directly encoded in directories, enabling direct access
+ * without an intermediate lookup for all operations except the export ops.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up inode number (ino) in table, returning the inode location.
+ *
+ * ino_num can originate from a file handle (see squashfs_fh_to_dentry and
+ * squashfs_fh_to_parent) and so is validated before it is used to index the
+ * in-memory lookup table: valid inode numbers are 1..msblk->inodes.
+ *
+ * Returns the inode location on success, -EINVAL if ino_num is out of
+ * range, or the negative error returned by squashfs_read_metadata.
+ */
+static long long squashfs_inode_lookup(struct super_block *sb, int ino_num)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int blk, offset, err;
+	u64 start;
+	__le64 ino;
+
+	TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num);
+
+	/*
+	 * The table was read with a length derived from msblk->inodes, so
+	 * anything outside 1..inodes would index past its end.  Callers pass
+	 * an unsigned value, so very large numbers arrive here as negative.
+	 */
+	if (ino_num <= 0 || (unsigned int) (ino_num - 1) >= msblk->inodes)
+		return -EINVAL;
+
+	blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1);
+	offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1);
+	start = le64_to_cpu(msblk->inode_lookup_table[blk]);
+
+	err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino));
+	if (err < 0)
+		return err;
+
+	TRACE("squashfs_inode_lookup, inode = 0x%llx\n",
+		(u64) le64_to_cpu(ino));
+
+	return le64_to_cpu(ino);
+}
+
+
+/*
+ * Map an inode number to a dentry: look up the inode's on-disk location,
+ * read the inode with squashfs_iget and obtain a dentry alias for it.
+ *
+ * Returns ERR_PTR(-ENOENT) if the lookup fails, otherwise whatever
+ * d_obtain_alias returns.
+ */
+static struct dentry *squashfs_export_iget(struct super_block *sb,
+	unsigned int ino_num)
+{
+	long long ino;
+
+	TRACE("Entered squashfs_export_iget\n");
+
+	ino = squashfs_inode_lookup(sb, ino_num);
+	if (ino < 0)
+		return ERR_PTR(-ENOENT);
+
+	return d_obtain_alias(squashfs_iget(sb, ino, ino_num));
+}
+
+
+/*
+ * .fh_to_dentry handler: decode the inode number from a file handle.
+ *
+ * Only FILEID_INO32_GEN and FILEID_INO32_GEN_PARENT handles of length at
+ * least 2 are accepted; anything else yields NULL.
+ */
+static struct dentry *squashfs_fh_to_dentry(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	int known_type = fh_type == FILEID_INO32_GEN ||
+			fh_type == FILEID_INO32_GEN_PARENT;
+
+	if (!known_type || fh_len < 2)
+		return NULL;
+
+	return squashfs_export_iget(sb, fid->i32.ino);
+}
+
+
+/*
+ * .fh_to_parent handler: decode the parent inode number from a file handle.
+ *
+ * Only FILEID_INO32_GEN_PARENT handles of length at least 4 are accepted;
+ * anything else yields NULL.
+ */
+static struct dentry *squashfs_fh_to_parent(struct super_block *sb,
+		struct fid *fid, int fh_len, int fh_type)
+{
+	if (fh_len < 4)
+		return NULL;
+
+	if (fh_type != FILEID_INO32_GEN_PARENT)
+		return NULL;
+
+	return squashfs_export_iget(sb, fid->i32.parent_ino);
+}
+
+
+/*
+ * .get_parent handler: return a dentry for the parent directory, using the
+ * parent inode number recorded in the child's squashfs inode info.
+ */
+static struct dentry *squashfs_get_parent(struct dentry *child)
+{
+	struct inode *child_inode = child->d_inode;
+	struct super_block *sb = child_inode->i_sb;
+	unsigned int parent_ino;
+
+	parent_ino = squashfs_i(child_inode)->parent;
+
+	return squashfs_export_iget(sb, parent_ino);
+}
+
+
+/*
+ * Read the uncompressed inode lookup table index off disk into a newly
+ * allocated buffer.
+ *
+ * Returns the buffer (the caller owns it and must kfree it), or an ERR_PTR:
+ * -ENOMEM if the allocation failed, otherwise the error returned by
+ * squashfs_read_table.
+ */
+__le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
+		u64 lookup_table_start, unsigned int inodes)
+{
+	unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes);
+	__le64 *table;
+	int err;
+
+	TRACE("In read_inode_lookup_table, length %d\n", length);
+
+	/* Allocate inode lookup table indexes */
+	table = kmalloc(length, GFP_KERNEL);
+	if (table == NULL) {
+		ERROR("Failed to allocate inode lookup table\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = squashfs_read_table(sb, table, lookup_table_start, length);
+	if (err >= 0)
+		return table;
+
+	ERROR("unable to read inode lookup table\n");
+	kfree(table);
+	return ERR_PTR(err);
+}
+
+
+/*
+ * Export operations; squashfs_fill_super installs these only when the
+ * filesystem has an inode lookup table.
+ */
+const struct export_operations squashfs_export_ops = {
+	.fh_to_dentry	= squashfs_fh_to_dentry,
+	.fh_to_parent	= squashfs_fh_to_parent,
+	.get_parent	= squashfs_get_parent,
+};
-- 
cgit v1.2.3


From 122edd1514aaaa90fd894663ecfcb4135063c63d Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:25 +0000
Subject: Squashfs: fragment block operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/fragment.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 fs/squashfs/fragment.c

(limited to 'fs')

diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
new file mode 100644
index 00000000000..b5a2c15bbbc
--- /dev/null
+++ b/fs/squashfs/fragment.c
@@ -0,0 +1,98 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * fragment.c
+ */
+
+/*
+ * This file implements code to handle compressed fragments (tail-end packed
+ * datablocks).
+ *
+ * Regular files contain a fragment index which is mapped to a fragment
+ * location on disk and compressed size using a fragment lookup table.
+ * Like everything in Squashfs this fragment lookup table is itself stored
+ * compressed into metadata blocks.  A second index table is used to locate
+ * these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look up a fragment in the fragment index table.
+ *
+ * On success stores the fragment's on-disk location in *fragment_block and
+ * returns the size field of its fragment entry.  Returns the negative error
+ * from squashfs_read_metadata on failure.
+ */
+int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment,
+				u64 *fragment_block)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct squashfs_fragment_entry entry;
+	int index = SQUASHFS_FRAGMENT_INDEX(fragment);
+	int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment);
+	u64 start_block = le64_to_cpu(msblk->fragment_index[index]);
+	int res;
+
+	res = squashfs_read_metadata(sb, &entry, &start_block, &offset,
+					sizeof(entry));
+	if (res < 0)
+		return res;
+
+	*fragment_block = le64_to_cpu(entry.start_block);
+	res = le32_to_cpu(entry.size);
+
+	return res;
+}
+
+
+/*
+ * Read the uncompressed fragment lookup table index off disk into a newly
+ * allocated buffer.
+ *
+ * Returns the buffer (the caller owns it and must kfree it), or an ERR_PTR:
+ * -ENOMEM if the allocation failed, otherwise the error returned by
+ * squashfs_read_table.
+ */
+__le64 *squashfs_read_fragment_index_table(struct super_block *sb,
+	u64 fragment_table_start, unsigned int fragments)
+{
+	unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments);
+	__le64 *table;
+	int err;
+
+	/* Allocate fragment lookup table indexes */
+	table = kmalloc(length, GFP_KERNEL);
+	if (table == NULL) {
+		ERROR("Failed to allocate fragment index table\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = squashfs_read_table(sb, table, fragment_table_start, length);
+	if (err >= 0)
+		return table;
+
+	ERROR("unable to read fragment index table\n");
+	kfree(table);
+	return ERR_PTR(err);
+}
-- 
cgit v1.2.3


From 8256c8f631937bb08b3881c380c42ff6874a82f0 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:26 +0000
Subject: Squashfs: uid/gid lookup operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/id.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 fs/squashfs/id.c

(limited to 'fs')

diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
new file mode 100644
index 00000000000..3795b837ba2
--- /dev/null
+++ b/fs/squashfs/id.c
@@ -0,0 +1,94 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * id.c
+ */
+
+/*
+ * This file implements code to handle uids and gids.
+ *
+ * For space efficiency regular files store uid and gid indexes, which are
+ * converted to 32-bit uids/gids using an id look up table.  This table is
+ * stored compressed into metadata blocks.  A second index table is used to
+ * locate these.  This second index table for speed of access (and because it
+ * is small) is read at mount time and cached in memory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Translate a uid/gid index into the real 32-bit id via the id lookup table.
+ *
+ * On success stores the id in *id and returns 0; otherwise returns the
+ * negative error from squashfs_read_metadata and leaves *id untouched.
+ */
+int squashfs_get_id(struct super_block *sb, unsigned int index,
+					unsigned int *id)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int blk = SQUASHFS_ID_BLOCK(index);
+	int offset = SQUASHFS_ID_BLOCK_OFFSET(index);
+	u64 start = le64_to_cpu(msblk->id_table[blk]);
+	__le32 raw_id;
+	int res;
+
+	res = squashfs_read_metadata(sb, &raw_id, &start, &offset,
+							sizeof(raw_id));
+	if (res >= 0) {
+		*id = le32_to_cpu(raw_id);
+		res = 0;
+	}
+
+	return res;
+}
+
+
+/*
+ * Read uncompressed id lookup table indexes from disk into memory
+ *
+ * Returns a newly allocated buffer (the caller owns it and must kfree it),
+ * or an ERR_PTR: -EINVAL if no_ids is zero, -ENOMEM if the allocation
+ * failed, otherwise the error returned by squashfs_read_table.
+ */
+__le64 *squashfs_read_id_index_table(struct super_block *sb,
+			u64 id_table_start, unsigned short no_ids)
+{
+	unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids);
+	__le64 *id_table;
+	int err;
+
+	TRACE("In read_id_index_table, length %d\n", length);
+
+	/*
+	 * With no ids the index would be zero bytes long, yet
+	 * squashfs_get_id dereferences it for every lookup.  Treat such a
+	 * superblock as corrupted rather than hand back an empty table.
+	 */
+	if (no_ids == 0)
+		return ERR_PTR(-EINVAL);
+
+	/* Allocate id lookup table indexes */
+	id_table = kmalloc(length, GFP_KERNEL);
+	if (id_table == NULL) {
+		ERROR("Failed to allocate id index table\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	err = squashfs_read_table(sb, id_table, id_table_start, length);
+	if (err < 0) {
+		ERROR("unable to read id index table\n");
+		kfree(id_table);
+		return ERR_PTR(err);
+	}
+
+	return id_table;
+}
-- 
cgit v1.2.3


From f400e12656ab518be107febfe2315fb1eab5a342 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:26 +0000
Subject: Squashfs: cache operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/cache.c | 412 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 fs/squashfs/cache.c

(limited to 'fs')

diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
new file mode 100644
index 00000000000..f29eda16d25
--- /dev/null
+++ b/fs/squashfs/cache.c
@@ -0,0 +1,412 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * cache.c
+ */
+
+/*
+ * Blocks in Squashfs are compressed.  To avoid repeatedly decompressing
+ * recently accessed data Squashfs uses two small metadata and fragment caches.
+ *
+ * This file implements a generic cache implementation used for both caches,
+ * plus functions layered on top of the generic cache implementation to
+ * access the metadata and fragment caches.
+ *
+ * To avoid out of memory and fragmentation issues with vmalloc the cache
+ * uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * It should be noted that the cache is not used for file datablocks, these
+ * are decompressed and cached in the page-cache in the normal way.  The
+ * cache is only used to temporarily cache fragment and metadata blocks
+ * which have been read as a result of a metadata (i.e. inode or
+ * directory) or fragment access.  Because metadata and fragments are packed
+ * together into blocks (to gain greater compression) the read of a particular
+ * piece of metadata or fragment will retrieve other metadata/fragments which
+ * have been packed with it, these because of locality-of-reference may be read
+ * in the near future. Temporarily caching them ensures they are available for
+ * near future access without requiring an additional read and decompress.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/zlib.h>
+#include <linux/pagemap.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Look-up block in cache, and increment usage count.  If not in cache, read
+ * and decompress it from disk.
+ *
+ * May sleep: either waiting for an unused cache entry, or waiting for
+ * another process to finish filling the entry for this block.
+ *
+ * Always returns an entry with its refcount held; read failures are
+ * reported through entry->error (negative) rather than a NULL return.  The
+ * caller releases the entry with squashfs_cache_put.
+ */
+struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
+	struct squashfs_cache *cache, u64 block, int length)
+{
+	int i, n;
+	struct squashfs_cache_entry *entry;
+
+	spin_lock(&cache->lock);
+
+	while (1) {
+		/* Linear search of the cache for this block */
+		for (i = 0; i < cache->entries; i++)
+			if (cache->entry[i].block == block)
+				break;
+
+		if (i == cache->entries) {
+			/*
+			 * Block not in cache, if all cache entries are used
+			 * go to sleep waiting for one to become available.
+			 */
+			if (cache->unused == 0) {
+				cache->num_waiters++;
+				spin_unlock(&cache->lock);
+				wait_event(cache->wait_queue, cache->unused);
+				spin_lock(&cache->lock);
+				cache->num_waiters--;
+				/*
+				 * The lock was dropped while sleeping, so
+				 * redo the search from the top.
+				 */
+				continue;
+			}
+
+			/*
+			 * At least one unused cache entry.  A simple
+			 * round-robin strategy is used to choose the entry to
+			 * be evicted from the cache.
+			 */
+			i = cache->next_blk;
+			for (n = 0; n < cache->entries; n++) {
+				if (cache->entry[i].refcount == 0)
+					break;
+				i = (i + 1) % cache->entries;
+			}
+
+			cache->next_blk = (i + 1) % cache->entries;
+			entry = &cache->entry[i];
+
+			/*
+			 * Initialise chosen cache entry, and fill it in from
+			 * disk.
+			 */
+			cache->unused--;
+			entry->block = block;
+			entry->refcount = 1;
+			entry->pending = 1;
+			entry->num_waiters = 0;
+			entry->error = 0;
+			spin_unlock(&cache->lock);
+
+			/* Read with the lock dropped; pending marks it busy */
+			entry->length = squashfs_read_data(sb, entry->data,
+				block, length, &entry->next_index,
+				cache->block_size);
+
+			spin_lock(&cache->lock);
+
+			/* A negative length is the error code of the read */
+			if (entry->length < 0)
+				entry->error = entry->length;
+
+			entry->pending = 0;
+
+			/*
+			 * While filling this entry one or more other processes
+			 * have looked it up in the cache, and have slept
+			 * waiting for it to become available.
+			 */
+			if (entry->num_waiters) {
+				spin_unlock(&cache->lock);
+				wake_up_all(&entry->wait_queue);
+			} else
+				spin_unlock(&cache->lock);
+
+			goto out;
+		}
+
+		/*
+		 * Block already in cache.  Increment refcount so it doesn't
+		 * get reused until we're finished with it, if it was
+		 * previously unused there's one less cache entry available
+		 * for reuse.
+		 */
+		entry = &cache->entry[i];
+		if (entry->refcount == 0)
+			cache->unused--;
+		entry->refcount++;
+
+		/*
+		 * If the entry is currently being filled in by another process
+		 * go to sleep waiting for it to become available.
+		 */
+		if (entry->pending) {
+			entry->num_waiters++;
+			spin_unlock(&cache->lock);
+			wait_event(entry->wait_queue, !entry->pending);
+		} else
+			spin_unlock(&cache->lock);
+
+		goto out;
+	}
+
+out:
+	TRACE("Got %s %d, start block %lld, refcount %d, error %d\n",
+		cache->name, i, entry->block, entry->refcount, entry->error);
+
+	if (entry->error)
+		ERROR("Unable to read %s cache entry [%llx]\n", cache->name,
+							block);
+	return entry;
+}
+
+
+/*
+ * Drop a reference on a cache entry.  When the last reference goes the
+ * entry becomes available for reuse, and one process waiting for a free
+ * entry (if any) is woken.
+ */
+void squashfs_cache_put(struct squashfs_cache_entry *entry)
+{
+	struct squashfs_cache *cache = entry->cache;
+	int wake = 0;
+
+	spin_lock(&cache->lock);
+	if (--entry->refcount == 0) {
+		cache->unused++;
+		/* Decide under the lock, wake after it is released */
+		wake = cache->num_waiters != 0;
+	}
+	spin_unlock(&cache->lock);
+
+	if (wake)
+		wake_up(&cache->wait_queue);
+}
+
+/*
+ * Free a cache together with all of its entries and their page buffers.
+ * A NULL cache is accepted and ignored, as are entries whose buffer array
+ * was never allocated.
+ */
+void squashfs_cache_delete(struct squashfs_cache *cache)
+{
+	int i, j;
+
+	if (!cache)
+		return;
+
+	for (i = 0; i < cache->entries; i++) {
+		struct squashfs_cache_entry *entry = &cache->entry[i];
+
+		if (!entry->data)
+			continue;
+
+		for (j = 0; j < cache->pages; j++)
+			kfree(entry->data[j]);
+		kfree(entry->data);
+	}
+
+	kfree(cache->entry);
+	kfree(cache);
+}
+
+
+/*
+ * Initialise cache allocating the specified number of entries, each of
+ * size block_size.  To avoid vmalloc fragmentation issues each entry
+ * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers.
+ *
+ * A block_size smaller than PAGE_CACHE_SIZE still gets one buffer per
+ * entry, so every entry always has somewhere to hold its data.
+ *
+ * Returns the new cache, or NULL if any allocation failed (in which case
+ * everything allocated so far has been released).
+ */
+struct squashfs_cache *squashfs_cache_init(char *name, int entries,
+	int block_size)
+{
+	int i, j;
+	struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+
+	if (cache == NULL) {
+		ERROR("Failed to allocate %s cache\n", name);
+		return NULL;
+	}
+
+	cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL);
+	if (cache->entry == NULL) {
+		ERROR("Failed to allocate %s cache\n", name);
+		goto cleanup;
+	}
+
+	cache->next_blk = 0;
+	cache->unused = entries;
+	cache->entries = entries;
+	cache->block_size = block_size;
+	cache->pages = block_size >> PAGE_CACHE_SHIFT;
+	/*
+	 * The shift rounds down to zero when block_size is less than
+	 * PAGE_CACHE_SIZE, which would leave entries with no buffer at all.
+	 */
+	cache->pages = cache->pages ? cache->pages : 1;
+	cache->name = name;
+	cache->num_waiters = 0;
+	spin_lock_init(&cache->lock);
+	init_waitqueue_head(&cache->wait_queue);
+
+	for (i = 0; i < entries; i++) {
+		struct squashfs_cache_entry *entry = &cache->entry[i];
+
+		init_waitqueue_head(&cache->entry[i].wait_queue);
+		entry->cache = cache;
+		/* No block cached yet: must never match a lookup */
+		entry->block = SQUASHFS_INVALID_BLK;
+		entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
+		if (entry->data == NULL) {
+			ERROR("Failed to allocate %s cache entry\n", name);
+			goto cleanup;
+		}
+
+		for (j = 0; j < cache->pages; j++) {
+			entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+			if (entry->data[j] == NULL) {
+				ERROR("Failed to allocate %s buffer\n", name);
+				goto cleanup;
+			}
+		}
+	}
+
+	return cache;
+
+cleanup:
+	/*
+	 * squashfs_cache_delete copes with a partially built cache: slots
+	 * not yet allocated are still NULL from kzalloc/kcalloc.
+	 */
+	squashfs_cache_delete(cache);
+	return NULL;
+}
+
+
+/*
+ * Copy up to length bytes out of a cache entry, starting offset bytes into
+ * the entry, into buffer.  If fewer than length bytes are available only
+ * those are copied.  A NULL buffer copies nothing but still reports how
+ * many bytes would have been copied.  Returns the number of bytes copied.
+ */
+int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
+		int offset, int length)
+{
+	int left = length;
+
+	if (length == 0)
+		return 0;
+	if (buffer == NULL)
+		return min(length, entry->length - offset);
+
+	while (offset < entry->length) {
+		/* Position inside the page buffer that holds this offset */
+		int pg_off = offset % PAGE_CACHE_SIZE;
+		void *src = entry->data[offset / PAGE_CACHE_SIZE] + pg_off;
+		/* Limited by both the end of the data and the end of the page */
+		int chunk = min_t(int, entry->length - offset,
+				PAGE_CACHE_SIZE - pg_off);
+
+		if (chunk >= left) {
+			/* This page satisfies the rest of the request */
+			memcpy(buffer, src, left);
+			return length;
+		}
+
+		memcpy(buffer, src, chunk);
+		buffer += chunk;
+		offset += chunk;
+		left -= chunk;
+	}
+
+	return length - left;
+}
+
+
+/*
+ * Read length bytes from metadata position <block, offset> (block is the
+ * start of the compressed block on disk, and offset is the offset into
+ * the block once decompressed).  Data is packed into consecutive blocks,
+ * and length bytes may require reading more than one block.
+ *
+ * On return *block and *offset have been advanced past the data consumed.
+ * buffer may be NULL, in which case the data is skipped rather than copied.
+ *
+ * Returns length on success, the cache entry's error code if a block could
+ * not be read, or -EIO if *offset lies beyond the end of the block.
+ */
+int squashfs_read_metadata(struct super_block *sb, void *buffer,
+		u64 *block, int *offset, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	int bytes, res = length;
+	struct squashfs_cache_entry *entry;
+
+	TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
+
+	while (length) {
+		entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
+		if (entry->error) {
+			res = entry->error;
+			goto error;
+		} else if (*offset >= entry->length) {
+			res = -EIO;
+			goto error;
+		}
+
+		bytes = squashfs_copy_data(buffer, entry, *offset, length);
+		if (buffer)
+			buffer += bytes;
+		length -= bytes;
+		*offset += bytes;
+
+		/* Block exhausted: continue at the start of the following one */
+		if (*offset == entry->length) {
+			*block = entry->next_index;
+			*offset = 0;
+		}
+
+		squashfs_cache_put(entry);
+	}
+
+	return res;
+
+error:
+	/*
+	 * squashfs_cache_get() hands back a referenced entry even when the
+	 * entry carries an error, so the reference must be dropped here or
+	 * the entry could never be reused.
+	 */
+	squashfs_cache_put(entry);
+	return res;
+}
+
+
+/*
+ * Look up the fragment located at <start_block> in the fragment cache,
+ * reading and decompressing it from disk if necessary.  The entry returned
+ * must be released with squashfs_cache_put().
+ */
+struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb,
+				u64 start_block, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct squashfs_cache *cache = msblk->fragment_cache;
+
+	return squashfs_cache_get(sb, cache, start_block, length);
+}
+
+
+/*
+ * Read and decompress the datablock located at <start_block> in the
+ * filesystem.  This goes through the read_page cache so that the locking
+ * and read/decompress code is not duplicated.  The entry returned must be
+ * released with squashfs_cache_put().
+ */
+struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
+				u64 start_block, int length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct squashfs_cache *cache = msblk->read_page;
+
+	return squashfs_cache_get(sb, cache, start_block, length);
+}
+
+
+/*
+ * Read a filesystem table (an uncompressed sequence of length bytes starting
+ * at byte position block) from disk into the contiguous buffer supplied.
+ * Returns whatever squashfs_read_data() returns, or -ENOMEM if the temporary
+ * page pointer array cannot be allocated.
+ */
+int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
+	int length)
+{
+	int i, res, pages;
+	void **data;
+
+	pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	data = kcalloc(pages, sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	/* Present the flat buffer as an array of page-sized pieces */
+	for (i = 0; i < pages; i++)
+		data[i] = buffer + i * PAGE_CACHE_SIZE;
+
+	/* Setting SQUASHFS_COMPRESSED_BIT_BLOCK marks the data uncompressed */
+	res = squashfs_read_data(sb, data, block,
+		length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+
+	kfree(data);
+	return res;
+}
-- 
cgit v1.2.3


From e2780ab159ac60ef5bfc083081f44f8ff58579dc Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:27 +0000
Subject: Squashfs: block operations

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/block.c | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 274 insertions(+)
 create mode 100644 fs/squashfs/block.c

(limited to 'fs')

diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
new file mode 100644
index 00000000000..c837dfc2b3c
--- /dev/null
+++ b/fs/squashfs/block.c
@@ -0,0 +1,274 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * block.c
+ */
+
+/*
+ * This file implements the low-level routines to read and decompress
+ * datablocks and metadata blocks.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/buffer_head.h>
+#include <linux/zlib.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Read the two-byte length field stored at the start of a metadata block
+ * (low byte first).  *cur_index and *offset locate the field on entry; on
+ * return *offset is the offset of the byte following the field within the
+ * buffer returned, and *cur_index has been incremented if the field
+ * straddled two device blocks.  Returns the buffer holding the byte after
+ * the field, or NULL if a device block could not be read.
+ */
+static struct buffer_head *get_block_length(struct super_block *sb,
+			u64 *cur_index, int *offset, int *length)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct buffer_head *bh = sb_bread(sb, *cur_index);
+	unsigned char low, high;
+
+	if (!bh)
+		return NULL;
+
+	low = bh->b_data[*offset];
+
+	if (msblk->devblksize - *offset != 1) {
+		/* Both bytes are in this device block */
+		high = bh->b_data[*offset + 1];
+		*length = low | high << 8;
+		*offset += 2;
+		return bh;
+	}
+
+	/* Low byte is the last byte of this block, high byte opens the next */
+	*length = low;
+	put_bh(bh);
+	bh = sb_bread(sb, ++(*cur_index));
+	if (!bh)
+		return NULL;
+
+	high = bh->b_data[0];
+	*length |= high << 8;
+	*offset = 1;
+	return bh;
+}
+
+
+/*
+ * Read and decompress a metadata block or datablock.  Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block.  A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with zlib).
+ *
+ * sb         - superblock of the filesystem being read
+ * buffer     - array of output buffers, each filled with up to
+ *              PAGE_CACHE_SIZE bytes, in order
+ * index      - byte position of the block on the device
+ * length     - zero for a metadata block, otherwise the datablock's on-disk
+ *              size with the compressed/uncompressed flag bit included
+ * next_index - if non-NULL, receives the byte position following this block
+ * srclength  - largest on-disk length accepted; a larger length is rejected
+ *
+ * Returns the number of bytes placed in buffer (the inflated size for a
+ * compressed block, the on-disk size otherwise), -ENOMEM if the
+ * buffer_head array cannot be allocated, or -EIO on any other failure.
+ *
+ * NOTE(review): buffer[] is indexed without a bound; this appears to rely
+ * on the caller supplying enough buffers for the output - confirm callers.
+ */
+int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
+			int length, u64 *next_index, int srclength)
+{
+	struct squashfs_sb_info *msblk = sb->s_fs_info;
+	struct buffer_head **bh;
+	/* Split the byte position into device block number and offset in it */
+	int offset = index & ((1 << msblk->devblksize_log2) - 1);
+	u64 cur_index = index >> msblk->devblksize_log2;
+	/* b: buffer_heads obtained so far, k: buffer_heads already consumed */
+	int bytes, compressed, b = 0, k = 0, page = 0, avail;
+
+
+	/*
+	 * One slot per device block in a filesystem block, plus one;
+	 * presumably the extra slot allows for a block that begins part-way
+	 * through a device block.
+	 */
+	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
+				sizeof(*bh), GFP_KERNEL);
+	if (bh == NULL)
+		return -ENOMEM;
+
+	if (length) {
+		/*
+		 * Datablock.
+		 */
+		/*
+		 * bytes tracks how much of the block the buffer_heads gathered
+		 * so far cover; starting at -offset accounts for the block
+		 * beginning offset bytes into the first device block.
+		 */
+		bytes = -offset;
+		compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+		length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+		if (next_index)
+			*next_index = index + length;
+
+		TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
+			index, compressed ? "" : "un", length, srclength);
+
+		/* Reject lengths that are too large or run past the image */
+		if (length < 0 || length > srclength ||
+				(index + length) > msblk->bytes_used)
+			goto read_failure;
+
+		for (b = 0; bytes < length; b++, cur_index++) {
+			bh[b] = sb_getblk(sb, cur_index);
+			if (bh[b] == NULL)
+				goto block_release;
+			bytes += msblk->devblksize;
+		}
+		/* Start reads on all the blocks; they are waited on below */
+		ll_rw_block(READ, b, bh);
+	} else {
+		/*
+		 * Metadata block.
+		 */
+		/* The two-byte length field itself must lie inside the image */
+		if ((index + 2) > msblk->bytes_used)
+			goto read_failure;
+
+		/* Reads the length field and steps offset past it */
+		bh[0] = get_block_length(sb, &cur_index, &offset, &length);
+		if (bh[0] == NULL)
+			goto read_failure;
+		b = 1;
+
+		/* Bytes of the block already available in bh[0] */
+		bytes = msblk->devblksize - offset;
+		compressed = SQUASHFS_COMPRESSED(length);
+		length = SQUASHFS_COMPRESSED_SIZE(length);
+		if (next_index)
+			*next_index = index + length + 2;
+
+		TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
+				compressed ? "" : "un", length);
+
+		if (length < 0 || length > srclength ||
+					(index + length) > msblk->bytes_used)
+			goto block_release;
+
+		for (; bytes < length; b++) {
+			bh[b] = sb_getblk(sb, ++cur_index);
+			if (bh[b] == NULL)
+				goto block_release;
+			bytes += msblk->devblksize;
+		}
+		/* bh[0] was already read by get_block_length(); read the rest */
+		ll_rw_block(READ, b - 1, bh + 1);
+	}
+
+	if (compressed) {
+		int zlib_err = 0, zlib_init = 0;
+
+		/*
+		 * Uncompress block.
+		 */
+
+		/* Held for as long as the shared msblk->stream is in use */
+		mutex_lock(&msblk->read_data_mutex);
+
+		msblk->stream.avail_out = 0;
+		msblk->stream.avail_in = 0;
+
+		/* bytes now counts the compressed input still to be fed in */
+		bytes = length;
+		do {
+			/* Input exhausted: feed in the next device block */
+			if (msblk->stream.avail_in == 0 && k < b) {
+				avail = min(bytes, msblk->devblksize - offset);
+				bytes -= avail;
+				wait_on_buffer(bh[k]);
+				if (!buffer_uptodate(bh[k]))
+					goto release_mutex;
+
+				/*
+				 * Nothing usable in this buffer: drop it and
+				 * go round again (continue re-tests the loop
+				 * condition with zlib_err unchanged).
+				 */
+				if (avail == 0) {
+					offset = 0;
+					put_bh(bh[k++]);
+					continue;
+				}
+
+				msblk->stream.next_in = bh[k]->b_data + offset;
+				msblk->stream.avail_in = avail;
+				/* Only the first buffer starts part-way in */
+				offset = 0;
+			}
+
+			/* Output buffer full: move on to the next one */
+			if (msblk->stream.avail_out == 0) {
+				msblk->stream.next_out = buffer[page++];
+				msblk->stream.avail_out = PAGE_CACHE_SIZE;
+			}
+
+			/* The stream is initialised once, on the first pass */
+			if (!zlib_init) {
+				zlib_err = zlib_inflateInit(&msblk->stream);
+				if (zlib_err != Z_OK) {
+					ERROR("zlib_inflateInit returned"
+						" unexpected result 0x%x,"
+						" srclength %d\n", zlib_err,
+						srclength);
+					goto release_mutex;
+				}
+				zlib_init = 1;
+			}
+
+			zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+
+			/* Buffer fully consumed: drop the reference to it */
+			if (msblk->stream.avail_in == 0 && k < b)
+				put_bh(bh[k++]);
+		} while (zlib_err == Z_OK);
+
+		if (zlib_err != Z_STREAM_END) {
+			ERROR("zlib_inflate returned unexpected result"
+				" 0x%x, srclength %d, avail_in %d,"
+				" avail_out %d\n", zlib_err, srclength,
+				msblk->stream.avail_in,
+				msblk->stream.avail_out);
+			goto release_mutex;
+		}
+
+		zlib_err = zlib_inflateEnd(&msblk->stream);
+		if (zlib_err != Z_OK) {
+			ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
+				" srclength %d\n", zlib_err, srclength);
+			goto release_mutex;
+		}
+		/* Report the inflated size rather than the on-disk size */
+		length = msblk->stream.total_out;
+		mutex_unlock(&msblk->read_data_mutex);
+	} else {
+		/*
+		 * Block is uncompressed.
+		 */
+		int i, in, pg_offset = 0;
+
+		/* Wait for every read to finish before copying anything */
+		for (i = 0; i < b; i++) {
+			wait_on_buffer(bh[i]);
+			if (!buffer_uptodate(bh[i]))
+				goto block_release;
+		}
+
+		/* Copy each device block out, spreading it over the buffers */
+		for (bytes = length; k < b; k++) {
+			in = min(bytes, msblk->devblksize - offset);
+			bytes -= in;
+			while (in) {
+				if (pg_offset == PAGE_CACHE_SIZE) {
+					page++;
+					pg_offset = 0;
+				}
+				avail = min_t(int, in, PAGE_CACHE_SIZE -
+						pg_offset);
+				memcpy(buffer[page] + pg_offset,
+						bh[k]->b_data + offset, avail);
+				in -= avail;
+				pg_offset += avail;
+				offset += avail;
+			}
+			offset = 0;
+			put_bh(bh[k]);
+		}
+	}
+
+	kfree(bh);
+	return length;
+
+	/*
+	 * Error exits.  Each label falls through to the next: unlock the
+	 * decompressor, drop the buffer_heads not yet consumed (bh[k] up to
+	 * bh[b - 1]), then log and free the array.
+	 */
+release_mutex:
+	mutex_unlock(&msblk->read_data_mutex);
+
+block_release:
+	for (; k < b; k++)
+		put_bh(bh[k]);
+
+read_failure:
+	/* Also reached for validation failures, not only failed reads */
+	ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+	kfree(bh);
+	return -EIO;
+}
-- 
cgit v1.2.3


From ffae2cd73a9e828b1a188f83c5dedce16f7c0c68 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:27 +0000
Subject: Squashfs: header files

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/squashfs/squashfs.h       |  90 ++++++++++
 fs/squashfs/squashfs_fs.h    | 381 +++++++++++++++++++++++++++++++++++++++++++
 fs/squashfs/squashfs_fs_i.h  |  45 +++++
 fs/squashfs/squashfs_fs_sb.h |  76 +++++++++
 4 files changed, 592 insertions(+)
 create mode 100644 fs/squashfs/squashfs.h
 create mode 100644 fs/squashfs/squashfs_fs.h
 create mode 100644 fs/squashfs/squashfs_fs_i.h
 create mode 100644 fs/squashfs/squashfs_fs_sb.h

(limited to 'fs')

diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
new file mode 100644
index 00000000000..6b2515d027d
--- /dev/null
+++ b/fs/squashfs/squashfs.h
@@ -0,0 +1,90 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs.h
+ */
+
+/*
+ * Logging helpers.  The format argument is pasted onto a prefix string, so
+ * it must be a string literal.
+ */
+
+/* Debug trace, emitted through pr_debug() */
+#define TRACE(s, args...)	pr_debug("SQUASHFS: "s, ## args)
+
+/* Error report, emitted through pr_err() */
+#define ERROR(s, args...)	pr_err("SQUASHFS error: "s, ## args)
+
+/* Warning, emitted through pr_warning() */
+#define WARNING(s, args...)	pr_warning("SQUASHFS: "s, ## args)
+
+/*
+ * Map a VFS inode to the squashfs_inode_info that embeds it as its
+ * vfs_inode member.  container_of() is used rather than list_entry()
+ * because vfs_inode is an embedded struct inode, not a list_head.
+ */
+static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
+{
+	return container_of(inode, struct squashfs_inode_info, vfs_inode);
+}
+
+/* block.c */
+extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
+				int);
+
+/* cache.c */
+extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
+extern void squashfs_cache_delete(struct squashfs_cache *);
+extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *,
+				struct squashfs_cache *, u64, int);
+extern void squashfs_cache_put(struct squashfs_cache_entry *);
+extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int);
+extern int squashfs_read_metadata(struct super_block *, void *, u64 *,
+				int *, int);
+extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *,
+				u64, int);
+extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *,
+				u64, int);
+extern int squashfs_read_table(struct super_block *, void *, u64, int);
+
+/* export.c */
+extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
+				unsigned int);
+
+/* fragment.c */
+extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
+extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
+				u64, unsigned int);
+
+/* id.c */
+extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
+extern __le64 *squashfs_read_id_index_table(struct super_block *, u64,
+				unsigned short);
+
+/* inode.c */
+extern struct inode *squashfs_iget(struct super_block *, long long,
+				unsigned int);
+extern int squashfs_read_inode(struct inode *, long long);
+
+/*
+ * Inodes and files operations
+ */
+
+/* dir.c */
+extern const struct file_operations squashfs_dir_ops;
+
+/* export.c */
+extern const struct export_operations squashfs_export_ops;
+
+/* file.c */
+extern const struct address_space_operations squashfs_aops;
+
+/* namei.c */
+extern const struct inode_operations squashfs_dir_inode_ops;
+
+/* symlink.c */
+extern const struct address_space_operations squashfs_symlink_aops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
new file mode 100644
index 00000000000..6840da1bf21
--- /dev/null
+++ b/fs/squashfs/squashfs_fs.h
@@ -0,0 +1,381 @@
+#ifndef SQUASHFS_FS
+#define SQUASHFS_FS
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs.h
+ */
+
+#define SQUASHFS_CACHED_FRAGMENTS	CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
+#define SQUASHFS_MAJOR			4
+#define SQUASHFS_MINOR			0
+#define SQUASHFS_MAGIC			0x73717368
+#define SQUASHFS_START			0
+
+/* size of metadata (inode and directory) blocks */
+#define SQUASHFS_METADATA_SIZE		8192
+#define SQUASHFS_METADATA_LOG		13
+
+/* default size of data blocks */
+#define SQUASHFS_FILE_SIZE		131072
+#define SQUASHFS_FILE_LOG		17
+
+#define SQUASHFS_FILE_MAX_SIZE		1048576
+#define SQUASHFS_FILE_MAX_LOG		20
+
+/* Max number of uids and gids */
+#define SQUASHFS_IDS			65536
+
+/* Max length of filename (not 255) */
+#define SQUASHFS_NAME_LEN		256
+
+#define SQUASHFS_INVALID_FRAG		(0xffffffffU)
+#define SQUASHFS_INVALID_BLK		(-1LL)
+
+/* Filesystem flags */
+#define SQUASHFS_NOI			0
+#define SQUASHFS_NOD			1
+#define SQUASHFS_NOF			3
+#define SQUASHFS_NO_FRAG		4
+#define SQUASHFS_ALWAYS_FRAG		5
+#define SQUASHFS_DUPLICATE		6
+#define SQUASHFS_EXPORT			7
+
+/*
+ * Value (0 or 1) of bit number 'bit' in 'flag'.  Both arguments are
+ * parenthesised so that expressions can be passed safely.
+ */
+#define SQUASHFS_BIT(flag, bit)		(((flag) >> (bit)) & 1)
+
+#define SQUASHFS_UNCOMPRESSED_INODES(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOI)
+
+#define SQUASHFS_UNCOMPRESSED_DATA(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOD)
+
+#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_NOF)
+
+#define SQUASHFS_NO_FRAGMENTS(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_NO_FRAG)
+
+#define SQUASHFS_ALWAYS_FRAGMENTS(flags)	SQUASHFS_BIT(flags, \
+						SQUASHFS_ALWAYS_FRAG)
+
+#define SQUASHFS_DUPLICATES(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_DUPLICATE)
+
+#define SQUASHFS_EXPORTABLE(flags)		SQUASHFS_BIT(flags, \
+						SQUASHFS_EXPORT)
+
+/* Inode / file type codes */
+#define SQUASHFS_DIR_TYPE		1
+#define SQUASHFS_REG_TYPE		2
+#define SQUASHFS_SYMLINK_TYPE		3
+#define SQUASHFS_BLKDEV_TYPE		4
+#define SQUASHFS_CHRDEV_TYPE		5
+#define SQUASHFS_FIFO_TYPE		6
+#define SQUASHFS_SOCKET_TYPE		7
+#define SQUASHFS_LDIR_TYPE		8
+#define SQUASHFS_LREG_TYPE		9
+#define SQUASHFS_LSYMLINK_TYPE		10
+#define SQUASHFS_LBLKDEV_TYPE		11
+#define SQUASHFS_LCHRDEV_TYPE		12
+#define SQUASHFS_LFIFO_TYPE		13
+#define SQUASHFS_LSOCKET_TYPE		14
+
+/* Flag whether block is compressed or uncompressed, bit is set if block is
+ * uncompressed */
+#define SQUASHFS_COMPRESSED_BIT		(1 << 15)
+
+#define SQUASHFS_COMPRESSED_SIZE(B)	(((B) & ~SQUASHFS_COMPRESSED_BIT) ? \
+		(B) & ~SQUASHFS_COMPRESSED_BIT :  SQUASHFS_COMPRESSED_BIT)
+
+#define SQUASHFS_COMPRESSED(B)		(!((B) & SQUASHFS_COMPRESSED_BIT))
+
+#define SQUASHFS_COMPRESSED_BIT_BLOCK	(1 << 24)
+
+#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B)	((B) & \
+						~SQUASHFS_COMPRESSED_BIT_BLOCK)
+
+#define SQUASHFS_COMPRESSED_BLOCK(B)	(!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK))
+
+/*
+ * Inode number ops.  Inodes consist of a compressed block number, and an
+ * uncompressed offset within that block
+ */
+#define SQUASHFS_INODE_BLK(A)		((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_INODE_OFFSET(A)	((unsigned int) ((A) & 0xffff))
+
+#define SQUASHFS_MKINODE(A, B)		((long long)(((long long) (A)\
+					<< 16) + (B)))
+
+/* Translate between VFS mode and squashfs mode */
+#define SQUASHFS_MODE(A)		((A) & 0xfff)
+
+/* fragment and fragment table defines */
+#define SQUASHFS_FRAGMENT_BYTES(A)	\
+				((A) * sizeof(struct squashfs_fragment_entry))
+
+#define SQUASHFS_FRAGMENT_INDEX(A)	(SQUASHFS_FRAGMENT_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A)	(SQUASHFS_FRAGMENT_BYTES(A) % \
+						SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEXES(A)	((SQUASHFS_FRAGMENT_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_FRAGMENT_INDEX_BYTES(A)	(SQUASHFS_FRAGMENT_INDEXES(A) *\
+						sizeof(u64))
+
+/* inode lookup table defines */
+#define SQUASHFS_LOOKUP_BYTES(A)	((A) * sizeof(u64))
+
+#define SQUASHFS_LOOKUP_BLOCK(A)	(SQUASHFS_LOOKUP_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A)	(SQUASHFS_LOOKUP_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCKS(A)	((SQUASHFS_LOOKUP_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_LOOKUP_BLOCK_BYTES(A)	(SQUASHFS_LOOKUP_BLOCKS(A) *\
+					sizeof(u64))
+
+/* uid/gid lookup table defines */
+#define SQUASHFS_ID_BYTES(A)		((A) * sizeof(unsigned int))
+
+#define SQUASHFS_ID_BLOCK(A)		(SQUASHFS_ID_BYTES(A) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_OFFSET(A)	(SQUASHFS_ID_BYTES(A) % \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCKS(A)		((SQUASHFS_ID_BYTES(A) + \
+					SQUASHFS_METADATA_SIZE - 1) / \
+					SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_ID_BLOCK_BYTES(A)	(SQUASHFS_ID_BLOCKS(A) *\
+					sizeof(u64))
+
+/* cached data constants for filesystem */
+#define SQUASHFS_CACHED_BLKS		8
+
+#define SQUASHFS_MAX_FILE_SIZE_LOG	64
+
+#define SQUASHFS_MAX_FILE_SIZE		(1LL << \
+					(SQUASHFS_MAX_FILE_SIZE_LOG - 2))
+
+#define SQUASHFS_MARKER_BYTE		0xff
+
+/* meta index cache */
+#define SQUASHFS_META_INDEXES	(SQUASHFS_METADATA_SIZE / sizeof(unsigned int))
+#define SQUASHFS_META_ENTRIES	127
+#define SQUASHFS_META_SLOTS	8
+
+/*
+ * One element of a struct meta_index (meta index cache).
+ * NOTE(review): judging by the field names this records a data block
+ * position together with the index block/offset describing it - confirm
+ * against the code that fills the meta index cache.
+ */
+struct meta_entry {
+	u64			data_block;
+	unsigned int		index_block;
+	unsigned short		offset;
+	unsigned short		pad;
+};
+
+/*
+ * One slot of the meta index cache, holding up to SQUASHFS_META_ENTRIES
+ * meta_entry records.
+ * NOTE(review): inode_number presumably identifies the file the slot
+ * belongs to and locked marks a slot in use - confirm against the users
+ * of this structure.
+ */
+struct meta_index {
+	unsigned int		inode_number;
+	unsigned int		offset;
+	unsigned short		entries;
+	unsigned short		skip;
+	unsigned short		locked;
+	unsigned short		pad;
+	struct meta_entry	meta_entry[SQUASHFS_META_ENTRIES];
+};
+
+
+/*
+ * definitions for structures on disk
+ */
+#define ZLIB_COMPRESSION	 1
+
+/*
+ * On-disk superblock.  All fields are stored little-endian.  The *_start
+ * fields give the positions of the corresponding tables; bytes_used is the
+ * on-disk counterpart of the value squashfs_read_data() checks block
+ * positions against (presumably the size of the filesystem image).
+ */
+struct squashfs_super_block {
+	__le32			s_magic;
+	__le32			inodes;
+	__le32			mkfs_time;
+	__le32			block_size;
+	__le32			fragments;
+	__le16			compression;
+	__le16			block_log;
+	__le16			flags;
+	__le16			no_ids;
+	__le16			s_major;
+	__le16			s_minor;
+	__le64			root_inode;
+	__le64			bytes_used;
+	__le64			id_table_start;
+	__le64			xattr_table_start;
+	__le64			inode_table_start;
+	__le64			directory_table_start;
+	__le64			fragment_table_start;
+	__le64			lookup_table_start;
+};
+
+/*
+ * On-disk directory index record (little-endian), as embedded at the end
+ * of struct squashfs_ldir_inode.  The fixed part is followed directly by
+ * the name bytes.
+ */
+struct squashfs_dir_index {
+	__le32			index;
+	__le32			start_block;
+	__le32			size;
+	unsigned char		name[0];
+};
+
+/*
+ * On-disk inode header (little-endian).  Every other on-disk inode
+ * structure begins with these same six fields.
+ */
+struct squashfs_base_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+};
+
+/*
+ * On-disk inode (little-endian): the common header plus a link count.
+ * NOTE(review): presumably used for the FIFO and socket inode types -
+ * confirm against the inode reading code.
+ */
+struct squashfs_ipc_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+};
+
+/*
+ * On-disk device inode (little-endian): the common header plus a link
+ * count and the device number (rdev).
+ */
+struct squashfs_dev_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+	__le32			rdev;
+};
+
+/*
+ * On-disk symbolic link inode (little-endian): the common header, a link
+ * count and symlink_size, followed directly by the symlink bytes.
+ */
+struct squashfs_symlink_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+	__le32			symlink_size;
+	char			symlink[0];
+};
+
+/*
+ * On-disk regular file inode (little-endian) with 32-bit start_block and
+ * file_size; struct squashfs_lreg_inode is the variant with 64-bit fields.
+ * block_list marks where the trailing block list begins.
+ */
+struct squashfs_reg_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			start_block;
+	__le32			fragment;
+	__le32			offset;
+	__le32			file_size;
+	__le16			block_list[0];
+};
+
+/*
+ * On-disk regular file inode, large variant (little-endian): 64-bit
+ * start_block and file_size, plus sparse, nlink and xattr fields that
+ * struct squashfs_reg_inode does not have.  block_list marks where the
+ * trailing block list begins.
+ */
+struct squashfs_lreg_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le64			start_block;
+	__le64			file_size;
+	__le64			sparse;
+	__le32			nlink;
+	__le32			fragment;
+	__le32			offset;
+	__le32			xattr;
+	__le16			block_list[0];
+};
+
+/*
+ * On-disk directory inode (little-endian) with a 16-bit file_size;
+ * struct squashfs_ldir_inode is the variant with a 32-bit file_size and
+ * a trailing directory index.
+ */
+struct squashfs_dir_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			start_block;
+	__le32			nlink;
+	__le16			file_size;
+	__le16			offset;
+	__le32			parent_inode;
+};
+
+/*
+ * On-disk directory inode, large variant (little-endian): 32-bit
+ * file_size, an xattr field, and a trailing array of directory index
+ * records (i_count presumably gives their number - confirm against the
+ * directory lookup code).
+ */
+struct squashfs_ldir_inode {
+	__le16			inode_type;
+	__le16			mode;
+	__le16			uid;
+	__le16			guid;
+	__le32			mtime;
+	__le32	 		inode_number;
+	__le32			nlink;
+	__le32			file_size;
+	__le32			start_block;
+	__le32			parent_inode;
+	__le16			i_count;
+	__le16			offset;
+	__le32			xattr;
+	struct squashfs_dir_index	index[0];
+};
+
+/*
+ * Overlay of all the on-disk inode layouts.  Every member starts with the
+ * fields of struct squashfs_base_inode, so base.inode_type can be read
+ * whichever layout is actually stored.
+ */
+union squashfs_inode {
+	struct squashfs_base_inode		base;
+	struct squashfs_dev_inode		dev;
+	struct squashfs_symlink_inode		symlink;
+	struct squashfs_reg_inode		reg;
+	struct squashfs_lreg_inode		lreg;
+	struct squashfs_dir_inode		dir;
+	struct squashfs_ldir_inode		ldir;
+	struct squashfs_ipc_inode		ipc;
+};
+
+/*
+ * On-disk directory entry (little-endian).  The fixed part is followed
+ * directly by the name bytes.
+ */
+struct squashfs_dir_entry {
+	__le16			offset;
+	__le16			inode_number;
+	__le16			type;
+	__le16			size;
+	char			name[0];
+};
+
+/*
+ * On-disk directory header (little-endian).
+ * NOTE(review): presumably precedes a run of squashfs_dir_entry records
+ * and supplies values they share - confirm against the directory code.
+ */
+struct squashfs_dir_header {
+	__le32			count;
+	__le32			start_block;
+	__le32			inode_number;
+};
+
+/*
+ * On-disk fragment table entry: start position and size of a fragment
+ * block.  Note that 'unused' is declared as a plain unsigned int rather
+ * than a little-endian type.
+ */
+struct squashfs_fragment_entry {
+	__le64			start_block;
+	__le32			size;
+	unsigned int		unused;
+};
+
+#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
new file mode 100644
index 00000000000..fbfca30c0c6
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -0,0 +1,45 @@
+#ifndef SQUASHFS_FS_I
+#define SQUASHFS_FS_I
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_i.h
+ */
+
+/*
+ * In-memory squashfs inode.  The VFS inode is embedded as vfs_inode, and
+ * squashfs_i() maps a struct inode pointer back to this structure.
+ *
+ * The two anonymous structs share storage.
+ * NOTE(review): judging by the field names the first is used for regular
+ * files (fragment and block list location) and the second for directories
+ * (directory index location and parent) - confirm against the inode
+ * reading code.
+ */
+struct squashfs_inode_info {
+	u64		start;
+	int		offset;
+	union {
+		struct {
+			u64		fragment_block;
+			int		fragment_size;
+			int		fragment_offset;
+			u64		block_list_start;
+		};
+		struct {
+			u64		dir_idx_start;
+			int		dir_idx_offset;
+			int		dir_idx_cnt;
+			int		parent;
+		};
+	};
+	struct inode	vfs_inode;
+};
+#endif
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
new file mode 100644
index 00000000000..c8c65614dd1
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -0,0 +1,76 @@
+#ifndef SQUASHFS_FS_SB
+#define SQUASHFS_FS_SB
+/*
+ * Squashfs
+ *
+ * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * squashfs_fs_sb.h
+ */
+
+#include "squashfs_fs.h"
+
+/*
+ * A cache of blocks read from the filesystem.
+ *
+ * name        - label used in log messages
+ * entries     - number of elements in entry[]
+ * next_blk    - set to 0 when the cache is created; NOTE(review): presumably
+ *               the next entry to consider for reuse - confirm in
+ *               squashfs_cache_get()
+ * num_waiters - non-zero while processes are waiting for an entry to become
+ *               available
+ * unused      - number of entries whose refcount is zero
+ * block_size  - entry size given to squashfs_cache_init()
+ * pages       - number of PAGE_CACHE_SIZE buffers backing each entry
+ * lock        - held while refcount, unused and num_waiters are examined
+ *               or changed
+ * wait_queue  - woken by squashfs_cache_put() when an entry becomes unused
+ *               and num_waiters is non-zero
+ * entry       - the array of cache entries
+ */
+struct squashfs_cache {
+	char			*name;
+	int			entries;
+	int			next_blk;
+	int			num_waiters;
+	int			unused;
+	int			block_size;
+	int			pages;
+	spinlock_t		lock;
+	wait_queue_head_t	wait_queue;
+	struct squashfs_cache_entry *entry;
+};
+
+/*
+ * One entry of a squashfs_cache.
+ *
+ * block       - start block held by this entry; SQUASHFS_INVALID_BLK when
+ *               the cache is first created
+ * length      - number of valid bytes held in data[]
+ * refcount    - number of users holding the entry; an entry at zero is
+ *               counted in cache->unused
+ * next_index  - position of the block following this one;
+ *               squashfs_read_metadata() continues from here
+ * pending     - non-zero while the entry is being filled in by another
+ *               process
+ * error       - error recorded for the entry; callers test it after
+ *               squashfs_cache_get()
+ * num_waiters - incremented by each process that sleeps on wait_queue
+ * wait_queue  - where processes wait for pending to clear
+ * cache       - the cache this entry belongs to
+ * data        - array of cache->pages buffers of PAGE_CACHE_SIZE bytes each
+ */
+struct squashfs_cache_entry {
+	u64			block;
+	int			length;
+	int			refcount;
+	u64			next_index;
+	int			pending;
+	int			error;
+	int			num_waiters;
+	wait_queue_head_t	wait_queue;
+	struct squashfs_cache	*cache;
+	void			**data;
+};
+
+/*
+ * Per-superblock private data, reached through sb->s_fs_info.
+ *
+ * devblksize, devblksize_log2
+ *                 - device block size and its log2; squashfs_read_data()
+ *                   uses them to split a byte position into a device block
+ *                   number and an offset within it
+ * block_cache     - cache used by squashfs_read_metadata()
+ * fragment_cache  - cache used by squashfs_get_fragment()
+ * read_page       - cache used by squashfs_get_datablock()
+ * read_data_mutex - held by squashfs_read_data() while it uses stream
+ * stream          - zlib inflate state shared by all reads
+ * block_size      - used by squashfs_read_data() to size its buffer_head
+ *                   array
+ * bytes_used      - block positions beyond this are rejected by
+ *                   squashfs_read_data()
+ */
+struct squashfs_sb_info {
+	int			devblksize;
+	int			devblksize_log2;
+	struct squashfs_cache	*block_cache;
+	struct squashfs_cache	*fragment_cache;
+	struct squashfs_cache	*read_page;
+	int			next_meta_index;
+	__le64			*id_table;
+	__le64			*fragment_index;
+	unsigned int		*fragment_index_2;
+	struct mutex		read_data_mutex;
+	struct mutex		meta_index_mutex;
+	struct meta_index	*meta_index;
+	z_stream		stream;
+	__le64			*inode_lookup_table;
+	u64			inode_table;
+	u64			directory_table;
+	unsigned int		block_size;
+	unsigned short		block_log;
+	long long		bytes_used;
+	unsigned int		inodes;
+};
+#endif
-- 
cgit v1.2.3


From fcef6fb6c5cf54927e1ca86b86a991e7aa9391f5 Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:27 +0000
Subject: Squashfs: Makefiles

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/Makefile          | 1 +
 fs/squashfs/Makefile | 8 ++++++++
 2 files changed, 9 insertions(+)
 create mode 100644 fs/squashfs/Makefile

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d22..3f8843c62d9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -73,6 +73,7 @@ obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
 obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
+obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
 obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
new file mode 100644
index 00000000000..8258cf9a031
--- /dev/null
+++ b/fs/squashfs/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the linux squashfs routines.
+#
+
+obj-$(CONFIG_SQUASHFS) += squashfs.o
+squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
+squashfs-y += namei.o super.o symlink.o
+#squashfs-y += squashfs2_0.o
-- 
cgit v1.2.3


From 6ab5c1ca71ea8e04e97cd8ed473bd04e636850fd Mon Sep 17 00:00:00 2001
From: Phillip Lougher <phillip@lougher.demon.co.uk>
Date: Mon, 5 Jan 2009 08:46:28 +0000
Subject: Squashfs: Kconfig entry

Signed-off-by: Phillip Lougher <phillip@lougher.demon.co.uk>
---
 fs/Kconfig | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e8198020..2553e0bbd01 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -894,6 +894,58 @@ config CRAMFS
 
 	  If unsure, say N.
 
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System).  Squashfs is a highly compressed read-only
+	  filesystem for Linux.  It uses zlib compression to compress
+	  files, inodes and directories.  Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed.  Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>.  The module
+	  will be called squashfs.  Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems" 
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem.  Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory.  Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment.  Anything
+	  much more than three will probably not make much difference.
+
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
 	depends on BLOCK
-- 
cgit v1.2.3


From b4df2b92d8461444fac429c75ba6e125c63056bc Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 27 Oct 2008 22:48:36 +0300
Subject: proc: stop using BKL

There are four BKL users in proc: de_put(), proc_lookup_de(),
proc_readdir_de(), proc_root_readdir().

1) de_put()
-----------
de_put() is classic atomic_dec_and_test() refcount wrapper -- no BKL
needed. BKL doesn't matter to possible refcount leak as well.

2) proc_lookup_de()
-------------------
Walking the PDE list is protected by proc_subdir_lock, proc_get_inode() is
potentially blocking, and all callers of proc_lookup_de() eventually come
from ->lookup hooks, which are protected by the directory's ->i_mutex -- BKL
doesn't protect anything.

3) proc_readdir_de()
--------------------
"." and ".." part doesn't need BKL, walking PDE list is under
proc_subdir_lock, calling filldir callback is potentially blocking
because it writes to luserspace. All proc_readdir_de() callers
eventually come from ->readdir hook which is under directory's
->i_mutex -- BKL doesn't protect anything.

4) proc_root_readdir()
----------------------
proc_root_readdir() is a ->readdir hook, see (3).

Since the readdir hooks don't use the BKL anymore, switch to
generic_file_llseek, since it also takes the directory's i_mutex.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/generic.c  | 8 ++------
 fs/proc/inode.c    | 3 ---
 fs/proc/proc_net.c | 2 +-
 fs/proc/root.c     | 8 +-------
 4 files changed, 4 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 60a359b3558..db7fa5cab98 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -14,7 +14,6 @@
 #include <linux/stat.h>
 #include <linux/module.h>
 #include <linux/mount.h>
-#include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/idr.h>
 #include <linux/namei.h>
@@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	struct inode *inode = NULL;
 	int error = -ENOENT;
 
-	lock_kernel();
 	spin_lock(&proc_subdir_lock);
 	for (de = de->subdir; de ; de = de->next) {
 		if (de->namelen != dentry->d_name.len)
@@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 	}
 	spin_unlock(&proc_subdir_lock);
 out_unlock:
-	unlock_kernel();
 
 	if (inode) {
 		dentry->d_op = &proc_dentry_operations;
@@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
 
-	lock_kernel();
-
 	ino = inode->i_ino;
 	i = filp->f_pos;
 	switch (i) {
@@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 			spin_unlock(&proc_subdir_lock);
 	}
 	ret = 1;
-out:	unlock_kernel();
+out:
 	return ret;	
 }
 
@@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
  * the /proc directory.
  */
 static const struct file_operations proc_dir_operations = {
+	.llseek			= generic_file_llseek,
 	.read			= generic_read_dir,
 	.readdir		= proc_readdir,
 };
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 2543fd00c65..3e76bb9b3ad 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de)
  */
 void de_put(struct proc_dir_entry *de)
 {
-	lock_kernel();
 	if (!atomic_read(&de->count)) {
 		printk("de_put: entry %s already free!\n", de->name);
-		unlock_kernel();
 		return;
 	}
 
 	if (atomic_dec_and_test(&de->count))
 		free_proc_entry(de);
-	unlock_kernel();
 }
 
 /*
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 7bc296f424a..04d1270f1c3 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -18,7 +18,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent,
 }
 
 const struct file_operations proc_net_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= proc_tgid_net_readdir,
 };
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 7761602af9d..f6299a25594 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -16,7 +16,6 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
-#include <linux/smp_lock.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
 
@@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp,
 	unsigned int nr = filp->f_pos;
 	int ret;
 
-	lock_kernel();
-
 	if (nr < FIRST_PROCESS_ENTRY) {
 		int error = proc_readdir(filp, dirent, filldir);
-		if (error <= 0) {
-			unlock_kernel();
+		if (error <= 0)
 			return error;
-		}
 		filp->f_pos = FIRST_PROCESS_ENTRY;
 	}
-	unlock_kernel();
 
 	ret = proc_pid_readdir(filp, dirent, filldir);
 	return ret;
-- 
cgit v1.2.3


From ecae934edc0c29ec7405da18855004c317de26c6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 9 Nov 2008 23:12:18 +0300
Subject: proc: remove useless WARN_ONs

NULL "struct inode *" means VFS passed NULL inode to ->open.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/base.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b..8642623ea79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1186,8 +1186,6 @@ static int sched_show(struct seq_file *m, void *v)
 	struct inode *inode = m->private;
 	struct task_struct *p;
 
-	WARN_ON(!inode);
-
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
@@ -1205,8 +1203,6 @@ sched_write(struct file *file, const char __user *buf,
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct task_struct *p;
 
-	WARN_ON(!inode);
-
 	p = get_proc_task(inode);
 	if (!p)
 		return -ESRCH;
-- 
cgit v1.2.3


From 631f9c1868b970197747c80fc5168ad7d9fd5d53 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 10 Nov 2008 01:32:52 +0300
Subject: proc: remove '##' usage

Inability to jump to /proc/*/foo handlers with ctags is annoying.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/base.c | 183 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 90 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8642623ea79..ce7a6da1b6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -109,25 +109,22 @@ struct pid_entry {
 	.op   = OP,					\
 }
 
-#define DIR(NAME, MODE, OTYPE)							\
-	NOD(NAME, (S_IFDIR|(MODE)),						\
-		&proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,	\
-		{} )
-#define LNK(NAME, OTYPE)					\
+#define DIR(NAME, MODE, iops, fops)	\
+	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
+#define LNK(NAME, get_link)					\
 	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
 		&proc_pid_link_inode_operations, NULL,		\
-		{ .proc_get_link = &proc_##OTYPE##_link } )
-#define REG(NAME, MODE, OTYPE)				\
-	NOD(NAME, (S_IFREG|(MODE)), NULL,		\
-		&proc_##OTYPE##_operations, {})
-#define INF(NAME, MODE, OTYPE)				\
+		{ .proc_get_link = get_link } )
+#define REG(NAME, MODE, fops)				\
+	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
+#define INF(NAME, MODE, read)				\
 	NOD(NAME, (S_IFREG|(MODE)), 			\
 		NULL, &proc_info_file_operations,	\
-		{ .proc_read = &proc_##OTYPE } )
-#define ONE(NAME, MODE, OTYPE)				\
+		{ .proc_read = read } )
+#define ONE(NAME, MODE, show)				\
 	NOD(NAME, (S_IFREG|(MODE)), 			\
 		NULL, &proc_single_file_operations,	\
-		{ .proc_show = &proc_##OTYPE } )
+		{ .proc_show = show } )
 
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
@@ -2134,12 +2131,12 @@ static const struct file_operations proc_pid_attr_operations = {
 };
 
 static const struct pid_entry attr_dir_stuff[] = {
-	REG("current",    S_IRUGO|S_IWUGO, pid_attr),
-	REG("prev",       S_IRUGO,	   pid_attr),
-	REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
-	REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
-	REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
-	REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
+	REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("prev",       S_IRUGO,	   proc_pid_attr_operations),
+	REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
+	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
 };
 
 static int proc_attr_dir_readdir(struct file * filp,
@@ -2461,74 +2458,74 @@ static const struct file_operations proc_task_operations;
 static const struct inode_operations proc_task_inode_operations;
 
 static const struct pid_entry tgid_base_stuff[] = {
-	DIR("task",       S_IRUGO|S_IXUGO, task),
-	DIR("fd",         S_IRUSR|S_IXUSR, fd),
-	DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
+	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
+	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 #ifdef CONFIG_NET
-	DIR("net",        S_IRUGO|S_IXUGO, net),
+	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
 #endif
-	REG("environ",    S_IRUSR, environ),
-	INF("auxv",       S_IRUSR, pid_auxv),
-	ONE("status",     S_IRUGO, pid_status),
-	ONE("personality", S_IRUSR, pid_personality),
-	INF("limits",	  S_IRUSR, pid_limits),
+	REG("environ",    S_IRUSR, proc_environ_operations),
+	INF("auxv",       S_IRUSR, proc_pid_auxv),
+	ONE("status",     S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	INF("limits",	  S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-	REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
+	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",    S_IRUSR, pid_syscall),
+	INF("syscall",    S_IRUSR, proc_pid_syscall),
 #endif
-	INF("cmdline",    S_IRUGO, pid_cmdline),
-	ONE("stat",       S_IRUGO, tgid_stat),
-	ONE("statm",      S_IRUGO, pid_statm),
-	REG("maps",       S_IRUGO, maps),
+	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
+	ONE("stat",       S_IRUGO, proc_tgid_stat),
+	ONE("statm",      S_IRUGO, proc_pid_statm),
+	REG("maps",       S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-	REG("numa_maps",  S_IRUGO, numa_maps),
+	REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
 #endif
-	REG("mem",        S_IRUSR|S_IWUSR, mem),
-	LNK("cwd",        cwd),
-	LNK("root",       root),
-	LNK("exe",        exe),
-	REG("mounts",     S_IRUGO, mounts),
-	REG("mountinfo",  S_IRUGO, mountinfo),
-	REG("mountstats", S_IRUSR, mountstats),
+	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",        proc_cwd_link),
+	LNK("root",       proc_root_link),
+	LNK("exe",        proc_exe_link),
+	REG("mounts",     S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
+	REG("mountstats", S_IRUSR, proc_mountstats_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-	REG("clear_refs", S_IWUSR, clear_refs),
-	REG("smaps",      S_IRUGO, smaps),
-	REG("pagemap",    S_IRUSR, pagemap),
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",      S_IRUGO, proc_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
+	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-	INF("wchan",      S_IRUGO, pid_wchan),
+	INF("wchan",      S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-	INF("schedstat",  S_IRUGO, pid_schedstat),
+	INF("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-	REG("latency",  S_IRUGO, lstats),
+	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",     S_IRUGO, cpuset),
+	REG("cpuset",     S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, cgroup),
+	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-	INF("oom_score",  S_IRUGO, oom_score),
-	REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
+	INF("oom_score",  S_IRUGO, proc_oom_score),
+	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-	REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
-	REG("sessionid",  S_IRUGO, sessionid),
+	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
-	REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
+	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, tgid_io_accounting),
+	INF("io",	S_IRUGO, proc_tgid_io_accounting),
 #endif
 };
 
@@ -2801,66 +2798,66 @@ out_no_task:
  * Tasks
  */
 static const struct pid_entry tid_base_stuff[] = {
-	DIR("fd",        S_IRUSR|S_IXUSR, fd),
-	DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
-	REG("environ",   S_IRUSR, environ),
-	INF("auxv",      S_IRUSR, pid_auxv),
-	ONE("status",    S_IRUGO, pid_status),
-	ONE("personality", S_IRUSR, pid_personality),
-	INF("limits",	 S_IRUSR, pid_limits),
+	DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+	REG("environ",   S_IRUSR, proc_environ_operations),
+	INF("auxv",      S_IRUSR, proc_pid_auxv),
+	ONE("status",    S_IRUGO, proc_pid_status),
+	ONE("personality", S_IRUSR, proc_pid_personality),
+	INF("limits",	 S_IRUSR, proc_pid_limits),
 #ifdef CONFIG_SCHED_DEBUG
-	REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
+	REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
 #endif
 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
-	INF("syscall",   S_IRUSR, pid_syscall),
+	INF("syscall",   S_IRUSR, proc_pid_syscall),
 #endif
-	INF("cmdline",   S_IRUGO, pid_cmdline),
-	ONE("stat",      S_IRUGO, tid_stat),
-	ONE("statm",     S_IRUGO, pid_statm),
-	REG("maps",      S_IRUGO, maps),
+	INF("cmdline",   S_IRUGO, proc_pid_cmdline),
+	ONE("stat",      S_IRUGO, proc_tid_stat),
+	ONE("statm",     S_IRUGO, proc_pid_statm),
+	REG("maps",      S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
-	REG("numa_maps", S_IRUGO, numa_maps),
+	REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
 #endif
-	REG("mem",       S_IRUSR|S_IWUSR, mem),
-	LNK("cwd",       cwd),
-	LNK("root",      root),
-	LNK("exe",       exe),
-	REG("mounts",    S_IRUGO, mounts),
-	REG("mountinfo",  S_IRUGO, mountinfo),
+	REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
+	LNK("cwd",       proc_cwd_link),
+	LNK("root",      proc_root_link),
+	LNK("exe",       proc_exe_link),
+	REG("mounts",    S_IRUGO, proc_mounts_operations),
+	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
 #ifdef CONFIG_PROC_PAGE_MONITOR
-	REG("clear_refs", S_IWUSR, clear_refs),
-	REG("smaps",     S_IRUGO, smaps),
-	REG("pagemap",    S_IRUSR, pagemap),
+	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
+	REG("smaps",     S_IRUGO, proc_smaps_operations),
+	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
 #endif
 #ifdef CONFIG_SECURITY
-	DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
+	DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
 #endif
 #ifdef CONFIG_KALLSYMS
-	INF("wchan",     S_IRUGO, pid_wchan),
+	INF("wchan",     S_IRUGO, proc_pid_wchan),
 #endif
 #ifdef CONFIG_SCHEDSTATS
-	INF("schedstat", S_IRUGO, pid_schedstat),
+	INF("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
 #ifdef CONFIG_LATENCYTOP
-	REG("latency",  S_IRUGO, lstats),
+	REG("latency",  S_IRUGO, proc_lstats_operations),
 #endif
 #ifdef CONFIG_PROC_PID_CPUSET
-	REG("cpuset",    S_IRUGO, cpuset),
+	REG("cpuset",    S_IRUGO, proc_cpuset_operations),
 #endif
 #ifdef CONFIG_CGROUPS
-	REG("cgroup",  S_IRUGO, cgroup),
+	REG("cgroup",  S_IRUGO, proc_cgroup_operations),
 #endif
-	INF("oom_score", S_IRUGO, oom_score),
-	REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
+	INF("oom_score", S_IRUGO, proc_oom_score),
+	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
 #ifdef CONFIG_AUDITSYSCALL
-	REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
-	REG("sessionid",  S_IRUSR, sessionid),
+	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
+	REG("sessionid",  S_IRUSR, proc_sessionid_operations),
 #endif
 #ifdef CONFIG_FAULT_INJECTION
-	REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
+	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
-	INF("io",	S_IRUGO, tid_io_accounting),
+	INF("io",	S_IRUGO, proc_tid_io_accounting),
 #endif
 };
 
-- 
cgit v1.2.3


From 2ec220e27f5040aec1e88901c1b6ea3d135787ad Mon Sep 17 00:00:00 2001
From: Ken Chen <kenchen@google.com>
Date: Mon, 10 Nov 2008 11:26:08 +0300
Subject: proc: add /proc/*/stack

/proc/*/stack adds the ability to query a task's stack trace. It is more
useful than /proc/*/wchan as it provides a full stack trace instead of a
single entry. Example output:

	$ cat /proc/self/stack
	[<c010a271>] save_stack_trace_tsk+0x17/0x35
	[<c01827b4>] proc_pid_stack+0x4a/0x76
	[<c018312d>] proc_single_show+0x4a/0x5e
	[<c016bdec>] seq_read+0xf3/0x29f
	[<c015a004>] vfs_read+0x6d/0x91
	[<c015a0c1>] sys_read+0x3b/0x60
	[<c0102eda>] syscall_call+0x7/0xb
	[<ffffffff>] 0xffffffff

[add save_stack_trace_tsk() on mips, ACK Ralf --adobriyan]
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/base.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce7a6da1b6a..eb7b4654d6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -65,6 +65,7 @@
 #include <linux/mm.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
+#include <linux/stacktrace.h>
 #include <linux/resource.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -337,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 }
 #endif /* CONFIG_KALLSYMS */
 
+#ifdef CONFIG_STACKTRACE
+
+#define MAX_STACK_TRACE_DEPTH	64	/* max stack entries captured per call */
+/* Provides /proc/PID/stack: prints "[<addr>] symbol" per saved stack entry. */
+static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
+			  struct pid *pid, struct task_struct *task)
+{
+	struct stack_trace trace;
+	unsigned long *entries;
+	int i;
+
+	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	trace.nr_entries	= 0;
+	trace.max_entries	= MAX_STACK_TRACE_DEPTH;
+	trace.entries		= entries;
+	trace.skip		= 0;
+	save_stack_trace_tsk(task, &trace);
+
+	for (i = 0; i < trace.nr_entries; i++) {
+		seq_printf(m, "[<%p>] %pS\n",
+			   (void *)entries[i], (void *)entries[i]);
+	}
+	kfree(entries);
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Provides /proc/PID/schedstat
@@ -2500,6 +2532,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_KALLSYMS
 	INF("wchan",      S_IRUGO, proc_pid_wchan),
 #endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat",  S_IRUGO, proc_pid_schedstat),
 #endif
@@ -2835,6 +2870,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_KALLSYMS
 	INF("wchan",     S_IRUGO, proc_pid_wchan),
 #endif
+#ifdef CONFIG_STACKTRACE
+	ONE("stack",      S_IRUSR, proc_pid_stack),
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, proc_pid_schedstat),
 #endif
-- 
cgit v1.2.3


From dfe6b7d9406c631d697f8bbe1eae5569b808154f Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 30 Dec 2008 18:49:13 +0300
Subject: proc: fix sparse warning

fs/proc/base.c:312:4: warning: do-while statement is not a compound statement

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index eb7b4654d6a..a9ccc125273 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -306,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
 	struct mm_struct *mm = get_task_mm(task);
 	if (mm) {
 		unsigned int nwords = 0;
-		do
+		do {
 			nwords += 2;
-		while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+		} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
 		res = nwords * sizeof(mm->saved_auxv[0]);
 		if (res > PAGE_SIZE)
 			res = PAGE_SIZE;
-- 
cgit v1.2.3


From 230e40fbda242544389a5428a2efac568178ddfe Mon Sep 17 00:00:00 2001
From: WANG Cong <wangcong@zeuux.org>
Date: Tue, 30 Dec 2008 19:10:35 +0300
Subject: proc: remove write-only variable in proc_pident_lookup()

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/proc/base.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a9ccc125273..0bc9ca03b91 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2001,13 +2001,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
 					 const struct pid_entry *ents,
 					 unsigned int nents)
 {
-	struct inode *inode;
 	struct dentry *error;
 	struct task_struct *task = get_proc_task(dir);
 	const struct pid_entry *p, *last;
 
 	error = ERR_PTR(-ENOENT);
-	inode = NULL;
 
 	if (!task)
 		goto out_no_task;
-- 
cgit v1.2.3


From 4ac6032d6c92f0ac65cf5bc56b68557b3f099b66 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Sat, 18 Oct 2008 19:11:42 -0700
Subject: ocfs2: Field prefixes for the xattr_bucket structure

The ocfs2_xattr_bucket structure keeps track of the buffers for one
xattr bucket.  Let's prefix the fields for easier code navigation.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 100 +++++++++++++++++++++++++++----------------------------
 1 file changed, 50 insertions(+), 50 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade1..9c0ee42eb93 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -61,8 +61,8 @@ struct ocfs2_xattr_def_value_root {
 };
 
 struct ocfs2_xattr_bucket {
-	struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
-	struct ocfs2_xattr_header *xh;
+	struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+	struct ocfs2_xattr_header *bu_xh;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -795,11 +795,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								xs->bucket.xh,
+								xs->bucket.bu_xh,
 								i,
 								&block_off,
 								&name_offset);
-			xs->base = xs->bucket.bhs[block_off]->b_data;
+			xs->base = xs->bucket.bu_bhs[block_off]->b_data;
 		}
 		if (ocfs2_xattr_is_local(xs->here)) {
 			memcpy(buffer, (void *)xs->base +
@@ -818,7 +818,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	ret = size;
 cleanup:
 	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
-		brelse(xs->bucket.bhs[i]);
+		brelse(xs->bucket.bu_bhs[i]);
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
 	brelse(xs->xattr_bh);
@@ -2032,7 +2032,7 @@ cleanup:
 	brelse(di_bh);
 	brelse(xbs.xattr_bh);
 	for (i = 0; i < blk_per_bucket; i++)
-		brelse(xbs.bucket.bhs[i]);
+		brelse(xbs.bucket.bu_bhs[i]);
 
 	return ret;
 }
@@ -2276,13 +2276,13 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		lower_bh = bh;
 		bh = NULL;
 	}
-	xs->bucket.bhs[0] = lower_bh;
-	xs->bucket.xh = (struct ocfs2_xattr_header *)
-					xs->bucket.bhs[0]->b_data;
+	xs->bucket.bu_bhs[0] = lower_bh;
+	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)
+					xs->bucket.bu_bhs[0]->b_data;
 	lower_bh = NULL;
 
-	xs->header = xs->bucket.xh;
-	xs->base = xs->bucket.bhs[0]->b_data;
+	xs->header = xs->bucket.bu_xh;
+	xs->base = xs->bucket.bu_bhs[0]->b_data;
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
 	if (found) {
@@ -2290,8 +2290,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		 * If we have found the xattr enty, read all the blocks in
 		 * this bucket.
 		 */
-		ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
+		ret = ocfs2_read_blocks(inode, xs->bucket.bu_bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 		if (ret) {
 			mlog_errno(ret);
@@ -2300,7 +2300,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		xs->here = &xs->header->xh_entries[index];
 		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+		     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr, index);
 	} else
 		ret = -ENODATA;
 
@@ -2370,23 +2370,23 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 
 	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
 		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
-					bucket.bhs, 0);
+					bucket.bu_bhs, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+		bucket.bu_xh = (struct ocfs2_xattr_header *)bucket.bu_bhs[0]->b_data;
 		/*
 		 * The real bucket num in this series of blocks is stored
 		 * in the 1st bucket.
 		 */
 		if (i == 0)
-			num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+			num_buckets = le16_to_cpu(bucket.bu_xh->xh_num_buckets);
 
 		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
 		     (unsigned long long)blkno,
-		     le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+		     le32_to_cpu(bucket.bu_xh->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
 			if (ret) {
@@ -2396,13 +2396,13 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		}
 
 		for (j = 0; j < blk_per_bucket; j++)
-			brelse(bucket.bhs[j]);
+			brelse(bucket.bu_bhs[j]);
 		memset(&bucket, 0, sizeof(bucket));
 	}
 
 out:
 	for (j = 0; j < blk_per_bucket; j++)
-		brelse(bucket.bhs[j]);
+		brelse(bucket.bu_bhs[j]);
 
 	return ret;
 }
@@ -2441,21 +2441,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	int i, block_off, new_offset;
 	const char *prefix, *name;
 
-	for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
-		struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+	for (i = 0 ; i < le16_to_cpu(bucket->bu_xh->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket->bu_xh->xh_entries[i];
 		type = ocfs2_xattr_get_type(entry);
 		prefix = ocfs2_xattr_prefix(type);
 
 		if (prefix) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								bucket->xh,
+								bucket->bu_xh,
 								i,
 								&block_off,
 								&new_offset);
 			if (ret)
 				break;
 
-			name = (const char *)bucket->bhs[block_off]->b_data +
+			name = (const char *)bucket->bu_bhs[block_off]->b_data +
 				new_offset;
 			ret = ocfs2_xattr_list_entry(xl->buffer,
 						     xl->buffer_size,
@@ -2626,10 +2626,10 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 	int i, blocksize = inode->i_sb->s_blocksize;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-	xs->bucket.bhs[0] = new_bh;
+	xs->bucket.bu_bhs[0] = new_bh;
 	get_bh(new_bh);
-	xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
-	xs->header = xs->bucket.xh;
+	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)xs->bucket.bu_bhs[0]->b_data;
+	xs->header = xs->bucket.bu_xh;
 
 	xs->base = new_bh->b_data;
 	xs->end = xs->base + inode->i_sb->s_blocksize;
@@ -2637,8 +2637,8 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 	if (!xs->not_found) {
 		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
 			ret = ocfs2_read_blocks(inode,
-					xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					xs->bucket.bu_bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 			if (ret) {
 				mlog_errno(ret);
@@ -2835,7 +2835,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	size_t end, offset, len, value_len;
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
-	u64 blkno = bucket->bhs[0]->b_blocknr;
+	u64 blkno = bucket->bu_bhs[0]->b_blocknr;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 xh_free_start;
 	size_t blocksize = inode->i_sb->s_blocksize;
@@ -3929,7 +3929,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 	int block_off = offs >> inode->i_sb->s_blocksize_bits;
 
 	offs = offs % inode->i_sb->s_blocksize;
-	return bucket->bhs[block_off]->b_data + offs;
+	return bucket->bu_bhs[block_off]->b_data + offs;
 }
 
 /*
@@ -4124,12 +4124,12 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->value_len, xi->name_index,
-	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+	     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr);
 
-	if (!xs->bucket.bhs[1]) {
+	if (!xs->bucket.bu_bhs[1]) {
 		ret = ocfs2_read_blocks(inode,
-					xs->bucket.bhs[0]->b_blocknr + 1,
-					blk_per_bucket - 1, &xs->bucket.bhs[1],
+					xs->bucket.bu_bhs[0]->b_blocknr + 1,
+					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 		if (ret) {
 			mlog_errno(ret);
@@ -4146,7 +4146,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	}
 
 	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+		ret = ocfs2_journal_access(handle, inode, xs->bucket.bu_bhs[i],
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -4158,7 +4158,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 
 	/*Only dirty the blocks we have touched in set xattr. */
 	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
-						xs->bucket.bhs, blk_per_bucket);
+						xs->bucket.bu_bhs, blk_per_bucket);
 	if (ret)
 		mlog_errno(ret);
 out:
@@ -4272,10 +4272,10 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 	struct ocfs2_xattr_entry *xe = xs->here;
 	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
 
-	BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+	BUG_ON(!xs->bucket.bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
 
 	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bu_bhs[0],
 						offset, len);
 	if (ret)
 		mlog_errno(ret);
@@ -4395,7 +4395,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 					 struct ocfs2_xattr_search *xs)
 {
 	handle_t *handle = NULL;
-	struct ocfs2_xattr_header *xh = xs->bucket.xh;
+	struct ocfs2_xattr_header *xh = xs->bucket.bu_xh;
 	struct ocfs2_xattr_entry *last = &xh->xh_entries[
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
@@ -4407,7 +4407,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 		return;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+	ret = ocfs2_journal_access(handle, inode, xs->bucket.bu_bhs[0],
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -4420,7 +4420,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
 	le16_add_cpu(&xh->xh_count, -1);
 
-	ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+	ret = ocfs2_journal_dirty(handle, xs->bucket.bu_bhs[0]);
 	if (ret < 0)
 		mlog_errno(ret);
 out_commit:
@@ -4530,7 +4530,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 					      struct ocfs2_xattr_bucket *bucket,
 					      const char *name)
 {
-	struct ocfs2_xattr_header *xh = bucket->xh;
+	struct ocfs2_xattr_header *xh = bucket->bu_xh;
 	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
 
 	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +4540,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 	    xh->xh_entries[0].xe_name_hash) {
 		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
 		     "hash = %u\n",
-		     (unsigned long long)bucket->bhs[0]->b_blocknr,
+		     (unsigned long long)bucket->bu_bhs[0]->b_blocknr,
 		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
 		return -ENOSPC;
 	}
@@ -4574,7 +4574,7 @@ try_again:
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
-			(unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+			(unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr,
 			header_size);
 
 	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,7 +4614,7 @@ try_again:
 	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
 	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
 	     " %u\n", xs->not_found,
-	     (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+	     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr,
 	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
 	     le16_to_cpu(xh->xh_name_value_len));
 
@@ -4667,14 +4667,14 @@ try_again:
 
 		ret = ocfs2_add_new_xattr_bucket(inode,
 						 xs->xattr_bh,
-						 xs->bucket.bhs[0]);
+						 xs->bucket.bu_bhs[0]);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
 		for (i = 0; i < blk_per_bucket; i++)
-			brelse(xs->bucket.bhs[i]);
+			brelse(xs->bucket.bu_bhs[i]);
 
 		memset(&xs->bucket, 0, sizeof(xs->bucket));
 
@@ -4700,7 +4700,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 					void *para)
 {
 	int ret = 0;
-	struct ocfs2_xattr_header *xh = bucket->xh;
+	struct ocfs2_xattr_header *xh = bucket->bu_xh;
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
 
@@ -4710,7 +4710,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 			continue;
 
 		ret = ocfs2_xattr_bucket_value_truncate(inode,
-							bucket->bhs[0],
+							bucket->bu_bhs[0],
 							i, 0);
 		if (ret) {
 			mlog_errno(ret);
-- 
cgit v1.2.3


From 9c7759aa670918a48f0c6e06779cd20f2781a2ac Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 16:21:03 -0700
Subject: ocfs2: Convenient access to an xattr bucket's block number.

The xattr code often wants to know the block number of an xattr bucket.
This is usually found by dereferencing the first bh hanging off of the
ocfs2_xattr_bucket structure.  Rather than do this all the time, let's
provide a nice little macro.  The idea is ripped from the ocfs2_path
code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9c0ee42eb93..3cf8e80b2b6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -154,6 +154,8 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 	return len / sizeof(struct ocfs2_xattr_entry);
 }
 
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -2290,7 +2292,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		 * If we have found the xattr enty, read all the blocks in
 		 * this bucket.
 		 */
-		ret = ocfs2_read_blocks(inode, xs->bucket.bu_bhs[0]->b_blocknr + 1,
+		ret = ocfs2_read_blocks(inode, bucket_blkno(&xs->bucket) + 1,
 					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 		if (ret) {
@@ -2300,7 +2302,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		xs->here = &xs->header->xh_entries[index];
 		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr, index);
+		     (unsigned long long)bucket_blkno(&xs->bucket), index);
 	} else
 		ret = -ENODATA;
 
@@ -2637,7 +2639,7 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 	if (!xs->not_found) {
 		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
 			ret = ocfs2_read_blocks(inode,
-					xs->bucket.bu_bhs[0]->b_blocknr + 1,
+					bucket_blkno(&xs->bucket) + 1,
 					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 			if (ret) {
@@ -2835,7 +2837,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	size_t end, offset, len, value_len;
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
-	u64 blkno = bucket->bu_bhs[0]->b_blocknr;
+	u64 blkno = bucket_blkno(bucket);
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 xh_free_start;
 	size_t blocksize = inode->i_sb->s_blocksize;
@@ -4124,11 +4126,11 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->value_len, xi->name_index,
-	     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr);
+	     (unsigned long long)bucket_blkno(&xs->bucket));
 
 	if (!xs->bucket.bu_bhs[1]) {
 		ret = ocfs2_read_blocks(inode,
-					xs->bucket.bu_bhs[0]->b_blocknr + 1,
+					bucket_blkno(&xs->bucket) + 1,
 					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
 					0);
 		if (ret) {
@@ -4540,7 +4542,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 	    xh->xh_entries[0].xe_name_hash) {
 		mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
 		     "hash = %u\n",
-		     (unsigned long long)bucket->bu_bhs[0]->b_blocknr,
+		     (unsigned long long)bucket_blkno(bucket),
 		     le32_to_cpu(xh->xh_entries[0].xe_name_hash));
 		return -ENOSPC;
 	}
@@ -4574,7 +4576,7 @@ try_again:
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
-			(unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr,
+			(unsigned long long)bucket_blkno(&xs->bucket),
 			header_size);
 
 	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,7 +4616,7 @@ try_again:
 	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
 	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
 	     " %u\n", xs->not_found,
-	     (unsigned long long)xs->bucket.bu_bhs[0]->b_blocknr,
+	     (unsigned long long)bucket_blkno(&xs->bucket),
 	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
 	     le16_to_cpu(xh->xh_name_value_len));
 
-- 
cgit v1.2.3


From 51def39f0cabd46131c7c4df08751cb0cb9433d1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 16:57:21 -0700
Subject: ocfs2: Convenient access to xattr bucket data blocks.

The xattr code often wants to access the data pointer for blocks in an
xattr bucket.  This is usually found by dereferencing the bh array
hanging off of the ocfs2_xattr_bucket structure.  Rather than do this
all the time, let's provide a nice little macro.  The idea is ripped
from the ocfs2_path code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3cf8e80b2b6..8594df36640 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -155,6 +155,7 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 }
 
 #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
@@ -801,7 +802,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 								i,
 								&block_off,
 								&name_offset);
-			xs->base = xs->bucket.bu_bhs[block_off]->b_data;
+			xs->base = bucket_block(&xs->bucket, block_off);
 		}
 		if (ocfs2_xattr_is_local(xs->here)) {
 			memcpy(buffer, (void *)xs->base +
@@ -2280,11 +2281,11 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	}
 	xs->bucket.bu_bhs[0] = lower_bh;
 	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)
-					xs->bucket.bu_bhs[0]->b_data;
+					bucket_block(&xs->bucket, 0);
 	lower_bh = NULL;
 
 	xs->header = xs->bucket.bu_xh;
-	xs->base = xs->bucket.bu_bhs[0]->b_data;
+	xs->base = bucket_block(&xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
 	if (found) {
@@ -2378,7 +2379,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 			goto out;
 		}
 
-		bucket.bu_xh = (struct ocfs2_xattr_header *)bucket.bu_bhs[0]->b_data;
+		bucket.bu_xh = (struct ocfs2_xattr_header *)bucket_block(&bucket, 0);
 		/*
 		 * The real bucket num in this series of blocks is stored
 		 * in the 1st bucket.
@@ -2457,7 +2458,7 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 			if (ret)
 				break;
 
-			name = (const char *)bucket->bu_bhs[block_off]->b_data +
+			name = (const char *)bucket_block(bucket, block_off) +
 				new_offset;
 			ret = ocfs2_xattr_list_entry(xl->buffer,
 						     xl->buffer_size,
@@ -2630,7 +2631,7 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 
 	xs->bucket.bu_bhs[0] = new_bh;
 	get_bh(new_bh);
-	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)xs->bucket.bu_bhs[0]->b_data;
+	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)bucket_block(&xs->bucket, 0);
 	xs->header = xs->bucket.bu_xh;
 
 	xs->base = new_bh->b_data;
@@ -3931,7 +3932,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
 	int block_off = offs >> inode->i_sb->s_blocksize_bits;
 
 	offs = offs % inode->i_sb->s_blocksize;
-	return bucket->bu_bhs[block_off]->b_data + offs;
+	return bucket_block(bucket, block_off) + offs;
 }
 
 /*
-- 
cgit v1.2.3


From 3e6329463e3a5c311e1d607ff3db735a18b6d67a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 17:04:49 -0700
Subject: ocfs2: Convenient access to an xattr bucket's header.

The xattr code often wants to access the ocfs2_xattr_header at the start
of a bucket.  Rather than walk the pointer chains, let's just create
another nice macro.  As a side benefit, we can get rid of the mostly
spurious ->bu_xh element on the bucket structure.  The idea is ripped
from the ocfs2_path code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8594df36640..1b77302b54f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -62,7 +62,6 @@ struct ocfs2_xattr_def_value_root {
 
 struct ocfs2_xattr_bucket {
 	struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
-	struct ocfs2_xattr_header *bu_xh;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -156,6 +155,7 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 
 #define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
 #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
@@ -798,7 +798,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								xs->bucket.bu_xh,
+								bucket_xh(&xs->bucket),
 								i,
 								&block_off,
 								&name_offset);
@@ -2280,11 +2280,9 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		bh = NULL;
 	}
 	xs->bucket.bu_bhs[0] = lower_bh;
-	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)
-					bucket_block(&xs->bucket, 0);
 	lower_bh = NULL;
 
-	xs->header = xs->bucket.bu_xh;
+	xs->header = bucket_xh(&xs->bucket);
 	xs->base = bucket_block(&xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
@@ -2379,17 +2377,16 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 			goto out;
 		}
 
-		bucket.bu_xh = (struct ocfs2_xattr_header *)bucket_block(&bucket, 0);
 		/*
 		 * The real bucket num in this series of blocks is stored
 		 * in the 1st bucket.
 		 */
 		if (i == 0)
-			num_buckets = le16_to_cpu(bucket.bu_xh->xh_num_buckets);
+			num_buckets = le16_to_cpu(bucket_xh(&bucket)->xh_num_buckets);
 
 		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
 		     (unsigned long long)blkno,
-		     le32_to_cpu(bucket.bu_xh->xh_entries[0].xe_name_hash));
+		     le32_to_cpu(bucket_xh(&bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, &bucket, para);
 			if (ret) {
@@ -2444,14 +2441,14 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	int i, block_off, new_offset;
 	const char *prefix, *name;
 
-	for (i = 0 ; i < le16_to_cpu(bucket->bu_xh->xh_count); i++) {
-		struct ocfs2_xattr_entry *entry = &bucket->bu_xh->xh_entries[i];
+	for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+		struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
 		type = ocfs2_xattr_get_type(entry);
 		prefix = ocfs2_xattr_prefix(type);
 
 		if (prefix) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								bucket->bu_xh,
+								bucket_xh(bucket),
 								i,
 								&block_off,
 								&new_offset);
@@ -2631,8 +2628,7 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 
 	xs->bucket.bu_bhs[0] = new_bh;
 	get_bh(new_bh);
-	xs->bucket.bu_xh = (struct ocfs2_xattr_header *)bucket_block(&xs->bucket, 0);
-	xs->header = xs->bucket.bu_xh;
+	xs->header = bucket_xh(&xs->bucket);
 
 	xs->base = new_bh->b_data;
 	xs->end = xs->base + inode->i_sb->s_blocksize;
@@ -4398,7 +4394,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 					 struct ocfs2_xattr_search *xs)
 {
 	handle_t *handle = NULL;
-	struct ocfs2_xattr_header *xh = xs->bucket.bu_xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(&xs->bucket);
 	struct ocfs2_xattr_entry *last = &xh->xh_entries[
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
@@ -4533,7 +4529,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 					      struct ocfs2_xattr_bucket *bucket,
 					      const char *name)
 {
-	struct ocfs2_xattr_header *xh = bucket->bu_xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
 
 	if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4703,7 +4699,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 					void *para)
 {
 	int ret = 0;
-	struct ocfs2_xattr_header *xh = bucket->bu_xh;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
 
-- 
cgit v1.2.3


From 6dde41d9e7ba62f84cd7e91c0e993500af32ceb6 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 17:16:48 -0700
Subject: ocfs2: Provide a wrapper to brelse() xattr bucket buffers.

A common theme is walking all the buffer heads on an ocfs2_xattr_bucket
and releasing them.  Let's wrap that.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1b77302b54f..3478ad177b7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -157,6 +157,17 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
 #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
 
+static void ocfs2_xattr_bucket_relse(struct inode *inode,
+				     struct ocfs2_xattr_bucket *bucket)
+{
+	int i, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	for (i = 0; i < blks; i++) {
+		brelse(bucket->bu_bhs[i]);
+		bucket->bu_bhs[i] = NULL;
+	}
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -820,8 +831,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	}
 	ret = size;
 cleanup:
-	for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
-		brelse(xs->bucket.bu_bhs[i]);
+	ocfs2_xattr_bucket_relse(inode, &xs->bucket);
 	memset(&xs->bucket, 0, sizeof(xs->bucket));
 
 	brelse(xs->xattr_bh);
@@ -1932,7 +1942,6 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	int ret;
-	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -2034,8 +2043,7 @@ cleanup:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 	brelse(xbs.xattr_bh);
-	for (i = 0; i < blk_per_bucket; i++)
-		brelse(xbs.bucket.bu_bhs[i]);
+	ocfs2_xattr_bucket_relse(inode, &xbs.bucket);
 
 	return ret;
 }
@@ -2358,7 +2366,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 				       xattr_bucket_func *func,
 				       void *para)
 {
-	int i, j, ret = 0;
+	int i, ret = 0;
 	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
 	u32 num_buckets = clusters * bpc;
@@ -2395,14 +2403,12 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 			}
 		}
 
-		for (j = 0; j < blk_per_bucket; j++)
-			brelse(bucket.bu_bhs[j]);
+		ocfs2_xattr_bucket_relse(inode, &bucket);
 		memset(&bucket, 0, sizeof(bucket));
 	}
 
 out:
-	for (j = 0; j < blk_per_bucket; j++)
-		brelse(bucket.bu_bhs[j]);
+	ocfs2_xattr_bucket_relse(inode, &bucket);
 
 	return ret;
 }
@@ -4554,11 +4560,10 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	u16 count, header_size, xh_free_start;
-	int i, free, max_free, need, old;
+	int free, max_free, need, old;
 	size_t value_size = 0, name_len = strlen(xi->name);
 	size_t blocksize = inode->i_sb->s_blocksize;
 	int ret, allocation = 0;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	mlog_entry("Set xattr %s in xattr index block\n", xi->name);
 
@@ -4672,9 +4677,7 @@ try_again:
 			goto out;
 		}
 
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(xs->bucket.bu_bhs[i]);
-
+		ocfs2_xattr_bucket_relse(inode, &xs->bucket);
 		memset(&xs->bucket, 0, sizeof(xs->bucket));
 
 		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
-- 
cgit v1.2.3


From 784b816a9198dc3782c97cde8ddcf52fecdf1797 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 17:33:40 -0700
Subject: ocfs2: Improve ocfs2_read_xattr_bucket().

The ocfs2_read_xattr_bucket() function would read an xattr bucket into a
list of buffer heads.  However, we have a nice ocfs2_xattr_bucket
structure.  Let's have it fill that out instead.

In addition, ocfs2_read_xattr_bucket() would initialize buffer heads for
a bucket that's never been on disk before.  That's confusing.  Let's
call that functionality ocfs2_init_xattr_bucket().

The functions ocfs2_cp_xattr_bucket() and ocfs2_divide_xattr_bucket() are
updated to use the ocfs2_xattr_bucket structure rather than raw bh
lists.  That way they can use the new read/init calls.  In addition,
they drop the wasted read of an existing target bucket.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 165 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 79 insertions(+), 86 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3478ad177b7..fa13fa48878 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -168,6 +168,48 @@ static void ocfs2_xattr_bucket_relse(struct inode *inode,
 	}
 }
 
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read.  We just need the buffer_heads.  Don't call this for
+ * buckets that are already on disk.  ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno)
+{
+	int i, rc = 0;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	for (i = 0; i < blks; i++) {
+		bucket->bu_bhs[i] = sb_getblk(inode->i_sb, xb_blkno + i);
+		if (!bucket->bu_bhs[i]) {
+			rc = -EIO;
+			mlog_errno(rc);
+			break;
+		}
+
+		ocfs2_set_new_buffer_uptodate(inode, bucket->bu_bhs[i]);
+	}
+
+	if (rc)
+		ocfs2_xattr_bucket_relse(inode, bucket);
+	return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+				   struct ocfs2_xattr_bucket *bucket,
+				   u64 xb_blkno)
+{
+	int rc, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	rc = ocfs2_read_blocks(inode, xb_blkno, blks, bucket->bu_bhs, 0);
+	if (rc)
+		ocfs2_xattr_bucket_relse(inode, bucket);
+	return rc;
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -3097,31 +3139,6 @@ out:
 	return ret;
 }
 
-static int ocfs2_read_xattr_bucket(struct inode *inode,
-				   u64 blkno,
-				   struct buffer_head **bhs,
-				   int new)
-{
-	int ret = 0;
-	u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
-	if (!new)
-		return ocfs2_read_blocks(inode, blkno,
-					 blk_per_bucket, bhs, 0);
-
-	for (i = 0; i < blk_per_bucket; i++) {
-		bhs[i] = sb_getblk(inode->i_sb, blkno + i);
-		if (bhs[i] == NULL) {
-			ret = -EIO;
-			mlog_errno(ret);
-			break;
-		}
-		ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
-	}
-
-	return ret;
-}
-
 /*
  * Find the suitable pos when we divide a bucket into 2.
  * We have to make sure the xattrs with the same hash value exist
@@ -3184,7 +3201,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	int ret, i;
 	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_bucket s_bucket, t_bucket;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
@@ -3192,37 +3209,34 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
 	     (unsigned long long)blk, (unsigned long long)new_blk);
 
-	s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!s_bhs)
-		return -ENOMEM;
+	memset(&s_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
+	memset(&t_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
 
-	ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+	ret = ocfs2_read_xattr_bucket(inode, &s_bucket, blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+	ret = ocfs2_journal_access(handle, inode, s_bucket.bu_bhs[0],
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
-	if (!t_bhs) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+	/*
+	 * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(inode, &t_bucket, new_blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+		ret = ocfs2_journal_access(handle, inode, t_bucket.bu_bhs[i],
 					   new_bucket_head ?
 					   OCFS2_JOURNAL_ACCESS_CREATE :
 					   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -3232,7 +3246,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		}
 	}
 
-	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	xh = bucket_xh(&s_bucket);
 	count = le16_to_cpu(xh->xh_count);
 	start = ocfs2_xattr_find_divide_pos(xh);
 
@@ -3245,9 +3259,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		 * that of the last entry in the previous bucket.
 		 */
 		for (i = 0; i < blk_per_bucket; i++)
-			memset(t_bhs[i]->b_data, 0, blocksize);
+			memset(bucket_block(&t_bucket, i), 0, blocksize);
 
-		xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+		xh = bucket_xh(&t_bucket);
 		xh->xh_free_start = cpu_to_le16(blocksize);
 		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
 		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3257,10 +3271,11 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 
 	/* copy the whole bucket to the new first. */
 	for (i = 0; i < blk_per_bucket; i++)
-		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+		memcpy(bucket_block(&t_bucket, i), bucket_block(&s_bucket, i),
+		       blocksize);
 
 	/* update the new bucket. */
-	xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+	xh = bucket_xh(&t_bucket);
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3325,7 +3340,7 @@ set_num_buckets:
 		xh->xh_num_buckets = 0;
 
 	for (i = 0; i < blk_per_bucket; i++) {
-		ocfs2_journal_dirty(handle, t_bhs[i]);
+		ocfs2_journal_dirty(handle, t_bucket.bu_bhs[i]);
 		if (ret)
 			mlog_errno(ret);
 	}
@@ -3342,29 +3357,20 @@ set_num_buckets:
 	if (start == count)
 		goto out;
 
-	xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+	xh = bucket_xh(&s_bucket);
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
 	xh->xh_count = cpu_to_le16(start);
 	xh->xh_free_start = cpu_to_le16(name_offset);
 	xh->xh_name_value_len = cpu_to_le16(name_value_len);
 
-	ocfs2_journal_dirty(handle, s_bhs[0]);
+	ocfs2_journal_dirty(handle, s_bucket.bu_bhs[0]);
 	if (ret)
 		mlog_errno(ret);
 
 out:
-	if (s_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(s_bhs[i]);
-	}
-	kfree(s_bhs);
-
-	if (t_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(t_bhs[i]);
-	}
-	kfree(t_bhs);
+	ocfs2_xattr_bucket_relse(inode, &s_bucket);
+	ocfs2_xattr_bucket_relse(inode, &t_bucket);
 
 	return ret;
 }
@@ -3384,7 +3390,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	int ret, i;
 	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int blocksize = inode->i_sb->s_blocksize;
-	struct buffer_head **s_bhs, **t_bhs = NULL;
+	struct ocfs2_xattr_bucket s_bucket, t_bucket;
 
 	BUG_ON(s_blkno == t_blkno);
 
@@ -3392,28 +3398,23 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
 	     t_is_new);
 
-	s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!s_bhs)
-		return -ENOMEM;
+	memset(&s_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
+	memset(&t_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
 
-	ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+	ret = ocfs2_read_xattr_bucket(inode, &s_bucket, s_blkno);
 	if (ret)
 		goto out;
 
-	t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!t_bhs) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+	/*
+	 * Even if !t_is_new, we're overwriting t_bucket.  Thus,
+	 * there's no need to read it.
+	 */
+	ret = ocfs2_init_xattr_bucket(inode, &t_bucket, t_blkno);
 	if (ret)
 		goto out;
 
 	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+		ret = ocfs2_journal_access(handle, inode, t_bucket.bu_bhs[i],
 					   t_is_new ?
 					   OCFS2_JOURNAL_ACCESS_CREATE :
 					   OCFS2_JOURNAL_ACCESS_WRITE);
@@ -3422,22 +3423,14 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	}
 
 	for (i = 0; i < blk_per_bucket; i++) {
-		memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
-		ocfs2_journal_dirty(handle, t_bhs[i]);
+		memcpy(bucket_block(&t_bucket, i), bucket_block(&s_bucket, i),
+		       blocksize);
+		ocfs2_journal_dirty(handle, t_bucket.bu_bhs[i]);
 	}
 
 out:
-	if (s_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(s_bhs[i]);
-	}
-	kfree(s_bhs);
-
-	if (t_bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(t_bhs[i]);
-	}
-	kfree(t_bhs);
+	ocfs2_xattr_bucket_relse(inode, &s_bucket);
+	ocfs2_xattr_bucket_relse(inode, &t_bucket);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 1224be020f62ada3e19822feeac3840abf80de3e Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 18:47:33 -0700
Subject: ocfs2: Wrap journal_access/journal_dirty for xattr buckets.

A common action is to call ocfs2_journal_access() and
ocfs2_journal_dirty() on the buffer heads of an xattr bucket.  Let's
create nice wrappers.

While we're there, let's drop the places that try to be smart by writing
only the first and last blocks of a bucket.  A bucket is contiguous, so
writing the whole thing is actually more efficient.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 140 +++++++++++++++++++++++++------------------------------
 1 file changed, 64 insertions(+), 76 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fa13fa48878..99aefe4ea75 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -210,6 +210,37 @@ static int ocfs2_read_xattr_bucket(struct inode *inode,
 	return rc;
 }
 
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+					     struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket,
+					     int type)
+{
+	int i, rc = 0;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	for (i = 0; i < blks; i++) {
+		rc = ocfs2_journal_access(handle, inode,
+					  bucket->bu_bhs[i], type);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+	}
+
+	return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+					     struct inode *inode,
+					     struct ocfs2_xattr_bucket *bucket)
+{
+	int i, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	for (i = 0; i < blks; i++)
+		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -3218,8 +3249,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, s_bucket.bu_bhs[0],
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &s_bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3235,15 +3266,13 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bucket.bu_bhs[i],
-					   new_bucket_head ?
-					   OCFS2_JOURNAL_ACCESS_CREATE :
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
+	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &t_bucket,
+						new_bucket_head ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	xh = bucket_xh(&s_bucket);
@@ -3339,11 +3368,7 @@ set_num_buckets:
 	else
 		xh->xh_num_buckets = 0;
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ocfs2_journal_dirty(handle, t_bucket.bu_bhs[i]);
-		if (ret)
-			mlog_errno(ret);
-	}
+	ocfs2_xattr_bucket_journal_dirty(handle, inode, &t_bucket);
 
 	/* store the first_hash of the new bucket. */
 	if (first_hash)
@@ -3364,9 +3389,7 @@ set_num_buckets:
 	xh->xh_free_start = cpu_to_le16(name_offset);
 	xh->xh_name_value_len = cpu_to_le16(name_value_len);
 
-	ocfs2_journal_dirty(handle, s_bucket.bu_bhs[0]);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_xattr_bucket_journal_dirty(handle, inode, &s_bucket);
 
 out:
 	ocfs2_xattr_bucket_relse(inode, &s_bucket);
@@ -3413,20 +3436,18 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	if (ret)
 		goto out;
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, t_bucket.bu_bhs[i],
-					   t_is_new ?
-					   OCFS2_JOURNAL_ACCESS_CREATE :
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret)
-			goto out;
-	}
+	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &t_bucket,
+						t_is_new ?
+						OCFS2_JOURNAL_ACCESS_CREATE :
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret)
+		goto out;
 
 	for (i = 0; i < blk_per_bucket; i++) {
 		memcpy(bucket_block(&t_bucket, i), bucket_block(&s_bucket, i),
 		       blocksize);
-		ocfs2_journal_dirty(handle, t_bucket.bu_bhs[i]);
 	}
+	ocfs2_xattr_bucket_journal_dirty(handle, inode, &t_bucket);
 
 out:
 	ocfs2_xattr_bucket_relse(inode, &s_bucket);
@@ -3799,9 +3820,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 
 	/*
 	 * We will touch all the buckets after the start_bh(include it).
-	 * Add one more bucket and modify the first_bh.
+	 * Then we add one more bucket.
 	 */
-	credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+	credits = end_blk - start_blk + 3 * blk_per_bucket + 1;
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -4077,33 +4098,6 @@ set_new_name_value:
 	return;
 }
 
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
-					     handle_t *handle,
-					     struct ocfs2_xattr_search *xs,
-					     struct buffer_head **bhs,
-					     u16 bh_num)
-{
-	int ret = 0, off, block_off;
-	struct ocfs2_xattr_entry *xe = xs->here;
-
-	/*
-	 * First calculate all the blocks we should journal_access
-	 * and journal_dirty. The first block should always be touched.
-	 */
-	ret = ocfs2_journal_dirty(handle, bhs[0]);
-	if (ret)
-		mlog_errno(ret);
-
-	/* calc the data. */
-	off = le16_to_cpu(xe->xe_name_offset);
-	block_off = off >> inode->i_sb->s_blocksize_bits;
-	ret = ocfs2_journal_dirty(handle, bhs[block_off]);
-	if (ret)
-		mlog_errno(ret);
-
-	return ret;
-}
-
 /*
  * Set the xattr entry in the specified bucket.
  * The bucket is indicated by xs->bucket and it should have the enough
@@ -4115,7 +4109,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 					   u32 name_hash,
 					   int local)
 {
-	int i, ret;
+	int ret;
 	handle_t *handle = NULL;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4143,22 +4137,16 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, xs->bucket.bu_bhs[i],
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &xs->bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+	ocfs2_xattr_bucket_journal_dirty(handle, inode, &xs->bucket);
 
-	/*Only dirty the blocks we have touched in set xattr. */
-	ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
-						xs->bucket.bu_bhs, blk_per_bucket);
-	if (ret)
-		mlog_errno(ret);
 out:
 	ocfs2_commit_trans(osb, handle);
 
@@ -4398,15 +4386,16 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+				   ocfs2_blocks_per_xattr_bucket(inode->i_sb));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
 		return;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, xs->bucket.bu_bhs[0],
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &xs->bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -4418,9 +4407,8 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
 	le16_add_cpu(&xh->xh_count, -1);
 
-	ret = ocfs2_journal_dirty(handle, xs->bucket.bu_bhs[0]);
-	if (ret < 0)
-		mlog_errno(ret);
+	ocfs2_xattr_bucket_journal_dirty(handle, inode, &xs->bucket);
+
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 }
-- 
cgit v1.2.3


From 4980c6daba967124ed6420032960abd2b48412e2 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 18:54:43 -0700
Subject: ocfs2: Copy xattr buckets with a dedicated function.

Now that the places that copy whole buckets are using struct
ocfs2_xattr_bucket, we can do the copy in a dedicated function.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 99aefe4ea75..71d9e7bdd30 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -240,6 +240,19 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
 }
 
+static void ocfs2_xattr_bucket_copy_data(struct inode *inode,
+					 struct ocfs2_xattr_bucket *dest,
+					 struct ocfs2_xattr_bucket *src)
+{
+	int i;
+	int blocksize = inode->i_sb->s_blocksize;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+	for (i = 0; i < blks; i++) {
+		memcpy(bucket_block(dest, i), bucket_block(src, i),
+		       blocksize);
+	}
+}
 
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
@@ -3299,9 +3312,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	}
 
 	/* copy the whole bucket to the new first. */
-	for (i = 0; i < blk_per_bucket; i++)
-		memcpy(bucket_block(&t_bucket, i), bucket_block(&s_bucket, i),
-		       blocksize);
+	ocfs2_xattr_bucket_copy_data(inode, &t_bucket, &s_bucket);
 
 	/* update the new bucket. */
 	xh = bucket_xh(&t_bucket);
@@ -3410,9 +3421,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 				 u64 t_blkno,
 				 int t_is_new)
 {
-	int ret, i;
-	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int blocksize = inode->i_sb->s_blocksize;
+	int ret;
 	struct ocfs2_xattr_bucket s_bucket, t_bucket;
 
 	BUG_ON(s_blkno == t_blkno);
@@ -3443,10 +3452,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	if (ret)
 		goto out;
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		memcpy(bucket_block(&t_bucket, i), bucket_block(&s_bucket, i),
-		       blocksize);
-	}
+	ocfs2_xattr_bucket_copy_data(inode, &t_bucket, &s_bucket);
 	ocfs2_xattr_bucket_journal_dirty(handle, inode, &t_bucket);
 
 out:
-- 
cgit v1.2.3


From ba937127596ec2c61437006741f7d29999284de4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 24 Oct 2008 19:13:20 -0700
Subject: ocfs2: Take ocfs2_xattr_bucket structures off of the stack.

The ocfs2_xattr_bucket structure is a nice abstraction, but it is a bit
large to have on the stack.  Just like ocfs2_path, let's allocate it
with an ocfs2_xattr_bucket_new() function.

We can now store the inode on the bucket, cleaning up all the other
bucket functions.  While we're here, we catch another place or two that
weren't using ocfs2_read_xattr_bucket().

Updates:
- No longer allocating xis.bucket, as it will never be used.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 281 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 166 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 71d9e7bdd30..766494ed6e1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -61,7 +61,14 @@ struct ocfs2_xattr_def_value_root {
 };
 
 struct ocfs2_xattr_bucket {
+	/* The inode these xattrs are associated with */
+	struct inode *bu_inode;
+
+	/* The actual buffers that make up the bucket */
 	struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+	/* How many blocks make up one bucket for this filesystem */
+	int bu_blocks;
 };
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
@@ -97,7 +104,7 @@ struct ocfs2_xattr_search {
 	 */
 	struct buffer_head *xattr_bh;
 	struct ocfs2_xattr_header *header;
-	struct ocfs2_xattr_bucket bucket;
+	struct ocfs2_xattr_bucket *bucket;
 	void *base;
 	void *end;
 	struct ocfs2_xattr_entry *here;
@@ -157,69 +164,91 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
 #define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
 #define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
 
-static void ocfs2_xattr_bucket_relse(struct inode *inode,
-				     struct ocfs2_xattr_bucket *bucket)
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
 {
-	int i, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	struct ocfs2_xattr_bucket *bucket;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-	for (i = 0; i < blks; i++) {
+	BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+	bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+	if (bucket) {
+		bucket->bu_inode = inode;
+		bucket->bu_blocks = blks;
+	}
+
+	return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+	int i;
+
+	for (i = 0; i < bucket->bu_blocks; i++) {
 		brelse(bucket->bu_bhs[i]);
 		bucket->bu_bhs[i] = NULL;
 	}
 }
 
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+	if (bucket) {
+		ocfs2_xattr_bucket_relse(bucket);
+		bucket->bu_inode = NULL;
+		kfree(bucket);
+	}
+}
+
 /*
  * A bucket that has never been written to disk doesn't need to be
  * read.  We just need the buffer_heads.  Don't call this for
  * buckets that are already on disk.  ocfs2_read_xattr_bucket() initializes
  * them fully.
  */
-static int ocfs2_init_xattr_bucket(struct inode *inode,
-				   struct ocfs2_xattr_bucket *bucket,
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 				   u64 xb_blkno)
 {
 	int i, rc = 0;
-	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-	for (i = 0; i < blks; i++) {
-		bucket->bu_bhs[i] = sb_getblk(inode->i_sb, xb_blkno + i);
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+					      xb_blkno + i);
 		if (!bucket->bu_bhs[i]) {
 			rc = -EIO;
 			mlog_errno(rc);
 			break;
 		}
 
-		ocfs2_set_new_buffer_uptodate(inode, bucket->bu_bhs[i]);
+		ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+					      bucket->bu_bhs[i]);
 	}
 
 	if (rc)
-		ocfs2_xattr_bucket_relse(inode, bucket);
+		ocfs2_xattr_bucket_relse(bucket);
 	return rc;
 }
 
 /* Read the xattr bucket at xb_blkno */
-static int ocfs2_read_xattr_bucket(struct inode *inode,
-				   struct ocfs2_xattr_bucket *bucket,
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 				   u64 xb_blkno)
 {
-	int rc, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int rc;
 
-	rc = ocfs2_read_blocks(inode, xb_blkno, blks, bucket->bu_bhs, 0);
+	rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+			       bucket->bu_blocks, bucket->bu_bhs, 0);
 	if (rc)
-		ocfs2_xattr_bucket_relse(inode, bucket);
+		ocfs2_xattr_bucket_relse(bucket);
 	return rc;
 }
 
 static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
-					     struct inode *inode,
 					     struct ocfs2_xattr_bucket *bucket,
 					     int type)
 {
 	int i, rc = 0;
-	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-	for (i = 0; i < blks; i++) {
-		rc = ocfs2_journal_access(handle, inode,
+	for (i = 0; i < bucket->bu_blocks; i++) {
+		rc = ocfs2_journal_access(handle, bucket->bu_inode,
 					  bucket->bu_bhs[i], type);
 		if (rc) {
 			mlog_errno(rc);
@@ -231,24 +260,24 @@ static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
 }
 
 static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
-					     struct inode *inode,
 					     struct ocfs2_xattr_bucket *bucket)
 {
-	int i, blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int i;
 
-	for (i = 0; i < blks; i++)
+	for (i = 0; i < bucket->bu_blocks; i++)
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
 }
 
-static void ocfs2_xattr_bucket_copy_data(struct inode *inode,
-					 struct ocfs2_xattr_bucket *dest,
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
 					 struct ocfs2_xattr_bucket *src)
 {
 	int i;
-	int blocksize = inode->i_sb->s_blocksize;
-	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+	BUG_ON(dest->bu_blocks != src->bu_blocks);
+	BUG_ON(dest->bu_inode != src->bu_inode);
 
-	for (i = 0; i < blks; i++) {
+	for (i = 0; i < src->bu_blocks; i++) {
 		memcpy(bucket_block(dest, i), bucket_block(src, i),
 		       blocksize);
 	}
@@ -869,7 +898,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	size_t size;
 	int ret = -ENODATA, name_offset, name_len, block_off, i;
 
-	memset(&xs->bucket, 0, sizeof(xs->bucket));
+	xs->bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xs->bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto cleanup;
+	}
 
 	ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
 	if (ret) {
@@ -895,11 +929,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
 			ret = ocfs2_xattr_bucket_get_name_value(inode,
-								bucket_xh(&xs->bucket),
+								bucket_xh(xs->bucket),
 								i,
 								&block_off,
 								&name_offset);
-			xs->base = bucket_block(&xs->bucket, block_off);
+			xs->base = bucket_block(xs->bucket, block_off);
 		}
 		if (ocfs2_xattr_is_local(xs->here)) {
 			memcpy(buffer, (void *)xs->base +
@@ -917,8 +951,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
 	}
 	ret = size;
 cleanup:
-	ocfs2_xattr_bucket_relse(inode, &xs->bucket);
-	memset(&xs->bucket, 0, sizeof(xs->bucket));
+	ocfs2_xattr_bucket_free(xs->bucket);
 
 	brelse(xs->xattr_bh);
 	xs->xattr_bh = NULL;
@@ -2047,10 +2080,20 @@ int ocfs2_xattr_set(struct inode *inode,
 	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
 
+	/*
+	 * Only xbs will be used on indexed trees.  xis doesn't need a
+	 * bucket.
+	 */
+	xbs.bucket = ocfs2_xattr_bucket_new(inode);
+	if (!xbs.bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
+
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
-		return ret;
+		goto cleanup_nolock;
 	}
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -2127,9 +2170,10 @@ int ocfs2_xattr_set(struct inode *inode,
 cleanup:
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
 	brelse(di_bh);
 	brelse(xbs.xattr_bh);
-	ocfs2_xattr_bucket_relse(inode, &xbs.bucket);
+	ocfs2_xattr_bucket_free(xbs.bucket);
 
 	return ret;
 }
@@ -2373,11 +2417,11 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		lower_bh = bh;
 		bh = NULL;
 	}
-	xs->bucket.bu_bhs[0] = lower_bh;
+	xs->bucket->bu_bhs[0] = lower_bh;
 	lower_bh = NULL;
 
-	xs->header = bucket_xh(&xs->bucket);
-	xs->base = bucket_block(&xs->bucket, 0);
+	xs->header = bucket_xh(xs->bucket);
+	xs->base = bucket_block(xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
 	if (found) {
@@ -2385,8 +2429,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		 * If we have found the xattr enty, read all the blocks in
 		 * this bucket.
 		 */
-		ret = ocfs2_read_blocks(inode, bucket_blkno(&xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
+		ret = ocfs2_read_blocks(inode, bucket_blkno(xs->bucket) + 1,
+					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
 					0);
 		if (ret) {
 			mlog_errno(ret);
@@ -2395,7 +2439,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		xs->here = &xs->header->xh_entries[index];
 		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)bucket_blkno(&xs->bucket), index);
+		     (unsigned long long)bucket_blkno(xs->bucket), index);
 	} else
 		ret = -ENODATA;
 
@@ -2453,22 +2497,24 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 				       void *para)
 {
 	int i, ret = 0;
-	int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
 	u32 num_buckets = clusters * bpc;
-	struct ocfs2_xattr_bucket bucket;
+	struct ocfs2_xattr_bucket *bucket;
 
-	memset(&bucket, 0, sizeof(bucket));
+	bucket = ocfs2_xattr_bucket_new(inode);
+	if (!bucket) {
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
+	}
 
 	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
 	     clusters, (unsigned long long)blkno);
 
-	for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
-		ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
-					bucket.bu_bhs, 0);
+	for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+		ret = ocfs2_read_xattr_bucket(bucket, blkno);
 		if (ret) {
 			mlog_errno(ret);
-			goto out;
+			break;
 		}
 
 		/*
@@ -2476,26 +2522,24 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		 * in the 1st bucket.
 		 */
 		if (i == 0)
-			num_buckets = le16_to_cpu(bucket_xh(&bucket)->xh_num_buckets);
+			num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
 
 		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
 		     (unsigned long long)blkno,
-		     le32_to_cpu(bucket_xh(&bucket)->xh_entries[0].xe_name_hash));
+		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
-			ret = func(inode, &bucket, para);
-			if (ret) {
+			ret = func(inode, bucket, para);
+			if (ret)
 				mlog_errno(ret);
-				break;
-			}
+			/* Fall through to bucket_relse() */
 		}
 
-		ocfs2_xattr_bucket_relse(inode, &bucket);
-		memset(&bucket, 0, sizeof(bucket));
+		ocfs2_xattr_bucket_relse(bucket);
+		if (ret)
+			break;
 	}
 
-out:
-	ocfs2_xattr_bucket_relse(inode, &bucket);
-
+	ocfs2_xattr_bucket_free(bucket);
 	return ret;
 }
 
@@ -2718,9 +2762,9 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 	int i, blocksize = inode->i_sb->s_blocksize;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-	xs->bucket.bu_bhs[0] = new_bh;
+	xs->bucket->bu_bhs[0] = new_bh;
 	get_bh(new_bh);
-	xs->header = bucket_xh(&xs->bucket);
+	xs->header = bucket_xh(xs->bucket);
 
 	xs->base = new_bh->b_data;
 	xs->end = xs->base + inode->i_sb->s_blocksize;
@@ -2728,8 +2772,8 @@ static int ocfs2_xattr_update_xattr_search(struct inode *inode,
 	if (!xs->not_found) {
 		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
 			ret = ocfs2_read_blocks(inode,
-					bucket_blkno(&xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
+					bucket_blkno(xs->bucket) + 1,
+					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
 					0);
 			if (ret) {
 				mlog_errno(ret);
@@ -3244,8 +3288,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 {
 	int ret, i;
 	int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	struct ocfs2_xattr_bucket s_bucket, t_bucket;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
@@ -3253,16 +3296,21 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
 	     (unsigned long long)blk, (unsigned long long)new_blk);
 
-	memset(&s_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
-	memset(&t_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
 
-	ret = ocfs2_read_xattr_bucket(inode, &s_bucket, blk);
+	ret = ocfs2_read_xattr_bucket(s_bucket, blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &s_bucket,
+	ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -3273,13 +3321,13 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	 * Even if !new_bucket_head, we're overwriting t_bucket.  Thus,
 	 * there's no need to read it.
 	 */
-	ret = ocfs2_init_xattr_bucket(inode, &t_bucket, new_blk);
+	ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &t_bucket,
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
 						new_bucket_head ?
 						OCFS2_JOURNAL_ACCESS_CREATE :
 						OCFS2_JOURNAL_ACCESS_WRITE);
@@ -3288,7 +3336,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	xh = bucket_xh(&s_bucket);
+	xh = bucket_xh(s_bucket);
 	count = le16_to_cpu(xh->xh_count);
 	start = ocfs2_xattr_find_divide_pos(xh);
 
@@ -3300,10 +3348,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		 * The hash value is set as one larger than
 		 * that of the last entry in the previous bucket.
 		 */
-		for (i = 0; i < blk_per_bucket; i++)
-			memset(bucket_block(&t_bucket, i), 0, blocksize);
+		for (i = 0; i < t_bucket->bu_blocks; i++)
+			memset(bucket_block(t_bucket, i), 0, blocksize);
 
-		xh = bucket_xh(&t_bucket);
+		xh = bucket_xh(t_bucket);
 		xh->xh_free_start = cpu_to_le16(blocksize);
 		xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
 		le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3312,10 +3360,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	}
 
 	/* copy the whole bucket to the new first. */
-	ocfs2_xattr_bucket_copy_data(inode, &t_bucket, &s_bucket);
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
 
 	/* update the new bucket. */
-	xh = bucket_xh(&t_bucket);
+	xh = bucket_xh(t_bucket);
 
 	/*
 	 * Calculate the total name/value len and xh_free_start for
@@ -3379,7 +3427,7 @@ set_num_buckets:
 	else
 		xh->xh_num_buckets = 0;
 
-	ocfs2_xattr_bucket_journal_dirty(handle, inode, &t_bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
 
 	/* store the first_hash of the new bucket. */
 	if (first_hash)
@@ -3393,18 +3441,18 @@ set_num_buckets:
 	if (start == count)
 		goto out;
 
-	xh = bucket_xh(&s_bucket);
+	xh = bucket_xh(s_bucket);
 	memset(&xh->xh_entries[start], 0,
 	       sizeof(struct ocfs2_xattr_entry) * (count - start));
 	xh->xh_count = cpu_to_le16(start);
 	xh->xh_free_start = cpu_to_le16(name_offset);
 	xh->xh_name_value_len = cpu_to_le16(name_value_len);
 
-	ocfs2_xattr_bucket_journal_dirty(handle, inode, &s_bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
 
 out:
-	ocfs2_xattr_bucket_relse(inode, &s_bucket);
-	ocfs2_xattr_bucket_relse(inode, &t_bucket);
+	ocfs2_xattr_bucket_free(s_bucket);
+	ocfs2_xattr_bucket_free(t_bucket);
 
 	return ret;
 }
@@ -3422,7 +3470,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 				 int t_is_new)
 {
 	int ret;
-	struct ocfs2_xattr_bucket s_bucket, t_bucket;
+	struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
 
 	BUG_ON(s_blkno == t_blkno);
 
@@ -3430,10 +3478,15 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
 	     t_is_new);
 
-	memset(&s_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
-	memset(&t_bucket, 0, sizeof(struct ocfs2_xattr_bucket));
-
-	ret = ocfs2_read_xattr_bucket(inode, &s_bucket, s_blkno);
+	s_bucket = ocfs2_xattr_bucket_new(inode);
+	t_bucket = ocfs2_xattr_bucket_new(inode);
+	if (!s_bucket || !t_bucket) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+  
+	ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
 	if (ret)
 		goto out;
 
@@ -3441,23 +3494,23 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	 * Even if !t_is_new, we're overwriting t_bucket.  Thus,
 	 * there's no need to read it.
 	 */
-	ret = ocfs2_init_xattr_bucket(inode, &t_bucket, t_blkno);
+	ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
 	if (ret)
 		goto out;
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &t_bucket,
+	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
 						t_is_new ?
 						OCFS2_JOURNAL_ACCESS_CREATE :
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret)
 		goto out;
 
-	ocfs2_xattr_bucket_copy_data(inode, &t_bucket, &s_bucket);
-	ocfs2_xattr_bucket_journal_dirty(handle, inode, &t_bucket);
+	ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
 
 out:
-	ocfs2_xattr_bucket_relse(inode, &s_bucket);
-	ocfs2_xattr_bucket_relse(inode, &t_bucket);
+	ocfs2_xattr_bucket_free(t_bucket);
+	ocfs2_xattr_bucket_free(s_bucket);
 
 	return ret;
 }
@@ -4009,7 +4062,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
 				xe->xe_value_size = 0;
 
 			val = ocfs2_xattr_bucket_get_val(inode,
-							 &xs->bucket, offs);
+							 xs->bucket, offs);
 			memset(val + OCFS2_XATTR_SIZE(name_len), 0,
 			       size - OCFS2_XATTR_SIZE(name_len));
 			if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4087,8 +4140,7 @@ set_new_name_value:
 		xh->xh_free_start = cpu_to_le16(offs);
 	}
 
-	val = ocfs2_xattr_bucket_get_val(inode,
-					 &xs->bucket, offs - size);
+	val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
 	xe->xe_name_offset = cpu_to_le16(offs - size);
 
 	memset(val, 0, size);
@@ -4122,12 +4174,12 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->value_len, xi->name_index,
-	     (unsigned long long)bucket_blkno(&xs->bucket));
+	     (unsigned long long)bucket_blkno(xs->bucket));
 
-	if (!xs->bucket.bu_bhs[1]) {
+	if (!xs->bucket->bu_bhs[1]) {
 		ret = ocfs2_read_blocks(inode,
-					bucket_blkno(&xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket.bu_bhs[1],
+					bucket_blkno(xs->bucket) + 1,
+					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
 					0);
 		if (ret) {
 			mlog_errno(ret);
@@ -4143,7 +4195,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &xs->bucket,
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -4151,7 +4203,7 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	}
 
 	ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
-	ocfs2_xattr_bucket_journal_dirty(handle, inode, &xs->bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
 out:
 	ocfs2_commit_trans(osb, handle);
@@ -4264,10 +4316,10 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 	struct ocfs2_xattr_entry *xe = xs->here;
 	struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
 
-	BUG_ON(!xs->bucket.bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+	BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
 
 	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bu_bhs[0],
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket->bu_bhs[0],
 						offset, len);
 	if (ret)
 		mlog_errno(ret);
@@ -4387,7 +4439,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 					 struct ocfs2_xattr_search *xs)
 {
 	handle_t *handle = NULL;
-	struct ocfs2_xattr_header *xh = bucket_xh(&xs->bucket);
+	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
 	struct ocfs2_xattr_entry *last = &xh->xh_entries[
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
@@ -4400,7 +4452,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 		return;
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, inode, &xs->bucket,
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
@@ -4413,7 +4465,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 	memset(last, 0, sizeof(struct ocfs2_xattr_entry));
 	le16_add_cpu(&xh->xh_count, -1);
 
-	ocfs2_xattr_bucket_journal_dirty(handle, inode, &xs->bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -4565,7 +4617,7 @@ try_again:
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
-			(unsigned long long)bucket_blkno(&xs->bucket),
+			(unsigned long long)bucket_blkno(xs->bucket),
 			header_size);
 
 	if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4605,7 +4657,7 @@ try_again:
 	mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
 	     "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
 	     " %u\n", xs->not_found,
-	     (unsigned long long)bucket_blkno(&xs->bucket),
+	     (unsigned long long)bucket_blkno(xs->bucket),
 	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
 	     le16_to_cpu(xh->xh_name_value_len));
 
@@ -4617,7 +4669,7 @@ try_again:
 			 * name/value will be moved, the xe shouldn't be changed
 			 * in xs.
 			 */
-			ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+			ret = ocfs2_defrag_xattr_bucket(inode, xs->bucket);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4649,7 +4701,7 @@ try_again:
 		 * add a new bucket for the insert.
 		 */
 		ret = ocfs2_check_xattr_bucket_collision(inode,
-							 &xs->bucket,
+							 xs->bucket,
 							 xi->name);
 		if (ret) {
 			mlog_errno(ret);
@@ -4658,14 +4710,13 @@ try_again:
 
 		ret = ocfs2_add_new_xattr_bucket(inode,
 						 xs->xattr_bh,
-						 xs->bucket.bu_bhs[0]);
+						 xs->bucket->bu_bhs[0]);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ocfs2_xattr_bucket_relse(inode, &xs->bucket);
-		memset(&xs->bucket, 0, sizeof(xs->bucket));
+		ocfs2_xattr_bucket_relse(xs->bucket);
 
 		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
 						   xi->name_index,
-- 
cgit v1.2.3


From e2356a3f02cfdbce735465a2b40b6dc72a764c26 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Oct 2008 15:01:54 -0700
Subject: ocfs2: Use buckets in ocfs2_xattr_bucket_find().

Change the ocfs2_xattr_bucket_find() function to use ocfs2_xattr_bucket
as its abstraction.  This makes for more efficient reads, as buckets are
linear blocks, and also has improved caching characteristics.  It also
reads better.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 89 ++++++++++++++++++++------------------------------------
 1 file changed, 31 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 766494ed6e1..46986c635eb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2248,7 +2248,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
 				void *para);
 
 static int ocfs2_find_xe_in_bucket(struct inode *inode,
-				   struct buffer_head *header_bh,
+				   struct ocfs2_xattr_bucket *bucket,
 				   int name_index,
 				   const char *name,
 				   u32 name_hash,
@@ -2256,11 +2256,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 				   int *found)
 {
 	int i, ret = 0, cmp = 1, block_off, new_offset;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	size_t name_len = strlen(name);
 	struct ocfs2_xattr_entry *xe = NULL;
-	struct buffer_head *name_bh = NULL;
 	char *xe_name;
 
 	/*
@@ -2291,19 +2289,8 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 			break;
 		}
 
-		ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
-				       &name_bh);
-		if (ret) {
-			mlog_errno(ret);
-			break;
-		}
-		xe_name = name_bh->b_data + new_offset;
-
-		cmp = memcmp(name, xe_name, name_len);
-		brelse(name_bh);
-		name_bh = NULL;
-
-		if (cmp == 0) {
+		xe_name = bucket_block(bucket, block_off) + new_offset;
+		if (!memcmp(name, xe_name, name_len)) {
 			*xe_index = i;
 			*found = 1;
 			ret = 0;
@@ -2333,39 +2320,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 				   struct ocfs2_xattr_search *xs)
 {
 	int ret, found = 0;
-	struct buffer_head *bh = NULL;
-	struct buffer_head *lower_bh = NULL;
 	struct ocfs2_xattr_header *xh = NULL;
 	struct ocfs2_xattr_entry *xe = NULL;
 	u16 index = 0;
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int low_bucket = 0, bucket, high_bucket;
+	struct ocfs2_xattr_bucket *search;
 	u32 last_hash;
-	u64 blkno;
+	u64 blkno, lower_blkno = 0;
 
-	ret = ocfs2_read_block(inode, p_blkno, &bh);
+	search = ocfs2_xattr_bucket_new(inode);
+	if (!search) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(search, p_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xh = (struct ocfs2_xattr_header *)bh->b_data;
+	xh = bucket_xh(search);
 	high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
-
 	while (low_bucket <= high_bucket) {
-		brelse(bh);
-		bh = NULL;
-		bucket = (low_bucket + high_bucket) / 2;
+		ocfs2_xattr_bucket_relse(search);
 
+		bucket = (low_bucket + high_bucket) / 2;
 		blkno = p_blkno + bucket * blk_per_bucket;
-
-		ret = ocfs2_read_block(inode, blkno, &bh);
+		ret = ocfs2_read_xattr_bucket(search, blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		xh = (struct ocfs2_xattr_header *)bh->b_data;
+		xh = bucket_xh(search);
 		xe = &xh->xh_entries[0];
 		if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
 			high_bucket = bucket - 1;
@@ -2382,10 +2372,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 		last_hash = le32_to_cpu(xe->xe_name_hash);
 
-		/* record lower_bh which may be the insert place. */
-		brelse(lower_bh);
-		lower_bh = bh;
-		bh = NULL;
+		/* record lower_blkno which may be the insert place. */
+		lower_blkno = blkno;
 
 		if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
 			low_bucket = bucket + 1;
@@ -2393,7 +2381,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		}
 
 		/* the searched xattr should reside in this bucket if exists. */
-		ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+		ret = ocfs2_find_xe_in_bucket(inode, search,
 					      name_index, name, name_hash,
 					      &index, &found);
 		if (ret) {
@@ -2408,35 +2396,21 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 	 * When the xattr's hash value is in the gap of 2 buckets, we will
 	 * always set it to the previous bucket.
 	 */
-	if (!lower_bh) {
-		/*
-		 * We can't find any bucket whose first name_hash is less
-		 * than the find name_hash.
-		 */
-		BUG_ON(bh->b_blocknr != p_blkno);
-		lower_bh = bh;
-		bh = NULL;
+	if (!lower_blkno)
+		lower_blkno = p_blkno;
+
+	/* This should be in cache - we just read it during the search */
+	ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
-	xs->bucket->bu_bhs[0] = lower_bh;
-	lower_bh = NULL;
 
 	xs->header = bucket_xh(xs->bucket);
 	xs->base = bucket_block(xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
 	if (found) {
-		/*
-		 * If we have found the xattr enty, read all the blocks in
-		 * this bucket.
-		 */
-		ret = ocfs2_read_blocks(inode, bucket_blkno(xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
-					0);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-
 		xs->here = &xs->header->xh_entries[index];
 		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
 		     (unsigned long long)bucket_blkno(xs->bucket), index);
@@ -2444,8 +2418,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 		ret = -ENODATA;
 
 out:
-	brelse(bh);
-	brelse(lower_bh);
+	ocfs2_xattr_bucket_free(search);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 178eeac354ea28828d5e94a3a7b51368c171d6a5 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Oct 2008 15:18:29 -0700
Subject: ocfs2: Use buckets in ocfs2_xattr_create_index_block().

Use the ocfs2_xattr_bucket abstraction in
ocfs2_xattr_create_index_block() and its helpers.  We get more efficient
reads, a lot less buffer_head munging, and nicer code to boot.  While
we're at it, ocfs2_xattr_update_xattr_search() becomes void.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 114 ++++++++++++++++---------------------------------------
 1 file changed, 32 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 46986c635eb..76969b92200 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2649,32 +2649,34 @@ static void swap_xe(void *a, void *b, int size)
 /*
  * When the ocfs2_xattr_block is filled up, new bucket will be created
  * and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end.  This is why *target starts as the last buffer.
  * Note: we need to sort the entries since they are not saved in order
  * in the ocfs2_xattr_block.
  */
 static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 					   struct buffer_head *xb_bh,
-					   struct buffer_head *xh_bh,
-					   struct buffer_head *data_bh)
+					   struct ocfs2_xattr_bucket *bucket)
 {
 	int i, blocksize = inode->i_sb->s_blocksize;
+	int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 offset, size, off_change;
 	struct ocfs2_xattr_entry *xe;
 	struct ocfs2_xattr_block *xb =
 				(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
-	struct ocfs2_xattr_header *xh =
-				(struct ocfs2_xattr_header *)xh_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 count = le16_to_cpu(xb_xh->xh_count);
-	char *target = xh_bh->b_data, *src = xb_bh->b_data;
+	char *src = xb_bh->b_data;
+	char *target = bucket_block(bucket, blks - 1);
 
 	mlog(0, "cp xattr from block %llu to bucket %llu\n",
 	     (unsigned long long)xb_bh->b_blocknr,
-	     (unsigned long long)xh_bh->b_blocknr);
+	     (unsigned long long)bucket_blkno(bucket));
+
+	for (i = 0; i < blks; i++)
+		memset(bucket_block(bucket, i), 0, blocksize);
 
-	memset(xh_bh->b_data, 0, blocksize);
-	if (data_bh)
-		memset(data_bh->b_data, 0, blocksize);
 	/*
 	 * Since the xe_name_offset is based on ocfs2_xattr_header,
 	 * there is a offset change corresponding to the change of
@@ -2686,8 +2688,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	size = blocksize - offset;
 
 	/* copy all the names and values. */
-	if (data_bh)
-		target = data_bh->b_data;
 	memcpy(target + offset, src + offset, size);
 
 	/* Init new header now. */
@@ -2697,7 +2697,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
 
 	/* copy all the entries. */
-	target = xh_bh->b_data;
+	target = bucket_block(bucket, 0);
 	offset = offsetof(struct ocfs2_xattr_header, xh_entries);
 	size = count * sizeof(struct ocfs2_xattr_entry);
 	memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2723,42 +2723,24 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
  * While if the entry is in index b-tree, "bucket" indicates the
  * real place of the xattr.
  */
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
-					   struct ocfs2_xattr_search *xs,
-					   struct buffer_head *old_bh,
-					   struct buffer_head *new_bh)
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+					    struct ocfs2_xattr_search *xs,
+					    struct buffer_head *old_bh)
 {
-	int ret = 0;
 	char *buf = old_bh->b_data;
 	struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
 	struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
-	int i, blocksize = inode->i_sb->s_blocksize;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+	int i;
 
-	xs->bucket->bu_bhs[0] = new_bh;
-	get_bh(new_bh);
 	xs->header = bucket_xh(xs->bucket);
-
-	xs->base = new_bh->b_data;
+	xs->base = bucket_block(xs->bucket, 0);
 	xs->end = xs->base + inode->i_sb->s_blocksize;
 
-	if (!xs->not_found) {
-		if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
-			ret = ocfs2_read_blocks(inode,
-					bucket_blkno(xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
-					0);
-			if (ret) {
-				mlog_errno(ret);
-				return ret;
-			}
-
-		}
-		i = xs->here - old_xh->xh_entries;
-		xs->here = &xs->header->xh_entries[i];
-	}
+	if (xs->not_found)
+		return;
 
-	return ret;
+	i = xs->here - old_xh->xh_entries;
+	xs->here = &xs->header->xh_entries[i];
 }
 
 static int ocfs2_xattr_create_index_block(struct inode *inode,
@@ -2771,18 +2753,17 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_alloc_context *data_ac;
-	struct buffer_head *xh_bh = NULL, *data_bh = NULL;
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_tree_root *xr;
 	u16 xb_flags = le16_to_cpu(xb->xb_flags);
-	u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
 	mlog(0, "create xattr index block for %llu\n",
 	     (unsigned long long)xb_bh->b_blocknr);
 
 	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+	BUG_ON(!xs->bucket);
 
 	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
 	if (ret) {
@@ -2798,10 +2779,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	down_write(&oi->ip_alloc_sem);
 
 	/*
-	 * 3 more credits, one for xattr block update, one for the 1st block
-	 * of the new xattr bucket and one for the value/data.
+	 * We need more credits.  One for the xattr block update and one
+	 * for each block of the new xattr bucket.
 	 */
-	credits += 3;
+	credits += 1 + ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -2832,51 +2813,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
 	     (unsigned long long)blkno);
 
-	xh_bh = sb_getblk(inode->i_sb, blkno);
-	if (!xh_bh) {
-		ret = -EIO;
+	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 
-	ocfs2_set_new_buffer_uptodate(inode, xh_bh);
-
-	ret = ocfs2_journal_access(handle, inode, xh_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+						OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 
-	if (bpb > 1) {
-		data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
-		if (!data_bh) {
-			ret = -EIO;
-			mlog_errno(ret);
-			goto out_commit;
-		}
-
-		ocfs2_set_new_buffer_uptodate(inode, data_bh);
-
-		ret = ocfs2_journal_access(handle, inode, data_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
-	}
-
-	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
-
-	ocfs2_journal_dirty(handle, xh_bh);
-	if (data_bh)
-		ocfs2_journal_dirty(handle, data_bh);
+	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
-	ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
 
 	/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
 	memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2911,9 +2864,6 @@ out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
-	brelse(xh_bh);
-	brelse(data_bh);
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From 161d6f30f18c4a7e2b24705b6690cce3ff276eb9 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Oct 2008 15:25:18 -0700
Subject: ocfs2: Use buckets in ocfs2_defrag_xattr_bucket().

Use the ocfs2_xattr_bucket abstraction for reading and writing the
bucket in ocfs2_defrag_xattr_bucket().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 55 +++++++++++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 76969b92200..127a6285078 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2894,21 +2894,11 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	struct ocfs2_xattr_header *xh;
 	char *entries, *buf, *bucket_buf = NULL;
 	u64 blkno = bucket_blkno(bucket);
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u16 xh_free_start;
 	size_t blocksize = inode->i_sb->s_blocksize;
 	handle_t *handle;
-	struct buffer_head **bhs;
 	struct ocfs2_xattr_entry *xe;
-
-	bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
-			GFP_NOFS);
-	if (!bhs)
-		return -ENOMEM;
-
-	ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
-	if (ret)
-		goto out;
+	struct ocfs2_xattr_bucket *wb = NULL;
 
 	/*
 	 * In order to make the operation more efficient and generic,
@@ -2922,11 +2912,21 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
+	wb = ocfs2_xattr_bucket_new(inode);
+	if (!wb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(wb, blkno);
+	if (ret)
+		goto out;
+
 	buf = bucket_buf;
-	for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
-		memcpy(buf, bhs[i]->b_data, blocksize);
+	for (i = 0; i < wb->bu_blocks; i++, buf += blocksize)
+		memcpy(buf, bucket_block(wb, i), blocksize);
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), wb->bu_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
@@ -2934,13 +2934,11 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	for (i = 0; i < blk_per_bucket; i++) {
-		ret = ocfs2_journal_access(handle, inode, bhs[i],
-					   OCFS2_JOURNAL_ACCESS_WRITE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto commit;
-		}
+	ret = ocfs2_xattr_bucket_journal_access(handle, wb,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto commit;
 	}
 
 	xh = (struct ocfs2_xattr_header *)bucket_buf;
@@ -3009,21 +3007,14 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	     cmp_xe, swap_xe);
 
 	buf = bucket_buf;
-	for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
-		memcpy(bhs[i]->b_data, buf, blocksize);
-		ocfs2_journal_dirty(handle, bhs[i]);
-	}
+	for (i = 0; i < wb->bu_blocks; i++, buf += blocksize)
+		memcpy(bucket_block(wb, i), buf, blocksize);
+	ocfs2_xattr_bucket_journal_dirty(handle, wb);
 
 commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-
-	if (bhs) {
-		for (i = 0; i < blk_per_bucket; i++)
-			brelse(bhs[i]);
-	}
-	kfree(bhs);
-
+	ocfs2_xattr_bucket_free(wb);
 	kfree(bucket_buf);
 	return ret;
 }
-- 
cgit v1.2.3


From 02dbf38d19c19016f558fe0dc0c44f8041d3eb8e Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 27 Oct 2008 18:07:45 -0700
Subject: ocfs2: Use buckets in ocfs2_xattr_set_entry_in_bucket().

The ocfs2_xattr_set_entry_in_bucket() function is already working on an
ocfs2_xattr_bucket structure, so let's use the bucket API.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 127a6285078..029a9f4559f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4083,25 +4083,24 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 {
 	int ret;
 	handle_t *handle = NULL;
-	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	u64 blkno;
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
 	     (unsigned long)xi->value_len, xi->name_index,
 	     (unsigned long long)bucket_blkno(xs->bucket));
 
 	if (!xs->bucket->bu_bhs[1]) {
-		ret = ocfs2_read_blocks(inode,
-					bucket_blkno(xs->bucket) + 1,
-					blk_per_bucket - 1, &xs->bucket->bu_bhs[1],
-					0);
+		blkno = bucket_blkno(xs->bucket);
+		ocfs2_xattr_bucket_relse(xs->bucket);
+		ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, blk_per_bucket);
+	handle = ocfs2_start_trans(osb, xs->bucket->bu_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
-- 
cgit v1.2.3


From 1c32a2fd46ddc01bd86bff56a8f5d98c815750f4 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 6 Nov 2008 08:10:47 +0800
Subject: ocfs2/xattr: Remove additional bucket allocation in bucket
 defragment.

Joel has refactored the xattr bucket code and made the xattr bucket a
general wrapper. ocfs2_defrag_xattr_bucket() is already passed the
bucket, so there is no need to allocate a new one and read it again.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 029a9f4559f..87cf39ddfe5 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2898,7 +2898,6 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	size_t blocksize = inode->i_sb->s_blocksize;
 	handle_t *handle;
 	struct ocfs2_xattr_entry *xe;
-	struct ocfs2_xattr_bucket *wb = NULL;
 
 	/*
 	 * In order to make the operation more efficient and generic,
@@ -2912,21 +2911,11 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	wb = ocfs2_xattr_bucket_new(inode);
-	if (!wb) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ret = ocfs2_read_xattr_bucket(wb, blkno);
-	if (ret)
-		goto out;
-
 	buf = bucket_buf;
-	for (i = 0; i < wb->bu_blocks; i++, buf += blocksize)
-		memcpy(buf, bucket_block(wb, i), blocksize);
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(buf, bucket_block(bucket, i), blocksize);
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), wb->bu_blocks);
+	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), bucket->bu_blocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		handle = NULL;
@@ -2934,7 +2923,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_xattr_bucket_journal_access(handle, wb,
+	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -3007,14 +2996,13 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	     cmp_xe, swap_xe);
 
 	buf = bucket_buf;
-	for (i = 0; i < wb->bu_blocks; i++, buf += blocksize)
-		memcpy(bucket_block(wb, i), buf, blocksize);
-	ocfs2_xattr_bucket_journal_dirty(handle, wb);
+	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+		memcpy(bucket_block(bucket, i), buf, blocksize);
+	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
 
 commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-	ocfs2_xattr_bucket_free(wb);
 	kfree(bucket_buf);
 	return ret;
 }
-- 
cgit v1.2.3


From 757055adc5d41b910bdead925060f077dd2d9169 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 6 Nov 2008 08:10:48 +0800
Subject: ocfs2/xattr: Only set buffer update if it doesn't exist in cache.

When we call ocfs2_init_xattr_bucket(), we assume that the new buffer
head will be written to disk immediately, so we just use sb_getblk().
But in some cases the buffer may already be in the ocfs2 uptodate
cache, so we only call ocfs2_set_new_buffer_uptodate() if the buffer
head isn't already in the cache.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 87cf39ddfe5..d8fc714e941 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -219,8 +219,10 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 			break;
 		}
 
-		ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
-					      bucket->bu_bhs[i]);
+		if (!ocfs2_buffer_uptodate(bucket->bu_inode,
+					   bucket->bu_bhs[i]))
+			ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+						      bucket->bu_bhs[i]);
 	}
 
 	if (rc)
-- 
cgit v1.2.3


From 976331d8789d4d84a11b45b87c520ade83715343 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Nov 2008 08:26:57 +0800
Subject: ocfs2/xattr: Only extend xattr bucket in need.

When the first block of a bucket is filled up with xattr
entries, we normally extend the bucket. But if we are
just replacing one xattr with a small value, we don't need
to extend it. This is important since we will calculate
what we need before the transaction, and in this situation
no resources will be allocated.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d8fc714e941..4501c63193d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4564,7 +4564,9 @@ try_again:
 	     free, need, max_free, le16_to_cpu(xh->xh_free_start),
 	     le16_to_cpu(xh->xh_name_value_len));
 
-	if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+	if (free < need ||
+	    (xs->not_found &&
+	     count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
 		if (need <= max_free &&
 		    count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
 			/*
-- 
cgit v1.2.3


From 2891d290aa6eee0821f7e4ad0b1c4ae4d964b0f1 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Nov 2008 08:26:58 +0800
Subject: ocfs2: Add clusters free in dealloc_ctxt.

Currently in ocfs2 xattr set, the whole process is divided into many
small parts, each wrapped in a different transaction, so the set
doesn't look like a real transaction. We want to integrate it into
a real one.

In some cases we will allocate some clusters and free some in just one
transaction. E.g., one xattr is larger than the inline size, so it and
its value root are stored within the inode while the value is outside
in a cluster. If we then try to update it with a smaller value (larger
than the size of the root but smaller than the inline size), we may
need to free the outside cluster while allocating a new bucket (one
cluster), since the inode may now be full. The old solution will lock
the global_bitmap (if the local alloc failed in a stress test) and then
the truncate log. This will cause an ABBA deadlock with the truncate
log flush.

This patch adds cluster freeing to dealloc_ctxt, so that we can record
the clusters to be freed during the transaction and then free them
after we release the global_bitmap in xattr set.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/ocfs2/alloc.h |   4 +++
 2 files changed, 103 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394..4614614084d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5800,7 +5800,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  */
 
 /*
- * Describes a single block free from a suballocator
+ * Describe a single bit freed from a suballocator.  For the block
+ * suballocators, it represents one block.  For the global cluster
+ * allocator, it represents a run of clusters: free_blk is the first
+ * block of the run and free_bit holds the number of clusters.
  */
 struct ocfs2_cached_block_free {
 	struct ocfs2_cached_block_free		*free_next;
@@ -5815,10 +5818,10 @@ struct ocfs2_per_slot_free_list {
 	struct ocfs2_cached_block_free		*f_first;
 };
 
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
-				   int sysfile_type,
-				   int slot,
-				   struct ocfs2_cached_block_free *head)
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+				    int sysfile_type,
+				    int slot,
+				    struct ocfs2_cached_block_free *head)
 {
 	int ret;
 	u64 bg_blkno;
@@ -5893,6 +5896,82 @@ out:
 	return ret;
 }
 
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit)
+{
+	int ret = 0;
+	struct ocfs2_cached_block_free *item;
+
+	item = kmalloc(sizeof(*item), GFP_NOFS);
+	if (item == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+	     bit, (unsigned long long)blkno);
+
+	item->free_blk = blkno;
+	item->free_bit = bit;
+	item->free_next = ctxt->c_global_allocator;
+
+	ctxt->c_global_allocator = item;
+	return ret;
+}
+
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+				      struct ocfs2_cached_block_free *head)
+{
+	struct ocfs2_cached_block_free *tmp;
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	int ret = 0;
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	while (head) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			ret = __ocfs2_flush_truncate_log(osb);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
+						head->free_bit);
+
+		ocfs2_commit_trans(osb, handle);
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	while (head) {
+		/* Premature exit may have left some dangling items. */
+		tmp = head;
+		head = head->free_next;
+		kfree(tmp);
+	}
+
+	return ret;
+}
+
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
 {
@@ -5908,8 +5987,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		if (fl->f_first) {
 			mlog(0, "Free items: (type %u, slot %d)\n",
 			     fl->f_inode_type, fl->f_slot);
-			ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
-						       fl->f_slot, fl->f_first);
+			ret2 = ocfs2_free_cached_blocks(osb,
+							fl->f_inode_type,
+							fl->f_slot,
+							fl->f_first);
 			if (ret2)
 				mlog_errno(ret2);
 			if (!ret)
@@ -5920,6 +6001,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		kfree(fl);
 	}
 
+	if (ctxt->c_global_allocator) {
+		ret2 = ocfs2_free_cached_clusters(osb,
+						  ctxt->c_global_allocator);
+		if (ret2)
+			mlog_errno(ret2);
+		if (!ret)
+			ret = ret2;
+
+		ctxt->c_global_allocator = NULL;
+	}
+
 	return ret;
 }
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfb..c301cf225f0 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -167,11 +167,15 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
  */
 struct ocfs2_cached_dealloc_ctxt {
 	struct ocfs2_per_slot_free_list		*c_first_suballocator;
+	struct ocfs2_cached_block_free 		*c_global_allocator;
 };
 static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 {
 	c->c_first_suballocator = NULL;
+	c->c_global_allocator = NULL;
 }
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit);
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
 
-- 
cgit v1.2.3


From c73f60f900ddf73ec4ea2a143829ab97242c4e8c Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Nov 2008 08:26:59 +0800
Subject: ocfs2/xattr: Move clusters free into dealloc.

Move the freeing of clusters into the dealloc context so that
they can be freed after the transaction.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4501c63193d..f1da381a44f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -457,7 +457,6 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	int ret;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct inode *tl_inode = osb->osb_tl_inode;
 	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_extent_tree et;
@@ -470,16 +469,6 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		return ret;
 	}
 
-	mutex_lock(&tl_inode->i_mutex);
-
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		ret = __ocfs2_flush_truncate_log(osb);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -509,14 +498,13 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	ret = ocfs2_cache_cluster_dealloc(dealloc, phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
-	mutex_unlock(&tl_inode->i_mutex);
 
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
-- 
cgit v1.2.3


From 78f30c314a74b9dc5d7368d96fe4be883d9a3a04 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Nov 2008 08:27:00 +0800
Subject: ocfs2/xattr: Reserve meta/data at the beginning of ocfs2_xattr_set.

In ocfs2 xattr set, we reserve metadata and clusters wherever
they are needed. This is time-consuming and inefficient, so this
patch tries to reserve metadata and clusters at the beginning of
ocfs2_xattr_set.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.h |   4 +
 fs/ocfs2/xattr.c | 483 ++++++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 361 insertions(+), 126 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index c301cf225f0..3eb735eedae 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -176,6 +176,10 @@ static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
 }
 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 				u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+	return c->c_global_allocator != NULL;
+}
 int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f1da381a44f..4fd201a54c7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -71,6 +71,12 @@ struct ocfs2_xattr_bucket {
 	int bu_blocks;
 };
 
+struct ocfs2_xattr_set_ctxt {
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
+
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
 
@@ -133,11 +139,13 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
 					size_t buffer_size);
 
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-					  struct ocfs2_xattr_search *xs);
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt);
 
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs);
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt);
 
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
@@ -334,14 +342,13 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
 					 struct buffer_head *xattr_bh,
-					 struct ocfs2_xattr_value_root *xv)
+					 struct ocfs2_xattr_value_root *xv,
+					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int status = 0;
 	int restart_func = 0;
 	int credits = 0;
 	handle_t *handle = NULL;
-	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *meta_ac = NULL;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
@@ -353,13 +360,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 restart_all:
 
-	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-				       &data_ac, &meta_ac);
-	if (status) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
 					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
@@ -386,8 +386,8 @@ restarted_transaction:
 					     0,
 					     &et,
 					     handle,
-					     data_ac,
-					     meta_ac,
+					     ctxt->data_ac,
+					     ctxt->meta_ac,
 					     &why);
 	if ((status < 0) && (status != -EAGAIN)) {
 		if (status != -ENOSPC)
@@ -432,14 +432,6 @@ leave:
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
 	}
-	if (data_ac) {
-		ocfs2_free_alloc_context(data_ac);
-		data_ac = NULL;
-	}
-	if (meta_ac) {
-		ocfs2_free_alloc_context(meta_ac);
-		meta_ac = NULL;
-	}
 	if ((!status) && restart_func) {
 		restart_func = 0;
 		goto restart_all;
@@ -452,23 +444,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 				      struct buffer_head *root_bh,
 				      struct ocfs2_xattr_value_root *xv,
 				      u32 cpos, u32 phys_cpos, u32 len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_extent_tree et;
 
 	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
 
-	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
 	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -483,8 +468,8 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-				  dealloc);
+	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+				  &ctxt->dealloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -498,17 +483,13 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_cache_cluster_dealloc(dealloc, phys_blkno, len);
+	ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
 	return ret;
 }
 
@@ -516,15 +497,12 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 				   u32 old_clusters,
 				   u32 new_clusters,
 				   struct buffer_head *root_bh,
-				   struct ocfs2_xattr_value_root *xv)
+				   struct ocfs2_xattr_value_root *xv,
+				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret = 0;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_cached_dealloc_ctxt dealloc;
-
-	ocfs2_init_dealloc_ctxt(&dealloc);
 
 	if (old_clusters <= new_clusters)
 		return 0;
@@ -544,7 +522,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 
 		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
 						 phys_cpos, alloc_size,
-						 &dealloc);
+						 ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -558,16 +536,14 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	}
 
 out:
-	ocfs2_schedule_truncate_log_flush(osb, 1);
-	ocfs2_run_deallocs(osb, &dealloc);
-
 	return ret;
 }
 
 static int ocfs2_xattr_value_truncate(struct inode *inode,
 				      struct buffer_head *root_bh,
 				      struct ocfs2_xattr_value_root *xv,
-				      int len)
+				      int len,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
@@ -579,11 +555,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	if (new_clusters > old_clusters)
 		ret = ocfs2_xattr_extend_allocation(inode,
 						    new_clusters - old_clusters,
-						    root_bh, xv);
+						    root_bh, xv, ctxt);
 	else
 		ret = ocfs2_xattr_shrink_size(inode,
 					      old_clusters, new_clusters,
-					      root_bh, xv);
+					      root_bh, xv, ctxt);
 
 	return ret;
 }
@@ -1167,6 +1143,7 @@ out:
 static int ocfs2_xattr_set_value_outside(struct inode *inode,
 					 struct ocfs2_xattr_info *xi,
 					 struct ocfs2_xattr_search *xs,
+					 struct ocfs2_xattr_set_ctxt *ctxt,
 					 size_t offs)
 {
 	size_t name_len = strlen(xi->name);
@@ -1186,7 +1163,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	xv->xr_list.l_next_free_rec = 0;
 
 	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
-					 xi->value_len);
+					 xi->value_len, ctxt);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1317,6 +1294,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
 static int ocfs2_xattr_set_entry(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
 				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt,
 				 int flag)
 {
 	struct ocfs2_xattr_entry *last;
@@ -1387,7 +1365,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
 			/* Replace existing local xattr with tree root */
 			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-							    offs);
+							    ctxt, offs);
 			if (ret < 0)
 				mlog_errno(ret);
 			goto out;
@@ -1406,7 +1384,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				ret = ocfs2_xattr_value_truncate(inode,
 								 xs->xattr_bh,
 								 xv,
-								 xi->value_len);
+								 xi->value_len,
+								 ctxt);
 				if (ret < 0) {
 					mlog_errno(ret);
 					goto out;
@@ -1436,7 +1415,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 ret = ocfs2_xattr_value_truncate(inode,
 								 xs->xattr_bh,
 								 xv,
-								 0);
+								 0,
+								 ctxt);
 				if (ret < 0)
 					mlog_errno(ret);
 			}
@@ -1531,7 +1511,7 @@ out_commit:
 		 * This is the second step for value size > INLINE_SIZE.
 		 */
 		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, offs);
 		if (ret < 0) {
 			int ret2;
 
@@ -1555,6 +1535,10 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 				      struct ocfs2_xattr_header *header)
 {
 	int ret = 0, i;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
 	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
@@ -1567,14 +1551,17 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 				le16_to_cpu(entry->xe_name_offset);
 			xv = (struct ocfs2_xattr_value_root *)
 				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+			ret = ocfs2_xattr_value_truncate(inode, bh, xv,
+							 0, &ctxt);
 			if (ret < 0) {
 				mlog_errno(ret);
-				return ret;
+				break;
 			}
 		}
 	}
 
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 	return ret;
 }
 
@@ -1836,7 +1823,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
  */
 static int ocfs2_xattr_ibody_set(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs)
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1853,7 +1841,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_xattr_set_entry(inode, xi, xs,
+	ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
 				(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
 out:
 	up_write(&oi->ip_alloc_sem);
@@ -1926,12 +1914,12 @@ cleanup:
  */
 static int ocfs2_xattr_block_set(struct inode *inode,
 				 struct ocfs2_xattr_info *xi,
-				 struct ocfs2_xattr_search *xs)
+				 struct ocfs2_xattr_search *xs,
+				 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle = NULL;
 	struct ocfs2_xattr_block *xblk = NULL;
 	u16 suballoc_bit_start;
@@ -1940,15 +1928,6 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	int ret;
 
 	if (!xs->xattr_bh) {
-		/*
-		 * Alloc one external block for extended attribute
-		 * outside of inode.
-		 */
-		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
 		handle = ocfs2_start_trans(osb,
 					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
 		if (IS_ERR(handle)) {
@@ -1963,7 +1942,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 			goto out_commit;
 		}
 
-		ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+		ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
 					   &suballoc_bit_start, &num_got,
 					   &first_blkno);
 		if (ret < 0) {
@@ -1996,7 +1975,6 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		xs->end = (void *)xblk + inode->i_sb->s_blocksize;
 		xs->here = xs->header->xh_entries;
 
-
 		ret = ocfs2_journal_dirty(handle, new_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
@@ -2009,8 +1987,6 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 out_commit:
 		ocfs2_commit_trans(osb, handle);
 out:
-		if (meta_ac)
-			ocfs2_free_alloc_context(meta_ac);
 		if (ret < 0)
 			return ret;
 	} else
@@ -2018,22 +1994,266 @@ out:
 
 	if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		/* Set extended attribute into external block */
-		ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+		ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+					    OCFS2_HAS_XATTR_FL);
 		if (!ret || ret != -ENOSPC)
 			goto end;
 
-		ret = ocfs2_xattr_create_index_block(inode, xs);
+		ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
 		if (ret)
 			goto end;
 	}
 
-	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+	ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
 
 end:
 
 	return ret;
 }
 
+/* Check whether the new xattr can be inserted into the inode. */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+				       struct ocfs2_xattr_info *xi,
+				       struct ocfs2_xattr_search *xs)
+{
+	u64 value_size;
+	struct ocfs2_xattr_entry *last;
+	int free, i;
+	size_t min_offs = xs->end - xs->base;
+
+	if (!xs->header)
+		return 0;
+
+	last = xs->header->xh_entries;
+
+	for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+		size_t offs = le16_to_cpu(last->xe_name_offset);
+		if (offs < min_offs)
+			min_offs = offs;
+		last += 1;
+	}
+
+	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	if (free < 0)
+		return 0;
+
+	BUG_ON(!xs->not_found);
+
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_size = OCFS2_XATTR_ROOT_SIZE;
+	else
+		value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+	if (free >= sizeof(struct ocfs2_xattr_entry) +
+		   OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size)
+		return 1;
+
+	return 0;
+}
+
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     int *clusters_need,
+				     int *meta_need)
+{
+	int ret = 0, old_in_xb = 0;
+	int clusters_add = 0, meta_add = 0;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_block *xb = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	char *base = NULL;
+	int name_offset, name_len = 0;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+						    xi->value_len);
+	u64 value_size;
+
+	/*
+	 * delete a xattr doesn't need metadata and cluster allocation.
+	 * so return.
+	 */
+	if (!xi->value)
+		goto out;
+
+	if (xis->not_found && xbs->not_found) {
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+			clusters_add += new_clusters;
+
+		goto meta_guess;
+	}
+
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+	} else {
+		int i, block_off;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+		old_in_xb = 1;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			base = bucket_block(xbs->bucket, block_off);
+		} else
+			base = xbs->base;
+	}
+
+	/* do cluster allocation guess first. */
+	value_size = le64_to_cpu(xe->xe_value_size);
+
+	if (old_in_xb) {
+		/*
+		 * In xattr set, we always try to set the xe in inode first,
+		 * so if it can be inserted into inode successfully, the old
+		 * one will be removed from the xattr block, and this xattr
+		 * will be inserted into inode as a new xattr in inode.
+		 */
+		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+			clusters_add += new_clusters;
+			goto out;
+		}
+	}
+
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* the new values will be stored outside. */
+		u32 old_clusters = 0;
+
+		if (!ocfs2_xattr_is_local(xe)) {
+			old_clusters =	ocfs2_clusters_for_bytes(inode->i_sb,
+								 value_size);
+			xv = (struct ocfs2_xattr_value_root *)
+			     (base + name_offset + name_len);
+		} else
+			xv = &def_xv.xv;
+
+		if (old_clusters >= new_clusters)
+			goto out;
+		else {
+			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+			clusters_add += new_clusters - old_clusters;
+			goto out;
+		}
+	} else {
+		/*
+		 * Now the new value will be stored inside. So if the new
+		 * value is smaller than the size of value root or the old
+		 * value, we don't need any allocation, otherwise we have
+		 * to guess metadata allocation.
+		 */
+		if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+		    (!ocfs2_xattr_is_local(xe) &&
+		     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+			goto out;
+	}
+
+meta_guess:
+	/* calculate metadata allocation. */
+	if (di->i_xattr_loc) {
+		if (!xbs->xattr_bh) {
+			ret = ocfs2_read_block(inode,
+					       le64_to_cpu(di->i_xattr_loc),
+					       &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xb = (struct ocfs2_xattr_block *)bh->b_data;
+		} else
+			xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			struct ocfs2_extent_list *el =
+				 &xb->xb_attrs.xb_root.xt_list;
+			meta_add += ocfs2_extend_meta_needed(el);
+		}
+
+		/*
+		 * This cluster will be used either for new bucket or for
+		 * new xattr block.
+		 * If the cluster size is the same as the bucket size, one
+		 * more is needed since we may need to extend the bucket
+		 * also.
+		 */
+		clusters_add += 1;
+		if (OCFS2_XATTR_BUCKET_SIZE ==
+			OCFS2_SB(inode->i_sb)->s_clustersize)
+			clusters_add += 1;
+	} else
+		meta_add += 1;
+out:
+	if (clusters_need)
+		*clusters_need = clusters_add;
+	if (meta_need)
+		*meta_need = meta_add;
+	brelse(bh);
+	return ret;
+}
+
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int clusters_add, meta_add, ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+
+	ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+					&clusters_add, &meta_add);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d\n",
+	     xi->name, meta_add, clusters_add);
+
+	if (meta_add) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
+							&ctxt->meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (clusters_add) {
+		ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac);
+		if (ret)
+			mlog_errno(ret);
+	}
+out:
+	if (ret) {
+		if (ctxt->meta_ac) {
+			ocfs2_free_alloc_context(ctxt->meta_ac);
+			ctxt->meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null ctxt->data_ac.
+		 */
+	}
+
+	return ret;
+}
+
 /*
  * ocfs2_xattr_set()
  *
@@ -2051,6 +2271,8 @@ int ocfs2_xattr_set(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	int ret;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
 
 	struct ocfs2_xattr_info xi = {
 		.name_index = name_index,
@@ -2115,15 +2337,21 @@ int ocfs2_xattr_set(struct inode *inode,
 			goto cleanup;
 	}
 
+	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt);
+	if (ret) {
+		mlog_errno(ret);
+		goto cleanup;
+	}
+
 	if (!value) {
 		/* Remove existing extended attribute */
 		if (!xis.not_found)
-			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis, &ctxt);
 		else if (!xbs.not_found)
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
 	} else {
 		/* We always try to set extended attribute into inode first*/
-		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis, &ctxt);
 		if (!ret && !xbs.not_found) {
 			/*
 			 * If succeed and that extended attribute existing in
@@ -2131,7 +2359,7 @@ int ocfs2_xattr_set(struct inode *inode,
 			 */
 			xi.value = NULL;
 			xi.value_len = 0;
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
 		} else if (ret == -ENOSPC) {
 			if (di->i_xattr_loc && !xbs.xattr_bh) {
 				ret = ocfs2_xattr_block_find(inode, name_index,
@@ -2143,9 +2371,9 @@ int ocfs2_xattr_set(struct inode *inode,
 			 * If no space in inode, we will set extended attribute
 			 * into external block.
 			 */
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
 			if (ret)
-				goto cleanup;
+				goto free;
 			if (!xis.not_found) {
 				/*
 				 * If succeed and that extended attribute
@@ -2153,10 +2381,19 @@ int ocfs2_xattr_set(struct inode *inode,
 				 */
 				xi.value = NULL;
 				xi.value_len = 0;
-				ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+				ret = ocfs2_xattr_ibody_set(inode, &xi,
+							    &xis, &ctxt);
 			}
 		}
 	}
+free:
+	if (ctxt.data_ac)
+		ocfs2_free_alloc_context(ctxt.data_ac);
+	if (ctxt.meta_ac)
+		ocfs2_free_alloc_context(ctxt.meta_ac);
+	if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+		ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
 cleanup:
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	ocfs2_inode_unlock(inode, 1);
@@ -2734,7 +2971,8 @@ static void ocfs2_xattr_update_xattr_search(struct inode *inode,
 }
 
 static int ocfs2_xattr_create_index_block(struct inode *inode,
-					  struct ocfs2_xattr_search *xs)
+					  struct ocfs2_xattr_search *xs,
+					  struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, credits = OCFS2_SUBALLOC_ALLOC;
 	u32 bit_off, len;
@@ -2742,7 +2980,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_alloc_context *data_ac;
 	struct buffer_head *xb_bh = xs->xattr_bh;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
@@ -2755,12 +2992,6 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
 	BUG_ON(!xs->bucket);
 
-	ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	/*
 	 * XXX:
 	 * We can use this lock for now, and maybe move to a dedicated mutex
@@ -2787,7 +3018,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 		goto out_commit;
 	}
 
-	ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -2850,10 +3082,6 @@ out_commit:
 out_sem:
 	up_write(&oi->ip_alloc_sem);
 
-out:
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-
 	return ret;
 }
 
@@ -3614,7 +3842,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 				       u32 *num_clusters,
 				       u32 prev_cpos,
 				       u64 prev_blkno,
-				       int *extend)
+				       int *extend,
+				       struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, credits;
 	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
@@ -3622,8 +3851,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
 	u64 block;
 	handle_t *handle = NULL;
-	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *meta_ac = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 
@@ -3634,13 +3861,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
-	ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
-				    &data_ac, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
-
 	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
 					    clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
@@ -3658,7 +3878,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		goto leave;
 	}
 
-	ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
 				     clusters_to_add, &bit_off, &num_bits);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
@@ -3719,7 +3939,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
 	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
-				  num_bits, 0, meta_ac);
+				  num_bits, 0, ctxt->meta_ac);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
@@ -3734,10 +3954,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 leave:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
 
 	return ret;
 }
@@ -3821,7 +4037,8 @@ out:
  */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 				      struct buffer_head *xb_bh,
-				      struct buffer_head *header_bh)
+				      struct buffer_head *header_bh,
+				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_xattr_header *first_xh = NULL;
 	struct buffer_head *first_bh = NULL;
@@ -3872,7 +4089,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 						  &num_clusters,
 						  e_cpos,
 						  p_blkno,
-						  &extend);
+						  &extend,
+						  ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4147,7 +4365,8 @@ out:
 static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 					     struct buffer_head *header_bh,
 					     int xe_off,
-					     int len)
+					     int len,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, offset;
 	u64 value_blk;
@@ -4182,7 +4401,7 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 
 	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
 	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
-	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4200,8 +4419,9 @@ out:
 }
 
 static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
-						struct ocfs2_xattr_search *xs,
-						int len)
+					struct ocfs2_xattr_search *xs,
+					int len,
+					struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, offset;
 	struct ocfs2_xattr_entry *xe = xs->here;
@@ -4211,7 +4431,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 
 	offset = xe - xh->xh_entries;
 	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket->bu_bhs[0],
-						offset, len);
+						offset, len, ctxt);
 	if (ret)
 		mlog_errno(ret);
 
@@ -4375,7 +4595,8 @@ out_commit:
  */
 static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 				     struct ocfs2_xattr_info *xi,
-				     struct ocfs2_xattr_search *xs)
+				     struct ocfs2_xattr_search *xs,
+				     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret, local = 1;
 	size_t value_len;
@@ -4403,7 +4624,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			value_len = 0;
 
 		ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-							   value_len);
+							   value_len,
+							   ctxt);
 		if (ret)
 			goto out;
 
@@ -4434,7 +4656,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 
 	/* allocate the space now for the outside block storage. */
 	ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
-						   value_len);
+						   value_len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
 
@@ -4485,7 +4707,8 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
 
 static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 					     struct ocfs2_xattr_info *xi,
-					     struct ocfs2_xattr_search *xs)
+					     struct ocfs2_xattr_search *xs,
+					     struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_xattr_header *xh;
 	struct ocfs2_xattr_entry *xe;
@@ -4603,7 +4826,8 @@ try_again:
 
 		ret = ocfs2_add_new_xattr_bucket(inode,
 						 xs->xattr_bh,
-						 xs->bucket->bu_bhs[0]);
+						 xs->bucket->bu_bhs[0],
+						 ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4622,7 +4846,7 @@ try_again:
 	}
 
 xattr_set:
-	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+	ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
 out:
 	mlog_exit(ret);
 	return ret;
@@ -4636,6 +4860,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	u16 i;
 	struct ocfs2_xattr_entry *xe;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+
+	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
@@ -4644,13 +4872,16 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 
 		ret = ocfs2_xattr_bucket_value_truncate(inode,
 							bucket->bu_bhs[0],
-							i, 0);
+							i, 0, &ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			break;
 		}
 	}
 
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 85db90e77806d48a19fda77dabe8897d369a1710 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 12 Nov 2008 08:27:01 +0800
Subject: ocfs2/xattr: Merge xattr set transaction.

In the current ocfs2/xattr code, the whole xattr set is divided
into many steps and many transactions are used, so the xattr set
process doesn't behave like a real transaction. This patch tries
to merge all of these transactions into one. Another benefit is
that acl can use it easily now.

I don't merge the transactions for deleting xattrs when we
remove an inode. The reason is that if we have a large number
of xattrs and every xattr has a large value (large enough
for outside storage), the whole transaction will be huge and
it looks like jbd can't handle it (I ran into a jbd complaint
once). The old inode removal is also divided into many steps,
so I'd like to leave it as it is.

Note:
In xattr set, I try to avoid ocfs2_extend_trans because, if
the credits aren't enough for the extension, it will commit
all the dirty blocks and create a new transaction, which may
lead to inconsistency in metadata. All remaining calls to
ocfs2_extend_trans are safe now.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 673 +++++++++++++++++++++++++++----------------------------
 1 file changed, 325 insertions(+), 348 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4fd201a54c7..7a9089255a8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -72,6 +72,7 @@ struct ocfs2_xattr_bucket {
 };
 
 struct ocfs2_xattr_set_ctxt {
+	handle_t *handle;
 	struct ocfs2_alloc_context *meta_ac;
 	struct ocfs2_alloc_context *data_ac;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
@@ -346,9 +347,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int status = 0;
-	int restart_func = 0;
-	int credits = 0;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
@@ -358,19 +357,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
 
-restart_all:
-
-	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-					    clusters_to_add);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto leave;
-	}
-
-restarted_transaction:
 	status = ocfs2_journal_access(handle, inode, xattr_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
@@ -389,9 +375,8 @@ restarted_transaction:
 					     ctxt->data_ac,
 					     ctxt->meta_ac,
 					     &why);
-	if ((status < 0) && (status != -EAGAIN)) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
+	if (status < 0) {
+		mlog_errno(status);
 		goto leave;
 	}
 
@@ -403,39 +388,13 @@ restarted_transaction:
 
 	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
 
-	if (why != RESTART_NONE && clusters_to_add) {
-		if (why == RESTART_META) {
-			mlog(0, "restarting function.\n");
-			restart_func = 1;
-		} else {
-			BUG_ON(why != RESTART_TRANS);
-
-			mlog(0, "restarting transaction.\n");
-			/* TODO: This can be more intelligent. */
-			credits = ocfs2_calc_extend_credits(osb->sb,
-							    et.et_root_el,
-							    clusters_to_add);
-			status = ocfs2_extend_trans(handle, credits);
-			if (status < 0) {
-				/* handle still has to be committed at
-				 * this point. */
-				status = -ENOMEM;
-				mlog_errno(status);
-				goto leave;
-			}
-			goto restarted_transaction;
-		}
-	}
+	/*
+	 * We should have already allocated enough space before the transaction,
+	 * so no need to restart.
+	 */
+	BUG_ON(why != RESTART_NONE || clusters_to_add);
 
 leave:
-	if (handle) {
-		ocfs2_commit_trans(osb, handle);
-		handle = NULL;
-	}
-	if ((!status) && restart_func) {
-		restart_func = 0;
-		goto restart_all;
-	}
 
 	return status;
 }
@@ -448,31 +407,23 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 {
 	int ret;
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_extent_tree et;
 
 	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
 
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
 				  &ctxt->dealloc);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	le32_add_cpu(&xv->xr_clusters, -len);
@@ -480,15 +431,13 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	ret = ocfs2_journal_dirty(handle, root_bh);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
-	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
 }
@@ -975,6 +924,7 @@ static int ocfs2_xattr_get(struct inode *inode,
 }
 
 static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+					   handle_t *handle,
 					   struct ocfs2_xattr_value_root *xv,
 					   const void *value,
 					   int value_len)
@@ -986,14 +936,17 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 	u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
 	u64 blkno;
 	struct buffer_head *bh = NULL;
-	handle_t *handle;
 
 	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
 
+	/*
+	 * In __ocfs2_xattr_set_value_outside has already been dirtied,
+	 * so we don't need to worry about whether ocfs2_extend_trans
+	 * will create a new transaction for us or not.
+	 */
 	credits = clusters * bpc;
-	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
@@ -1003,7 +956,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 					       &num_clusters, &xv->xr_list);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
@@ -1012,7 +965,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 			ret = ocfs2_read_block(inode, blkno, &bh);
 			if (ret) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 
 			ret = ocfs2_journal_access(handle,
@@ -1021,7 +974,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 						   OCFS2_JOURNAL_ACCESS_WRITE);
 			if (ret < 0) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 
 			cp_len = value_len > blocksize ? blocksize : value_len;
@@ -1035,7 +988,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 			ret = ocfs2_journal_dirty(handle, bh);
 			if (ret < 0) {
 				mlog_errno(ret);
-				goto out_commit;
+				goto out;
 			}
 			brelse(bh);
 			bh = NULL;
@@ -1049,8 +1002,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 		}
 		cpos += num_clusters;
 	}
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	brelse(bh);
 
@@ -1058,28 +1009,21 @@ out:
 }
 
 static int ocfs2_xattr_cleanup(struct inode *inode,
+			       handle_t *handle,
 			       struct ocfs2_xattr_info *xi,
 			       struct ocfs2_xattr_search *xs,
 			       size_t offs)
 {
-	handle_t *handle = NULL;
 	int ret = 0;
 	size_t name_len = strlen(xi->name);
 	void *val = xs->base + offs;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
 	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 	/* Decrease xattr count */
 	le16_add_cpu(&xs->header->xh_count, -1);
@@ -1090,32 +1034,23 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
 
 static int ocfs2_xattr_update_entry(struct inode *inode,
+				    handle_t *handle,
 				    struct ocfs2_xattr_info *xi,
 				    struct ocfs2_xattr_search *xs,
 				    size_t offs)
 {
-	handle_t *handle = NULL;
-	int ret = 0;
+	int ret;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
 	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1129,8 +1064,6 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
 	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
@@ -1168,13 +1101,13 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
-					      xi->value_len);
+	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, offs);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, xv,
+					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -1302,7 +1235,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
 	size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
 	size_t size_l = 0;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	int free, i, ret;
 	struct ocfs2_xattr_info xi_l = {
 		.name_index = xi->name_index,
@@ -1391,19 +1324,21 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 					goto out;
 				}
 
-				ret = __ocfs2_xattr_set_value_outside(inode,
-								xv,
-								xi->value,
-								xi->value_len);
+				ret = ocfs2_xattr_update_entry(inode,
+							       handle,
+							       xi,
+							       xs,
+							       offs);
 				if (ret < 0) {
 					mlog_errno(ret);
 					goto out;
 				}
 
-				ret = ocfs2_xattr_update_entry(inode,
-							       xi,
-							       xs,
-							       offs);
+				ret = __ocfs2_xattr_set_value_outside(inode,
+								handle,
+								xv,
+								xi->value,
+								xi->value_len);
 				if (ret < 0)
 					mlog_errno(ret);
 				goto out;
@@ -1413,45 +1348,29 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 * just trucate old value to zero.
 				 */
 				 ret = ocfs2_xattr_value_truncate(inode,
-								 xs->xattr_bh,
-								 xv,
-								 0,
-								 ctxt);
+								  xs->xattr_bh,
+								  xv,
+								  0,
+								  ctxt);
 				if (ret < 0)
 					mlog_errno(ret);
 			}
 		}
 	}
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		/* set extended attribute in external block. */
-		ret = ocfs2_extend_trans(handle,
-					 OCFS2_INODE_UPDATE_CREDITS +
-					 OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
 		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
 					   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 	}
 
@@ -1465,7 +1384,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto out;
 		}
 	}
 
@@ -1502,9 +1421,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-
 	if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 		/*
 		 * Set value outside in B tree.
@@ -1520,14 +1436,14 @@ out_commit:
 			 * If set value outside failed, we have to clean
 			 * the junk tree root we have already set in local.
 			 */
-			ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+			ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+						   xi, xs, offs);
 			if (ret2 < 0)
 				mlog_errno(ret2);
 		}
 	}
 out:
 	return ret;
-
 }
 
 static int ocfs2_remove_value_outside(struct inode*inode,
@@ -1540,6 +1456,13 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
+	ctxt.handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
@@ -1560,8 +1483,10 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 		}
 	}
 
+	ocfs2_commit_trans(osb, ctxt.handle);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
 	return ret;
 }
 
@@ -1920,7 +1845,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di =  (struct ocfs2_dinode *)xs->inode_bh->b_data;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_xattr_block *xblk = NULL;
 	u16 suballoc_bit_start;
 	u32 num_got;
@@ -1928,18 +1853,11 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	int ret;
 
 	if (!xs->xattr_bh) {
-		handle = ocfs2_start_trans(osb,
-					   OCFS2_XATTR_BLOCK_CREATE_CREDITS);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			mlog_errno(ret);
-			goto out;
-		}
 		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
 					   OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
 		ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
@@ -1947,7 +1865,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 					   &first_blkno);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
 		new_bh = sb_getblk(inode->i_sb, first_blkno);
@@ -1957,7 +1875,7 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 					   OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 
 		/* Initialize ocfs2_xattr_block */
@@ -1978,17 +1896,10 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		ret = ocfs2_journal_dirty(handle, new_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out_commit;
+			goto end;
 		}
 		di->i_xattr_loc = cpu_to_le64(first_blkno);
-		ret = ocfs2_journal_dirty(handle, xs->inode_bh);
-		if (ret < 0)
-			mlog_errno(ret);
-out_commit:
-		ocfs2_commit_trans(osb, handle);
-out:
-		if (ret < 0)
-			return ret;
+		ocfs2_journal_dirty(handle, xs->inode_bh);
 	} else
 		xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
 
@@ -2057,10 +1968,11 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 				     struct ocfs2_xattr_search *xis,
 				     struct ocfs2_xattr_search *xbs,
 				     int *clusters_need,
-				     int *meta_need)
+				     int *meta_need,
+				     int *credits_need)
 {
 	int ret = 0, old_in_xb = 0;
-	int clusters_add = 0, meta_add = 0;
+	int clusters_add = 0, meta_add = 0, credits = 0;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_xattr_block *xb = NULL;
 	struct ocfs2_xattr_entry *xe = NULL;
@@ -2071,16 +1983,15 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 						    xi->value_len);
 	u64 value_size;
 
-	/*
-	 * delete a xattr doesn't need metadata and cluster allocation.
-	 * so return.
-	 */
-	if (!xi->value)
-		goto out;
-
 	if (xis->not_found && xbs->not_found) {
-		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 			clusters_add += new_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
+		}
 
 		goto meta_guess;
 	}
@@ -2090,6 +2001,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		name_offset = le16_to_cpu(xe->xe_name_offset);
 		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
 		base = xis->base;
+		credits += OCFS2_INODE_UPDATE_CREDITS;
 	} else {
 		int i, block_off;
 		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
@@ -2105,8 +2017,25 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 							i, &block_off,
 							&name_offset);
 			base = bucket_block(xbs->bucket, block_off);
-		} else
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		} else {
 			base = xbs->base;
+			credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+		}
+	}
+
+	/*
+	 * Deleting an xattr doesn't need metadata or cluster allocation,
+	 * so just calculate the credits and return.
+	 *
+	 * The credits for removing the value tree will be extended
+	 * by ocfs2_remove_extent itself.
+	 */
+	if (!xi->value) {
+		if (!ocfs2_xattr_is_local(xe))
+			credits += OCFS2_REMOVE_EXTENT_CREDITS;
+
+		goto out;
 	}
 
 	/* do cluster allocation guess first. */
@@ -2121,6 +2050,13 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		 */
 		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
 			clusters_add += new_clusters;
+			credits += OCFS2_REMOVE_EXTENT_CREDITS +
+				    OCFS2_INODE_UPDATE_CREDITS;
+			if (!ocfs2_xattr_is_local(xe))
+				credits += ocfs2_calc_extend_credits(
+							inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
 			goto out;
 		}
 	}
@@ -2137,11 +2073,16 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		} else
 			xv = &def_xv.xv;
 
-		if (old_clusters >= new_clusters)
+		if (old_clusters >= new_clusters) {
+			credits += OCFS2_REMOVE_EXTENT_CREDITS;
 			goto out;
-		else {
+		} else {
 			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
 			clusters_add += new_clusters - old_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     &xv->xr_list,
+							     new_clusters -
+							     old_clusters);
 			goto out;
 		}
 	} else {
@@ -2177,6 +2118,8 @@ meta_guess:
 			struct ocfs2_extent_list *el =
 				 &xb->xb_attrs.xb_root.xt_list;
 			meta_add += ocfs2_extend_meta_needed(el);
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     el, 1);
 		}
 
 		/*
@@ -2187,16 +2130,23 @@ meta_guess:
 		 * also.
 		 */
 		clusters_add += 1;
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 		if (OCFS2_XATTR_BUCKET_SIZE ==
-			OCFS2_SB(inode->i_sb)->s_clustersize)
+			OCFS2_SB(inode->i_sb)->s_clustersize) {
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 			clusters_add += 1;
-	} else
+		}
+	} else {
 		meta_add += 1;
+		credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
 out:
 	if (clusters_need)
 		*clusters_need = clusters_add;
 	if (meta_need)
 		*meta_need = meta_add;
+	if (credits_need)
+		*credits_need = credits;
 	brelse(bh);
 	return ret;
 }
@@ -2206,7 +2156,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 				     struct ocfs2_xattr_info *xi,
 				     struct ocfs2_xattr_search *xis,
 				     struct ocfs2_xattr_search *xbs,
-				     struct ocfs2_xattr_set_ctxt *ctxt)
+				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int *credits)
 {
 	int clusters_add, meta_add, ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -2216,14 +2167,14 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 	ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
 
 	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
-					&clusters_add, &meta_add);
+					&clusters_add, &meta_add, credits);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
 	}
 
-	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d\n",
-	     xi->name, meta_add, clusters_add);
+	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+	     "credits = %d\n", xi->name, meta_add, clusters_add, *credits);
 
 	if (meta_add) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -2254,6 +2205,126 @@ out:
 	return ret;
 }
 
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+				    struct ocfs2_dinode *di,
+				    struct ocfs2_xattr_info *xi,
+				    struct ocfs2_xattr_search *xis,
+				    struct ocfs2_xattr_search *xbs,
+				    struct ocfs2_xattr_set_ctxt *ctxt)
+{
+	int ret = 0, credits;
+
+	if (!xi->value) {
+		/* Remove existing extended attribute */
+		if (!xis->not_found)
+			ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		else if (!xbs->not_found)
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+	} else {
+		/* We always try to set the extended attribute in the inode first */
+		ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+		if (!ret && !xbs->not_found) {
+			/*
+			 * If that succeeded and the extended attribute exists
+			 * in the external block, then we will remove it.
+			 */
+			xi->value = NULL;
+			xi->value_len = 0;
+
+			xis->not_found = -ENODATA;
+			ret = ocfs2_calc_xattr_set_need(inode,
+							di,
+							xi,
+							xis,
+							xbs,
+							NULL,
+							NULL,
+							&credits);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_extend_trans(ctxt->handle, credits +
+					ctxt->handle->h_buffer_credits);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+		} else if (ret == -ENOSPC) {
+			if (di->i_xattr_loc && !xbs->xattr_bh) {
+				ret = ocfs2_xattr_block_find(inode,
+							     xi->name_index,
+							     xi->name, xbs);
+				if (ret)
+					goto out;
+
+				xis->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits +
+					ctxt->handle->h_buffer_credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+			}
+			/*
+			 * If there is no space in the inode, we will set the
+			 * extended attribute in the external block.
+			 */
+			ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+			if (ret)
+				goto out;
+			if (!xis->not_found) {
+				/*
+				 * If that succeeded and the extended attribute
+				 * exists in the inode, we will remove it.
+				 */
+				xi->value = NULL;
+				xi->value_len = 0;
+				xbs->not_found = -ENODATA;
+				ret = ocfs2_calc_xattr_set_need(inode,
+								di,
+								xi,
+								xis,
+								xbs,
+								NULL,
+								NULL,
+								&credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+
+				ret = ocfs2_extend_trans(ctxt->handle, credits +
+						ctxt->handle->h_buffer_credits);
+				if (ret) {
+					mlog_errno(ret);
+					goto out;
+				}
+				ret = ocfs2_xattr_ibody_set(inode, xi,
+							    xis, ctxt);
+			}
+		}
+	}
+
+out:
+	return ret;
+}
+
 /*
  * ocfs2_xattr_set()
  *
@@ -2270,8 +2341,9 @@ int ocfs2_xattr_set(struct inode *inode,
 {
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
-	int ret;
+	int ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
 
 	struct ocfs2_xattr_info xi = {
@@ -2337,56 +2409,37 @@ int ocfs2_xattr_set(struct inode *inode,
 			goto cleanup;
 	}
 
-	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt);
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mutex_unlock(&tl_inode->i_mutex);
+			mlog_errno(ret);
+			goto cleanup;
+		}
+	}
+	mutex_unlock(&tl_inode->i_mutex);
+
+	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+					&xbs, &ctxt, &credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto cleanup;
 	}
 
-	if (!value) {
-		/* Remove existing extended attribute */
-		if (!xis.not_found)
-			ret = ocfs2_xattr_ibody_set(inode, &xi, &xis, &ctxt);
-		else if (!xbs.not_found)
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
-	} else {
-		/* We always try to set extended attribute into inode first*/
-		ret = ocfs2_xattr_ibody_set(inode, &xi, &xis, &ctxt);
-		if (!ret && !xbs.not_found) {
-			/*
-			 * If succeed and that extended attribute existing in
-			 * external block, then we will remove it.
-			 */
-			xi.value = NULL;
-			xi.value_len = 0;
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
-		} else if (ret == -ENOSPC) {
-			if (di->i_xattr_loc && !xbs.xattr_bh) {
-				ret = ocfs2_xattr_block_find(inode, name_index,
-							     name, &xbs);
-				if (ret)
-					goto cleanup;
-			}
-			/*
-			 * If no space in inode, we will set extended attribute
-			 * into external block.
-			 */
-			ret = ocfs2_xattr_block_set(inode, &xi, &xbs, &ctxt);
-			if (ret)
-				goto free;
-			if (!xis.not_found) {
-				/*
-				 * If succeed and that extended attribute
-				 * existing in inode, we will remove it.
-				 */
-				xi.value = NULL;
-				xi.value_len = 0;
-				ret = ocfs2_xattr_ibody_set(inode, &xi,
-							    &xis, &ctxt);
-			}
-		}
+	ctxt.handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto cleanup;
 	}
-free:
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+	ocfs2_commit_trans(osb, ctxt.handle);
+
 	if (ctxt.data_ac)
 		ocfs2_free_alloc_context(ctxt.data_ac);
 	if (ctxt.meta_ac)
@@ -2974,10 +3027,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 					  struct ocfs2_xattr_search *xs,
 					  struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret, credits = OCFS2_SUBALLOC_ALLOC;
+	int ret;
 	u32 bit_off, len;
 	u64 blkno;
-	handle_t *handle;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct buffer_head *xb_bh = xs->xattr_bh;
@@ -2999,30 +3052,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	down_write(&oi->ip_alloc_sem);
 
-	/*
-	 * We need more credits.  One for the xattr block update and one
-	 * for each block of the new xattr bucket.
-	 */
-	credits += 1 + ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out_sem;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, xb_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
 				     1, 1, &bit_off, &len);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	/*
@@ -3038,14 +3079,14 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
@@ -3070,16 +3111,9 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 
 	xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
 
-	ret = ocfs2_journal_dirty(handle, xb_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
+	ocfs2_journal_dirty(handle, xb_bh);
 
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-
-out_sem:
+out:
 	up_write(&oi->ip_alloc_sem);
 
 	return ret;
@@ -3105,6 +3139,7 @@ static int cmp_xe_offset(const void *a, const void *b)
  * so that we can spare some space for insertion.
  */
 static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+				     handle_t *handle,
 				     struct ocfs2_xattr_bucket *bucket)
 {
 	int ret, i;
@@ -3114,7 +3149,6 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	u64 blkno = bucket_blkno(bucket);
 	u16 xh_free_start;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	handle_t *handle;
 	struct ocfs2_xattr_entry *xe;
 
 	/*
@@ -3133,19 +3167,11 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
 		memcpy(buf, bucket_block(bucket, i), blocksize);
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), bucket->bu_blocks);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto commit;
+		goto out;
 	}
 
 	xh = (struct ocfs2_xattr_header *)bucket_buf;
@@ -3203,7 +3229,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 			"bucket %llu\n", (unsigned long long)blkno);
 
 	if (xh_free_start == end)
-		goto commit;
+		goto out;
 
 	memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
 	xh->xh_free_start = cpu_to_le16(end);
@@ -3218,8 +3244,6 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 		memcpy(bucket_block(bucket, i), buf, blocksize);
 	ocfs2_xattr_bucket_journal_dirty(handle, bucket);
 
-commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	kfree(bucket_buf);
 	return ret;
@@ -3270,7 +3294,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 	 * 1 more for the update of the 1st bucket of the previous
 	 * extent record.
 	 */
-	credits = bpc / 2 + 1;
+	credits = bpc / 2 + 1 + handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -3662,7 +3686,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	 * We need to update the new cluster and 1 more for the update of
 	 * the 1st bucket of the previous extent rec.
 	 */
-	credits = bpc + 1;
+	credits = bpc + 1 + handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -3732,7 +3756,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
 				      u32 *first_hash)
 {
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int ret, credits = 2 * blk_per_bucket;
+	int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
 
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
 
@@ -3845,12 +3869,12 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 				       int *extend,
 				       struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret, credits;
+	int ret;
 	u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	u32 prev_clusters = *num_clusters;
 	u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
 	u64 block;
-	handle_t *handle = NULL;
+	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 
@@ -3861,16 +3885,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
-	credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
-					    clusters_to_add);
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(ret);
-		goto leave;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
@@ -3924,18 +3938,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 
-	if (handle->h_buffer_credits < credits) {
-		/*
-		 * The journal has been restarted before, and don't
-		 * have enough space for the insertion, so extend it
-		 * here.
-		 */
-		ret = ocfs2_extend_trans(handle, credits);
-		if (ret) {
-			mlog_errno(ret);
-			goto leave;
-		}
-	}
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
 	     num_bits, (unsigned long long)block, v_start);
 	ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
@@ -3946,15 +3948,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	}
 
 	ret = ocfs2_journal_dirty(handle, root_bh);
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		goto leave;
-	}
 
 leave:
-	if (handle)
-		ocfs2_commit_trans(osb, handle);
-
 	return ret;
 }
 
@@ -3963,6 +3960,7 @@ leave:
  * We meet with start_bh. Only move half of the xattrs to the bucket after it.
  */
 static int ocfs2_extend_xattr_bucket(struct inode *inode,
+				     handle_t *handle,
 				     struct buffer_head *first_bh,
 				     struct buffer_head *start_bh,
 				     u32 num_clusters)
@@ -3972,7 +3970,6 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	u64 start_blk = start_bh->b_blocknr, end_blk;
 	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
-	handle_t *handle;
 	struct ocfs2_xattr_header *first_xh =
 				(struct ocfs2_xattr_header *)first_bh->b_data;
 	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
@@ -3989,11 +3986,10 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	 * We will touch all the buckets after the start_bh(include it).
 	 * Then we add one more bucket.
 	 */
-	credits = end_blk - start_blk + 3 * blk_per_bucket + 1;
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
+	credits = end_blk - start_blk + 3 * blk_per_bucket + 1 +
+		  handle->h_buffer_credits;
+	ret = ocfs2_extend_trans(handle, credits);
+	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
@@ -4002,14 +3998,14 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto commit;
+		goto out;
 	}
 
 	while (end_blk != start_blk) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
 					    end_blk + blk_per_bucket, 0);
 		if (ret)
-			goto commit;
+			goto out;
 		end_blk -= blk_per_bucket;
 	}
 
@@ -4020,8 +4016,6 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	le16_add_cpu(&first_xh->xh_num_buckets, 1);
 	ocfs2_journal_dirty(handle, first_bh);
 
-commit:
-	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
 }
@@ -4099,6 +4093,7 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 
 	if (extend)
 		ret = ocfs2_extend_xattr_bucket(inode,
+						ctxt->handle,
 						first_bh,
 						header_bh,
 						num_clusters);
@@ -4272,14 +4267,13 @@ set_new_name_value:
  * space for the xattr insertion.
  */
 static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+					   handle_t *handle,
 					   struct ocfs2_xattr_info *xi,
 					   struct ocfs2_xattr_search *xs,
 					   u32 name_hash,
 					   int local)
 {
 	int ret;
-	handle_t *handle = NULL;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u64 blkno;
 
 	mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
@@ -4296,14 +4290,6 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, xs->bucket->bu_blocks);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
@@ -4315,32 +4301,22 @@ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
 	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
 
 out:
-	ocfs2_commit_trans(osb, handle);
-
 	return ret;
 }
 
 static int ocfs2_xattr_value_update_size(struct inode *inode,
+					 handle_t *handle,
 					 struct buffer_head *xe_bh,
 					 struct ocfs2_xattr_entry *xe,
 					 u64 new_size)
 {
 	int ret;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle = NULL;
-
-	handle = ocfs2_start_trans(osb, 1);
-	if (IS_ERR(handle)) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
 
 	ret = ocfs2_journal_access(handle, inode, xe_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out;
 	}
 
 	xe->xe_value_size = cpu_to_le64(new_size);
@@ -4349,8 +4325,6 @@ static int ocfs2_xattr_value_update_size(struct inode *inode,
 	if (ret < 0)
 		mlog_errno(ret);
 
-out_commit:
-	ocfs2_commit_trans(osb, handle);
 out:
 	return ret;
 }
@@ -4407,7 +4381,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+	ret = ocfs2_xattr_value_update_size(inode, ctxt->handle,
+					    header_bh, xe, len);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4439,6 +4414,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 }
 
 static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+						handle_t *handle,
 						struct ocfs2_xattr_search *xs,
 						char *val,
 						int value_len)
@@ -4454,7 +4430,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 
 	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
 
-	return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+	return __ocfs2_xattr_set_value_outside(inode, handle,
+					       xv, val, value_len);
 }
 
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4547,27 +4524,19 @@ out:
 }
 
 static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+					 handle_t *handle,
 					 struct ocfs2_xattr_search *xs)
 {
-	handle_t *handle = NULL;
 	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
 	struct ocfs2_xattr_entry *last = &xh->xh_entries[
 						le16_to_cpu(xh->xh_count) - 1];
 	int ret = 0;
 
-	handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
-				   ocfs2_blocks_per_xattr_bucket(inode->i_sb));
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		return;
-	}
-
 	ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		return;
 	}
 
 	/* Remove the old entry. */
@@ -4577,9 +4546,6 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
 	le16_add_cpu(&xh->xh_count, -1);
 
 	ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
-
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 }
 
 /*
@@ -4645,7 +4611,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 		xi->value_len = OCFS2_XATTR_ROOT_SIZE;
 	}
 
-	ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+	ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+					      name_hash, local);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4666,13 +4633,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
 			 * storage and we have allocated xattr already,
 			 * so need to remove it.
 			 */
-			ocfs2_xattr_bucket_remove_xs(inode, xs);
+			ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
 		}
 		goto out;
 	}
 
 set_value_outside:
-	ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+	ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+						   xs, val, value_len);
 out:
 	return ret;
 }
@@ -4785,7 +4753,8 @@ try_again:
 			 * name/value will be moved, the xe shouldn't be changed
 			 * in xs.
 			 */
-			ret = ocfs2_defrag_xattr_bucket(inode, xs->bucket);
+			ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+							xs->bucket);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -4865,6 +4834,13 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
+	ctxt.handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(ctxt.handle)) {
+		ret = PTR_ERR(ctxt.handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
 		if (ocfs2_xattr_is_local(xe))
@@ -4879,9 +4855,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 		}
 	}
 
+	ret = ocfs2_commit_trans(osb, ctxt.handle);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
-
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From fecc01126d7a244b7e9b563c80663ffdca35343b Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 12 Nov 2008 15:16:38 -0800
Subject: ocfs2: turn __ocfs2_remove_inode_range() into
 ocfs2_remove_btree_range()

This patch genericizes the high level handling of extent removal.
ocfs2_remove_btree_range() is nearly identical to
__ocfs2_remove_inode_range(), except that extent tree operations have been
used where necessary. We update ocfs2_remove_inode_range() to use the
generic helper. Now extent tree based structures have an easy way to
truncate ranges.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/alloc.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/alloc.h |  5 ++++
 fs/ocfs2/file.c  | 85 ++++----------------------------------------------------
 3 files changed, 82 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4614614084d..5592a2f6335 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5255,6 +5255,78 @@ out:
 	return ret;
 }
 
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ocfs2_et_update_clusters(inode, et, -len);
+
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
 {
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 3eb735eedae..0fbf8fc55a4 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -110,6 +110,11 @@ int ocfs2_remove_extent(struct inode *inode,
 			u32 cpos, u32 len, handle_t *handle,
 			struct ocfs2_alloc_context *meta_ac,
 			struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc);
+
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct ocfs2_extent_tree *et);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b..360549161e2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1226,83 +1226,6 @@ out:
 	return ret;
 }
 
-static int __ocfs2_remove_inode_range(struct inode *inode,
-				      struct buffer_head *di_bh,
-				      u32 cpos, u32 phys_cpos, u32 len,
-				      struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
-	int ret;
-	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct inode *tl_inode = osb->osb_tl_inode;
-	handle_t *handle;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
-	struct ocfs2_extent_tree et;
-
-	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-
-	ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	mutex_lock(&tl_inode->i_mutex);
-
-	if (ocfs2_truncate_log_needs_flush(osb)) {
-		ret = __ocfs2_flush_truncate_log(osb);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
-				  dealloc);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	OCFS2_I(inode)->ip_clusters -= len;
-	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
-	if (ret)
-		mlog_errno(ret);
-
-out_commit:
-	ocfs2_commit_trans(osb, handle);
-out:
-	mutex_unlock(&tl_inode->i_mutex);
-
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	return ret;
-}
-
 /*
  * Truncate a byte range, avoiding pages within partial clusters. This
  * preserves those pages for the zeroing code to write to.
@@ -1402,7 +1325,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 	struct address_space *mapping = inode->i_mapping;
+	struct ocfs2_extent_tree et;
 
+	ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
 	if (byte_len == 0)
@@ -1458,9 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 
 		/* Only do work for non-holes */
 		if (phys_cpos != 0) {
-			ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
-							 phys_cpos, alloc_size,
-							 &dealloc);
+			ret = ocfs2_remove_btree_range(inode, &et, cpos,
+						       phys_cpos, alloc_size,
+						       &dealloc);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
-- 
cgit v1.2.3


From f5d362022a947e84b0a3dd656d09c6b2322e234f Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:15:44 +0800
Subject: ocfs2: move new inode allocation out of the transaction

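Move new inode allocation out of ocfs2_mknod_locked() and into a new helper,
ocfs2_get_init_inode(), which ocfs2_mknod() and ocfs2_symlink() now call
themselves. This is needed because vfs_dq_init() must be called outside of
a transaction.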

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 108 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402ef..e8ff0bae179 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -66,12 +66,12 @@
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac);
 
 static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +186,34 @@ bail:
 	return ret;
 }
 
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		mlog(ML_ERROR, "new_inode failed!\n");
+		return NULL;
+	}
+
+	/* populate as many fields early on as possible - many of
+	 * these are used by the support functions here and in
+	 * callers. */
+	if (S_ISDIR(mode))
+		inode->i_nlink = 2;
+	else
+		inode->i_nlink = 1;
+	inode->i_uid = current_fsuid();
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+	inode->i_mode = mode;
+	return inode;
+}
+
 static int ocfs2_mknod(struct inode *dir,
 		       struct dentry *dentry,
 		       int mode,
@@ -250,6 +278,13 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
+	inode = ocfs2_get_init_inode(dir, mode);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto leave;
+	}
+
 	/* Reserve a cluster if creating an extent based directory. */
 	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
 		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
@@ -269,9 +304,9 @@ static int ocfs2_mknod(struct inode *dir,
 	}
 
 	/* do the real work now. */
-	status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+				    inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -332,8 +367,10 @@ leave:
 	brelse(de_bh);
 	brelse(parent_fe_bh);
 
-	if ((status < 0) && inode)
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
+	}
 
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
@@ -348,12 +385,12 @@ leave:
 
 static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 			      struct inode *dir,
-			      struct dentry *dentry, int mode,
+			      struct inode *inode,
+			      struct dentry *dentry,
 			      dev_t dev,
 			      struct buffer_head **new_fe_bh,
 			      struct buffer_head *parent_fe_bh,
 			      handle_t *handle,
-			      struct inode **ret_inode,
 			      struct ocfs2_alloc_context *inode_ac)
 {
 	int status = 0;
@@ -361,14 +398,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	struct ocfs2_extent_list *fel;
 	u64 fe_blkno = 0;
 	u16 suballoc_bit;
-	struct inode *inode = NULL;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-		   (unsigned long)dev, dentry->d_name.len,
+	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
 		   dentry->d_name.name);
 
 	*new_fe_bh = NULL;
-	*ret_inode = NULL;
 
 	status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
 				       &fe_blkno);
@@ -377,23 +412,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	inode = new_inode(dir->i_sb);
-	if (!inode) {
-		status = -ENOMEM;
-		mlog(ML_ERROR, "new_inode failed!\n");
-		goto leave;
-	}
-
 	/* populate as many fields early on as possible - many of
 	 * these are used by the support functions here and in
 	 * callers. */
 	inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
 	OCFS2_I(inode)->ip_blkno = fe_blkno;
-	if (S_ISDIR(mode))
-		inode->i_nlink = 2;
-	else
-		inode->i_nlink = 1;
-	inode->i_mode = mode;
 	spin_lock(&osb->osb_lock);
 	inode->i_generation = osb->s_next_generation++;
 	spin_unlock(&osb->osb_lock);
@@ -421,17 +444,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	fe->i_blkno = cpu_to_le64(fe_blkno);
 	fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
 	fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
-	fe->i_uid = cpu_to_le32(current_fsuid());
-	if (dir->i_mode & S_ISGID) {
-		fe->i_gid = cpu_to_le32(dir->i_gid);
-		if (S_ISDIR(mode))
-			mode |= S_ISGID;
-	} else
-		fe->i_gid = cpu_to_le32(current_fsgid());
-	fe->i_mode = cpu_to_le16(mode);
-	if (S_ISCHR(mode) || S_ISBLK(mode))
+	fe->i_uid = cpu_to_le32(inode->i_uid);
+	fe->i_gid = cpu_to_le32(inode->i_gid);
+	fe->i_mode = cpu_to_le16(inode->i_mode);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
 		fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-
 	fe->i_links_count = cpu_to_le16(inode->i_nlink);
 
 	fe->i_last_eb_blk = 0;
@@ -446,7 +463,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	/*
 	 * If supported, directories start with inline data.
 	 */
-	if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+	if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
 		u16 feat = le16_to_cpu(fe->i_dyn_features);
 
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -484,17 +501,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	status = 0; /* error in ocfs2_create_new_inode_locks is not
 		     * critical */
 
-	*ret_inode = inode;
 leave:
 	if (status < 0) {
 		if (*new_fe_bh) {
 			brelse(*new_fe_bh);
 			*new_fe_bh = NULL;
 		}
-		if (inode) {
-			clear_nlink(inode);
-			iput(inode);
-		}
 	}
 
 	mlog_exit(status);
@@ -1542,6 +1554,13 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
+	inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+	if (!inode) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	/* don't reserve bitmap space for fast symlinks. */
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
@@ -1560,10 +1579,9 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
-	status = ocfs2_mknod_locked(osb, dir, dentry,
-				    S_IFLNK | S_IRWXUGO, 0,
-				    &new_fe_bh, parent_fe_bh, handle,
-				    &inode, inode_ac);
+	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+				    0, &new_fe_bh, parent_fe_bh, handle,
+				    inode_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1644,8 +1662,10 @@ bail:
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
-	if ((status < 0) && inode)
+	if ((status < 0) && inode) {
+		clear_nlink(inode);
 		iput(inode);
+	}
 
 	mlog_exit(status);
 
-- 
cgit v1.2.3


From 6c3faba4421e230d77a181c260972229c542dec9 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:16:03 +0800
Subject: ocfs2: add ocfs2_xattr_set_handle

This function is used to set xattrs in an already started transaction. It is
only called during inode creation, for the initial security/acl xattrs of the
new inode. These xattrs could be put into ibody or extent block, so xattr
buckets would not be used in this case.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h |  4 ++++
 2 files changed, 72 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7a9089255a8..6480254fe39 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2325,6 +2325,74 @@ out:
 	return ret;
 }
 
+/*
+ * This function only called duing creating inode
+ * for init security/acl xattrs of the new inode.
+ * The xattrs could be put into ibody or extent block,
+ * xattr bucket would not be use in this case.
+ * transanction credits also be reserved in here.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *di_bh,
+			   int name_index,
+			   const char *name,
+			   const void *value,
+			   size_t value_len,
+			   int flags,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_dinode *di;
+	int ret;
+
+	struct ocfs2_xattr_info xi = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+
+	struct ocfs2_xattr_search xis = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_search xbs = {
+		.not_found = -ENODATA,
+	};
+
+	struct ocfs2_xattr_set_ctxt ctxt = {
+		.handle = handle,
+		.meta_ac = meta_ac,
+		.data_ac = data_ac,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	xis.inode_bh = xbs.inode_bh = di_bh;
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+	if (ret)
+		goto cleanup;
+	if (xis.not_found) {
+		ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+		if (ret)
+			goto cleanup;
+	}
+
+	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+cleanup:
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	brelse(xbs.xattr_bh);
+
+	return ret;
+}
+
 /*
  * ocfs2_xattr_set()
  *
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656..8fbdc163c83 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,6 +37,10 @@ extern struct xattr_handler *ocfs2_xattr_handlers[];
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 		    size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+			   int, const char *, const void *, size_t, int,
+			   struct ocfs2_alloc_context *,
+			   struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
 
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From 923f7f3102b80403152e05aee3d55ecfce240440 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:16:27 +0800
Subject: ocfs2: add security xattr API

This patch adds security xattr set/get/list APIs to
support security attributes in Ocfs2.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h |  1 +
 2 files changed, 48 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6480254fe39..db03162914c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/security.h>
 
 #define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
@@ -88,12 +89,14 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
 	&ocfs2_xattr_trusted_handler,
+	&ocfs2_xattr_security_handler,
 	NULL
 };
 
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
+	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
 };
 
 struct ocfs2_xattr_info {
@@ -4976,6 +4979,50 @@ out:
 	return ret;
 }
 
+/*
+ * 'security' attributes support
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+					size_t list_size, const char *name,
+					size_t name_len)
+{
+	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	const size_t total_len = prefix_len + name_len + 1;
+
+	if (list && total_len <= list_size) {
+		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+		memcpy(list + prefix_len, name, name_len);
+		list[prefix_len + name_len] = '\0';
+	}
+	return total_len;
+}
+
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+				    void *buffer, size_t size)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+			       buffer, size);
+}
+
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags)
+{
+	if (strcmp(name, "") == 0)
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+			       size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_security_handler = {
+	.prefix	= XATTR_SECURITY_PREFIX,
+	.list	= ocfs2_xattr_security_list,
+	.get	= ocfs2_xattr_security_get,
+	.set	= ocfs2_xattr_security_set,
+};
+
 /*
  * 'trusted' attributes support
  */
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 8fbdc163c83..55c5256ff56 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -32,6 +32,7 @@ enum ocfs2_xattr_type {
 
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-- 
cgit v1.2.3


From 534eadddc1de8754a227202c0e747af4973f82ce Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:16:41 +0800
Subject: ocfs2: add ocfs2_init_security in during file create

Security attributes must be set when creating a new inode.

We do this in three steps.

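- First, get the security xattr's name and value from the security
  operations (ocfs2_init_security_get()).

- Then calculate and reserve the metadata and clusters needed by this
  security xattr (ocfs2_calc_security_init()) before starting the transaction.

- Finally, set the xattr (ocfs2_init_security_set()) before ocfs2_add_entry().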

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/namei.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/ocfs2/xattr.c |  70 ++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h |  17 +++++++++
 3 files changed, 182 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e8ff0bae179..40da46b907f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -229,6 +229,12 @@ static int ocfs2_mknod(struct inode *dir,
 	struct inode *inode = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -285,17 +291,39 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	/* Reserve a cluster if creating an extent based directory. */
-	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security xattr */
+	if (si.enable) {
+		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+						  &xattr_credits, &xattr_ac);
 		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+			mlog_errno(status);
 			goto leave;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+	/* Reserve a cluster if creating an extent based directory. */
+	if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto leave;
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS + xattr_credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -335,6 +363,15 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 	}
 
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto leave;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
 				 de_bh);
@@ -366,6 +403,8 @@ leave:
 	brelse(new_fe_bh);
 	brelse(de_bh);
 	brelse(parent_fe_bh);
+	kfree(si.name);
+	kfree(si.value);
 
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
@@ -378,6 +417,9 @@ leave:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
+
 	mlog_exit(status);
 
 	return status;
@@ -1508,6 +1550,12 @@ static int ocfs2_symlink(struct inode *dir,
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *inode_ac = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *xattr_ac = NULL;
+	int want_clusters = 0;
+	int xattr_credits = 0;
+	struct ocfs2_security_xattr_info si = {
+		.enable = 1,
+	};
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1561,17 +1609,39 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
-	/* don't reserve bitmap space for fast symlinks. */
-	if (l > ocfs2_fast_symlink_chars(sb)) {
-		status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+	/* get security xattr */
+	status = ocfs2_init_security_get(inode, dir, &si);
+	if (status) {
+		if (status == -EOPNOTSUPP)
+			si.enable = 0;
+		else {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	/* calculate meta data/clusters for setting security xattr */
+	if (si.enable) {
+		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+						  &xattr_credits, &xattr_ac);
 		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
+			mlog_errno(status);
 			goto bail;
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, credits);
+	/* don't reserve bitmap space for fast symlinks. */
+	if (l > ocfs2_fast_symlink_chars(sb))
+		want_clusters += 1;
+
+	status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	handle = ocfs2_start_trans(osb, credits + xattr_credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -1632,6 +1702,15 @@ static int ocfs2_symlink(struct inode *dir,
 		}
 	}
 
+	if (si.enable) {
+		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+						 xattr_ac, data_ac);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
 	status = ocfs2_add_entry(handle, dentry, inode,
 				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
 				 de_bh);
@@ -1658,10 +1737,14 @@ bail:
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
 	brelse(de_bh);
+	kfree(si.name);
+	kfree(si.value);
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
+	if (xattr_ac)
+		ocfs2_free_alloc_context(xattr_ac);
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index db03162914c..2cab0d6615f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -81,6 +81,9 @@ struct ocfs2_xattr_set_ctxt {
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - sizeof(__u32))
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -343,6 +346,52 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
 	return;
 }
 
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+	int size = 0;
+
+	if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len);
+	else
+		size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+	size += sizeof(struct ocfs2_xattr_entry);
+
+	return size;
+}
+
+int ocfs2_calc_security_init(struct inode *dir,
+			     struct ocfs2_security_xattr_info *si,
+			     int *want_clusters,
+			     int *xattr_credits,
+			     struct ocfs2_alloc_context **xattr_ac)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						 si->value_len);
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * So reserve one metadata block for it is ok.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    s_size > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	/* reserve clusters for xattr value which will be set in B tree*/
+	if (si->value_len > OCFS2_XATTR_INLINE_SIZE)
+		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
+							   si->value_len);
+	return ret;
+}
+
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
 					 struct buffer_head *xattr_bh,
@@ -5016,6 +5065,27 @@ static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
 			       size, flags);
 }
 
+int ocfs2_init_security_get(struct inode *inode,
+			    struct inode *dir,
+			    struct ocfs2_security_xattr_info *si)
+{
+	return security_inode_init_security(inode, dir, &si->name, &si->value,
+					    &si->value_len);
+}
+
+int ocfs2_init_security_set(handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *di_bh,
+			    struct ocfs2_security_xattr_info *si,
+			    struct ocfs2_alloc_context *xattr_ac,
+			    struct ocfs2_alloc_context *data_ac)
+{
+	return ocfs2_xattr_set_handle(handle, inode, di_bh,
+				     OCFS2_XATTR_INDEX_SECURITY,
+				     si->name, si->value, si->value_len, 0,
+				     xattr_ac, data_ac);
+}
+
 struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.list	= ocfs2_xattr_security_list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 55c5256ff56..188ef6ba683 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,6 +30,13 @@ enum ocfs2_xattr_type {
 	OCFS2_XATTR_MAX
 };
 
+struct ocfs2_security_xattr_info {
+	int enable;
+	char *name;
+	void *value;
+	size_t value_len;
+};
+
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
 extern struct xattr_handler ocfs2_xattr_security_handler;
@@ -43,5 +50,15 @@ int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
 			   struct ocfs2_alloc_context *,
 			   struct ocfs2_alloc_context *);
 int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+			    struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+			    struct buffer_head *,
+			    struct ocfs2_security_xattr_info *,
+			    struct ocfs2_alloc_context *,
+			    struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+			     struct ocfs2_security_xattr_info *,
+			     int *, int *, struct ocfs2_alloc_context **);
 
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From 4e3e9d027f63488e676bf7700ec515a192e54f69 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:16:53 +0800
Subject: ocfs2: add ocfs2_xattr_get_nolock

This function does the work of ocfs2_xattr_get for callers that already
hold the inode lock.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 40 ++++++++++++++++++++++++++++------------
 fs/ocfs2/xattr.h |  2 ++
 2 files changed, 30 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2cab0d6615f..ba9b870a5dd 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -925,12 +925,8 @@ cleanup:
 	return ret;
 }
 
-/* ocfs2_xattr_get()
- *
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
+int ocfs2_xattr_get_nolock(struct inode *inode,
+			   struct buffer_head *di_bh,
 			   int name_index,
 			   const char *name,
 			   void *buffer,
@@ -938,7 +934,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 {
 	int ret;
 	struct ocfs2_dinode *di = NULL;
-	struct buffer_head *di_bh = NULL;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_xattr_search xis = {
 		.not_found = -ENODATA,
@@ -953,11 +948,6 @@ static int ocfs2_xattr_get(struct inode *inode,
 	if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
 		ret = -ENODATA;
 
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		return ret;
-	}
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
@@ -968,6 +958,32 @@ static int ocfs2_xattr_get(struct inode *inode,
 		ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
 					    buffer_size, &xbs);
 	up_read(&oi->ip_xattr_sem);
+
+	return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+			   int name_index,
+			   const char *name,
+			   void *buffer,
+			   size_t buffer_size)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+	ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+				     name, buffer, buffer_size);
+
 	ocfs2_inode_unlock(inode, 0);
 
 	brelse(di_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 188ef6ba683..86aa10ffe3f 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -43,6 +43,8 @@ extern struct xattr_handler ocfs2_xattr_security_handler;
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+			   const char *, void *, size_t);
 int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
 		    size_t, int);
 int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
-- 
cgit v1.2.3


From 929fb014e041c6572c5e8c3686f1e32742b5b953 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:17:04 +0800
Subject: ocfs2: add POSIX ACL API

This patch adds POSIX ACL (access control list) APIs to ocfs2. We convert
a struct posix_acl into an array of ocfs2_acl_entry and store it as the
value of an extended attribute.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile |   4 +
 fs/ocfs2/acl.c    | 378 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/acl.h    |  29 +++++
 fs/ocfs2/ocfs2.h  |   1 +
 fs/ocfs2/xattr.c  |  10 ++
 fs/ocfs2/xattr.h  |   4 +
 6 files changed, 426 insertions(+)
 create mode 100644 fs/ocfs2/acl.c
 create mode 100644 fs/ocfs2/acl.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3..e9ef5d162db 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -37,6 +37,10 @@ ocfs2-objs := \
 	ver.o			\
 	xattr.o
 
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
 ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 00000000000..62d0faad600
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,378 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copied from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert from xattr value to acl struct.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	int n, count;
+	struct posix_acl *acl;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct posix_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct posix_acl_entry);
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	for (n = 0; n < count; n++) {
+		struct ocfs2_acl_entry *entry =
+			(struct ocfs2_acl_entry *)value;
+
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+		value += sizeof(struct posix_acl_entry);
+
+	}
+	return acl;
+}
+
+/*
+ * Convert acl struct to xattr value.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry = NULL;
+	char *ocfs2_acl;
+	size_t n;
+
+	*size = acl->a_count * sizeof(struct posix_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = (struct ocfs2_acl_entry *)ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int name_index;
+	char *value = NULL;
+	struct posix_acl *acl;
+	int retval;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0);
+	if (retval > 0) {
+		value = kmalloc(retval, GFP_NOFS);
+		if (!value)
+			return ERR_PTR(-ENOMEM);
+		retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+						"", value, retval);
+	}
+
+	if (retval > 0)
+		acl = ocfs2_acl_from_xattr(value, retval);
+	else if (retval == -ENODATA || retval == 0)
+		acl = NULL;
+	else
+		acl = ERR_PTR(retval);
+
+	kfree(value);
+
+	return acl;
+}
+
+
+/*
+ * Get posix acl.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	ret = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		acl = ERR_PTR(ret);
+		return acl;
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+			 struct inode *inode,
+			 struct buffer_head *di_bh,
+			 int type,
+			 struct posix_acl *acl,
+			 struct ocfs2_alloc_context *meta_ac,
+			 struct ocfs2_alloc_context *data_ac)
+{
+	int name_index;
+	void *value = NULL;
+	size_t size = 0;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			ret = posix_acl_equiv_mode(acl, &mode);
+			if (ret < 0)
+				return ret;
+			else {
+				inode->i_mode = mode;
+				if (ret == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = ocfs2_acl_to_xattr(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+	}
+
+	if (handle)
+		ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+					     "", value, size, 0,
+					     meta_ac, data_ac);
+	else
+		ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+	kfree(value);
+
+	return ret;
+}
+
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
+	return size;
+}
+
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list && size <= list_len)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
+	return size;
+}
+
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl == NULL)
+		return -ENODATA;
+	ret = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return ret;
+}
+
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int ret = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		else if (acl) {
+			ret = posix_acl_valid(acl);
+			if (ret)
+				goto cleanup;
+		}
+	} else
+		acl = NULL;
+
+	ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	if (strcmp(name, "") != 0)
+		return -EINVAL;
+	return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.list	= ocfs2_xattr_list_acl_access,
+	.get	= ocfs2_xattr_get_acl_access,
+	.set	= ocfs2_xattr_set_acl_access,
+};
+
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.list	= ocfs2_xattr_list_acl_default,
+	.get	= ocfs2_xattr_get_acl_default,
+	.set	= ocfs2_xattr_set_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 00000000000..1b39f3e14c1
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,29 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+struct ocfs2_acl_entry {
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
+};
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d899..25d07ff1d3c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -195,6 +195,7 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
+	OCFS2_MOUNT_POSIX_ACL = 1 << 8,	/* POSIX access control lists */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba9b870a5dd..2e273c2cb83 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -91,6 +91,10 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 
 struct xattr_handler *ocfs2_xattr_handlers[] = {
 	&ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	&ocfs2_xattr_acl_access_handler,
+	&ocfs2_xattr_acl_default_handler,
+#endif
 	&ocfs2_xattr_trusted_handler,
 	&ocfs2_xattr_security_handler,
 	NULL
@@ -98,6 +102,12 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
 	[OCFS2_XATTR_INDEX_USER]	= &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+					= &ocfs2_xattr_acl_access_handler,
+	[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+					= &ocfs2_xattr_acl_default_handler,
+#endif
 	[OCFS2_XATTR_INDEX_TRUSTED]	= &ocfs2_xattr_trusted_handler,
 	[OCFS2_XATTR_INDEX_SECURITY]	= &ocfs2_xattr_security_handler,
 };
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 86aa10ffe3f..6163df336d8 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,6 +40,10 @@ struct ocfs2_security_xattr_info {
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
 extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
-- 
cgit v1.2.3


From 23fc2702bea686569281708ad519b41a11d0a2f4 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:17:18 +0800
Subject: ocfs2: add ocfs2_check_acl

This function is used to enhance permission checking with POSIX ACLs.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/acl.c  | 15 +++++++++++++++
 fs/ocfs2/acl.h  | 10 ++++++++++
 fs/ocfs2/file.c |  3 ++-
 3 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 62d0faad600..a6a2bf6d684 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -230,6 +230,21 @@ static int ocfs2_set_acl(handle_t *handle,
 	return ret;
 }
 
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		int ret = posix_acl_permission(inode, acl, mask);
+		posix_acl_release(acl);
+		return ret;
+	}
+
+	return -EAGAIN;
+}
+
 static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
 					  char *list,
 					  size_t list_len,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 1b39f3e14c1..fef10f1b782 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,4 +26,14 @@ struct ocfs2_acl_entry {
 	__le32 e_id;
 };
 
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
 #endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 360549161e2..7bad7d9b9a2 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -56,6 +56,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "xattr.h"
+#include "acl.h"
 
 #include "buffer_head_io.h"
 
@@ -1035,7 +1036,7 @@ int ocfs2_permission(struct inode *inode, int mask)
 		goto out;
 	}
 
-	ret = generic_permission(inode, mask, NULL);
+	ret = generic_permission(inode, mask, ocfs2_check_acl);
 
 	ocfs2_inode_unlock(inode, 0);
 out:
-- 
cgit v1.2.3


From 060bc66dd5017460076d9e808e2198cd532c943d Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:17:29 +0800
Subject: ocfs2: add ocfs2_acl_chmod

This function is used to update acl xattrs during file mode changes.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/acl.c  | 27 +++++++++++++++++++++++++++
 fs/ocfs2/acl.h  |  5 +++++
 fs/ocfs2/file.c |  6 ++++++
 3 files changed, 38 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index a6a2bf6d684..df72256c442 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,6 +245,33 @@ int ocfs2_check_acl(struct inode *inode, int mask)
 	return -EAGAIN;
 }
 
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl, *clone;
+	int ret;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl) || !acl)
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_KERNEL);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	ret = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!ret)
+		ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				    clone, NULL, NULL);
+	posix_acl_release(clone);
+	return ret;
+}
+
 static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
 					  char *list,
 					  size_t list_len,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index fef10f1b782..68ffd6436c5 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -29,10 +29,15 @@ struct ocfs2_acl_entry {
 #ifdef CONFIG_OCFS2_FS_POSIX_ACL
 
 extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
 
 #else /* CONFIG_OCFS2_FS_POSIX_ACL*/
 
 #define ocfs2_check_acl NULL
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
 
 #endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7bad7d9b9a2..4636aa6b011 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -990,6 +990,12 @@ bail_unlock_rw:
 bail:
 	brelse(bh);
 
+	if (!status && attr->ia_valid & ATTR_MODE) {
+		status = ocfs2_acl_chmod(inode);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
 	mlog_exit(status);
 	return status;
 }
-- 
cgit v1.2.3


From 89c38bd0ade3c567707ed8fce088b253b0369c50 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:17:41 +0800
Subject: ocfs2: add ocfs2_init_acl in mknod

We need to get the parent directory's ACLs and let the new child inherit them.
To do this, we add additional calculations for data/metadata allocation.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/acl.c   | 59 ++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/acl.h   | 14 ++++++++++
 fs/ocfs2/namei.c | 23 +++++++++++------
 fs/ocfs2/xattr.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.h |  3 +++
 5 files changed, 170 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index df72256c442..12dfb44c22e 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -272,6 +272,65 @@ int ocfs2_acl_chmod(struct inode *inode)
 	return ret;
 }
 
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ */
+int ocfs2_init_acl(handle_t *handle,
+		   struct inode *inode,
+		   struct inode *dir,
+		   struct buffer_head *di_bh,
+		   struct buffer_head *dir_bh,
+		   struct ocfs2_alloc_context *meta_ac,
+		   struct ocfs2_alloc_context *data_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int ret = 0;
+
+	if (!S_ISLNK(inode->i_mode)) {
+		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+						   dir_bh);
+			if (IS_ERR(acl))
+				return PTR_ERR(acl);
+		}
+		if (!acl)
+			inode->i_mode &= ~current->fs->umask;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+		struct posix_acl *clone;
+		mode_t mode;
+
+		if (S_ISDIR(inode->i_mode)) {
+			ret = ocfs2_set_acl(handle, inode, di_bh,
+					    ACL_TYPE_DEFAULT, acl,
+					    meta_ac, data_ac);
+			if (ret)
+				goto cleanup;
+		}
+		clone = posix_acl_clone(acl, GFP_NOFS);
+		ret = -ENOMEM;
+		if (!clone)
+			goto cleanup;
+
+		mode = inode->i_mode;
+		ret = posix_acl_create_masq(clone, &mode);
+		if (ret >= 0) {
+			inode->i_mode = mode;
+			if (ret > 0) {
+				ret = ocfs2_set_acl(handle, inode,
+						    di_bh, ACL_TYPE_ACCESS,
+						    clone, meta_ac, data_ac);
+			}
+		}
+		posix_acl_release(clone);
+	}
+cleanup:
+	posix_acl_release(acl);
+	return ret;
+}
+
 static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
 					  char *list,
 					  size_t list_len,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 68ffd6436c5..8f6389ed4da 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -30,6 +30,10 @@ struct ocfs2_acl_entry {
 
 extern int ocfs2_check_acl(struct inode *, int);
 extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+			  struct buffer_head *, struct buffer_head *,
+			  struct ocfs2_alloc_context *,
+			  struct ocfs2_alloc_context *);
 
 #else /* CONFIG_OCFS2_FS_POSIX_ACL*/
 
@@ -38,6 +42,16 @@ static inline int ocfs2_acl_chmod(struct inode *inode)
 {
 	return 0;
 }
+static inline int ocfs2_init_acl(handle_t *handle,
+				 struct inode *inode,
+				 struct inode *dir,
+				 struct buffer_head *di_bh,
+				 struct buffer_head *dir_bh,
+				 struct ocfs2_alloc_context *meta_ac,
+				 struct ocfs2_alloc_context *data_ac)
+{
+	return 0;
+}
 
 #endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 40da46b907f..76551451209 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -61,6 +61,7 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "xattr.h"
+#include "acl.h"
 
 #include "buffer_head_io.h"
 
@@ -302,14 +303,13 @@ static int ocfs2_mknod(struct inode *dir,
 		}
 	}
 
-	/* calculate meta data/clusters for setting security xattr */
-	if (si.enable) {
-		status = ocfs2_calc_security_init(dir, &si, &want_clusters,
-						  &xattr_credits, &xattr_ac);
-		if (status < 0) {
-			mlog_errno(status);
-			goto leave;
-		}
+	/* calculate meta data/clusters for setting security and acl xattr */
+	status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+					&si, &want_clusters,
+					&xattr_credits, &xattr_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
 	}
 
 	/* Reserve a cluster if creating an extent based directory. */
@@ -363,6 +363,13 @@ static int ocfs2_mknod(struct inode *dir,
 		inc_nlink(dir);
 	}
 
+	status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+				xattr_ac, data_ac);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	if (si.enable) {
 		status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
 						 xattr_ac, data_ac);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2e273c2cb83..3cc8385f973 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -84,6 +84,10 @@ struct ocfs2_xattr_set_ctxt {
 #define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
 					 - sizeof(struct ocfs2_xattr_header) \
 					 - sizeof(__u32))
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
+					 - sizeof(struct ocfs2_xattr_block) \
+					 - sizeof(struct ocfs2_xattr_header) \
+					 - sizeof(__u32))
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -402,6 +406,81 @@ int ocfs2_calc_security_init(struct inode *dir,
 	return ret;
 }
 
+/*
+ * Work out what creating an inode under @dir needs for its initial
+ * security and ACL xattrs: may reserve one metadata block in *xattr_ac
+ * and adds to *want_clusters / *xattr_credits.  Returns 0 or -errno.
+ */
+int ocfs2_calc_xattr_init(struct inode *dir, struct buffer_head *dir_bh,
+			  int mode, struct ocfs2_security_xattr_info *si,
+			  int *want_clusters, int *xattr_credits,
+			  struct ocfs2_alloc_context **xattr_ac)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int ret = 0, s_size = 0, a_size = 0, acl_len = 0;
+
+	if (si->enable)
+		s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						     si->value_len);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+		acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+					OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+					"", NULL, 0);
+		if (acl_len > 0) {
+			a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+			if (S_ISDIR(mode))
+				a_size <<= 1;
+		} else if (acl_len != 0 && acl_len != -ENODATA) {
+			/* Report the lookup failure itself; ret is still 0. */
+			mlog_errno(acl_len);
+			return acl_len;
+		}
+	}
+
+	if (!(s_size + a_size))
+		return ret;
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * The max space of acl xattr taken inline is
+	 * (80(value) + 16(entry)) * 2(if directory) = 192 bytes.
+	 * When blocksize = 512, we may need to reserve one more cluster
+	 * for an xattr bucket; otherwise reserving one metadata block
+	 * for them is enough.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+		*want_clusters += 1;
+		*xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+	}
+
+	/* reserve clusters for xattr value which will be set in B tree */
+	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE)
+		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
+							   si->value_len);
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) &&
+	    acl_len > OCFS2_XATTR_INLINE_SIZE) {
+		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+		if (S_ISDIR(mode))
+			*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
+								   acl_len);
+	}
+
+	return ret;
+}
+
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
 					 struct buffer_head *xattr_bh,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 6163df336d8..9a67e7d8f81 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -66,5 +66,8 @@ int ocfs2_init_security_set(handle_t *, struct inode *,
 int ocfs2_calc_security_init(struct inode *,
 			     struct ocfs2_security_xattr_info *,
 			     int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+			  int, struct ocfs2_security_xattr_info *,
+			  int *, int *, struct ocfs2_alloc_context **);
 
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From a68979b857283daf4acc405e476dcc8812a3ff2b Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Fri, 14 Nov 2008 11:17:52 +0800
Subject: ocfs2: add mount option and Kconfig option for acl

This patch adds the Kconfig option "CONFIG_OCFS2_FS_POSIX_ACL"
and the mount options "acl" and "noacl" to enable or disable ACLs in ocfs2.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/Kconfig       |  9 +++++++++
 fs/ocfs2/super.c | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index ff0e8198020..e8a47f74a83 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,6 +268,15 @@ config OCFS2_COMPAT_JBD
 	  is backwards compatible with JBD.  It is safe to say N here.
 	  However, if you really want to use the original JBD, say Y here.
 
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
+	depends on OCFS2_FS
+	select FS_POSIX_ACL
+	default n
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
 endif # BLOCK
 
 source "fs/notify/Kconfig"
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78c..9e7accc68b4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -158,6 +158,8 @@ enum {
 	Opt_user_xattr,
 	Opt_nouser_xattr,
 	Opt_inode64,
+	Opt_acl,
+	Opt_noacl,
 	Opt_err,
 };
 
@@ -180,6 +182,8 @@ static const match_table_t tokens = {
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_inode64, "inode64"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
 	{Opt_err, NULL}
 };
 
@@ -466,6 +470,8 @@ unlock_osb:
 	if (!ret) {
 		/* Only save off the new mount options in case of a successful
 		 * remount. */
+		if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+			parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
 		osb->s_mount_opt = parsed_options.mount_opt;
 		osb->s_atime_quantum = parsed_options.atime_quantum;
 		osb->preferred_slot = parsed_options.slot;
@@ -651,6 +657,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	brelse(bh);
 	bh = NULL;
+
+	if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+		parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+
 	osb->s_mount_opt = parsed_options.mount_opt;
 	osb->s_atime_quantum = parsed_options.atime_quantum;
 	osb->preferred_slot = parsed_options.slot;
@@ -664,6 +674,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_magic = OCFS2_SUPER_MAGIC;
 
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+		((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
 	/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
 	 * heartbeat=none */
 	if (bdev_read_only(sb->s_bdev)) {
@@ -945,6 +958,19 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
 			break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+		case Opt_acl:
+			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+			break;
+		case Opt_noacl:
+			mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+			break;
+#else
+		case Opt_acl:
+		case Opt_noacl:
+			printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+			break;
+#endif
 		default:
 			mlog(ML_ERROR,
 			     "Unrecognized mount option \"%s\" "
@@ -1017,6 +1043,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (opts & OCFS2_MOUNT_INODE64)
 		seq_printf(s, ",inode64");
 
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+	if (opts & OCFS2_MOUNT_POSIX_ACL)
+		seq_printf(s, ",acl");
+	else
+		seq_printf(s, ",noacl");
+#endif
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From b657c95c11088d77fc1bfc9c84d940f778bf9d12 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:11 -0800
Subject: ocfs2: Wrap inode block reads in a dedicated function.

The ocfs2 code currently reads inodes off disk with a simple
ocfs2_read_block() call.  Each place that does this has a different set
of sanity checks it performs.  Some check only the signature.  A couple
validate the block number (the block read vs di->i_blkno).  A couple
others check for VALID_FL.  Only one place validates i_fs_generation.  A
couple check nothing.  Even when an error is found, they don't all do
the same thing.

We wrap inode reading into ocfs2_read_inode_block().  This will validate
all the above fields, going readonly if they are invalid (they never
should be).  ocfs2_read_inode_block_full() is provided for the places
that want to pass read_block flags.  Every caller is passing a struct
inode with a valid ip_blkno, so we don't need a separate blkno argument
either.

We will remove the validation checks from the rest of the code in a
later commit, as they are no longer necessary.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c      |   2 +-
 fs/ocfs2/aops.c       |  11 +---
 fs/ocfs2/dir.c        |   6 +--
 fs/ocfs2/dlmglue.c    |  12 ++---
 fs/ocfs2/extent_map.c |   2 +-
 fs/ocfs2/file.c       |  21 ++------
 fs/ocfs2/inode.c      | 136 ++++++++++++++++++++++++++++++++++++--------------
 fs/ocfs2/inode.h      |  16 +++++-
 fs/ocfs2/journal.c    |   3 +-
 fs/ocfs2/localalloc.c |   8 +--
 fs/ocfs2/namei.c      |  14 +-----
 fs/ocfs2/symlink.c    |   2 +-
 12 files changed, 136 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 5592a2f6335..9c598adc947 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5658,7 +5658,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		iput(inode);
 		mlog_errno(status);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b3342..e219f8b546a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,20 +68,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	fe = (struct ocfs2_dinode *) bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		goto bail;
-	}
-
 	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
 						    le32_to_cpu(fe->i_clusters))) {
 		mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +255,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb8518..5777045f1a6 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -231,7 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -458,7 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
 	struct ocfs2_dinode *di;
 	struct ocfs2_inline_data *data;
 
-	ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(dir, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -636,7 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
 	struct ocfs2_inline_data *data;
 	struct ocfs2_dir_entry *de;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f..9f2a7f75d1b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2024,7 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	} else {
 		/* Boo, we have to go to disk. */
 		/* read bh, cast, ocfs2_refresh_inode */
-		status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+		status = ocfs2_read_inode_block(inode, bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_refresh;
@@ -2032,18 +2032,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 		fe = (struct ocfs2_dinode *) (*bh)->b_data;
 
 		/* This is a good chance to make sure we're not
-		 * locking an invalid object.
+		 * locking an invalid object.  ocfs2_read_inode_block()
+		 * already checked that the inode block is sane.
 		 *
 		 * We bug on a stale inode here because we checked
 		 * above whether it was wiped from disk. The wiping
 		 * node provides a guarantee that we receive that
 		 * message and can mark the inode before dropping any
 		 * locks associated with it. */
-		if (!OCFS2_IS_VALID_DINODE(fe)) {
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-			status = -EIO;
-			goto bail_refresh;
-		}
 		mlog_bug_on_msg(inode->i_generation !=
 				le32_to_cpu(fe->i_generation),
 				"Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2081,7 @@ static int ocfs2_assign_bh(struct inode *inode,
 		return 0;
 	}
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+	status = ocfs2_read_inode_block(inode, ret_bh);
 	if (status < 0)
 		mlog_errno(status);
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac5823..b686b31cf49 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -630,7 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
 	if (ret == 0)
 		goto out;
 
-	ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+	ret = ocfs2_read_inode_block(inode, &di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4636aa6b011..41001d515fa 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -402,12 +402,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)new_i_size);
 
+	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
+	 * already validated it */
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
 
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %llu, inode i_size = %lld != di "
@@ -546,18 +543,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	 */
 	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+	status = ocfs2_read_inode_block(inode, &bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
-
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto leave;
-	}
 
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -1135,9 +1126,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 {
 	int ret;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+	ret = ocfs2_read_inode_block(inode, &bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1163,8 +1153,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
 	struct buffer_head *di_bh = NULL;
 
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
-		ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
-				       &di_bh);
+		ret = ocfs2_read_inode_block(inode, &di_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d51187..9eb701b8646 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -214,12 +214,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	return 0;
 }
 
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-		     	 int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino)
 {
 	struct super_block *sb;
 	struct ocfs2_super *osb;
-	int status = -EINVAL;
 	int use_plocks = 1;
 
 	mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +231,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	    ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
 		use_plocks = 0;
 
-	/* this means that read_inode cannot create a superblock inode
-	 * today.  change if needed. */
-	if (!OCFS2_IS_VALID_DINODE(fe) ||
-	    !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
-		mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
-		     "signature = %.*s, flags = 0x%x\n",
-		     inode->i_ino,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature, le32_to_cpu(fe->i_flags));
-		goto bail;
-	}
+	/*
+	 * These have all been checked by ocfs2_read_inode_block() or set
+	 * by ocfs2_mknod_locked(), so a failure is a code bug.
+	 */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));  /* This means that read_inode
+						cannot create a superblock
+						inode today.  change if
+						that is needed. */
+	BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+	BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
 
-	if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
-		mlog(ML_ERROR, "file entry generation does not match "
-		     "superblock! osb->fs_generation=%x, "
-		     "fe->i_fs_generation=%x\n",
-		     osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
-		goto bail;
-	}
 
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
 	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -354,10 +345,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	ocfs2_set_inode_flags(inode);
 
-	status = 0;
-bail:
-	mlog_exit(status);
-	return status;
+	mlog_exit_void();
 }
 
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +448,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
-	if (can_lock)
-		status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
-	else
+	if (can_lock) {
+		status = ocfs2_read_inode_block_full(inode, &bh,
+						     OCFS2_BH_IGNORE_CACHE);
+	} else {
 		status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+		if (!status)
+			status = ocfs2_validate_inode_block(osb->sb, bh);
+	}
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -472,12 +463,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 	status = -EINVAL;
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
-		     (unsigned long long)args->fi_blkno, 7,
-		     fe->i_signature);
-		goto bail;
-	}
 
 	/*
 	 * This is a code bug. Right now the caller needs to
@@ -491,10 +476,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 
 	if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
 	    S_ISBLK(le16_to_cpu(fe->i_mode)))
-    		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+		inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
 
-	if (ocfs2_populate_inode(inode, fe, 0) < 0)
-		goto bail;
+	ocfs2_populate_inode(inode, fe, 0);
 
 	BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
 
@@ -1264,3 +1248,79 @@ void ocfs2_refresh_inode(struct inode *inode,
 
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 }
+
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh)
+{
+	struct ocfs2_dinode *dinode = (struct ocfs2_dinode *)bh->b_data;
+	unsigned long long blkno = (unsigned long long)bh->b_blocknr;
+
+	/* The buffer must be up to date before its contents are inspected. */
+	BUG_ON(!buffer_uptodate(bh));
+
+	/* Each failed check is reported via ocfs2_error() and yields -EINVAL. */
+	if (!OCFS2_IS_VALID_DINODE(dinode)) {
+		ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+			    blkno, 7, dinode->i_signature);
+		return -EINVAL;
+	}
+
+	/* The block must record the same block number it was read from. */
+	if (le64_to_cpu(dinode->i_blkno) != blkno) {
+		ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+			    blkno,
+			    (unsigned long long)le64_to_cpu(dinode->i_blkno));
+		return -EINVAL;
+	}
+
+	/* The inode must carry OCFS2_VALID_FL. */
+	if (!(dinode->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+			    blkno);
+		return -EINVAL;
+	}
+
+	/* Its fs generation must equal the one held in OCFS2_SB(sb). */
+	if (le32_to_cpu(dinode->i_fs_generation) !=
+	    OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Invalid dinode #%llu: fs_generation is %u\n",
+			    blkno, le32_to_cpu(dinode->i_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags)
+{
+	struct buffer_head *blk = *bh;
+	int err;
+
+	err = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &blk,
+				flags);
+	if (err)
+		return err;
+
+	/* A readahead request is not validated here. */
+	if (!(flags & OCFS2_BH_READAHEAD)) {
+		err = ocfs2_validate_inode_block(inode->i_sb, blk);
+		if (err) {
+			brelse(blk);
+			return err;
+		}
+	}
+
+	/* Hand a buffer obtained by ocfs2_read_blocks() back to the caller. */
+	if (*bh == NULL)
+		*bh = blk;
+
+	return 0;
+}
+
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+	return ocfs2_read_inode_block_full(inode, bh, 0); /* no read flags */
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4..b79c371a9d2 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
 			 int sysfile_type);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
-			 int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+			  int create_ino);
 void ocfs2_read_inode(struct inode *inode);
 void ocfs2_read_inode2(struct inode *inode, void *opaque);
 ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -153,4 +153,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
 }
 
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+			       struct buffer_head *bh);
+/*
+ * Read an inode block into *bh.  If *bh is NULL, a bh will be allocated.
+ * This is a cached read.  The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+				int flags);
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3..877aaa05e19 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1135,8 +1135,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
 	}
 	SET_INODE_JOURNAL(inode);
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
-				   OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c3..19cfb1b9ce0 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -248,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -459,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 
 	mutex_lock(&inode->i_mutex);
 
-	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
-				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+					     OCFS2_BH_IGNORE_CACHE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 76551451209..0134bafdab9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -531,15 +531,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	if (ocfs2_populate_inode(inode, fe, 1) < 0) {
-		mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
-		     "i_blkno=%llu, i_ino=%lu\n",
-		     (unsigned long long)(*new_fe_bh)->b_blocknr,
-		     (unsigned long long)le64_to_cpu(fe->i_blkno),
-		     inode->i_ino);
-		BUG();
-	}
-
+	ocfs2_populate_inode(inode, fe, 1);
 	ocfs2_inode_set_new(osb, inode);
 	if (!ocfs2_mount_local(osb)) {
 		status = ocfs2_create_new_inode_locks(inode);
@@ -1864,9 +1856,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	status = ocfs2_read_block(orphan_dir_inode,
-				  OCFS2_I(orphan_dir_inode)->ip_blkno,
-				  &orphan_dir_bh);
+	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b..ed0a0cfd68d 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 
 	mlog_entry_void();
 
-	status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+	status = ocfs2_read_inode_block(inode, bh);
 	if (status < 0) {
 		mlog_errno(status);
 		link = ERR_PTR(status);
-- 
cgit v1.2.3


From 10995aa2451afa20b721cc7de856cae1a13dba57 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:12 -0800
Subject: ocfs2: Morph the haphazard OCFS2_IS_VALID_DINODE() checks.

Random places in the code would check a dinode bh to see if it was
valid.  Not only did they do different levels of validation, they
handled errors in different ways.

The previous commit unified inode block reads, validating all block
reads in the same place.  Thus, these haphazard checks are no longer
necessary.  Rather than eliminate them, however, we change them to
BUG_ON() checks.  This ensures the assumptions remain true.  All of the
code paths to these checks have been audited to ensure they come from a
validated inode read.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c    | 50 +++++++++++++++++++++-----------------------------
 fs/ocfs2/journal.c  | 17 +++++------------
 fs/ocfs2/ocfs2.h    |  8 --------
 fs/ocfs2/resize.c   | 10 ++++------
 fs/ocfs2/suballoc.c | 36 ++++++++++++++++--------------------
 5 files changed, 46 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9c598adc947..320545b9fe1 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -187,20 +187,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 static int ocfs2_dinode_sanity_check(struct inode *inode,
 				     struct ocfs2_extent_tree *et)
 {
-	int ret = 0;
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = et->et_object;
 
 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
 
-	di = et->et_object;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		ret = -EIO;
-		ocfs2_error(inode->i_sb,
-			"Inode %llu has invalid path root",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
-
-	return ret;
+	return 0;
 }
 
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -5380,13 +5372,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-		status = -EIO;
-		goto bail;
-	}
 
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	tl_count = le16_to_cpu(tl->tl_count);
 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
 			tl_count == 0,
@@ -5536,13 +5528,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
-		status = -EIO;
-		goto out;
-	}
 
+	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
+	 * by the underlying call to ocfs2_read_inode_block(), so any
+	 * corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	num_to_flush = le16_to_cpu(tl->tl_used);
 	mlog(0, "Flush %u records from truncate log #%llu\n",
 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5697,13 +5689,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 	}
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
-	tl = &di->id2.i_dealloc;
-	if (!OCFS2_IS_VALID_DINODE(di)) {
-		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
-		status = -EIO;
-		goto bail;
-	}
 
+	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
+	 * validated by the underlying call to ocfs2_read_inode_block(),
+	 * so any corruption is a code bug */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+	tl = &di->id2.i_dealloc;
 	if (le16_to_cpu(tl->tl_used)) {
 		mlog(0, "We'll have %u logs to recover\n",
 		     le16_to_cpu(tl->tl_used));
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 877aaa05e19..9223bfcca3b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -587,17 +587,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	mlog_entry_void();
 
 	fe = (struct ocfs2_dinode *)bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		/* This is called from startup/shutdown which will
-		 * handle the errors in a specific manner, so no need
-		 * to call ocfs2_error() here. */
-		mlog(ML_ERROR, "Journal dinode %llu  has invalid "
-		     "signature: %.*s",
-		     (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
-		     fe->i_signature);
-		status = -EIO;
-		goto out;
-	}
+
+	/* The journal bh on the osb always comes from ocfs2_journal_init()
+	 * and was validated there inside ocfs2_inode_lock_full().  It's a
+	 * code bug if we mess it up. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 
 	flags = le32_to_cpu(fe->id1.journal1.ij_flags);
 	if (dirty)
@@ -613,7 +607,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	if (status < 0)
 		mlog_errno(status);
 
-out:
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 25d07ff1d3c..467bdb6f71e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -444,14 +444,6 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_DINODE(ptr)					\
 	(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di)	do {			\
-	typeof(__di) ____di = (__di);					\
-	ocfs2_error((__sb), 						\
-		"Dinode # %llu has bad signature %.*s",			\
-		(unsigned long long)le64_to_cpu((____di)->i_blkno), 7, 	\
-		(____di)->i_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
 
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a..739d452f617 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 
 	fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
 
+	/* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+	 * so any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
 				 ocfs2_group_bitmap_size(osb->sb) * 8) {
 		mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,12 +326,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 	}
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
-		ret = -EIO;
-		goto out_unlock;
-	}
-
 	first_new_cluster = le32_to_cpu(fe->i_clusters);
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b5..95d432b694e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -441,11 +441,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	ac->ac_alloc_slot = slot;
 
 	fe = (struct ocfs2_dinode *) bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read inside
+	 * ocfs2_inode_lock().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -931,11 +931,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto out;
-	}
 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
 		status = -EIO;
@@ -1392,11 +1387,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
 	BUG_ON(!ac->ac_bh);
 
 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+
+	/* The bh was validated by the inode read during
+	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1782,11 +1777,12 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
+	/* The alloc_bh comes from ocfs2_free_dinode() or
+	 * ocfs2_free_clusters().  The callers have all locked the
+	 * allocator and gotten alloc_bh from the lock call.  This
+	 * validates the dinode buffer.  Any corruption that has happened
+	 * is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
-- 
cgit v1.2.3


From 57e3e7971136003c96766346049aa73b82cab079 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:13 -0800
Subject: ocfs2: Consolidate validation of group descriptors.

Currently the validation of group descriptors is directly duplicated so
that one version can error the filesystem and the other (resize) can
just report the problem.  Consolidate to one function that takes a
boolean.  Wrap that function with the old call for the old users.

This is in preparation for lifting the read+validate step into a
single function.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/resize.c   | 40 ++++++-----------------------
 fs/ocfs2/suballoc.c | 74 +++++++++++++++++++++++++++++++----------------------
 fs/ocfs2/suballoc.h | 20 ++++++++++++---
 3 files changed, 68 insertions(+), 66 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 739d452f617..a2de32a317a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -396,41 +396,16 @@ static int ocfs2_check_new_group(struct inode *inode,
 				 struct buffer_head *group_bh)
 {
 	int ret;
-	struct ocfs2_group_desc *gd;
+	struct ocfs2_group_desc *gd =
+		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
-	unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
-				le16_to_cpu(di->id2.i_chain.cl_bpc);
 
+	ret = ocfs2_validate_group_descriptor(inode->i_sb, di, gd, 1);
+	if (ret)
+		goto out;
 
-	gd = (struct ocfs2_group_desc *)group_bh->b_data;
-
-	ret = -EIO;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd))
-		mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno));
-	else if (di->i_blkno != gd->bg_parent_dinode)
-		mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
-		     "pointer (%llu, expected %llu)\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-		     (unsigned long long)le64_to_cpu(di->i_blkno));
-	else if (le16_to_cpu(gd->bg_bits) > max_bits)
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits));
-	else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "claims that %u are free\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     le16_to_cpu(gd->bg_free_bits_count));
-	else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
-		mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
-		     "max bitmap bits of %u\n",
-		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-		     le16_to_cpu(gd->bg_bits),
-		     8 * le16_to_cpu(gd->bg_size));
-	else if (le16_to_cpu(gd->bg_chain) != input->chain)
+	ret = -EINVAL;
+	if (le16_to_cpu(gd->bg_chain) != input->chain)
 		mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
 		     "while input has %u set.\n",
 		     (unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -449,6 +424,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 	else
 		ret = 0;
 
+out:
 	return ret;
 }
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 95d432b694e..ddba97dc06a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -146,59 +146,71 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 }
 
 /* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd)
+int ocfs2_validate_group_descriptor(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct ocfs2_group_desc *gd,
+				    int clean_error)
 {
 	unsigned int max_bits;
 
+#define do_error(fmt, ...)						\
+	do{								\
+		if (clean_error)					\
+			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
+		else							\
+			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
+	} while (0)
+
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
-		return -EIO;
+		do_error("Group Descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno), 7,
+			 gd->bg_signature);
+		return -EINVAL;
 	}
 
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
-			    "pointer (%llu, expected %llu)",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
-			    (unsigned long long)le64_to_cpu(di->i_blkno));
-		return -EIO;
+		do_error("Group descriptor # %llu has bad parent "
+			 "pointer (%llu, expected %llu)",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+			 (unsigned long long)le64_to_cpu(di->i_blkno));
+		return -EINVAL;
 	}
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits));
-		return -EIO;
+		do_error("Group descriptor # %llu has bit count of %u",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 le16_to_cpu(gd->bg_bits));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_chain));
-		return -EIO;
+		do_error("Group descriptor # %llu has bad chain %u",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 le16_to_cpu(gd->bg_chain));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "claims that %u are free",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    le16_to_cpu(gd->bg_free_bits_count));
-		return -EIO;
+		do_error("Group descriptor # %llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
-			    "max bitmap bits of %u",
-			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			    le16_to_cpu(gd->bg_bits),
-			    8 * le16_to_cpu(gd->bg_size));
-		return -EIO;
+		do_error("Group descriptor # %llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
 	}
+#undef do_error
 
 	return 0;
 }
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f45..7adfcc478bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -165,9 +165,23 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
 /* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
-				 struct ocfs2_dinode *di,
-				 struct ocfs2_group_desc *gd);
+/*
+ * By default, ocfs2_validate_group_descriptor() calls ocfs2_error() when it
+ * finds a problem.  A caller that wants to check a group descriptor
+ * without going readonly passes a nonzero clean_error.  This is only
+ * resize, really.
+ */
+int ocfs2_validate_group_descriptor(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct ocfs2_group_desc *gd,
+				    int clean_error);
+static inline int ocfs2_check_group_descriptor(struct super_block *sb,
+					       struct ocfs2_dinode *di,
+					       struct ocfs2_group_desc *gd)
+{
+	return ocfs2_validate_group_descriptor(sb, di, gd, 0);
+}
+
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
-- 
cgit v1.2.3


From 68f64d471be38631d7196b938d9809802dd467fa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:14 -0800
Subject: ocfs2: Wrap group descriptor reads in a dedicated function.

We have a clean call for validating group descriptors, but every place
that wants one always does a read_block()+validate() call pair.  Create
a toplevel ocfs2_read_group_descriptor() that does the right
thing.  This allows us to leverage the single call point later for
fancier handling.  We also add validation of gd->bg_generation against
the superblock and gd->bg_blkno against the block we thought we read.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/resize.c   |  12 ++----
 fs/ocfs2/suballoc.c | 108 +++++++++++++++++++++++++++++++---------------------
 fs/ocfs2/suballoc.h |  19 ++++-----
 3 files changed, 78 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index a2de32a317a..252baff5eb8 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -330,20 +330,14 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
 					      first_new_cluster - 1);
 
-	ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
-
 	group = (struct ocfs2_group_desc *)group_bh->b_data;
 
-	ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
-
 	cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
 		le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -400,7 +394,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
 
-	ret = ocfs2_validate_group_descriptor(inode->i_sb, di, gd, 1);
+	ret = ocfs2_validate_group_descriptor(inode->i_sb, di, group_bh, 1);
 	if (ret)
 		goto out;
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ddba97dc06a..797f509d725 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -145,13 +145,13 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 
-/* somewhat more expensive than our other checks, so use sparingly. */
 int ocfs2_validate_group_descriptor(struct super_block *sb,
 				    struct ocfs2_dinode *di,
-				    struct ocfs2_group_desc *gd,
+				    struct buffer_head *bh,
 				    int clean_error)
 {
 	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
 #define do_error(fmt, ...)						\
 	do{								\
@@ -162,16 +162,32 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 	} while (0)
 
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		do_error("Group Descriptor #%llu has bad signature %.*s",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno), 7,
+		do_error("Group descriptor #%llu has bad signature %.*s",
+			 (unsigned long long)bh->b_blocknr, 7,
 			 gd->bg_signature);
 		return -EINVAL;
 	}
 
+	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+		do_error("Group descriptor #%llu has an invalid bg_blkno "
+			 "of %llu",
+			 (unsigned long long)bh->b_blocknr,
+			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+		do_error("Group descriptor #%llu has an invalid "
+			 "fs_generation of #%u",
+			 (unsigned long long)bh->b_blocknr,
+			 le32_to_cpu(gd->bg_generation));
+		return -EINVAL;
+	}
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
-		do_error("Group descriptor # %llu has bad parent "
+		do_error("Group descriptor #%llu has bad parent "
 			 "pointer (%llu, expected %llu)",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 (unsigned long long)bh->b_blocknr,
 			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
 			 (unsigned long long)le64_to_cpu(di->i_blkno));
 		return -EINVAL;
@@ -179,33 +195,33 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 
 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
-		do_error("Group descriptor # %llu has bit count of %u",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		do_error("Group descriptor #%llu has bit count of %u",
+			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_bits));
 		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_chain) >=
 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
-		do_error("Group descriptor # %llu has bad chain %u",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+		do_error("Group descriptor #%llu has bad chain %u",
+			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_chain));
 		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		do_error("Group descriptor # %llu has bit count %u but "
+		do_error("Group descriptor #%llu has bit count %u but "
 			 "claims that %u are free",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_bits),
 			 le16_to_cpu(gd->bg_free_bits_count));
 		return -EINVAL;
 	}
 
 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		do_error("Group descriptor # %llu has bit count %u but "
+		do_error("Group descriptor #%llu has bit count %u but "
 			 "max bitmap bits of %u",
-			 (unsigned long long)le64_to_cpu(gd->bg_blkno),
+			 (unsigned long long)bh->b_blocknr,
 			 le16_to_cpu(gd->bg_bits),
 			 8 * le16_to_cpu(gd->bg_size));
 		return -EINVAL;
@@ -215,6 +231,30 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 	return 0;
 }
 
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_group_descriptor(inode->i_sb, di, tmp, 0);
+	if (rc) {
+		brelse(tmp);
+		goto out;
+	}
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
+}
+
 static int ocfs2_block_group_fill(handle_t *handle,
 				  struct inode *alloc_inode,
 				  struct buffer_head *bg_bh,
@@ -1177,21 +1217,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
 	u16 found;
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *gd;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
 	struct inode *alloc_inode = ac->ac_inode;
 
-	ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+					  &group_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
-	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
 				  ac->ac_max_block, bit_off, &found);
 	if (ret < 0) {
@@ -1248,19 +1284,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	     bits_wanted, chain,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
 
-	status = ocfs2_read_block(alloc_inode,
-				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
-				  &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe,
+					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
 
 	status = -ENOSPC;
 	/* for now, the chain search is a bit simplistic. We just use
@@ -1278,18 +1309,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		next_group = le64_to_cpu(bg->bg_next_group);
 		prev_group_bh = group_bh;
 		group_bh = NULL;
-		status = ocfs2_read_block(alloc_inode,
-					  next_group, &group_bh);
+		status = ocfs2_read_group_descriptor(alloc_inode, fe,
+						     next_group, &group_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
-		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
-		if (status) {
-			mlog_errno(status);
-			goto bail;
-		}
 	}
 	if (status < 0) {
 		if (status != -ENOSPC)
@@ -1801,18 +1827,14 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
 	     (unsigned long long)bg_blkno, start_bit);
 
-	status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+					     &group_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-
 	group = (struct ocfs2_group_desc *) group_bh->b_data;
-	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
-	if (status) {
-		mlog_errno(status);
-		goto bail;
-	}
+
 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
 
 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 7adfcc478bd..43de4fd826d 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,23 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
  * and return that block offset. */
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
-/* somewhat more expensive than our other checks, so use sparingly. */
 /*
  * By default, ocfs2_validate_group_descriptor() calls ocfs2_error() when it
  * finds a problem.  A caller that wants to check a group descriptor
  * without going readonly passes a nonzero clean_error.  This is only
- * resize, really.
+ * resize, really.  Everyone else should be using
+ * ocfs2_read_group_descriptor().
  */
 int ocfs2_validate_group_descriptor(struct super_block *sb,
 				    struct ocfs2_dinode *di,
-				    struct ocfs2_group_desc *gd,
+				    struct buffer_head *bh,
 				    int clean_error);
-static inline int ocfs2_check_group_descriptor(struct super_block *sb,
-					       struct ocfs2_dinode *di,
-					       struct ocfs2_group_desc *gd)
-{
-	return ocfs2_validate_group_descriptor(sb, di, gd, 0);
-}
+/*
+ * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+				u64 gd_blkno, struct buffer_head **bh);
 
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
 			  u32 clusters_to_add, u32 extents_to_split,
-- 
cgit v1.2.3


From 4203530613280281868b3ca36c817530bca3825c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:15 -0800
Subject: ocfs2: Morph the haphazard OCFS2_IS_VALID_GROUP_DESC() checks.

Random places in the code would check a group descriptor bh to see if it
was valid. The previous commit unified descriptor block reads,
validating all block reads in the same place.  Thus, these checks are no
longer necessary.  Rather than eliminate them, however, we change them
to BUG_ON() checks.  This ensures the assumptions remain true.  All of
the code paths to these checks have been audited to ensure they come
from a validated descriptor read.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2.h    |  7 -------
 fs/ocfs2/suballoc.c | 39 ++++++++++++++-------------------------
 2 files changed, 14 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 467bdb6f71e..82ba887afa0 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -458,13 +458,6 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd)	do {		\
-	typeof(__gd) ____gd = (__gd);					\
-		ocfs2_error((__sb),					\
-		"Group Descriptor # %llu has bad signature %.*s",	\
-		(unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
-		(____gd)->bg_signature);				\
-} while (0)
 
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
 	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 797f509d725..766a00b2644 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -842,10 +842,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
 	int offset, start, found, status = 0;
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
-		return -EIO;
-	}
+	/* Callers got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	found = start = best_offset = best_size = 0;
 	bitmap = bg->bg_bitmap;
@@ -910,11 +909,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* All callers get the descriptor via
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -983,16 +980,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto out;
-	}
-	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
-		status = -EIO;
-		goto out;
-	}
+	/* The caller got these descriptors from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -1055,7 +1046,7 @@ out_rollback:
 		bg->bg_next_group = cpu_to_le64(bg_ptr);
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
-out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -1758,11 +1749,9 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 
 	mlog_entry_void();
 
-	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
-		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
-		status = -EIO;
-		goto bail;
-	}
+	/* The caller got this descriptor from
+	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
 
-- 
cgit v1.2.3


From 5e96581a377fc6bd76e9b112da9aeb8a7ae8bf22 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:16 -0800
Subject: ocfs2: Wrap extent block reads in a dedicated function.

We weren't consistently checking extent blocks after we read them.
Most places checked the signature, but none checked h_blkno or
h_fs_generation.  Create a toplevel ocfs2_read_extent_block() that does
the read and the validation.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c      | 151 ++++++++++++++++++++++++++++++++------------------
 fs/ocfs2/alloc.h      |   8 +++
 fs/ocfs2/extent_map.c |  23 ++------
 fs/ocfs2/ocfs2.h      |   8 ---
 4 files changed, 111 insertions(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 320545b9fe1..f430cc6e0f3 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -678,6 +678,66 @@ struct ocfs2_merge_ctxt {
 	int			c_split_covers_rec;
 };
 
+static int ocfs2_validate_extent_block(struct super_block *sb,
+				       struct buffer_head *bh)
+{
+	struct ocfs2_extent_block *eb =
+		(struct ocfs2_extent_block *)bh->b_data;
+
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    eb->h_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid h_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(eb->h_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid "
+			    "h_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(eb->h_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, eb_blkno, &tmp);
+	if (rc)
+		goto out;
+
+	rc = ocfs2_validate_extent_block(inode->i_sb, tmp);
+	if (rc) {
+		brelse(tmp);
+		goto out;
+	}
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc;
+}
+
+
 /*
  * How many free extents have we got before we need more meta data?
  */
@@ -697,8 +757,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
 	if (last_eb_blk) {
-		retval = ocfs2_read_block(inode, last_eb_blk,
-					  &eb_bh);
+		retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
 		if (retval < 0) {
 			mlog_errno(retval);
 			goto bail;
@@ -900,11 +959,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	for(i = 0; i < new_blocks; i++) {
 		bh = new_eb_bhs[i];
 		eb = (struct ocfs2_extent_block *) bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
-		}
+		/* ocfs2_create_new_meta_bhs() should create it right! */
+		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 		eb_el = &eb->h_list;
 
 		status = ocfs2_journal_access(handle, inode, bh,
@@ -1044,11 +1100,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	}
 
 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		status = -EIO;
-		goto bail;
-	}
+	/* ocfs2_create_new_meta_bhs() should create it right! */
+	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
 	eb_el = &eb->h_list;
 	root_el = et->et_root_el;
@@ -1168,18 +1221,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
 		brelse(bh);
 		bh = NULL;
 
-		status = ocfs2_read_block(inode, blkno, &bh);
+		status = ocfs2_read_extent_block(inode, blkno, &bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 
 		eb = (struct ocfs2_extent_block *) bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
-		}
 		el = &eb->h_list;
 
 		if (le16_to_cpu(el->l_next_free_rec) <
@@ -1532,7 +1580,7 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		brelse(bh);
 		bh = NULL;
-		ret = ocfs2_read_block(inode, blkno, &bh);
+		ret = ocfs2_read_extent_block(inode, blkno, &bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1540,11 +1588,6 @@ static int __ocfs2_find_path(struct inode *inode,
 
 		eb = (struct ocfs2_extent_block *) bh->b_data;
 		el = &eb->h_list;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EIO;
-			goto out;
-		}
 
 		if (le16_to_cpu(el->l_next_free_rec) >
 		    le16_to_cpu(el->l_count)) {
@@ -4089,8 +4132,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			    le16_to_cpu(new_el->l_count)) {
 				bh = path_leaf_bh(left_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
+				ocfs2_error(inode->i_sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of "
+					    "%d.  It should have "
+					    "matched the l_count of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec),
+					    le16_to_cpu(new_el->l_count));
+				status = -EINVAL;
 				goto out;
 			}
 			rec = &new_el->l_recs[
@@ -4139,8 +4189,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
 				bh = path_leaf_bh(right_path);
 				eb = (struct ocfs2_extent_block *)bh->b_data;
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
+				ocfs2_error(inode->i_sb,
+					    "Extent block #%llu has an "
+					    "invalid l_next_free_rec of %d",
+					    (unsigned long long)le64_to_cpu(eb->h_blkno),
+					    le16_to_cpu(new_el->l_next_free_rec));
+				status = -EINVAL;
 				goto out;
 			}
 			rec = &new_el->l_recs[1];
@@ -4286,7 +4340,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
 		 * may want it later.
 		 */
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
@@ -4752,20 +4808,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 	if (path->p_tree_depth) {
 		struct ocfs2_extent_block *eb;
 
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
 		if (ret) {
 			mlog_exit(ret);
 			goto out;
 		}
 
 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EROFS;
-			goto out;
-		}
-
 		rightmost_el = &eb->h_list;
 	} else
 		rightmost_el = path_root_el(path);
@@ -4910,8 +4961,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
 
 	depth = path->p_tree_depth;
 	if (depth > 0) {
-		ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
-				       &last_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      ocfs2_et_get_last_eb_blk(et),
+					      &last_eb_bh);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -6231,11 +6283,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
 
 	eb = (struct ocfs2_extent_block *) bh->b_data;
 	el = &eb->h_list;
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		ret = -EROFS;
-		goto out;
-	}
+
+	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
+	 * Any corruption is a code bug. */
+	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 
 	*new_last_eb = bh;
 	get_bh(*new_last_eb);
@@ -7140,20 +7191,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
 	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh);
+		status = ocfs2_read_extent_block(inode,
+						 le64_to_cpu(fe->i_last_eb_blk),
+						 &last_eb_bh);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-
-			brelse(last_eb_bh);
-			status = -EIO;
-			goto bail;
-		}
 	}
 
 	(*tc)->tc_last_eb_bh = last_eb_bh;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0fbf8fc55a4..59d37d1b7d4 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -73,6 +73,14 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct buffer_head *bh,
 					struct ocfs2_xattr_value_root *xv);
 
+/*
+ * Read an extent block into *bh.  If *bh is NULL, a bh will be
+ * allocated.  This is a cached read.  The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh);
+
 struct ocfs2_alloc_context;
 int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index b686b31cf49..0bd9d9698a2 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 
-	ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+	ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
 	eb = (struct ocfs2_extent_block *) eb_bh->b_data;
 	el = &eb->h_list;
 
-	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-		ret = -EROFS;
-		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-		goto out;
-	}
-
 	if (el->l_tree_depth) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
 		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
 			goto no_more_extents;
 
-		ret = ocfs2_read_block(inode,
-				       le64_to_cpu(eb->h_next_leaf_blk),
-				       &next_eb_bh);
+		ret = ocfs2_read_extent_block(inode,
+					      le64_to_cpu(eb->h_next_leaf_blk),
+					      &next_eb_bh);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
-			ret = -EROFS;
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
-			goto out;
-		}
 
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
 		el = &next_eb->h_list;
-
 		i = ocfs2_search_for_hole_index(el, v_cluster);
 	}
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 82ba887afa0..f04b229fc75 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -447,14 +447,6 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr)				\
 	(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
 
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb)	do {		\
-	typeof(__eb) ____eb = (__eb);					\
-	ocfs2_error((__sb), 						\
-		"Extent Block # %llu has bad signature %.*s",		\
-		(unsigned long long)le64_to_cpu((____eb)->h_blkno), 7,	\
-		(____eb)->h_signature);					\
-} while (0)
-
 #define OCFS2_IS_VALID_GROUP_DESC(ptr)					\
 	(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
 
-- 
cgit v1.2.3


From a22305cc693254a2aa651e797875669112ef8635 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:17 -0800
Subject: ocfs2: Wrap dirblock reads in a dedicated function.

We have ocfs2_bread() as a vestige of the original ext-based dir code.
It's only used by directories, though.  Turn it into
ocfs2_read_dir_block(), with a prototype matching the other metadata
read functions.  It's set up to validate dirblocks when the time comes.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c | 150 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 88 insertions(+), 62 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 5777045f1a6..c2f3fd93be5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,49 +82,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
-static struct buffer_head *ocfs2_bread(struct inode *inode,
-				       int block, int *err, int reada)
-{
-	struct buffer_head *bh = NULL;
-	int tmperr;
-	u64 p_blkno;
-	int readflags = 0;
-
-	if (reada)
-		readflags |= OCFS2_BH_READAHEAD;
-
-	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!reada);
-		return NULL;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
-					     NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (tmperr < 0) {
-		mlog_errno(tmperr);
-		goto fail;
-	}
-
-	tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
-	if (tmperr < 0)
-		goto fail;
-
-	tmperr = 0;
-
-	*err = 0;
-	return bh;
-
-fail:
-	brelse(bh);
-	bh = NULL;
-
-	*err = -EIO;
-	return NULL;
-}
-
 /*
  * bh passed here can be an inode block or a dir data block, depending
  * on the inode inline data flag.
@@ -250,6 +207,76 @@ out:
 	return NULL;
 }
 
+static int ocfs2_validate_dir_block(struct super_block *sb,
+				    struct buffer_head *bh)
+{
+	/*
+	 * Nothing yet.  We don't validate dirents here, that's handled
+	 * in-place when the code walks them.
+	 */
+
+	return 0;
+}
+
+/*
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+				struct buffer_head **bh, int flags)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+	u64 p_blkno;
+
+	if (((u64)v_block << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+		goto out;
+	}
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	rc = ocfs2_extent_map_get_blocks(inode, v_block, &p_blkno, NULL,
+					 NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	if (!p_blkno) {
+		rc = -EIO;
+		mlog(ML_ERROR,
+		     "Directory #%llu contains a hole at offset %llu\n",
+		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+		     (unsigned long long)v_block << inode->i_sb->s_blocksize_bits);
+		goto out;
+	}
+
+	rc = ocfs2_read_blocks(inode, p_blkno, 1, &tmp, flags);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	if (!(flags & OCFS2_BH_READAHEAD)) {
+		rc = ocfs2_validate_dir_block(inode->i_sb, tmp);
+		if (rc) {
+			brelse(tmp);
+			goto out;
+		}
+	}
+
+	/* If ocfs2_read_blocks() got us a new bh, pass it up.  */
+	if (!*bh)
+		*bh = tmp;
+
+out:
+	return rc ? -EIO : 0;
+}
+
 static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 					       struct inode *dir,
 					       struct ocfs2_dir_entry **res_dir)
@@ -296,15 +323,17 @@ restart:
 				}
 				num++;
 
-				bh = ocfs2_bread(dir, b++, &err, 1);
+				bh = NULL;
+				err = ocfs2_read_dir_block(dir, b++, &bh,
+							   OCFS2_BH_READAHEAD);
 				bh_use[ra_max] = bh;
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
 			goto next;
-		if (ocfs2_read_block(dir, block, &bh)) {
+		if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
 			/* read error, skip block & hope for the best.
-			 * ocfs2_read_block() has released the bh. */
+			 * ocfs2_read_dir_block() has released the bh. */
 			ocfs2_error(dir->i_sb, "reading directory %llu, "
 				    "offset %lu\n",
 				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -724,7 +753,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 	int i, stored;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
-	int err;
 	struct super_block * sb = inode->i_sb;
 	unsigned int ra_sectors = 16;
 
@@ -735,12 +763,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 
 	while (!error && !stored && *f_pos < i_size_read(inode)) {
 		blk = (*f_pos) >> sb->s_blocksize_bits;
-		bh = ocfs2_bread(inode, blk, &err, 0);
-		if (!bh) {
-			mlog(ML_ERROR,
-			     "directory #%llu contains a hole at offset %lld\n",
-			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-			     *f_pos);
+		if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+			/* Skip the corrupt dirblock and keep trying */
 			*f_pos += sb->s_blocksize - offset;
 			continue;
 		}
@@ -754,8 +778,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
 		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
 			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
-				tmp = ocfs2_bread(inode, ++blk, &err, 1);
-				brelse(tmp);
+				tmp = NULL;
+				if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+							  OCFS2_BH_READAHEAD))
+					brelse(tmp);
 			}
 			last_ra_blk = blk;
 			ra_sectors = 8;
@@ -828,6 +854,7 @@ revalidate:
 		}
 		offset = 0;
 		brelse(bh);
+		bh = NULL;
 	}
 
 	stored = 0;
@@ -1680,8 +1707,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	struct super_block *sb = dir->i_sb;
 	int status;
 
-	bh = ocfs2_bread(dir, 0, &status, 0);
-	if (!bh) {
+	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
@@ -1702,11 +1729,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 				status = -ENOSPC;
 				goto bail;
 			}
-			bh = ocfs2_bread(dir,
-					 offset >> sb->s_blocksize_bits,
-					 &status,
-					 0);
-			if (!bh) {
+			status = ocfs2_read_dir_block(dir,
+					     offset >> sb->s_blocksize_bits,
+					     &bh, 0);
+			if (status) {
 				mlog_errno(status);
 				goto bail;
 			}
-- 
cgit v1.2.3


From 4ae1d69bedc8d174cb8a558694607e013157cde1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:18 -0800
Subject: ocfs2: Wrap xattr block reads in a dedicated function

We weren't consistently checking xattr blocks after we read them.
Most places checked the signature, but none checked xb_blkno or
xb_fs_generation.  Create a toplevel ocfs2_read_xattr_block() that does
the read and the validation.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 94 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3cc8385f973..ef4aa5482d0 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -314,6 +314,65 @@ static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
 	}
 }
 
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct ocfs2_xattr_block *xb =
+		(struct ocfs2_xattr_block *)bh->b_data;
+
+	mlog(0, "Validating xattr block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has bad "
+			    "signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    xb->xb_signature);
+		return -EINVAL;
+	}
+
+	if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an "
+			    "invalid xb_blkno of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(xb->xb_blkno));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extended attribute block #%llu has an invalid "
+			    "xb_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(xb->xb_fs_generation));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+				  struct buffer_head **bh)
+{
+	int rc;
+	struct buffer_head *tmp = *bh;
+
+	rc = ocfs2_read_block(inode, xb_blkno, &tmp);
+	if (!rc) {
+		rc = ocfs2_validate_xattr_block(inode->i_sb, tmp);
+		if (rc)
+			brelse(tmp);
+	}
+
+	/* If ocfs2_read_block() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
+
+	return rc;
+}
+
 static inline const char *ocfs2_xattr_prefix(int name_index)
 {
 	struct xattr_handler *handler = NULL;
@@ -739,18 +798,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto cleanup;
-	}
-
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
 		ret = ocfs2_xattr_list_entries(inode, header,
@@ -760,7 +815,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
 		ret = ocfs2_xattr_tree_list_index_block(inode, xt,
 						   buffer, buffer_size);
 	}
-cleanup:
+
 	brelse(blk_bh);
 
 	return ret;
@@ -1693,24 +1748,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 	u64 blk, bg_blkno;
 	u16 bit;
 
-	ret = ocfs2_read_block(inode, block, &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto out;
-	}
-
 	ret = ocfs2_xattr_block_remove(inode, blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	blk = le64_to_cpu(xb->xb_blkno);
 	bit = le16_to_cpu(xb->xb_suballoc_bit);
 	bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1950,19 +2000,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
 	if (!di->i_xattr_loc)
 		return ret;
 
-	ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+	ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+				     &blk_bh);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
 
-	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
-	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
-		ret = -EIO;
-		goto cleanup;
-	}
-
 	xs->xattr_bh = blk_bh;
+	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		xs->header = &xb->xb_attrs.xb_header;
@@ -2259,9 +2305,9 @@ meta_guess:
 	/* calculate metadata allocation. */
 	if (di->i_xattr_loc) {
 		if (!xbs->xattr_bh) {
-			ret = ocfs2_read_block(inode,
-					       le64_to_cpu(di->i_xattr_loc),
-					       &bh);
+			ret = ocfs2_read_xattr_block(inode,
+						     le64_to_cpu(di->i_xattr_loc),
+						     &bh);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
-- 
cgit v1.2.3


From 970e4936d7d15f35d00fd15a14f5343ba78b2fc8 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:19 -0800
Subject: ocfs2: Validate metadata only when it's read from disk.

Add an optional validation hook to ocfs2_read_blocks().  Now the
validation function is only called when a block was actually read off of
disk.  It is not called when the buffer was in cache.

We add a buffer state bit BH_NeedsValidate to flag these buffers.  It
must always be one higher than the last JBD2 buffer state bit.

The dinode, dirblock, extent_block, and xattr_block validators are
lifted to this scheme directly.  The group_descriptor validator needs to
be split into two pieces.  The first part only needs the gd buffer and
is passed to ocfs2_read_block().  The second part requires the dinode as
well, and is called every time.  It's only 3 compares, so it's tiny.
This also allows us to clean up the non-fatal gd check used by resize.c.
It now has no magic argument.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c          | 17 ++++-----
 fs/ocfs2/buffer_head_io.c | 33 ++++++++++++++++-
 fs/ocfs2/buffer_head_io.h | 27 ++++++++------
 fs/ocfs2/dir.c            | 13 +++----
 fs/ocfs2/inode.c          | 18 +++-------
 fs/ocfs2/resize.c         |  2 +-
 fs/ocfs2/slot_map.c       |  4 +--
 fs/ocfs2/suballoc.c       | 91 +++++++++++++++++++++++++++++++++--------------
 fs/ocfs2/suballoc.h       | 15 ++++----
 fs/ocfs2/xattr.c          | 26 +++++++-------
 10 files changed, 149 insertions(+), 97 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f430cc6e0f3..e823a27ba34 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -684,6 +684,9 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
 	struct ocfs2_extent_block *eb =
 		(struct ocfs2_extent_block *)bh->b_data;
 
+	mlog(0, "Validating extent block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 		ocfs2_error(sb,
 			    "Extent block #%llu has bad signature %.*s",
@@ -719,21 +722,13 @@ int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
 	int rc;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_block(inode, eb_blkno, &tmp);
-	if (rc)
-		goto out;
-
-	rc = ocfs2_validate_extent_block(inode->i_sb, tmp);
-	if (rc) {
-		brelse(tmp);
-		goto out;
-	}
+	rc = ocfs2_read_block(inode, eb_blkno, &tmp,
+			      ocfs2_validate_extent_block);
 
 	/* If ocfs2_read_block() got us a new bh, pass it up. */
-	if (!*bh)
+	if (!rc && !*bh)
 		*bh = tmp;
 
-out:
 	return rc;
 }
 
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7..0e9eed0c223 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,19 @@
 
 #include "buffer_head_io.h"
 
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits.  Currently BH_Unshadow is the last
+ * JBD2 bit.
+ */
+enum ocfs2_state_bits {
+	BH_NeedsValidate = BH_Unshadow + 1,
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
 int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		      struct inode *inode)
 {
@@ -166,7 +179,9 @@ bail:
 }
 
 int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
-		      struct buffer_head *bhs[], int flags)
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh))
 {
 	int status = 0;
 	int i, ignore_cache = 0;
@@ -298,6 +313,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 
 			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
+			if (validate)
+				set_buffer_needs_validate(bh);
 			bh->b_end_io = end_buffer_read_sync;
 			submit_bh(READ, bh);
 			continue;
@@ -328,6 +345,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
 				bhs[i] = NULL;
 				continue;
 			}
+
+			if (buffer_needs_validate(bh)) {
+				/* We never set NeedsValidate if the
+				 * buffer was held by the journal, so
+				 * that better not have changed */
+				BUG_ON(buffer_jbd(bh));
+				clear_buffer_needs_validate(bh);
+				status = validate(inode->i_sb, bh);
+				if (status) {
+					put_bh(bh);
+					bhs[i] = NULL;
+					continue;
+				}
+			}
 		}
 
 		/* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade..c75d682dadd 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
 void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
 			     int uptodate);
 
-static inline int ocfs2_read_block(struct inode	       *inode,
-				   u64                  off,
-				   struct buffer_head **bh);
-
 int ocfs2_write_block(struct ocfs2_super          *osb,
 		      struct buffer_head  *bh,
 		      struct inode        *inode);
-int ocfs2_read_blocks(struct inode	  *inode,
-		      u64                  block,
-		      int                  nr,
-		      struct buffer_head  *bhs[],
-		      int                  flags);
 int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 			   unsigned int nr, struct buffer_head *bhs[]);
 
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk.  It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+		      struct buffer_head *bhs[], int flags,
+		      int (*validate)(struct super_block *sb,
+				      struct buffer_head *bh));
+
 int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 				struct buffer_head *bh);
 
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 #define OCFS2_BH_READAHEAD         8
 
 static inline int ocfs2_read_block(struct inode *inode, u64 off,
-				   struct buffer_head **bh)
+				   struct buffer_head **bh,
+				   int (*validate)(struct super_block *sb,
+						   struct buffer_head *bh))
 {
 	int status = 0;
 
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
 		goto bail;
 	}
 
-	status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+	status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
 
 bail:
 	return status;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c2f3fd93be5..7e863d40380 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -214,6 +214,8 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 	 * Nothing yet.  We don't validate dirents here, that's handled
 	 * in-place when the code walks them.
 	 */
+	mlog(0, "Validating dirblock %llu\n",
+	     (unsigned long long)bh->b_blocknr);
 
 	return 0;
 }
@@ -255,20 +257,13 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 		goto out;
 	}
 
-	rc = ocfs2_read_blocks(inode, p_blkno, 1, &tmp, flags);
+	rc = ocfs2_read_blocks(inode, p_blkno, 1, &tmp, flags,
+			       ocfs2_validate_dir_block);
 	if (rc) {
 		mlog_errno(rc);
 		goto out;
 	}
 
-	if (!(flags & OCFS2_BH_READAHEAD)) {
-		rc = ocfs2_validate_dir_block(inode->i_sb, tmp);
-		if (rc) {
-			brelse(tmp);
-			goto out;
-		}
-	}
-
 	/* If ocfs2_read_blocks() got us a new bh, pass it up.  */
 	if (!*bh)
 		*bh = tmp;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 9eb701b8646..ec3497bafda 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1255,6 +1255,9 @@ int ocfs2_validate_inode_block(struct super_block *sb,
 	int rc = -EINVAL;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
+	mlog(0, "Validating dinode %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
 	BUG_ON(!buffer_uptodate(bh));
 
 	if (!OCFS2_IS_VALID_DINODE(di)) {
@@ -1300,23 +1303,12 @@ int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
 	struct buffer_head *tmp = *bh;
 
 	rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
-			       flags);
-	if (rc)
-		goto out;
-
-	if (!(flags & OCFS2_BH_READAHEAD)) {
-		rc = ocfs2_validate_inode_block(inode->i_sb, tmp);
-		if (rc) {
-			brelse(tmp);
-			goto out;
-		}
-	}
+			       flags, ocfs2_validate_inode_block);
 
 	/* If ocfs2_read_blocks() got us a new bh, pass it up. */
-	if (!*bh)
+	if (!rc && !*bh)
 		*bh = tmp;
 
-out:
 	return rc;
 }
 
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 252baff5eb8..867de3ebfca 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -394,7 +394,7 @@ static int ocfs2_check_new_group(struct inode *inode,
 		(struct ocfs2_group_desc *)group_bh->b_data;
 	u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
 
-	ret = ocfs2_validate_group_descriptor(inode->i_sb, di, group_bh, 1);
+	ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
 	if (ret)
 		goto out;
 
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f850..40661e7824e 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	 * this is not true, the read of -1 (UINT64_MAX) will fail.
 	 */
 	ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
-				OCFS2_BH_IGNORE_CACHE);
+				OCFS2_BH_IGNORE_CACHE, NULL);
 	if (ret == 0) {
 		spin_lock(&osb->osb_lock);
 		ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 
 		bh = NULL;  /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
-					   OCFS2_BH_IGNORE_CACHE);
+					   OCFS2_BH_IGNORE_CACHE, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 766a00b2644..226fe21f260 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -145,14 +145,6 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
 }
 
-int ocfs2_validate_group_descriptor(struct super_block *sb,
-				    struct ocfs2_dinode *di,
-				    struct buffer_head *bh,
-				    int clean_error)
-{
-	unsigned int max_bits;
-	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
-
 #define do_error(fmt, ...)						\
 	do{								\
 		if (clean_error)					\
@@ -161,6 +153,12 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
 	} while (0)
 
+static int ocfs2_validate_gd_self(struct super_block *sb,
+				  struct buffer_head *bh,
+				  int clean_error)
+{
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
 		do_error("Group descriptor #%llu has bad signature %.*s",
 			 (unsigned long long)bh->b_blocknr, 7,
@@ -184,6 +182,35 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 		return -EINVAL;
 	}
 
+	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "claims that %u are free",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 le16_to_cpu(gd->bg_free_bits_count));
+		return -EINVAL;
+	}
+
+	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+		do_error("Group descriptor #%llu has bit count %u but "
+			 "max bitmap bits of %u",
+			 (unsigned long long)bh->b_blocknr,
+			 le16_to_cpu(gd->bg_bits),
+			 8 * le16_to_cpu(gd->bg_size));
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+				    struct ocfs2_dinode *di,
+				    struct buffer_head *bh,
+				    int clean_error)
+{
+	unsigned int max_bits;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	if (di->i_blkno != gd->bg_parent_dinode) {
 		do_error("Group descriptor #%llu has bad parent "
 			 "pointer (%llu, expected %llu)",
@@ -209,26 +236,35 @@ int ocfs2_validate_group_descriptor(struct super_block *sb,
 		return -EINVAL;
 	}
 
-	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
-		do_error("Group descriptor #%llu has bit count %u but "
-			 "claims that %u are free",
-			 (unsigned long long)bh->b_blocknr,
-			 le16_to_cpu(gd->bg_bits),
-			 le16_to_cpu(gd->bg_free_bits_count));
-		return -EINVAL;
-	}
+	return 0;
+}
 
-	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
-		do_error("Group descriptor #%llu has bit count %u but "
-			 "max bitmap bits of %u",
-			 (unsigned long long)bh->b_blocknr,
-			 le16_to_cpu(gd->bg_bits),
-			 8 * le16_to_cpu(gd->bg_size));
-		return -EINVAL;
-	}
 #undef do_error
 
-	return 0;
+/*
+ * This version only prints errors.  It does not fail the filesystem, and
+ * exists only for resize.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	int rc;
+
+	rc = ocfs2_validate_gd_self(sb, bh, 1);
+	if (!rc)
+		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
+
+	return rc;
+}
+
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	return ocfs2_validate_gd_self(sb, bh, 0);
 }
 
 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
@@ -237,11 +273,12 @@ int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
 	int rc;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_block(inode, gd_blkno, &tmp);
+	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+			      ocfs2_validate_group_descriptor);
 	if (rc)
 		goto out;
 
-	rc = ocfs2_validate_group_descriptor(inode->i_sb, di, tmp, 0);
+	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
 	if (rc) {
 		brelse(tmp);
 		goto out;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 43de4fd826d..e3c13c77f9e 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -165,16 +165,15 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
 
 /*
- * By default, ocfs2_validate_group_descriptor() calls ocfs2_error() when it
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
  * finds a problem.  A caller that wants to check a group descriptor
- * without going readonly passes a nonzero clean_error.  This is only
- * resize, really.  Everyone else should be using
- * ocfs2_read_group_descriptor().
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function.  This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
  */
-int ocfs2_validate_group_descriptor(struct super_block *sb,
-				    struct ocfs2_dinode *di,
-				    struct buffer_head *bh,
-				    int clean_error);
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh);
 /*
  * Read a group descriptor block into *bh.  If *bh is NULL, a bh will be
  * allocated.  This is a cached read.  The descriptor will be validated with
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ef4aa5482d0..8af29b3bd6d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -266,7 +266,8 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 	int rc;
 
 	rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
-			       bucket->bu_blocks, bucket->bu_bhs, 0);
+			       bucket->bu_blocks, bucket->bu_bhs, 0,
+			       NULL);
 	if (rc)
 		ocfs2_xattr_bucket_relse(bucket);
 	return rc;
@@ -359,12 +360,8 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
 	int rc;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_block(inode, xb_blkno, &tmp);
-	if (!rc) {
-		rc = ocfs2_validate_xattr_block(inode->i_sb, tmp);
-		if (rc)
-			brelse(tmp);
-	}
+	rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+			      ocfs2_validate_xattr_block);
 
 	/* If ocfs2_read_block() got us a new bh, pass it up. */
 	if (!rc && !*bh)
@@ -925,7 +922,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 		/* Copy ocfs2_xattr_value */
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno, &bh);
+			ret = ocfs2_read_block(inode, blkno, &bh, NULL);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -1174,7 +1171,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 		blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
 
 		for (i = 0; i < num_clusters * bpc; i++, blkno++) {
-			ret = ocfs2_read_block(inode, blkno, &bh);
+			ret = ocfs2_read_block(inode, blkno, &bh, NULL);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -2206,7 +2203,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		base = xis->base;
 		credits += OCFS2_INODE_UPDATE_CREDITS;
 	} else {
-		int i, block_off;
+		int i, block_off = 0;
 		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
 		xe = xbs->here;
 		name_offset = le16_to_cpu(xe->xe_name_offset);
@@ -2840,6 +2837,7 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
 			break;
 		}
 
+
 		xe_name = bucket_block(bucket, block_off) + new_offset;
 		if (!memcmp(name, xe_name, name_len)) {
 			*xe_index = i;
@@ -3598,7 +3596,7 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 			goto out;
 		}
 
-		ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+		ret = ocfs2_read_block(inode, prev_blkno, &old_bh, NULL);
 		if (ret < 0) {
 			mlog_errno(ret);
 			brelse(new_bh);
@@ -3990,7 +3988,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	ocfs2_journal_dirty(handle, first_bh);
 
 	/* update the new bucket header. */
-	ret = ocfs2_read_block(inode, to_blk_start, &bh);
+	ret = ocfs2_read_block(inode, to_blk_start, &bh, NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -4337,7 +4335,7 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+	ret = ocfs2_read_block(inode, p_blkno, &first_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4635,7 +4633,7 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
 	value_blk += header_bh->b_blocknr;
 
-	ret = ocfs2_read_block(inode, value_blk, &value_bh);
+	ret = ocfs2_read_block(inode, value_blk, &value_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.3


From a8549fb5abb2b372e46d5de0d23ff8b24f4a61af Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:20 -0800
Subject: ocfs2: Wrap virtual block reads in ocfs2_read_virt_blocks()

The ocfs2_read_dir_block() function really maps an inode's virtual
blocks to physical ones before calling ocfs2_read_blocks().  Let's
extract that to common code, because other places might want to do that.

Other than the block number being virtual, ocfs2_read_virt_blocks()
takes the same arguments as ocfs2_read_blocks().  It converts those
virtual block numbers to physical before calling ocfs2_read_blocks()
directly.  If the blocks asked for are discontiguous, this can mean
multiple calls to ocfs2_read_blocks(), but this is mostly hidden from
the caller.

Like ocfs2_read_blocks(), the caller can pass in an existing
buffer_head.  This is usually done to pick up some readahead I/O.
ocfs2_read_virt_blocks() checks the buffer_head's block number
against the extent map - it must match.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/extent_map.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/extent_map.h | 24 +++++++++++++++++
 2 files changed, 95 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 0bd9d9698a2..f2bb1a04d25 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -806,3 +806,74 @@ out:
 
 	return ret;
 }
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh))
+{
+	int rc = 0;
+	u64 p_block, p_count;
+	int i, count, done = 0;
+
+	mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+		   "flags = %x, validate = %p)\n",
+		   inode, (unsigned long long)v_block, nr, bhs, flags,
+		   validate);
+
+	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+	    i_size_read(inode)) {
+		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
+		goto out;
+	}
+
+	while (done < nr) {
+		down_read(&OCFS2_I(inode)->ip_alloc_sem);
+		rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+						 &p_block, &p_count, NULL);
+		up_read(&OCFS2_I(inode)->ip_alloc_sem);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+
+		if (!p_block) {
+			rc = -EIO;
+			mlog(ML_ERROR,
+			     "Inode #%llu contains a hole at offset %llu\n",
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			     (unsigned long long)(v_block + done) <<
+			     inode->i_sb->s_blocksize_bits);
+			break;
+		}
+
+		count = nr - done;
+		if (p_count < count)
+			count = p_count;
+
+		/*
+		 * If the caller passed us bhs, they should have come
+		 * from a previous readahead call to this function.  Thus,
+		 * they should have the right b_blocknr.
+		 */
+		for (i = 0; i < count; i++) {
+			if (!bhs[done + i])
+				continue;
+			BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+		}
+
+		rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+				       flags, validate);
+		if (rc) {
+			mlog_errno(rc);
+			break;
+		}
+		done += count;
+	}
+
+out:
+	mlog_exit(rc);
+	return rc;
+}
+
+
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f3..b7dd9731b46 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
 			     u32 *p_cluster, u32 *num_clusters,
 			     struct ocfs2_extent_list *el);
 
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+			   struct buffer_head *bhs[], int flags,
+			   int (*validate)(struct super_block *sb,
+					   struct buffer_head *bh));
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+					struct buffer_head **bh,
+					int (*validate)(struct super_block *sb,
+							struct buffer_head *bh))
+{
+	int status = 0;
+
+	if (bh == NULL) {
+		printk("ocfs2: bh == NULL\n");
+		status = -EINVAL;
+		goto bail;
+	}
+
+	status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate);
+
+bail:
+	return status;
+}
+
+
 #endif  /* _EXTENT_MAP_H */
-- 
cgit v1.2.3


From 511308d90b53479b194cd067715f44dc99d39b08 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 13 Nov 2008 14:49:21 -0800
Subject: ocfs2: Convert ocfs2_read_dir_block() to ocfs2_read_virt_blocks()

Now that we've centralized the ocfs2_read_virt_blocks() code, let's use
it in ocfs2_read_dir_block().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c | 38 +++++---------------------------------
 1 file changed, 5 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 7e863d40380..d83cff95759 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -231,44 +231,16 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
-	u64 p_blkno;
 
-	if (((u64)v_block << inode->i_sb->s_blocksize_bits) >=
-	    i_size_read(inode)) {
-		BUG_ON(!(flags & OCFS2_BH_READAHEAD));
-		goto out;
-	}
-
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	rc = ocfs2_extent_map_get_blocks(inode, v_block, &p_blkno, NULL,
-					 NULL);
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (rc) {
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+				    ocfs2_validate_dir_block);
+	if (rc)
 		mlog_errno(rc);
-		goto out;
-	}
 
-	if (!p_blkno) {
-		rc = -EIO;
-		mlog(ML_ERROR,
-		     "Directory #%llu contains a hole at offset %llu\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)v_block << inode->i_sb->s_blocksize_bits);
-		goto out;
-	}
-
-	rc = ocfs2_read_blocks(inode, p_blkno, 1, &tmp, flags,
-			       ocfs2_validate_dir_block);
-	if (rc) {
-		mlog_errno(rc);
-		goto out;
-	}
-
-	/* If ocfs2_read_blocks() got us a new bh, pass it up.  */
-	if (!*bh)
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
 		*bh = tmp;
 
-out:
 	return rc ? -EIO : 0;
 }
 
-- 
cgit v1.2.3


From 53ef99cad9878f02f27bb30bc304fc42af8bdd6e Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 18 Nov 2008 16:53:43 -0800
Subject: ocfs2: Remove JBD compatibility layer

JBD2 is fully backwards compatible with JBD and it's been tested enough with
Ocfs2 that we can clean this code up now.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/Kconfig                  | 10 ------
 fs/ocfs2/alloc.c            |  5 ---
 fs/ocfs2/aops.c             | 24 ++-----------
 fs/ocfs2/journal.c          | 14 --------
 fs/ocfs2/journal.h          | 11 +-----
 fs/ocfs2/ocfs2_jbd_compat.h | 82 ---------------------------------------------
 6 files changed, 3 insertions(+), 143 deletions(-)
 delete mode 100644 fs/ocfs2/ocfs2_jbd_compat.h

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e8a47f74a83..b93425ad15d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -258,16 +258,6 @@ config OCFS2_DEBUG_FS
 	  this option for debugging only as it is likely to decrease
 	  performance of the filesystem.
 
-config OCFS2_COMPAT_JBD
-	bool "Use JBD for compatibility"
-	depends on OCFS2_FS
-	default n
-	select JBD
-	help
-	  The ocfs2 filesystem now uses JBD2 for its journalling.  JBD2
-	  is backwards compatible with JBD.  It is safe to say N here.
-	  However, if you really want to use the original JBD, say Y here.
-
 config OCFS2_FS_POSIX_ACL
 	bool "OCFS2 POSIX Access Control Lists"
 	depends on OCFS2_FS
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e823a27ba34..69d67ab069b 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6638,11 +6638,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
 		mlog_errno(ret);
 	else if (ocfs2_should_order_data(inode)) {
 		ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-		ret = walk_page_buffers(handle, page_buffers(page),
-					from, to, &partial,
-					ocfs2_journal_dirty_data);
-#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e219f8b546a..6af79adb2ec 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -474,12 +474,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 
 	if (ocfs2_should_order_data(inode)) {
 		ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, NULL,
-					ocfs2_journal_dirty_data);
-#endif
 		if (ret < 0)
 			mlog_errno(ret);
 	}
@@ -1065,15 +1059,8 @@ static void ocfs2_write_failure(struct inode *inode,
 		tmppage = wc->w_pages[i];
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 
 			block_commit_write(tmppage, from, to);
 		}
@@ -1912,15 +1899,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 		}
 
 		if (page_has_buffers(tmppage)) {
-			if (ocfs2_should_order_data(inode)) {
+			if (ocfs2_should_order_data(inode))
 				ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-				walk_page_buffers(wc->w_handle,
-						  page_buffers(tmppage),
-						  from, to, NULL,
-						  ocfs2_journal_dirty_data);
-#endif
-			}
 			block_commit_write(tmppage, from, to);
 		}
 	}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9223bfcca3b..12b62a3cbf6 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -434,20 +434,6 @@ int ocfs2_journal_dirty(handle_t *handle,
 	return status;
 }
 
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
-			     struct buffer_head *bh)
-{
-	int err = journal_dirty_data(handle, bh);
-	if (err)
-		mlog_errno(err);
-	/* TODO: When we can handle it, abort the handle and go RO on
-	 * error here. */
-
-	return err;
-}
-#endif
-
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
 
 void ocfs2_set_journal_params(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3ce..8203980fefe 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
 #define OCFS2_JOURNAL_H
 
 #include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
 
 enum ocfs2_journal_state {
 	OCFS2_JOURNAL_FREE = 0,
@@ -273,10 +268,6 @@ int                  ocfs2_journal_access(handle_t *handle,
  */
 int                  ocfs2_journal_dirty(handle_t *handle,
 					 struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int                  ocfs2_journal_dirty_data(handle_t *handle,
-					      struct buffer_head *bh);
-#endif
 
 /*
  *  Credit Macros:
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f55..00000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- */
-
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-
-struct jbd2_inode {
-	unsigned int dummy;
-};
-
-#define JBD2_BARRIER			JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE	JBD_DEFAULT_MAX_COMMIT_AGE
-
-#define jbd2_journal_ack_err			journal_ack_err
-#define jbd2_journal_clear_err			journal_clear_err
-#define jbd2_journal_destroy			journal_destroy
-#define jbd2_journal_dirty_metadata		journal_dirty_metadata
-#define jbd2_journal_errno			journal_errno
-#define jbd2_journal_extend			journal_extend
-#define jbd2_journal_flush			journal_flush
-#define jbd2_journal_force_commit		journal_force_commit
-#define jbd2_journal_get_write_access		journal_get_write_access
-#define jbd2_journal_get_undo_access		journal_get_undo_access
-#define jbd2_journal_init_inode			journal_init_inode
-#define jbd2_journal_invalidatepage		journal_invalidatepage
-#define jbd2_journal_load			journal_load
-#define jbd2_journal_lock_updates		journal_lock_updates
-#define jbd2_journal_restart			journal_restart
-#define jbd2_journal_start			journal_start
-#define jbd2_journal_start_commit		journal_start_commit
-#define jbd2_journal_stop			journal_stop
-#define jbd2_journal_try_to_free_buffers	journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates		journal_unlock_updates
-#define jbd2_journal_wipe			journal_wipe
-#define jbd2_log_wait_commit			log_wait_commit
-
-static inline int jbd2_journal_file_inode(handle_t *handle,
-					  struct jbd2_inode *inode)
-{
-	return 0;
-}
-
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-						      loff_t new_size)
-{
-	return 0;
-}
-
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
-					       struct inode *inode)
-{
-	return;
-}
-
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
-						  struct jbd2_inode *jinode)
-{
-	return;
-}
-
-
-#endif  /* OCFS2_JBD_COMPAT_H */
-- 
cgit v1.2.3


From 97aff52ae13d3c11a074bbbfc80ad0b59cb8cdeb Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 19 Nov 2008 16:48:41 +0800
Subject: ocfs2/xattr: Fix a bug in xattr allocation estimation

When we extend one xattr's value to a large size, the old value size might
be smaller than the size of a value root. In those cases, we still need to
guess the metadata allocation.

Reported-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 8af29b3bd6d..d0b94edb966 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2270,6 +2270,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 								 value_size);
 			xv = (struct ocfs2_xattr_value_root *)
 			     (base + name_offset + name_len);
+			value_size = OCFS2_XATTR_ROOT_SIZE;
 		} else
 			xv = &def_xv.xv;
 
@@ -2283,7 +2284,8 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 							     &xv->xr_list,
 							     new_clusters -
 							     old_clusters);
-			goto out;
+			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+				goto out;
 		}
 	} else {
 		/*
-- 
cgit v1.2.3


From 9f868f16e40e9ad8e39aebff94a4be0d96520734 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 19 Nov 2008 16:48:42 +0800
Subject: ocfs2/xattr: Restore not_found in xis

During an xattr set, when we move a xattr which was stored in inode to the
outside bucket, we have to delete it, and that deletion uses the old value
of xis->not_found. xis->not_found is overwritten before we call
ocfs2_calc_xattr_set_need though, so we must restore it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d0b94edb966..9cb71e1c7c6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2414,7 +2414,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				    struct ocfs2_xattr_search *xbs,
 				    struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int ret = 0, credits;
+	int ret = 0, credits, old_found;
 
 	if (!xi->value) {
 		/* Remove existing extended attribute */
@@ -2433,6 +2433,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 			xi->value = NULL;
 			xi->value_len = 0;
 
+			old_found = xis->not_found;
 			xis->not_found = -ENODATA;
 			ret = ocfs2_calc_xattr_set_need(inode,
 							di,
@@ -2442,6 +2443,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 							NULL,
 							NULL,
 							&credits);
+			xis->not_found = old_found;
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -2462,6 +2464,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 				if (ret)
 					goto out;
 
+				old_found = xis->not_found;
 				xis->not_found = -ENODATA;
 				ret = ocfs2_calc_xattr_set_need(inode,
 								di,
@@ -2471,6 +2474,7 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 								NULL,
 								NULL,
 								&credits);
+				xis->not_found = old_found;
 				if (ret) {
 					mlog_errno(ret);
 					goto out;
-- 
cgit v1.2.3


From 74f783af95c982aef6d3a1415275650dcf511666 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 19 Aug 2008 14:51:22 +0200
Subject: quota: Add callbacks for allocating and destroying dquot structures

Some filesystems would like to keep private information together with each
dquot. Add callbacks alloc_dquot and destroy_dquot allowing filesystems to
allocate larger dquots from their private slab, in a similar fashion to how
we currently allocate inodes.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581..1b5fc4b7fbe 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -415,6 +415,16 @@ out_dqlock:
 	return ret;
 }
 
+static void dquot_destroy(struct dquot *dquot)
+{
+	kmem_cache_free(dquot_cachep, dquot);
+}
+
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+	dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
 /* Invalidate all dquots on the list. Note that this function is called after
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
@@ -463,7 +473,7 @@ restart:
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
 	}
 	spin_unlock(&dq_list_lock);
 }
@@ -527,7 +537,7 @@ static void prune_dqcache(int count)
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
-		kmem_cache_free(dquot_cachep, dquot);
+		do_destroy_dquot(dquot);
 		count--;
 		head = free_dquots.prev;
 	}
@@ -625,11 +635,16 @@ we_slept:
 	spin_unlock(&dq_list_lock);
 }
 
+static struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+	return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
 	struct dquot *dquot;
 
-	dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+	dquot = sb->dq_op->alloc_dquot(sb, type);
 	if(!dquot)
 		return NODQUOT;
 
@@ -682,7 +697,7 @@ we_slept:
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
 		if (empty)
-			kmem_cache_free(dquot_cachep, empty);
+			do_destroy_dquot(empty);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	 * finished or it will be canceled due to dq_count > 1 test */
@@ -1533,7 +1548,9 @@ struct dquot_operations dquot_operations = {
 	.acquire_dquot	= dquot_acquire,
 	.release_dquot	= dquot_release,
 	.mark_dirty	= dquot_mark_dquot_dirty,
-	.write_info	= dquot_commit_info
+	.write_info	= dquot_commit_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static inline void set_enable_flags(struct quota_info *dqopt, int type)
-- 
cgit v1.2.3


From 12095460f7f315f8ef67a55b2194195d325d48d7 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 14:45:12 +0200
Subject: quota: Increase size of variables for limits and inode usage

So far quota was fine with quota block limits and inode limits/numbers in
a 32-bit type. Now, with the rapid increase in storage sizes, requests are
coming in to handle quota limits above 4TB / more than 2^32 inodes.
So bump up sizes of types in mem_dqblk structure to 64-bits to be able to
handle this. Also update inode allocation / checking functions to use qsize_t
and make global structure keep quota limits in bytes so that things are
consistent.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c    | 50 +++++++++++++++++++++++++++++---------------------
 fs/quota_v1.c | 25 +++++++++++++++++++------
 fs/quota_v2.c | 21 +++++++++++++++++----
 3 files changed, 65 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 1b5fc4b7fbe..c02223b6aeb 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -835,7 +835,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
 	}
 }
 
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
 {
 	dquot->dq_dqb.dqb_curinodes += number;
 }
@@ -845,7 +845,7 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 	dquot->dq_dqb.dqb_curspace += number;
 }
 
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
 	if (dquot->dq_dqb.dqb_curinodes > number)
 		dquot->dq_dqb.dqb_curinodes -= number;
@@ -862,7 +862,7 @@ static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 		dquot->dq_dqb.dqb_curspace -= number;
 	else
 		dquot->dq_dqb.dqb_curspace = 0;
-	if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		dquot->dq_dqb.dqb_btime = (time_t) 0;
 	clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 }
@@ -1038,7 +1038,7 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 }
 
 /* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
 	if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
@@ -1077,7 +1077,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
@@ -1085,7 +1085,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
@@ -1094,7 +1094,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	   toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime == 0) {
 		if (!prealloc) {
 			*warntype = QUOTA_NL_BSOFTWARN;
@@ -1111,7 +1111,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	return QUOTA_OK;
 }
 
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
 	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1128,15 +1128,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
 static int info_bdq_free(struct dquot *dquot, qsize_t space)
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+	    dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_NOWARN;
 
-	if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
-	    dquot->dq_dqb.dqb_bsoftlimit)
+	if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
 		return QUOTA_NL_BSOFTBELOW;
-	if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
-	    toqb(dquot->dq_dqb.dqb_curspace - space) <
-						dquot->dq_dqb.dqb_bhardlimit)
+	if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+	    dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
 		return QUOTA_NL_BHARDBELOW;
 	return QUOTA_NL_NOWARN;
 }
@@ -1279,7 +1277,7 @@ warn_put_all:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
@@ -1364,7 +1362,7 @@ out_sub:
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
@@ -1883,14 +1881,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
 	return ret;
 }
 
+static inline qsize_t qbtos(qsize_t blocks)
+{
+	return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+	return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
 /* Generic routine for getting common part of quota structure */
 static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 {
 	struct mem_dqblk *dm = &dquot->dq_dqb;
 
 	spin_lock(&dq_data_lock);
-	di->dqb_bhardlimit = dm->dqb_bhardlimit;
-	di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
 	di->dqb_curspace = dm->dqb_curspace;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1937,8 +1945,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 		check_blim = 1;
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
-		dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
-		dm->dqb_bhardlimit = di->dqb_bhardlimit;
+		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
 	}
 	if (di->dqb_valid & QIF_INODES) {
@@ -1956,7 +1964,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 		dm->dqb_itime = di->dqb_itime;
 
 	if (check_blim) {
-		if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
 		}
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb..3e078eee564 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -14,14 +14,27 @@ MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
 {
 	m->dqb_ihardlimit = d->dqb_ihardlimit;
 	m->dqb_isoftlimit = d->dqb_isoftlimit;
 	m->dqb_curinodes = d->dqb_curinodes;
-	m->dqb_bhardlimit = d->dqb_bhardlimit;
-	m->dqb_bsoftlimit = d->dqb_bsoftlimit;
-	m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+	m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+	m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+	m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
 	m->dqb_itime = d->dqb_itime;
 	m->dqb_btime = d->dqb_btime;
 }
@@ -31,9 +44,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
 	d->dqb_ihardlimit = m->dqb_ihardlimit;
 	d->dqb_isoftlimit = m->dqb_isoftlimit;
 	d->dqb_curinodes = m->dqb_curinodes;
-	d->dqb_bhardlimit = m->dqb_bhardlimit;
-	d->dqb_bsoftlimit = m->dqb_bsoftlimit;
-	d->dqb_curblocks = toqb(m->dqb_curspace);
+	d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+	d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
 	d->dqb_itime = m->dqb_itime;
 	d->dqb_btime = m->dqb_btime;
 }
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d..51c4717f7c6 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -26,6 +26,19 @@ typedef char *dqbuf_t;
 #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
 #define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
 
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+	return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+	return blocks << QUOTABLOCK_BITS;
+}
+
 /* Check whether given file is really vfsv0 quotafile */
 static int v2_check_quota_file(struct super_block *sb, int type)
 {
@@ -104,8 +117,8 @@ static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
 	m->dqb_itime = le64_to_cpu(d->dqb_itime);
-	m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
-	m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+	m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
 }
@@ -116,8 +129,8 @@ static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
-	d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_id = cpu_to_le32(id);
-- 
cgit v1.2.3


From 1497d3ad487b64eeea83ac203263802755438949 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 15:49:59 +0200
Subject: quota: Remove bogus 'optimization' in check_idq() and check_bdq()

Checks like <= 0 for an unsigned type do not make much sense. The value
could be only 0 and that does not happen often enough for the check
to be worth it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index c02223b6aeb..c88330602dd 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1041,7 +1041,7 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1073,7 +1073,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-- 
cgit v1.2.3


From e4bc7b4b7ff783779b6928d55a9308910bf180a3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 16:21:01 +0200
Subject: quota: Make _SUSPENDED just a flag

Up to now, DQUOT_USR_SUSPENDED behaved like a state - i.e., either quota
was enabled or suspended or none. Now allowed states are 0, ENABLED,
ENABLED | SUSPENDED. This will be useful later when we implement separate
enabling of quota usage tracking and limits enforcement because we need to
keep track of a state which has been suspended.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index c88330602dd..22340c610e1 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1570,18 +1570,20 @@ static inline void reset_enable_flags(struct quota_info *dqopt, int type,
 {
 	switch (type) {
 		case USRQUOTA:
-			dqopt->flags &= ~DQUOT_USR_ENABLED;
 			if (remount)
 				dqopt->flags |= DQUOT_USR_SUSPENDED;
-			else
+			else {
+				dqopt->flags &= ~DQUOT_USR_ENABLED;
 				dqopt->flags &= ~DQUOT_USR_SUSPENDED;
+			}
 			break;
 		case GRPQUOTA:
-			dqopt->flags &= ~DQUOT_GRP_ENABLED;
 			if (remount)
 				dqopt->flags |= DQUOT_GRP_SUSPENDED;
-			else
+			else {
+				dqopt->flags &= ~DQUOT_GRP_ENABLED;
 				dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
+			}
 			break;
 	}
 }
-- 
cgit v1.2.3


From f55abc0fb9c3189de3da829adf3220322c0da43e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 17:50:32 +0200
Subject: quota: Allow to separately enable quota accounting and enforcing
 limits

Split DQUOT_USR_ENABLED (and DQUOT_GRP_ENABLED) into DQUOT_USR_USAGE_ENABLED
and DQUOT_USR_LIMITS_ENABLED. This way we are able to separately enable /
disable whether we should:
1) ignore quotas completely
2) just keep uptodate information about usage
3) actually enforce quota limits

This is going to be useful when quota is treated as filesystem metadata - we
then want to keep quota information uptodate all the time and just enable /
disable limits enforcement.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 222 ++++++++++++++++++++++++++++++++++++++-----------------------
 fs/quota.c |   8 +--
 2 files changed, 142 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 22340c610e1..7569633efe0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -489,7 +489,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -514,8 +514,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
-			&& info_dirty(&dqopt->info[cnt]))
+		if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+		    && info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
@@ -594,7 +594,7 @@ we_slept:
 		/* We have more than one user... nothing to do */
 		atomic_dec(&dquot->dq_count);
 		/* Releasing dquot during quotaoff phase? */
-		if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+		if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
 		    atomic_read(&dquot->dq_count) == 1)
 			wake_up(&dquot->dq_wait_unused);
 		spin_unlock(&dq_list_lock);
@@ -670,7 +670,7 @@ static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 	unsigned int hashent = hashfn(sb, id, type);
 	struct dquot *dquot, *empty = NODQUOT;
 
-        if (!sb_has_quota_enabled(sb, type))
+        if (!sb_has_quota_active(sb, type))
 		return NODQUOT;
 we_slept:
 	spin_lock(&dq_list_lock);
@@ -1041,7 +1041,8 @@ static inline char ignore_hardlimit(struct dquot *dquot)
 static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1073,7 +1074,8 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
 	*warntype = QUOTA_NL_NOWARN;
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
+	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_bhardlimit &&
@@ -1114,7 +1116,8 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
-	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
 		return QUOTA_NL_NOWARN;
 
 	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1508,7 +1511,7 @@ warn_put_all:
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 {
-	if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+	if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
 		vfs_dq_init(inode);
 		if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
 			return 1;
@@ -1551,53 +1554,22 @@ struct dquot_operations dquot_operations = {
 	.destroy_dquot	= dquot_destroy,
 };
 
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
-	switch (type) {
-		case USRQUOTA:
-			dqopt->flags |= DQUOT_USR_ENABLED;
-			dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			break;
-		case GRPQUOTA:
-			dqopt->flags |= DQUOT_GRP_ENABLED;
-			dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			break;
-	}
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
-				      int remount)
-{
-	switch (type) {
-		case USRQUOTA:
-			if (remount)
-				dqopt->flags |= DQUOT_USR_SUSPENDED;
-			else {
-				dqopt->flags &= ~DQUOT_USR_ENABLED;
-				dqopt->flags &= ~DQUOT_USR_SUSPENDED;
-			}
-			break;
-		case GRPQUOTA:
-			if (remount)
-				dqopt->flags |= DQUOT_GRP_SUSPENDED;
-			else {
-				dqopt->flags &= ~DQUOT_GRP_ENABLED;
-				dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
-			}
-			break;
-	}
-}
-
-
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 {
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *toputinode[MAXQUOTAS];
 
+	/* Cannot turn off usage accounting without turning off limits, or
+	 * suspend quotas and simultaneously turn quotas off. */
+	if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+	    || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+	    DQUOT_USAGE_ENABLED)))
+		return -EINVAL;
+
 	/* We need to serialize quota_off() for device */
 	mutex_lock(&dqopt->dqonoff_mutex);
 
@@ -1606,7 +1578,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 	 * sometimes we are called when fill_super() failed and calling
 	 * sync_fs() in such cases does no good.
 	 */
-	if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+	if (!sb_any_quota_loaded(sb)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 	}
@@ -1614,17 +1586,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		toputinode[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
-		/* If we keep inodes of quota files after remount and quotaoff
-		 * is called, drop kept inodes. */
-		if (!remount && sb_has_quota_suspended(sb, cnt)) {
-			iput(dqopt->files[cnt]);
-			dqopt->files[cnt] = NULL;
-			reset_enable_flags(dqopt, cnt, 0);
+		if (!sb_has_quota_loaded(sb, cnt))
 			continue;
+
+		if (flags & DQUOT_SUSPENDED) {
+			dqopt->flags |=
+				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+		} else {
+			dqopt->flags &= ~dquot_state_flag(flags, cnt);
+			/* Turning off suspended quotas? */
+			if (!sb_has_quota_loaded(sb, cnt) &&
+			    sb_has_quota_suspended(sb, cnt)) {
+				dqopt->flags &=	~dquot_state_flag(
+							DQUOT_SUSPENDED, cnt);
+				iput(dqopt->files[cnt]);
+				dqopt->files[cnt] = NULL;
+				continue;
+			}
 		}
-		if (!sb_has_quota_enabled(sb, cnt))
+
+		/* We still have to keep quota loaded? */
+		if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
 			continue;
-		reset_enable_flags(dqopt, cnt, remount);
 
 		/* Note: these are blocking operations */
 		drop_dquot_ref(sb, cnt);
@@ -1640,7 +1623,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 		put_quota_format(dqopt->info[cnt].dqi_format);
 
 		toputinode[cnt] = dqopt->files[cnt];
-		if (!remount)
+		if (!sb_has_quota_loaded(sb, cnt))
 			dqopt->files[cnt] = NULL;
 		dqopt->info[cnt].dqi_flags = 0;
 		dqopt->info[cnt].dqi_igrace = 0;
@@ -1663,7 +1646,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 			mutex_lock(&dqopt->dqonoff_mutex);
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
-			if (!sb_has_quota_enabled(sb, cnt)) {
+			if (!sb_has_quota_loaded(sb, cnt)) {
 				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
@@ -1673,10 +1656,13 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
 			/* On remount RO, we keep the inode pointer so that we
-			 * can reenable quota on the subsequent remount RW.
-			 * But we have better not keep inode pointer when there
-			 * is pending delete on the quota file... */
-			if (!remount)
+			 * can reenable quota on the subsequent remount RW. We
+			 * have to check 'flags' variable and not use sb_has_
+			 * function because another quotaon / quotaoff could
+			 * change global state before we got here. We refuse
+			 * to suspend quotas when there is pending delete on
+			 * the quota file... */
+			if (!(flags & DQUOT_SUSPENDED))
 				iput(toputinode[cnt]);
 			else if (!toputinode[cnt]->i_nlink)
 				ret = -EBUSY;
@@ -1686,12 +1672,22 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
 	return ret;
 }
 
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+}
+
 /*
  *	Turn quotas on on a device
  */
 
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+	unsigned int flags)
 {
 	struct quota_format_type *fmt = find_quota_format(format_id);
 	struct super_block *sb = inode->i_sb;
@@ -1713,6 +1709,11 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 		error = -EINVAL;
 		goto out_fmt;
 	}
+	/* Usage always has to be set... */
+	if (!(flags & DQUOT_USAGE_ENABLED)) {
+		error = -EINVAL;
+		goto out_fmt;
+	}
 
 	/* As we bypass the pagecache we must now flush the inode so that
 	 * we see all the changes from userspace... */
@@ -1721,8 +1722,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	invalidate_bdev(sb->s_bdev);
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
-	if (sb_has_quota_enabled(sb, type) ||
-			sb_has_quota_suspended(sb, type)) {
+	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
 	}
@@ -1754,7 +1754,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
-	set_enable_flags(dqopt, type);
+	dqopt->flags |= dquot_state_flag(flags, type);
 
 	add_dquot_ref(sb, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1787,20 +1787,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
 	int ret;
+	unsigned int flags;
 
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (!sb_has_quota_suspended(sb, type)) {
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return 0;
 	}
-	BUG_ON(sb_has_quota_enabled(sb, type));
-
 	inode = dqopt->files[type];
 	dqopt->files[type] = NULL;
-	reset_enable_flags(dqopt, type, 0);
+	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+						DQUOT_LIMITS_ENABLED, type);
+	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
-	ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+	flags = dquot_generic_flag(flags, type);
+	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+				   flags);
 	iput(inode);
 
 	return ret;
@@ -1816,12 +1819,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 	if (path->mnt->mnt_sb != sb)
 		error = -EXDEV;
 	else
-		error = vfs_quota_on_inode(path->dentry->d_inode, type,
-					   format_id);
+		error = vfs_load_quota_inode(path->dentry->d_inode, type,
+					     format_id, DQUOT_USAGE_ENABLED |
+					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
 
-/* Actual function called from quotactl() */
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
 {
@@ -1839,6 +1842,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 	return error;
 }
 
+/*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+		unsigned int flags)
+{
+	int ret = 0;
+	struct super_block *sb = inode->i_sb;
+	struct quota_info *dqopt = sb_dqopt(sb);
+
+	/* Just unsuspend quotas? */
+	if (flags & DQUOT_SUSPENDED)
+		return vfs_quota_on_remount(sb, type);
+	if (!flags)
+		return 0;
+	/* Just updating flags needed? */
+	if (sb_has_quota_loaded(sb, type)) {
+		mutex_lock(&dqopt->dqonoff_mutex);
+		/* Now do a reliable test... */
+		if (!sb_has_quota_loaded(sb, type)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			goto load_quota;
+		}
+		if (flags & DQUOT_USAGE_ENABLED &&
+		    sb_has_quota_usage_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		if (flags & DQUOT_LIMITS_ENABLED &&
+		    sb_has_quota_limits_enabled(sb, type)) {
+			ret = -EBUSY;
+			goto out_lock;
+		}
+		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+out_lock:
+		mutex_unlock(&dqopt->dqonoff_mutex);
+		return ret;
+	}
+
+load_quota:
+	return vfs_load_quota_inode(inode, type, format_id, flags);
+}
+
 /*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
@@ -1860,7 +1907,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
 
 	error = security_quota_on(dentry);
 	if (!error)
-		error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+		error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+				DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 
 out:
 	dput(dentry);
@@ -1997,12 +2045,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	int rc;
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!(dquot = dqget(sb, id, type))) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	dquot = dqget(sb, id, type);
+	if (!dquot) {
+		rc = -ESRCH;
+		goto out;
 	}
 	rc = do_set_dqblk(dquot, di);
 	dqput(dquot);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return rc;
 }
@@ -2013,7 +2063,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	struct mem_dqinfo *mi;
   
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
+	if (!sb_has_quota_active(sb, type)) {
 		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 		return -ESRCH;
 	}
@@ -2032,11 +2082,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
+	int err = 0;
 
 	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!sb_has_quota_enabled(sb, type)) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-		return -ESRCH;
+	if (!sb_has_quota_active(sb, type)) {
+		err = -ESRCH;
+		goto out;
 	}
 	mi = sb_dqopt(sb)->info + type;
 	spin_lock(&dq_data_lock);
@@ -2050,8 +2101,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
 	sb->dq_op->write_info(sb, type);
+out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
-	return 0;
+	return err;
 }
 
 struct quotactl_ops vfs_quotactl_ops = {
@@ -2213,9 +2265,11 @@ EXPORT_SYMBOL(register_quota_format);
 EXPORT_SYMBOL(unregister_quota_format);
 EXPORT_SYMBOL(dqstats);
 EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
 EXPORT_SYMBOL(vfs_quota_on);
 EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e0161..8678d9f35ee 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 		case Q_SETQUOTA:
 		case Q_GETQUOTA:
 			/* This is just informative test so we are satisfied without a lock */
-			if (!sb_has_quota_enabled(sb, type))
+			if (!sb_has_quota_active(sb, type))
 				return -ESRCH;
 	}
 
@@ -175,7 +175,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
-		if (!sb_has_quota_enabled(sb, cnt))
+		if (!sb_has_quota_active(sb, cnt))
 			continue;
 		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +201,7 @@ restart:
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 			if (type != -1 && type != cnt)
 				continue;
-			if (!sb_has_quota_enabled(sb, cnt))
+			if (!sb_has_quota_active(sb, cnt))
 				continue;
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
 			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +245,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 			__u32 fmt;
 
 			down_read(&sb_dqopt(sb)->dqptr_sem);
-			if (!sb_has_quota_enabled(sb, type)) {
+			if (!sb_has_quota_active(sb, type)) {
 				up_read(&sb_dqopt(sb)->dqptr_sem);
 				return -ESRCH;
 			}
-- 
cgit v1.2.3


From ee0d5ffe0da2aa992004447113e28622621a983f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 18:11:50 +0200
Subject: ext3: Use sb_any_quota_loaded() instead of sb_any_quota_enabled()

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ext3/super.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec..250ec53195c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1035,8 +1035,7 @@ static int parse_options (char *options, struct super_block *sb,
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 					"EXT3-fs: Cannot change journaled "
@@ -1075,8 +1074,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
@@ -1095,8 +1093,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT3-fs: Cannot change "
 					"journaled quota options when "
@@ -1115,8 +1112,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb) ||
-			    sb_any_quota_suspended(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT3-fs: Cannot change quota "
 					"options when quota turned on.\n");
 				return 0;
-- 
cgit v1.2.3


From 17bd13b31ce4fe7f789d8848e8cbc8cb42b10544 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 18:14:35 +0200
Subject: ext4: Use sb_any_quota_loaded() instead of sb_any_quota_enabled()

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ext4/super.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04158ad74db..49fcf8864e7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1142,8 +1142,7 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_grpjquota:
 			qtype = GRPQUOTA;
 set_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR
 				       "EXT4-fs: Cannot change journaled "
@@ -1182,8 +1181,7 @@ set_qf_name:
 		case Opt_offgrpjquota:
 			qtype = GRPQUOTA;
 clear_qf_name:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
@@ -1202,8 +1200,7 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 			qfmt = QFMT_VFS_V0;
 set_qf_format:
-			if ((sb_any_quota_enabled(sb) ||
-			     sb_any_quota_suspended(sb)) &&
+			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
 				printk(KERN_ERR "EXT4-fs: Cannot change "
 					"journaled quota options when "
@@ -1222,7 +1219,7 @@ set_qf_format:
 			set_opt(sbi->s_mount_opt, GRPQUOTA);
 			break;
 		case Opt_noquota:
-			if (sb_any_quota_enabled(sb)) {
+			if (sb_any_quota_loaded(sb)) {
 				printk(KERN_ERR "EXT4-fs: Cannot change quota "
 					"options when quota turned on.\n");
 				return 0;
-- 
cgit v1.2.3


From 6929f891241d3fe3af01d28503b645e63241e49a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 18:16:36 +0200
Subject: reiserfs: Use sb_any_quota_loaded() instead of
 sb_any_quota_enabled().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/reiserfs/super.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce..a9b393a5815 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -994,8 +994,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		if (c == 'u' || c == 'g') {
 			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
 
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1040,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 						 "reiserfs_parse_options: unknown quota format specified.");
 				return 0;
 			}
-			if ((sb_any_quota_enabled(s) ||
-			     sb_any_quota_suspended(s)) &&
+			if (sb_any_quota_loaded(s) &&
 			    *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
 				reiserfs_warning(s,
 						 "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1065,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 	}
 	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
 	if (!(*mount_options & (1 << REISERFS_QUOTA))
-	    && sb_any_quota_enabled(s)) {
+	    && sb_any_quota_loaded(s)) {
 		reiserfs_warning(s,
 				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
 		return 0;
-- 
cgit v1.2.3


From ca785ec66b991e9ca74dd9840fc014487ad095e1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 30 Sep 2008 17:53:37 +0200
Subject: quota: Introduce DQUOT_QUOTA_SYS_FILE flag

If filesystem can handle quota files as system files hidden from users, we can
skip a lot of cache invalidation, syncing, inode flags setting etc. when
turning quotas on, off and quota_sync. Allow filesystem to indicate that it is
hiding quota files from users by DQUOT_QUOTA_SYS_FILE flag.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 45 ++++++++++++++++++++++++++++++---------------
 fs/quota.c |  3 +++
 2 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 7569633efe0..74185c34a4f 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1631,6 +1631,11 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 		dqopt->ops[cnt] = NULL;
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
+
+	/* Skip syncing and setting flags if quota files are hidden */
+	if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+		goto put_inodes;
+
 	/* Sync the superblock so that buffers with quota data are written to
 	 * disk (and so userspace sees correct data afterwards). */
 	if (sb->s_op->sync_fs)
@@ -1655,6 +1660,12 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 				mark_inode_dirty(toputinode[cnt]);
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
+		}
+	if (sb->s_bdev)
+		invalidate_bdev(sb->s_bdev);
+put_inodes:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (toputinode[cnt]) {
 			/* On remount RO, we keep the inode pointer so that we
 			 * can reenable quota on the subsequent remount RW. We
 			 * have to check 'flags' variable and not use sb_has_
@@ -1667,8 +1678,6 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 			else if (!toputinode[cnt]->i_nlink)
 				ret = -EBUSY;
 		}
-	if (sb->s_bdev)
-		invalidate_bdev(sb->s_bdev);
 	return ret;
 }
 
@@ -1715,25 +1724,31 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		goto out_fmt;
 	}
 
-	/* As we bypass the pagecache we must now flush the inode so that
-	 * we see all the changes from userspace... */
-	write_inode_now(inode, 1);
-	/* And now flush the block cache so that kernel sees the changes */
-	invalidate_bdev(sb->s_bdev);
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* As we bypass the pagecache we must now flush the inode so
+		 * that we see all the changes from userspace... */
+		write_inode_now(inode, 1);
+		/* And now flush the block cache so that kernel sees the
+		 * changes */
+		invalidate_bdev(sb->s_bdev);
+	}
 	mutex_lock(&inode->i_mutex);
 	mutex_lock(&dqopt->dqonoff_mutex);
 	if (sb_has_quota_loaded(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
 	}
-	/* We don't want quota and atime on quota files (deadlocks possible)
-	 * Also nobody should write to the file - we use special IO operations
-	 * which ignore the immutable bit. */
-	down_write(&dqopt->dqptr_sem);
-	oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
-	inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-	up_write(&dqopt->dqptr_sem);
-	sb->dq_op->drop(inode);
+
+	if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+		/* We don't want quota and atime on quota files (deadlocks
+		 * possible) Also nobody should write to the file - we use
+		 * special IO operations which ignore the immutable bit. */
+		down_write(&dqopt->dqptr_sem);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		up_write(&dqopt->dqptr_sem);
+		sb->dq_op->drop(inode);
+	}
 
 	error = -EIO;
 	dqopt->files[type] = igrab(inode);
diff --git a/fs/quota.c b/fs/quota.c
index 8678d9f35ee..4a8c94f05f7 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	int cnt;
 
 	sb->s_qcop->quota_sync(sb, type);
+
+	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+		return;
 	/* This is not very clever (and fast) but currently I don't know about
 	 * any other simple way of getting quota data to disk and we must get
 	 * them there for userspace to be visible... */
-- 
cgit v1.2.3


From cf770c137122b78470a67ebd5498947869a09197 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 21 Sep 2008 23:17:53 +0200
Subject: quota: Move quotaio_v[12].h from include/linux/ to fs/

Since these include files are used only by implementation of quota formats,
there's no need to have them in include/linux/.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/quota_v1.c   |  3 ++-
 fs/quota_v2.c   |  7 ++---
 fs/quotaio_v1.h | 33 ++++++++++++++++++++++++
 fs/quotaio_v2.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 4 deletions(-)
 create mode 100644 fs/quotaio_v1.h
 create mode 100644 fs/quotaio_v2.h

(limited to 'fs')

diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 3e078eee564..b4af1c69ad1 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,13 +3,14 @@
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
 
 #include <asm/byteorder.h>
 
+#include "quotaio_v1.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Old quota format support");
 MODULE_LICENSE("GPL");
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index 51c4717f7c6..a21d1a7c356 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -15,6 +14,8 @@
 
 #include <asm/byteorder.h>
 
+#include "quotaio_v2.h"
+
 MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota format v2 support");
 MODULE_LICENSE("GPL");
@@ -129,8 +130,8 @@ static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
 	d->dqb_itime = cpu_to_le64(m->dqb_itime);
-	d->dqb_bhardlimit = cpu_to_le32(v2_qbtos(m->dqb_bhardlimit));
-	d->dqb_bsoftlimit = cpu_to_le32(v2_qbtos(m->dqb_bsoftlimit));
+	d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
 	d->dqb_id = cpu_to_le32(id);
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 00000000000..746654b5de7
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+
+#include <linux/types.h>
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit, it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME  604800	/* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME  604800	/* (7*24*60*60) 1 week */
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+	__u32 dqb_bhardlimit;	/* absolute limit on disk blks alloc */
+	__u32 dqb_bsoftlimit;	/* preferred limit on disk blks */
+	__u32 dqb_curblocks;	/* current block count */
+	__u32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__u32 dqb_isoftlimit;	/* preferred inode limit */
+	__u32 dqb_curinodes;	/* current # allocated inodes */
+	time_t dqb_btime;	/* time limit for excessive disk use */
+	time_t dqb_itime;	/* time limit for excessive inode use */
+};
+
+#define v1_dqoff(UID)      ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
+
+#endif	/* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 00000000000..303d7cbe30d
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,79 @@
+/*
+ *	Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+	0xd9c01f11,	/* USRQUOTA */\
+	0xd9c01927	/* GRPQUOTA */\
+}
+
+#define V2_INITQVERSIONS {\
+	0,		/* USRQUOTA */\
+	0		/* GRPQUOTA */\
+}
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2_disk_dqblk {
+	__le32 dqb_id;		/* id this quota applies to */
+	__le32 dqb_ihardlimit;	/* absolute limit on allocated inodes */
+	__le32 dqb_isoftlimit;	/* preferred inode limit */
+	__le32 dqb_curinodes;	/* current # allocated inodes */
+	__le32 dqb_bhardlimit;	/* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+	__le32 dqb_bsoftlimit;	/* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+	__le64 dqb_curspace;	/* current space occupied (in bytes) */
+	__le64 dqb_btime;	/* time limit for excessive disk use */
+	__le64 dqb_itime;	/* time limit for excessive inode use */
+};
+
+/*
+ * Here are header structures as written on disk and their in-memory copies
+ */
+/* First generic header */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* File version */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
+	__le32 dqi_igrace;	/* Time before inode soft limit becomes hard limit */
+	__le32 dqi_flags;	/* Flags for quotafile (DQF_*) */
+	__le32 dqi_blocks;	/* Number of blocks in file */
+	__le32 dqi_free_blk;	/* Number of first free block in the list */
+	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
+};
+
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ */
+struct v2_disk_dqdbheader {
+	__le32 dqdh_next_free;	/* Number of next block with free entry */
+	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
+	__le16 dqdh_entries;	/* Number of valid entries in block */
+	__le16 dqdh_pad1;
+	__le32 dqdh_pad2;
+};
+
+#define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS	10
+#define V2_DQBLKSIZE	(1 << V2_DQBLKSIZE_BITS)	/* Size of block with quota structures */
+#define V2_DQTREEOFF	1		/* Offset of tree in file in blocks */
+#define V2_DQTREEDEPTH	4		/* Depth of quota tree */
+#define V2_DQSTRINBLK	((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk))	/* Number of entries in one blocks */
+
+#endif /* _LINUX_QUOTAIO_V2_H */
-- 
cgit v1.2.3


From 1ccd14b9c271c1ac6eec5c5ec5def433100e7248 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 22 Sep 2008 05:54:49 +0200
Subject: quota: Split off quota tree handling into a separate file

There is going to be a new version of quota format having 64-bit
quota limits and a new quota format for OCFS2. They are both
going to use the same tree structure as VFSv0 quota format. So
split out tree handling into a separate file and make size of
leaf blocks, amount of space usable in each block (needed for
checksumming) and structures contained in them configurable
so that the code can be shared.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/Kconfig      |   5 +
 fs/Makefile     |   1 +
 fs/quota_tree.c | 645 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/quota_tree.h |  25 +++
 fs/quota_v2.c   | 596 +++++----------------------------------------------
 fs/quotaio_v2.h |  33 +--
 6 files changed, 735 insertions(+), 570 deletions(-)
 create mode 100644 fs/quota_tree.c
 create mode 100644 fs/quota_tree.h

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b93425ad15d..c1ce3d8831d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -302,6 +302,10 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+	 tristate
+
 config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
@@ -313,6 +317,7 @@ config QFMT_V1
 config QFMT_V2
 	tristate "Quota format v2 support"
 	depends on QUOTA
+	select QUOTA_TREE
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
 	  need this functionality say Y here.
diff --git a/fs/Makefile b/fs/Makefile
index e6f423d1d22..c830611550d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
 obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
 obj-$(CONFIG_QUOTACTL)		+= quota.o
 
 obj-$(CONFIG_PROC_FS)		+= proc/
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 00000000000..953404c95b1
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
+/*
+ *	Generic IO operations on tree structured quota files
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+typedef char *dqbuf_t;
+
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+	unsigned int epb = info->dqi_usable_bs >> 2;
+
+	depth = info->dqi_qtree_depth - depth - 1;
+	while (depth--)
+		id /= epb;
+	return id % epb;
+}
+
+/* Number of entries in one block */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
+	       / info->dqi_entry_size;
+}
+
+static dqbuf_t getdqbuf(size_t size)
+{
+	dqbuf_t buf = kmalloc(size, GFP_NOFS);
+	if (!buf)
+		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+	return buf;
+}
+
+static inline void freedqbuf(dqbuf_t buf)
+{
+	kfree(buf);
+}
+
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	memset(buf, 0, info->dqi_usable_bs);
+	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+}
+
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+}
+
+/* Remove empty block from list and return it */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int ret, blk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (info->dqi_free_blk) {
+		blk = info->dqi_free_blk;
+		ret = read_blk(info, blk, buf);
+		if (ret < 0)
+			goto out_buf;
+		info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+	}
+	else {
+		memset(buf, 0, info->dqi_usable_bs);
+		/* Assure block allocation... */
+		ret = write_blk(info, info->dqi_blocks, buf);
+		if (ret < 0)
+			goto out_buf;
+		blk = info->dqi_blocks++;
+	}
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	ret = blk;
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Insert empty block to the list */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	dh->dqdh_entries = cpu_to_le16(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		return err;
+	info->dqi_free_blk = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+}
+
+/* Remove given block from the list of blocks with free entries */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	if (nextblk) {
+		err = read_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							dh->dqdh_prev_free;
+		err = write_blk(info, nextblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	if (prevblk) {
+		err = read_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+							dh->dqdh_next_free;
+		err = write_blk(info, prevblk, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	} else {
+		info->dqi_free_entry = nextblk;
+		mark_info_dirty(info->dqi_sb, info->dqi_type);
+	}
+	freedqbuf(tmpbuf);
+	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+	/* No matter whether write succeeds block is out of list */
+	if (write_blk(info, blk, buf) < 0)
+		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Insert given block to the beginning of list with free entries */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int err;
+
+	if (!tmpbuf)
+		return -ENOMEM;
+	dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+	dh->dqdh_prev_free = cpu_to_le32(0);
+	err = write_blk(info, blk, buf);
+	if (err < 0)
+		goto out_buf;
+	if (info->dqi_free_entry) {
+		err = read_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+		((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+							cpu_to_le32(blk);
+		err = write_blk(info, info->dqi_free_entry, tmpbuf);
+		if (err < 0)
+			goto out_buf;
+	}
+	freedqbuf(tmpbuf);
+	info->dqi_free_entry = blk;
+	mark_info_dirty(info->dqi_sb, info->dqi_type);
+	return 0;
+out_buf:
+	freedqbuf(tmpbuf);
+	return err;
+}
+
+/* Is the entry in the block free? */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+	int i;
+
+	for (i = 0; i < info->dqi_entry_size; i++)
+		if (disk[i])
+			return 0;
+	return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/* Find space for dquot; sets dq_off, returns its block or 0 with *err < 0 */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+			      struct dquot *dquot, int *err)
+{
+	uint blk, i;
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *ddquot;
+
+	*err = 0;
+	if (!buf) {
+		*err = -ENOMEM;
+		return 0;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	if (info->dqi_free_entry) {
+		blk = info->dqi_free_entry;
+		*err = read_blk(info, blk, buf);
+		if (*err < 0)
+			goto out_buf;
+	} else {
+		blk = get_free_dqblk(info);
+		if ((int)blk < 0) {
+			*err = blk;
+			freedqbuf(buf);
+			return 0;
+		}
+		memset(buf, 0, info->dqi_usable_bs);
+		/* This is enough as block is already zeroed and entry list is empty... */
+		info->dqi_free_entry = blk;
+		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+	}
+	/* Block will be full? */
+	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+		*err = remove_free_dqentry(info, buf, blk);
+		if (*err < 0) {
+			printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+			       "remove block (%u) from entry free list.\n",
+			       blk);
+			goto out_buf;
+		}
+	}
+	le16_add_cpu(&dh->dqdh_entries, 1);
+	/* Find free structure in block */
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+	     i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+				"but it shouldn't.\n");
+		*err = -EIO;
+		goto out_buf;
+	}
+#endif
+	*err = write_blk(info, blk, buf);
+	if (*err < 0) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+				"data block %u.\n", blk);
+		goto out_buf;
+	}
+	dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
+			sizeof(struct qt_disk_dqdbheader) +
+			i * info->dqi_entry_size;
+	freedqbuf(buf);
+	return blk;
+out_buf:
+	freedqbuf(buf);
+	return 0;
+}
+
+/* Insert reference to structure into the trie */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			  uint *treeblk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0, newson = 0, newact = 0;
+	__le32 *ref;
+	uint newblk;
+
+	if (!buf)
+		return -ENOMEM;
+	if (!*treeblk) {
+		ret = get_free_dqblk(info);
+		if (ret < 0)
+			goto out_buf;
+		*treeblk = ret;
+		memset(buf, 0, info->dqi_usable_bs);
+		newact = 1;
+	} else {
+		ret = read_blk(info, *treeblk, buf);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't read tree quota block "
+					"%u.\n", *treeblk);
+			goto out_buf;
+		}
+	}
+	ref = (__le32 *)buf;
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!newblk)
+		newson = 1;
+	if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+		if (newblk) {
+			printk(KERN_ERR "VFS: Inserting already present quota "
+					"entry (block %u).\n",
+			       le32_to_cpu(ref[get_index(info,
+						dquot->dq_id, depth)]));
+			ret = -EIO;
+			goto out_buf;
+		}
+#endif
+		newblk = find_free_dqentry(info, dquot, &ret);
+	} else {
+		ret = do_insert_tree(info, dquot, &newblk, depth+1);
+	}
+	if (newson && ret >= 0) {
+		ref[get_index(info, dquot->dq_id, depth)] =
+							cpu_to_le32(newblk);
+		ret = write_blk(info, *treeblk, buf);
+	} else if (newact && ret < 0) {
+		put_free_dqblk(info, buf, *treeblk);
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Wrapper for inserting quota structure into tree, starting at the root block */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot)
+{
+	uint tmp = QT_TREEOFF;	/* do_insert_tree() takes a uint *, not int * */
+	return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	ssize_t ret;
+	dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+
+	if (!ddquot)
+		return -ENOMEM;
+
+	/* dq_off is guarded by dqio_mutex */
+	if (!dquot->dq_off) {
+		ret = dq_insert_tree(info, dquot);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Error %zd occurred while "
+					"creating quota.\n", ret);
+			freedqbuf(ddquot);
+			return ret;
+		}
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+	spin_unlock(&dq_data_lock);
+	ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
+					info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+		       sb->s_id);
+		if (ret >= 0)
+			ret = -ENOSPC;
+	} else {
+		ret = 0;
+	}
+	dqstats.writes++;
+	freedqbuf(ddquot);
+
+	return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/* Free dquot entry in data block; -EIO if dq_off does not point into blk */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			uint blk)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+	int ret = 0;
+
+	if (!buf)
+		return -ENOMEM;
+	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+		printk(KERN_ERR "VFS: Quota structure has offset to other "
+		  "block (%u) than it should (%u).\n", blk,
+		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		ret = -EIO;	/* 0 would make remove_tree() drop the ref */
+		goto out_buf;
+	}
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		goto out_buf;
+	}
+	le16_add_cpu(&dh->dqdh_entries, -1);
+	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
+		ret = remove_free_dqentry(info, buf, blk);
+		if (ret >= 0)
+			ret = put_free_dqblk(info, buf, blk);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+			  "to free list.\n", blk);
+			goto out_buf;
+		}
+	} else {
+		memset(buf +
+		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+		       0, info->dqi_entry_size);
+		if (le16_to_cpu(dh->dqdh_entries) ==
+		    qtree_dqstr_in_blk(info) - 1) {
+			/* Insert will write block itself */
+			ret = insert_free_dqentry(info, buf, blk);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't insert quota data "
+				       "block (%u) to free entry list.\n", blk);
+				goto out_buf;
+			}
+		} else {
+			ret = write_blk(info, blk, buf);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't write quota data "
+				  "block %u\n", blk);
+				goto out_buf;
+			}
+		}
+	}
+	dquot->dq_off = 0;	/* Quota is now unattached */
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Remove reference to dquot from tree */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+		       uint *blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+	uint newblk;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, *blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+		goto out_buf;
+	}
+	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (depth == info->dqi_qtree_depth - 1) {
+		ret = free_dqentry(info, dquot, newblk);
+		newblk = 0;
+	} else {
+		ret = remove_tree(info, dquot, &newblk, depth+1);
+	}
+	if (ret >= 0 && !newblk) {
+		int i;
+		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+		/* Block got empty? */
+		for (i = 0;
+		     i < (info->dqi_usable_bs >> 2) && !ref[i];
+		     i++);
+		/* Don't put the root block into the free block list */
+		if (i == (info->dqi_usable_bs >> 2)
+		    && *blk != QT_TREEOFF) {
+			put_free_dqblk(info, buf, *blk);
+			*blk = 0;
+		} else {
+			ret = write_blk(info, *blk, buf);
+			if (ret < 0)
+				printk(KERN_ERR "VFS: Can't write quota tree "
+				  "block %u.\n", *blk);
+		}
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Delete dquot from tree */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	uint tmp = QT_TREEOFF;
+
+	if (!dquot->dq_off)	/* Even not allocated? */
+		return 0;
+	return remove_tree(info, dquot, &tmp, 0);
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/* Find entry in block; returns its file offset or a negative error */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot, uint blk)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	int i;
+	char *ddquot;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+	     i++, ddquot += info->dqi_entry_size);
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: Quota for id %u referenced "
+		  "but not present.\n", dquot->dq_id);
+		ret = -EIO;
+		goto out_buf;
+	} else {
+		ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
+		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+				struct dquot *dquot, uint blk, int depth)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	__le32 *ref = (__le32 *)buf;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	ret = 0;
+	blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+	if (!blk)	/* No reference? */
+		goto out_buf;
+	if (depth < info->dqi_qtree_depth - 1)
+		ret = find_tree_dqentry(info, dquot, blk, depth+1);
+	else
+		ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/* Find entry for given id in the tree - wrapper function */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+				  struct dquot *dquot)
+{
+	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	int type = dquot->dq_type;
+	struct super_block *sb = dquot->dq_sb;
+	loff_t offset;
+	dqbuf_t ddquot;
+	int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+	/* Invalidated quota? */
+	if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+		return -EIO;
+	}
+#endif
+	/* Do we know offset of the dquot entry in the quota file? */
+	if (!dquot->dq_off) {
+		offset = find_dqentry(info, dquot);
+		if (offset <= 0) {	/* Entry not present? */
+			if (offset < 0)
+				printk(KERN_ERR "VFS: Can't read quota "
+				  "structure for id %u.\n", dquot->dq_id);
+			dquot->dq_off = 0;
+			set_bit(DQ_FAKE_B, &dquot->dq_flags);
+			memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+			ret = offset;
+			goto out;
+		}
+		dquot->dq_off = offset;
+	}
+	ddquot = getdqbuf(info->dqi_entry_size);
+	if (!ddquot)
+		return -ENOMEM;
+	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
+				   info->dqi_entry_size, dquot->dq_off);
+	if (ret != info->dqi_entry_size) {
+		if (ret >= 0)
+			ret = -EIO;
+		printk(KERN_ERR "VFS: Error while reading quota "
+				"structure for id %u.\n", dquot->dq_id);
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+		freedqbuf(ddquot);
+		goto out;
+	}
+	spin_lock(&dq_data_lock);
+	info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+	if (!dquot->dq_dqb.dqb_bhardlimit &&
+	    !dquot->dq_dqb.dqb_bsoftlimit &&
+	    !dquot->dq_dqb.dqb_ihardlimit &&
+	    !dquot->dq_dqb.dqb_isoftlimit)
+		set_bit(DQ_FAKE_B, &dquot->dq_flags);
+	spin_unlock(&dq_data_lock);
+	freedqbuf(ddquot);
+out:
+	dqstats.reads++;
+	return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/* Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock) */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+		return qtree_delete_dquot(info, dquot);
+	return 0;
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 00000000000..a1ab8db81a5
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ *	Definitions of structures for tree structured quota files
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ *  Structure of header of block with quota structures. It is padded to 16 bytes so
+ *  there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+	__le32 dqdh_next_free;	/* Number of next block with free entry */
+	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
+	__le16 dqdh_entries;	/* Number of valid entries in block */
+	__le16 dqdh_pad1;
+	__le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF	1		/* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index a21d1a7c356..a87f1028a42 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -14,6 +14,7 @@
 
 #include <asm/byteorder.h>
 
+#include "quota_tree.h"
 #include "quotaio_v2.h"
 
 MODULE_AUTHOR("Jan Kara");
@@ -22,10 +23,15 @@ MODULE_LICENSE("GPL");
 
 #define __QUOTA_V2_PARANOIA
 
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
 
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static struct qtree_fmt_operations v2_qtree_ops = {
+	.mem2disk_dqblk = v2_mem2diskdqb,
+	.disk2mem_dqblk = v2_disk2memdqb,
+	.is_id = v2_is_id,
+};
 
 #define QUOTABLOCK_BITS 10
 #define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
@@ -64,7 +70,7 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
 	ssize_t size;
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -80,9 +86,16 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	info->u.v2_i.i.dqi_sb = sb;
+	info->u.v2_i.i.dqi_type = type;
+	info->u.v2_i.i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	info->u.v2_i.i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	info->u.v2_i.i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	info->u.v2_i.i.dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	info->u.v2_i.i.dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	info->u.v2_i.i.dqi_qtree_depth = qtree_depth(&info->u.v2_i.i);
+	info->u.v2_i.i.dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	info->u.v2_i.i.dqi_ops = &v2_qtree_ops;
 	return 0;
 }
 
@@ -90,7 +103,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 static int v2_write_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
 	ssize_t size;
 
 	spin_lock(&dq_data_lock);
@@ -99,9 +112,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.i.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.i.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.i.dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -112,8 +125,11 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	return 0;
 }
 
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
 {
+	struct v2_disk_dqblk *d = dp, empty;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
 	m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
 	m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
 	m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
@@ -122,10 +138,20 @@ static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
 	m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
 	m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
 	m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	/* We need to escape back all-zero structure */
+	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+	empty.dqb_itime = cpu_to_le64(1);
+	if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+		m->dqb_itime = 0;
 }
 
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 {
+	struct v2_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+	struct qtree_mem_dqinfo *info =
+			&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i;
+
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
 	d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
@@ -134,553 +160,35 @@ static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
 	d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
 	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
 	d->dqb_btime = cpu_to_le64(m->dqb_btime);
-	d->dqb_id = cpu_to_le32(id);
-}
-
-static dqbuf_t getdqbuf(void)
-{
-	dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
-	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
-	return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	memset(buf, 0, V2_DQBLKSIZE);
-	return sb->s_op->quota_read(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
-	return sb->s_op->quota_write(sb, type, (char *)buf,
-	       V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
-	dqbuf_t buf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int ret, blk;
-
-	if (!buf)
-		return -ENOMEM;
-	if (info->u.v2_i.dqi_free_blk) {
-		blk = info->u.v2_i.dqi_free_blk;
-		if ((ret = read_blk(sb, type, blk, buf)) < 0)
-			goto out_buf;
-		info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
-	}
-	else {
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* Assure block allocation... */
-		if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
-			goto out_buf;
-		blk = info->u.v2_i.dqi_blocks++;
-	}
-	mark_info_dirty(sb, type);
-	ret = blk;
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	dh->dqdh_entries = cpu_to_le16(0);
-	info->u.v2_i.dqi_free_blk = blk;
-	mark_info_dirty(sb, type);
-	/* Some strange block. We had better leave it... */
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		return err;
-	return 0;
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	if (qtree_entry_unused(info, dp))
+		d->dqb_itime = cpu_to_le64(1);
 }
 
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
+static int v2_is_id(void *dp, struct dquot *dquot)
 {
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
-	int err;
+	struct v2_disk_dqblk *d = dp;
+	struct qtree_mem_dqinfo *info =
+			&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i;
 
-	if (!tmpbuf)
-		return -ENOMEM;
-	if (nextblk) {
-		if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
-		if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	if (prevblk) {
-		if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
-		if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	else {
-		info->u.v2_i.dqi_free_entry = nextblk;
-		mark_info_dirty(sb, type);
-	}
-	freedqbuf(tmpbuf);
-	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
-	/* No matter whether write succeeds block is out of list */
-	if (write_blk(sb, type, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
-	dqbuf_t tmpbuf = getdqbuf();
-	struct mem_dqinfo *info = sb_dqinfo(sb, type);
-	struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
-	int err;
-
-	if (!tmpbuf)
-		return -ENOMEM;
-	dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
-	dh->dqdh_prev_free = cpu_to_le32(0);
-	if ((err = write_blk(sb, type, blk, buf)) < 0)
-		goto out_buf;
-	if (info->u.v2_i.dqi_free_entry) {
-		if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-		((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
-		if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
-			goto out_buf;
-	}
-	freedqbuf(tmpbuf);
-	info->u.v2_i.dqi_free_entry = blk;
-	mark_info_dirty(sb, type);
-	return 0;
-out_buf:
-	freedqbuf(tmpbuf);
-	return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
-	struct super_block *sb = dquot->dq_sb;
-	struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
-	uint blk, i;
-	struct v2_disk_dqdbheader *dh;
-	struct v2_disk_dqblk *ddquot;
-	struct v2_disk_dqblk fakedquot;
-	dqbuf_t buf;
-
-	*err = 0;
-	if (!(buf = getdqbuf())) {
-		*err = -ENOMEM;
+	if (qtree_entry_unused(info, dp))
 		return 0;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	ddquot = GETENTRIES(buf);
-	if (info->u.v2_i.dqi_free_entry) {
-		blk = info->u.v2_i.dqi_free_entry;
-		if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
-			goto out_buf;
-	}
-	else {
-		blk = get_free_dqblk(sb, dquot->dq_type);
-		if ((int)blk < 0) {
-			*err = blk;
-			freedqbuf(buf);
-			return 0;
-		}
-		memset(buf, 0, V2_DQBLKSIZE);
-		/* This is enough as block is already zeroed and entry list is empty... */
-		info->u.v2_i.dqi_free_entry = blk;
-		mark_info_dirty(sb, dquot->dq_type);
-	}
-	if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK)	/* Block will be full? */
-		if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
-			goto out_buf;
-		}
-	le16_add_cpu(&dh->dqdh_entries, 1);
-	memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-	/* Find free structure in block */
-	for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
-		*err = -EIO;
-		goto out_buf;
-	}
-#endif
-	if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
-		goto out_buf;
-	}
-	dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
-	freedqbuf(buf);
-	return blk;
-out_buf:
-	freedqbuf(buf);
-	return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
 }
 
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	dqbuf_t buf;
-	int ret = 0, newson = 0, newact = 0;
-	__le32 *ref;
-	uint newblk;
-
-	if (!(buf = getdqbuf()))
-		return -ENOMEM;
-	if (!*treeblk) {
-		ret = get_free_dqblk(sb, dquot->dq_type);
-		if (ret < 0)
-			goto out_buf;
-		*treeblk = ret;
-		memset(buf, 0, V2_DQBLKSIZE);
-		newact = 1;
-	}
-	else {
-		if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
-			printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
-			goto out_buf;
-		}
-	}
-	ref = (__le32 *)buf;
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!newblk)
-		newson = 1;
-	if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
-		if (newblk) {
-			printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
-			ret = -EIO;
-			goto out_buf;
-		}
-#endif
-		newblk = find_free_dqentry(dquot, &ret);
-	}
-	else
-		ret = do_insert_tree(dquot, &newblk, depth+1);
-	if (newson && ret >= 0) {
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
-		ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
-	}
-	else if (newact && ret < 0)
-		put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
+static int v2_read_dquot(struct dquot *dquot)
 {
-	int tmp = V2_DQTREEOFF;
-	return do_insert_tree(dquot, &tmp, 0);
+	return qtree_read_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
 }
 
-/*
- *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
 static int v2_write_dquot(struct dquot *dquot)
 {
-	int type = dquot->dq_type;
-	ssize_t ret;
-	struct v2_disk_dqblk ddquot, empty;
-
-	/* dq_off is guarded by dqio_mutex */
-	if (!dquot->dq_off)
-		if ((ret = dq_insert_tree(dquot)) < 0) {
-			printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
-			return ret;
-		}
-	spin_lock(&dq_data_lock);
-	mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
-	/* Argh... We may need to write structure full of zeroes but that would be
-	 * treated as an empty place by the rest of the code. Format change would
-	 * be definitely cleaner but the problems probably are not worth it */
-	memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-	if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-		ddquot.dqb_itime = cpu_to_le64(1);
-	spin_unlock(&dq_data_lock);
-	ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
-	      (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
-	if (ret != sizeof(struct v2_disk_dqblk)) {
-		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
-		if (ret >= 0)
-			ret = -ENOSPC;
-	}
-	else
-		ret = 0;
-	dqstats.writes++;
-
-	return ret;
+	return qtree_write_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
 }
 
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	struct v2_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-
-	if (!buf)
-		return -ENOMEM;
-	if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
-		printk(KERN_ERR "VFS: Quota structure has offset to other "
-		  "block (%u) than it should (%u).\n", blk,
-		  (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
-		goto out_buf;
-	}
-	if ((ret = read_blk(sb, type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
-		goto out_buf;
-	}
-	dh = (struct v2_disk_dqdbheader *)buf;
-	le16_add_cpu(&dh->dqdh_entries, -1);
-	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
-		if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
-		    (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
-			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
-			  "to free list.\n", blk);
-			goto out_buf;
-		}
-	}
-	else {
-		memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
-		  sizeof(struct v2_disk_dqblk));
-		if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
-			/* Insert will write block itself */
-			if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
-				printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
-				goto out_buf;
-			}
-		}
-		else
-			if ((ret = write_blk(sb, type, blk, buf)) < 0) {
-				printk(KERN_ERR "VFS: Can't write quota data "
-				  "block %u\n", blk);
-				goto out_buf;
-			}
-	}
-	dquot->dq_off = 0;	/* Quota is now unattached */
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
-	struct super_block *sb = dquot->dq_sb;
-	int type = dquot->dq_type;
-	dqbuf_t buf = getdqbuf();
-	int ret = 0;
-	uint newblk;
-	__le32 *ref = (__le32 *)buf;
-	
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
-		goto out_buf;
-	}
-	newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (depth == V2_DQTREEDEPTH-1) {
-		ret = free_dqentry(dquot, newblk);
-		newblk = 0;
-	}
-	else
-		ret = remove_tree(dquot, &newblk, depth+1);
-	if (ret >= 0 && !newblk) {
-		int i;
-		ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
-		for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++);	/* Block got empty? */
-		/* Don't put the root block into the free block list */
-		if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
-			put_free_dqblk(sb, type, buf, *blk);
-			*blk = 0;
-		}
-		else
-			if ((ret = write_blk(sb, type, *blk, buf)) < 0)
-				printk(KERN_ERR "VFS: Can't write quota tree "
-				  "block %u.\n", *blk);
-	}
-out_buf:
-	freedqbuf(buf);
-	return ret;	
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
-	uint tmp = V2_DQTREEOFF;
-
-	if (!dquot->dq_off)	/* Even not allocated? */
-		return 0;
-	return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	int i;
-	struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	if (dquot->dq_id)
-		for (i = 0; i < V2_DQSTRINBLK &&
-		     le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
-	else {	/* ID 0 as a bit more complicated searching... */
-		struct v2_disk_dqblk fakedquot;
-
-		memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
-		for (i = 0; i < V2_DQSTRINBLK; i++)
-			if (!le32_to_cpu(ddquot[i].dqb_id) &&
-			    memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
-				break;
-	}
-	if (i == V2_DQSTRINBLK) {
-		printk(KERN_ERR "VFS: Quota for id %u referenced "
-		  "but not present.\n", dquot->dq_id);
-		ret = -EIO;
-		goto out_buf;
-	}
-	else
-		ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
-		  v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
-	dqbuf_t buf = getdqbuf();
-	loff_t ret = 0;
-	__le32 *ref = (__le32 *)buf;
-
-	if (!buf)
-		return -ENOMEM;
-	if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
-		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
-		goto out_buf;
-	}
-	ret = 0;
-	blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
-	if (!blk)	/* No reference? */
-		goto out_buf;
-	if (depth < V2_DQTREEDEPTH-1)
-		ret = find_tree_dqentry(dquot, blk, depth+1);
-	else
-		ret = find_block_dqentry(dquot, blk);
-out_buf:
-	freedqbuf(buf);
-	return ret;
-}
-
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
-	return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-
-static int v2_read_dquot(struct dquot *dquot)
-{
-	int type = dquot->dq_type;
-	loff_t offset;
-	struct v2_disk_dqblk ddquot, empty;
-	int ret = 0;
-
-#ifdef __QUOTA_V2_PARANOIA
-	/* Invalidated quota? */
-	if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
-		printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
-		return -EIO;
-	}
-#endif
-	offset = find_dqentry(dquot);
-	if (offset <= 0) {	/* Entry not present? */
-		if (offset < 0)
-			printk(KERN_ERR "VFS: Can't read quota "
-			  "structure for id %u.\n", dquot->dq_id);
-		dquot->dq_off = 0;
-		set_bit(DQ_FAKE_B, &dquot->dq_flags);
-		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-		ret = offset;
-	}
-	else {
-		dquot->dq_off = offset;
-		if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
-		    (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
-		    != sizeof(struct v2_disk_dqblk)) {
-			if (ret >= 0)
-				ret = -EIO;
-			printk(KERN_ERR "VFS: Error while reading quota "
-			  "structure for id %u.\n", dquot->dq_id);
-			memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
-		}
-		else {
-			ret = 0;
-			/* We need to escape back all-zero structure */
-			memset(&empty, 0, sizeof(struct v2_disk_dqblk));
-			empty.dqb_itime = cpu_to_le64(1);
-			if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
-				ddquot.dqb_itime = 0;
-		}
-		disk2memdqb(&dquot->dq_dqb, &ddquot);
-		if (!dquot->dq_dqb.dqb_bhardlimit &&
-			!dquot->dq_dqb.dqb_bsoftlimit &&
-			!dquot->dq_dqb.dqb_ihardlimit &&
-			!dquot->dq_dqb.dqb_isoftlimit)
-			set_bit(DQ_FAKE_B, &dquot->dq_flags);
-	}
-	dqstats.reads++;
-
-	return ret;
-}
-
-/* Check whether dquot should not be deleted. We know we are
- * the only one operating on dquot (thanks to dq_lock) */
 static int v2_release_dquot(struct dquot *dquot)
 {
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
-		return v2_delete_dquot(dquot);
-	return 0;
+	return qtree_release_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
 }
 
 static struct quota_format_ops v2_format_ops = {
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
index 303d7cbe30d..530fe580685 100644
--- a/fs/quotaio_v2.h
+++ b/fs/quotaio_v2.h
@@ -21,6 +21,12 @@
 	0		/* GRPQUOTA */\
 }
 
+/* First generic header */
+struct v2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* File version */
+};
+
 /*
  * The following structure defines the format of the disk quota file
  * (as it appears on disk) - the file is a radix tree whose leaves point
@@ -38,15 +44,6 @@ struct v2_disk_dqblk {
 	__le64 dqb_itime;	/* time limit for excessive inode use */
 };
 
-/*
- * Here are header structures as written on disk and their in-memory copies
- */
-/* First generic header */
-struct v2_disk_dqheader {
-	__le32 dqh_magic;	/* Magic number identifying file */
-	__le32 dqh_version;	/* File version */
-};
-
 /* Header with type and version specific information */
 struct v2_disk_dqinfo {
 	__le32 dqi_bgrace;	/* Time before block soft limit becomes hard limit */
@@ -57,23 +54,7 @@ struct v2_disk_dqinfo {
 	__le32 dqi_free_entry;	/* Number of block with at least one free entry */
 };
 
-/*
- *  Structure of header of block with quota structures. It is padded to 16 bytes so
- *  there will be space for exactly 21 quota-entries in a block
- */
-struct v2_disk_dqdbheader {
-	__le32 dqdh_next_free;	/* Number of next block with free entry */
-	__le32 dqdh_prev_free;	/* Number of previous block with free entry */
-	__le16 dqdh_entries;	/* Number of valid entries in block */
-	__le16 dqdh_pad1;
-	__le32 dqdh_pad2;
-};
-
 #define V2_DQINFOOFF	sizeof(struct v2_disk_dqheader)	/* Offset of info header in file */
-#define V2_DQBLKSIZE_BITS	10
-#define V2_DQBLKSIZE	(1 << V2_DQBLKSIZE_BITS)	/* Size of block with quota structures */
-#define V2_DQTREEOFF	1		/* Offset of tree in file in blocks */
-#define V2_DQTREEDEPTH	4		/* Depth of quota tree */
-#define V2_DQSTRINBLK	((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk))	/* Number of entries in one blocks */
+#define V2_DQBLKSIZE_BITS 10				/* Size of leaf block in tree */
 
 #endif /* _LINUX_QUOTAIO_V2_H */
-- 
cgit v1.2.3


From e3d4d56b9715e40ded2a84d0d4fa7f3b6c58983c Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Oct 2008 18:44:14 +0200
Subject: quota: Convert union in mem_dqinfo to a pointer

The upcoming quota support for OCFS2 is going to need quite a bit
of additional per-sb quota information. Moreover, having fs.h
include all the types needed for this structure would be a
pain in the a**. So remove the union from mem_dqinfo and add
a private pointer for the filesystem's use.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/quota_v2.c | 53 ++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index a87f1028a42..b618b563635 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -71,6 +71,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
 	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 
 	size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -80,22 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			sb->s_id);
 		return -1;
 	}
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	if (!info->dqi_priv) {
+		printk(KERN_WARNING
+		       "Not enough memory for quota information structure.\n");
+		return -1;
+	}
+	qinfo = info->dqi_priv;
 	/* limits are stored as unsigned 32-bit data */
 	info->dqi_maxblimit = 0xffffffff;
 	info->dqi_maxilimit = 0xffffffff;
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
-	info->u.v2_i.i.dqi_sb = sb;
-	info->u.v2_i.i.dqi_type = type;
-	info->u.v2_i.i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
-	info->u.v2_i.i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
-	info->u.v2_i.i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
-	info->u.v2_i.i.dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
-	info->u.v2_i.i.dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
-	info->u.v2_i.i.dqi_qtree_depth = qtree_depth(&info->u.v2_i.i);
-	info->u.v2_i.i.dqi_entry_size = sizeof(struct v2_disk_dqblk);
-	info->u.v2_i.i.dqi_ops = &v2_qtree_ops;
+	qinfo->dqi_sb = sb;
+	qinfo->dqi_type = type;
+	qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+	qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+	qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+	qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+	qinfo->dqi_ops = &v2_qtree_ops;
 	return 0;
 }
 
@@ -104,6 +112,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 {
 	struct v2_disk_dqinfo dinfo;
 	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
 
 	spin_lock(&dq_data_lock);
@@ -112,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
 	dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
-	dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.i.dqi_blocks);
-	dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.i.dqi_free_blk);
-	dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.i.dqi_free_entry);
+	dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -150,7 +159,7 @@ static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
 	struct v2_disk_dqblk *d = dp;
 	struct mem_dqblk *m = &dquot->dq_dqb;
 	struct qtree_mem_dqinfo *info =
-			&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i;
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
 
 	d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
 	d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
@@ -169,7 +178,7 @@ static int v2_is_id(void *dp, struct dquot *dquot)
 {
 	struct v2_disk_dqblk *d = dp;
 	struct qtree_mem_dqinfo *info =
-			&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i;
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
 
 	if (qtree_entry_unused(info, dp))
 		return 0;
@@ -178,24 +187,30 @@ static int v2_is_id(void *dp, struct dquot *dquot)
 
 static int v2_read_dquot(struct dquot *dquot)
 {
-	return qtree_read_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
+	return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
 static int v2_write_dquot(struct dquot *dquot)
 {
-	return qtree_write_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
+	return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
 }
 
 static int v2_release_dquot(struct dquot *dquot)
 {
-	return qtree_release_dquot(&sb_dqinfo(dquot->dq_sb, dquot->dq_type)->u.v2_i.i, dquot);
+	return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
+}
+
+static int v2_free_file_info(struct super_block *sb, int type)
+{
+	kfree(sb_dqinfo(sb, type)->dqi_priv);
+	return 0;
 }
 
 static struct quota_format_ops v2_format_ops = {
 	.check_quota_file	= v2_check_quota_file,
 	.read_file_info		= v2_read_file_info,
 	.write_file_info	= v2_write_file_info,
-	.free_file_info		= NULL,
+	.free_file_info		= v2_free_file_info,
 	.read_dqblk		= v2_read_dquot,
 	.commit_dqblk		= v2_write_dquot,
 	.release_dqblk		= v2_release_dquot,
-- 
cgit v1.2.3


From db49d2df489f727096438706a5428115e84a3f0d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 1 Oct 2008 18:21:39 +0200
Subject: quota: Allow negative usage of space and inodes

For clustered filesystems, it can happen that space / inode usage goes
negative temporarily (because one node is allocating while another node
is freeing and they are not completely in sync). So let the quota code
allow this and change qsize_t to a signed type so that we don't
underflow the variables.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 74185c34a4f..9c78ffe1aad 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -847,7 +847,8 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 
 static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
-	if (dquot->dq_dqb.dqb_curinodes > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curinodes >= number)
 		dquot->dq_dqb.dqb_curinodes -= number;
 	else
 		dquot->dq_dqb.dqb_curinodes = 0;
@@ -858,7 +859,8 @@ static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 
 static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
-	if (dquot->dq_dqb.dqb_curspace > number)
+	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+	    dquot->dq_dqb.dqb_curspace >= number)
 		dquot->dq_dqb.dqb_curspace -= number;
 	else
 		dquot->dq_dqb.dqb_curspace = 0;
-- 
cgit v1.2.3


From 4d59bce4f9eaf26d6d9046b56a2f1c0c7f20981d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 2 Oct 2008 16:48:10 +0200
Subject: quota: Keep which entries were set by SETQUOTA quotactl

Quota in a clustered environment needs to synchronize quota information
among cluster nodes. This means we have to occasionally update some
information in dquot from disk / network. On the other hand, we have to
be careful not to overwrite changes the administrator made via SETQUOTA.
So indicate in dquot->dq_flags which entries have been set by SETQUOTA;
the quota format can clear these flags once it has properly propagated
the changes.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 9c78ffe1aad..89226726daa 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -2010,25 +2010,33 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	if (di->dqb_valid & QIF_SPACE) {
 		dm->dqb_curspace = di->dqb_curspace;
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_BLIMITS) {
 		dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
 		dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
 		check_blim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_INODES) {
 		dm->dqb_curinodes = di->dqb_curinodes;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ILIMITS) {
 		dm->dqb_isoftlimit = di->dqb_isoftlimit;
 		dm->dqb_ihardlimit = di->dqb_ihardlimit;
 		check_ilim = 1;
+		__set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
 	}
-	if (di->dqb_valid & QIF_BTIME)
+	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
-	if (di->dqb_valid & QIF_ITIME)
+		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	}
+	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
+		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	}
 
 	if (check_blim) {
 		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
-- 
cgit v1.2.3


From 3d9ea253a0e73dccaa869888ec2ceb17ea76c810 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 10 Oct 2008 16:12:23 +0200
Subject: quota: Add helpers to allow ocfs2 specific quota initialization,
 freeing and recovery

OCFS2 needs to peek whether a quota structure is already in memory so
that it can avoid expensive cluster locking in that case. Similarly,
when freeing dquots, it checks whether it is the last user of the quota
structure or not. Finally, it needs to get a reference to the dquot
structure for a specified id and quota type when recovering a quota file
after a crash.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 89226726daa..ae8fd9e645c 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
 
-static void dqput(struct dquot *dquot);
-
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
 {
@@ -568,7 +566,7 @@ static struct shrinker dqcache_shrinker = {
  * NOTE: If you change this function please check whether dqput_blocks() works right...
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
 {
 	int ret;
 
@@ -661,11 +659,29 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 	return dquot;
 }
 
+/*
+ * Check whether dquot is in memory.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+	unsigned int hashent = hashfn(sb, id, type);
+	int ret = 0;
+
+        if (!sb_has_quota_active(sb, type))
+		return 0;
+	spin_lock(&dq_list_lock);
+	if (find_dquot(hashent, sb, id, type) != NODQUOT)
+		ret = 1;
+	spin_unlock(&dq_list_lock);
+	return ret;
+}
+
 /*
  * Get reference to dquot
  * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
 	struct dquot *dquot, *empty = NODQUOT;
@@ -1184,17 +1200,23 @@ out_err:
  * 	Release all quotas referenced by inode
  *	Transaction must be started at an entry
  */
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
 {
 	int cnt;
 
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (inode->i_dquot[cnt] != NODQUOT) {
 			dqput(inode->i_dquot[cnt]);
 			inode->i_dquot[cnt] = NODQUOT;
 		}
 	}
+	return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	dquot_drop_locked(inode);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return 0;
 }
@@ -2308,7 +2330,11 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
-- 
cgit v1.2.3


From 12c77527e4138bc3b17d17b0e0c909e4fc84924f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 17:05:00 +0200
Subject: quota: Implement function for scanning active dquots

OCFS2 needs to scan all active dquots once in a while and sync quota
information among cluster nodes. Provide a helper function for it so
that it does not have to reimplement internally a list which VFS
already has. Moreover this function is probably going to be useful
for other clustered filesystems if they decide to use VFS quotas.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index ae8fd9e645c..075dc76904e 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -476,6 +476,41 @@ restart:
 	spin_unlock(&dq_list_lock);
 }
 
+/* Call callback for every active dquot on given filesystem */
+int dquot_scan_active(struct super_block *sb,
+		      int (*fn)(struct dquot *dquot, unsigned long priv),
+		      unsigned long priv)
+{
+	struct dquot *dquot, *old_dquot = NULL;
+	int ret = 0;
+
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	spin_lock(&dq_list_lock);
+	list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+		if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+			continue;
+		if (dquot->dq_sb != sb)
+			continue;
+		/* Now we have active dquot so we can just increase use count */
+		atomic_inc(&dquot->dq_count);
+		dqstats.lookups++;
+		spin_unlock(&dq_list_lock);
+		dqput(old_dquot);
+		old_dquot = dquot;
+		ret = fn(dquot, priv);
+		if (ret < 0)
+			goto out;
+		spin_lock(&dq_list_lock);
+		/* We are safe to continue now because our dquot could not
+		 * be moved out of the inuse list while we hold the reference */
+	}
+	spin_unlock(&dq_list_lock);
+out:
+	dqput(old_dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	return ret;
+}
+
 int vfs_quota_sync(struct super_block *sb, int type)
 {
 	struct list_head *dirty;
@@ -2318,6 +2353,7 @@ EXPORT_SYMBOL(vfs_quota_on_path);
 EXPORT_SYMBOL(vfs_quota_on_mount);
 EXPORT_SYMBOL(vfs_quota_disable);
 EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
 EXPORT_SYMBOL(vfs_quota_sync);
 EXPORT_SYMBOL(vfs_get_dqinfo);
 EXPORT_SYMBOL(vfs_set_dqinfo);
-- 
cgit v1.2.3


From 90e86a63eadf1a3b2f19b68d82150dc63fe01443 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 27 Aug 2008 22:30:28 +0200
Subject: ocfs2: Support nested transactions

OCFS2 can easily support nested transactions. We just have to
take care not to spoil statistics or acquire the semaphore unnecessarily.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 12b62a3cbf6..11a1178d5ee 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -256,11 +256,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 	BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
 	BUG_ON(max_buffs <= 0);
 
-	/* JBD might support this, but our journalling code doesn't yet. */
-	if (journal_current_handle()) {
-		mlog(ML_ERROR, "Recursive transaction attempted!\n");
-		BUG();
-	}
+	/* Nested transaction? Just return the handle... */
+	if (journal_current_handle())
+		return jbd2_journal_start(journal, max_buffs);
 
 	down_read(&osb->journal->j_trans_barrier);
 
@@ -285,16 +283,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
 int ocfs2_commit_trans(struct ocfs2_super *osb,
 		       handle_t *handle)
 {
-	int ret;
+	int ret, nested;
 	struct ocfs2_journal *journal = osb->journal;
 
 	BUG_ON(!handle);
 
+	nested = handle->h_ref > 1;
 	ret = jbd2_journal_stop(handle);
 	if (ret < 0)
 		mlog_errno(ret);
 
-	up_read(&journal->j_trans_barrier);
+	if (!nested)
+		up_read(&journal->j_trans_barrier);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 1a224ad11eeb190da4a123e156601aad1bb67f24 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 20 Aug 2008 15:43:36 +0200
Subject: ocfs2: Assign feature bits and system inodes to quota feature and
 quota files

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/Kconfig          |  2 ++
 fs/ocfs2/inode.c    |  2 ++
 fs/ocfs2/ocfs2_fs.h | 21 ++++++++++++++++++---
 fs/ocfs2/super.c    | 17 +++++++++++++++++
 4 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c1ce3d8831d..f9b6e2979aa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
 	select CONFIGFS_FS
 	select JBD2
 	select CRC32
+	select QUOTA
+	select QUOTA_TREE
 	help
 	  OCFS2 is a general purpose extent based shared disk cluster file
 	  system with many similarities to ext3. It supports 64 bit inode
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ec3497bafda..ec25d998419 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -283,6 +283,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+		inode->i_flags |= S_NOQUOTA;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
 		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		/* we can't actually hit this as read_inode can't
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7..06e3bd632ff 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -94,7 +94,7 @@
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP	OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
@@ -163,6 +163,12 @@
  */
 #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN	0x0001
 
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
+
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
  */
@@ -192,6 +198,7 @@
 #define OCFS2_HEARTBEAT_FL	(0x00000200)	/* Heartbeat area */
 #define OCFS2_CHAIN_FL		(0x00000400)	/* Chain allocator */
 #define OCFS2_DEALLOC_FL	(0x00000800)	/* Truncate log */
+#define OCFS2_QUOTA_FL		(0x00001000)	/* Quota file */
 
 /*
  * Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +336,17 @@ enum {
 #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
 	HEARTBEAT_SYSTEM_INODE,
 	GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+	USER_QUOTA_SYSTEM_INODE,
+	GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
 	ORPHAN_DIR_SYSTEM_INODE,
 	EXTENT_ALLOC_SYSTEM_INODE,
 	INODE_ALLOC_SYSTEM_INODE,
 	JOURNAL_SYSTEM_INODE,
 	LOCAL_ALLOC_SYSTEM_INODE,
 	TRUNCATE_LOG_SYSTEM_INODE,
+	LOCAL_USER_QUOTA_SYSTEM_INODE,
+	LOCAL_GROUP_QUOTA_SYSTEM_INODE,
 	NUM_SYSTEM_INODES
 };
 
@@ -349,6 +360,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[SLOT_MAP_SYSTEM_INODE]			= { "slot_map", 0, S_IFREG | 0644 },
 	[HEARTBEAT_SYSTEM_INODE]		= { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
 	[GLOBAL_BITMAP_SYSTEM_INODE]		= { "global_bitmap", 0, S_IFREG | 0644 },
+	[USER_QUOTA_SYSTEM_INODE]		= { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[GROUP_QUOTA_SYSTEM_INODE]		= { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 
 	/* Slot-specific system inodes (one copy per slot) */
 	[ORPHAN_DIR_SYSTEM_INODE]		= { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +369,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
 	[INODE_ALLOC_SYSTEM_INODE]		= { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
 	[JOURNAL_SYSTEM_INODE]			= { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
 	[LOCAL_ALLOC_SYSTEM_INODE]		= { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
-	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+	[TRUNCATE_LOG_SYSTEM_INODE]		= { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+	[LOCAL_USER_QUOTA_SYSTEM_INODE]		= { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+	[LOCAL_GROUP_QUOTA_SYSTEM_INODE]	= { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
 };
 
 /* Parameter passed from mount.ocfs2 to module */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 9e7accc68b4..41bb0197cf4 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -225,6 +225,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
 	return 0;
 }
 
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+	    && (ino == USER_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+		return 0;
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+	    && (ino == GROUP_QUOTA_SYSTEM_INODE
+		|| ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+		return 0;
+	return 1;
+}
+
 static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 {
 	struct inode *new = NULL;
@@ -251,6 +264,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 
 	for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
 	     i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
@@ -281,6 +296,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
+		if (!ocfs2_need_system_inode(osb, i))
+			continue;
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
-- 
cgit v1.2.3


From bbbd0eb34bf801dee01e345785959a75258f6567 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 Aug 2008 18:22:30 +0200
Subject: ocfs2: Mark system files as not subject to quota accounting

Mark system files as not subject to quota accounting. This prevents
possible recursions into quota code and thus deadlocks.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index ec25d998419..50dbc486ef7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -275,8 +275,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
-	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+		inode->i_flags |= S_NOQUOTA;
+	}
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
-- 
cgit v1.2.3


From 9e33d69f553aaf11377307e8d6f82deb3385e351 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 25 Aug 2008 19:56:50 +0200
Subject: ocfs2: Implementation of local and global quota file handling

For each quota type, each node has a local quota file. In this file it stores
changes users have made to disk usage via this node. Once in a while this
information is synced to the global file (and thus with other nodes) so that
limit enforcement works at least approximately.

Global quota files contain all the information about usage and limits. It's
mostly handled by the generic VFS code (which implements a trie of structures
inside a quota file). We only have to provide functions to convert structures
from on-disk format to in-memory one. We also have to provide wrappers for
various quota functions starting transactions and acquiring necessary cluster
locks before the actual IO is really started.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile          |   2 +
 fs/ocfs2/cluster/masklog.h |   1 +
 fs/ocfs2/dlmglue.c         | 146 +++++++
 fs/ocfs2/dlmglue.h         |  19 +
 fs/ocfs2/file.c            |   6 +-
 fs/ocfs2/file.h            |   3 +
 fs/ocfs2/inode.h           |   2 +
 fs/ocfs2/ocfs2_fs.h        | 103 +++++
 fs/ocfs2/ocfs2_lockid.h    |   5 +
 fs/ocfs2/quota.h           |  93 +++++
 fs/ocfs2/quota_global.c    | 919 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/quota_local.c     | 833 ++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/super.c           |  38 +-
 13 files changed, 2165 insertions(+), 5 deletions(-)
 create mode 100644 fs/ocfs2/quota.h
 create mode 100644 fs/ocfs2/quota_global.c
 create mode 100644 fs/ocfs2/quota_local.c

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index e9ef5d162db..7e4b361b755 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -35,6 +35,8 @@ ocfs2-objs := \
 	sysfile.o 		\
 	uptodate.o		\
 	ver.o			\
+	quota_local.o		\
+	quota_global.o		\
 	xattr.o
 
 ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c68047..7e72a81bc2d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x0000000100000000ULL /* sent to KERN_ERR */
 #define ML_NOTICE	0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 9f2a7f75d1b..058aa86490a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DLM_GLUE
 #include <cluster/masklog.h>
@@ -51,6 +52,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "uptodate.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
 static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
 
 /*
  * Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
 				     struct ocfs2_lock_res *lockres);
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
 
 #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
 
@@ -258,6 +262,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = {
+	.set_lvb	= ocfs2_set_qinfo_lvb,
+	.get_osb	= ocfs2_get_qinfo_osb,
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +289,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
 	return (struct ocfs2_dentry_lock *)lockres->l_priv;
 }
 
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{
+	BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO);
+
+	return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
 static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
 {
 	if (lockres->l_ops->get_osb)
@@ -507,6 +524,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
 	return OCFS2_SB(inode->i_sb);
 }
 
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_mem_dqinfo *info = lockres->l_priv;
+
+	return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
 static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
 {
 	struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +633,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 	lockres->l_flags |= OCFS2_LOCK_NOCACHE;
 }
 
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+			       struct ocfs2_mem_dqinfo *info)
+{
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type,
+			      0, lockres->l_name);
+	ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+				   OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+				   info);
+}
+
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
 	mlog_entry_void();
@@ -3445,6 +3480,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
 	return UNBLOCK_CONTINUE_POST;
 }
 
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_qinfo_lvb *lvb;
+	struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+
+	mlog_entry_void();
+
+	lvb = (struct ocfs2_qinfo_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
+	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
+	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+	lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
+
+	mlog_exit_void();
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	mlog_entry_void();
+	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
+		ocfs2_cluster_unlock(osb, lockres, level);
+	mlog_exit_void();
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{
+	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+					    oinfo->dqi_gi.dqi_type);
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct buffer_head *bh;
+	struct ocfs2_global_disk_dqinfo *gdinfo;
+	int status = 0;
+
+	if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) {
+		info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+		info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+		oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+		oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+		oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					be32_to_cpu(lvb->lvb_free_entry);
+	} else {
+		bh = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &status);
+		if (!bh) {
+			mlog_errno(status);
+			goto bail;
+		}
+		gdinfo = (struct ocfs2_global_disk_dqinfo *)
+					(bh->b_data + OCFS2_GLOBAL_INFO_OFF);
+		info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+		info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+		oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+		oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+		oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+		oinfo->dqi_gi.dqi_free_entry =
+					le32_to_cpu(gdinfo->dqi_free_entry);
+		brelse(bh);
+		ocfs2_track_lock_refresh(lockres);
+	}
+
+bail:
+	return status;
+}
+
+/* Lock quota info, this function expects at least shared lock on the quota file
+ * so that we can safely refresh quota info from disk. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+	int status = 0;
+
+	mlog_entry_void();
+
+	/* On RO devices, locking really isn't needed... */
+	if (ocfs2_is_hard_readonly(osb)) {
+		if (ex)
+			status = -EROFS;
+		goto bail;
+	}
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
+	/* OK, we have the lock but we need to refresh the quota info */
+	status = ocfs2_refresh_qinfo(oinfo);
+	if (status)
+		ocfs2_qinfo_unlock(oinfo, ex);
+	ocfs2_complete_lock_res_refresh(lockres, status);
+bail:
+	mlog_exit(status);
+	return status;
+}
+
 /*
  * This is the filesystem locking protocol.  It provides the lock handling
  * hooks for the underlying DLM.  It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b..3f8d9986b8e 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
 	__be32       lvb_reserved2;
 };
 
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb {
+	__u8	lvb_version;
+	__u8	lvb_reserved[3];
+	__be32	lvb_bgrace;
+	__be32	lvb_igrace;
+	__be32	lvb_syncms;
+	__be32	lvb_blocks;
+	__be32	lvb_free_blk;
+	__be32	lvb_free_entry;
+};
+
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
 struct ocfs2_file_private;
 void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
 			      struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+                               struct ocfs2_mem_dqinfo *info);
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);
 void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+
 
 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 41001d515fa..372d96505a7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -304,9 +304,9 @@ bail:
 	return status;
 }
 
-static int ocfs2_simple_size_update(struct inode *inode,
-				    struct buffer_head *di_bh,
-				    u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size)
 {
 	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5..172f9fbc9fc 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 			 struct ocfs2_alloc_context *data_ac,
 			 struct ocfs2_alloc_context *meta_ac,
 			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+			     struct buffer_head *di_bh,
+			     u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
 			  u64 zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index b79c371a9d2..eb3c302b38d 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 			   struct buffer_head *bh);
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+				int block, int *err, int reada);
 
 void ocfs2_set_inode_flags(struct inode *inode);
 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 06e3bd632ff..0a5ac790a62 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -883,6 +883,109 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
 	return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
 }
 
+/*
+ *  On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+	0x0cf52470, /* USRQUOTA */ \
+	0x0cf52471  /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files */
+struct ocfs2_disk_dqheader {
+	__le32 dqh_magic;	/* Magic number identifying file */
+	__le32 dqh_version;	/* Quota format version */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header) */
+struct ocfs2_global_disk_dqinfo {
+/*00*/	__le32 dqi_bgrace;	/* Grace time for space softlimit excess */
+	__le32 dqi_igrace;	/* Grace time for inode softlimit excess */
+	__le32 dqi_syncms;	/* Time after which we sync local changes to
+				 * global quota file */
+	__le32 dqi_blocks;	/* Number of blocks in quota file */
+/*10*/	__le32 dqi_free_blk;	/* First free block in quota file */
+	__le32 dqi_free_entry;	/* First block with free dquot entry in quota
+				 * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use. */
+struct ocfs2_global_disk_dqblk {
+/*00*/	__le32 dqb_id;          /* ID the structure belongs to */
+	__le32 dqb_use_count;   /* Number of nodes having reference to this structure */
+	__le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+/*10*/	__le64 dqb_isoftlimit;  /* preferred inode limit */
+	__le64 dqb_curinodes;   /* current # allocated inodes */
+/*20*/	__le64 dqb_bhardlimit;  /* absolute limit on disk space */
+	__le64 dqb_bsoftlimit;  /* preferred limit on disk space */
+/*30*/	__le64 dqb_curspace;    /* current space occupied */
+	__le64 dqb_btime;       /* time limit for excessive disk use */
+/*40*/	__le64 dqb_itime;       /* time limit for excessive inode use */
+	__le64 dqb_pad1;
+/*50*/	__le64 dqb_pad2;
+};
+
+/*
+ *  On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+	0x0cf524c0, /* USRQUOTA */ \
+	0x0cf524c1  /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+	0, \
+	0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN	0x0001	/* Quota file is empty (this should be after\
+				 * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header) */
+struct ocfs2_local_disk_dqinfo {
+	__le32 dqi_flags;	/* Flags for quota file */
+	__le32 dqi_chunks;	/* Number of chunks of quota structures
+				 * with a bitmap */
+	__le32 dqi_blocks;	/* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file */
+struct ocfs2_local_disk_chunk {
+	__le32 dqc_free;	/* Number of free entries in the bitmap */
+	u8 dqc_bitmap[0];	/* Bitmap of entries in the corresponding
+				 * chunk of quota file */
+};
+
+/* One entry in local quota file */
+struct ocfs2_local_disk_dqblk {
+/*00*/	__le64 dqb_id;		/* id this quota applies to */
+	__le64 dqb_spacemod;	/* Change in the amount of used space */
+/*10*/	__le64 dqb_inodemod;	/* Change in the amount of used inodes */
+};
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f..eb6f50c9cec 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_DENTRY,
 	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_LOCK_TYPE_FLOCK,
+	OCFS2_LOCK_TYPE_QINFO,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_FLOCK:
 			c = 'F';
 			break;
+		case OCFS2_LOCK_TYPE_QINFO:
+			c = 'Q';
+			break;
 		default:
 			c = '\0';
 	}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
 	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 	[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+	[OCFS2_LOCK_TYPE_QINFO] = "Quota",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 00000000000..1f1c86311b3
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,93 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot {
+	struct dquot dq_dquot;	/* Generic VFS dquot */
+	loff_t dq_local_off;	/* Offset in the local quota file */
+	struct ocfs2_quota_chunk *dq_chunk;	/* Chunk dquot is in */
+	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
+	s64 dq_origspace;	/* Last globally synced space usage */
+	s64 dq_originodes;	/* Last globally synced inode usage */
+};
+
+/* In-memory structure with quota header information */
+struct ocfs2_mem_dqinfo {
+	unsigned int dqi_type;		/* Quota type this structure describes */
+	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
+	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
+	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
+	struct list_head dqi_chunk;	/* List of chunks */
+	struct inode *dqi_gqinode;	/* Global quota file inode */
+	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
+	struct buffer_head *dqi_gqi_bh;	/* Buffer head with global quota file inode - set only if inode lock is obtained */
+	int dqi_gqi_count;		/* Number of holders of dqi_gqi_bh */
+	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
+	struct buffer_head *dqi_ibh;	/* Buffer with information header */
+	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
+};
+
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+	return container_of(dquot, struct ocfs2_dquot, dq_dquot);
+}
+
+struct ocfs2_quota_chunk {
+	struct list_head qc_chunk;	/* List of quotafile chunks */
+	int qc_num;			/* Number of quota chunk */
+	struct buffer_head *qc_headerbh;	/* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 0);
+}
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+	return __ocfs2_sync_dquot(dquot, 1);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
+					   int block, int *err);
+
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 00000000000..af8340c4536
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,919 @@
+/*
+ *  Implementation of operations over global quota file
+ */
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	/* Update from disk only entries not set by the admin */
+	if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+		m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
+		m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
+	if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+		m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit);
+		m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+	if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags))
+		m->dqb_btime = le64_to_cpu(d->dqb_btime);
+	if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags))
+		m->dqb_itime = le64_to_cpu(d->dqb_itime);
+	OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count);
+}
+
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct mem_dqblk *m = &dquot->dq_dqb;
+
+	d->dqb_id = cpu_to_le32(dquot->dq_id);
+	d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+	d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
+	d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
+	d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
+	d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit);
+	d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit);
+	d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
+	d->dqb_btime = cpu_to_le64(m->dqb_btime);
+	d->dqb_itime = cpu_to_le64(m->dqb_itime);
+}
+
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+	struct ocfs2_global_disk_dqblk *d = dp;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	if (qtree_entry_unused(&oinfo->dqi_gi, dp))
+		return 0;
+	return le32_to_cpu(d->dqb_id) == dquot->dq_id;
+}
+
+struct qtree_fmt_operations ocfs2_global_ops = {
+	.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
+	.disk2mem_dqblk = ocfs2_global_disk2memdqb,
+	.is_id = ocfs2_global_is_id,
+};
+
+
+struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
+					   int block, int *err)
+{
+	struct buffer_head *tmp = NULL;
+
+	*err = ocfs2_read_virt_blocks(inode, block, 1, &tmp, 0, NULL);
+	if (*err)
+		mlog_errno(*err);
+
+	return tmp;
+}
+
+static struct buffer_head *ocfs2_get_quota_block(struct inode *inode,
+						 int block, int *err)
+{
+	u64 pblock, pcount;
+	struct buffer_head *bh;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	*err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount,
+					   NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (*err) {
+		mlog_errno(*err);
+		return NULL;	/* *err carries the mapping failure */
+	}
+	bh = sb_getblk(inode->i_sb, pblock);	/* buffer for mapped block */
+	if (!bh) {
+		*err = -EIO;	/* sb_getblk() gave no buffer */
+		mlog_errno(*err);
+	}
+	return bh;	/* NULL here implies *err was set above */
+}
+
+/* Read data from global quotafile - avoid pagecache and such because we cannot
+ * afford acquiring the locks... We use quota cluster lock to serialize
+ * operations. Caller is responsible for acquiring it. */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	loff_t i_size = i_size_read(gqinode);
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0;
+	struct buffer_head *bh;
+	size_t toread, tocopy;
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy = min((size_t)(sb->s_blocksize - offset), toread);
+		bh = ocfs2_read_quota_block(gqinode, blk, &err);
+		if (!bh) {
+			mlog_errno(err);
+			return err;
+		}
+		memcpy(data, bh->b_data + offset, tocopy);
+		brelse(bh);
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
+}
+
+/* Write len bytes at off to the global quota file. Caller has started a
+ * transaction with enough credits. Returns bytes written or an error. */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0, new = 0;
+	struct buffer_head *bh;
+	handle_t *handle = journal_current_handle();
+
+	if (!handle) {
+		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+		     "because transaction was not started.\n",
+		     (unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+		WARN_ON(1);
+		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+	}
+
+	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+	if (gqinode->i_size < off + len) {
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+		err = ocfs2_simple_size_update(gqinode,
+					       oinfo->dqi_gqi_bh,
+					       off + len);
+		if (err < 0)
+			goto out;
+		new = 1;
+	}
+	/* Not rewriting whole block? */
+	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+	    !new) {
+		bh = ocfs2_read_quota_block(gqinode, blk, &err);
+		if (!bh) {
+			/* i_mutex is held: leave via out, which drops it */
+			goto out;
+		}
+		err = ocfs2_journal_access(handle, gqinode, bh,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	} else {
+		bh = ocfs2_get_quota_block(gqinode, blk, &err);
+		if (!bh) {
+			/* i_mutex is held: leave via out, which drops it */
+			goto out;
+		}
+		err = ocfs2_journal_access(handle, gqinode, bh,
+						OCFS2_JOURNAL_ACCESS_CREATE);
+	}
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
+	lock_buffer(bh);
+	if (new)
+		memset(bh->b_data, 0, sb->s_blocksize);
+	memcpy(bh->b_data + offset, data, len);
+	flush_dcache_page(bh->b_page);
+	unlock_buffer(bh);
+	ocfs2_set_buffer_uptodate(gqinode, bh);
+	err = ocfs2_journal_dirty(handle, bh);
+	brelse(bh);
+	if (err < 0)
+		goto out;
+out:
+	if (err) {
+		mutex_unlock(&gqinode->i_mutex);
+		mlog_errno(err);
+		return err;
+	}
+	gqinode->i_version++;
+	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+	mutex_unlock(&gqinode->i_mutex);
+	return len;
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	int status;
+	struct buffer_head *bh = NULL;
+
+	status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex);
+	if (status < 0)
+		return status;	/* holder count is left untouched */
+	spin_lock(&dq_data_lock);
+	if (!oinfo->dqi_gqi_count++)
+		oinfo->dqi_gqi_bh = bh;	/* first holder records the bh */
+	else
+		WARN_ON(bh != oinfo->dqi_gqi_bh);	/* must match */
+	spin_unlock(&dq_data_lock);
+	return 0;
+}
+
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+	brelse(oinfo->dqi_gqi_bh);	/* bh came from ocfs2_inode_lock() */
+	spin_lock(&dq_data_lock);
+	if (!--oinfo->dqi_gqi_count)
+		oinfo->dqi_gqi_bh = NULL;	/* last holder is gone */
+	spin_unlock(&dq_data_lock);
+}
+
+/* Read info header from global quota file; <0 on error, >0 (bytes) if ok */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+	struct inode *gqinode = NULL;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct ocfs2_global_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	int status;
+
+	mlog_entry_void();
+
+	/* Read global header */
+	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+			OCFS2_INVALID_SLOT);
+	if (!gqinode) {
+		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+			type);
+		status = -EINVAL;
+		goto out_err;
+	}
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+	oinfo->dqi_gqinode = gqinode;
+	status = ocfs2_lock_global_qf(oinfo, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+				      sizeof(struct ocfs2_global_disk_dqinfo),
+				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_unlock_global_qf(oinfo, 0);
+	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+		     status);
+		if (status >= 0)
+			status = -EIO;
+		mlog_errno(status);
+		goto out_err;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+						OCFS2_QBLK_RESERVED_SPACE;
+	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/* Write information to global quota file. Expects exclusive lock on quota
+ * file inode and quota info */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_global_disk_dqinfo dinfo;
+	ssize_t size;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	spin_unlock(&dq_data_lock);
+	dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+	dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+				     sizeof(struct ocfs2_global_disk_dqinfo),
+				     OCFS2_GLOBAL_INFO_OFF);
+	if (size != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot write global quota info structure\n");
+		if (size >= 0)
+			size = -EIO;
+		return size;
+	}
+	return 0;
+}
+
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	int err;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 1);
+	if (err < 0)
+		return err;
+	err = __ocfs2_global_write_info(sb, type);
+	ocfs2_qinfo_unlock(info, 1);
+	return err;
+}
+
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+	int err, err2, ex = 0;
+	struct ocfs2_mem_dqinfo *info =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+	err = ocfs2_qinfo_lock(info, 0);
+	if (err < 0)
+		goto out;
+	err = qtree_read_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	OCFS2_DQUOT(dquot)->dq_use_count++;
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	if (!dquot->dq_off) {	/* No real quota entry? */
+		/* Upgrade to exclusive lock for allocation */
+		err = ocfs2_qinfo_lock(info, 1);
+		if (err < 0)
+			goto out_qlock;
+		ex = 1;
+	}
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+		err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+		if (!err)
+			err = err2;
+	}
+out_qlock:
+	if (ex)
+		ocfs2_qinfo_unlock(info, 1);
+	ocfs2_qinfo_unlock(info, 0);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+	int err, err2;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_global_disk_dqblk dqblk;
+	s64 spacechange, inodechange;
+	time_t olditime, oldbtime;
+
+	err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				   sizeof(struct ocfs2_global_disk_dqblk),
+				   dquot->dq_off);
+	if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+		if (err >= 0) {
+			mlog(ML_ERROR, "Short read from global quota file "
+				       "(%u read)\n", err);
+			err = -EIO;
+		}
+		goto out;
+	}
+
+	/* Update space and inode usage. Also pick up the other information
+	 * from the global quota file so that we don't overwrite any changes
+	 * there. The merge is done under dq_data_lock. */
+	spin_lock(&dq_data_lock);
+	spacechange = dquot->dq_dqb.dqb_curspace -
+					OCFS2_DQUOT(dquot)->dq_origspace;
+	inodechange = dquot->dq_dqb.dqb_curinodes -
+					OCFS2_DQUOT(dquot)->dq_originodes;
+	olditime = dquot->dq_dqb.dqb_itime;
+	oldbtime = dquot->dq_dqb.dqb_btime;
+	ocfs2_global_disk2memdqb(dquot, &dqblk);
+	mlog(0, "Syncing global dquot %d space %lld+%lld, inodes %lld+%lld\n",
+	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, spacechange,
+	     dquot->dq_dqb.dqb_curinodes, inodechange);
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curspace += spacechange;
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+		dquot->dq_dqb.dqb_curinodes += inodechange;
+	/* Set properly space grace time... */
+	if (dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+		    oldbtime > 0) {
+			if (dquot->dq_dqb.dqb_btime > 0)
+				dquot->dq_dqb.dqb_btime =
+					min(dquot->dq_dqb.dqb_btime, oldbtime);
+			else
+				dquot->dq_dqb.dqb_btime = oldbtime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_btime = 0;
+		clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+	}
+	/* Set properly inode grace time... */
+	if (dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+		if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+		    olditime > 0) {
+			if (dquot->dq_dqb.dqb_itime > 0)
+				dquot->dq_dqb.dqb_itime =
+					min(dquot->dq_dqb.dqb_itime, olditime);
+			else
+				dquot->dq_dqb.dqb_itime = olditime;
+		}
+	} else {
+		dquot->dq_dqb.dqb_itime = 0;
+		clear_bit(DQ_INODES_B, &dquot->dq_flags);
+	}
+	/* All information is properly updated, clear the flags */
+	__clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+	__clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
+	OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+	spin_unlock(&dq_data_lock);
+	err = ocfs2_qinfo_lock(info, freeing);
+	if (err < 0) {
+		mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+			       " (type=%d, id=%u)\n", dquot->dq_type,
+			       (unsigned)dquot->dq_id);
+		goto out;
+	}
+	if (freeing)
+		OCFS2_DQUOT(dquot)->dq_use_count--;
+	err = qtree_write_dquot(&info->dqi_gi, dquot);
+	if (err < 0)
+		goto out_qlock;
+	if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) {
+		err = qtree_release_dquot(&info->dqi_gi, dquot);
+		if (info_dirty(sb_dqinfo(sb, type))) {
+			err2 = __ocfs2_global_write_info(sb, type);
+			if (!err)
+				err = err2;
+		}
+	}
+out_qlock:
+	ocfs2_qinfo_unlock(info, freeing);
+out:
+	if (err < 0)
+		mlog_errno(err);
+	return err;
+}
+
+/*
+ *  Wrappers for generic quota functions
+ */
+
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = dquot_commit(dquot);
+	ocfs2_commit_trans(osb, handle);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	/* We modify tree, leaf block, global info, local chunk header,
+	 * global and local inode */
+	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+	       2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_release(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo;
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+	struct ocfs2_dinode *lfe, *gfe;
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;	/* quota feature of this type not enabled */
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
+	lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
+	/* Needs dqi_gqi_bh, i.e. the global quota file lock held. We can
+	 * extend local + global file. In local file we can modify info, chunk
+	 * header and dquot block; in global file info, tree and leaf block */
+	return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
+	       ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
+	       3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+	handle_t *handle;
+	struct ocfs2_mem_dqinfo *oinfo =
+			sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	int status = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	/* We need an exclusive lock, because we're going to update use count
+	 * and instantiate possibly new dquot structure */
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb,
+		ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_acquire(dquot);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+	unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+			     (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+	int sync = 0;
+	int status;
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	handle_t *handle;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	dquot_mark_dquot_dirty(dquot);
+
+	/* In case user set some limits, sync dquot immediately to global
+	 * quota file so that information propagates quicker */
+	spin_lock(&dq_data_lock);
+	if (dquot->dq_flags & mask)
+		sync = 1;
+	spin_unlock(&dq_data_lock);
+	if (!sync) {
+		status = ocfs2_write_dquot(dquot);
+		goto out;
+	}
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = ocfs2_sync_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Now write updated local dquot structure */
+	status = dquot_commit(dquot);
+out_trans:
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This should happen only after set_dqinfo(). */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+	handle_t *handle;
+	int status = 0;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	mlog_entry_void();
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+	handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	status = dquot_commit_info(sb, type);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* This is difficult. We have to lock quota inode and start transaction
+ * in this function but we don't want to take the penalty of exclusive
+ * quota file lock when we are just going to use cached structures. So
+ * we just take read lock, check whether we have dquot cached and if so,
+ * we don't have to take the write lock... */
+static int ocfs2_dquot_initialize(struct inode *inode, int type)
+{
+	handle_t *handle = NULL;
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	qid_t id;
+
+	mlog_entry_void();
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		/* This is just a performance optimization not a reliable test.
+		 * Since we hold an inode lock, no one can actually release
+		 * the structure until we are finished with initialization. */
+		if (inode->i_dquot[cnt] != NODQUOT) {
+			ocfs2_unlock_global_qf(oinfo, 0);
+			continue;
+		}
+		/* When we have inode lock, we know that no dquot_release() can
+		 * run and thus we can safely check whether we need to
+		 * read+modify global file to get quota information or whether
+		 * our node already has it. */
+		if (cnt == USRQUOTA)
+			id = inode->i_uid;
+		else if (cnt == GRPQUOTA)
+			id = inode->i_gid;
+		else
+			BUG();
+		/* Obtain exclusion from quota off... */
+		down_write(&sb_dqopt(sb)->dqptr_sem);
+		exclusive = !dquot_is_cached(sb, id, cnt);
+		up_write(&sb_dqopt(sb)->dqptr_sem);
+		if (exclusive) {
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0) {
+				exclusive = 0;
+				mlog_errno(status);
+				goto out_ilock;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+					ocfs2_calc_qinit_credits(sb, cnt));
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_ilock;
+			}
+		}
+		dquot_initialize(inode, cnt);
+		if (exclusive) {
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+		ocfs2_unlock_global_qf(oinfo, 0);
+	}
+	mlog_exit(0);
+	return 0;
+out_ilock:
+	if (exclusive)
+		ocfs2_unlock_global_qf(oinfo, 1);
+	ocfs2_unlock_global_qf(oinfo, 0);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* Slow path of ocfs2_dquot_drop(): drop dquots under exclusive qf locks */
+static int ocfs2_dquot_drop_slow(struct inode *inode)
+{
+	int status = 0, cnt;	/* status: 0 unless a step fails */
+	int got_lock[MAXQUOTAS] = {0, 0};
+	handle_t *handle;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 1);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb),
+			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	dquot_drop(inode);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 1);
+		}
+	return status;
+}
+
+/* See the comment before ocfs2_dquot_initialize. */
+static int ocfs2_dquot_drop(struct inode *inode)
+{
+	int status = 0;
+	struct super_block *sb = inode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo;
+	int exclusive = 0;
+	int cnt;
+	int got_lock[MAXQUOTAS] = {0, 0};
+
+	mlog_entry_void();
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
+		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+		status = ocfs2_lock_global_qf(oinfo, 0);
+		if (status < 0)
+			goto out;
+		got_lock[cnt] = 1;
+	}
+	/* Lock against anyone releasing references so that when we check
+	 * we know we are not going to be last ones to release dquot */
+	down_write(&sb_dqopt(sb)->dqptr_sem);
+	/* Urgh, this is a terrible hack :( */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt] != NODQUOT &&
+		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
+			exclusive = 1;
+			break;
+		}
+	}
+	if (!exclusive)
+		dquot_drop_locked(inode);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+out:
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (got_lock[cnt]) {
+			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+			ocfs2_unlock_global_qf(oinfo, 0);
+		}
+	/* In case we bailed out because we had to do expensive locking
+	 * do it now... */
+	if (exclusive)
+		status = ocfs2_dquot_drop_slow(inode);
+	mlog_exit(status);
+	return status;
+}
+
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+	struct ocfs2_dquot *dquot =
+				kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+
+	if (!dquot)
+		return NULL;	/* allocation failed */
+	return &dquot->dq_dquot;	/* hand out the embedded VFS dquot */
+}
+
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{	/* Free the enclosing ocfs2_dquot that ocfs2_alloc_dquot() allocated */
+	kmem_cache_free(ocfs2_dquot_cachep, OCFS2_DQUOT(dquot));
+}
+
+struct dquot_operations ocfs2_quota_operations = {
+	.initialize	= ocfs2_dquot_initialize,
+	.drop		= ocfs2_dquot_drop,
+	.alloc_space	= dquot_alloc_space,
+	.alloc_inode	= dquot_alloc_inode,
+	.free_space	= dquot_free_space,
+	.free_inode	= dquot_free_inode,
+	.transfer	= dquot_transfer,
+	.write_dquot	= ocfs2_write_dquot,
+	.acquire_dquot	= ocfs2_acquire_dquot,
+	.release_dquot	= ocfs2_release_dquot,
+	.mark_dirty	= ocfs2_mark_dquot_dirty,
+	.write_info	= ocfs2_write_info,
+	.alloc_dquot	= ocfs2_alloc_dquot,
+	.destroy_dquot	= ocfs2_destroy_dquot,
+};
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 00000000000..55c3f2f98dc
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,833 @@
+/*
+ *  Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+
+/*
+ * How many local dquot structures fit into one block of the local quota
+ * file once the reserved tail (OCFS2_QBLK_RESERVED_SPACE) is set aside.
+ */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+	unsigned long usable = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return usable / sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/*
+ * Number of entry blocks covered by one chunk: the bits available in the
+ * chunk header's bitmap (one bit per entry) divided by the number of
+ * entries stored in a block.
+ */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+	unsigned long bitmap_bits;
+
+	bitmap_bits = (sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+		       OCFS2_QBLK_RESERVED_SPACE) << 3;
+	return bitmap_bits / ol_quota_entries_per_block(sb);
+}
+
+/* Total number of entries tracked by the bitmap of one full chunk */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+	unsigned int blocks = ol_chunk_blocks(sb);
+	unsigned int per_block = ol_quota_entries_per_block(sb);
+
+	return blocks * per_block;
+}
+
+/*
+ * Block number of the header of chunk @c in the quota file. Block 0 holds
+ * the local quota file info; every chunk then occupies one header block
+ * followed by ol_chunk_blocks() entry blocks.
+ */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+	unsigned int stride = ol_chunk_blocks(sb) + 1;
+
+	return 1 + stride * c;
+}
+
+/*
+ * Byte offset of a dquot structure in the local quota file.
+ * @c is the chunk number and @off the index of the entry within that
+ * chunk. The entry lives in entry block (off / epb) following the chunk
+ * header block, at slot (off % epb) inside that block.
+ */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+	/* Widen before shifting: the block index is computed in 32 bits and
+	 * the byte offset must not wrap before it becomes a loff_t */
+	loff_t blk = ol_quota_chunk_block(sb, c) + 1 + off / epb;
+
+	return (blk << sb->s_blocksize_bits) +
+		(off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Translate a byte offset in the quota file into a file block number */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+	loff_t blk = off >> sb->s_blocksize_bits;
+
+	return blk;
+}
+
+/* Byte position inside its block of the given quota file offset */
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+	int in_block_mask = (1 << sb->s_blocksize_bits) - 1;
+
+	return off & in_block_mask;
+}
+
+/*
+ * Index within chunk @c of the dquot structure stored at byte offset @off
+ * of the quota file: whole entry blocks past the chunk header times the
+ * entries per block, plus the slot inside the final block.
+ */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+	loff_t blk_in_chunk = (off >> sb->s_blocksize_bits) -
+			      ol_quota_chunk_block(sb, c) - 1;
+	unsigned int byte_in_blk = off & ((1 << sb->s_blocksize_bits) - 1);
+
+	return blk_in_chunk * epb +
+	       byte_in_blk / sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/*
+ * Modify a buffer of @inode inside a one-credit transaction.
+ * @modify is invoked with @bh and @private while the buffer is locked,
+ * after journal write access has been obtained; the buffer is then marked
+ * dirty in the journal and the transaction committed.
+ * Returns 0 on success or the negative status of the step that failed.
+ */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+		void (*modify)(struct buffer_head *, void *), void *private)
+{
+	struct super_block *sb = inode->i_sb;
+	handle_t *handle;
+	int status;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		return status;
+	}
+
+	status = ocfs2_journal_access(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0)
+		goto out_close;
+
+	lock_buffer(bh);
+	modify(bh, private);
+	unlock_buffer(bh);
+
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0)
+		goto out_close;
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+	return 0;
+
+out_close:
+	/* Log the failure, then close the handle we opened above */
+	mlog_errno(status);
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	return status;
+}
+
+/*
+ * Check whether we understand the format of the quota files of @type.
+ * Block 0 of both the local quota file (sb_dqopt(sb)->files[type]) and of
+ * the matching global quota system file must carry the expected magic and
+ * version. Returns 1 when both headers match, 0 on any mismatch or error.
+ */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+	unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+	unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+	unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+	unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct inode *linode = sb_dqopt(sb)->files[type];
+	struct inode *ginode = NULL;
+	struct ocfs2_disk_dqheader *dqhead;
+	struct buffer_head *bh;
+	unsigned int magic, version;
+	int status, ret = 0;
+
+	/* Header of the local quota file first */
+	bh = ocfs2_read_quota_block(linode, 0, &status);
+	if (!bh) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+			type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	magic = le32_to_cpu(dqhead->dqh_magic);
+	if (magic != lmagics[type]) {
+		mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+			" type=%d\n", magic, lmagics[type], type);
+		goto out_err;
+	}
+	version = le32_to_cpu(dqhead->dqh_version);
+	if (version != lversions[type]) {
+		mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+			" type=%d\n", version, lversions[type], type);
+		goto out_err;
+	}
+	brelse(bh);
+	bh = NULL;
+
+	/* Then the header of the global quota file */
+	ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+						OCFS2_INVALID_SLOT);
+	if (!ginode) {
+		mlog(ML_ERROR, "cannot get global quota file inode "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	/* Since the header is read only, we don't care about locking */
+	bh = ocfs2_read_quota_block(ginode, 0, &status);
+	if (!bh) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read global quota file header "
+				"(type=%d)\n", type);
+		goto out_err;
+	}
+	dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+	magic = le32_to_cpu(dqhead->dqh_magic);
+	if (magic != gmagics[type]) {
+		mlog(ML_ERROR, "global quota file magic does not match "
+			"(%u != %u), type=%d\n",
+			magic, gmagics[type], type);
+		goto out_err;
+	}
+	version = le32_to_cpu(dqhead->dqh_version);
+	if (version != gversions[type]) {
+		mlog(ML_ERROR, "global quota file version does not match "
+			"(%u != %u), type=%d\n",
+			version, gversions[type],
+			type);
+		goto out_err;
+	}
+
+	ret = 1;
+out_err:
+	/* bh and ginode may still be NULL here */
+	brelse(bh);
+	iput(ginode);
+	return ret;
+}
+
+/*
+ * Tear down a list of in-memory quota file chunks: unlink every chunk,
+ * drop the reference on its header buffer and return it to
+ * ocfs2_qf_chunk_cachep. The list is empty afterwards.
+ */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+	struct ocfs2_quota_chunk *victim;
+
+	while (!list_empty(head)) {
+		victim = list_entry(head->next, struct ocfs2_quota_chunk,
+				    qc_chunk);
+		list_del(&victim->qc_chunk);
+		brelse(victim->qc_headerbh);
+		kmem_cache_free(ocfs2_qf_chunk_cachep, victim);
+	}
+}
+
+/*
+ * Load the chunk headers (bitmaps) of the local quota file into memory.
+ * One ocfs2_quota_chunk is appended to @head for each of the
+ * ldinfo->dqi_chunks chunks, holding a reference to its header buffer.
+ * Returns 0 on success. On failure everything added so far is released
+ * and -ENOMEM or the read status is returned.
+ */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+			struct ocfs2_local_disk_dqinfo *ldinfo,
+			struct list_head *head)
+{
+	struct ocfs2_quota_chunk *newchunk;
+	int i, status;
+
+	INIT_LIST_HEAD(head);
+	for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) {
+		newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+		if (!newchunk) {
+			status = -ENOMEM;
+			goto out_release;
+		}
+		newchunk->qc_num = i;
+		newchunk->qc_headerbh = ocfs2_read_quota_block(inode,
+				ol_quota_chunk_block(inode->i_sb, i),
+				&status);
+		if (!newchunk->qc_headerbh) {
+			mlog_errno(status);
+			/* Not on the list yet, so free it by hand */
+			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
+			goto out_release;
+		}
+		list_add_tail(&newchunk->qc_chunk, head);
+	}
+	return 0;
+
+out_release:
+	ocfs2_release_local_quota_bitmaps(head);
+	return status;
+}
+
+/*
+ * ocfs2_modify_bh() callback: copy the in-memory quota info passed in
+ * @private (a struct mem_dqinfo) into the on-disk info structure located
+ * at OCFS2_LOCAL_INFO_OFF in @bh. Only the DQF_MASK bits of the flags are
+ * written. The copy is done under dq_data_lock.
+ */
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+	struct mem_dqinfo *minfo = private;
+	struct ocfs2_mem_dqinfo *ominfo = minfo->dqi_priv;
+	char *base = bh->b_data;
+	struct ocfs2_local_disk_dqinfo *dinfo =
+		(struct ocfs2_local_disk_dqinfo *)(base + OCFS2_LOCAL_INFO_OFF);
+
+	spin_lock(&dq_data_lock);
+	dinfo->dqi_flags = cpu_to_le32(minfo->dqi_flags & DQF_MASK);
+	dinfo->dqi_chunks = cpu_to_le32(ominfo->dqi_chunks);
+	dinfo->dqi_blocks = cpu_to_le32(ominfo->dqi_blocks);
+	spin_unlock(&dq_data_lock);
+}
+
+/*
+ * Read information header from quota file.
+ *
+ * Allocates the per-type ocfs2_mem_dqinfo and hangs it off
+ * info->dqi_priv, reads the global info via ocfs2_global_read_info(),
+ * takes the inode lock on the local quota file, reads its block 0, loads
+ * the chunk bitmaps and finally clears OLQF_CLEAN on disk to mark the
+ * file as in use. On success the inode lock and the buffers in
+ * oinfo->dqi_lqi_bh / oinfo->dqi_ibh stay held; ocfs2_local_free_info()
+ * releases them.
+ *
+ * Returns 0 on success and -1 on any failure (the detailed status is only
+ * logged).
+ */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	int status;
+	struct buffer_head *bh = NULL;
+	int locked = 0;
+
+	info->dqi_maxblimit = 0x7fffffffffffffffLL;
+	info->dqi_maxilimit = 0x7fffffffffffffffLL;
+	oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+	if (!oinfo) {
+		mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+			       " info.");
+		goto out_err;
+	}
+	info->dqi_priv = oinfo;
+	oinfo->dqi_type = type;
+	INIT_LIST_HEAD(&oinfo->dqi_chunk);
+	oinfo->dqi_lqi_bh = NULL;
+	oinfo->dqi_ibh = NULL;
+
+	/* NOTE(review): oinfo is not zeroed and dqi_gqinode / dqi_gqlock are
+	 * not set up in this function, yet out_err hands them to iput() and
+	 * the lockres helpers; presumably ocfs2_global_read_info()
+	 * initializes them before it can fail - confirm */
+	status = ocfs2_global_read_info(sb, type);
+	if (status < 0)
+		goto out_err;
+
+	/* Lock the local quota file; the inode buffer lands in dqi_lqi_bh
+	 * (last argument 1 - presumably an exclusive lock, confirm) */
+	status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	locked = 1;
+
+	/* Now read local header */
+	bh = ocfs2_read_quota_block(lqinode, 0, &status);
+	if (!bh) {
+		mlog_errno(status);
+		mlog(ML_ERROR, "failed to read quota file info header "
+			"(type=%d)\n", type);
+		goto out_err;
+	}
+	ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+						OCFS2_LOCAL_INFO_OFF);
+	info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+	oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+	/* Keep the header buffer around for later info writes */
+	oinfo->dqi_ibh = bh;
+
+	/* We crashed when using local quota file? */
+	if (!(info->dqi_flags & OLQF_CLEAN))
+		goto out_err;	/* So far we just bail out. Later we should resync here */
+
+	status = ocfs2_load_local_quota_bitmaps(sb_dqopt(sb)->files[type],
+						ldinfo,
+						&oinfo->dqi_chunk);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	/* Now mark quota file as used */
+	info->dqi_flags &= ~OLQF_CLEAN;
+	status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+
+	return 0;
+out_err:
+	/* Undo whatever was set up. bh is released once below even though
+	 * oinfo->dqi_ibh may alias it.
+	 * NOTE(review): info->dqi_priv still points at oinfo after the
+	 * kfree() - verify no caller touches it after a failure */
+	if (oinfo) {
+		iput(oinfo->dqi_gqinode);
+		ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+		ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+		brelse(oinfo->dqi_lqi_bh);
+		if (locked)
+			ocfs2_inode_unlock(lqinode, 1);
+		ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+		kfree(oinfo);
+	}
+	brelse(bh);
+	return -1;
+}
+
+/*
+ * Write the in-memory local quota info of @type into the cached header
+ * buffer (dqi_ibh) of the local quota file through ocfs2_modify_bh().
+ * Returns 0 on success and -1 on failure (the real status is logged).
+ */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	int status;
+
+	status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], oinfo->dqi_ibh,
+				 olq_update_info, info);
+	if (status >= 0)
+		return 0;
+
+	mlog_errno(status);
+	return -1;
+}
+
+/*
+ * Release the in-memory info of the local quota file of @type.
+ * Drops the global quota inode and its lock resource, verifies that every
+ * chunk reports all of its entries free, releases the chunk list and - if
+ * nothing was still in use - sets OLQF_CLEAN in the on-disk header.
+ * Finally the local quota inode is unlocked and the cached buffers and the
+ * info structure are freed. Always returns 0.
+ */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int all_free = 1;
+	int expected;
+	int status;
+
+	iput(oinfo->dqi_gqinode);
+	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+
+	list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+					(chunk->qc_headerbh->b_data);
+		/* Only the last chunk may cover fewer blocks than a full one */
+		if (chunk->qc_num >= oinfo->dqi_chunks - 1)
+			expected = (oinfo->dqi_blocks -
+				    ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+				   * ol_quota_entries_per_block(sb);
+		else
+			expected = ol_chunk_entries(sb);
+
+		if (le32_to_cpu(dchunk->dqc_free) == expected)
+			continue;
+		/* Not all entries free? Bug! */
+		mlog(ML_ERROR, "releasing quota file with used "
+				"entries (type=%d)\n", type);
+		all_free = 0;
+	}
+	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+	if (all_free) {
+		/* Mark local file as clean */
+		info->dqi_flags |= OLQF_CLEAN;
+		status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+					 oinfo->dqi_ibh,
+					 olq_update_info,
+					 info);
+		if (status < 0)
+			mlog_errno(status);
+	}
+
+	ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+	brelse(oinfo->dqi_ibh);
+	brelse(oinfo->dqi_lqi_bh);
+	kfree(oinfo);
+	return 0;
+}
+
+/*
+ * ocfs2_modify_bh() callback: store the local changes of the dquot passed
+ * in @private (a struct ocfs2_dquot) into its on-disk slot inside @bh.
+ * The slot is found from od->dq_local_off; the space and inode deltas are
+ * the current usage minus the values recorded in dq_origspace /
+ * dq_originodes, sampled under dq_data_lock.
+ */
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_dquot *od = private;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct super_block *sb = od->dq_dquot.dq_sb;
+
+	dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data
+		+ ol_dqblk_block_offset(sb, od->dq_local_off));
+
+	dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id);
+	spin_lock(&dq_data_lock);
+	dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace -
+					  od->dq_origspace);
+	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
+					  od->dq_originodes);
+	spin_unlock(&dq_data_lock);
+	/* The on-disk fields are little-endian: convert them back to CPU
+	 * order and to the type %lld expects before logging */
+	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+	     od->dq_dquot.dq_id,
+	     (long long)le64_to_cpu(dqblk->dqb_spacemod),
+	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
+}
+
+/*
+ * Write dquot to local quota file: read the block that holds the dquot's
+ * slot (od->dq_local_off) and update it through ocfs2_modify_bh() with
+ * olq_set_dquot(). Returns the status of the read or of the update.
+ */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
+	struct buffer_head *bh;
+	int status;
+
+	bh = ocfs2_read_quota_block(lqinode,
+				    ol_dqblk_file_block(sb, od->dq_local_off),
+				    &status);
+	if (!bh) {
+		mlog_errno(status);
+		return status;
+	}
+
+	status = ocfs2_modify_bh(lqinode, bh, olq_set_dquot, od);
+	if (status < 0)
+		mlog_errno(status);
+
+	brelse(bh);
+	return status;
+}
+
+/*
+ * Find free entry in local quota file.
+ * Walks the chunk list for the first chunk whose header reports free
+ * entries and searches its bitmap for a zero bit. On success the chunk is
+ * returned and *offset holds the entry index within it. Returns NULL when
+ * no chunk has free entries and ERR_PTR(-EIO) when the free count and the
+ * bitmap disagree.
+ */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk = NULL, *pos;
+	struct ocfs2_local_disk_chunk *dchunk = NULL, *hdr;
+	int bit, len;
+
+	list_for_each_entry(pos, &oinfo->dqi_chunk, qc_chunk) {
+		hdr = (struct ocfs2_local_disk_chunk *)
+						pos->qc_headerbh->b_data;
+		if (le32_to_cpu(hdr->dqc_free) > 0) {
+			chunk = pos;
+			dchunk = hdr;
+			break;
+		}
+	}
+	if (!chunk)
+		return NULL;
+
+	/* Only the last chunk may cover fewer blocks than a full one */
+	if (chunk->qc_num >= oinfo->dqi_chunks - 1)
+		len = (oinfo->dqi_blocks -
+		       ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+		      * ol_quota_entries_per_block(sb);
+	else
+		len = ol_chunk_entries(sb);
+
+	bit = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+	if (bit != len) {
+		*offset = bit;
+		return chunk;
+	}
+
+	/* We failed? */
+	mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+	     " entries free (type=%d)\n", chunk->qc_num,
+	     le32_to_cpu(dchunk->dqc_free), type);
+	return ERR_PTR(-EIO);
+}
+
+/*
+ * Add new chunk to the local quota file.
+ *
+ * Grows the file by two blocks (chunk header plus the first entry block),
+ * initializes the new chunk header inside a transaction, updates the
+ * local quota info and appends the in-memory chunk to oinfo->dqi_chunk.
+ * On success the new chunk is returned and *offset is set to 0, the first
+ * entry of the chunk. On failure ERR_PTR(status) is returned.
+ */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+							struct super_block *sb,
+							int type,
+							int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int status;
+	handle_t *handle;
+	struct buffer_head *bh = NULL;
+	u64 p_blkno;
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + 2 * sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + 2 * sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+	if (!chunk) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* The new chunk header goes into the first block past the old end */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access(handle, lqinode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	lock_buffer(bh);
+	/* dqc_free is a little-endian on-disk field (it is read with
+	 * le32_to_cpu() elsewhere), so convert before storing */
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+	memset(dchunk->dqc_bitmap, 0,
+	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+	       OCFS2_QBLK_RESERVED_SPACE);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	oinfo->dqi_blocks += 2;
+	oinfo->dqi_chunks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Number the chunk after the current last one. The list is empty
+	 * when this is the first chunk of the file, in which case there is
+	 * no predecessor to read qc_num from. */
+	if (list_empty(&oinfo->dqi_chunk))
+		chunk->qc_num = 0;
+	else
+		chunk->qc_num = list_entry(oinfo->dqi_chunk.prev,
+					   struct ocfs2_quota_chunk,
+					   qc_chunk)->qc_num + 1;
+	list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+	chunk->qc_headerbh = bh;
+	*offset = 0;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	brelse(bh);
+	/* chunk is still NULL when we fail before allocating it */
+	if (chunk)
+		kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+	return ERR_PTR(status);
+}
+
+/*
+ * Extend the local quota file so that a new dquot entry can be stored.
+ *
+ * When there is no chunk yet, or the last chunk already covers
+ * ol_chunk_blocks() entry blocks, a whole new chunk is added via
+ * ocfs2_local_quota_add_chunk(). Otherwise the file grows by one block
+ * that is accounted to the last chunk. Returns the chunk that gained
+ * space with *offset set to the first new entry in it, or ERR_PTR() on
+ * failure.
+ */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+						       struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_quota_chunk *chunk;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_local_disk_chunk *dchunk;
+	int epb = ol_quota_entries_per_block(sb);
+	unsigned int chunk_blocks;
+	int status;
+	handle_t *handle;
+
+	if (list_empty(&oinfo->dqi_chunk))
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+	/* Is the last chunk full? */
+	chunk = list_entry(oinfo->dqi_chunk.prev,
+			struct ocfs2_quota_chunk, qc_chunk);
+	/* Entry blocks the last chunk currently has (header not counted) */
+	chunk_blocks = oinfo->dqi_blocks -
+			ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+	if (ol_chunk_blocks(sb) == chunk_blocks)
+		return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_journal_access(handle, lqinode, chunk->qc_headerbh,
+				 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	/* The new block adds one block's worth of free entries to the chunk */
+	dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+	lock_buffer(chunk->qc_headerbh);
+	le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+	unlock_buffer(chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	oinfo->dqi_blocks++;
+	/* ocfs2_local_write_info() goes through ocfs2_modify_bh(), which
+	 * calls ocfs2_start_trans()/ocfs2_commit_trans() itself while this
+	 * handle is still open */
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	/* First entry of the freshly added block */
+	*offset = chunk_blocks * epb;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	return ERR_PTR(status);
+}
+
+/*
+ * ocfs2_modify_bh() callback: mark the entry whose index @private points
+ * to (an int) as used in the chunk header held in @bh and decrement the
+ * chunk's free-entry count.
+ * NOTE(review): not declared static although the only visible user is in
+ * this file - confirm there is no external reference.
+ */
+void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_local_disk_chunk *hdr =
+			(struct ocfs2_local_disk_chunk *)bh->b_data;
+	int *entry = private;
+
+	ocfs2_set_bit(*entry, hdr->dqc_bitmap);
+	le32_add_cpu(&hdr->dqc_free, -1);
+}
+
+/*
+ * Create dquot in the local file for given id.
+ * Picks a free entry (extending the file when none is left), records its
+ * location in the ocfs2 dquot, writes the initial on-disk structure and
+ * then marks the entry as allocated in the chunk bitmap.
+ * Returns 0 or a positive status from the helpers on success, a negative
+ * error otherwise.
+ */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct ocfs2_quota_chunk *chunk;
+	int offset;
+	int status;
+
+	chunk = ocfs2_find_free_entry(sb, type, &offset);
+	/* NULL means every chunk is full: grow the file instead */
+	if (!chunk)
+		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+	if (IS_ERR(chunk))
+		return PTR_ERR(chunk);
+
+	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+	od->dq_chunk = chunk;
+
+	/* Initialize dquot structure on disk */
+	status = ocfs2_local_write_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		return status;
+	}
+
+	/* Mark structure as allocated */
+	status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+				 &offset);
+	if (status < 0)
+		mlog_errno(status);
+	return status;
+}
+
+/*
+ * Load the dquot from the global quota file and then create its entry in
+ * the local quota file. Returns 0 on success or the negative status of
+ * the step that failed.
+ */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+	int status;
+
+	mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_global_read_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Now create entry in the local quota file */
+	status = ocfs2_create_local_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	/* Success is always reported as plain 0 */
+	status = 0;
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/* Release dquot structure from local quota file. ocfs2_release_dquot() has
+ * already started a transaction and obtained exclusive lock for global
+ * quota file.
+ *
+ * The local changes are first pushed to the global file through
+ * ocfs2_global_release_dquot(); then the entry's bit is cleared in the
+ * chunk bitmap and the chunk's free count is incremented under the
+ * running handle. DQ_READ_B is cleared on every exit path, including
+ * failures. Returns 0 on success or a negative status. */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+	int status;
+	int type = dquot->dq_type;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int offset;
+	/* Reuse the transaction the caller has already started */
+	handle_t *handle = journal_current_handle();
+
+	BUG_ON(!handle);
+	/* First write all local changes to global file */
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access(handle, sb_dqopt(sb)->files[type],
+			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	/* Translate the byte offset back into the entry index in the chunk */
+	offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num,
+					     od->dq_local_off);
+	dchunk = (struct ocfs2_local_disk_chunk *)
+			(od->dq_chunk->qc_headerbh->b_data);
+	/* Mark structure as freed */
+	lock_buffer(od->dq_chunk->qc_headerbh);
+	ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+	le32_add_cpu(&dchunk->dqc_free, 1);
+	unlock_buffer(od->dq_chunk->qc_headerbh);
+	status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = 0;
+out:
+	/* Clear the read bit so that next time someone uses this
+	 * dquot he reads fresh info from disk and allocates local
+	 * dquot structure */
+	clear_bit(DQ_READ_B, &dquot->dq_flags);
+	return status;
+}
+
+/*
+ * Quota format operations for ocfs2. Reading and freeing of file info and
+ * all dquot block operations go through the local quota file handlers in
+ * this file; writing of file info is routed to ocfs2_global_write_info().
+ */
+static struct quota_format_ops ocfs2_format_ops = {
+	.check_quota_file	= ocfs2_local_check_quota_file,
+	.read_file_info		= ocfs2_local_read_info,
+	.write_file_info	= ocfs2_global_write_info,
+	.free_file_info		= ocfs2_local_free_info,
+	.read_dqblk		= ocfs2_local_read_dquot,
+	.commit_dqblk		= ocfs2_local_write_dquot,
+	.release_dqblk		= ocfs2_local_release_dquot,
+};
+
+/*
+ * Quota format descriptor tying the QFMT_OCFS2 format id to the
+ * operations above; owned by this module.
+ */
+struct quota_format_type ocfs2_quota_format = {
+	.qf_fmt_id = QFMT_OCFS2,
+	.qf_ops = &ocfs2_format_ops,
+	.qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 41bb0197cf4..7bb83e41581 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -65,10 +65,13 @@
 #include "uptodate.h"
 #include "ver.h"
 #include "xattr.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
 static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 /* OCFS2 needs to schedule several differnt types of work which
  * require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -137,6 +140,8 @@ static const struct super_operations ocfs2_sops = {
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options   = ocfs2_show_options,
+	.quota_read	= ocfs2_quota_read,
+	.quota_write	= ocfs2_quota_write,
 };
 
 enum {
@@ -1104,6 +1109,7 @@ static int __init ocfs2_init(void)
 
 	ocfs2_set_locking_protocol();
 
+	status = register_quota_format(&ocfs2_quota_format);
 leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
@@ -1127,6 +1133,8 @@ static void __exit ocfs2_exit(void)
 		destroy_workqueue(ocfs2_wq);
 	}
 
+	unregister_quota_format(&ocfs2_quota_format);
+
 	debugfs_remove(ocfs2_debugfs_root);
 
 	ocfs2_free_mem_caches();
@@ -1242,8 +1250,27 @@ static int ocfs2_initialize_mem_caches(void)
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
 						SLAB_MEM_SPREAD),
 				       ocfs2_inode_init_once);
-	if (!ocfs2_inode_cachep)
+	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+					sizeof(struct ocfs2_dquot),
+					0,
+					(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+						SLAB_MEM_SPREAD),
+					NULL);
+	ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+					sizeof(struct ocfs2_quota_chunk),
+					0,
+					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+					NULL);
+	if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+	    !ocfs2_qf_chunk_cachep) {
+		if (ocfs2_inode_cachep)
+			kmem_cache_destroy(ocfs2_inode_cachep);
+		if (ocfs2_dquot_cachep)
+			kmem_cache_destroy(ocfs2_dquot_cachep);
+		if (ocfs2_qf_chunk_cachep)
+			kmem_cache_destroy(ocfs2_qf_chunk_cachep);
 		return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1252,8 +1279,15 @@ static void ocfs2_free_mem_caches(void)
 {
 	if (ocfs2_inode_cachep)
 		kmem_cache_destroy(ocfs2_inode_cachep);
-
 	ocfs2_inode_cachep = NULL;
+
+	if (ocfs2_dquot_cachep)
+		kmem_cache_destroy(ocfs2_dquot_cachep);
+	ocfs2_dquot_cachep = NULL;
+
+	if (ocfs2_qf_chunk_cachep)
+		kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+	ocfs2_qf_chunk_cachep = NULL;
 }
 
 static int ocfs2_get_sector(struct super_block *sb,
-- 
cgit v1.2.3


From a90714c150e3ce677c57a9dac3ab1ec342c75a95 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 9 Oct 2008 19:38:40 +0200
Subject: ocfs2: Add quota calls for allocation and freeing of inodes and space

Add quota calls for allocation and freeing of inodes and space, also update
estimates on number of needed credits for a transaction. Move out inode
allocation from ocfs2_mknod_locked() because vfs_dq_init() must be called
outside of a transaction.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c   | 20 +++++++++++--
 fs/ocfs2/aops.c    | 16 +++++++++--
 fs/ocfs2/dir.c     | 24 ++++++++++++++--
 fs/ocfs2/file.c    | 72 ++++++++++++++++++++++++++++++++++++++++++----
 fs/ocfs2/inode.c   | 10 +++++--
 fs/ocfs2/journal.h | 84 ++++++++++++++++++++++++++++++++++++++++++++----------
 fs/ocfs2/namei.c   | 44 +++++++++++++++++++++++++---
 fs/ocfs2/xattr.c   | 14 +++++----
 8 files changed, 245 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 69d67ab069b..84a7bd4db5d 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/swap.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -5322,7 +5323,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
@@ -6552,6 +6553,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	vfs_dq_free_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
 				      clusters_to_del;
@@ -6860,6 +6863,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	struct page **pages = NULL;
 	loff_t end = osb->s_clustersize;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	has_data = i_size_read(inode) ? 1 : 0;
 
@@ -6879,7 +6883,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+	handle = ocfs2_start_trans(osb,
+				   ocfs2_inline_to_extents_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
@@ -6898,6 +6903,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		unsigned int page_end;
 		u64 phys;
 
+		if (vfs_dq_alloc_space_nodirty(inode,
+				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
+			ret = -EDQUOT;
+			goto out_commit;
+		}
+		did_quota = 1;
+
 		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
 					   &num);
 		if (ret) {
@@ -6971,6 +6983,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	}
 
 out_commit:
+	if (ret < 0 && did_quota)
+		vfs_dq_free_space_nodirty(inode,
+					  ocfs2_clusters_to_bytes(osb->sb, 1));
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6af79adb2ec..6b647ec87bb 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -1730,6 +1731,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	wc->w_handle = handle;
 
+	if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
 	/*
 	 * We don't want this to fail in ocfs2_write_end(), so do it
 	 * here.
@@ -1738,7 +1744,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	/*
@@ -1751,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 					 mmap_page);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
 					  len);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_commit;
+		goto out_quota;
 	}
 
 	if (data_ac)
@@ -1770,6 +1776,10 @@ success:
 	*pagep = wc->w_target_page;
 	*fsdata = wc;
 	return 0;
+out_quota:
+	if (clusters_to_alloc)
+		vfs_dq_free_space(inode,
+			  ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d83cff95759..3708fe482e3 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -1210,9 +1211,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int blocks_wanted,
 				   struct buffer_head **first_block_bh)
 {
-	int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
 	u32 alloc, bit_off, len;
 	struct super_block *sb = dir->i_sb;
+	int ret, credits = ocfs2_inline_to_extents_credits(sb);
 	u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
 	struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1221,6 +1222,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	handle_t *handle;
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
 
@@ -1258,6 +1260,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 		goto out_sem;
 	}
 
+	if (vfs_dq_alloc_space_nodirty(dir,
+				ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+		ret = -EDQUOT;
+		goto out_commit;
+	}
+	did_quota = 1;
 	/*
 	 * Try to claim as many clusters as the bitmap can give though
 	 * if we only get one now, that's enough to continue. The rest
@@ -1380,6 +1388,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	dirdata_bh = NULL;
 
 out_commit:
+	if (ret < 0 && did_quota)	/* release exactly what was charged */
+		vfs_dq_free_space_nodirty(dir,
+			ocfs2_clusters_to_bytes(osb->sb, alloc));
 	ocfs2_commit_trans(osb, handle);
 
 out_sem:
@@ -1404,7 +1415,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct buffer_head **new_bh)
 {
 	int status;
-	int extend;
+	int extend, did_quota = 0;
 	u64 p_blkno, v_blkno;
 
 	spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1414,6 +1425,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	if (extend) {
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
+		if (vfs_dq_alloc_space_nodirty(dir,
+					ocfs2_clusters_to_bytes(sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
+
 		status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
 					      1, 0, parent_fe_bh, handle,
 					      data_ac, meta_ac, NULL);
@@ -1439,6 +1457,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 	}
 	status = 0;
 bail:
+	if (did_quota && status < 0)
+		vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
 	mlog_exit(status);
 	return status;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 372d96505a7..9374d374a26 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/writeback.h>
 #include <linux/falloc.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -57,6 +58,7 @@
 #include "super.h"
 #include "xattr.h"
 #include "acl.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -534,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
+	int did_quota = 0;
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
@@ -577,6 +580,13 @@ restart_all:
 	}
 
 restarted_transaction:
+	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+	    clusters_to_add))) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota = 1;
+
 	/* reserve a write to the file entry early on - that we if we
 	 * run out of credits in the allocation path, we can still
 	 * update i_size. */
@@ -614,6 +624,10 @@ restarted_transaction:
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	/* Release unused quota reservation */
+	vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+	did_quota = 0;
 
 	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
@@ -646,6 +660,9 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
 
 leave:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space(inode,
+			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -877,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	struct buffer_head *bh = NULL;
 	handle_t *handle = NULL;
+	int locked[MAXQUOTAS] = {0, 0};
+	int credits, qtype;
+	struct ocfs2_mem_dqinfo *oinfo;
 
 	mlog_entry("(0x%p, '%.*s')\n", dentry,
 	           dentry->d_name.len, dentry->d_name.name);
@@ -947,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto bail_unlock;
+	if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+	    (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+		credits = OCFS2_INODE_UPDATE_CREDITS;
+		if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+			oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+				ocfs2_calc_qdel_credits(sb, USRQUOTA);
+			locked[USRQUOTA] = 1;
+		}
+		if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+		    && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+		    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+			oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+			status = ocfs2_lock_global_qf(oinfo, 1);
+			if (status < 0)
+				goto bail_unlock;
+			credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+				   ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+			locked[GRPQUOTA] = 1;
+		}
+		handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
+		status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+		if (status < 0)
+			goto bail_commit;
+	} else {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto bail_unlock;
+		}
 	}
 
 	/*
@@ -974,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
+	for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+		if (!locked[qtype])
+			continue;
+		oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+		ocfs2_unlock_global_qf(oinfo, 1);
+	}
 	ocfs2_inode_unlock(inode, 1);
 bail_unlock_rw:
 	if (size_change)
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 50dbc486ef7..288512c9dbc 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/quotaops.h>
 
 #include <asm/byteorder.h>
 
@@ -603,7 +604,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+	handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+					ocfs2_quota_trans_credits(inode->i_sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
@@ -635,6 +637,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	ocfs2_remove_from_cache(inode, di_bh);
+	vfs_dq_free_inode(inode);
 
 	status = ocfs2_free_dinode(handle, inode_alloc_inode,
 				   inode_alloc_bh, di);
@@ -917,7 +920,10 @@ void ocfs2_delete_inode(struct inode *inode)
 
 	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
 
-	if (is_bad_inode(inode)) {
+	/* When we fail in read_inode() we mark inode as bad. The second test
+	 * catches the case when inode allocation fails before allocating
+	 * a block for inode. */
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
 		mlog(0, "Skipping delete of bad inode\n");
 		goto bail;
 	}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 8203980fefe..ee08e9c1fc1 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -284,6 +284,37 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
 
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to a set_info() quotactl, so reserve credits for that write as well.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+	int credits = 0;
+
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+		credits += OCFS2_QWRITE_CREDITS;
+	return credits;
+}
+
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
+
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS	(OCFS2_INODE_UPDATE_CREDITS + 1)
 
@@ -294,8 +325,11 @@ int                  ocfs2_journal_dirty(handle_t *handle,
  * prev. group desc. if we relink. */
 #define OCFS2_SUBALLOC_ALLOC (3)
 
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC		\
-					 + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+	return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* dinode + group descriptor update. We don't relink on free yet. */
 #define OCFS2_SUBALLOC_FREE  (2)
@@ -304,16 +338,23 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE 		      \
 					 + OCFS2_TRUNCATE_LOG_UPDATE)
 
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+	return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
  * bitmap block for the new bit) */
 #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
 
 /* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC                         \
-			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+ * group descriptor + mkdir/symlink blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb)
+{
+	return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* local alloc metadata change + main bitmap updates */
 #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS                 \
@@ -323,13 +364,21 @@ int                  ocfs2_journal_dirty(handle_t *handle,
  * for the dinode, one for the new block. */
 #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
 
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS  (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+	return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+	       ocfs2_quota_trans_credits(sb);
+}
 
 /* inode + dir inode (if we unlink a dir), + dir entry block + orphan
  * dir inode link */
-#define OCFS2_UNLINK_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 1             \
-			      + OCFS2_LINK_CREDITS)
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+	/* The quota update from ocfs2_link_credits is unused here... */
+	return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
 
 /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
  * inode alloc group descriptor */
@@ -338,8 +387,10 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3              \
-			     + OCFS2_UNLINK_CREDITS)
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+	return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
 
 /* global bitmap dinode, group desc., relinked group,
  * suballocator dinode, group desc., relinked group,
@@ -377,18 +428,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
 	 * credit for the dinode there. */
 	extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
 
-	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+	return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+	       ocfs2_quota_trans_credits(sb);
 }
 
 static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
 {
-	int blocks = OCFS2_MKNOD_CREDITS;
+	int blocks = ocfs2_mknod_credits(sb);
 
 	/* links can be longer than one block so we may update many
 	 * within our single allocated extent. */
 	blocks += ocfs2_clusters_to_blocks(sb, 1);
 
-	return blocks;
+	return blocks + ocfs2_quota_trans_credits(sb);
 }
 
 static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -425,6 +477,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* update to the truncate log. */
 	credits += OCFS2_TRUNCATE_LOG_UPDATE;
 
+	credits += ocfs2_quota_trans_credits(sb);
+
 	return credits;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0134bafdab9..6173807ba23 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
@@ -212,6 +213,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
 	} else
 		inode->i_gid = current_fsgid();
 	inode->i_mode = mode;
+	vfs_dq_init(inode);
 	return inode;
 }
 
@@ -236,6 +238,7 @@ static int ocfs2_mknod(struct inode *dir,
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
+	int did_quota_inode = 0;
 
 	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
 		   (unsigned long)dev, dentry->d_name.len,
@@ -323,7 +326,8 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS + xattr_credits);
+	handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+				   xattr_credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -331,6 +335,15 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	}
 
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto leave;
+	}
+	did_quota_inode = 1;
+
 	/* do the real work now. */
 	status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
@@ -399,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
@@ -641,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		err = PTR_ERR(handle);
 		handle = NULL;
@@ -828,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -1234,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		handle = NULL;
@@ -1555,6 +1570,7 @@ static int ocfs2_symlink(struct inode *dir,
 	struct ocfs2_security_xattr_info si = {
 		.enable = 1,
 	};
+	int did_quota = 0, did_quota_inode = 0;
 
 	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
 		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1648,6 +1664,15 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	}
 
+	/* We don't use standard VFS wrapper because we don't want vfs_dq_init
+	 * to be called. */
+	if (sb_any_quota_active(osb->sb) &&
+	    osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+		status = -EDQUOT;
+		goto bail;
+	}
+	did_quota_inode = 1;
+
 	status = ocfs2_mknod_locked(osb, dir, inode, dentry,
 				    0, &new_fe_bh, parent_fe_bh, handle,
 				    inode_ac);
@@ -1663,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
+		if (vfs_dq_alloc_space_nodirty(inode,
+		    ocfs2_clusters_to_bytes(osb->sb, 1))) {
+			status = -EDQUOT;
+			goto bail;
+		}
+		did_quota = 1;
 		status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
 					      new_fe_bh,
 					      handle, data_ac, NULL,
@@ -1728,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
 	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
+	if (status < 0 && did_quota)
+		vfs_dq_free_space_nodirty(inode,
+					ocfs2_clusters_to_bytes(osb->sb, 1));
+	if (status < 0 && did_quota_inode)
+		vfs_dq_free_inode(inode);
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 9cb71e1c7c6..3b9634c7d29 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1665,7 +1665,8 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	ctxt.handle = ocfs2_start_trans(osb,
+					ocfs2_remove_extent_credits(osb->sb));
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
@@ -2233,7 +2234,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 	 */
 	if (!xi->value) {
 		if (!ocfs2_xattr_is_local(xe))
-			credits += OCFS2_REMOVE_EXTENT_CREDITS;
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
 
 		goto out;
 	}
@@ -2250,7 +2251,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 		 */
 		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
 			clusters_add += new_clusters;
-			credits += OCFS2_REMOVE_EXTENT_CREDITS +
+			credits += ocfs2_remove_extent_credits(inode->i_sb) +
 				    OCFS2_INODE_UPDATE_CREDITS;
 			if (!ocfs2_xattr_is_local(xe))
 				credits += ocfs2_calc_extend_credits(
@@ -2275,7 +2276,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 			xv = &def_xv.xv;
 
 		if (old_clusters >= new_clusters) {
-			credits += OCFS2_REMOVE_EXTENT_CREDITS;
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
 			goto out;
 		} else {
 			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
@@ -4750,7 +4751,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 		}
 	}
 
-	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
 	if (IS_ERR(handle)) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5109,7 +5110,8 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	ctxt.handle = ocfs2_start_trans(osb,
+					ocfs2_remove_extent_credits(osb->sb));
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
-- 
cgit v1.2.3


From 171bf93ce11f4c9929fdce6ce63df8da2f3c4475 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 20 Oct 2008 15:36:47 +0200
Subject: ocfs2: Periodic quota syncing

This patch creates a work queue for periodic syncing of locally cached quota
information to the global quota files. The delayed work item re-queues itself
after each run to get the periodic behavior.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/quota.h        |  5 +++
 fs/ocfs2/quota_global.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/quota_local.c  |  4 +++
 fs/ocfs2/super.c        |  7 ++++
 4 files changed, 101 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1f1c86311b3..e2233d51507 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -39,6 +39,7 @@ struct ocfs2_mem_dqinfo {
 	unsigned int dqi_chunks;	/* Number of chunks in local quota file */
 	unsigned int dqi_blocks;	/* Number of blocks allocated for local quota file */
 	unsigned int dqi_syncms;	/* How often should we sync with other nodes */
+	unsigned int dqi_syncjiff;	/* Precomputed dqi_syncms in jiffies */
 	struct list_head dqi_chunk;	/* List of chunks */
 	struct inode *dqi_gqinode;	/* Global quota file inode */
 	struct ocfs2_lock_res dqi_gqlock;	/* Lock protecting quota information structure */
@@ -47,6 +48,7 @@ struct ocfs2_mem_dqinfo {
 	struct buffer_head *dqi_lqi_bh;	/* Buffer head with local quota file inode */
 	struct buffer_head *dqi_ibh;	/* Buffer with information header */
 	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
+	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
 };
 
 static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
@@ -90,4 +92,7 @@ struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
 extern struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
 
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+
 #endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index af8340c4536..adf53508bdb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -1,10 +1,14 @@
 /*
  *  Implementation of operations over global quota file
  */
+#include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/quota.h>
 #include <linux/quotaops.h>
 #include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
 
 #define MLOG_MASK_PREFIX ML_QUOTA
 #include <cluster/masklog.h>
@@ -20,6 +24,10 @@
 #include "uptodate.h"
 #include "quota.h"
 
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+
+static void qsync_work_fn(struct work_struct *work);
+
 static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
 {
 	struct ocfs2_global_disk_dqblk *d = dp;
@@ -313,6 +321,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
 	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
 	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
 	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
 	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
 	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -320,6 +329,10 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
 						OCFS2_QBLK_RESERVED_SPACE;
 	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+
 out_err:
 	mlog_exit(status);
 	return status;
@@ -519,6 +532,61 @@ out:
 	return err;
 }
 
+/*
+ *  Functions for periodic syncing of dquots with global file
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+	handle_t *handle;
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int status = 0;
+
+	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+		   dquot->dq_type, type, sb->s_id);
+	if (type != dquot->dq_type)
+		goto out;
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out_ilock;
+	}
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	status = ocfs2_sync_dquot(dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	if (status < 0)
+		mlog_errno(status);
+	/* We have to write local structure as well... */
+	dquot_mark_dquot_dirty(dquot);
+	status = dquot_commit(dquot);
+	if (status < 0)
+		mlog_errno(status);
+	ocfs2_commit_trans(osb, handle);
+out_ilock:
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	mlog_exit(status);
+	return status;
+}
+
+static void qsync_work_fn(struct work_struct *work)
+{
+	struct ocfs2_mem_dqinfo *oinfo = container_of(work,
+						      struct ocfs2_mem_dqinfo,
+						      dqi_sync_work.work);
+	struct super_block *sb = oinfo->dqi_gqinode->i_sb;
+
+	dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+}
+
 /*
  *  Wrappers for generic quota functions
  */
@@ -917,3 +985,20 @@ struct dquot_operations ocfs2_quota_operations = {
 	.alloc_dquot	= ocfs2_alloc_dquot,
 	.destroy_dquot	= ocfs2_destroy_dquot,
 };
+
+int ocfs2_quota_setup(void)
+{
+	ocfs2_quota_wq = create_workqueue("o2quot");
+	if (!ocfs2_quota_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+void ocfs2_quota_shutdown(void)
+{
+	if (ocfs2_quota_wq) {
+		flush_workqueue(ocfs2_quota_wq);
+		destroy_workqueue(ocfs2_quota_wq);
+		ocfs2_quota_wq = NULL;
+	}
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 55c3f2f98dc..40e82b48313 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -368,6 +368,10 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
 	int mark_clean = 1, len;
 	int status;
 
+	/* At this point we know there are no more dquots and thus
+	 * even if a sync is still queued on the quota workqueue, it won't
+	 * find any dquots and will return without doing anything */
+	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
 	iput(oinfo->dqi_gqinode);
 	ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
 	ocfs2_lock_res_free(&oinfo->dqi_gqlock);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7bb83e41581..60f1d29421a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1107,11 +1107,16 @@ static int __init ocfs2_init(void)
 		mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
 	}
 
+	status = ocfs2_quota_setup();
+	if (status)
+		goto leave;
+
 	ocfs2_set_locking_protocol();
 
 	status = register_quota_format(&ocfs2_quota_format);
 leave:
 	if (status < 0) {
+		ocfs2_quota_shutdown();
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
 	}
@@ -1128,6 +1133,8 @@ static void __exit ocfs2_exit(void)
 {
 	mlog_entry_void();
 
+	ocfs2_quota_shutdown();
+
 	if (ocfs2_wq) {
 		flush_workqueue(ocfs2_wq);
 		destroy_workqueue(ocfs2_wq);
-- 
cgit v1.2.3


From 2205363dce7447b8e85f1ead14387664c1a98753 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 20 Oct 2008 23:50:38 +0200
Subject: ocfs2: Implement quota recovery

Implement functions for quota recovery after a crash. The functions simply
read the local quota file and sync its information to the global quota file.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c      | 108 +++++++++---
 fs/ocfs2/journal.h      |   1 +
 fs/ocfs2/ocfs2.h        |   4 +-
 fs/ocfs2/quota.h        |  21 +++
 fs/ocfs2/quota_global.c |   1 -
 fs/ocfs2/quota_local.c  | 425 +++++++++++++++++++++++++++++++++++++++++++++++-
 6 files changed, 528 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 11a1178d5ee..c60242018d9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -45,6 +45,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "quota.h"
 
 #include "buffer_head_io.h"
 
@@ -52,7 +53,7 @@ DEFINE_SPINLOCK(trans_inc_lock);
 
 static int ocfs2_force_read_journal(struct inode *inode);
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num);
+			      int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
@@ -857,6 +858,7 @@ struct ocfs2_la_recovery_item {
 	int			lri_slot;
 	struct ocfs2_dinode	*lri_la_dinode;
 	struct ocfs2_dinode	*lri_tl_dinode;
+	struct ocfs2_quota_recovery *lri_qrec;
 };
 
 /* Does the second half of the recovery process. By this point, the
@@ -877,6 +879,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_super *osb = journal->j_osb;
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_la_recovery_item *item, *n;
+	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 
 	mlog_entry_void();
@@ -922,6 +925,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		if (ret < 0)
 			mlog_errno(ret);
 
+		qrec = item->lri_qrec;
+		if (qrec) {
+			mlog(0, "Recovering quota files\n");
+			ret = ocfs2_finish_quota_recovery(osb, qrec,
+							  item->lri_slot);
+			if (ret < 0)
+				mlog_errno(ret);
+			/* Recovery info is already freed now */
+		}
+
 		kfree(item);
 	}
 
@@ -935,7 +948,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 					    int slot_num,
 					    struct ocfs2_dinode *la_dinode,
-					    struct ocfs2_dinode *tl_dinode)
+					    struct ocfs2_dinode *tl_dinode,
+					    struct ocfs2_quota_recovery *qrec)
 {
 	struct ocfs2_la_recovery_item *item;
 
@@ -950,6 +964,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 		if (tl_dinode)
 			kfree(tl_dinode);
 
+		if (qrec)
+			ocfs2_free_quota_recovery(qrec);
+
 		mlog_errno(-ENOMEM);
 		return;
 	}
@@ -958,6 +975,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 	item->lri_la_dinode = la_dinode;
 	item->lri_slot = slot_num;
 	item->lri_tl_dinode = tl_dinode;
+	item->lri_qrec = qrec;
 
 	spin_lock(&journal->j_lock);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -977,6 +995,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 		ocfs2_queue_recovery_completion(journal,
 						osb->slot_num,
 						osb->local_alloc_copy,
+						NULL,
 						NULL);
 		ocfs2_schedule_truncate_log_flush(osb, 0);
 
@@ -985,11 +1004,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	}
 }
 
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+	if (osb->quota_rec) {
+		ocfs2_queue_recovery_completion(osb->journal,
+						osb->slot_num,
+						NULL,
+						NULL,
+						osb->quota_rec);
+		osb->quota_rec = NULL;
+	}
+}
+
 static int __ocfs2_recovery_thread(void *arg)
 {
-	int status, node_num;
+	int status, node_num, slot_num;
 	struct ocfs2_super *osb = arg;
 	struct ocfs2_recovery_map *rm = osb->recovery_map;
+	int *rm_quota = NULL;
+	int rm_quota_used = 0, i;
+	struct ocfs2_quota_recovery *qrec;
 
 	mlog_entry_void();
 
@@ -998,6 +1032,11 @@ static int __ocfs2_recovery_thread(void *arg)
 		goto bail;
 	}
 
+	rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+	if (!rm_quota) {
+		status = -ENOMEM;
+		goto bail;
+	}
 restart:
 	status = ocfs2_super_lock(osb, 1);
 	if (status < 0) {
@@ -1011,8 +1050,28 @@ restart:
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		node_num = rm->rm_entries[0];
 		spin_unlock(&osb->osb_lock);
-
-		status = ocfs2_recover_node(osb, node_num);
+		mlog(0, "checking node %d\n", node_num);
+		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		if (slot_num == -ENOENT) {
+			status = 0;
+			mlog(0, "no slot for this node, so no recovery"
+			     "required.\n");
+			goto skip_recovery;
+		}
+		mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+		/* It is a bit subtle with quota recovery. We cannot do it
+		 * immediately because we have to obtain cluster locks from
+		 * quota files and we also don't want to just skip it because
+		 * then quota usage would be out of sync until some node takes
+		 * the slot. So we remember which nodes need quota recovery
+		 * and when everything else is done, we recover quotas. */
+		for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+		if (i == rm_quota_used)
+			rm_quota[rm_quota_used++] = slot_num;
+
+		status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
 		if (!status) {
 			ocfs2_recovery_map_clear(osb, node_num);
 		} else {
@@ -1034,13 +1093,27 @@ restart:
 	if (status < 0)
 		mlog_errno(status);
 
+	/* Now it is right time to recover quotas... We have to do this under
+	 * superblock lock so that noone can start using the slot (and crash)
+	 * before we recover it */
+	for (i = 0; i < rm_quota_used; i++) {
+		qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+		if (IS_ERR(qrec)) {
+			status = PTR_ERR(qrec);
+			mlog_errno(status);
+			continue;
+		}
+		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+						NULL, NULL, qrec);
+	}
+
 	ocfs2_super_unlock(osb, 1);
 
 	/* We always run recovery on our own orphan dir - the dead
 	 * node(s) may have disallowd a previos inode delete. Re-processing
 	 * is therefore required. */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL);
+					NULL, NULL);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1055,6 +1128,9 @@ bail:
 
 	mutex_unlock(&osb->recovery_lock);
 
+	if (rm_quota)
+		kfree(rm_quota);
+
 	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
@@ -1282,31 +1358,19 @@ done:
  * far less concerning.
  */
 static int ocfs2_recover_node(struct ocfs2_super *osb,
-			      int node_num)
+			      int node_num, int slot_num)
 {
 	int status = 0;
-	int slot_num;
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
-	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
-		   node_num, osb->node_num);
-
-	mlog(0, "checking node %d\n", node_num);
+	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+		   node_num, slot_num, osb->node_num);
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
 	BUG_ON(osb->node_num == node_num);
 
-	slot_num = ocfs2_node_num_to_slot(osb, node_num);
-	if (slot_num == -ENOENT) {
-		status = 0;
-		mlog(0, "no slot for this node, so no recovery required.\n");
-		goto done;
-	}
-
-	mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
 		if (status == -EBUSY) {
@@ -1342,7 +1406,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-					tl_copy);
+					tl_copy, NULL);
 
 	status = 0;
 done:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index ee08e9c1fc1..37013bf9ce2 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -168,6 +168,7 @@ void   ocfs2_recovery_thread(struct ocfs2_super *osb,
 			     int node_num);
 int    ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
 void   ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
 
 static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
 {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f04b229fc75..6b25b4aa720 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -206,6 +206,7 @@ enum ocfs2_mount_options
 struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -287,10 +288,11 @@ struct ocfs2_super
 	char *local_alloc_debug_buf;
 #endif
 
-	/* Next two fields are for local node slot recovery during
+	/* Next three fields are for local node slot recovery during
 	 * mount. */
 	int dirty;
 	struct ocfs2_dinode *local_alloc_copy;
+	struct ocfs2_quota_recovery *quota_rec;
 
 	struct ocfs2_alloc_stats alloc_stats;
 	char dev_str[20];		/* "major,minor" of the device */
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index e2233d51507..04872b45b99 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -33,6 +33,17 @@ struct ocfs2_dquot {
 	s64 dq_originodes;	/* Last globally synced inode usage */
 };
 
+/* Description of one chunk to recover in memory */
+struct ocfs2_recovery_chunk {
+	struct list_head rc_list;	/* List of chunks */
+	int rc_chunk;			/* Chunk number */
+	unsigned long *rc_bitmap;	/* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery {
+	struct list_head r_list[MAXQUOTAS];	/* List of chunks to recover */
+};
+
 /* In-memory structure with quota header information */
 struct ocfs2_mem_dqinfo {
 	unsigned int dqi_type;		/* Quota type this structure describes */
@@ -49,6 +60,10 @@ struct ocfs2_mem_dqinfo {
 	struct buffer_head *dqi_ibh;	/* Buffer with information header */
 	struct qtree_mem_dqinfo dqi_gi;	/* Info about global file */
 	struct delayed_work dqi_sync_work;	/* Work for syncing dquots */
+	struct ocfs2_quota_recovery *dqi_rec;	/* Pointer to recovery
+						 * information, in case we
+						 * enable quotas on file
+						 * needing it */
 };
 
 static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
@@ -67,6 +82,12 @@ extern struct kmem_cache *ocfs2_qf_chunk_cachep;
 
 extern struct qtree_fmt_operations ocfs2_global_ops;
 
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+				struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
 ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
 			 size_t len, loff_t off);
 ssize_t ocfs2_quota_write(struct super_block *sb, int type,
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index adf53508bdb..49b536a2190 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -87,7 +87,6 @@ struct qtree_fmt_operations ocfs2_global_ops = {
 	.is_id = ocfs2_global_is_id,
 };
 
-
 struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
 					   int block, int *err)
 {
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 40e82b48313..b98562174cd 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -49,14 +49,25 @@ static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
 	return 1 + (ol_chunk_blocks(sb) + 1) * c;
 }
 
-/* Offset of the dquot structure in the quota file */
-static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+	int epb = ol_quota_entries_per_block(sb);
+
+	return ol_quota_chunk_block(sb, c) + 1 + off / epb;
+}
+
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
 {
 	int epb = ol_quota_entries_per_block(sb);
 
-	return ((ol_quota_chunk_block(sb, c) + 1 + off / epb)
-		<< sb->s_blocksize_bits) +
-		(off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+	return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+	return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+	       ol_dqblk_block_off(sb, c, off);
 }
 
 /* Compute block number from given offset */
@@ -253,6 +264,379 @@ static void olq_update_info(struct buffer_head *bh, void *private)
 	spin_unlock(&dq_data_lock);
 }
 
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+				    struct ocfs2_local_disk_chunk *dchunk,
+				    int chunk,
+				    struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *rc;
+
+	rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+	rc->rc_chunk = chunk;
+	rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+	if (!rc->rc_bitmap) {
+		kfree(rc);
+		return -ENOMEM;
+	}
+	memcpy(rc->rc_bitmap, dchunk->dqc_bitmap,
+	       (ol_chunk_entries(sb) + 7) >> 3);
+	list_add_tail(&rc->rc_list, head);
+	return 0;
+}
+
+static void free_recovery_list(struct list_head *head)
+{
+	struct ocfs2_recovery_chunk *next;
+	struct ocfs2_recovery_chunk *rchunk;
+
+	list_for_each_entry_safe(rchunk, next, head, rc_list) {
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+	}
+}
+
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		free_recovery_list(&(rec->r_list[type]));
+	kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover*/
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+				     struct ocfs2_local_disk_dqinfo *ldinfo,
+				     int type,
+				     struct list_head *head)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct buffer_head *hbh;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int i, chunks = le32_to_cpu(ldinfo->dqi_chunks);
+	int status = 0;
+
+	for (i = 0; i < chunks; i++) {
+		hbh = ocfs2_read_quota_block(lqinode,
+					     ol_quota_chunk_block(sb, i),
+					     &status);
+		if (!hbh) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb))
+			status = ocfs2_add_recovery_chunk(sb, dchunk, i, head);
+		brelse(hbh);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		free_recovery_list(head);
+	return status;
+}
+
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+	int type;
+	struct ocfs2_quota_recovery *rec;
+
+	rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
+	if (!rec)
+		return NULL;
+	for (type = 0; type < MAXQUOTAS; type++)
+		INIT_LIST_HEAD(&(rec->r_list[type]));
+	return rec;
+}
+
+/* Load information we need for quota recovery into memory */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+						struct ocfs2_super *osb,
+						int slot_num)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct inode *lqinode;
+	struct buffer_head *bh;
+	int type;
+	int status = 0;
+	struct ocfs2_quota_recovery *rec;
+
+	mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+	rec = ocfs2_alloc_quota_recovery();
+	if (!rec)
+		return ERR_PTR(-ENOMEM);
+	/* First init... */
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		/* At this point, journal of the slot is already replayed so
+		 * we can trust metadata and data of the quota file */
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+					       OCFS2_META_LOCK_RECOVERY);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = ocfs2_read_quota_block(lqinode, 0, &status);
+		if (!bh) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+						   &rec->r_list[type]);
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	if (status < 0) {
+		ocfs2_free_quota_recovery(rec);
+		rec = ERR_PTR(status);
+	}
+	return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+					  int type,
+					  struct ocfs2_quota_recovery *rec)
+{
+	struct super_block *sb = lqinode->i_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_local_disk_chunk *dchunk;
+	struct ocfs2_local_disk_dqblk *dqblk;
+	struct dquot *dquot;
+	handle_t *handle;
+	struct buffer_head *hbh = NULL, *qbh = NULL;
+	int status = 0;
+	int bit, chunk;
+	struct ocfs2_recovery_chunk *rchunk, *next;
+	qsize_t spacechange, inodechange;
+
+	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+
+	status = ocfs2_lock_global_qf(oinfo, 1);
+	if (status < 0)
+		goto out;
+
+	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+		chunk = rchunk->rc_chunk;
+		hbh = ocfs2_read_quota_block(lqinode,
+					     ol_quota_chunk_block(sb, chunk),
+					     &status);
+		if (!hbh) {
+			mlog_errno(status);
+			break;
+		}
+		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+			qbh = ocfs2_read_quota_block(lqinode,
+						ol_dqblk_block(sb, chunk, bit),
+						&status);
+			if (!qbh) {
+				mlog_errno(status);
+				break;
+			}
+			dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+				ol_dqblk_block_off(sb, chunk, bit));
+			dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+			if (!dquot) {
+				status = -EIO;
+				mlog(ML_ERROR, "Failed to get quota structure "
+				     "for id %u, type %d. Cannot finish quota "
+				     "file recovery.\n",
+				     (unsigned)le64_to_cpu(dqblk->dqb_id),
+				     type);
+				goto out_put_bh;
+			}
+			handle = ocfs2_start_trans(OCFS2_SB(sb),
+						   OCFS2_QSYNC_CREDITS);
+			if (IS_ERR(handle)) {
+				status = PTR_ERR(handle);
+				mlog_errno(status);
+				goto out_put_dquot;
+			}
+			mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+			spin_lock(&dq_data_lock);
+			/* Add usage from quota entry into quota changes
+			 * of our node. Auxiliary variables are important
+			 * due to signedness */
+			spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+			inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+			dquot->dq_dqb.dqb_curspace += spacechange;
+			dquot->dq_dqb.dqb_curinodes += inodechange;
+			spin_unlock(&dq_data_lock);
+			/* We want to drop reference held by the crashed
+			 * node. Since we have our own reference we know
+			 * global structure actually won't be freed. */
+			status = ocfs2_global_release_dquot(dquot);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			/* Release local quota file entry */
+			status = ocfs2_journal_access(handle, lqinode,
+					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
+			if (status < 0) {
+				mlog_errno(status);
+				goto out_commit;
+			}
+			lock_buffer(qbh);
+			WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+			ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+			le32_add_cpu(&dchunk->dqc_free, 1);
+			unlock_buffer(qbh);
+			status = ocfs2_journal_dirty(handle, qbh);
+			if (status < 0)
+				mlog_errno(status);
+out_commit:
+			mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+			ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+			dqput(dquot);
+out_put_bh:
+			brelse(qbh);
+			if (status < 0)
+				break;
+		}
+		brelse(hbh);
+		list_del(&rchunk->rc_list);
+		kfree(rchunk->rc_bitmap);
+		kfree(rchunk);
+		if (status < 0)
+			break;
+	}
+	ocfs2_unlock_global_qf(oinfo, 1);
+out:
+	if (status < 0)
+		free_recovery_list(&(rec->r_list[type]));
+	mlog_exit(status);
+	return status;
+}
+
+/* Recover local quota files for given node different from us */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+				struct ocfs2_quota_recovery *rec,
+				int slot_num)
+{
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	struct super_block *sb = osb->sb;
+	struct ocfs2_local_disk_dqinfo *ldinfo;
+	struct buffer_head *bh;
+	handle_t *handle;
+	int type;
+	int status = 0;
+	struct inode *lqinode;
+	unsigned int flags;
+
+	mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (list_empty(&(rec->r_list[type])))
+			continue;
+		mlog(0, "Recovering quota in slot %d\n", slot_num);
+		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+		if (!lqinode) {
+			status = -ENOENT;
+			goto out;
+		}
+		status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+						       OCFS2_META_LOCK_NOQUEUE);
+		/* Someone else is holding the lock? Then he must be
+		 * doing the recovery. Just skip the file... */
+		if (status == -EAGAIN) {
+			mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+			     "because quota file is locked.\n", slot_num);
+			status = 0;
+			goto out_put;
+		} else if (status < 0) {
+			mlog_errno(status);
+			goto out_put;
+		}
+		/* Now read local header */
+		bh = ocfs2_read_quota_block(lqinode, 0, &status);
+		if (!bh) {
+			mlog_errno(status);
+			mlog(ML_ERROR, "failed to read quota file info header "
+				"(slot=%d type=%d)\n", slot_num, type);
+			goto out_lock;
+		}
+		ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+							OCFS2_LOCAL_INFO_OFF);
+		/* Is recovery still needed? */
+		flags = le32_to_cpu(ldinfo->dqi_flags);
+		if (!(flags & OLQF_CLEAN))
+			status = ocfs2_recover_local_quota_file(lqinode,
+								type,
+								rec);
+		/* We don't want to mark file as clean when it is actually
+		 * active */
+		if (slot_num == osb->slot_num)
+			goto out_bh;
+		/* Mark quota file as clean if we are recovering quota file of
+		 * some other node. */
+		handle = ocfs2_start_trans(osb, 1);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out_bh;
+		}
+		status = ocfs2_journal_access(handle, lqinode, bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_trans;
+		}
+		lock_buffer(bh);
+		ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+		unlock_buffer(bh);
+		status = ocfs2_journal_dirty(handle, bh);
+		if (status < 0)
+			mlog_errno(status);
+out_trans:
+		ocfs2_commit_trans(osb, handle);
+out_bh:
+		brelse(bh);
+out_lock:
+		ocfs2_inode_unlock(lqinode, 1);
+out_put:
+		iput(lqinode);
+		if (status < 0)
+			break;
+	}
+out:
+	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	kfree(rec);
+	return status;
+}
+
 /* Read information header from quota file */
 static int ocfs2_local_read_info(struct super_block *sb, int type)
 {
@@ -262,6 +646,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	struct inode *lqinode = sb_dqopt(sb)->files[type];
 	int status;
 	struct buffer_head *bh = NULL;
+	struct ocfs2_quota_recovery *rec;
 	int locked = 0;
 
 	info->dqi_maxblimit = 0x7fffffffffffffffLL;
@@ -275,6 +660,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	info->dqi_priv = oinfo;
 	oinfo->dqi_type = type;
 	INIT_LIST_HEAD(&oinfo->dqi_chunk);
+	oinfo->dqi_rec = NULL;
 	oinfo->dqi_lqi_bh = NULL;
 	oinfo->dqi_ibh = NULL;
 
@@ -305,10 +691,27 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	oinfo->dqi_ibh = bh;
 
 	/* We crashed when using local quota file? */
-	if (!(info->dqi_flags & OLQF_CLEAN))
-		goto out_err;	/* So far we just bail out. Later we should resync here */
+	if (!(info->dqi_flags & OLQF_CLEAN)) {
+		rec = OCFS2_SB(sb)->quota_rec;
+		if (!rec) {
+			rec = ocfs2_alloc_quota_recovery();
+			if (!rec) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				goto out_err;
+			}
+			OCFS2_SB(sb)->quota_rec = rec;
+		}
 
-	status = ocfs2_load_local_quota_bitmaps(sb_dqopt(sb)->files[type],
+		status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+                                                   &rec->r_list[type]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out_err;
+		}
+	}
+
+	status = ocfs2_load_local_quota_bitmaps(lqinode,
 						ldinfo,
 						&oinfo->dqi_chunk);
 	if (status < 0) {
@@ -394,6 +797,12 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
 	}
 	ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
 
+	/* dqonoff_mutex protects us against racing with recovery thread... */
+	if (oinfo->dqi_rec) {
+		ocfs2_free_quota_recovery(oinfo->dqi_rec);
+		mark_clean = 0;
+	}
+
 	if (!mark_clean)
 		goto out;
 
-- 
cgit v1.2.3


From 19ece546a418997226bd91552fbc41abcb05cea6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 Aug 2008 20:13:17 +0200
Subject: ocfs2: Enable quota accounting on mount, disable on umount

Enable quota usage tracking on mount and disable it on umount. Also
add support for the quota-on and quota-off quotactls and for the
usrquota and grpquota mount options. Add the user and group quota
features to the set of supported RO-compat features.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c  |  20 ++++-
 fs/ocfs2/ocfs2.h    |   3 +
 fs/ocfs2/ocfs2_fs.h |   4 +-
 fs/ocfs2/super.c    | 222 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 245 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index c60242018d9..302f1144a70 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -56,7 +56,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 			      int node_num, int slot_num);
 static int __ocfs2_recovery_thread(void *arg);
 static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
 static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 				      int dirty, int replayed);
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -65,6 +65,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 				 int slot);
 static int ocfs2_commit_thread(void *arg);
 
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 0);
+}
+
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+	return __ocfs2_wait_on_mount(osb, 1);
+}
+
+
 
 /*
  * The recovery_list is a simple linked list of node numbers to recover.
@@ -895,6 +906,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 
 		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
 
+		ocfs2_wait_on_quotas(osb);
+
 		la_dinode = item->lri_la_dinode;
 		if (la_dinode) {
 			mlog(0, "Clean up local alloc %llu\n",
@@ -1701,13 +1714,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	return ret;
 }
 
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 {
 	/* This check is good because ocfs2 will wait on our recovery
 	 * thread before changing it to something other than MOUNTED
 	 * or DISABLED. */
 	wait_event(osb->osb_mount_event,
-		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+		  (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+		   atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
 		   atomic_read(&osb->vol_state) == VOLUME_DISABLED);
 
 	/* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6b25b4aa720..5c777988042 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
 {
 	VOLUME_INIT = 0,
 	VOLUME_MOUNTED,
+	VOLUME_MOUNTED_QUOTAS,
 	VOLUME_DISMOUNTED,
 	VOLUME_DISABLED
 };
@@ -196,6 +197,8 @@ enum ocfs2_mount_options
 	OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
 	OCFS2_MOUNT_INODE64 = 1 << 7,	/* Allow inode numbers > 2^32 */
 	OCFS2_MOUNT_POSIX_ACL = 1 << 8,	/* POSIX access control lists */
+	OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+	OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
 
 #define OCFS2_OSB_SOFT_RO	0x0001
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0a5ac790a62..359732e18e8 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -94,7 +94,9 @@
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
 					 | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 60f1d29421a..2eb657c3e7a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
 #include <linux/debugfs.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/quotaops.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -127,6 +128,9 @@ static int ocfs2_get_sector(struct super_block *sb,
 static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
 static const struct super_operations ocfs2_sops = {
 	.statfs		= ocfs2_statfs,
@@ -165,6 +169,8 @@ enum {
 	Opt_inode64,
 	Opt_acl,
 	Opt_noacl,
+	Opt_usrquota,
+	Opt_grpquota,
 	Opt_err,
 };
 
@@ -189,6 +195,8 @@ static const match_table_t tokens = {
 	{Opt_inode64, "inode64"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
+	{Opt_usrquota, "usrquota"},
+	{Opt_grpquota, "grpquota"},
 	{Opt_err, NULL}
 };
 
@@ -452,6 +460,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 
 	/* We're going to/from readonly mode. */
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+		/* Disable quota accounting before remounting RO */
+		if (*flags & MS_RDONLY) {
+			ret = ocfs2_susp_quotas(osb, 0);
+			if (ret < 0)
+				goto out;
+		}
 		/* Lock here so the check of HARD_RO and the potential
 		 * setting of SOFT_RO is atomic. */
 		spin_lock(&osb->osb_lock);
@@ -487,6 +501,21 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
+		/* Enable quota accounting after remounting RW */
+		if (!ret && !(*flags & MS_RDONLY)) {
+			if (sb_any_quota_suspended(sb))
+				ret = ocfs2_susp_quotas(osb, 1);
+			else
+				ret = ocfs2_enable_quotas(osb);
+			if (ret < 0) {
+				/* Return back changes... */
+				spin_lock(&osb->osb_lock);
+				sb->s_flags |= MS_RDONLY;
+				osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+				spin_unlock(&osb->osb_lock);
+				goto out;
+			}
+		}
 	}
 
 	if (!ret) {
@@ -647,6 +676,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
 	return 0;
 }
 
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+	int type;
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	int status = 0;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		if (unsuspend)
+			status = vfs_quota_enable(
+					sb_dqopt(sb)->files[type],
+					type, QFMT_OCFS2,
+					DQUOT_SUSPENDED);
+		else
+			status = vfs_quota_disable(sb, type,
+						   DQUOT_SUSPENDED);
+		if (status < 0)
+			break;
+	}
+	if (status < 0)
+		mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+		     "remount (error = %d).\n", status);
+	return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+	struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+	struct super_block *sb = osb->sb;
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+	unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+					LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+	int status;
+	int type;
+
+	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+			continue;
+		inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+							osb->slot_num);
+		if (!inode[type]) {
+			status = -ENOENT;
+			goto out_quota_off;
+		}
+		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+						DQUOT_USAGE_ENABLED);
+		if (status < 0)
+			goto out_quota_off;
+	}
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	return 0;
+out_quota_off:
+	ocfs2_disable_quotas(osb);
+	for (type = 0; type < MAXQUOTAS; type++)
+		iput(inode[type]);
+	mlog_errno(status);
+	return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+	int type;
+	struct inode *inode;
+	struct super_block *sb = osb->sb;
+
+	/* We mostly ignore errors in this function because there's not much
+	 * we can do when we see them */
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (!sb_has_quota_loaded(sb, type))
+			continue;
+		inode = igrab(sb->s_dquot.files[type]);
+		/* Turn off quotas. This will remove all dquot structures from
+		 * memory and so they will be automatically synced to global
+		 * quota files */
+		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+					    DQUOT_LIMITS_ENABLED);
+		if (!inode)
+			continue;
+		iput(inode);
+	}
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+			  char *path, int remount)
+{
+	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+		return -EINVAL;
+
+	if (remount)
+		return 0;	/* Just ignore it has been handled in
+				 * ocfs2_remount() */
+	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+				    format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+	if (remount)
+		return 0;	/* Ignore now and handle later in
+				 * ocfs2_remount() */
+	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static struct quotactl_ops ocfs2_quotactl_ops = {
+	.quota_on	= ocfs2_quota_on,
+	.quota_off	= ocfs2_quota_off,
+	.quota_sync	= vfs_quota_sync,
+	.get_info	= vfs_get_dqinfo,
+	.set_info	= vfs_set_dqinfo,
+	.get_dqblk	= vfs_get_dqblk,
+	.set_dqblk	= vfs_set_dqblk,
+};
+
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct dentry *root;
@@ -689,6 +843,22 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	osb->osb_commit_interval = parsed_options.commit_interval;
 	osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
 	osb->local_alloc_bits = osb->local_alloc_default_bits;
+	if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "User quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
+	if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+	    !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+					 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+		status = -EINVAL;
+		mlog(ML_ERROR, "Group quotas were requested, but this "
+		     "filesystem does not have the feature enabled.\n");
+		goto read_super_error;
+	}
 
 	status = ocfs2_verify_userspace_stack(osb, &parsed_options);
 	if (status)
@@ -793,6 +963,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	atomic_set(&osb->vol_state, VOLUME_MOUNTED);
 	wake_up(&osb->osb_mount_event);
 
+	/* Now we can initialize quotas because we can afford to wait
+	 * for cluster locks recovery now. That also means that truncation
+	 * log recovery can happen but that waits for proper quota setup */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		status = ocfs2_enable_quotas(osb);
+		if (status < 0) {
+			/* We have to err-out specially here because
+			 * s_root is already set */
+			mlog_errno(status);
+			atomic_set(&osb->vol_state, VOLUME_DISABLED);
+			wake_up(&osb->osb_mount_event);
+			mlog_exit(status);
+			return status;
+		}
+	}
+
+	ocfs2_complete_quota_recovery(osb);
+
+	/* Now we wake up again for processes waiting for quotas */
+	atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+	wake_up(&osb->osb_mount_event);
+
 	mlog_exit(status);
 	return status;
 
@@ -980,6 +1172,28 @@ static int ocfs2_parse_options(struct super_block *sb,
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
 			break;
+		case Opt_usrquota:
+			/* We check only on remount, otherwise features
+			 * aren't yet initialized. */
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+				mlog(ML_ERROR, "User quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+			break;
+		case Opt_grpquota:
+			if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+			    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+				mlog(ML_ERROR, "Group quota requested but "
+				     "filesystem feature is not set\n");
+				status = 0;
+				goto bail;
+			}
+			mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+			break;
 #ifdef CONFIG_OCFS2_FS_POSIX_ACL
 		case Opt_acl:
 			mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
@@ -1056,6 +1270,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	if (osb->osb_cluster_stack[0])
 		seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
 			   osb->osb_cluster_stack);
+	if (opts & OCFS2_MOUNT_USRQUOTA)
+		seq_printf(s, ",usrquota");
+	if (opts & OCFS2_MOUNT_GRPQUOTA)
+		seq_printf(s, ",grpquota");
 
 	if (opts & OCFS2_MOUNT_NOUSERXATTR)
 		seq_printf(s, ",nouser_xattr");
@@ -1394,6 +1612,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	osb = OCFS2_SB(sb);
 	BUG_ON(!osb);
 
+	ocfs2_disable_quotas(osb);
+
 	ocfs2_shutdown_local_alloc(osb);
 
 	ocfs2_truncate_log_shutdown(osb);
@@ -1504,6 +1724,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	sb->s_fs_info = osb;
 	sb->s_op = &ocfs2_sops;
 	sb->s_export_op = &ocfs2_export_ops;
+	sb->s_qcop = &ocfs2_quotactl_ops;
+	sb->dq_op = &ocfs2_quota_operations;
 	sb->s_xattr = ocfs2_xattr_handlers;
 	sb->s_time_gran = 1;
 	sb->s_flags |= MS_NOATIME;
-- 
cgit v1.2.3


From b86c86fa1feb50221dc16071ae5b8a4acf3bd32c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Tue, 18 Nov 2008 17:16:47 -0800
Subject: ocfs2: Use BH_JBDPrivateStart instead of BH_Unshadow

This is safer. We no longer have to worry about tracking changes to
jbd_state_bits.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/buffer_head_io.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 0e9eed0c223..15c8e6deee2 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -42,11 +42,10 @@
 /*
  * Bits on bh->b_state used by ocfs2.
  *
- * These MUST be after the JBD2 bits.  Currently BH_Unshadow is the last
- * JBD2 bit.
+ * These MUST be after the JBD2 bits.  Hence, we use BH_JBDPrivateStart.
  */
 enum ocfs2_state_bits {
-	BH_NeedsValidate = BH_Unshadow + 1,
+	BH_NeedsValidate = BH_JBDPrivateStart,
 };
 
 /* Expand the magic b_state functions */
-- 
cgit v1.2.3


From 57a09a7b3d9445a17c78d544f1e49d4d7d61705a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:26 +0100
Subject: ocfs2: Add missing initialization

Add missing variable initialization to ocfs2_dquot_drop_slow().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 49b536a2190..10ecb33298d 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -870,7 +870,7 @@ out:
 
 static int ocfs2_dquot_drop_slow(struct inode *inode)
 {
-	int status;
+	int status = 0;
 	int cnt;
 	int got_lock[MAXQUOTAS] = {0, 0};
 	handle_t *handle;
-- 
cgit v1.2.3


From 85eb8b73d66530bb7b931789ae7a5ec9744eed34 Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Tue, 25 Nov 2008 15:31:27 +0100
Subject: ocfs2: Fix ocfs2_read_quota_block() error handling.

ocfs2_bread() has become ocfs2_read_virt_blocks(), with a prototype to
match ocfs2_read_blocks().  The quota code, converting from
ocfs2_bread(), wraps the call to ocfs2_read_virt_blocks() in
ocfs2_read_quota_block().  Unfortunately, the prototype of
ocfs2_read_quota_block() matches the old prototype of ocfs2_bread().

The problem is that ocfs2_bread() returned the buffer head, and callers
assumed that a NULL pointer was indicative of error.  It wasn't.  This
is why ocfs2_bread() took an int*err argument as well.

The new prototype of ocfs2_read_virt_blocks() avoids this error handling
confusion.  Let's change ocfs2_read_quota_block() to match.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c      |  6 ++---
 fs/ocfs2/quota.h        |  4 ++--
 fs/ocfs2/quota_global.c | 34 +++++++++++++++-----------
 fs/ocfs2/quota_local.c  | 64 +++++++++++++++++++++++++++----------------------
 4 files changed, 60 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 058aa86490a..b1c75911d8a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3519,7 +3519,7 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
 					    oinfo->dqi_gi.dqi_type);
 	struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
 	struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	struct ocfs2_global_disk_dqinfo *gdinfo;
 	int status = 0;
 
@@ -3532,8 +3532,8 @@ static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
 		oinfo->dqi_gi.dqi_free_entry =
 					be32_to_cpu(lvb->lvb_free_entry);
 	} else {
-		bh = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &status);
-		if (!bh) {
+		status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh);
+		if (status) {
 			mlog_errno(status);
 			goto bail;
 		}
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 04872b45b99..7365e2e0870 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -107,8 +107,8 @@ static inline int ocfs2_global_release_dquot(struct dquot *dquot)
 
 int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
 void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
-struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
-					   int block, int *err);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh);
 
 extern struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 10ecb33298d..2bdcddd3f1c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -87,16 +87,21 @@ struct qtree_fmt_operations ocfs2_global_ops = {
 	.is_id = ocfs2_global_is_id,
 };
 
-struct buffer_head *ocfs2_read_quota_block(struct inode *inode,
-					   int block, int *err)
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh)
 {
-	struct buffer_head *tmp = NULL;
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
 
-	*err = ocfs2_read_virt_blocks(inode, block, 1, &tmp, 0, NULL);
-	if (*err)
-		mlog_errno(*err);
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, NULL);
+	if (rc)
+		mlog_errno(rc);
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!rc && !*bh)
+		*bh = tmp;
 
-	return tmp;
+	return rc;
 }
 
 static struct buffer_head *ocfs2_get_quota_block(struct inode *inode,
@@ -143,8 +148,9 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
 	toread = len;
 	while (toread > 0) {
 		tocopy = min((size_t)(sb->s_blocksize - offset), toread);
-		bh = ocfs2_read_quota_block(gqinode, blk, &err);
-		if (!bh) {
+		bh = NULL;
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		if (err) {
 			mlog_errno(err);
 			return err;
 		}
@@ -169,7 +175,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	int offset = off & (sb->s_blocksize - 1);
 	sector_t blk = off >> sb->s_blocksize_bits;
 	int err = 0, new = 0;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	handle_t *handle = journal_current_handle();
 
 	if (!handle) {
@@ -200,13 +206,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	/* Not rewriting whole block? */
 	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
 	    !new) {
-		bh = ocfs2_read_quota_block(gqinode, blk, &err);
-		if (!bh) {
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		if (err) {
 			mlog_errno(err);
 			return err;
 		}
 		err = ocfs2_journal_access(handle, gqinode, bh,
-						OCFS2_JOURNAL_ACCESS_WRITE);
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 	} else {
 		bh = ocfs2_get_quota_block(gqinode, blk, &err);
 		if (!bh) {
@@ -214,7 +220,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 			return err;
 		}
 		err = ocfs2_journal_access(handle, gqinode, bh,
-						OCFS2_JOURNAL_ACCESS_CREATE);
+					   OCFS2_JOURNAL_ACCESS_CREATE);
 	}
 	if (err < 0) {
 		brelse(bh);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b98562174cd..7053664f66a 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -139,15 +139,15 @@ static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
 	unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
 	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
 					GROUP_QUOTA_SYSTEM_INODE };
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	struct inode *linode = sb_dqopt(sb)->files[type];
 	struct inode *ginode = NULL;
 	struct ocfs2_disk_dqheader *dqhead;
 	int status, ret = 0;
 
 	/* First check whether we understand local quota file */
-	bh = ocfs2_read_quota_block(linode, 0, &status);
-	if (!bh) {
+	status = ocfs2_read_quota_block(linode, 0, &bh);
+	if (status) {
 		mlog_errno(status);
 		mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
 			type);
@@ -178,8 +178,8 @@ static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
 		goto out_err;
 	}
 	/* Since the header is read only, we don't care about locking */
-	bh = ocfs2_read_quota_block(ginode, 0, &status);
-	if (!bh) {
+	status = ocfs2_read_quota_block(ginode, 0, &bh);
+	if (status) {
 		mlog_errno(status);
 		mlog(ML_ERROR, "failed to read global quota file header "
 				"(type=%d)\n", type);
@@ -235,10 +235,11 @@ static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
 			return -ENOMEM;
 		}
 		newchunk->qc_num = i;
-		newchunk->qc_headerbh = ocfs2_read_quota_block(inode,
+		newchunk->qc_headerbh = NULL;
+		status = ocfs2_read_quota_block(inode,
 				ol_quota_chunk_block(inode->i_sb, i),
-				&status);
-		if (!newchunk->qc_headerbh) {
+				&newchunk->qc_headerbh);
+		if (status) {
 			mlog_errno(status);
 			kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk);
 			ocfs2_release_local_quota_bitmaps(head);
@@ -320,10 +321,11 @@ static int ocfs2_recovery_load_quota(struct inode *lqinode,
 	int status = 0;
 
 	for (i = 0; i < chunks; i++) {
-		hbh = ocfs2_read_quota_block(lqinode,
-					     ol_quota_chunk_block(sb, i),
-					     &status);
-		if (!hbh) {
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, i),
+						&hbh);
+		if (status) {
 			mlog_errno(status);
 			break;
 		}
@@ -392,8 +394,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
 			goto out_put;
 		}
 		/* Now read local header */
-		bh = ocfs2_read_quota_block(lqinode, 0, &status);
-		if (!bh) {
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
 			mlog_errno(status);
 			mlog(ML_ERROR, "failed to read quota file info header "
 				"(slot=%d type=%d)\n", slot_num, type);
@@ -447,19 +450,21 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 
 	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
 		chunk = rchunk->rc_chunk;
-		hbh = ocfs2_read_quota_block(lqinode,
-					     ol_quota_chunk_block(sb, chunk),
-					     &status);
-		if (!hbh) {
+		hbh = NULL;
+		status = ocfs2_read_quota_block(lqinode,
+						ol_quota_chunk_block(sb, chunk),
+						&hbh);
+		if (status) {
 			mlog_errno(status);
 			break;
 		}
 		dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
 		for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
-			qbh = ocfs2_read_quota_block(lqinode,
+			qbh = NULL;
+			status = ocfs2_read_quota_block(lqinode,
 						ol_dqblk_block(sb, chunk, bit),
-						&status);
-			if (!qbh) {
+						&qbh);
+			if (status) {
 				mlog_errno(status);
 				break;
 			}
@@ -581,8 +586,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 			goto out_put;
 		}
 		/* Now read local header */
-		bh = ocfs2_read_quota_block(lqinode, 0, &status);
-		if (!bh) {
+		bh = NULL;
+		status = ocfs2_read_quota_block(lqinode, 0, &bh);
+		if (status) {
 			mlog_errno(status);
 			mlog(ML_ERROR, "failed to read quota file info header "
 				"(slot=%d type=%d)\n", slot_num, type);
@@ -676,8 +682,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
 	locked = 1;
 
 	/* Now read local header */
-	bh = ocfs2_read_quota_block(lqinode, 0, &status);
-	if (!bh) {
+	status = ocfs2_read_quota_block(lqinode, 0, &bh);
+	if (status) {
 		mlog_errno(status);
 		mlog(ML_ERROR, "failed to read quota file info header "
 			"(type=%d)\n", type);
@@ -850,13 +856,13 @@ static int ocfs2_local_write_dquot(struct dquot *dquot)
 {
 	struct super_block *sb = dquot->dq_sb;
 	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	int status;
 
-	bh = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
+	status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type],
 				    ol_dqblk_file_block(sb, od->dq_local_off),
-				    &status);
-	if (!bh) {
+				    &bh);
+	if (status) {
 		mlog_errno(status);
 		goto out;
 	}
-- 
cgit v1.2.3


From af09e51b6810d3408db1c0e956b3b0687b0e3723 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:28 +0100
Subject: ocfs2: Fix oops when extending quota files

We have to mark the buffer as uptodate before calling ocfs2_journal_access(),
and ocfs2_set_buffer_uptodate() does not do this for us.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bdcddd3f1c..8fceb0c49b3 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -174,7 +174,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	struct inode *gqinode = oinfo->dqi_gqinode;
 	int offset = off & (sb->s_blocksize - 1);
 	sector_t blk = off >> sb->s_blocksize_bits;
-	int err = 0, new = 0;
+	int err = 0, new = 0, ja_type;
 	struct buffer_head *bh = NULL;
 	handle_t *handle = journal_current_handle();
 
@@ -207,32 +207,28 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
 	    !new) {
 		err = ocfs2_read_quota_block(gqinode, blk, &bh);
-		if (err) {
-			mlog_errno(err);
-			return err;
-		}
-		err = ocfs2_journal_access(handle, gqinode, bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	} else {
 		bh = ocfs2_get_quota_block(gqinode, blk, &err);
-		if (!bh) {
-			mlog_errno(err);
-			return err;
-		}
-		err = ocfs2_journal_access(handle, gqinode, bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
 	}
-	if (err < 0) {
-		brelse(bh);
-		goto out;
+	if (err) {
+		mlog_errno(err);
+		return err;
 	}
 	lock_buffer(bh);
 	if (new)
 		memset(bh->b_data, 0, sb->s_blocksize);
 	memcpy(bh->b_data + offset, data, len);
 	flush_dcache_page(bh->b_page);
+	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	ocfs2_set_buffer_uptodate(gqinode, bh);
+	err = ocfs2_journal_access(handle, gqinode, bh, ja_type);
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
 	err = ocfs2_journal_dirty(handle, bh);
 	brelse(bh);
 	if (err < 0)
-- 
cgit v1.2.3


From 53a3604610e92a5344cf8003c19975583e71a598 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:29 +0100
Subject: ocfs2: Make ocfs2_get_quota_block() consistent with
 ocfs2_read_quota_block()

Make function return error status and not buffer pointer so that it's
consistent with ocfs2_read_quota_block().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 8fceb0c49b3..e527ec6e013 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -104,26 +104,25 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 	return rc;
 }
 
-static struct buffer_head *ocfs2_get_quota_block(struct inode *inode,
-						 int block, int *err)
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+				 struct buffer_head **bh)
 {
 	u64 pblock, pcount;
-	struct buffer_head *bh;
+	int err;
 
 	down_read(&OCFS2_I(inode)->ip_alloc_sem);
-	*err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount,
-					   NULL);
+	err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-	if (*err) {
-		mlog_errno(*err);
-		return NULL;
+	if (err) {
+		mlog_errno(err);
+		return err;
 	}
-	bh = sb_getblk(inode->i_sb, pblock);
-	if (!bh) {
-		*err = -EIO;
-		mlog_errno(*err);
+	*bh = sb_getblk(inode->i_sb, pblock);
+	if (!*bh) {
+		err = -EIO;
+		mlog_errno(err);
 	}
-	return bh;
+	return err;;
 }
 
 /* Read data from global quotafile - avoid pagecache and such because we cannot
@@ -209,7 +208,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 		err = ocfs2_read_quota_block(gqinode, blk, &bh);
 		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
 	} else {
-		bh = ocfs2_get_quota_block(gqinode, blk, &err);
+		err = ocfs2_get_quota_block(gqinode, blk, &bh);
 		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
 	}
 	if (err) {
-- 
cgit v1.2.3


From 9a2f3866c825c67c3a5806799cdc93fb7517f0c4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:30 +0100
Subject: ocfs2: Fix build warnings (64-bit types vs long long)

fs/ocfs2/quota_local.c: In function 'olq_set_dquot':
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 7 has type '__le64'
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 8 has type '__le64'
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 7 has type '__le64'
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 8 has type '__le64'
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 7 has type '__le64'
fs/ocfs2/quota_local.c:844: warning: format '%lld' expects type 'long long int', but argument 8 has type '__le64'
fs/ocfs2/quota_global.c: In function '__ocfs2_sync_dquot':
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 8 has type 's64'
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 10 has type 's64'
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 8 has type 's64'
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 10 has type 's64'
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 8 has type 's64'
fs/ocfs2/quota_global.c:457: warning: format '%lld' expects type 'long long int', but argument 10 has type 's64'

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 6 +++---
 fs/ocfs2/quota_local.c  | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index e527ec6e013..054d52bd825 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -457,9 +457,9 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
 	olditime = dquot->dq_dqb.dqb_itime;
 	oldbtime = dquot->dq_dqb.dqb_btime;
 	ocfs2_global_disk2memdqb(dquot, &dqblk);
-	mlog(0, "Syncing global dquot %d space %lld+%lld, inodes %lld+%lld\n",
-	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, spacechange,
-	     dquot->dq_dqb.dqb_curinodes, inodechange);
+	mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+	     dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
 	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
 		dquot->dq_dqb.dqb_curspace += spacechange;
 	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 7053664f66a..b5ddb22e627 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -848,7 +848,8 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
 					  od->dq_originodes);
 	spin_unlock(&dq_data_lock);
 	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
-	     od->dq_dquot.dq_id, dqblk->dqb_spacemod, dqblk->dqb_inodemod);
+	     od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
+	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
 }
 
 /* Write dquot to local quota file */
-- 
cgit v1.2.3


From 7d9056ba20ebed6e3937a2e23183f6117919cb00 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:32 +0100
Subject: quota: Export dquot_alloc() and dquot_destroy() functions

These are default functions for creating and destroying quota structures
and they should be used from filesystems.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/dquot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 075dc76904e..61bfff64e5a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -413,10 +413,11 @@ out_dqlock:
 	return ret;
 }
 
-static void dquot_destroy(struct dquot *dquot)
+void dquot_destroy(struct dquot *dquot)
 {
 	kmem_cache_free(dquot_cachep, dquot);
 }
+EXPORT_SYMBOL(dquot_destroy);
 
 static inline void do_destroy_dquot(struct dquot *dquot)
 {
@@ -668,10 +669,11 @@ we_slept:
 	spin_unlock(&dq_list_lock);
 }
 
-static struct dquot *dquot_alloc(struct super_block *sb, int type)
+struct dquot *dquot_alloc(struct super_block *sb, int type)
 {
 	return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
 }
+EXPORT_SYMBOL(dquot_alloc);
 
 static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 {
-- 
cgit v1.2.3


From 4103003b3abb85af9dec9e60616ae086c2bcb4c9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:33 +0100
Subject: reiserfs: Add default allocation routines for quota structures

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/reiserfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index a9b393a5815..c55651f1407 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
 	.release_dquot = reiserfs_release_dquot,
 	.mark_dirty = reiserfs_mark_dquot_dirty,
 	.write_info = reiserfs_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops reiserfs_qctl_operations = {
-- 
cgit v1.2.3


From 157091a2c3cdc71422cbc71eace205cf1b9f2200 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:34 +0100
Subject: ext3: Add default allocation routines for quota structures

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ext3/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 250ec53195c..c22d01467bd 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -713,7 +713,9 @@ static struct dquot_operations ext3_quota_operations = {
 	.acquire_dquot	= ext3_acquire_dquot,
 	.release_dquot	= ext3_release_dquot,
 	.mark_dirty	= ext3_mark_dquot_dirty,
-	.write_info	= ext3_write_info
+	.write_info	= ext3_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops ext3_qctl_operations = {
-- 
cgit v1.2.3


From a5b5ee320185adc091a3a31630d278806b19d8f0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 25 Nov 2008 15:31:35 +0100
Subject: ext4: Add default allocation routines for quota structures

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ext4/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 49fcf8864e7..9494bb24939 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -803,7 +803,9 @@ static struct dquot_operations ext4_quota_operations = {
 	.acquire_dquot	= ext4_acquire_dquot,
 	.release_dquot	= ext4_release_dquot,
 	.mark_dirty	= ext4_mark_dquot_dirty,
-	.write_info	= ext4_write_info
+	.write_info	= ext4_write_info,
+	.alloc_dquot	= dquot_alloc,
+	.destroy_dquot	= dquot_destroy,
 };
 
 static struct quotactl_ops ext4_qctl_operations = {
-- 
cgit v1.2.3


From e35ff98f7c37b7bc901b4b90a66a0287565e456c Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 26 Nov 2008 16:20:19 -0800
Subject: ocfs2: fix indentation in ocfs2_dquot_drop_slow

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 054d52bd825..a10faebe88a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -893,7 +893,7 @@ static int ocfs2_dquot_drop_slow(struct inode *inode)
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
 		mlog_errno(status);
-				goto out;
+		goto out;
 	}
 	dquot_drop(inode);
 	ocfs2_commit_trans(OCFS2_SB(sb), handle);
-- 
cgit v1.2.3


From df32b3343aa11e0c7f54783594b24321d17d376f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Tue, 25 Nov 2008 07:21:36 +0800
Subject: ocfs2/quota: sparse fixes for quota

Fix two minor issues in the quota code. Both were found by a sparse check.
1. Fix an endian bug in ocfs2_local_quota_add_chunk.
2. Make olq_alloc_dquot static.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_local.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index b5ddb22e627..d451b715aef 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -988,7 +988,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		goto out_trans;
 	}
 	lock_buffer(bh);
-	dchunk->dqc_free = ol_quota_entries_per_block(sb);
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
 	memset(dchunk->dqc_bitmap, 0,
 	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
 	       OCFS2_QBLK_RESERVED_SPACE);
@@ -1110,7 +1110,7 @@ out:
 	return ERR_PTR(status);
 }
 
-void olq_alloc_dquot(struct buffer_head *bh, void *private)
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
 {
 	int *offset = private;
 	struct ocfs2_local_disk_chunk *dchunk;
-- 
cgit v1.2.3


From 548b0f22bb7497ba76f91627b99f9fed53a91704 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 24 Nov 2008 19:32:13 -0800
Subject: ocfs2: Dirty the entire bucket in ocfs2_bucket_value_truncate()

ocfs2_bucket_value_truncate() currently takes the first bh of the
bucket, and magically plays around with the value bh - even though
the bucket structure in the calling function already has it.

In addition, future code wants to always dirty the entire bucket when it
is changed.  So let's pass the entire bucket into this function, skip
any block reads (we have them), and add the access/dirty logic.

ocfs2_xattr_value_update_size() is no longer necessary, as it only did
one thing other than journal access/dirty.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 74 +++++++++++++++++++++-----------------------------------
 1 file changed, 28 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3b9634c7d29..6db68a23a29 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4580,31 +4580,6 @@ out:
 	return ret;
 }
 
-static int ocfs2_xattr_value_update_size(struct inode *inode,
-					 handle_t *handle,
-					 struct buffer_head *xe_bh,
-					 struct ocfs2_xattr_entry *xe,
-					 u64 new_size)
-{
-	int ret;
-
-	ret = ocfs2_journal_access(handle, inode, xe_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	xe->xe_value_size = cpu_to_le64(new_size);
-
-	ret = ocfs2_journal_dirty(handle, xe_bh);
-	if (ret < 0)
-		mlog_errno(ret);
-
-out:
-	return ret;
-}
-
 /*
  * Truncate the specified xe_off entry in xattr bucket.
  * bucket is indicated by header_bh and len is the new length.
@@ -4613,7 +4588,7 @@ out:
  * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
  */
 static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
-					     struct buffer_head *header_bh,
+					     struct ocfs2_xattr_bucket *bucket,
 					     int xe_off,
 					     int len,
 					     struct ocfs2_xattr_set_ctxt *ctxt)
@@ -4623,8 +4598,7 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	struct buffer_head *value_bh = NULL;
 	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_xattr_entry *xe;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
+	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	size_t blocksize = inode->i_sb->s_blocksize;
 
 	xe = &xh->xh_entries[xe_off];
@@ -4638,34 +4612,41 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 
 	/* We don't allow ocfs2_xattr_value to be stored in different block. */
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
-	value_blk += header_bh->b_blocknr;
 
-	ret = ocfs2_read_block(inode, value_blk, &value_bh, NULL);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
+	value_bh = bucket->bu_bhs[value_blk];
+	BUG_ON(!value_bh);
 
 	xv = (struct ocfs2_xattr_value_root *)
 		(value_bh->b_data + offset % blocksize);
 
-	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-	     xe_off, (unsigned long long)header_bh->b_blocknr, len);
-	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len, ctxt);
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_xattr_value_update_size(inode, ctxt->handle,
-					    header_bh, xe, len);
+	/*
+	 * From here on out we have to dirty the bucket.  The generic
+	 * value calls only modify one of the bucket's bhs, but we need
+	 * to send the bucket at once.  So if they error, they *could* have
+	 * modified something.  We have to assume they did, and dirty
+	 * the whole bucket.  This leaves us in a consistent state.
+	 */
+	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+	     xe_off, (unsigned long long)bucket_blkno(bucket), len);
+	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out_dirty;
 	}
 
+	xe->xe_value_size = cpu_to_le64(len);
+
+out_dirty:
+	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
 out:
-	brelse(value_bh);
 	return ret;
 }
 
@@ -4681,7 +4662,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
 	BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
 
 	offset = xe - xh->xh_entries;
-	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket->bu_bhs[0],
+	ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
 						offset, len, ctxt);
 	if (ret)
 		mlog_errno(ret);
@@ -5107,11 +5088,13 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 	struct ocfs2_xattr_entry *xe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+	int credits = ocfs2_remove_extent_credits(osb->sb) +
+		ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb,
-					ocfs2_remove_extent_credits(osb->sb));
+	ctxt.handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
 		mlog_errno(ret);
@@ -5123,8 +5106,7 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
-		ret = ocfs2_xattr_bucket_value_truncate(inode,
-							bucket->bu_bhs[0],
+		ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
 							i, 0, &ctxt);
 		if (ret) {
 			mlog_errno(ret);
-- 
cgit v1.2.3


From 88c3b0622acf82c7c86fbc066e81e15edc7c1685 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 11 Dec 2008 08:54:11 +0800
Subject: ocfs2: Narrow the transaction for deleting xattrs from a bucket.

We move the transaction into the loop because, inside
ocfs2_remove_extent(), the credits are doubled by
ocfs2_extend_rotate_transaction(). So if the loop runs for many
iterations, we quickly waste a lot of journal space.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6db68a23a29..df53a2ce2de 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5094,30 +5094,30 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
 
 	ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
 
-	ctxt.handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(ctxt.handle)) {
-		ret = PTR_ERR(ctxt.handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
 		if (ocfs2_xattr_is_local(xe))
 			continue;
 
+		ctxt.handle = ocfs2_start_trans(osb, credits);
+		if (IS_ERR(ctxt.handle)) {
+			ret = PTR_ERR(ctxt.handle);
+			mlog_errno(ret);
+			break;
+		}
+
 		ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
 							i, 0, &ctxt);
+
+		ocfs2_commit_trans(osb, ctxt.handle);
 		if (ret) {
 			mlog_errno(ret);
 			break;
 		}
 	}
 
-	ret = ocfs2_commit_trans(osb, ctxt.handle);
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 	ocfs2_run_deallocs(osb, &ctxt.dealloc);
-out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 92de109ade7999084fb0bfcc65d603252504e0d0 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 25 Nov 2008 17:06:40 -0800
Subject: ocfs2: Dirty the entire first bucket in ocfs2_extend_xattr_bucket()

ocfs2_extend_xattr_bucket() takes an extent of buckets and shifts some
of them down to make room for a new xattr.  It is passed the first bh of
the first bucket, because that is where we store the number of buckets
in the extent.

However, future code wants to always dirty the entire bucket when it
is changed.  So let's pass the entire bucket into this function, skip
any block reads (we have them), and add the access/dirty logic.  We also
can skip passing in the target bucket bh - we only need its block
number.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 85 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index df53a2ce2de..ed1e9596756 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3905,7 +3905,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 		mlog_errno(ret);
 		goto out;
 	}
-  
+
 	ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
 	if (ret)
 		goto out;
@@ -4232,37 +4232,45 @@ leave:
 }
 
 /*
- * Extend a new xattr bucket and move xattrs to the end one by one until
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * We are given an extent.  'first' is the bucket at the very front of
+ * the extent.  The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets.  'target_blkno' is the block number
+ * of the target bucket.  We wish to shift every bucket past the target
+ * down one, filling in that additional space.  When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blkno + blks_per_bucket).
  */
 static int ocfs2_extend_xattr_bucket(struct inode *inode,
 				     handle_t *handle,
-				     struct buffer_head *first_bh,
-				     struct buffer_head *start_bh,
+				     struct ocfs2_xattr_bucket *first,
+				     u64 target_blk,
 				     u32 num_clusters)
 {
 	int ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	u64 start_blk = start_bh->b_blocknr, end_blk;
-	u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
-	struct ocfs2_xattr_header *first_xh =
-				(struct ocfs2_xattr_header *)first_bh->b_data;
-	u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+	u64 end_blk;
+	u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
 
 	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", (unsigned long long)start_blk,
-	     (unsigned long long)first_bh->b_blocknr, num_clusters);
+	     "from %llu, len = %u\n", (unsigned long long)target_blk,
+	     (unsigned long long)bucket_blkno(first), num_clusters);
 
-	BUG_ON(bucket >= num_buckets);
+	/* The extent must have room for an additional bucket */
+	BUG_ON(new_bucket >=
+	       (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
 
-	end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+	/* end_blk points to the last existing bucket */
+	end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
 
 	/*
-	 * We will touch all the buckets after the start_bh(include it).
-	 * Then we add one more bucket.
+	 * end_blk is the start of the last existing bucket.
+	 * Thus, (end_blk - target_blk) covers the target bucket and
+	 * every bucket after it up to, but not including, the last
+	 * existing bucket.  Then we add the last existing bucket, the
+	 * new bucket, and the first bucket (3 * blk_per_bucket).
 	 */
-	credits = end_blk - start_blk + 3 * blk_per_bucket + 1 +
+	credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
 		  handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
@@ -4270,14 +4278,14 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, first_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	while (end_blk != start_blk) {
+	while (end_blk != target_blk) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
 					    end_blk + blk_per_bucket, 0);
 		if (ret)
@@ -4285,12 +4293,12 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 		end_blk -= blk_per_bucket;
 	}
 
-	/* Move half of the xattr in start_blk to the next bucket. */
-	ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
-					start_blk + blk_per_bucket, NULL, 0);
+	/* Move half of the xattr in target_blkno to the next bucket. */
+	ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+					target_blk + blk_per_bucket, NULL, 0);
 
-	le16_add_cpu(&first_xh->xh_num_buckets, 1);
-	ocfs2_journal_dirty(handle, first_bh);
+	le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+	ocfs2_xattr_bucket_journal_dirty(handle, first);
 
 out:
 	return ret;
@@ -4324,10 +4332,19 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 	int ret, num_buckets, extend = 1;
 	u64 p_blkno;
 	u32 e_cpos, num_clusters;
+	/* The bucket at the front of the extent */
+	struct ocfs2_xattr_bucket *first;
 
 	mlog(0, "Add new xattr bucket starting form %llu\n",
 	     (unsigned long long)header_bh->b_blocknr);
 
+	first = ocfs2_xattr_bucket_new(inode);
+	if (!first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
 	 * Add refrence for header_bh here because it may be
 	 * changed in ocfs2_add_new_xattr_cluster and we need
@@ -4367,17 +4384,25 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		}
 	}
 
-	if (extend)
+	if (extend) {
+		/* These bucket reads should be cached */
+		ret = ocfs2_read_xattr_bucket(first, first_bh->b_blocknr);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 		ret = ocfs2_extend_xattr_bucket(inode,
 						ctxt->handle,
-						first_bh,
-						header_bh,
+						first, header_bh->b_blocknr,
 						num_clusters);
-	if (ret)
-		mlog_errno(ret);
+		if (ret)
+			mlog_errno(ret);
+	}
+
 out:
 	brelse(first_bh);
 	brelse(header_bh);
+	ocfs2_xattr_bucket_free(first);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 15d609293d1954465a4788b9b182214323c6a2a1 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 25 Nov 2008 18:36:42 -0800
Subject: ocfs2: Dirty the entire first bucket in ocfs2_cp_xattr_cluster().

ocfs2_cp_xattr_cluster() takes the last bucket of a full extent and
copies it over to a new extent.  It then updates the headers of both
extents to reflect the new state.  It is passed the first bh of
the first bucket in order to update that first extent's bucket count.
It reads and dirties the first bh of the new extent for the same reason.

However, future code wants to always dirty the entire bucket when it
is changed.  So it is changed to read the entire bucket it is updating
for both extents.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 80 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ed1e9596756..4dba3475882 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3936,9 +3936,10 @@ out:
 }
 
 /*
- * Copy one xattr cluster from src_blk to to_blk.
- * The to_blk will become the first bucket header of the cluster, so its
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * src_blk points to the last cluster of an existing extent.  to_blk
+ * points to a newly allocated extent.  We copy the cluster over to the
+ * new extent, initializing its xh_num_buckets.  The old extent's
+ * xh_num_buckets shrinks by the same amount.
  */
 static int ocfs2_cp_xattr_cluster(struct inode *inode,
 				  handle_t *handle,
@@ -3950,27 +3951,42 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	int i, ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-	struct buffer_head *bh = NULL;
-	struct ocfs2_xattr_header *xh;
-	u64 to_blk_start = to_blk;
+	struct ocfs2_xattr_bucket *old_first, *new_first;
 
 	mlog(0, "cp xattrs from cluster %llu to %llu\n",
 	     (unsigned long long)src_blk, (unsigned long long)to_blk);
 
+	/* The first bucket of the original extent */
+	old_first = ocfs2_xattr_bucket_new(inode);
+	/* The first bucket of the new extent */
+	new_first = ocfs2_xattr_bucket_new(inode);
+	if (!old_first || !new_first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(old_first, first_bh->b_blocknr);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
-	 * We need to update the new cluster and 1 more for the update of
-	 * the 1st bucket of the previous extent rec.
+	 * We need to update the first bucket of the old extent and the
+	 * entire first cluster of the new extent.
 	 */
-	credits = bpc + 1 + handle->h_buffer_credits;
+	credits = blks_per_bucket + bpc + handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, first_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3978,45 +3994,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 
 	for (i = 0; i < num_buckets; i++) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle,
-					    src_blk, to_blk, 1);
+					    src_blk + (i * blks_per_bucket),
+					    to_blk + (i * blks_per_bucket),
+					    1);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-
-		src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-		to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	}
 
-	/* update the old bucket header. */
-	xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-	le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
-
-	ocfs2_journal_dirty(handle, first_bh);
-
-	/* update the new bucket header. */
-	ret = ocfs2_read_block(inode, to_blk_start, &bh, NULL);
-	if (ret < 0) {
+	/*
+	 * Get the new bucket ready before we dirty anything
+	 * (This actually shouldn't fail, because we already dirtied
+	 * it once in ocfs2_cp_xattr_bucket()).
+	 */
+	ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
-
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	xh = (struct ocfs2_xattr_header *)bh->b_data;
-	xh->xh_num_buckets = cpu_to_le16(num_buckets);
+	/* Now update the headers */
+	le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, old_first);
 
-	ocfs2_journal_dirty(handle, bh);
+	bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+	ocfs2_xattr_bucket_journal_dirty(handle, new_first);
 
 	if (first_hash)
-		*first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+		*first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
 out:
-	brelse(bh);
+	ocfs2_xattr_bucket_free(new_first);
+	ocfs2_xattr_bucket_free(old_first);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 2b656c1d6fc5ba7791a360766780a212faed5705 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 25 Nov 2008 19:00:15 -0800
Subject: ocfs2: Explain t_is_new in ocfs2_cp_xattr_cluster().

I was unsure of the JOURNAL_ACCESS parameters in
ocfs2_cp_xattr_cluster().  They're based on the function argument
't_is_new', but I couldn't quite figure out how t_is_new mapped to
allocation.  ocfs2_cp_xattr_cluster() actually overwrites the target,
regardless of t_is_new.

Well, I just figured it out.  So I'm adding a big fat comment for those
who come after me.  ocfs2_divide_xattr_cluster() has the same behavior.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4dba3475882..5efcf4e85d7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3747,6 +3747,11 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  See the comment in the
+	 * same part of ocfs2_cp_xattr_bucket().
+	 */
 	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
 						new_bucket_head ?
 						OCFS2_JOURNAL_ACCESS_CREATE :
@@ -3918,6 +3923,18 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	if (ret)
 		goto out;
 
+	/*
+	 * Hey, if we're overwriting t_bucket, what difference does
+	 * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
+	 * cluster to fill, we came here from ocfs2_cp_xattr_cluster(), and
+	 * it is really new - ACCESS_CREATE is required.  But we also
+	 * might have moved data out of t_bucket before extending back
+	 * into it.  ocfs2_add_new_xattr_bucket() can do this - its call
+	 * to ocfs2_add_new_xattr_cluster() may have created a new extent
+	 * and copied out the end of the old extent.  Then it re-extends
+	 * the old extent back to create space for new xattrs.  That's
+	 * how we get here, and the bucket isn't really new.
+	 */
 	ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
 						t_is_new ?
 						OCFS2_JOURNAL_ACCESS_CREATE :
-- 
cgit v1.2.3


From b5c03e746959bb005b987e9d8511df46680c3daa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 25 Nov 2008 19:58:16 -0800
Subject: ocfs2: Use ocfs2_cp_xattr_bucket() in
 ocfs2_mv_xattr_bucket_cross_cluster().

The buffer copy loop of ocfs2_mv_xattr_bucket_cross_cluster() actually
looks a lot like ocfs2_cp_xattr_bucket().  Let's just use that instead.
We also use bucket operations to update the buckets at the start of each
extent.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 169 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 104 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5efcf4e85d7..5be99666f02 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -170,6 +170,11 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+				 handle_t *handle,
+				 u64 s_blkno,
+				 u64 t_blkno,
+				 int t_is_new);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -3526,13 +3531,21 @@ out:
 }
 
 /*
- * Move half nums of the xattr bucket in the previous cluster to this new
- * cluster. We only touch the last cluster of the previous extend record.
+ * prev_blkno points to the start of an existing extent.  new_blkno
+ * points to a newly allocated extent.  Because we know each of our
+ * clusters contains more than bucket, we can easily split one cluster
+ * at a bucket boundary.  So we take the last cluster of the existing
+ * extent and split it down the middle.  We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first_bh is the buffer at prev_blkno so we can update the existing
+ * extent's bucket count.  header_bh is the bucket were we were hoping
+ * to insert our xattr.  If the bucket move places the target in the new
+ * extent, we'll update first_bh and header_bh after modifying the old
+ * extent.
  *
- * first_bh is the first buffer_head of a series of bucket in the same
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
  */
 static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 					       handle_t *handle,
@@ -3545,105 +3558,131 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 {
 	int i, ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
-	int blocksize = inode->i_sb->s_blocksize;
-	struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
-	struct ocfs2_xattr_header *new_xh;
+	int to_move = num_buckets / 2;
+	u64 last_cluster_blkno, src_blkno;
 	struct ocfs2_xattr_header *xh =
 			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
+	struct ocfs2_xattr_bucket *old_first, *new_first;
 
 	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
 
-	prev_bh = *first_bh;
-	get_bh(prev_bh);
-	xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
-
-	prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+	last_cluster_blkno = prev_blkno + ((num_clusters - 1) * bpc);
+	src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
 	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
 
+	/* The first bucket of the original extent */
+	old_first = ocfs2_xattr_bucket_new(inode);
+	/* The first bucket of the new extent */
+	new_first = ocfs2_xattr_bucket_new(inode);
+	if (!old_first || !new_first) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(old_first, prev_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	/*
-	 * We need to update the 1st half of the new cluster and
-	 * 1 more for the update of the 1st bucket of the previous
-	 * extent record.
+	 * We need to update the 1st half of the new extent, and we
+	 * need to update the first bucket of the old extent.
 	 */
-	credits = bpc / 2 + 1 + handle->h_buffer_credits;
+	credits = ((to_move + 1) * blks_per_bucket) + handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, prev_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
-		old_bh = new_bh = NULL;
-		new_bh = sb_getblk(inode->i_sb, new_blkno);
-		if (!new_bh) {
-			ret = -EIO;
+	for (i = 0; i < to_move; i++) {
+		ret = ocfs2_cp_xattr_bucket(inode, handle,
+					    src_blkno + (i * blks_per_bucket),
+					    new_blkno + (i * blks_per_bucket),
+					    1);
+		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
+	}
 
-		ocfs2_set_new_buffer_uptodate(inode, new_bh);
+	/*
+	 * Get the new bucket ready before we dirty anything
+	 * (This actually shouldn't fail, because we already dirtied
+	 * it once in ocfs2_cp_xattr_bucket()).
+	 */
+	ret = ocfs2_read_xattr_bucket(new_first, new_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		ret = ocfs2_journal_access(handle, inode, new_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
-		if (ret < 0) {
-			mlog_errno(ret);
-			brelse(new_bh);
-			goto out;
-		}
+	/* Now update the headers */
+	le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -to_move);
+	ocfs2_xattr_bucket_journal_dirty(handle, old_first);
 
-		ret = ocfs2_read_block(inode, prev_blkno, &old_bh, NULL);
-		if (ret < 0) {
-			mlog_errno(ret);
-			brelse(new_bh);
-			goto out;
-		}
+	bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(to_move);
+	ocfs2_xattr_bucket_journal_dirty(handle, new_first);
 
-		memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+	if (first_hash)
+		*first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
 
-		if (i == 0) {
-			new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
-			new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+	/*
+	 * If the target bucket is anywhere past src_blkno, we moved
+	 * it to the new extent.  We need to update first_bh and header_bh.
+	 */
+	if ((*header_bh)->b_blocknr >= src_blkno) {
+		/* We're done with old_first, so we can re-use it. */
+		ocfs2_xattr_bucket_relse(old_first);
 
-			if (first_hash)
-				*first_hash = le32_to_cpu(
-					new_xh->xh_entries[0].xe_name_hash);
-			new_first_bh = new_bh;
-			get_bh(new_first_bh);
-		}
+		/* Find the block for the new target bucket */
+		src_blkno = new_blkno +
+			((*header_bh)->b_blocknr - src_blkno);
 
-		ocfs2_journal_dirty(handle, new_bh);
+		/*
+		 * This shouldn't fail - the buffers are in the
+		 * journal from ocfs2_cp_xattr_bucket().
+		 */
+		ret = ocfs2_read_xattr_bucket(old_first, src_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-		if (*header_bh == old_bh) {
-			brelse(*header_bh);
-			*header_bh = new_bh;
-			get_bh(*header_bh);
+		brelse(*first_bh);
+		*first_bh = new_first->bu_bhs[0];
+		get_bh(*first_bh);
 
-			brelse(*first_bh);
-			*first_bh = new_first_bh;
-			get_bh(*first_bh);
-		}
-		brelse(new_bh);
-		brelse(old_bh);
+		brelse(*header_bh);
+		*header_bh = old_first->bu_bhs[0];
+		get_bh(*header_bh);
 	}
 
-	le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-
-	ocfs2_journal_dirty(handle, prev_bh);
 out:
-	brelse(prev_bh);
-	brelse(new_first_bh);
+	ocfs2_xattr_bucket_free(new_first);
+	ocfs2_xattr_bucket_free(old_first);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 874d65af1c8b8f6456a934701e6828d3017be029 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 13:02:18 -0800
Subject: ocfs2: Rename ocfs2_cp_xattr_cluster() to ocfs2_mv_xattr_buckets().

ocfs2_cp_xattr_cluster() takes the last cluster of an xattr extent,
copies its buckets to the front of a new extent, and then shrinks the bucket
count of the original extent.  So it's really moving the data, not
copying it.

While we're here, the function doesn't need a buffer_head for the old
extent, just the block number.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5be99666f02..c1f2e069074 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3965,11 +3965,12 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 	/*
 	 * Hey, if we're overwriting t_bucket, what difference does
 	 * ACCESS_CREATE vs ACCESS_WRITE make?  Well, if we allocated a new
-	 * cluster to fill, we came here from ocfs2_cp_xattr_cluster(), and
-	 * it is really new - ACCESS_CREATE is required.  But we also
-	 * might have moved data out of t_bucket before extending back
-	 * into it.  ocfs2_add_new_xattr_bucket() can do this - its call
-	 * to ocfs2_add_new_xattr_cluster() may have created a new extent
+	 * cluster to fill, we came here from
+	 * ocfs2_mv_xattr_buckets(), and it is really new -
+	 * ACCESS_CREATE is required.  But we also might have moved data
+	 * out of t_bucket before extending back into it.
+	 * ocfs2_add_new_xattr_bucket() can do this - its call to
+	 * ocfs2_add_new_xattr_cluster() may have created a new extent
 	 * and copied out the end of the old extent.  Then it re-extends
 	 * the old extent back to create space for new xattrs.  That's
 	 * how we get here, and the bucket isn't really new.
@@ -3992,17 +3993,16 @@ out:
 }
 
 /*
- * src_blk points to the last cluster of an existing extent.  to_blk
- * points to a newly allocated extent.  We copy the cluster over to the
- * new extent, initializing its xh_num_buckets.  The old extent's
- * xh_num_buckets shrinks by the same amount.
+ * src_blk points to the start of an existing extent.  last_blk points to
+ * last cluster in that extent.  to_blk points to a newly allocated
+ * extent.  We copy the buckets from cluster at last_blk to the new extent,
+ * initializing its xh_num_buckets.  The old extent's xh_num_buckets
+ * shrinks by the same amount.
  */
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
+static int ocfs2_mv_xattr_buckets(struct inode *inode,
 				  handle_t *handle,
-				  struct buffer_head *first_bh,
-				  u64 src_blk,
-				  u64 to_blk,
-				  u32 *first_hash)
+				  u64 src_blk, u64 last_blk,
+				  u64 to_blk, u32 *first_hash)
 {
 	int i, ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -4011,8 +4011,8 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
 	struct ocfs2_xattr_bucket *old_first, *new_first;
 
-	mlog(0, "cp xattrs from cluster %llu to %llu\n",
-	     (unsigned long long)src_blk, (unsigned long long)to_blk);
+	mlog(0, "mv xattrs from cluster %llu to %llu\n",
+	     (unsigned long long)last_blk, (unsigned long long)to_blk);
 
 	/* The first bucket of the original extent */
 	old_first = ocfs2_xattr_bucket_new(inode);
@@ -4024,7 +4024,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_xattr_bucket(old_first, first_bh->b_blocknr);
+	ret = ocfs2_read_xattr_bucket(old_first, src_blk);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4050,7 +4050,7 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
 
 	for (i = 0; i < num_buckets; i++) {
 		ret = ocfs2_cp_xattr_bucket(inode, handle,
-					    src_blk + (i * blks_per_bucket),
+					    last_blk + (i * blks_per_bucket),
 					    to_blk + (i * blks_per_bucket),
 					    1);
 		if (ret) {
@@ -4175,8 +4175,10 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
 
 		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
-			ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
-						     last_blk, new_blk,
+			ret = ocfs2_mv_xattr_buckets(inode, handle,
+						     (*first_bh)->b_blocknr,
+						     last_blk,
+						     new_blk,
 						     v_start);
 		else {
 			ret = ocfs2_divide_xattr_cluster(inode, handle,
-- 
cgit v1.2.3


From 54ecb6b6df54bf72befb359b21f3759b2952f9d9 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 13:18:31 -0800
Subject: ocfs2: ocfs2_mv_xattr_buckets() can handle a partial cluster now.

If you look at ocfs2_mv_xattr_bucket_cross_cluster(), you'll notice that
two-thirds of the code is almost identical to ocfs2_mv_xattr_buckets().
The only difference is that ocfs2_mv_xattr_buckets() moves a whole
cluster's worth, while ocfs2_mv_xattr_bucket_cross_cluster() moves half
the cluster.

We change ocfs2_mv_xattr_buckets() to allow moving partial clusters.
The original caller of ocfs2_mv_xattr_buckets() still moves the whole
cluster's worth - it just passes a start_bucket of 0.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c1f2e069074..97340940cee 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3995,18 +3995,19 @@ out:
 /*
  * src_blk points to the start of an existing extent.  last_blk points to
  * last cluster in that extent.  to_blk points to a newly allocated
- * extent.  We copy the buckets from cluster at last_blk to the new extent,
- * initializing its xh_num_buckets.  The old extent's xh_num_buckets
- * shrinks by the same amount.
+ * extent.  We copy the buckets from the cluster at last_blk to the new
+ * extent.  If start_bucket is non-zero, we skip that many buckets before
+ * we start copying.  The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied.  The old extent's xh_num_buckets shrinks
+ * by the same amount.
  */
-static int ocfs2_mv_xattr_buckets(struct inode *inode,
-				  handle_t *handle,
-				  u64 src_blk, u64 last_blk,
-				  u64 to_blk, u32 *first_hash)
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
+				  u32 *first_hash)
 {
 	int i, ret, credits;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
 	struct ocfs2_xattr_bucket *old_first, *new_first;
@@ -4014,6 +4015,12 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode,
 	mlog(0, "mv xattrs from cluster %llu to %llu\n",
 	     (unsigned long long)last_blk, (unsigned long long)to_blk);
 
+	BUG_ON(start_bucket >= num_buckets);
+	if (start_bucket) {
+		num_buckets -= start_bucket;
+		last_blk += (start_bucket * blks_per_bucket);
+	}
+
 	/* The first bucket of the original extent */
 	old_first = ocfs2_xattr_bucket_new(inode);
 	/* The first bucket of the new extent */
@@ -4031,10 +4038,11 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode,
 	}
 
 	/*
-	 * We need to update the first bucket of the old extent and the
-	 * entire first cluster of the new extent.
+	 * We need to update the first bucket of the old extent and all
+	 * the buckets going to the new extent.
 	 */
-	credits = blks_per_bucket + bpc + handle->h_buffer_credits;
+	credits = ((num_buckets + 1) * blks_per_bucket) +
+		handle->h_buffer_credits;
 	ret = ocfs2_extend_trans(handle, credits);
 	if (ret) {
 		mlog_errno(ret);
@@ -4177,8 +4185,7 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
 			ret = ocfs2_mv_xattr_buckets(inode, handle,
 						     (*first_bh)->b_blocknr,
-						     last_blk,
-						     new_blk,
+						     last_blk, new_blk, 0,
 						     v_start);
 		else {
 			ret = ocfs2_divide_xattr_cluster(inode, handle,
-- 
cgit v1.2.3


From c58b6032f93358871361a92d7743dbc85d27084e Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 13:36:24 -0800
Subject: ocfs2: Use ocfs2_mv_xattr_buckets() in
 ocfs2_mv_xattr_bucket_cross_cluster().

Now that ocfs2_mv_xattr_buckets() can move a partial cluster's worth of
buckets, ocfs2_mv_xattr_bucket_cross_cluster() can use it.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 110 +++++++++++++++----------------------------------------
 1 file changed, 29 insertions(+), 81 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 97340940cee..c3189286679 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -170,11 +170,10 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 
 static int ocfs2_delete_xattr_index_block(struct inode *inode,
 					  struct buffer_head *xb_bh);
-static int ocfs2_cp_xattr_bucket(struct inode *inode,
-				 handle_t *handle,
-				 u64 s_blkno,
-				 u64 t_blkno,
-				 int t_is_new);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+				  u64 src_blk, u64 last_blk, u64 to_blk,
+				  unsigned int start_bucket,
+				  u32 *first_hash);
 
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -3556,115 +3555,64 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 					       u32 num_clusters,
 					       u32 *first_hash)
 {
-	int i, ret, credits;
+	int ret;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
 	int to_move = num_buckets / 2;
-	u64 last_cluster_blkno, src_blkno;
+	u64 src_blkno;
+	u64 last_cluster_blkno = prev_blkno +
+		((num_clusters - 1) * ocfs2_clusters_to_blocks(inode->i_sb, 1));
 	struct ocfs2_xattr_header *xh =
 			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
-	struct ocfs2_xattr_bucket *old_first, *new_first;
+	struct ocfs2_xattr_bucket *new_target, *new_first;
 
 	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
 
-	last_cluster_blkno = prev_blkno + ((num_clusters - 1) * bpc);
-	src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
-
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+	     (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
 
-	/* The first bucket of the original extent */
-	old_first = ocfs2_xattr_bucket_new(inode);
 	/* The first bucket of the new extent */
 	new_first = ocfs2_xattr_bucket_new(inode);
-	if (!old_first || !new_first) {
+	/* The target bucket if it was moved to the new extent */
+	new_target = ocfs2_xattr_bucket_new(inode);
+	if (!new_target || !new_first) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	ret = ocfs2_read_xattr_bucket(old_first, prev_blkno);
+	ret = ocfs2_mv_xattr_buckets(inode, handle, prev_blkno,
+				     last_cluster_blkno, new_blkno,
+				     to_move, first_hash);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	/*
-	 * We need to update the 1st half of the new extent, and we
-	 * need to update the first bucket of the old extent.
-	 */
-	credits = ((to_move + 1) * blks_per_bucket) + handle->h_buffer_credits;
-	ret = ocfs2_extend_trans(handle, credits);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	for (i = 0; i < to_move; i++) {
-		ret = ocfs2_cp_xattr_bucket(inode, handle,
-					    src_blkno + (i * blks_per_bucket),
-					    new_blkno + (i * blks_per_bucket),
-					    1);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
-	}
-
-	/*
-	 * Get the new bucket ready before we dirty anything
-	 * (This actually shouldn't fail, because we already dirtied
-	 * it once in ocfs2_cp_xattr_bucket()).
-	 */
-	ret = ocfs2_read_xattr_bucket(new_first, new_blkno);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-	ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	/* Now update the headers */
-	le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -to_move);
-	ocfs2_xattr_bucket_journal_dirty(handle, old_first);
-
-	bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(to_move);
-	ocfs2_xattr_bucket_journal_dirty(handle, new_first);
-
-	if (first_hash)
-		*first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+	/* This is the first bucket that got moved */
+	src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
 
 	/*
-	 * If the target bucket is anywhere past src_blkno, we moved
-	 * it to the new extent.  We need to update first_bh and header_bh.
+	 * If the target bucket was part of the moved buckets, we need to
+	 * update first_bh and header_bh.
 	 */
 	if ((*header_bh)->b_blocknr >= src_blkno) {
-		/* We're done with old_first, so we can re-use it. */
-		ocfs2_xattr_bucket_relse(old_first);
-
 		/* Find the block for the new target bucket */
 		src_blkno = new_blkno +
 			((*header_bh)->b_blocknr - src_blkno);
 
 		/*
-		 * This shouldn't fail - the buffers are in the
+		 * These shouldn't fail - the buffers are in the
 		 * journal from ocfs2_cp_xattr_bucket().
 		 */
-		ret = ocfs2_read_xattr_bucket(old_first, src_blkno);
+		ret = ocfs2_read_xattr_bucket(new_first, new_blkno);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ret = ocfs2_read_xattr_bucket(new_target, src_blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3675,13 +3623,13 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 		get_bh(*first_bh);
 
 		brelse(*header_bh);
-		*header_bh = old_first->bu_bhs[0];
+		*header_bh = new_target->bu_bhs[0];
 		get_bh(*header_bh);
 	}
 
 out:
 	ocfs2_xattr_bucket_free(new_first);
-	ocfs2_xattr_bucket_free(old_first);
+	ocfs2_xattr_bucket_free(new_target);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 92cf3adf48097b7561a3c83f800ed3b2b25b18d4 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 14:12:09 -0800
Subject: ocfs2: Start using buckets in ocfs2_adjust_xattr_cross_cluster().

We want to be passing around buckets instead of buffer_heads.  Let's get
them into ocfs2_adjust_xattr_cross_cluster.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 44 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 37 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c3189286679..975ba3653fe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4111,28 +4111,54 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 					    u32 *v_start,
 					    int *extend)
 {
-	int ret = 0;
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	int ret;
+	struct ocfs2_xattr_bucket *first, *target;
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
 	     (unsigned long long)prev_blk, prev_clusters,
 	     (unsigned long long)new_blk);
 
+	/* The first bucket of the original extent */
+	first = ocfs2_xattr_bucket_new(inode);
+	/* The target bucket for insert */
+	target = ocfs2_xattr_bucket_new(inode);
+	if (!first || !target) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(prev_blk != (*first_bh)->b_blocknr);
+	ret = ocfs2_read_xattr_bucket(first, prev_blk);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_read_xattr_bucket(target, (*header_bh)->b_blocknr);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
 							  handle,
 							  first_bh,
 							  header_bh,
 							  new_blk,
-							  prev_blk,
+							  bucket_blkno(first),
 							  prev_clusters,
 							  v_start);
 	else {
-		u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+		/* The start of the last cluster in the first extent */
+		u64 last_blk = bucket_blkno(first) +
+			((prev_clusters - 1) *
+			 ocfs2_clusters_to_blocks(inode->i_sb, 1));
 
-		if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+		if (prev_clusters > 1 && bucket_blkno(target) != last_blk)
 			ret = ocfs2_mv_xattr_buckets(inode, handle,
-						     (*first_bh)->b_blocknr,
+						     bucket_blkno(first),
 						     last_blk, new_blk, 0,
 						     v_start);
 		else {
@@ -4140,11 +4166,15 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 							 last_blk, new_blk,
 							 v_start);
 
-			if ((*header_bh)->b_blocknr == last_blk && extend)
+			if ((bucket_blkno(target) == last_blk) && extend)
 				*extend = 0;
 		}
 	}
 
+out:
+	ocfs2_xattr_bucket_free(first);
+	ocfs2_xattr_bucket_free(target);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 41cb814866110b6e35dad7569ecf96163c3bb824 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 14:25:21 -0800
Subject: ocfs2: Pass buckets into ocfs2_mv_xattr_bucket_cross_cluster().

Now that ocfs2_adjust_xattr_cross_cluster() has buckets, it can pass
them into ocfs2_mv_xattr_bucket_cross_cluster().  It no longer has to
care about buffer_heads.  The manipulation of first_bh and header_bh
moves up to ocfs2_adjust_xattr_cross_cluster().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 84 +++++++++++++++++++++++++-------------------------------
 1 file changed, 37 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 975ba3653fe..2f16f50ebcb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3548,42 +3548,28 @@ out:
  */
 static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 					       handle_t *handle,
-					       struct buffer_head **first_bh,
-					       struct buffer_head **header_bh,
+					       struct ocfs2_xattr_bucket *first,
+					       struct ocfs2_xattr_bucket *target,
 					       u64 new_blkno,
-					       u64 prev_blkno,
 					       u32 num_clusters,
 					       u32 *first_hash)
 {
 	int ret;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+	struct super_block *sb = inode->i_sb;
+	int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+	int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
 	int to_move = num_buckets / 2;
 	u64 src_blkno;
-	u64 last_cluster_blkno = prev_blkno +
-		((num_clusters - 1) * ocfs2_clusters_to_blocks(inode->i_sb, 1));
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)((*first_bh)->b_data);
-	struct ocfs2_xattr_bucket *new_target, *new_first;
+	u64 last_cluster_blkno = bucket_blkno(first) +
+		((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
 
-	BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
-	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+	BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
 
 	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
 	     (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
 
-	/* The first bucket of the new extent */
-	new_first = ocfs2_xattr_bucket_new(inode);
-	/* The target bucket if it was moved to the new extent */
-	new_target = ocfs2_xattr_bucket_new(inode);
-	if (!new_target || !new_first) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_mv_xattr_buckets(inode, handle, prev_blkno,
+	ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
 				     last_cluster_blkno, new_blkno,
 				     to_move, first_hash);
 	if (ret) {
@@ -3596,41 +3582,32 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 
 	/*
 	 * If the target bucket was part of the moved buckets, we need to
-	 * update first_bh and header_bh.
+	 * update first and target.
 	 */
-	if ((*header_bh)->b_blocknr >= src_blkno) {
+	if (bucket_blkno(target) >= src_blkno) {
 		/* Find the block for the new target bucket */
 		src_blkno = new_blkno +
-			((*header_bh)->b_blocknr - src_blkno);
+			(bucket_blkno(target) - src_blkno);
+
+		ocfs2_xattr_bucket_relse(first);
+		ocfs2_xattr_bucket_relse(target);
 
 		/*
 		 * These shouldn't fail - the buffers are in the
 		 * journal from ocfs2_cp_xattr_bucket().
 		 */
-		ret = ocfs2_read_xattr_bucket(new_first, new_blkno);
+		ret = ocfs2_read_xattr_bucket(first, new_blkno);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
-		ret = ocfs2_read_xattr_bucket(new_target, src_blkno);
-		if (ret) {
+		ret = ocfs2_read_xattr_bucket(target, src_blkno);
+		if (ret)
 			mlog_errno(ret);
-			goto out;
-		}
 
-		brelse(*first_bh);
-		*first_bh = new_first->bu_bhs[0];
-		get_bh(*first_bh);
-
-		brelse(*header_bh);
-		*header_bh = new_target->bu_bhs[0];
-		get_bh(*header_bh);
 	}
 
 out:
-	ocfs2_xattr_bucket_free(new_first);
-	ocfs2_xattr_bucket_free(new_target);
-
 	return ret;
 }
 
@@ -4141,16 +4118,29 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 		goto out;
 	}
 
-	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
 							  handle,
-							  first_bh,
-							  header_bh,
+							  first, target,
 							  new_blk,
-							  bucket_blkno(first),
 							  prev_clusters,
 							  v_start);
-	else {
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/* Did first+target get moved? */
+		if (prev_blk != bucket_blkno(first)) {
+			brelse(*first_bh);
+			*first_bh = first->bu_bhs[0];
+			get_bh(*first_bh);
+
+			brelse(*header_bh);
+			*header_bh = target->bu_bhs[0];
+			get_bh(*header_bh);
+		}
+	} else {
 		/* The start of the last cluster in the first extent */
 		u64 last_blk = bucket_blkno(first) +
 			((prev_clusters - 1) *
-- 
cgit v1.2.3


From 012ee910876e251621705e8dea7c353fd4914e19 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 14:43:31 -0800
Subject: ocfs2: Move buckets up into ocfs2_add_new_xattr_cluster().

Lift the buckets from ocfs2_adjust_xattr_cross_cluster() up into
ocfs2_add_new_xattr_cluster().  Now ocfs2_adjust_xattr_cross_cluster()
doesn't deal with buffer_heads.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 100 +++++++++++++++++++++++++++----------------------------
 1 file changed, 49 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2f16f50ebcb..4b247047b7a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4080,44 +4080,19 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
  */
 static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 					    handle_t *handle,
-					    struct buffer_head **first_bh,
-					    struct buffer_head **header_bh,
+					    struct ocfs2_xattr_bucket *first,
+					    struct ocfs2_xattr_bucket *target,
 					    u64 new_blk,
-					    u64 prev_blk,
 					    u32 prev_clusters,
 					    u32 *v_start,
 					    int *extend)
 {
 	int ret;
-	struct ocfs2_xattr_bucket *first, *target;
 
 	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     (unsigned long long)prev_blk, prev_clusters,
+	     (unsigned long long)bucket_blkno(first), prev_clusters,
 	     (unsigned long long)new_blk);
 
-	/* The first bucket of the original extent */
-	first = ocfs2_xattr_bucket_new(inode);
-	/* The target bucket for insert */
-	target = ocfs2_xattr_bucket_new(inode);
-	if (!first || !target) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	BUG_ON(prev_blk != (*first_bh)->b_blocknr);
-	ret = ocfs2_read_xattr_bucket(first, prev_blk);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_read_xattr_bucket(target, (*header_bh)->b_blocknr);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
 							  handle,
@@ -4125,46 +4100,33 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 							  new_blk,
 							  prev_clusters,
 							  v_start);
-		if (ret) {
+		if (ret)
 			mlog_errno(ret);
-			goto out;
-		}
-
-		/* Did first+target get moved? */
-		if (prev_blk != bucket_blkno(first)) {
-			brelse(*first_bh);
-			*first_bh = first->bu_bhs[0];
-			get_bh(*first_bh);
-
-			brelse(*header_bh);
-			*header_bh = target->bu_bhs[0];
-			get_bh(*header_bh);
-		}
 	} else {
 		/* The start of the last cluster in the first extent */
 		u64 last_blk = bucket_blkno(first) +
 			((prev_clusters - 1) *
 			 ocfs2_clusters_to_blocks(inode->i_sb, 1));
 
-		if (prev_clusters > 1 && bucket_blkno(target) != last_blk)
+		if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
 			ret = ocfs2_mv_xattr_buckets(inode, handle,
 						     bucket_blkno(first),
 						     last_blk, new_blk, 0,
 						     v_start);
-		else {
+			if (ret)
+				mlog_errno(ret);
+		} else {
 			ret = ocfs2_divide_xattr_cluster(inode, handle,
 							 last_blk, new_blk,
 							 v_start);
+			if (ret)
+				mlog_errno(ret);
 
 			if ((bucket_blkno(target) == last_blk) && extend)
 				*extend = 0;
 		}
 	}
 
-out:
-	ocfs2_xattr_bucket_free(first);
-	ocfs2_xattr_bucket_free(target);
-
 	return ret;
 }
 
@@ -4202,6 +4164,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
+	struct ocfs2_xattr_bucket *first, *target;
 
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
@@ -4210,6 +4173,29 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
+	/* The first bucket of the original extent */
+	first = ocfs2_xattr_bucket_new(inode);
+	/* The target bucket for insert */
+	target = ocfs2_xattr_bucket_new(inode);
+	if (!first || !target) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	BUG_ON(prev_blkno != (*first_bh)->b_blocknr);
+	ret = ocfs2_read_xattr_bucket(first, prev_blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
+	ret = ocfs2_read_xattr_bucket(target, (*header_bh)->b_blocknr);
+	if (ret) {
+		mlog_errno(ret);
+		goto leave;
+	}
+
 	ret = ocfs2_journal_access(handle, inode, root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
@@ -4250,10 +4236,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	} else {
 		ret = ocfs2_adjust_xattr_cross_cluster(inode,
 						       handle,
-						       first_bh,
-						       header_bh,
+						       first,
+						       target,
 						       block,
-						       prev_blkno,
 						       prev_clusters,
 						       &v_start,
 						       extend);
@@ -4261,6 +4246,17 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 			mlog_errno(ret);
 			goto leave;
 		}
+
+		/* Did first+target get moved? */
+		if (prev_blkno != bucket_blkno(first)) {
+			brelse(*first_bh);
+			*first_bh = first->bu_bhs[0];
+			get_bh(*first_bh);
+
+			brelse(*header_bh);
+			*header_bh = target->bu_bhs[0];
+			get_bh(*header_bh);
+		}
 	}
 
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
@@ -4277,6 +4273,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		mlog_errno(ret);
 
 leave:
+	ocfs2_xattr_bucket_free(first);
+	ocfs2_xattr_bucket_free(target);
 	return ret;
 }
 
-- 
cgit v1.2.3


From ed29c0ca14871021fc8aced74650648dcb2c6e81 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 15:08:44 -0800
Subject: ocfs2: Move buckets up into ocfs2_add_new_xattr_bucket().

Lift the buckets from ocfs2_add_new_xattr_cluster() up into
ocfs2_add_new_xattr_bucket().  Now ocfs2_add_new_xattr_cluster()
doesn't deal with buffer_heads.  In fact, we no longer have to play
get_bh() tricks at all.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 105 +++++++++++++++++--------------------------------------
 1 file changed, 32 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4b247047b7a..5a5a1bd7eed 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4148,11 +4148,10 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
  */
 static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 				       struct buffer_head *root_bh,
-				       struct buffer_head **first_bh,
-				       struct buffer_head **header_bh,
+				       struct ocfs2_xattr_bucket *first,
+				       struct ocfs2_xattr_bucket *target,
 				       u32 *num_clusters,
 				       u32 prev_cpos,
-				       u64 prev_blkno,
 				       int *extend,
 				       struct ocfs2_xattr_set_ctxt *ctxt)
 {
@@ -4164,38 +4163,14 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
-	struct ocfs2_xattr_bucket *first, *target;
 
 	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
 	     "previous xattr blkno = %llu\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, (unsigned long long)prev_blkno);
+	     prev_cpos, (unsigned long long)bucket_blkno(first));
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
-	/* The first bucket of the original extent */
-	first = ocfs2_xattr_bucket_new(inode);
-	/* The target bucket for insert */
-	target = ocfs2_xattr_bucket_new(inode);
-	if (!first || !target) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto leave;
-	}
-
-	BUG_ON(prev_blkno != (*first_bh)->b_blocknr);
-	ret = ocfs2_read_xattr_bucket(first, prev_blkno);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
-
-	ret = ocfs2_read_xattr_bucket(target, (*header_bh)->b_blocknr);
-	if (ret) {
-		mlog_errno(ret);
-		goto leave;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, root_bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
@@ -4217,7 +4192,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	if (prev_blkno + prev_clusters * bpc == block &&
+	if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
 	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
 	     OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
 		/*
@@ -4246,17 +4221,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 			mlog_errno(ret);
 			goto leave;
 		}
-
-		/* Did first+target get moved? */
-		if (prev_blkno != bucket_blkno(first)) {
-			brelse(*first_bh);
-			*first_bh = first->bu_bhs[0];
-			get_bh(*first_bh);
-
-			brelse(*header_bh);
-			*header_bh = target->bu_bhs[0];
-			get_bh(*header_bh);
-		}
 	}
 
 	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
@@ -4273,8 +4237,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		mlog_errno(ret);
 
 leave:
-	ocfs2_xattr_bucket_free(first);
-	ocfs2_xattr_bucket_free(target);
 	return ret;
 }
 
@@ -4357,16 +4319,16 @@ out:
  * We will move all the buckets starting from header_bh to the next place. As
  * for this one, half num of its xattrs will be moved to the next one.
  *
- * We will allocate a new cluster if current cluster is full and adjust
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * We will allocate a new cluster if current cluster is full.  The
+ * underlying calls will make sure that there is space at the target
+ * bucket, shifting buckets around if necessary.  'target' may be updated
+ * by those calls.
  */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 				      struct buffer_head *xb_bh,
 				      struct buffer_head *header_bh,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	struct ocfs2_xattr_header *first_xh = NULL;
-	struct buffer_head *first_bh = NULL;
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
@@ -4374,31 +4336,26 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 	struct ocfs2_xattr_header *xh =
 			(struct ocfs2_xattr_header *)header_bh->b_data;
 	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_super *osb = OCFS2_SB(sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int ret, num_buckets, extend = 1;
 	u64 p_blkno;
 	u32 e_cpos, num_clusters;
 	/* The bucket at the front of the extent */
-	struct ocfs2_xattr_bucket *first;
+	struct ocfs2_xattr_bucket *first, *target;
 
 	mlog(0, "Add new xattr bucket starting form %llu\n",
 	     (unsigned long long)header_bh->b_blocknr);
 
+	/* The first bucket of the original extent */
 	first = ocfs2_xattr_bucket_new(inode);
-	if (!first) {
+	/* The target bucket for insert */
+	target = ocfs2_xattr_bucket_new(inode);
+	if (!first || !target) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	/*
-	 * Add refrence for header_bh here because it may be
-	 * changed in ocfs2_add_new_xattr_cluster and we need
-	 * to free it in the end.
-	 */
-	get_bh(header_bh);
-
 	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
 				  &num_clusters, el);
 	if (ret) {
@@ -4406,23 +4363,30 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_block(inode, p_blkno, &first_bh, NULL);
+	ret = ocfs2_read_xattr_bucket(first, p_blkno);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
-	first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+	ret = ocfs2_read_xattr_bucket(target, header_bh->b_blocknr);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-	if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+	if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+		/*
+		 * This can move first+target if the target bucket moves
+		 * to the new extent.
+		 */
 		ret = ocfs2_add_new_xattr_cluster(inode,
 						  xb_bh,
-						  &first_bh,
-						  &header_bh,
+						  first,
+						  target,
 						  &num_clusters,
 						  e_cpos,
-						  p_blkno,
 						  &extend,
 						  ctxt);
 		if (ret) {
@@ -4432,24 +4396,19 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 	}
 
 	if (extend) {
-		/* These bucket reads should be cached */
-		ret = ocfs2_read_xattr_bucket(first, first_bh->b_blocknr);
-		if (ret) {
-			mlog_errno(ret);
-			goto out;
-		}
 		ret = ocfs2_extend_xattr_bucket(inode,
 						ctxt->handle,
-						first, header_bh->b_blocknr,
+						first,
+						bucket_blkno(target),
 						num_clusters);
 		if (ret)
 			mlog_errno(ret);
 	}
 
 out:
-	brelse(first_bh);
-	brelse(header_bh);
 	ocfs2_xattr_bucket_free(first);
+	ocfs2_xattr_bucket_free(target);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 91f2033fa997aa92607470ed1ef90685b9d77a8c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 26 Nov 2008 15:25:41 -0800
Subject: ocfs2: Pass xs->bucket into ocfs2_add_new_xattr_bucket().

Pass the actual target bucket for insert through to
ocfs2_add_new_xattr_bucket().  Now growing a bucket has no buffer_head
knowledge.

ocfs2_add_new_xattr_bucket() leaves xs->bucket in the proper state for
insert.  However, it doesn't update the rest of the search fields in xs,
so we still have to relse() and re-find.  That's OK, because everything
is cached.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 5a5a1bd7eed..dfc51c305bb 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4314,43 +4314,42 @@ out:
 }
 
 /*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
- * xb_bh is the ocfs2_xattr_block.
- * We will move all the buckets starting from header_bh to the next place. As
- * for this one, half num of its xattrs will be moved to the next one.
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly.  xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
  *
- * We will allocate a new cluster if current cluster is full.  The
- * underlying calls will make sure that there is space at the target
- * bucket, shifting buckets around if necessary.  'target' may be updated
- * by those calls.
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
+ *
+ * If current cluster is full, we'll allocate a new one.  This may not
+ * be contiguous.  The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
  */
 static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 				      struct buffer_head *xb_bh,
-				      struct buffer_head *header_bh,
+				      struct ocfs2_xattr_bucket *target,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	struct ocfs2_xattr_block *xb =
 			(struct ocfs2_xattr_block *)xb_bh->b_data;
 	struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
 	struct ocfs2_extent_list *el = &xb_root->xt_list;
-	struct ocfs2_xattr_header *xh =
-			(struct ocfs2_xattr_header *)header_bh->b_data;
-	u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+	u32 name_hash =
+		le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int ret, num_buckets, extend = 1;
 	u64 p_blkno;
 	u32 e_cpos, num_clusters;
 	/* The bucket at the front of the extent */
-	struct ocfs2_xattr_bucket *first, *target;
+	struct ocfs2_xattr_bucket *first;
 
-	mlog(0, "Add new xattr bucket starting form %llu\n",
-	     (unsigned long long)header_bh->b_blocknr);
+	mlog(0, "Add new xattr bucket starting from %llu\n",
+	     (unsigned long long)bucket_blkno(target));
 
 	/* The first bucket of the original extent */
 	first = ocfs2_xattr_bucket_new(inode);
-	/* The target bucket for insert */
-	target = ocfs2_xattr_bucket_new(inode);
-	if (!first || !target) {
+	if (!first) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -4369,12 +4368,6 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_read_xattr_bucket(target, header_bh->b_blocknr);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
 	if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
 		/*
@@ -4407,7 +4400,6 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 
 out:
 	ocfs2_xattr_bucket_free(first);
-	ocfs2_xattr_bucket_free(target);
 
 	return ret;
 }
@@ -5083,15 +5075,21 @@ try_again:
 
 		ret = ocfs2_add_new_xattr_bucket(inode,
 						 xs->xattr_bh,
-						 xs->bucket->bu_bhs[0],
+						 xs->bucket,
 						 ctxt);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
+		/*
+		 * ocfs2_add_new_xattr_bucket() will have updated
+		 * xs->bucket if it moved, but it will not have updated
+		 * any of the other search fields.  Thus, we drop it and
+		 * re-search.  Everything should be cached, so it'll be
+		 * quick.
+		 */
 		ocfs2_xattr_bucket_relse(xs->bucket);
-
 		ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
 						   xi->name_index,
 						   xi->name, xs);
-- 
cgit v1.2.3


From 754938c142ae0c28360426c43f965ddc5164b21e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 15 Dec 2008 06:03:41 +0800
Subject: ocfs2/quota: Add QUOTA in mlog_attribute.

A new mlog mask has to be added into mlog_attribute before it can
be really used in mlog. ML_QUOTA is only added in masklog.h, so
add it to the array to enable it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/cluster/masklog.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef..96df5416993 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(QUORUM),
 	define_mask(EXPORT),
 	define_mask(XATTR),
+	define_mask(QUOTA),
 	define_mask(ERROR),
 	define_mask(NOTICE),
 	define_mask(KTHREAD),
-- 
cgit v1.2.3


From e06c8227fd94ec181849ba206bf032be31c4295c Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 11 Sep 2008 15:35:47 -0700
Subject: jbd2: Add buffer triggers

Filesystems often need to do a compute-intensive operation on some
metadata.  If this operation is repeated many times, it can be very
expensive.  It would be much nicer if the operation could be performed
once before a buffer goes to disk.

This adds triggers to jbd2 buffer heads.  Just before writing a metadata
buffer to the journal, jbd2 will optionally call a commit trigger associated
with the buffer.  If the journal is aborted, an abort trigger will be
called on any dirty buffers as they are dropped from pending
transactions.

ocfs2 will use this feature.

Initially I tried to come up with a more generic trigger that could be
used for non-buffer-related events like transaction completion.  It
doesn't tie nicely, because the information a buffer trigger needs
(specific to a journal_head) isn't the same as what a transaction
trigger needs (specific to a transaction_t or perhaps journal_t).  So I
implemented a buffer set, with the understanding that
journal/transaction wide triggers should be implemented separately.

There is only one trigger set allowed per buffer.  I can't think of any
reason to attach more than one set.  Contrast this with a journal or
transaction in which multiple places may want to watch the entire
transaction separately.

The trigger sets are considered static allocation from the jbd2
perspective.  ocfs2 will just have one trigger set per block type,
setting the same set on every bh of the same type.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/jbd2/commit.c      |  9 +++++++++
 fs/jbd2/journal.c     | 19 +++++++++++++++++++
 fs/jbd2/transaction.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a..c8a1bace685 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -509,6 +509,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		if (is_journal_aborted(journal)) {
 			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
+			jbd2_buffer_abort_trigger(jh,
+						  jh->b_frozen_data ?
+						  jh->b_frozen_triggers :
+						  jh->b_triggers);
 			jbd2_journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
 			 * any descriptor buffers which may have been
@@ -844,6 +848,9 @@ restart_loop:
 		 * data.
 		 *
 		 * Otherwise, we can just throw away the frozen data now.
+		 *
+		 * We also know that the frozen data has already fired
+		 * its triggers if they exist, so we can clear that too.
 		 */
 		if (jh->b_committed_data) {
 			jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +858,12 @@ restart_loop:
 			if (jh->b_frozen_data) {
 				jh->b_committed_data = jh->b_frozen_data;
 				jh->b_frozen_data = NULL;
+				jh->b_frozen_triggers = NULL;
 			}
 		} else if (jh->b_frozen_data) {
 			jbd2_free(jh->b_frozen_data, bh->b_size);
 			jh->b_frozen_data = NULL;
+			jh->b_frozen_triggers = NULL;
 		}
 
 		spin_lock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f..f6bff9d6f8d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,6 +50,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
 EXPORT_SYMBOL(jbd2_journal_get_write_access);
 EXPORT_SYMBOL(jbd2_journal_get_create_access);
 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
 EXPORT_SYMBOL(jbd2_journal_release_buffer);
 EXPORT_SYMBOL(jbd2_journal_forget);
@@ -290,6 +291,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
+	struct jbd2_buffer_trigger_type *triggers;
 
 	/*
 	 * The buffer really shouldn't be locked: only the current committing
@@ -314,12 +316,22 @@ repeat:
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
+		triggers = jh_in->b_frozen_triggers;
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+		triggers = jh_in->b_triggers;
 	}
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
+	/*
+	 * Fire any commit trigger.  Do this before checking for escaping,
+	 * as the trigger may modify the magic offset.  If a copy-out
+	 * happens afterwards, it will have the correct data in the buffer.
+	 */
+	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+				   triggers);
+
 	/*
 	 * Check for escaping
 	 */
@@ -352,6 +364,13 @@ repeat:
 		new_page = virt_to_page(tmp);
 		new_offset = offset_in_page(tmp);
 		done_copy_out = 1;
+
+		/*
+		 * This isn't strictly necessary, as we're using frozen
+		 * data for the escaping, but it keeps consistency with
+		 * b_frozen_data usage.
+		 */
+		jh_in->b_frozen_triggers = jh_in->b_triggers;
 	}
 
 	/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599..4f925a4f3d0 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -741,6 +741,12 @@ done:
 		source = kmap_atomic(page, KM_USER0);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
+
+		/*
+		 * Now that the frozen data is saved off, we need to store
+		 * any matching triggers.
+		 */
+		jh->b_frozen_triggers = jh->b_triggers;
 	}
 	jbd_unlock_bh_state(bh);
 
@@ -943,6 +949,47 @@ out:
 	return err;
 }
 
+/**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head.  This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+			       struct jbd2_buffer_trigger_type *type)
+{
+	struct journal_head *jh = bh2jh(bh);
+
+	jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+				struct jbd2_buffer_trigger_type *triggers)
+{
+	struct buffer_head *bh = jh2bh(jh);
+
+	if (!triggers || !triggers->t_commit)
+		return;
+
+	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+			       struct jbd2_buffer_trigger_type *triggers)
+{
+	if (!triggers || !triggers->t_abort)
+		return;
+
+	triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
 /**
  * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
  * @handle: transaction to add buffer to.
-- 
cgit v1.2.3


From ab552d54673f262d7f70014003d3928d29270f22 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 16 Oct 2008 17:50:30 -0700
Subject: ocfs2: Add the on-disk structures for metadata checksums.

Define struct ocfs2_block_check, an 8-byte structure containing a 32bit
crc32_le and a 16bit hamming code ecc.  This will be used for metadata
checksums.  Add the structure to free spaces in the various metadata
structures.

Add the OCFS2_FEATURE_INCOMPAT_META_ECC bit.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 359732e18e8..290fa26fba6 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -149,6 +149,9 @@
 /* Support for extended attributes */
 #define OCFS2_FEATURE_INCOMPAT_XATTR		0x0200
 
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC		0x0800
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -426,6 +429,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
  */
 #define OCFS2_RAW_SB(dinode)		(&((dinode)->id2.i_super))
 
+/*
+ * Block checking structure.  This is used in metadata to validate the
+ * contents.  If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/	__le32 bc_crc32e;	/* 802.3 Ethernet II CRC32 */
+	__le16 bc_ecc;		/* Single-error-correction parity vector.
+				   This is a simple Hamming code dependent
+				   on the blocksize.  OCFS2's maximum
+				   blocksize, 4K, requires 16 parity bits,
+				   so we fit in __le16. */
+	__le16 bc_reserved1;
+/*08*/
+};
+
 /*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
@@ -513,7 +532,7 @@ struct ocfs2_truncate_log {
 struct ocfs2_extent_block
 {
 /*00*/	__u8 h_signature[8];		/* Signature for verification */
-	__le64 h_reserved1;
+	struct ocfs2_block_check h_check;	/* Error checking */
 /*10*/	__le16 h_suballoc_slot;		/* Slot suballocator this
 					   extent_header belongs to */
 	__le16 h_suballoc_bit;		/* Bit offset in suballocator
@@ -683,7 +702,8 @@ struct ocfs2_dinode {
 					   was set in i_flags */
 	__le16 i_dyn_features;
 	__le64 i_xattr_loc;
-/*80*/	__le64 i_reserved2[7];
+/*80*/	struct ocfs2_block_check i_check;	/* Error checking */
+/*88*/	__le64 i_reserved2[6];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
 					   64bit union */
@@ -750,7 +770,8 @@ struct ocfs2_group_desc
 /*20*/	__le64   bg_parent_dinode;       /* dinode which owns me, in
 					   blocks */
 	__le64   bg_blkno;               /* Offset on disk, in blocks */
-/*30*/	__le64   bg_reserved2[2];
+/*30*/	struct ocfs2_block_check bg_check;	/* Error checking */
+	__le64   bg_reserved2;
 /*40*/	__u8    bg_bitmap[0];
 };
 
@@ -793,7 +814,12 @@ struct ocfs2_xattr_header {
 						   in this extent record,
 						   only valid in the first
 						   bucket. */
-	__le64  xh_csum;
+	struct ocfs2_block_check xh_check;	/* Error checking
+						   (Note, this is only
+						    used for xattr
+						    buckets.  A block uses
+						    xb_check and sets
+						    this field to zero.) */
 	struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
 };
 
@@ -844,7 +870,7 @@ struct ocfs2_xattr_block {
 					block group */
 	__le32	xb_fs_generation;    /* Must match super block */
 /*10*/	__le64	xb_blkno;            /* Offset on disk, in blocks */
-	__le64	xb_csum;
+	struct ocfs2_block_check xb_check;	/* Error checking */
 /*20*/	__le16	xb_flags;            /* Indicates whether this block contains
 					real xattr or a xattr tree. */
 	__le16	xb_reserved0;
@@ -988,6 +1014,25 @@ struct ocfs2_local_disk_dqblk {
 /*10*/	__le64 dqb_inodemod;	/* Change in the amount of used inodes */
 };
 
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer {
+/*00*/	struct ocfs2_block_check dq_check;	/* Error checking */
+/*08*/	/* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+								 void *buf)
+{
+	char *ptr = buf;
+	ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return (struct ocfs2_disk_dqtrailer *)ptr;
+}
+
 #ifdef __KERNEL__
 static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 {
-- 
cgit v1.2.3


From 70ad1ba7b48364d758a112df0823edc5ca6632aa Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 16 Oct 2008 17:54:25 -0700
Subject: ocfs2: Add the underlying blockcheck code.

This is the code that computes crc32 and ecc for ocfs2 metadata blocks.
There are high-level functions that check whether the filesystem has the
ecc feature, mid-level functions that work on a single block or array of
buffer_heads, and the low-level ecc hamming code that can handle
multiple buffers like crc32_le().

It's not hooked up to the filesystem yet.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/Makefile     |   1 +
 fs/ocfs2/blockcheck.c | 480 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/blockcheck.h |  82 +++++++++
 fs/ocfs2/ocfs2.h      |   8 +
 4 files changed, 571 insertions(+)
 create mode 100644 fs/ocfs2/blockcheck.c
 create mode 100644 fs/ocfs2/blockcheck.h

(limited to 'fs')

diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 7e4b361b755..01596079dd6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
 ocfs2-objs := \
 	alloc.o 		\
 	aops.o 			\
+	blockcheck.o		\
 	buffer_head_io.o	\
 	dcache.o 		\
 	dir.o 			\
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 00000000000..2bf3d7f61ae
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,480 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+static int calc_parity_bits(unsigned int d)
+{
+	unsigned int p;
+
+	/*
+	 * Bits required for Single Error Correction is as follows:
+	 *
+	 * d + p + 1 <= 2^p
+	 *
+	 * We're restricting ourselves to 31 bits of parity, that should be
+	 * sufficient.
+	 */
+	for (p = 1; p < 32; p++)
+	{
+		if ((d + p + 1) <= (1 << p))
+			return p;
+	}
+
+	return 0;
+}
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer.  Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based.  This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example.  Take the first data bit (0-based bit 0, 1-based bit 1).
+ * Code bit 1 is a power of two (2^0), so it's a parity bit.  Code bit 2 is
+ * a power of two (2^1), so it's a parity bit.  3 is not a power of two, so
+ * the first data bit ends up as bit 3 in the code buffer.
+ */
+static unsigned int calc_code_bit(unsigned int i)
+{
+	unsigned int b, p;
+
+	/*
+	 * Data bits are 0-based, but we're talking code bits, which
+	 * are 1-based.
+	 */
+	b = i + 1;
+
+	/*
+	 * For every power of two below our bit number, bump our bit.
+	 *
+	 * We compare with (b + 1) because we have to compare with what b
+	 * would be _if_ it were bumped up by the parity bit.  Capice?
+	 */
+	for (p = 0; (1 << p) < (b + 1); p++)
+		b++;
+
+	return b;
+}
+
+/*
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+	unsigned int p = calc_parity_bits(nr + d);
+	unsigned int i, j, b;
+
+	BUG_ON(!p);
+
+	/*
+	 * b is the hamming code bit number.  Hamming code specifies a
+	 * 1-based array, but C uses 0-based.  So 'i' is for C, and 'b' is
+	 * for the algorithm.
+	 *
+	 * The i++ in the for loop is so that the start offset passed
+	 * to ocfs2_find_next_bit() is one greater than the previously
+	 * found bit.
+	 */
+	for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+	{
+		/*
+		 * i is the offset in this hunk, nr + i is the total bit
+		 * offset.
+		 */
+		b = calc_code_bit(nr + i);
+
+		for (j = 0; j < p; j++)
+		{
+			/*
+			 * Data bits in the resultant code are checked by
+			 * parity bits that are part of the bit number
+			 * representation.  Huh?
+			 *
+			 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+			 * In other words, the parity bit at position 2^k
+			 * checks bits in positions having bit k set in
+			 * their binary representation.  Conversely, for
+			 * instance, bit 13, i.e. 1101(2), is checked by
+			 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+			 * </wikipedia>
+			 *
+			 * Note that 'k' is the _code_ bit number.  'b' in
+			 * our loop.
+			 */
+			if (b & (1 << j))
+				parity ^= (1 << j);
+		}
+	}
+
+	/* While the data buffer was treated as little endian, the
+	 * return value is in host endian. */
+	return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+	return ocfs2_hamming_encode(0, data, blocksize * 8, 0);
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If the bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix)
+{
+	unsigned int p = calc_parity_bits(nr + d);
+	unsigned int i, b;
+
+	BUG_ON(!p);
+
+	/*
+	 * If the bit to fix has an hweight of 1, it's a parity bit.  One
+	 * busted parity bit is its own error.  Nothing to do here.
+	 */
+	if (hweight32(fix) == 1)
+		return;
+
+	/*
+	 * nr + d is the bit right past the data hunk we're looking at.
+	 * If fix is at or after that, nothing to do.
+	 */
+	if (fix >= calc_code_bit(nr + d))
+		return;
+
+	/*
+	 * nr is the offset in the data hunk we're starting at.  Let's
+	 * start b at the offset in the code buffer.  See
+	 * ocfs2_hamming_encode() for a more detailed description of 'b'.
+	 */
+	b = calc_code_bit(nr);
+	/* If the fix is before this hunk, nothing to do */
+	if (fix < b)
+		return;
+
+	for (i = 0; i < d; i++, b++)
+	{
+		/* Skip past parity bits */
+		while (hweight32(b) == 1)
+			b++;
+
+		/*
+		 * i is the offset in this data hunk.
+		 * nr + i is the offset in the total data buffer.
+		 * b is the offset in the total code buffer.
+		 *
+		 * Thus, when b == fix, bit i in the current hunk needs
+		 * fixing.
+		 */
+		if (b == fix)
+		{
+			if (ocfs2_test_bit(i, data))
+				ocfs2_clear_bit(i, data);
+			else
+				ocfs2_set_bit(i, data);
+			break;
+		}
+	}
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+			     unsigned int fix)
+{
+	ocfs2_hamming_fix(data, blocksize * 8, 0, fix);
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	u32 crc;
+	u32 ecc;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	crc = crc32_le(~0, data, blocksize);
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information.  Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	/* Ok, try ECC fixups */
+	ecc = ocfs2_hamming_encode_block(data, blocksize);
+	ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+	/* And check the crc32 again */
+	crc = crc32_le(~0, data, blocksize);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked.  bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside one of the bhs, as the function will
+ * take care of zeroing it before calculating the check information.  If
+ * bc does not point inside the bhs, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffers must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i;
+	u32 crc, ecc;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return;
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+
+	/*
+	 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+	 * larger than 16 bits.
+	 */
+	BUG_ON(ecc > USHORT_MAX);
+
+	bc->bc_crc32e = cpu_to_le32(crc);
+	bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads.  Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes.  If bc is not a pointer
+ * inside one of the bhs, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc)
+{
+	int i, rc = 0;
+	struct ocfs2_block_check check;
+	u32 crc, ecc, fix;
+
+	BUG_ON(nr < 0);
+
+	if (!nr)
+		return 0;
+
+	check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+	check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+	memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+	/* Fast path - if the crc32 validates, we're good to go */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	/* Ok, try ECC fixups */
+	for (i = 0, ecc = 0; i < nr; i++) {
+		/*
+		 * The number of bits in a buffer is obviously b_size*8.
+		 * The offset of this buffer is b_size*i, so the bit offset
+		 * of this buffer is b_size*8*i.
+		 */
+		ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+						bhs[i]->b_size * 8,
+						bhs[i]->b_size * 8 * i);
+	}
+	fix = ecc ^ check.bc_ecc;
+	for (i = 0; i < nr; i++) {
+		/*
+		 * Try the fix against each buffer.  It will only affect
+		 * one of them.
+		 */
+		ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+				  bhs[i]->b_size * 8 * i, fix);
+	}
+
+	/* And check the crc32 again */
+	for (i = 0, crc = ~0; i < nr; i++)
+		crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+	if (crc == check.bc_crc32e)
+		goto out;
+
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+	rc = -EIO;
+
+out:
+	bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+	bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+	return rc;
+}
+
+/*
+ * These are the main API.  They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+
+	return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc)
+{
+	int rc = 0;
+
+	if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+		rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+
+	return rc;
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 00000000000..70ec3feda32
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+			    struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+				struct buffer_head **bhs, int nr,
+				struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+			       struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+				   struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function.  It can be called across
+ * multiple hunks just like the crc32 code.  'd' is the number of bits
+ * _in_this_hunk_.  nr is the bit offset of this hunk.  So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+			 unsigned int nr);
+/*
+ * Fix a buffer with a bit error.  The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks.  nr is the bit
+ * offset of the current hunk.  If the bit to be fixed is not part of
+ * the current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+		       unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+				    unsigned int fix);
+#endif
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 5c777988042..2bb389fe739 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -382,6 +382,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
 	return 0;
 }
 
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -615,5 +622,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
 #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
 #endif  /* OCFS2_H */
 
-- 
cgit v1.2.3


From 684ef278377725d505aa23259ee673dab9b11851 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 2 Dec 2008 17:44:05 -0800
Subject: ocfs2: Add a validation hook for quota block reads.

Add a currently-returns-success hook for quota block reads.  We'll be
adding checks to this.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index a10faebe88a..7dbcfd7f65e 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -87,13 +87,25 @@ struct qtree_fmt_operations ocfs2_global_ops = {
 	.is_id = ocfs2_global_is_id,
 };
 
+static int ocfs2_validate_quota_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct ocfs2_disk_dqtrailer *dqt = ocfs2_dq_trailer(sb, bh->b_data);
+
+	mlog(0, "Validating quota block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	return 0;
+}
+
 int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
 			   struct buffer_head **bh)
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
 
-	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, NULL);
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0,
+				    ocfs2_validate_quota_block);
 	if (rc)
 		mlog_errno(rc);
 
-- 
cgit v1.2.3


From d6b32bbb3eae3fb787f1c33bf9f767ca1ddeb208 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 17 Oct 2008 14:55:01 -0700
Subject: ocfs2: block read meta ecc.

Add block check calls to the read_block validate functions.  This is
almost all of the read-side checking of metaecc.  xattr buckets are not checked
yet.  Writes are also unchecked, and so a read-write mount will quickly fail.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c        | 17 +++++++++++++++++
 fs/ocfs2/blockcheck.c   |  9 +++++++++
 fs/ocfs2/inode.c        | 18 +++++++++++++++++-
 fs/ocfs2/quota_global.c | 13 +++++++++++--
 fs/ocfs2/suballoc.c     | 31 ++++++++++++++++++++++++++++++-
 fs/ocfs2/xattr.c        | 17 +++++++++++++++++
 6 files changed, 101 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 84a7bd4db5d..6b27f74bb34 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -37,6 +37,7 @@
 
 #include "alloc.h"
 #include "aops.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -682,12 +683,28 @@ struct ocfs2_merge_ctxt {
 static int ocfs2_validate_extent_block(struct super_block *sb,
 				       struct buffer_head *bh)
 {
+	int rc;
 	struct ocfs2_extent_block *eb =
 		(struct ocfs2_extent_block *)bh->b_data;
 
 	mlog(0, "Validating extent block %llu\n",
 	     (unsigned long long)bh->b_blocknr);
 
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
 		ocfs2_error(sb,
 			    "Extent block #%llu has bad signature %.*s",
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2bf3d7f61ae..2ce6ae5e4b8 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -24,6 +24,8 @@
 #include <linux/bitops.h>
 #include <asm/byteorder.h>
 
+#include <cluster/masklog.h>
+
 #include "ocfs2.h"
 
 #include "blockcheck.h"
@@ -292,6 +294,10 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	mlog(ML_ERROR,
+	     "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
 	/* Ok, try ECC fixups */
 	ecc = ocfs2_hamming_encode_block(data, blocksize);
 	ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
@@ -301,6 +307,9 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
 	if (crc == check.bc_crc32e)
 		goto out;
 
+	mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+	     (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
 	rc = -EIO;
 
 out:
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 288512c9dbc..9370b652ab9 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -38,6 +38,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
@@ -1262,7 +1263,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 int ocfs2_validate_inode_block(struct super_block *sb,
 			       struct buffer_head *bh)
 {
-	int rc = -EINVAL;
+	int rc;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
 	mlog(0, "Validating dinode %llu\n",
@@ -1270,6 +1271,21 @@ int ocfs2_validate_inode_block(struct super_block *sb,
 
 	BUG_ON(!buffer_uptodate(bh));
 
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+	if (rc)
+		goto bail;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
+	rc = -EINVAL;
+
 	if (!OCFS2_IS_VALID_DINODE(di)) {
 		ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
 			    (unsigned long long)bh->b_blocknr, 7,
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7dbcfd7f65e..a0b8b14cca8 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -16,6 +16,7 @@
 #include "ocfs2_fs.h"
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "inode.h"
 #include "journal.h"
 #include "file.h"
@@ -90,12 +91,20 @@ struct qtree_fmt_operations ocfs2_global_ops = {
 static int ocfs2_validate_quota_block(struct super_block *sb,
 				      struct buffer_head *bh)
 {
-	struct ocfs2_disk_dqtrailer *dqt = ocfs2_dq_trailer(sb, bh->b_data);
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
 
 	mlog(0, "Validating quota block %llu\n",
 	     (unsigned long long)bh->b_blocknr);
 
-	return 0;
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check);
 }
 
 int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 226fe21f260..78755766c32 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -250,8 +251,18 @@ int ocfs2_check_group_descriptor(struct super_block *sb,
 				 struct buffer_head *bh)
 {
 	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
+	BUG_ON(!buffer_uptodate(bh));
 
-	rc = ocfs2_validate_gd_self(sb, bh, 1);
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (!rc)
+		rc = ocfs2_validate_gd_self(sb, bh, 1);
 	if (!rc)
 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
 
@@ -261,9 +272,27 @@ int ocfs2_check_group_descriptor(struct super_block *sb,
 static int ocfs2_validate_group_descriptor(struct super_block *sb,
 					   struct buffer_head *bh)
 {
+	int rc;
+	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
 	mlog(0, "Validating group descriptor %llu\n",
 	     (unsigned long long)bh->b_blocknr);
 
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal.
+	 */
+
 	return ocfs2_validate_gd_self(sb, bh, 0);
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dfc51c305bb..bc822d6ba54 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -42,6 +42,7 @@
 
 #include "ocfs2.h"
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "symlink.h"
@@ -322,12 +323,28 @@ static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
 static int ocfs2_validate_xattr_block(struct super_block *sb,
 				      struct buffer_head *bh)
 {
+	int rc;
 	struct ocfs2_xattr_block *xb =
 		(struct ocfs2_xattr_block *)bh->b_data;
 
 	mlog(0, "Validating xattr block %llu\n",
 	     (unsigned long long)bh->b_blocknr);
 
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+	if (rc)
+		return rc;
+
+	/*
+	 * Errors after here are fatal
+	 */
+
 	if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
 		ocfs2_error(sb,
 			    "Extended attribute block #%llu has bad "
-- 
cgit v1.2.3


From 50655ae9e91d272d48997bada59efe166aa5e343 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 11 Sep 2008 15:53:07 -0700
Subject: ocfs2: Add journal_access functions with jbd2 triggers.

We create wrappers for ocfs2_journal_access() that are specific to the
type of metadata block.  This allows us to associate jbd2 commit
triggers with the block.  The triggers will compute metadata ecc in a
future commit.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/journal.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/journal.h |  31 +++++++++--
 2 files changed, 181 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 302f1144a70..2daa5848faf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -369,10 +370,110 @@ bail:
 	return status;
 }
 
-int ocfs2_journal_access(handle_t *handle,
-			 struct inode *inode,
-			 struct buffer_head *bh,
-			 int type)
+struct ocfs2_triggers {
+	struct jbd2_buffer_trigger_type	ot_triggers;
+	int				ot_offset;
+};
+
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_disk_dqtrailer *dqt =
+		ocfs2_block_dqtrailer(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+				struct buffer_head *bh)
+{
+	mlog(ML_ERROR,
+	     "ocfs2_abort_trigger called by JBD2.  bh = 0x%lx, "
+	     "bh->b_blocknr = %llu\n",
+	     (unsigned long)bh,
+	     (unsigned long long)bh->b_blocknr);
+
+	/* We aren't guaranteed to have the superblock here - but if we
+	 * don't, it'll just crash. */
+	ocfs2_error(bh->b_assoc_map->host->i_sb,
+		    "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+static struct ocfs2_triggers di_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
+};
+
+static struct ocfs2_triggers eb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
+};
+
+static struct ocfs2_triggers gd_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+static struct ocfs2_triggers xb_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+static struct ocfs2_triggers dq_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_dq_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+				  struct inode *inode,
+				  struct buffer_head *bh,
+				  struct ocfs2_triggers *triggers,
+				  int type)
 {
 	int status;
 
@@ -418,6 +519,8 @@ int ocfs2_journal_access(handle_t *handle,
 		status = -EINVAL;
 		mlog(ML_ERROR, "Uknown access type!\n");
 	}
+	if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+		jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
 	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	if (status < 0)
@@ -428,6 +531,54 @@ int ocfs2_journal_access(handle_t *handle,
 	return status;
 }
 
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			       struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	/* Right now, nothing for dirblocks */
+	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
+				      type);
+}
+
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
+				      type);
+}
+
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type)
+{
+	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
 int ocfs2_journal_dirty(handle_t *handle,
 			struct buffer_head *bh)
 {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 37013bf9ce2..bca370dab02 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -212,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
  *  ocfs2_extend_trans     - Extend a handle by nblocks credits. This may
  *                          commit the handle to disk in the process, but will
  *                          not release any locks taken during the transaction.
- *  ocfs2_journal_access   - Notify the handle that we want to journal this
+ *  ocfs2_journal_access* - Notify the handle that we want to journal this
  *                          buffer. Will have to call ocfs2_journal_dirty once
  *                          we've actually dirtied it. Type is one of . or .
+ *                          Always call the specific flavor of
+ *                          ocfs2_journal_access_*() unless you intend to
+ *                          manage the checksum by hand.
  *  ocfs2_journal_dirty    - Mark a journalled buffer as having dirty data.
  *  ocfs2_jbd2_file_inode  - Mark an inode so that its data goes out before
  *                           the current handle commits.
@@ -244,10 +247,28 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
 
-int                  ocfs2_journal_access(handle_t *handle,
-					  struct inode *inode,
-					  struct buffer_head *bh,
-					  int type);
+/* ocfs2_dinode */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+			       struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+			    struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+			 struct buffer_head *bh, int type);
+
 /*
  * A word about the journal_access/journal_dirty "dance". It is
  * entirely legal to journal_access a buffer more than once (as long
-- 
cgit v1.2.3


From ffdd7a54631f07918b75e324d86713a08c11ec06 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 17 Oct 2008 22:32:01 -0700
Subject: ocfs2: Wrap up the common use cases of ocfs2_new_path().

The majority of ocfs2_new_path() calls are:

	ocfs2_new_path(path_root_bh(otherpath),
		       path_root_el(otherpath));

Let's call that ocfs2_new_path_from_path().  The rest do similar things
from struct ocfs2_extent_tree.  Let's call those
ocfs2_new_path_from_et().  This will make the next change easier.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 6b27f74bb34..c22ff49b5e3 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -532,6 +532,16 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 	return path;
 }
 
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+	return ocfs2_new_path(path_root_bh(path), path_root_el(path));
+}
+
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+	return ocfs2_new_path(et->et_root_bh, et->et_root_el);
+}
+
 /*
  * Convenience function to journal all components in a path.
  */
@@ -2150,8 +2160,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
 
 	*ret_left_path = NULL;
 
-	left_path = ocfs2_new_path(path_root_bh(right_path),
-				   path_root_el(right_path));
+	left_path = ocfs2_new_path_from_path(right_path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2692,8 +2701,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 		goto out;
 	}
 
-	left_path = ocfs2_new_path(path_root_bh(path),
-				   path_root_el(path));
+	left_path = ocfs2_new_path_from_path(path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2702,8 +2710,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 
 	ocfs2_cp_path(left_path, path);
 
-	right_path = ocfs2_new_path(path_root_bh(path),
-				    path_root_el(path));
+	right_path = ocfs2_new_path_from_path(path);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -2833,8 +2840,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
 		 * We have a path to the left of this one - it needs
 		 * an update too.
 		 */
-		left_path = ocfs2_new_path(path_root_bh(path),
-					   path_root_el(path));
+		left_path = ocfs2_new_path_from_path(path);
 		if (!left_path) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
@@ -3075,8 +3081,7 @@ static int ocfs2_get_right_path(struct inode *inode,
 	/* This function shouldn't be called for the rightmost leaf. */
 	BUG_ON(right_cpos == 0);
 
-	right_path = ocfs2_new_path(path_root_bh(left_path),
-				    path_root_el(left_path));
+	right_path = ocfs2_new_path_from_path(left_path);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3247,8 +3252,7 @@ static int ocfs2_get_left_path(struct inode *inode,
 	/* This function shouldn't be called for the leftmost leaf. */
 	BUG_ON(left_cpos == 0);
 
-	left_path = ocfs2_new_path(path_root_bh(right_path),
-				   path_root_el(right_path));
+	left_path = ocfs2_new_path_from_path(right_path);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -3780,8 +3784,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
 		 * leftmost leaf.
 		 */
 		if (left_cpos) {
-			left_path = ocfs2_new_path(path_root_bh(right_path),
-						   path_root_el(right_path));
+			left_path = ocfs2_new_path_from_path(right_path);
 			if (!left_path) {
 				ret = -ENOMEM;
 				mlog_errno(ret);
@@ -4018,7 +4021,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		goto out_update_clusters;
 	}
 
-	right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	right_path = ocfs2_new_path_from_et(et);
 	if (!right_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4130,8 +4133,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 			goto out;
 
 		if (left_cpos != 0) {
-			left_path = ocfs2_new_path(path_root_bh(path),
-						   path_root_el(path));
+			left_path = ocfs2_new_path_from_path(path);
 			if (!left_path)
 				goto out;
 
@@ -4187,8 +4189,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
 		if (right_cpos == 0)
 			goto out;
 
-		right_path = ocfs2_new_path(path_root_bh(path),
-					    path_root_el(path));
+		right_path = ocfs2_new_path_from_path(path);
 		if (!right_path)
 			goto out;
 
@@ -4381,7 +4382,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
 		return 0;
 	}
 
-	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	path = ocfs2_new_path_from_et(et);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -4910,7 +4911,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
 	if (et->et_ops == &ocfs2_dinode_et_ops)
 		ocfs2_extent_map_trunc(inode, 0);
 
-	left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	left_path = ocfs2_new_path_from_et(et);
 	if (!left_path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
@@ -5082,8 +5083,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
 		}
 
 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
-			left_path = ocfs2_new_path(path_root_bh(path),
-						   path_root_el(path));
+			left_path = ocfs2_new_path_from_path(path);
 			if (!left_path) {
 				ret = -ENOMEM;
 				mlog_errno(ret);
@@ -5192,7 +5192,7 @@ int ocfs2_remove_extent(struct inode *inode,
 
 	ocfs2_extent_map_trunc(inode, 0);
 
-	path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	path = ocfs2_new_path_from_et(et);
 	if (!path) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-- 
cgit v1.2.3


From 13723d00e374c2a6d6ccb5af6de965e89c3e1b01 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 17 Oct 2008 19:25:01 -0700
Subject: ocfs2: Use metadata-specific ocfs2_journal_access_*() functions.

The per-metadata-type ocfs2_journal_access_*() functions hook up jbd2
commit triggers and allow us to compute metadata ecc right before the
buffers are written out.  This commit provides ecc for inodes, extent
blocks, group descriptors, and quota blocks.  It is not safe to use
extended attributes and metaecc at the same time yet.

The ocfs2_extent_tree and ocfs2_path abstractions in alloc.c both hide
the type of block at their root.  Before, it didn't matter, but now the
root block must use the appropriate ocfs2_journal_access_*() function.
To keep this abstract, the structures now have a pointer to the matching
journal_access function and a wrapper call to call it.

A few places use naked ocfs2_write_block() calls instead of adding the
blocks to the journal.  We make sure to calculate their checksum and ecc
before the write.

Since we pass around the journal_access functions, let's typedef them
in ocfs2.h.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c        | 233 ++++++++++++++++++++++++++++--------------------
 fs/ocfs2/alloc.h        |   5 +-
 fs/ocfs2/aops.c         |   8 +-
 fs/ocfs2/dir.c          |  48 ++++++----
 fs/ocfs2/file.c         |  16 ++--
 fs/ocfs2/inode.c        |  17 ++--
 fs/ocfs2/journal.c      |   2 +
 fs/ocfs2/journal.h      |   3 +-
 fs/ocfs2/localalloc.c   |  18 ++--
 fs/ocfs2/namei.c        |  38 ++++----
 fs/ocfs2/ocfs2.h        |   4 +
 fs/ocfs2/quota_global.c |   2 +-
 fs/ocfs2/quota_local.c  |  18 ++--
 fs/ocfs2/resize.c       |  16 ++--
 fs/ocfs2/suballoc.c     |  58 ++++++------
 15 files changed, 280 insertions(+), 206 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index c22ff49b5e3..6e58fd557e5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -298,11 +298,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct inode *inode,
 				     struct buffer_head *bh,
+				     ocfs2_journal_access_func access,
 				     void *obj,
 				     struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
 	et->et_root_bh = bh;
+	et->et_root_journal_access = access;
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
@@ -318,15 +320,16 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 				   struct inode *inode,
 				   struct buffer_head *bh)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+				 NULL, &ocfs2_dinode_et_ops);
 }
 
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
 				       struct buffer_head *bh)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, NULL,
-				 &ocfs2_xattr_tree_et_ops);
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+				 NULL, &ocfs2_xattr_tree_et_ops);
 }
 
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
@@ -334,7 +337,7 @@ void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct buffer_head *bh,
 					struct ocfs2_xattr_value_root *xv)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, xv,
+	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access, xv,
 				 &ocfs2_xattr_value_et_ops);
 }
 
@@ -356,6 +359,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
 	et->et_ops->eo_update_clusters(inode, et, clusters);
 }
 
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+					       struct inode *inode,
+					       struct ocfs2_extent_tree *et,
+					       int type)
+{
+	return et->et_root_journal_access(handle, inode, et->et_root_bh,
+					  type);
+}
+
 static inline int ocfs2_et_insert_check(struct inode *inode,
 					struct ocfs2_extent_tree *et,
 					struct ocfs2_extent_rec *rec)
@@ -396,12 +408,14 @@ struct ocfs2_path_item {
 #define OCFS2_MAX_PATH_DEPTH	5
 
 struct ocfs2_path {
-	int			p_tree_depth;
-	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
+	int				p_tree_depth;
+	ocfs2_journal_access_func	p_root_access;
+	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
 };
 
 #define path_root_bh(_path) ((_path)->p_node[0].bh)
 #define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -434,6 +448,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
 	 */
 	if (keep_root)
 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+	else
+		path_root_access(path) = NULL;
 
 	path->p_tree_depth = depth;
 }
@@ -459,6 +475,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 
 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
 	BUG_ON(path_root_el(dest) != path_root_el(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
 
 	ocfs2_reinit_path(dest, 1);
 
@@ -480,6 +497,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
 	int i;
 
 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+	BUG_ON(path_root_access(dest) != path_root_access(src));
 
 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
 		brelse(dest->p_node[i].bh);
@@ -515,7 +533,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
 }
 
 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
-					 struct ocfs2_extent_list *root_el)
+					 struct ocfs2_extent_list *root_el,
+					 ocfs2_journal_access_func access)
 {
 	struct ocfs2_path *path;
 
@@ -527,6 +546,7 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 		get_bh(root_bh);
 		path_root_bh(path) = root_bh;
 		path_root_el(path) = root_el;
+		path_root_access(path) = access;
 	}
 
 	return path;
@@ -534,12 +554,38 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
 
 static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
 {
-	return ocfs2_new_path(path_root_bh(path), path_root_el(path));
+	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
+			      path_root_access(path));
 }
 
 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
 {
-	return ocfs2_new_path(et->et_root_bh, et->et_root_el);
+	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
+			      et->et_root_journal_access);
+}
+
+/*
+ * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
+ * otherwise it's the root_access function.
+ *
+ * I don't like the way this function's name looks next to
+ * ocfs2_journal_access_path(), but I don't have a better one.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+					struct inode *inode,
+					struct ocfs2_path *path,
+					int idx)
+{
+	ocfs2_journal_access_func access = path_root_access(path);
+
+	if (!access)
+		access = ocfs2_journal_access;
+
+	if (idx)
+		access = ocfs2_journal_access_eb;
+
+	return access(handle, inode, path->p_node[idx].bh,
+		      OCFS2_JOURNAL_ACCESS_WRITE);
 }
 
 /*
@@ -554,8 +600,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
 		goto out;
 
 	for(i = 0; i < path_num_items(path); i++) {
-		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
@@ -708,8 +753,11 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
 	 * local to this block.
 	 */
 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
-	if (rc)
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
 		return rc;
+	}
 
 	/*
 	 * Errors after here are fatal.
@@ -842,8 +890,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
 			}
 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
 
-			status = ocfs2_journal_access(handle, inode, bhs[i],
-						      OCFS2_JOURNAL_ACCESS_CREATE);
+			status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+							 OCFS2_JOURNAL_ACCESS_CREATE);
 			if (status < 0) {
 				mlog_errno(status);
 				goto bail;
@@ -986,8 +1034,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
 		eb_el = &eb->h_list;
 
-		status = ocfs2_journal_access(handle, inode, bh,
-					      OCFS2_JOURNAL_ACCESS_CREATE);
+		status = ocfs2_journal_access_eb(handle, inode, bh,
+						 OCFS2_JOURNAL_ACCESS_CREATE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1026,21 +1074,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * journal_dirty erroring as it won't unless we've aborted the
 	 * handle (in which case we would never be here) so reserving
 	 * the write with journal_access is all we need to do. */
-	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 	if (eb_bh) {
-		status = ocfs2_journal_access(handle, inode, eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1129,8 +1177,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	eb_el = &eb->h_list;
 	root_el = et->et_root_el;
 
-	status = ocfs2_journal_access(handle, inode, new_eb_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1148,8 +1196,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1918,25 +1966,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
 	root_bh = left_path->p_node[subtree_index].bh;
 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   subtree_index);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-		ret = ocfs2_journal_access(handle, inode,
-					   right_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   right_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_journal_access(handle, inode,
-					   left_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2455,9 +2501,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 			return -EAGAIN;
 
 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
-			ret = ocfs2_journal_access(handle, inode,
-						   path_leaf_bh(right_path),
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_journal_access_eb(handle, inode,
+						      path_leaf_bh(right_path),
+						      OCFS2_JOURNAL_ACCESS_WRITE);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -2474,8 +2520,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 		 * We have to update i_last_eb_blk during the meta
 		 * data delete.
 		 */
-		ret = ocfs2_journal_access(handle, inode, et_root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_et_root_journal_access(handle, inode, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2490,25 +2536,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
 	 */
 	BUG_ON(right_has_empty && !del_right_subtree);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   subtree_index);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
-		ret = ocfs2_journal_access(handle, inode,
-					   right_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   right_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		ret = ocfs2_journal_access(handle, inode,
-					   left_path->p_node[i].bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, i);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2653,16 +2697,17 @@ out:
 
 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
 					    handle_t *handle,
-					    struct buffer_head *bh,
-					    struct ocfs2_extent_list *el)
+					    struct ocfs2_path *path)
 {
 	int ret;
+	struct buffer_head *bh = path_leaf_bh(path);
+	struct ocfs2_extent_list *el = path_leaf_el(path);
 
 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
 		return 0;
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+					   path_num_items(path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -2744,9 +2789,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
 		 * Caller might still want to make changes to the
 		 * tree root, so re-add it to the journal here.
 		 */
-		ret = ocfs2_journal_access(handle, inode,
-					   path_root_bh(left_path),
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode,
+						   left_path, 0);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -2929,8 +2973,7 @@ rightmost_no_delete:
 		 * it up front.
 		 */
 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
-						       path_leaf_bh(path),
-						       path_leaf_el(path));
+						       path);
 		if (ret)
 			mlog_errno(ret);
 		goto out;
@@ -3164,8 +3207,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_journal_access(handle, inode, root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3173,17 +3216,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_journal_access(handle, inode,
-						   right_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_journal_access(handle, inode,
-						   left_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3195,8 +3236,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
 		right_rec = &el->l_recs[index + 1];
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+					   path_num_items(left_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -3335,8 +3376,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 		root_bh = left_path->p_node[subtree_index].bh;
 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
 
-		ret = ocfs2_journal_access(handle, inode, root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+						   subtree_index);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -3344,17 +3385,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 
 		for (i = subtree_index + 1;
 		     i < path_num_items(right_path); i++) {
-			ret = ocfs2_journal_access(handle, inode,
-						   right_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   right_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
 
-			ret = ocfs2_journal_access(handle, inode,
-						   left_path->p_node[i].bh,
-						   OCFS2_JOURNAL_ACCESS_WRITE);
+			ret = ocfs2_path_bh_journal_access(handle, inode,
+							   left_path, i);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
@@ -3366,8 +3405,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 			has_empty_extent = 1;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+					   path_num_items(left_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4009,8 +4048,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 
 	el = et->et_root_el;
 
-	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_et_root_journal_access(handle, inode, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4071,8 +4110,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
 		 * ocfs2_rotate_tree_right() might have extended the
 		 * transaction without re-journaling our tree root.
 		 */
-		ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_et_root_journal_access(handle, inode, et,
+						   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4593,9 +4632,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
 
 	BUG_ON(num_bits > clusters_to_add);
 
-	/* reserve our write early -- insert_extent may update the inode */
-	status = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	/* reserve our write early -- insert_extent may update the tree root */
+	status = ocfs2_et_root_journal_access(handle, inode, et,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -5347,8 +5386,8 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_et_root_journal_access(handle, inode, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -5461,8 +5500,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -5523,8 +5562,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	while (i >= 0) {
 		/* Caller has given us at least enough credits to
 		 * update the truncate log dinode */
-		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -5780,6 +5819,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 		 * tl_used. */
 		tl->tl_used = 0;
 
+		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
 		if (status < 0) {
 			mlog_errno(status);
@@ -6546,8 +6586,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 	}
 
 	if (last_eb_bh) {
-		status = ocfs2_journal_access(handle, inode, last_eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -6908,8 +6948,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		goto out_unlock;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -7043,7 +7083,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+			      ocfs2_journal_access_di);
 	if (!path) {
 		status = -ENOMEM;
 		mlog_errno(status);
@@ -7276,8 +7317,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 59d37d1b7d4..4b6fea22748 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
  *
  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions.  With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
  * ocfs2_extent_tree_operations abstract the normal operations we do for
  * the root of extent b-tree.
  */
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
 	struct ocfs2_extent_tree_operations	*et_ops;
 	struct buffer_head			*et_root_bh;
 	struct ocfs2_extent_list		*et_root_el;
+	ocfs2_journal_access_func		et_root_journal_access;
 	void					*et_object;
 	unsigned int				et_max_leaf_clusters;
 };
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 6b647ec87bb..a067a6cffb0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1512,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		ocfs2_commit_trans(osb, handle);
 
@@ -1740,8 +1740,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * We don't want this to fail in ocfs2_write_end(), so do it
 	 * here.
 	 */
-	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_quota;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3708fe482e3..45e4e03d8f7 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -378,14 +378,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
 		       struct inode *new_entry_inode)
 {
 	int ret;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 	/*
 	 * The same code works fine for both inline-data and extent
-	 * based directories, so no need to split this up.
+	 * based directories, so no need to split this up.  The only
+	 * difference is the journal_access function.
 	 */
 
-	ret = ocfs2_journal_access(handle, dir, de_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
+	ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -407,9 +411,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 {
 	struct ocfs2_dir_entry *de, *pde;
 	int i, status = -ENOENT;
+	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
 	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		access = ocfs2_journal_access_di;
+
 	i = 0;
 	pde = NULL;
 	de = (struct ocfs2_dir_entry *) first_de;
@@ -420,8 +428,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 			goto bail;
 		}
 		if (de == de_del)  {
-			status = ocfs2_journal_access(handle, dir, bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = access(handle, dir, bh,
+					OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 				status = -EIO;
 				mlog_errno(status);
@@ -581,8 +589,14 @@ int __ocfs2_add_entry(handle_t *handle,
 				goto bail;
 			}
 
-			status = ocfs2_journal_access(handle, dir, insert_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			if (insert_bh == parent_fe_bh)
+				status = ocfs2_journal_access_di(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
+			else
+				status = ocfs2_journal_access_db(handle, dir,
+								 insert_bh,
+								 OCFS2_JOURNAL_ACCESS_WRITE);
 			/* By now the buffer is marked for journaling */
 			offset += le16_to_cpu(de->rec_len);
 			if (le64_to_cpu(de->inode)) {
@@ -1081,8 +1095,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
 	struct ocfs2_inline_data *data = &di->id2.i_data;
 	unsigned int size = le16_to_cpu(data->id_count);
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1129,8 +1143,8 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 
 	ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-	status = ocfs2_journal_access(handle, inode, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, inode, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1292,8 +1306,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 
 	ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
 
-	ret = ocfs2_journal_access(handle, dir, dirdata_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1319,8 +1333,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	 * We let the later dirent insert modify c/mtime - to the user
 	 * the data hasn't changed.
 	 */
-	ret = ocfs2_journal_access(handle, dir, di_bh,
-				   OCFS2_JOURNAL_ACCESS_CREATE);
+	ret = ocfs2_journal_access_di(handle, dir, di_bh,
+				      OCFS2_JOURNAL_ACCESS_CREATE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1583,8 +1597,8 @@ do_extend:
 
 	ocfs2_set_new_buffer_uptodate(dir, new_bh);
 
-	status = ocfs2_journal_access(handle, dir, new_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_db(handle, dir, new_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9374d374a26..e8f795f978a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -256,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -353,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
@@ -590,8 +590,8 @@ restarted_transaction:
 	/* reserve a write to the file entry early on - that we if we
 	 * run out of credits in the allocation path, we can still
 	 * update i_size. */
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1121,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_trans;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 9370b652ab9..229e707bc05 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -537,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 			goto out;
 		}
 
-		status = ocfs2_journal_access(handle, inode, fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, inode, fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out;
@@ -621,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
 	}
 
 	/* set the inodes dtime */
-	status = ocfs2_journal_access(handle, inode, di_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, di_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail_commit;
@@ -1190,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	mlog_entry("(inode %llu)\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1277,8 +1277,11 @@ int ocfs2_validate_inode_block(struct super_block *sb,
 	 * local to this block.
 	 */
 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
-	if (rc)
+	if (rc) {
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
 		goto bail;
+	}
 
 	/*
 	 * Errors after here are fatal.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 2daa5848faf..3b54dba0f74 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -752,6 +752,7 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	if (replayed)
 		ocfs2_bump_recovery_generation(fe);
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, journal->j_inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -1486,6 +1487,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	osb->slot_recovery_generations[slot_num] =
 					ocfs2_get_recovery_generation(fe);
 
+	ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
 	status = ocfs2_write_block(osb, bh, inode);
 	if (status < 0)
 		mlog_errno(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bca370dab02..3c3532e1307 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -247,9 +247,10 @@ int			     ocfs2_extend_trans(handle_t *handle, int nblocks);
 #define OCFS2_JOURNAL_ACCESS_WRITE  1
 #define OCFS2_JOURNAL_ACCESS_UNDO   2
 
+
 /* ocfs2_inode */
 int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
-			       struct buffer_head *bh, int type);
+			    struct buffer_head *bh, int type);
 /* ocfs2_extent_block */
 int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type);
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 19cfb1b9ce0..ec70cdbe77f 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "inode.h"
 #include "journal.h"
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	}
 	memcpy(alloc_copy, alloc, bh->b_size);
 
-	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_commit;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
 	ocfs2_clear_local_alloc(alloc);
 
+	ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
 	status = ocfs2_write_block(osb, alloc_bh, inode);
 	if (status < 0)
 		mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	 * delete bits from it! */
 	*num_bits = bits_wanted;
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	}
 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
 
-	status = ocfs2_journal_access(handle, local_alloc_inode,
-				      osb->local_alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, local_alloc_inode,
+					 osb->local_alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6173807ba23..084aba86c3b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -361,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
 			goto leave;
 		}
 
-		status = ocfs2_journal_access(handle, dir, parent_fe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -493,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 	}
 	ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
 
-	status = ocfs2_journal_access(handle, inode, *new_fe_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -664,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
 		goto out_unlock_inode;
 	}
 
-	err = ocfs2_journal_access(handle, inode, fe_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	err = ocfs2_journal_access_di(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (err < 0) {
 		mlog_errno(err);
 		goto out_commit;
@@ -851,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1265,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
 				goto bail;
 			}
 		}
-		status = ocfs2_journal_access(handle, new_inode, newfe_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1312,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
 	old_inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(old_inode);
 
-	status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status >= 0) {
 		old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
 
@@ -1389,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
 			     (int)old_dir_nlink, old_dir->i_nlink);
 		} else {
 			struct ocfs2_dinode *fe;
-			status = ocfs2_journal_access(handle, old_dir,
-						      old_dir_bh,
-						      OCFS2_JOURNAL_ACCESS_WRITE);
+			status = ocfs2_journal_access_di(handle, old_dir,
+							 old_dir_bh,
+							 OCFS2_JOURNAL_ACCESS_WRITE);
 			fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
 			fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
 			status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1898,8 +1898,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -1986,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access(handle,orphan_dir_inode,  orphan_dir_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,orphan_dir_inode,  orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 2bb389fe739..bad87d0a03c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -339,6 +339,10 @@ struct ocfs2_super
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
 
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+					 struct buffer_head *bh, int type);
+
 static inline int ocfs2_should_order_data(struct inode *inode)
 {
 	if (!S_ISREG(inode->i_mode))
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index a0b8b14cca8..444aa5a467f 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -244,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
 	set_buffer_uptodate(bh);
 	unlock_buffer(bh);
 	ocfs2_set_buffer_uptodate(gqinode, bh);
-	err = ocfs2_journal_access(handle, gqinode, bh, ja_type);
+	err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
 	if (err < 0) {
 		brelse(bh);
 		goto out;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index d451b715aef..07deec5e972 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -106,8 +106,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
 		mlog_errno(status);
 		return status;
 	}
-	status = ocfs2_journal_access(handle, inode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_dq(handle, inode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		ocfs2_commit_trans(OCFS2_SB(sb), handle);
@@ -506,7 +506,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 				goto out_commit;
 			}
 			/* Release local quota file entry */
-			status = ocfs2_journal_access(handle, lqinode,
+			status = ocfs2_journal_access_dq(handle, lqinode,
 					qbh, OCFS2_JOURNAL_ACCESS_WRITE);
 			if (status < 0) {
 				mlog_errno(status);
@@ -614,8 +614,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 			mlog_errno(status);
 			goto out_bh;
 		}
-		status = ocfs2_journal_access(handle, lqinode, bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
+		status = ocfs2_journal_access_dq(handle, lqinode, bh,
+						 OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
 			mlog_errno(status);
 			goto out_trans;
@@ -981,8 +981,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 		goto out;
 	}
 
-	status = ocfs2_journal_access(handle, lqinode, bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_trans;
@@ -1074,7 +1074,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		mlog_errno(status);
 		goto out;
 	}
-	status = ocfs2_journal_access(handle, lqinode, chunk->qc_headerbh,
+	status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
 				 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1207,7 +1207,7 @@ static int ocfs2_local_release_dquot(struct dquot *dquot)
 		goto out;
 	}
 
-	status = ocfs2_journal_access(handle, sb_dqopt(sb)->files[type],
+	status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type],
 			od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 867de3ebfca..424adaa5f90 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
 		   new_clusters, first_new_cluster);
 
-	ret = ocfs2_journal_access(handle, bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	}
 
 	/* update the inode accordingly. */
-	ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_rollback;
@@ -536,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	cl = &fe->id2.i_chain;
 	cr = &cl->cl_recs[input->chain];
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -552,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_commit;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 78755766c32..a69628603e1 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -261,7 +261,11 @@ int ocfs2_check_group_descriptor(struct super_block *sb,
 	 * local to this block.
 	 */
 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
-	if (!rc)
+	if (rc) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+	} else
 		rc = ocfs2_validate_gd_self(sb, bh, 1);
 	if (!rc)
 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
@@ -343,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      bg_bh,
-				      OCFS2_JOURNAL_ACCESS_CREATE);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 bg_bh,
+					 OCFS2_JOURNAL_ACCESS_CREATE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -476,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 
-	status = ocfs2_journal_access(handle, alloc_inode,
-				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode,
+					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -986,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle,
+					 alloc_inode,
+					 group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1060,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
 
-	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1075,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1090,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
 		goto out_rollback;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto out_rollback;
@@ -1242,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -1414,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 
 	/* Ok, claim our bits now: set the info on dinode, chainlist
 	 * and then the group */
-	status = ocfs2_journal_access(handle,
-				      alloc_inode,
-				      ac->ac_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle,
+					 alloc_inode,
+					 ac->ac_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1824,8 +1828,8 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
 
-	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
-				      journal_type);
+	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+					 journal_type);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1900,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
-- 
cgit v1.2.3


From 4d0e214ee83185fcaa2cb97cd026d32bdc5c994a Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 5 Dec 2008 11:19:37 -0800
Subject: ocfs2: Add ecc and checksums to ocfs2 xattr buckets.

The xattr bucket can span multiple blocks on disk.  We have wrappers
for this structure in the code.  We use the new multi-block ecc calls to
calculate and validate the bucket.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index bc822d6ba54..7c2f4c9d1bd 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -273,6 +273,15 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 	rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
 			       bucket->bu_blocks, bucket->bu_bhs, 0,
 			       NULL);
+	if (!rc) {
+		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+						 bucket->bu_bhs,
+						 bucket->bu_blocks,
+						 &bucket_xh(bucket)->xh_check);
+		if (rc)
+			mlog_errno(rc);
+	}
+
 	if (rc)
 		ocfs2_xattr_bucket_relse(bucket);
 	return rc;
@@ -301,6 +310,10 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 {
 	int i;
 
+	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+				   bucket->bu_bhs, bucket->bu_blocks,
+				   &bucket_xh(bucket)->xh_check);
+
 	for (i = 0; i < bucket->bu_blocks; i++)
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
 }
-- 
cgit v1.2.3


From 2a50a743bdaab104155bd9e988d2ba3bb4177263 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 14:24:33 -0800
Subject: ocfs2: Create ocfs2_xattr_value_buf.

When an ocfs2 extended attribute is large enough to require its own
allocation tree, we root it with an ocfs2_xattr_value_root.  However,
these roots can be a part of inodes, xattr blocks, or xattr buckets.
Thus, they need a different journal access function for each container.

We wrap the bh, its journal access function, and the value root (xv) in
a structure called ocfs2_xattr_value_buf.  This is a package that can
be passed around.  In this first pass, we simply pass it to the
extent tree code.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 25 +++++++++++--------------
 fs/ocfs2/alloc.h |  4 ++--
 fs/ocfs2/xattr.c | 34 ++++++++++++++++++++++------------
 fs/ocfs2/xattr.h | 14 ++++++++++++++
 4 files changed, 49 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 6e58fd557e5..874c0bd9e1c 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -48,6 +48,7 @@
 #include "file.h"
 #include "super.h"
 #include "uptodate.h"
+#include "xattr.h"
 
 #include "buffer_head_io.h"
 
@@ -207,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
 
 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv = et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	et->et_root_el = &xv->xr_list;
+	et->et_root_el = &vb->vb_xv->xr_list;
 }
 
 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
 					      u64 blkno)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	xv->xr_last_eb_blk = cpu_to_le64(blkno);
+	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
 }
 
 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *) et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	return le64_to_cpu(xv->xr_last_eb_blk);
+	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
 }
 
 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
 					      struct ocfs2_extent_tree *et,
 					      u32 clusters)
 {
-	struct ocfs2_xattr_value_root *xv =
-		(struct ocfs2_xattr_value_root *)et->et_object;
+	struct ocfs2_xattr_value_buf *vb = et->et_object;
 
-	le32_add_cpu(&xv->xr_clusters, clusters);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -334,10 +332,9 @@ void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
-					struct buffer_head *bh,
-					struct ocfs2_xattr_value_root *xv)
+					struct ocfs2_xattr_value_buf *vb)
 {
-	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access, xv,
+	__ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
 				 &ocfs2_xattr_value_et_ops);
 }
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4b6fea22748..cceff5c37f4 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -71,10 +71,10 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
 				       struct inode *inode,
 				       struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
 					struct inode *inode,
-					struct buffer_head *bh,
-					struct ocfs2_xattr_value_root *xv);
+					struct ocfs2_xattr_value_buf *vb);
 
 /*
  * Read an extent block into *bh.  If *bh is NULL, a bh will be
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 7c2f4c9d1bd..123d378aba9 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -581,21 +581,26 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh	= xattr_bh,
+		.vb_xv = xv,
+		.vb_access = ocfs2_journal_access,
+	};
+	u32 prev_clusters, logical_start = le32_to_cpu(vb.vb_xv->xr_clusters);
 	struct ocfs2_extent_tree et;
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, &vb);
 
-	status = ocfs2_journal_access(handle, inode, xattr_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	status = vb.vb_access(handle, inode, vb.vb_bh,
+			      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	prev_clusters = le32_to_cpu(xv->xr_clusters);
+	prev_clusters = le32_to_cpu(vb.vb_xv->xr_clusters);
 	status = ocfs2_add_clusters_in_btree(osb,
 					     inode,
 					     &logical_start,
@@ -611,13 +616,13 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, xattr_bh);
+	status = ocfs2_journal_dirty(handle, vb.vb_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+	clusters_to_add -= le32_to_cpu(vb.vb_xv->xr_clusters) - prev_clusters;
 
 	/*
 	 * We should have already allocated enough space before the transaction,
@@ -640,11 +645,16 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_extent_tree et;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = root_bh,
+		.vb_xv = xv,
+		.vb_access = ocfs2_journal_access,
+	};
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, &vb);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb.vb_access(handle, inode, vb.vb_bh,
+			   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -657,9 +667,9 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out;
 	}
 
-	le32_add_cpu(&xv->xr_clusters, -len);
+	le32_add_cpu(&vb.vb_xv->xr_clusters, -len);
 
-	ret = ocfs2_journal_dirty(handle, root_bh);
+	ret = ocfs2_journal_dirty(handle, vb.vb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 9a67e7d8f81..5a1ebc789f7 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -70,4 +70,18 @@ int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
 			  int, struct ocfs2_security_xattr_info *,
 			  int *, int *, struct ocfs2_alloc_context **);
 
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block.  Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal.  Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf {
+	struct buffer_head		*vb_bh;
+	ocfs2_journal_access_func	vb_access;
+	struct ocfs2_xattr_value_root	*vb_xv;
+};
+
+
 #endif /* OCFS2_XATTR_H */
-- 
cgit v1.2.3


From d72cc72d57ecaf9047da51269dabd6880c1399ac Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 14:30:41 -0800
Subject: ocfs2: Pull ocfs2_xattr_value_buf up from
 __ocfs2_remove_xattr_range().

Place an ocfs2_xattr_value_buf in __ocfs2_xattr_shrink_size() and pass
it down to __ocfs2_remove_xattr_range().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 123d378aba9..3b059cf2eb4 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -636,8 +636,7 @@ leave:
 }
 
 static int __ocfs2_remove_xattr_range(struct inode *inode,
-				      struct buffer_head *root_bh,
-				      struct ocfs2_xattr_value_root *xv,
+				      struct ocfs2_xattr_value_buf *vb,
 				      u32 cpos, u32 phys_cpos, u32 len,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
@@ -645,16 +644,11 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
 	handle_t *handle = ctxt->handle;
 	struct ocfs2_extent_tree et;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = root_bh,
-		.vb_xv = xv,
-		.vb_access = ocfs2_journal_access,
-	};
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, &vb);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
 
-	ret = vb.vb_access(handle, inode, vb.vb_bh,
-			   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -667,9 +661,9 @@ static int __ocfs2_remove_xattr_range(struct inode *inode,
 		goto out;
 	}
 
-	le32_add_cpu(&vb.vb_xv->xr_clusters, -len);
+	le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
 
-	ret = ocfs2_journal_dirty(handle, vb.vb_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -693,6 +687,11 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	int ret = 0;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = root_bh,
+		.vb_xv = xv,
+		.vb_access = ocfs2_journal_access,
+	};
 
 	if (old_clusters <= new_clusters)
 		return 0;
@@ -701,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	trunc_len = old_clusters - new_clusters;
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
-					       &alloc_size, &xv->xr_list);
+					       &alloc_size,
+					       &vb.vb_xv->xr_list);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -710,7 +710,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 		if (alloc_size > trunc_len)
 			alloc_size = trunc_len;
 
-		ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+		ret = __ocfs2_remove_xattr_range(inode, &vb, cpos,
 						 phys_cpos, alloc_size,
 						 ctxt);
 		if (ret) {
-- 
cgit v1.2.3


From 19b801f45fa5e4840b9be3dcf1e73b08f35b04d9 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 14:36:50 -0800
Subject: ocfs2: Pull ocfs2_xattr_value_buf up into
 ocfs2_xattr_value_truncate().

Place an ocfs2_xattr_value_buf in ocfs2_xattr_value_truncate() and pass
it down to ocfs2_xattr_shrink_size().  We can also pass it into
ocfs2_xattr_extend_allocation(), replacing its ocfs2_xattr_value_buf.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3b059cf2eb4..4ce8019f0ef 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -573,34 +573,28 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 
 static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 u32 clusters_to_add,
-					 struct buffer_head *xattr_bh,
-					 struct ocfs2_xattr_value_root *xv,
+					 struct ocfs2_xattr_value_buf *vb,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int status = 0;
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh	= xattr_bh,
-		.vb_xv = xv,
-		.vb_access = ocfs2_journal_access,
-	};
-	u32 prev_clusters, logical_start = le32_to_cpu(vb.vb_xv->xr_clusters);
+	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
 	struct ocfs2_extent_tree et;
 
 	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
 
-	ocfs2_init_xattr_value_extent_tree(&et, inode, &vb);
+	ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
 
-	status = vb.vb_access(handle, inode, vb.vb_bh,
+	status = vb->vb_access(handle, inode, vb->vb_bh,
 			      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	prev_clusters = le32_to_cpu(vb.vb_xv->xr_clusters);
+	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
 	status = ocfs2_add_clusters_in_btree(osb,
 					     inode,
 					     &logical_start,
@@ -616,13 +610,13 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	status = ocfs2_journal_dirty(handle, vb.vb_bh);
+	status = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	clusters_to_add -= le32_to_cpu(vb.vb_xv->xr_clusters) - prev_clusters;
+	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
 
 	/*
 	 * We should have already allocated enough space before the transaction,
@@ -680,18 +674,12 @@ out:
 static int ocfs2_xattr_shrink_size(struct inode *inode,
 				   u32 old_clusters,
 				   u32 new_clusters,
-				   struct buffer_head *root_bh,
-				   struct ocfs2_xattr_value_root *xv,
+				   struct ocfs2_xattr_value_buf *vb,
 				   struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret = 0;
 	u32 trunc_len, cpos, phys_cpos, alloc_size;
 	u64 block;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = root_bh,
-		.vb_xv = xv,
-		.vb_access = ocfs2_journal_access,
-	};
 
 	if (old_clusters <= new_clusters)
 		return 0;
@@ -701,7 +689,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 	while (trunc_len) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
 					       &alloc_size,
-					       &vb.vb_xv->xr_list);
+					       &vb->vb_xv->xr_list);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -710,7 +698,7 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
 		if (alloc_size > trunc_len)
 			alloc_size = trunc_len;
 
-		ret = __ocfs2_remove_xattr_range(inode, &vb, cpos,
+		ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
 						 phys_cpos, alloc_size,
 						 ctxt);
 		if (ret) {
@@ -738,6 +726,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	int ret;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
 	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = root_bh,
+		.vb_xv = xv,
+		.vb_access = ocfs2_journal_access,
+	};
 
 	if (new_clusters == old_clusters)
 		return 0;
@@ -745,11 +738,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	if (new_clusters > old_clusters)
 		ret = ocfs2_xattr_extend_allocation(inode,
 						    new_clusters - old_clusters,
-						    root_bh, xv, ctxt);
+						    &vb, ctxt);
 	else
 		ret = ocfs2_xattr_shrink_size(inode,
 					      old_clusters, new_clusters,
-					      root_bh, xv, ctxt);
+					      &vb, ctxt);
 
 	return ret;
 }
-- 
cgit v1.2.3


From b3e5d37905730dc5ddff717f55ed830caa80ea0e Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 15:01:04 -0800
Subject: ocfs2: Pass ocfs2_xattr_value_buf into ocfs2_xattr_value_truncate().

The callers of ocfs2_xattr_value_truncate() now pass in
ocfs2_xattr_value_bufs.  These callers are the ones that calculated the
xv location, so they are the right starting point.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 66 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4ce8019f0ef..409f9eeec70 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -718,19 +718,13 @@ out:
 }
 
 static int ocfs2_xattr_value_truncate(struct inode *inode,
-				      struct buffer_head *root_bh,
-				      struct ocfs2_xattr_value_root *xv,
+				      struct ocfs2_xattr_value_buf *vb,
 				      int len,
 				      struct ocfs2_xattr_set_ctxt *ctxt)
 {
 	int ret;
 	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
-	u32 old_clusters = le32_to_cpu(xv->xr_clusters);
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = root_bh,
-		.vb_xv = xv,
-		.vb_access = ocfs2_journal_access,
-	};
+	u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
 
 	if (new_clusters == old_clusters)
 		return 0;
@@ -738,11 +732,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	if (new_clusters > old_clusters)
 		ret = ocfs2_xattr_extend_allocation(inode,
 						    new_clusters - old_clusters,
-						    &vb, ctxt);
+						    vb, ctxt);
 	else
 		ret = ocfs2_xattr_shrink_size(inode,
 					      old_clusters, new_clusters,
-					      &vb, ctxt);
+					      vb, ctxt);
 
 	return ret;
 }
@@ -1330,6 +1324,10 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	struct ocfs2_xattr_value_root *xv = NULL;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 	int ret = 0;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = xs->xattr_bh,
+		.vb_access = ocfs2_journal_access
+	};
 
 	memset(val, 0, size);
 	memcpy(val, xi->name, name_len);
@@ -1340,9 +1338,9 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	xv->xr_list.l_tree_depth = 0;
 	xv->xr_list.l_count = cpu_to_le16(1);
 	xv->xr_list.l_next_free_rec = 0;
+	vb.vb_xv = xv;
 
-	ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
-					 xi->value_len, ctxt);
+	ret = ocfs2_xattr_value_truncate(inode, &vb, xi->value_len, ctxt);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1352,7 +1350,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, xv,
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb.vb_xv,
 					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
@@ -1550,9 +1548,12 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 			goto out;
 		} else if (!ocfs2_xattr_is_local(xs->here)) {
 			/* For existing xattr which has value outside */
-			struct ocfs2_xattr_value_root *xv = NULL;
-			xv = (struct ocfs2_xattr_value_root *)(val +
-				OCFS2_XATTR_SIZE(name_len));
+			struct ocfs2_xattr_value_buf vb = {
+				.vb_bh = xs->xattr_bh,
+				.vb_xv = (struct ocfs2_xattr_value_root *)
+					(val + OCFS2_XATTR_SIZE(name_len)),
+				.vb_access = ocfs2_journal_access,
+			};
 
 			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 				/*
@@ -1561,8 +1562,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 * then set new value with set_value_outside().
 				 */
 				ret = ocfs2_xattr_value_truncate(inode,
-								 xs->xattr_bh,
-								 xv,
+								 &vb,
 								 xi->value_len,
 								 ctxt);
 				if (ret < 0) {
@@ -1582,7 +1582,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 
 				ret = __ocfs2_xattr_set_value_outside(inode,
 								handle,
-								xv,
+								vb.vb_xv,
 								xi->value,
 								xi->value_len);
 				if (ret < 0)
@@ -1594,8 +1594,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 				 * just trucate old value to zero.
 				 */
 				 ret = ocfs2_xattr_value_truncate(inode,
-								  xs->xattr_bh,
-								  xv,
+								  &vb,
 								  0,
 								  ctxt);
 				if (ret < 0)
@@ -1714,15 +1713,17 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
 		if (!ocfs2_xattr_is_local(entry)) {
-			struct ocfs2_xattr_value_root *xv;
+			struct ocfs2_xattr_value_buf vb = {
+				.vb_bh = bh,
+				.vb_access = ocfs2_journal_access,
+			};
 			void *val;
 
 			val = (void *)header +
 				le16_to_cpu(entry->xe_name_offset);
-			xv = (struct ocfs2_xattr_value_root *)
+			vb.vb_xv = (struct ocfs2_xattr_value_root *)
 				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, bh, xv,
-							 0, &ctxt);
+			ret = ocfs2_xattr_value_truncate(inode, &vb, 0, &ctxt);
 			if (ret < 0) {
 				mlog_errno(ret);
 				break;
@@ -4651,11 +4652,12 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 {
 	int ret, offset;
 	u64 value_blk;
-	struct buffer_head *value_bh = NULL;
-	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_xattr_entry *xe;
 	struct ocfs2_xattr_header *xh = bucket_xh(bucket);
 	size_t blocksize = inode->i_sb->s_blocksize;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_access = ocfs2_journal_access,
+	};
 
 	xe = &xh->xh_entries[xe_off];
 
@@ -4669,11 +4671,11 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	/* We don't allow ocfs2_xattr_value to be stored in different block. */
 	BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
 
-	value_bh = bucket->bu_bhs[value_blk];
-	BUG_ON(!value_bh);
+	vb.vb_bh = bucket->bu_bhs[value_blk];
+	BUG_ON(!vb.vb_bh);
 
-	xv = (struct ocfs2_xattr_value_root *)
-		(value_bh->b_data + offset % blocksize);
+	vb.vb_xv = (struct ocfs2_xattr_value_root *)
+		(vb.vb_bh->b_data + offset % blocksize);
 
 	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
 						OCFS2_JOURNAL_ACCESS_WRITE);
@@ -4691,7 +4693,7 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	 */
 	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
 	     xe_off, (unsigned long long)bucket_blkno(bucket), len);
-	ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len, ctxt);
+	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_dirty;
-- 
cgit v1.2.3


From 0c748e95327d00e9eb19d0f34b32147ecbc02137 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 15:46:15 -0800
Subject: ocfs2: Pass value buf to ocfs2_xattr_update_entry().

ocfs2_xattr_update_entry() updates the entry portion of an xattr buffer.
This can be part of multiple metadata block types, so pass the buffer in
via an ocfs2_xattr_value_buf.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 409f9eeec70..6a056122771 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1282,12 +1282,13 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
 				    handle_t *handle,
 				    struct ocfs2_xattr_info *xi,
 				    struct ocfs2_xattr_search *xs,
+				    struct ocfs2_xattr_value_buf *vb,
 				    size_t offs)
 {
 	int ret;
 
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1301,7 +1302,7 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
 		ocfs2_xattr_set_local(xs->here, 0);
 	ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
 
-	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret < 0)
 		mlog_errno(ret);
 out:
@@ -1345,7 +1346,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, offs);
+	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, &vb, offs);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
@@ -1574,6 +1575,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 							       handle,
 							       xi,
 							       xs,
+							       &vb,
 							       offs);
 				if (ret < 0) {
 					mlog_errno(ret);
-- 
cgit v1.2.3


From 512620f44df85df87348fc9a6fc54fcaa254b8d3 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 15:58:35 -0800
Subject: ocfs2: Use ocfs2_xattr_value_buf in ocfs2_xattr_set_entry().

ocfs2_xattr_set_entry is the function that knows what type of block it
is setting into.  This is what we wanted from ocfs2_xattr_value_buf.
Plus, moving the value buf up into ocfs2_xattr_set_entry() allows us to
pass it into ocfs2_xattr_set_value_outside() and ocfs2_xattr_cleanup().

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6a056122771..c08b5e8746c 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1252,6 +1252,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 			       handle_t *handle,
 			       struct ocfs2_xattr_info *xi,
 			       struct ocfs2_xattr_search *xs,
+			       struct ocfs2_xattr_value_buf *vb,
 			       size_t offs)
 {
 	int ret = 0;
@@ -1259,8 +1260,8 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 	void *val = xs->base + offs;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 
-	ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = vb->vb_access(handle, inode, vb->vb_bh,
+			    OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1271,7 +1272,7 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
 	memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
 	memset(val, 0, size);
 
-	ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+	ret = ocfs2_journal_dirty(handle, vb->vb_bh);
 	if (ret < 0)
 		mlog_errno(ret);
 out:
@@ -1318,6 +1319,7 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 					 struct ocfs2_xattr_info *xi,
 					 struct ocfs2_xattr_search *xs,
 					 struct ocfs2_xattr_set_ctxt *ctxt,
+					 struct ocfs2_xattr_value_buf *vb,
 					 size_t offs)
 {
 	size_t name_len = strlen(xi->name);
@@ -1325,10 +1327,6 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	struct ocfs2_xattr_value_root *xv = NULL;
 	size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
 	int ret = 0;
-	struct ocfs2_xattr_value_buf vb = {
-		.vb_bh = xs->xattr_bh,
-		.vb_access = ocfs2_journal_access
-	};
 
 	memset(val, 0, size);
 	memcpy(val, xi->name, name_len);
@@ -1339,19 +1337,19 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
 	xv->xr_list.l_tree_depth = 0;
 	xv->xr_list.l_count = cpu_to_le16(1);
 	xv->xr_list.l_next_free_rec = 0;
-	vb.vb_xv = xv;
+	vb->vb_xv = xv;
 
-	ret = ocfs2_xattr_value_truncate(inode, &vb, xi->value_len, ctxt);
+	ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, &vb, offs);
+	ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
 	if (ret < 0) {
 		mlog_errno(ret);
 		return ret;
 	}
-	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb.vb_xv,
+	ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
 					      xi->value, xi->value_len);
 	if (ret < 0)
 		mlog_errno(ret);
@@ -1488,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		.value = xi->value,
 		.value_len = xi->value_len,
 	};
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = xs->xattr_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
+
+	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+		BUG_ON(xs->xattr_bh == xs->inode_bh);
+		vb.vb_access = ocfs2_journal_access_xb;
+	} else
+		BUG_ON(xs->xattr_bh != xs->inode_bh);
 
 	/* Compute min_offs, last and free space. */
 	last = xs->header->xh_entries;
@@ -1543,18 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
 			/* Replace existing local xattr with tree root */
 			ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
-							    ctxt, offs);
+							    ctxt, &vb, offs);
 			if (ret < 0)
 				mlog_errno(ret);
 			goto out;
 		} else if (!ocfs2_xattr_is_local(xs->here)) {
 			/* For existing xattr which has value outside */
-			struct ocfs2_xattr_value_buf vb = {
-				.vb_bh = xs->xattr_bh,
-				.vb_xv = (struct ocfs2_xattr_value_root *)
-					(val + OCFS2_XATTR_SIZE(name_len)),
-				.vb_access = ocfs2_journal_access,
-			};
+			vb.vb_xv = (struct ocfs2_xattr_value_root *)
+				(val + OCFS2_XATTR_SIZE(name_len));
 
 			if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
 				/*
@@ -1605,16 +1609,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		}
 	}
 
-	ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
 
 	if (!(flag & OCFS2_INLINE_XATTR_FL)) {
-		ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = vb.vb_access(handle, inode, vb.vb_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1674,7 +1678,8 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		 * This is the second step for value size > INLINE_SIZE.
 		 */
 		size_t offs = le16_to_cpu(xs->here->xe_name_offset);
-		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, offs);
+		ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+						    &vb, offs);
 		if (ret < 0) {
 			int ret2;
 
@@ -1684,7 +1689,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 			 * the junk tree root we have already set in local.
 			 */
 			ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
-						   xi, xs, offs);
+						   xi, xs, &vb, offs);
 			if (ret2 < 0)
 				mlog_errno(ret2);
 		}
-- 
cgit v1.2.3


From 4311901daabe1d0f22cfcf86c57ad450f14b4e9f Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 16:24:43 -0800
Subject: ocfs2: Pass value buf to ocfs2_remove_value_outside().

ocfs2_remove_value_outside() needs to know the type of buffer it is
looking at.  Pass in an ocfs2_xattr_value_buf.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c08b5e8746c..d2760e64475 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1699,7 +1699,7 @@ out:
 }
 
 static int ocfs2_remove_value_outside(struct inode*inode,
-				      struct buffer_head *bh,
+				      struct ocfs2_xattr_value_buf *vb,
 				      struct ocfs2_xattr_header *header)
 {
 	int ret = 0, i;
@@ -1720,17 +1720,13 @@ static int ocfs2_remove_value_outside(struct inode*inode,
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 
 		if (!ocfs2_xattr_is_local(entry)) {
-			struct ocfs2_xattr_value_buf vb = {
-				.vb_bh = bh,
-				.vb_access = ocfs2_journal_access,
-			};
 			void *val;
 
 			val = (void *)header +
 				le16_to_cpu(entry->xe_name_offset);
-			vb.vb_xv = (struct ocfs2_xattr_value_root *)
+			vb->vb_xv = (struct ocfs2_xattr_value_root *)
 				(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
-			ret = ocfs2_xattr_value_truncate(inode, &vb, 0, &ctxt);
+			ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
 			if (ret < 0) {
 				mlog_errno(ret);
 				break;
@@ -1752,12 +1748,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_xattr_header *header;
 	int ret;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = di_bh,
+		.vb_access = ocfs2_journal_access_di,
+	};
 
 	header = (struct ocfs2_xattr_header *)
 		 ((void *)di + inode->i_sb->s_blocksize -
 		 le16_to_cpu(di->i_xattr_inline_size));
 
-	ret = ocfs2_remove_value_outside(inode, di_bh, header);
+	ret = ocfs2_remove_value_outside(inode, &vb, header);
 
 	return ret;
 }
@@ -1767,11 +1767,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
 {
 	struct ocfs2_xattr_block *xb;
 	int ret = 0;
+	struct ocfs2_xattr_value_buf vb = {
+		.vb_bh = blk_bh,
+		.vb_access = ocfs2_journal_access_xb,
+	};
 
 	xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
 	if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
 		struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
-		ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+		ret = ocfs2_remove_value_outside(inode, &vb, header);
 	} else
 		ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
 
-- 
cgit v1.2.3


From 84008972491ca91b240f106191519781dabb8016 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 9 Dec 2008 16:11:49 -0800
Subject: ocfs2: Use proper journal_access function in xattr.c

Change the rest of the naked ocfs2_journal_access() calls in
fs/ocfs2/xattr.c to use the appropriate ocfs2_journal_access_*() call
for their metadata type.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d2760e64475..17028aa7bc2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1894,8 +1894,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
 		mlog_errno(ret);
 		goto out;
 	}
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_di(handle, inode, di_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -2103,8 +2103,8 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 	int ret;
 
 	if (!xs->xattr_bh) {
-		ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+		ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto end;
@@ -2121,8 +2121,8 @@ static int ocfs2_xattr_block_set(struct inode *inode,
 		new_bh = sb_getblk(inode->i_sb, first_blkno);
 		ocfs2_set_new_buffer_uptodate(inode, new_bh);
 
-		ret = ocfs2_journal_access(handle, inode, new_bh,
-					   OCFS2_JOURNAL_ACCESS_CREATE);
+		ret = ocfs2_journal_access_xb(handle, inode, new_bh,
+					      OCFS2_JOURNAL_ACCESS_CREATE);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto end;
@@ -3377,8 +3377,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	down_write(&oi->ip_alloc_sem);
 
-	ret = ocfs2_journal_access(handle, inode, xb_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -4216,8 +4216,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 
 	ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto leave;
@@ -4808,8 +4808,8 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, root_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
-- 
cgit v1.2.3


From 87d35a74b15ec703910a63e0667692fb5e267be0 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 10 Dec 2008 17:36:25 -0800
Subject: ocfs2: Add directory block trailers.

Future ocfs2 features metaecc and indexed directories need to store a
little bit of data in each dirblock.  For compatibility, we place this
in a trailer at the end of the dirblock.  The trailer plays itself as an
empty dirent, so that if the features are turned off, it can be reused
without requiring a tunefs scan.

This code adds the trailer and validates it when the block is read in.

[ Mark is the original author, but I reinserted this code before his
  dir index work.  -- Joel ]

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c      | 197 ++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/ocfs2/ocfs2.h    |   3 +
 fs/ocfs2/ocfs2_fs.h |  29 ++++++++
 3 files changed, 215 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 45e4e03d8f7..1efd0ab680c 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -83,6 +83,63 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct buffer_head **new_bh);
 
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
+{
+	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+		return 0;
+
+	return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
+
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+	return ocfs2_meta_ecc(osb);
+}
+
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+	return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
+
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
+
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+				  struct ocfs2_dir_entry *de,
+				  unsigned long offset,
+				  unsigned long blklen)
+{
+	unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
+
+	if (!ocfs2_dir_has_trailer(dir))
+		return 0;
+
+	if (offset != toff)
+		return 0;
+
+	return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+				   struct buffer_head *bh)
+{
+	struct ocfs2_dir_block_trailer *trailer;
+
+	trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+	strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+	trailer->db_compat_rec_len =
+			cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+	trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+	trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
+}
+
 /*
  * bh passed here can be an inode block or a dir data block, depending
  * on the inode inline data flag.
@@ -232,16 +289,60 @@ static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
 {
 	int rc = 0;
 	struct buffer_head *tmp = *bh;
+	struct ocfs2_dir_block_trailer *trailer;
 
 	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
 				    ocfs2_validate_dir_block);
-	if (rc)
+	if (rc) {
 		mlog_errno(rc);
+		goto out;
+	}
+
+	/*
+	 * We check the trailer here rather than in
+	 * ocfs2_validate_dir_block() because that function doesn't have
+	 * the inode to test.
+	 */
+	if (!(flags & OCFS2_BH_READAHEAD) &&
+	    ocfs2_dir_has_trailer(inode)) {
+		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Invalid dirblock #%llu: "
+				    "signature = %.*s\n",
+				    (unsigned long long)tmp->b_blocknr, 7,
+				    trailer->db_signature);
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu has an invalid "
+				    "db_blkno of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out;
+		}
+		if (le64_to_cpu(trailer->db_parent_dinode) !=
+		    OCFS2_I(inode)->ip_blkno) {
+			rc = -EINVAL;
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu on dinode "
+				    "#%llu has an invalid parent_dinode "
+				    "of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				    (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+			goto out;
+		}
+	}
 
 	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
-	if (!rc && !*bh)
+	if (!*bh)
 		*bh = tmp;
 
+out:
 	return rc ? -EIO : 0;
 }
 
@@ -581,6 +682,16 @@ int __ocfs2_add_entry(handle_t *handle,
 			goto bail;
 		}
 
+		/* We're guaranteed that we should have space, so we
+		 * can't possibly have hit the trailer...right? */
+		mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+				"Hit dir trailer trying to insert %.*s "
+			        "(namelen %d) into directory %llu.  "
+				"offset is %lu, trailer offset is %d\n",
+				namelen, name, namelen,
+				(unsigned long long)parent_fe_bh->b_blocknr,
+				offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 			retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -622,6 +733,7 @@ int __ocfs2_add_entry(handle_t *handle,
 			retval = 0;
 			goto bail;
 		}
+
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
 	}
@@ -1059,9 +1171,15 @@ int ocfs2_empty_dir(struct inode *inode)
 	return !priv.seen_other;
 }
 
-static void ocfs2_fill_initial_dirents(struct inode *inode,
-				       struct inode *parent,
-				       char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return value.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+							  struct inode *parent,
+							  char *start,
+							  unsigned int size)
 {
 	struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
 
@@ -1078,6 +1196,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
 	de->name_len = 2;
 	strcpy(de->name, "..");
 	ocfs2_set_de_type(de, S_IFDIR);
+
+	return de;
 }
 
 /*
@@ -1130,10 +1250,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 				 struct ocfs2_alloc_context *data_ac)
 {
 	int status;
+	unsigned int size = osb->sb->s_blocksize;
 	struct buffer_head *new_bh = NULL;
+	struct ocfs2_dir_entry *de;
 
 	mlog_entry_void();
 
+	if (ocfs2_supports_dir_trailer(osb))
+		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
 	status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
 				     data_ac, NULL, &new_bh);
 	if (status < 0) {
@@ -1151,8 +1276,9 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	}
 	memset(new_bh->b_data, 0, osb->sb->s_blocksize);
 
-	ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
-				   osb->sb->s_blocksize);
+	de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(inode, new_bh);
 
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
@@ -1193,13 +1319,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 				     data_ac);
 }
 
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
 static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
-				     unsigned int new_size)
+				     struct super_block *sb)
 {
 	struct ocfs2_dir_entry *de;
 	struct ocfs2_dir_entry *prev_de;
 	char *de_buf, *limit;
-	unsigned int bytes = new_size - old_size;
+	unsigned int new_size = sb->s_blocksize;
+	unsigned int bytes;
+
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		new_size = ocfs2_dir_trailer_blk_off(sb);
+
+	bytes = new_size - old_size;
 
 	limit = start + old_size;
 	de_buf = start;
@@ -1316,8 +1456,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
 	memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
 	memset(dirdata_bh->b_data + i_size_read(dir), 0,
 	       sb->s_blocksize - i_size_read(dir));
-	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
-				 sb->s_blocksize);
+	ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+	if (ocfs2_supports_dir_trailer(osb))
+		ocfs2_init_dir_trailer(dir, dirdata_bh);
 
 	ret = ocfs2_journal_dirty(handle, dirdata_bh);
 	if (ret) {
@@ -1604,9 +1745,15 @@ do_extend:
 		goto bail;
 	}
 	memset(new_bh->b_data, 0, sb->s_blocksize);
+
 	de = (struct ocfs2_dir_entry *) new_bh->b_data;
 	de->inode = 0;
-	de->rec_len = cpu_to_le16(sb->s_blocksize);
+	if (ocfs2_dir_has_trailer(dir)) {
+		de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+		ocfs2_init_dir_trailer(dir, new_bh);
+	} else {
+		de->rec_len = cpu_to_le16(sb->s_blocksize);
+	}
 	status = ocfs2_journal_dirty(handle, new_bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1648,11 +1795,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 				   unsigned int *blocks_wanted)
 {
 	int ret;
+	struct super_block *sb = dir->i_sb;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 	struct ocfs2_dir_entry *de, *last_de = NULL;
 	char *de_buf, *limit;
 	unsigned long offset = 0;
-	unsigned int rec_len, new_rec_len;
+	unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+	/*
+	 * This calculates how many free bytes we'd have in block zero, should
+	 * this function force expansion to an extent tree.
+	 */
+	if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+		free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+	else
+		free_space = dir->i_sb->s_blocksize - i_size_read(dir);
 
 	de_buf = di->id2.i_data.id_data;
 	limit = de_buf + i_size_read(dir);
@@ -1669,6 +1826,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 			ret = -EEXIST;
 			goto out;
 		}
+		/*
+		 * No need to check for a trailing dirent record here as
+		 * they're not used for inline dirs.
+		 */
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
@@ -1689,7 +1851,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
 	 * dirent can be found.
 	 */
 	*blocks_wanted = 1;
-	new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+	new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
 	if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
 		*blocks_wanted = 2;
 
@@ -1707,6 +1869,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = dir->i_sb;
 	int status;
+	int blocksize = dir->i_sb->s_blocksize;
 
 	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
 	if (status) {
@@ -1748,6 +1911,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = -EEXIST;
 			goto bail;
 		}
+
+		if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+					   blocksize))
+			goto next;
+
 		if (ocfs2_dirent_would_fit(de, rec_len)) {
 			/* Ok, we found a spot. Return this bh and let
 			 * the caller actually fill it in. */
@@ -1756,6 +1924,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = 0;
 			goto bail;
 		}
+next:
 		offset += le16_to_cpu(de->rec_len);
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 	}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bad87d0a03c..ad5c24a29ed 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -470,6 +470,9 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
 #define OCFS2_IS_VALID_XATTR_BLOCK(ptr)					\
 	(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
 
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr)					\
+	(!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
 static inline unsigned long ino_from_blkno(struct super_block *sb,
 					   u64 blkno)
 {
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 290fa26fba6..af0013b9c17 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
 #define OCFS2_EXTENT_BLOCK_SIGNATURE	"EXBLK01"
 #define OCFS2_GROUP_DESC_SIGNATURE      "GROUP01"
 #define OCFS2_XATTR_BLOCK_SIGNATURE	"XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE	"DIRTRL1"
 
 /* Compatibility flags */
 #define OCFS2_HAS_COMPAT_FEATURE(sb,mask)			\
@@ -751,6 +752,34 @@ struct ocfs2_dir_entry {
 /* Actual on-disk length specified by rec_len */
 } __attribute__ ((packed));
 
+/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/	__le64		db_compat_inode;	/* Always zero. Was inode */
+
+	__le16		db_compat_rec_len;	/* Backwards compatible with
+						 * ocfs2_dir_entry. */
+	__u8		db_compat_name_len;	/* Always zero. Was name_len */
+	__u8		db_reserved0;
+	__le16		db_reserved1;
+	__le16		db_free_rec_len;	/* Size of largest empty hole
+						 * in this block. (unused) */
+/*10*/	__u8		db_signature[8];	/* Signature for verification */
+	__le64		db_reserved2;
+/*20*/	__le64		db_free_next;		/* Next block in list (unused) */
+	__le64		db_blkno;		/* Offset on disk, in blocks */
+/*30*/	__le64		db_parent_dinode;	/* dinode which owns me, in
+						   blocks */
+	__le64		db_check;		/* Error checking */
+/*40*/
+};
+
 /*
  * On disk allocator group structure for OCFS2
  */
-- 
cgit v1.2.3


From c175a518b4a1d514483abf61813ce5d855917164 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Wed, 10 Dec 2008 17:58:22 -0800
Subject: ocfs2: Checksum and ECC for directory blocks.

Use the db_check field of ocfs2_dir_block_trailer to crc/ecc the
dirblocks.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dir.c      | 37 +++++++++++++++++++++++++++++++++++--
 fs/ocfs2/dir.h      |  2 ++
 fs/ocfs2/journal.c  | 31 +++++++++++++++++++++++++++++--
 fs/ocfs2/ocfs2_fs.h |  2 +-
 4 files changed, 67 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 1efd0ab680c..f2c4098cf33 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -48,6 +48,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dir.h"
 #include "dlmglue.h"
 #include "extent_map.h"
@@ -107,6 +108,17 @@ static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
 
 #define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
 
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data)
+{
+	char *p = data;
+
+	p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+	return (struct ocfs2_dir_block_trailer *)p;
+}
+
 /*
  * XXX: This is executed once on every dirent. We should consider optimizing
  * it.
@@ -268,14 +280,35 @@ out:
 static int ocfs2_validate_dir_block(struct super_block *sb,
 				    struct buffer_head *bh)
 {
+	int rc;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(bh, sb);
+
+
 	/*
-	 * Nothing yet.  We don't validate dirents here, that's handled
+	 * We don't validate dirents here, that's handled
 	 * in-place when the code walks them.
 	 */
 	mlog(0, "Validating dirblock %llu\n",
 	     (unsigned long long)bh->b_blocknr);
 
-	return 0;
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 *
+	 * Note that we are safe to call this even if the directory
+	 * doesn't have a trailer.  Filesystems without metaecc will do
+	 * nothing, and filesystems with it will have one.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+	if (rc)
+		mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	return rc;
 }
 
 /*
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d8..c511e2e18e9 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 		       struct buffer_head *fe_bh,
 		       struct ocfs2_alloc_context *data_ac);
 
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+							    void *data);
 #endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 3b54dba0f74..57d7d25a2b9 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -415,6 +415,26 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
 	ocfs2_block_check_compute(data, size, &dqt->dq_check);
 }
 
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+				 struct buffer_head *bh,
+				 void *data, size_t size)
+{
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_dir_trailer_from_size(size, data);
+
+	/*
+	 * We aren't guaranteed to have the superblock here, so we
+	 * must unconditionally compute the ecc data.
+	 * __ocfs2_journal_access() will only set the triggers if
+	 * metaecc is enabled.
+	 */
+	ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
 static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 				struct buffer_head *bh)
 {
@@ -454,6 +474,13 @@ static struct ocfs2_triggers gd_triggers = {
 	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
 };
 
+static struct ocfs2_triggers db_triggers = {
+	.ot_triggers = {
+		.t_commit = ocfs2_db_commit_trigger,
+		.t_abort = ocfs2_abort_trigger,
+	},
+};
+
 static struct ocfs2_triggers xb_triggers = {
 	.ot_triggers = {
 		.t_commit = ocfs2_commit_trigger,
@@ -555,8 +582,8 @@ int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
 int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
 			    struct buffer_head *bh, int type)
 {
-	/* Right now, nothing for dirblocks */
-	return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+	return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
+				      type);
 }
 
 int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index af0013b9c17..698ef3d2712 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -776,7 +776,7 @@ struct ocfs2_dir_block_trailer {
 /*20*/	__le64		db_blkno;		/* Offset on disk, in blocks */
 	__le64		db_parent_dinode;	/* dinode which owns me, in
 						   blocks */
-/*30*/	__le64		db_check;		/* Error checking */
+/*30*/	struct ocfs2_block_check db_check;	/* Error checking */
 /*40*/
 };
 
-- 
cgit v1.2.3


From d030cc978e9e636dc39ce9a9e8282d48698a3b30 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 11 Dec 2008 15:04:14 -0800
Subject: ocfs2: Validate superblock with checksum and ecc.

The superblock is read via a raw call.  Validate it after we find it
from its signature.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/super.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2eb657c3e7a..43ed11345b5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -52,6 +52,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
 #include "extent_map.h"
@@ -1989,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+		/* We have to do a raw check of the feature here */
+		if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+		    OCFS2_FEATURE_INCOMPAT_META_ECC) {
+			status = ocfs2_block_check_validate(bh->b_data,
+							    bh->b_size,
+							    &di->i_check);
+			if (status)
+				goto out;
+		}
 		status = -EINVAL;
 		if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
 			mlog(ML_ERROR, "found superblock with incorrect block "
@@ -2030,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 		}
 	}
 
+out:
 	mlog_exit(status);
 	return status;
 }
-- 
cgit v1.2.3


From 9d28cfb73f3abccce001daf2d247b16bf20e2248 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 16 Oct 2008 17:53:29 -0700
Subject: ocfs2: Enable metadata checksums.

Add OCFS2_FEATURE_INCOMPAT_META_ECC to the list of supported features.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/ocfs2_fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 698ef3d2712..c7ae45aaa36 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -94,7 +94,8 @@
 					 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
 					 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
 					 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
-					 | OCFS2_FEATURE_INCOMPAT_XATTR)
+					 | OCFS2_FEATURE_INCOMPAT_XATTR \
+					 | OCFS2_FEATURE_INCOMPAT_META_ECC)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
 					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
-- 
cgit v1.2.3


From e798b3f8a920c82a8e556dd54df97f0d3d0f9144 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 15 Dec 2008 17:13:48 -0800
Subject: ocfs2: Don't hand-code xor in ocfs2_hamming_encode().

When I wrote ocfs2_hamming_encode(), I was following documentation of
the algorithm and didn't have quite the (possibly still imperfect) grasp
of it I do now.  As part of this, I literally hand-coded xor.  I would
test a bit, and then add that bit via xor to the parity word.

I can, of course, just do a single xor of the parity word and the source
word (the code buffer bit offset).  This cuts CPU usage by 53% on a
mostly populated buffer (an inode containing utmp.h inline).

Joel

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/blockcheck.c | 67 +++++++++++++++------------------------------------
 1 file changed, 20 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 2ce6ae5e4b8..1d5083cef3a 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -31,7 +31,6 @@
 #include "blockcheck.h"
 
 
-
 /*
  * We use the following conventions:
  *
@@ -39,26 +38,6 @@
  * p = # parity bits
  * c = # total code bits (d + p)
  */
-static int calc_parity_bits(unsigned int d)
-{
-	unsigned int p;
-
-	/*
-	 * Bits required for Single Error Correction is as follows:
-	 *
-	 * d + p + 1 <= 2^p
-	 *
-	 * We're restricting ourselves to 31 bits of parity, that should be
-	 * sufficient.
-	 */
-	for (p = 1; p < 32; p++)
-	{
-		if ((d + p + 1) <= (1 << p))
-			return p;
-	}
-
-	return 0;
-}
 
 /*
  * Calculate the bit offset in the hamming code buffer based on the bit's
@@ -109,10 +88,9 @@ static unsigned int calc_code_bit(unsigned int i)
  */
 u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
 {
-	unsigned int p = calc_parity_bits(nr + d);
-	unsigned int i, j, b;
+	unsigned int i, b;
 
-	BUG_ON(!p);
+	BUG_ON(!d);
 
 	/*
 	 * b is the hamming code bit number.  Hamming code specifies a
@@ -131,27 +109,23 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr
 		 */
 		b = calc_code_bit(nr + i);
 
-		for (j = 0; j < p; j++)
-		{
-			/*
-			 * Data bits in the resultant code are checked by
-			 * parity bits that are part of the bit number
-			 * representation.  Huh?
-			 *
-			 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
-			 * In other words, the parity bit at position 2^k
-			 * checks bits in positions having bit k set in
-			 * their binary representation.  Conversely, for
-			 * instance, bit 13, i.e. 1101(2), is checked by
-			 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
-			 * </wikipedia>
-			 *
-			 * Note that 'k' is the _code_ bit number.  'b' in
-			 * our loop.
-			 */
-			if (b & (1 << j))
-				parity ^= (1 << j);
-		}
+		/*
+		 * Data bits in the resultant code are checked by
+		 * parity bits that are part of the bit number
+		 * representation.  Huh?
+		 *
+		 * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+		 * In other words, the parity bit at position 2^k
+		 * checks bits in positions having bit k set in
+		 * their binary representation.  Conversely, for
+		 * instance, bit 13, i.e. 1101(2), is checked by
+		 * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+		 * </wikipedia>
+		 *
+		 * Note that 'k' is the _code_ bit number.  'b' in
+		 * our loop.
+		 */
+		parity ^= b;
 	}
 
 	/* While the data buffer was treated as little endian, the
@@ -174,10 +148,9 @@ u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
 void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
 		       unsigned int fix)
 {
-	unsigned int p = calc_parity_bits(nr + d);
 	unsigned int i, b;
 
-	BUG_ON(!p);
+	BUG_ON(!d);
 
 	/*
 	 * If the bit to fix has an hweight of 1, it's a parity bit.  One
-- 
cgit v1.2.3


From 7bb458a58588f397068e4166c615e9fcc7480c16 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Mon, 15 Dec 2008 18:24:33 -0800
Subject: ocfs2: Another hamming code optimization.

In the calc_code_bit() function, we must find all powers of two beneath
the code bit number, *after* it's shifted by those powers of two.  This
requires a loop to see where it ends up.

We can optimize it by starting at its most significant bit.  This shaves
32% off the time, for a total of 67.6% shaved off of the original, naive
implementation.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/blockcheck.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 1d5083cef3a..f102ec939c9 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -39,6 +39,35 @@
  * c = # total code bits (d + p)
  */
 
+
+/*
+ * Find the log base 2 of 32-bit v.
+ *
+ * Algorithm found on http://graphics.stanford.edu/~seander/bithacks.html,
+ * by Sean Eron Anderson.  Code on the page is in the public domain unless
+ * otherwise noted.
+ *
+ * This particular algorithm is credited to Eric Cole.
+ */
+static int find_highest_bit_set(unsigned int v)
+{
+
+	static const int MultiplyDeBruijnBitPosition[32] =
+	{
+		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+	};
+
+	v |= v >> 1; /* first round down to power of 2 */
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v = (v >> 1) + 1;
+
+	return MultiplyDeBruijnBitPosition[(u32)(v * 0x077CB531UL) >> 27];
+}
+
 /*
  * Calculate the bit offset in the hamming code buffer based on the bit's
  * offset in the data buffer.  Since the hamming code reserves all
@@ -63,13 +92,22 @@ static unsigned int calc_code_bit(unsigned int i)
 	 */
 	b = i + 1;
 
+	/*
+	 * As a cheat, we know that all bits below b's highest bit must be
+	 * parity bits, so we can start there.
+	 */
+        p = find_highest_bit_set(b);
+        b += p;
+
 	/*
 	 * For every power of two below our bit number, bump our bit.
 	 *
 	 * We compare with (b + 1) becuase we have to compare with what b
 	 * would be _if_ it were bumped up by the parity bit.  Capice?
+	 *
+	 * We start p at 2^p because of the cheat above.
 	 */
-	for (p = 0; (1 << p) < (b + 1); p++)
+	for (p = (1 << p); p < (b + 1); p <<= 1)
 		b++;
 
 	return b;
-- 
cgit v1.2.3


From 58896c4d0e5868360ea0693c607d5bf74f79da6b Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 16 Dec 2008 13:54:40 -0800
Subject: ocfs2: One more hamming code optimization.

The previous optimization used a fast find-highest-bit-set operation to
give us a good starting point in calc_code_bit().  This version lets the
caller cache the previous code buffer bit offset.  Thus, the next call
always starts where the last one left off.

This reduces the calculation another 39%, for a total 80% reduction from
the original, naive implementation.  At least, on my machine.  This also
brings the parity calculation to within an order of magnitude of the
crc32 calculation.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/blockcheck.c | 61 ++++++++++++++++-----------------------------------
 1 file changed, 19 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index f102ec939c9..2a947c44e59 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -40,34 +40,6 @@
  */
 
 
-/*
- * Find the log base 2 of 32-bit v.
- *
- * Algorithm found on http://graphics.stanford.edu/~seander/bithacks.html,
- * by Sean Eron Anderson.  Code on the page is in the public domain unless
- * otherwise noted.
- *
- * This particular algorithm is credited to Eric Cole.
- */
-static int find_highest_bit_set(unsigned int v)
-{
-
-	static const int MultiplyDeBruijnBitPosition[32] =
-	{
-		0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
-		31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
-	};
-
-	v |= v >> 1; /* first round down to power of 2 */
-	v |= v >> 2;
-	v |= v >> 4;
-	v |= v >> 8;
-	v |= v >> 16;
-	v = (v >> 1) + 1;
-
-	return MultiplyDeBruijnBitPosition[(u32)(v * 0x077CB531UL) >> 27];
-}
-
 /*
  * Calculate the bit offset in the hamming code buffer based on the bit's
  * offset in the data buffer.  Since the hamming code reserves all
@@ -81,10 +53,14 @@ static int find_highest_bit_set(unsigned int v)
  * so it's a parity bit.  2 is a power of two (2^1), so it's a parity bit.
  * 3 is not a power of two.  So bit 1 of the data buffer ends up as bit 3
  * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added.  This allows the function to start the
+ * calculation at the last place.
  */
-static unsigned int calc_code_bit(unsigned int i)
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
 {
-	unsigned int b, p;
+	unsigned int b, p = 0;
 
 	/*
 	 * Data bits are 0-based, but we're talking code bits, which
@@ -92,24 +68,25 @@ static unsigned int calc_code_bit(unsigned int i)
 	 */
 	b = i + 1;
 
-	/*
-	 * As a cheat, we know that all bits below b's highest bit must be
-	 * parity bits, so we can start there.
-	 */
-        p = find_highest_bit_set(b);
+	/* Use the cache if it is there */
+	if (p_cache)
+		p = *p_cache;
         b += p;
 
 	/*
 	 * For every power of two below our bit number, bump our bit.
 	 *
-	 * We compare with (b + 1) becuase we have to compare with what b
+	 * We compare with (b + 1) because we have to compare with what b
 	 * would be _if_ it were bumped up by the parity bit.  Capice?
 	 *
-	 * We start p at 2^p because of the cheat above.
+	 * p is set above.
 	 */
-	for (p = (1 << p); p < (b + 1); p <<= 1)
+	for (; (1 << p) < (b + 1); p++)
 		b++;
 
+	if (p_cache)
+		*p_cache = p;
+
 	return b;
 }
 
@@ -126,7 +103,7 @@ static unsigned int calc_code_bit(unsigned int i)
  */
 u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
 {
-	unsigned int i, b;
+	unsigned int i, b, p = 0;
 
 	BUG_ON(!d);
 
@@ -145,7 +122,7 @@ u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr
 		 * i is the offset in this hunk, nr + i is the total bit
 		 * offset.
 		 */
-		b = calc_code_bit(nr + i);
+		b = calc_code_bit(nr + i, &p);
 
 		/*
 		 * Data bits in the resultant code are checked by
@@ -201,7 +178,7 @@ void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
 	 * nr + d is the bit right past the data hunk we're looking at.
 	 * If fix after that, nothing to do
 	 */
-	if (fix >= calc_code_bit(nr + d))
+	if (fix >= calc_code_bit(nr + d, NULL))
 		return;
 
 	/*
@@ -209,7 +186,7 @@ void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
 	 * start b at the offset in the code buffer.  See hamming_encode()
 	 * for a more detailed description of 'b'.
 	 */
-	b = calc_code_bit(nr);
+	b = calc_code_bit(nr, NULL);
 	/* If the fix is before this hunk, nothing to do */
 	if (fix < b)
 		return;
-- 
cgit v1.2.3


From 2b83256407687613e906bee93d98a25339128a4d Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 16 Dec 2008 15:49:19 -0800
Subject: ocfs2/dlm: Fix a race between migrate request and exit domain

Patch addresses a race between a migrate request and an exit domain message.
Instead of blocking exit domains for the duration of the migrate, we ignore
failure to deliver that message. This is because an exiting domain should
not have any active locks and thus has no role to play in the migration.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf368..92fd1d7d612 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2949,7 +2949,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 				  struct dlm_node_iter *iter)
 {
 	struct dlm_migrate_request migrate;
-	int ret, status = 0;
+	int ret, skip, status = 0;
 	int nodenum;
 
 	memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2966,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
 		    nodenum == new_master)
 			continue;
 
+		/* We could race exit domain. If exited, skip. */
+		spin_lock(&dlm->spinlock);
+		skip = (!test_bit(nodenum, dlm->domain_map));
+		spin_unlock(&dlm->spinlock);
+		if (skip) {
+			clear_bit(nodenum, iter->node_map);
+			continue;
+		}
+
 		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
 					 &migrate, sizeof(migrate), nodenum,
 					 &status);
-		if (ret < 0)
-			mlog_errno(ret);
-		else if (status < 0) {
+		if (ret < 0) {
+			mlog(0, "migrate_request returned %d!\n", ret);
+			if (!dlm_is_host_down(ret)) {
+				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+				BUG();
+			}
+			clear_bit(nodenum, iter->node_map);
+			ret = 0;
+		} else if (status < 0) {
 			mlog(0, "migrate request (node %u) returned %d!\n",
 			     nodenum, status);
 			ret = status;
-- 
cgit v1.2.3


From 57dff2676eb68d805883a2204faaa5339ac44e03 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 16 Dec 2008 15:49:20 -0800
Subject: ocfs2/dlm: Clean up errors in dlm_proxy_ast_handler()

Patch cleans up the errors printed in dlm_proxy_ast_handler(). The errors now
include the number of the node that sent the (b)ast. It also reduces the number
of endian swaps of the cookie.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmast.c | 52 ++++++++++++++++++++++++++-------------------------
 1 file changed, 27 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8b..d07ddbe4b28 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct list_head *iter, *head=NULL;
 	u64 cookie;
 	u32 flags;
+	u8 node;
 
 	if (!dlm_grab(dlm)) {
 		dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	name = past->name;
 	locklen = past->namelen;
-	cookie = be64_to_cpu(past->cookie);
+	cookie = past->cookie;
 	flags = be32_to_cpu(past->flags);
+	node = past->node_idx;
 
 	if (locklen > DLM_LOCKID_NAME_MAX) {
 		ret = DLM_IVBUFLEN;
-		mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+		mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+		     "handler!\n", locklen);
 		goto leave;
 	}
 
 	if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
 	     (LKM_PUT_LVB|LKM_GET_LVB)) {
-		mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+		mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+		     flags);
 		ret = DLM_BADARGS;
 		goto leave;
 	}
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	if (past->type != DLM_AST &&
 	    past->type != DLM_BAST) {
 		mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
-		     "name=%.*s\n", past->type, 
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name);
+		     "name=%.*s, node=%u\n", past->type,
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
 
 	res = dlm_lookup_lockres(dlm, name, locklen);
 	if (!res) {
-		mlog(0, "got %sast for unknown lockres! "
-		     "cookie=%u:%llu, name=%.*s, namelen=%u\n",
-		     past->type == DLM_AST ? "" : "b",
-		     dlm_get_lock_cookie_node(cookie),
-		     dlm_get_lock_cookie_seq(cookie),
-		     locklen, name, locklen);
+		mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+		     "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+		     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+		     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+		     locklen, name, node);
 		ret = DLM_IVLOCKID;
 		goto leave;
 	}
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	spin_lock(&res->spinlock);
 	if (res->state & DLM_LOCK_RES_RECOVERING) {
-		mlog(0, "responding with DLM_RECOVERING!\n");
+		mlog(0, "Responding with DLM_RECOVERING!\n");
 		ret = DLM_RECOVERING;
 		goto unlock_out;
 	}
 	if (res->state & DLM_LOCK_RES_MIGRATING) {
-		mlog(0, "responding with DLM_MIGRATING!\n");
+		mlog(0, "Responding with DLM_MIGRATING!\n");
 		ret = DLM_MIGRATING;
 		goto unlock_out;
 	}
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 	lock = NULL;
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 	}
 
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	list_for_each(iter, head) {
 		lock = list_entry (iter, struct dlm_lock, list);
-		if (be64_to_cpu(lock->ml.cookie) == cookie)
+		if (lock->ml.cookie == cookie)
 			goto do_ast;
 	}
 
-	mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
-	     "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-	     dlm_get_lock_cookie_node(cookie),
-	     dlm_get_lock_cookie_seq(cookie),
-	     locklen, name, locklen);
+	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+	     "node=%u\n", past->type == DLM_AST ? "" : "b",
+	     dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+	     dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+	     locklen, name, node);
 
 	ret = DLM_NORMAL;
 unlock_out:
@@ -383,8 +386,8 @@ do_ast:
 	if (past->type == DLM_AST) {
 		/* do not alter lock refcount.  switching lists. */
 		list_move_tail(&lock->list, &res->granted);
-		mlog(0, "ast: adding to granted list... type=%d, "
-			  "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+		mlog(0, "ast: Adding to granted list... type=%d, "
+		     "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
 		if (lock->ml.convert_type != LKM_IVMODE) {
 			lock->ml.type = lock->ml.convert_type;
 			lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
 		dlm_do_local_bast(dlm, res, lock, past->blocked_type);
 
 leave:
-
 	if (res)
 		dlm_lockres_put(res);
 
-- 
cgit v1.2.3


From d4f7e650e55af6b235871126f747da88600e8040 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 16 Dec 2008 15:49:21 -0800
Subject: ocfs2/dlm: Hold off sending lockres drop ref message while lockres is
 migrating

During lockres purge, o2dlm sends a drop reference message to the lockres
master. This patch delays the message if the lockres is being migrated.

Fixes oss bugzilla#1012
http://oss.oracle.com/bugzilla/show_bug.cgi?id=1012

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmthread.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc..d1295203029 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+						  DLM_LOCK_RES_MIGRATING));
 		spin_unlock(&res->spinlock);
 
 		/* clear our bit from the master's refmap, ignore errors */
-- 
cgit v1.2.3


From b0d4f817ba5de8adb875ace594554a96d7737710 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 16 Dec 2008 15:49:22 -0800
Subject: ocfs2/dlm: Fix race in adding/removing lockres' to/from the tracking
 list

This patch adds a new lock, dlm->track_lock, to protect adding/removing
lockres' to/from the dlm->tracking_list. We were previously using dlm->spinlock
for the same, but that proved inadequate as we could be freeing a lockres from
a context that did not hold that lock. As the new lock only protects this list,
we can explicitly take it when removing the lockres from the tracking list.

This bug was exposed when testing multiple processes concurrently calling
flock() on the same file.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmcommon.h |  3 +++
 fs/ocfs2/dlm/dlmdebug.c  | 53 ++++++++++++++++++++++--------------------------
 fs/ocfs2/dlm/dlmdomain.c |  1 +
 fs/ocfs2/dlm/dlmmaster.c | 10 +++++++++
 4 files changed, 38 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a4..bb53714813a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
 	unsigned int purge_count;
 	spinlock_t spinlock;
 	spinlock_t ast_lock;
+	spinlock_t track_lock;
 	char *name;
 	u8 node_num;
 	u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
 	 * put on a list for the dlm thread to run. */
 	unsigned long    last_used;
 
+	struct dlm_ctxt *dlm;
+
 	unsigned migration_pending:1;
 	atomic_t asts_reserved;
 	spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175..b32f60a5acf 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
 {
 	struct debug_lockres *dl = m->private;
 	struct dlm_ctxt *dlm = dl->dl_ctxt;
+	struct dlm_lock_resource *oldres = dl->dl_res;
 	struct dlm_lock_resource *res = NULL;
+	struct list_head *track_list;
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->track_lock);
+	if (oldres)
+		track_list = &oldres->tracking;
+	else
+		track_list = &dlm->tracking_list;
 
-	if (dl->dl_res) {
-		list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
-			if (dl->dl_res) {
-				dlm_lockres_put(dl->dl_res);
-				dl->dl_res = NULL;
-			}
-			if (&res->tracking == &dlm->tracking_list) {
-				mlog(0, "End of list found, %p\n", res);
-				dl = NULL;
-				break;
-			}
+	list_for_each_entry(res, track_list, tracking) {
+		if (&res->tracking == &dlm->tracking_list)
+			res = NULL;
+		else
 			dlm_lockres_get(res);
-			dl->dl_res = res;
-			break;
-		}
-	} else {
-		if (!list_empty(&dlm->tracking_list)) {
-			list_for_each_entry(res, &dlm->tracking_list, tracking)
-				break;
-			dlm_lockres_get(res);
-			dl->dl_res = res;
-		} else
-			dl = NULL;
+		break;
 	}
+	spin_unlock(&dlm->track_lock);
 
-	if (dl) {
-		spin_lock(&dl->dl_res->spinlock);
-		dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
-		spin_unlock(&dl->dl_res->spinlock);
-	}
+	if (oldres)
+		dlm_lockres_put(oldres);
 
-	spin_unlock(&dlm->spinlock);
+	dl->dl_res = res;
+
+	if (res) {
+		spin_lock(&res->spinlock);
+		dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+		spin_unlock(&res->spinlock);
+	} else
+		dl = NULL;
 
+	/* passed to seq_show */
 	return dl;
 }
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e..d8d578f4561 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
 	spin_lock_init(&dlm->ast_lock);
+	spin_lock_init(&dlm->track_lock);
 	INIT_LIST_HEAD(&dlm->list);
 	INIT_LIST_HEAD(&dlm->dirty_list);
 	INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 92fd1d7d612..cbf3abe24cd 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
 static void dlm_lockres_release(struct kref *kref)
 {
 	struct dlm_lock_resource *res;
+	struct dlm_ctxt *dlm;
 
 	res = container_of(kref, struct dlm_lock_resource, refs);
+	dlm = res->dlm;
 
 	/* This should not happen -- all lockres' have a name
 	 * associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
 	     res->lockname.name);
 
+	spin_lock(&dlm->track_lock);
 	if (!list_empty(&res->tracking))
 		list_del_init(&res->tracking);
 	else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
 		     res->lockname.len, res->lockname.name);
 		dlm_print_one_lock_resource(res);
 	}
+	spin_unlock(&dlm->track_lock);
+
+	dlm_put(dlm);
 
 	if (!hlist_unhashed(&res->hash_node) ||
 	    !list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
 	res->migration_pending = 0;
 	res->inflight_locks = 0;
 
+	/* put in dlm_lockres_release */
+	dlm_grab(dlm);
+	res->dlm = dlm;
+
 	kref_init(&res->refs);
 
 	/* just for consistency */
-- 
cgit v1.2.3


From 7b791d68562e4ce5ab57cbacb10a1ad4ee33956e Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Tue, 16 Dec 2008 15:49:23 -0800
Subject: ocfs2/dlm: Fix race during lockres mastery

dlm_get_lock_resource() is supposed to return a lock resource with a proper
master. If multiple concurrent threads attempt to look up the lockres for the
same lockid while the lock mastery is underway, one or more threads are likely
to return a lockres without a proper master.

This patch makes the threads wait in dlm_get_lock_resource() while the mastery
is underway, ensuring all threads return the lockres with a proper master.

This issue is known to be limited to users using the flock() syscall. For all
other fs operations, the ocfs2 dlmglue layer serializes the dlm op for each
lockid.

Users encountering this bug will see flock() return EINVAL and dmesg have the
following error:
ERROR: Dlm error "DLM_BADARGS" while calling dlmlock on resource <LOCKID>: bad api args

Reported-by: Coly Li <coyli@suse.de>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index cbf3abe24cd..54e182a27ca 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -732,14 +732,21 @@ lookup:
 	if (tmpres) {
 		int dropping_ref = 0;
 
+		spin_unlock(&dlm->spinlock);
+
 		spin_lock(&tmpres->spinlock);
+		/* We wait for the other thread that is mastering the resource */
+		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+			__dlm_wait_on_lockres(tmpres);
+			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+		}
+
 		if (tmpres->owner == dlm->node_num) {
 			BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
 			dlm_lockres_grab_inflight_ref(dlm, tmpres);
 		} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
 			dropping_ref = 1;
 		spin_unlock(&tmpres->spinlock);
-		spin_unlock(&dlm->spinlock);
 
 		/* wait until done messaging the master, drop our ref to allow
 		 * the lockres to be purged, start over. */
-- 
cgit v1.2.3


From 71d548a6af36fe98c95fbd0522147f842bd5f054 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 5 Dec 2008 06:20:54 +0800
Subject: ocfs2/xattr: Remove extend_trans call and add its credits from the
 beginning

Actually, when setting a new xattr value, we know the credits it needs from
the very beginning; it isn't like the extension of a bucket, in which case
we can't figure them out. So remove ocfs2_extend_trans from that function
and calculate the credits before starting the transaction. This also relieves
the acl operations of worrying about the side effects of ocfs2_extend_trans.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 17028aa7bc2..93a1ab4fe1d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1169,7 +1169,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 					   const void *value,
 					   int value_len)
 {
-	int ret = 0, i, cp_len, credits;
+	int ret = 0, i, cp_len;
 	u16 blocksize = inode->i_sb->s_blocksize;
 	u32 p_cluster, num_clusters;
 	u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
@@ -1179,18 +1179,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
 
 	BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
 
-	/*
-	 * In __ocfs2_xattr_set_value_outside has already been dirtied,
-	 * so we don't need to worry about whether ocfs2_extend_trans
-	 * will create a new transactio for us or not.
-	 */
-	credits = clusters * bpc;
-	ret = ocfs2_extend_trans(handle, credits);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	while (cpos < clusters) {
 		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
 					       &num_clusters, &xv->xr_list);
@@ -2233,6 +2221,15 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
 						    xi->value_len);
 	u64 value_size;
 
+	/*
+	 * Calculate the clusters we need to write.
+	 * No matter whether we replace an old one or add a new one,
+	 * we need this for writing.
+	 */
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		credits += new_clusters *
+			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
 	if (xis->not_found && xbs->not_found) {
 		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
 
-- 
cgit v1.2.3


From 4b3f6209bf9eec46fe5ebb168718fef5c443c157 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 5 Dec 2008 06:20:55 +0800
Subject: ocfs2/xattr: Always updating ctime during xattr set.

In xattr set, we should always update ctime if the operation completes
successfully. The old code mistakenly did this in ocfs2_xattr_set_entry,
which is only called when we set an xattr in the inode or the xattr block.
A side benefit is that this resolves bug 1052: in that scenario,
ocfs2_calc_xattr_set_need only calculates the credits for the xattr set
itself, while ocfs2_xattr_set_entry also updates the inode, which is not
part of the xattr set process.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 93a1ab4fe1d..3e2e92d7059 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1651,10 +1651,6 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 	oi->ip_dyn_features |= flag;
 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
 	spin_unlock(&oi->ip_lock);
-	/* Update inode ctime */
-	inode->i_ctime = CURRENT_TIME;
-	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
-	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
 
 	ret = ocfs2_journal_dirty(handle, xs->inode_bh);
 	if (ret < 0)
@@ -2574,6 +2570,20 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 		}
 	}
 
+	if (!ret) {
+		/* Update inode ctime. */
+		ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		inode->i_ctime = CURRENT_TIME;
+		di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+		di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+		ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+	}
 out:
 	return ret;
 }
@@ -2750,6 +2760,8 @@ int ocfs2_xattr_set(struct inode *inode,
 		goto cleanup;
 	}
 
+	/* we need to update inode's ctime field, so add credit for it. */
+	credits += OCFS2_INODE_UPDATE_CREDITS;
 	ctxt.handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(ctxt.handle)) {
 		ret = PTR_ERR(ctxt.handle);
-- 
cgit v1.2.3


From 90cb546cada68bb8c2278afdb4b65c2ac11f2877 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 5 Dec 2008 06:20:56 +0800
Subject: ocfs2/xattr: fix credits calculation during index create

When creating an xattr index block, the old calculation forgot
to add credits for the meta change of the alloc file. So add
more credits and more comments to explain it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 3e2e92d7059..73fb9f76251 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2359,13 +2359,21 @@ meta_guess:
 		} else
 			xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
 
+		/*
+		 * If there is already an xattr tree, good, we can calculate
+		 * like other b-trees. Otherwise we may have the chance of
+		 * create a tree, the credit calculation is borrowed from
+		 * ocfs2_calc_extend_credits with root_el = NULL. And the
+		 * new tree will be cluster based, so no meta is needed.
+		 */
 		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
 			struct ocfs2_extent_list *el =
 				 &xb->xb_attrs.xb_root.xt_list;
 			meta_add += ocfs2_extend_meta_needed(el);
 			credits += ocfs2_calc_extend_credits(inode->i_sb,
 							     el, 1);
-		}
+		} else
+			credits += OCFS2_SUBALLOC_ALLOC + 1;
 
 		/*
 		 * This cluster will be used either for new bucket or for
-- 
cgit v1.2.3


From 0e445b6fe93c723fe8093fd04ddfeb11ae2de082 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Tue, 9 Dec 2008 16:42:51 +0800
Subject: ocfs2: calculate and reserve credits for xattr value in mknod

Previously we extended the credits for an xattr's large value in
set_value_outside; this can give rise to a credits issue when we set one
security entry and two acl entries during mknod. As we remove extend_trans
from set_value_outside, we must calculate and reserve the credits for the
xattr's large value in mknod.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 40 ++++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 73fb9f76251..e5be470e750 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -490,9 +490,14 @@ int ocfs2_calc_security_init(struct inode *dir,
 	}
 
 	/* reserve clusters for xattr value which will be set in B tree*/
-	if (si->value_len > OCFS2_XATTR_INLINE_SIZE)
-		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
-							   si->value_len);
+	if (si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							    si->value_len);
+
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
 	return ret;
 }
 
@@ -506,9 +511,7 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 {
 	int ret = 0;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
-	int s_size = 0;
-	int a_size = 0;
-	int acl_len = 0;
+	int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
 
 	if (si->enable)
 		s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
@@ -556,16 +559,25 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 		*xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
 	}
 
-	/* reserve clusters for xattr value which will be set in B tree*/
-	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE)
-		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
-							   si->value_len);
+	/*
+	 * reserve credits and clusters for xattrs which has large value
+	 * and have to be set outside
+	 */
+	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							si->value_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
 	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL &&
 	    acl_len > OCFS2_XATTR_INLINE_SIZE) {
-		*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
-		if (S_ISDIR(mode))
-			*want_clusters += ocfs2_clusters_for_bytes(dir->i_sb,
-								   acl_len);
+		/* for directory, it has DEFAULT and ACCESS two types of acls */
+		new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+				ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
 	}
 
 	return ret;
-- 
cgit v1.2.3


From 008aafaf0b4aa0476da483e3c6e3edbe951811ff Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Tue, 9 Dec 2008 16:43:08 +0800
Subject: ocfs2: alloc xattr bucket in ocfs2_xattr_set_handle

In an extreme situation, we may need an xattr bucket for setting
the security entry and acl entries during mknod. This only
happens when the block size is too small.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e5be470e750..095b0bb6e59 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2611,9 +2611,7 @@ out:
 /*
  * This function only called duing creating inode
  * for init security/acl xattrs of the new inode.
- * The xattrs could be put into ibody or extent block,
- * xattr bucket would not be use in this case.
- * transanction credits also be reserved in here.
+ * All transanction credits have been reserved in mknod.
  */
 int ocfs2_xattr_set_handle(handle_t *handle,
 			   struct inode *inode,
@@ -2653,6 +2651,19 @@ int ocfs2_xattr_set_handle(handle_t *handle,
 	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
 		return -EOPNOTSUPP;
 
+	/*
+	 * In extreme situation, may need xattr bucket when
+	 * block size is too small. And we have already reserved
+	 * the credits for bucket in mknod.
+	 */
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+		xbs.bucket = ocfs2_xattr_bucket_new(inode);
+		if (!xbs.bucket) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+	}
+
 	xis.inode_bh = xbs.inode_bh = di_bh;
 	di = (struct ocfs2_dinode *)di_bh->b_data;
 
@@ -2672,6 +2683,7 @@ int ocfs2_xattr_set_handle(handle_t *handle,
 cleanup:
 	up_write(&OCFS2_I(inode)->ip_xattr_sem);
 	brelse(xbs.xattr_bh);
+	ocfs2_xattr_bucket_free(xbs.bucket);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 38d59ef61c11cafc50a66787bdbbe80d58bbd9c0 Mon Sep 17 00:00:00 2001
From: Tiger Yang <tiger.yang@oracle.com>
Date: Wed, 17 Dec 2008 10:22:56 +0800
Subject: ocfs2: Add xattr support checking in init_security

We must check whether the ocfs2 volume supports xattr in init_security.
If it does not support xattr and security is enabled, mknod would fail.

Signed-off-by: Tiger Yang <tiger.yang@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 095b0bb6e59..e1d638af6ac 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5324,6 +5324,9 @@ int ocfs2_init_security_get(struct inode *inode,
 			    struct inode *dir,
 			    struct ocfs2_security_xattr_info *si)
 {
+	/* check whether ocfs2 support feature xattr */
+	if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
+		return -EOPNOTSUPP;
 	return security_inode_init_security(inode, dir, &si->name, &si->value,
 					    &si->value_len);
 }
-- 
cgit v1.2.3


From a641dc2a5a1445eb4cb491080dfc41c42a9eb37d Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 24 Dec 2008 16:03:48 -0800
Subject: ocfs2: remove unneeded lvb casts

dlmglue.c has lots of code which casts the return value of ocfs2_dlm_lvb().
This is pointless however, as ocfs2_dlm_lvb() returns void *.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b1c75911d8a..f731ab49179 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -115,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
 				     unsigned int line,
 				     struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	mlog(level, "LVB information for %s (called from %s:%u):\n",
 	     lockres->l_name, function, line);
@@ -1864,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
 	mlog_entry_void();
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/*
 	 * Invalidate the LVB of a deleted inode - this way other
@@ -1916,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 
 	mlog_meta_lvb(0, lockres);
 
-	lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/* We're safe here without the lockres lock... */
 	spin_lock(&oi->ip_lock);
@@ -1951,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
 					      struct ocfs2_lock_res *lockres)
 {
-	struct ocfs2_meta_lvb *lvb =
-		(struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	if (lvb->lvb_version == OCFS2_LVB_VERSION
 	    && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -3489,7 +3487,7 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 
 	mlog_entry_void();
 
-	lvb = (struct ocfs2_qinfo_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
 	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
 	lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
-- 
cgit v1.2.3


From dad7d975e4bd893c79fd122105b37b9a1776816a Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Wed, 24 Dec 2008 16:33:08 -0800
Subject: ocfs2: use min_t in ocfs2_quota_read()

This is preferred to min().

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 444aa5a467f..6aff8f2d3e4 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -167,7 +167,7 @@ ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
 		len = i_size - off;
 	toread = len;
 	while (toread > 0) {
-		tocopy = min((size_t)(sb->s_blocksize - offset), toread);
+		tocopy = min_t(size_t, (sb->s_blocksize - offset), toread);
 		bh = NULL;
 		err = ocfs2_read_quota_block(gqinode, blk, &bh);
 		if (err) {
-- 
cgit v1.2.3


From 9047beabb8a396f0b18de1e4a9ab920cf92054af Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 5 Jan 2009 14:45:24 +0800
Subject: ocfs2: Access the right buffer_head in ocfs2_merge_rec_left.

In commit "ocfs2: Use metadata-specific ocfs2_journal_access_*()
functions", the wrong buffer_head is accessed. So change it
to the right buffer_head.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 874c0bd9e1c..54ff4c77aaa 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3402,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
 			has_empty_extent = 1;
 	}
 
-	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
-					   path_num_items(left_path) - 1);
+	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+					   path_num_items(right_path) - 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
-- 
cgit v1.2.3


From 5b6f1eb97d462a45be3b30759758b5fdbb562c8c Mon Sep 17 00:00:00 2001
From: Alain Knaff <alain@knaff.lu>
Date: Mon, 10 Nov 2008 17:08:08 -0800
Subject: vfs: lseek(fd, 0, SEEK_CUR) race condition

This patch fixes a race condition in lseek. While it is expected that
unpredictable behaviour may result while repositioning the offset of a
file descriptor concurrently with reading/writing to the same file
descriptor, this should not happen when merely *reading* the file
descriptor's offset.

Unfortunately, the only portable way in Unix to read a file
descriptor's offset is lseek(fd, 0, SEEK_CUR); however executing this
concurrently with read/write may mess up the position.

[with fixes from akpm]

Signed-off-by: Alain Knaff <alain@knaff.lu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/read_write.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020..5cc6924eb15 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 		offset += inode->i_size;
 		break;
 	case SEEK_CUR:
+		/*
+		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
+		 * position-querying operation.  Avoid rewriting the "same"
+		 * f_pos value back to the file because a concurrent read(),
+		 * write() or lseek() might have altered it
+		 */
+		if (offset == 0)
+			return file->f_pos;
 		offset += file->f_pos;
 		break;
 	}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 			offset += i_size_read(file->f_path.dentry->d_inode);
 			break;
 		case SEEK_CUR:
+			if (offset == 0) {
+				retval = file->f_pos;
+				goto out;
+			}
 			offset += file->f_pos;
 	}
 	retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
 		}
 		retval = offset;
 	}
+out:
 	unlock_kernel();
 	return retval;
 }
-- 
cgit v1.2.3


From c765d479037808532310212e9b3fa95760e975f2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Dec 2008 09:50:55 -0500
Subject: affs: do not zero ->i_op

it is already set to empty table and should never be NULL

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac1..3c4ec7d864c 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		goto bad_inode;
 #else
 		inode->i_mode |= S_IFDIR;
-		inode->i_op = NULL;
-		inode->i_fop = NULL;
+		/* ... and leave ->i_op and ->i_fop pointing to empty */
 		break;
 #endif
 	case ST_LINKFILE:
-- 
cgit v1.2.3


From 261964c60ff6524076d439da9386d4782729c4d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Dec 2008 09:57:25 -0500
Subject: isofs check for NULL ->i_op in root directory is dead code

for one thing it never happens, for another we check that inode
is a directory right after that place anyway (and we'd already
checked that reading it from disk has not failed).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/inode.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505..6147ec3643a 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
 	}
 	sbi->s_joliet_level = joliet_level;
 
-	/* check the root inode */
-	if (!inode->i_op)
-		goto out_bad_root;
-
 	/* Make sure the root inode is a directory */
 	if (!S_ISDIR(inode->i_mode)) {
 		printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
 	/*
 	 * Display error messages and free resources.
 	 */
-out_bad_root:
-	printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
 out_iput:
 	iput(inode);
 	goto out_no_inode;
-- 
cgit v1.2.3


From 9742df331deb3fce95b321f38d4ea0c4e75edb63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Dec 2008 09:59:23 -0500
Subject: ntfs: don't NULL i_op

it's already set to empty table (and no, ntfs doesn't have any explicit
checks for NULL ->i_op or NULL ->i_fop)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ntfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e277..86bef156cf0 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
 		ni->allocated_size = sle64_to_cpu(
 				a->data.non_resident.allocated_size);
 	}
-	/* Setup the operations for this attribute inode. */
-	vi->i_op = NULL;
-	vi->i_fop = NULL;
 	if (NInoMstProtected(ni))
 		vi->i_mapping->a_ops = &ntfs_mst_aops;
 	else
-- 
cgit v1.2.3


From acfa4380efe77e290d3a96b11cd4c9f24f4fbb18 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Dec 2008 10:06:33 -0500
Subject: inode->i_op is never NULL

We used to have rather schizophrenic set of checks for NULL ->i_op even
though it had been eliminated years ago.  You'd need to go out of your
way to set it to NULL explicitly _and_ a bunch of code would die on
such inodes anyway.  After killing two remaining places that still
did that bogosity, all that crap can go away.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/inode.c     |  2 +-
 fs/ecryptfs/inode.c |  3 +--
 fs/namei.c          | 32 +++++++++++++-------------------
 fs/nfsd/vfs.c       |  8 ++++----
 fs/open.c           |  2 +-
 fs/stat.c           |  2 +-
 fs/xattr.c          |  2 +-
 7 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4ed..5ab9896fdcb 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 out_truncate:
-	if (inode->i_op && inode->i_op->truncate)
+	if (inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return 0;
 out_sig:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5e78fc17988..0111906a887 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -612,8 +612,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
 	struct ecryptfs_crypt_stat *crypt_stat;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	if (!lower_dentry->d_inode->i_op ||
-	    !lower_dentry->d_inode->i_op->readlink) {
+	if (!lower_dentry->d_inode->i_op->readlink) {
 		rc = -EINVAL;
 		goto out;
 	}
diff --git a/fs/namei.c b/fs/namei.c
index dd5c9f0bf82..1f6656c3d1b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask)
 			return -EACCES;
 	}
 
-	if (inode->i_op && inode->i_op->permission)
+	if (inode->i_op->permission)
 		retval = inode->i_op->permission(inode, mask);
 	else
 		retval = generic_permission(inode, mask, NULL);
@@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
 {
 	umode_t	mode = inode->i_mode;
 
-	if (inode->i_op && inode->i_op->permission)
+	if (inode->i_op->permission)
 		return -EAGAIN;
 
 	if (current_fsuid() == inode->i_uid)
@@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		inode = next.dentry->d_inode;
 		if (!inode)
 			goto out_dput;
-		err = -ENOTDIR; 
-		if (!inode->i_op)
-			goto out_dput;
 
 		if (inode->i_op->follow_link) {
 			err = do_follow_link(&next, nd);
@@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 			inode = nd->path.dentry->d_inode;
 			if (!inode)
 				break;
-			err = -ENOTDIR; 
-			if (!inode->i_op)
-				break;
 		} else
 			path_to_nameidata(&next, nd);
 		err = -ENOTDIR; 
@@ -961,7 +955,7 @@ last_component:
 			break;
 		inode = next.dentry->d_inode;
 		if ((lookup_flags & LOOKUP_FOLLOW)
-		    && inode && inode->i_op && inode->i_op->follow_link) {
+		    && inode && inode->i_op->follow_link) {
 			err = do_follow_link(&next, nd);
 			if (err)
 				goto return_err;
@@ -973,7 +967,7 @@ last_component:
 			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-			if (!inode->i_op || !inode->i_op->lookup)
+			if (!inode->i_op->lookup)
 				break;
 		}
 		goto return_base;
@@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->create)
+	if (!dir->i_op->create)
 		return -EACCES;	/* shouldn't it be ENOSYS? */
 	mode &= S_IALLUGO;
 	mode |= S_IFREG;
@@ -1752,7 +1746,7 @@ do_last:
 	error = -ENOENT;
 	if (!path.dentry->d_inode)
 		goto exit_dput;
-	if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+	if (path.dentry->d_inode->i_op->follow_link)
 		goto do_link;
 
 	path_to_nameidata(&path, &nd);
@@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
 		return -EPERM;
 
-	if (!dir->i_op || !dir->i_op->mknod)
+	if (!dir->i_op->mknod)
 		return -EPERM;
 
 	error = devcgroup_inode_mknod(mode, dev);
@@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->mkdir)
+	if (!dir->i_op->mkdir)
 		return -EPERM;
 
 	mode &= (S_IRWXUGO|S_ISVTX);
@@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->rmdir)
+	if (!dir->i_op->rmdir)
 		return -EPERM;
 
 	DQUOT_INIT(dir);
@@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->unlink)
+	if (!dir->i_op->unlink)
 		return -EPERM;
 
 	DQUOT_INIT(dir);
@@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	if (!dir->i_op || !dir->i_op->symlink)
+	if (!dir->i_op->symlink)
 		return -EPERM;
 
 	error = security_inode_symlink(dir, dentry, oldname);
@@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 */
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return -EPERM;
-	if (!dir->i_op || !dir->i_op->link)
+	if (!dir->i_op->link)
 		return -EPERM;
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
@@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (error)
 		return error;
 
-	if (!old_dir->i_op || !old_dir->i_op->rename)
+	if (!old_dir->i_op->rename)
 		return -EPERM;
 
 	DQUOT_INIT(old_dir);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b36..5245a396500 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1211,7 +1211,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dirp = dentry->d_inode;
 
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	/*
 	 * Check whether the response file handle has been verified yet.
@@ -1347,7 +1347,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	/* Get all the sanity checks out of the way before
 	 * we lock the parent. */
 	err = nfserr_notdir;
-	if(!dirp->i_op || !dirp->i_op->lookup)
+	if (!dirp->i_op->lookup)
 		goto out;
 	fh_lock_nested(fhp, I_MUTEX_PARENT);
 
@@ -1482,7 +1482,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
 	inode = dentry->d_inode;
 
 	err = nfserr_inval;
-	if (!inode->i_op || !inode->i_op->readlink)
+	if (!inode->i_op->readlink)
 		goto out;
 
 	touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2162,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 	size_t size;
 	int error;
 
-	if (!IS_POSIXACL(inode) || !inode->i_op ||
+	if (!IS_POSIXACL(inode) ||
 	    !inode->i_op->setxattr || !inode->i_op->removexattr)
 		return -EOPNOTSUPP;
 	switch(type) {
diff --git a/fs/open.c b/fs/open.c
index 1cd7d40e999..d882fd2351d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
 	if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
 		goto out_fput;
 
-	if (inode->i_op && inode->i_op->fallocate)
+	if (inode->i_op->fallocate)
 		ret = inode->i_op->fallocate(inode, mode, offset, len);
 	else
 		ret = -EOPNOTSUPP;
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b7..7e12a6f8279 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
 		struct inode *inode = path.dentry->d_inode;
 
 		error = -EINVAL;
-		if (inode->i_op && inode->i_op->readlink) {
+		if (inode->i_op->readlink) {
 			error = security_inode_readlink(path.dentry);
 			if (!error) {
 				touch_atime(path.mnt, path.dentry);
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e6653..237804cd6b5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
 	if (error)
 		return error;
 	error = -EOPNOTSUPP;
-	if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+	if (d->d_inode->i_op->listxattr) {
 		error = d->d_inode->i_op->listxattr(d, list, size);
 	} else {
 		error = security_inode_listsecurity(d->d_inode, list, size);
-- 
cgit v1.2.3


From 56ff5efad96182f4d3cb3dc6b07396762c658f16 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 9 Dec 2008 09:34:39 -0500
Subject: zero i_uid/i_gid on inode allocation

... and don't bother in callers.  Don't bother with zeroing i_blocks,
while we are at it - it's already been zeroed.

i_mode is not worth the effort; it has no common default value.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs/inode.c     | 2 --
 fs/autofs4/inode.c    | 4 ----
 fs/binfmt_misc.c      | 3 ---
 fs/configfs/inode.c   | 3 ---
 fs/cramfs/inode.c     | 2 --
 fs/debugfs/inode.c    | 3 ---
 fs/devpts/inode.c     | 4 ----
 fs/hugetlbfs/inode.c  | 1 -
 fs/inode.c            | 2 ++
 fs/libfs.c            | 5 -----
 fs/ocfs2/dlm/dlmfs.c  | 2 --
 fs/omfs/inode.c       | 1 -
 fs/openpromfs/inode.c | 3 ---
 fs/proc/base.c        | 4 ----
 fs/proc/proc_sysctl.c | 1 -
 fs/ramfs/inode.c      | 1 -
 fs/romfs/inode.c      | 1 -
 fs/sysfs/inode.c      | 3 ---
 18 files changed, 2 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c6..e1734f2d6e2 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
 	inode->i_nlink = 2;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
 
 	if (ino == AUTOFS_ROOT_INO) {
 		inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 		inode->i_op = &autofs_root_inode_operations;
 		inode->i_fop = &autofs_root_operations;
-		inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
 		goto done;
 	} 
 	
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef..cfc23e53b6f 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
 	if (sb->s_root) {
 		inode->i_uid = sb->s_root->d_inode->i_uid;
 		inode->i_gid = sb->s_root->d_inode->i_gid;
-	} else {
-		inode->i_uid = 0;
-		inode->i_gid = 0;
 	}
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	if (S_ISDIR(inf->mode)) {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b..e1158cb4fbd 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = 0;
-		inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime =
 			current_fs_time(inode->i_sb);
 	}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc9448..5d349d38e05 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
 {
 	struct inode * inode = new_inode(configfs_sb);
 	if (inode) {
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &configfs_aops;
 		inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
 		inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a1..a07338d2d14 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 			inode->i_op = &page_symlink_inode_operations;
 			inode->i_data.a_ops = &cramfs_aops;
 		} else {
-			inode->i_size = 0;
-			inode->i_blocks = 0;
 			init_special_inode(inode, inode->i_mode,
 				old_decode_dev(cramfs_inode->size));
 		}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf3..81ae9ea3c6e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
 
 	if (inode) {
 		inode->i_mode = mode;
-		inode->i_uid = 0;
-		inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fff96e152c0..5f3231b9633 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb)
 	}
 
 	inode->i_ino = 2;
-	inode->i_uid = inode->i_gid = 0;
-	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 
 	mode = S_IFCHR|opts->ptmxmode;
@@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
 		goto free_fsi;
 	inode->i_ino = 1;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_blocks = 0;
-	inode->i_uid = inode->i_gid = 0;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3ace..0ab0c6f5f43 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -506,7 +506,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 		inode->i_mode = mode;
 		inode->i_uid = uid;
 		inode->i_gid = gid;
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 7de1cda9248..bd48e5e6d3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -131,6 +131,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_op = &empty_iops;
 	inode->i_fop = &empty_fops;
 	inode->i_nlink = 1;
+	inode->i_uid = 0;
+	inode->i_gid = 0;
 	atomic_set(&inode->i_writecount, 0);
 	inode->i_size = 0;
 	inode->i_blocks = 0;
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a832190..7de05f7ce74 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	 */
 	root->i_ino = 1;
 	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
-	root->i_uid = root->i_gid = 0;
 	root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
 	dentry = d_alloc(NULL, &d_name);
 	if (!dentry) {
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 	 */
 	inode->i_ino = 1;
 	inode->i_mode = S_IFDIR | 0755;
-	inode->i_uid = inode->i_gid = 0;
-	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
 		if (!inode)
 			goto out;
 		inode->i_mode = S_IFREG | files->mode;
-		inode->i_uid = inode->i_gid = 0;
-		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_fop = files->ops;
 		inode->i_ino = i;
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d5402..1c9efb406a9 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_blocks = 0;
 	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f8..633e9dc972b 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
 	inode->i_mode = mode;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
-	inode->i_blocks = 0;
 	inode->i_mapping->a_ops = &omfs_aops;
 
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de..ffcd04f0012 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
 		break;
 	}
 
-	inode->i_gid = 0;
-	inode->i_uid = 0;
-
 	d_add(dentry, inode);
 	return NULL;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b..10fd5223d60 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1426,8 +1426,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
 	if (!ei->pid)
 		goto out_unlock;
 
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	if (task_dumpable(task)) {
 		rcu_read_lock();
 		cred = __task_cred(task);
@@ -2349,8 +2347,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	if (!ei->pid)
 		goto out_iput;
 
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_mode = p->mode;
 	if (S_ISDIR(inode->i_mode))
 		inode->i_nlink = 2;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9..94fcfff6863 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
 	inode->i_mode = table->mode;
-	inode->i_uid = inode->i_gid = 0;
 	if (!table->child) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae3..b7e6ac706b8 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
 		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87..c97d4c93171 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -524,7 +524,6 @@ romfs_iget(struct super_block *sb, unsigned long ino)
 	i->i_size = be32_to_cpu(ri.size);
 	i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
 	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
-	i->i_uid = i->i_gid = 0;
 
         /* Precalculate the data offset */
         ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f85..dfa3d94cfc7 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
 static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
 {
 	inode->i_mode = mode;
-	inode->i_uid = 0;
-	inode->i_gid = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 }
 
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
 
-	inode->i_blocks = 0;
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
 	inode->i_op = &sysfs_inode_operations;
-- 
cgit v1.2.3


From 6110e3abbff8b785907d4db50240e63c1be726e3 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 17 Dec 2008 13:53:20 -0500
Subject: sys_execve and sys_uselib do not call into fsnotify

sys_execve and sys_uselib do not call into fsnotify so inotify does not get
open events for these types of syscalls.  This patch simply makes the
requisite fsnotify calls.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 3ef9cf9b187..9c33f542dc7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,6 +51,7 @@
 #include <linux/audit.h>
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
+#include <linux/fsnotify.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
 	if (IS_ERR(file))
 		goto out;
 
+	fsnotify_open(file->f_path.dentry);
+
 	error = -ENOEXEC;
 	if(file->f_op) {
 		struct linux_binfmt * fmt;
@@ -684,6 +687,8 @@ struct file *open_exec(const char *name)
 	if (IS_ERR(file))
 		return file;
 
+	fsnotify_open(file->f_path.dentry);
+
 	err = deny_write_access(file);
 	if (err) {
 		fput(file);
-- 
cgit v1.2.3


From 4c728ef583b3d82266584da5cb068294c09df31e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 22 Dec 2008 21:11:15 +0100
Subject: add a vfs_fsync helper

Fsync currently has a fdatawrite/fdatawait pair around the method call,
and a mutex_lock/unlock of the inode mutex.  All callers of fsync have
to duplicate this, but we have a few and most of them don't quite get
it right.  This patch adds a new vfs_fsync that takes care of this.
It's a little more complicated than usual as ->fsync might get a NULL file
pointer and just a dentry from nfsd, but otherwise gets a file and we
want to take the mapping and file operations from it when it is there.

Notes on the fsync callers:

 - ecryptfs wasn't calling filemap_fdatawrite / filemap_fdatawait on the
   	lower file
 - coda wasn't calling filemap_fdatawrite / filemap_fdatawait on the host
   file, and was returning 0 when ->fsync was missing
 - shm was neither calling filemap_fdatawrite / filemap_fdatawait nor
   taking i_mutex.  Given that shared memory doesn't have disk backing,
   not doing anything in fsync seems fine, so I left it out of the
   vfs_fsync conversion for now.  In that case, though, we might just
   not pass it through to the lower file at all and instead call the
   no-op simple_sync_file directly.

[and now actually export vfs_fsync]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/file.c     | 12 ++----------
 fs/ecryptfs/file.c | 15 +++------------
 fs/nfsd/vfs.c      | 35 +++--------------------------------
 fs/sync.c          | 48 +++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 45 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df..6a347fbc998 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
 int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 {
 	struct file *host_file;
-	struct dentry *host_dentry;
-	struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
+	struct inode *coda_inode = coda_dentry->d_inode;
 	struct coda_file_info *cfi;
 	int err = 0;
 
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (host_file->f_op && host_file->f_op->fsync) {
-		host_dentry = host_file->f_path.dentry;
-		host_inode = host_dentry->d_inode;
-		mutex_lock(&host_inode->i_mutex);
-		err = host_file->f_op->fsync(host_file, host_dentry, datasync);
-		mutex_unlock(&host_inode->i_mutex);
-	}
-
+	err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
 	if ( !err && !datasync ) {
 		lock_kernel();
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac0..71383437122 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	struct file *lower_file = ecryptfs_file_to_lower(file);
-	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	struct inode *lower_inode = lower_dentry->d_inode;
-	int rc = -EINVAL;
-
-	if (lower_inode->i_fop->fsync) {
-		mutex_lock(&lower_inode->i_mutex);
-		rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
-					       datasync);
-		mutex_unlock(&lower_inode->i_mutex);
-	}
-	return rc;
+	return vfs_fsync(ecryptfs_file_to_lower(file),
+			 ecryptfs_dentry_to_lower(dentry),
+			 datasync);
 }
 
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5245a396500..44aa92aba89 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -744,45 +744,16 @@ nfsd_close(struct file *filp)
 	fput(filp);
 }
 
-/*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
-			      const struct file_operations *fop)
-{
-	struct inode *inode = dp->d_inode;
-	int (*fsync) (struct file *, struct dentry *, int);
-	int err;
-
-	err = filemap_fdatawrite(inode->i_mapping);
-	if (err == 0 && fop && (fsync = fop->fsync))
-		err = fsync(filp, dp, 0);
-	if (err == 0)
-		err = filemap_fdatawait(inode->i_mapping);
-
-	return err;
-}
-	
-
 static int
 nfsd_sync(struct file *filp)
 {
-        int err;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
-	mutex_lock(&inode->i_mutex);
-	err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
-	mutex_unlock(&inode->i_mutex);
-
-	return err;
+	return vfs_fsync(filp, filp->f_path.dentry, 0);
 }
 
 int
-nfsd_sync_dir(struct dentry *dp)
+nfsd_sync_dir(struct dentry *dentry)
 {
-	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+	return vfs_fsync(NULL, dentry, 0);
 }
 
 /*
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416..0921d6d4b5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 	return ret;
 }
 
-long do_fsync(struct file *file, int datasync)
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file:		file to sync
+ * @dentry:		dentry of @file
+ * @datasync:	only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk.  If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set.  This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
-	int ret;
-	int err;
-	struct address_space *mapping = file->f_mapping;
+	const struct file_operations *fop;
+	struct address_space *mapping;
+	int err, ret;
+
+	/*
+	 * Get mapping and operations from the file in case we have
+	 * a file, or get the default values for them in case we
+	 * don't have a struct file available.  Damn nfsd..
+	 */
+	if (file) {
+		mapping = file->f_mapping;
+		fop = file->f_op;
+	} else {
+		mapping = dentry->d_inode->i_mapping;
+		fop = dentry->d_inode->i_fop;
+	}
 
-	if (!file->f_op || !file->f_op->fsync) {
-		/* Why?  We can still call filemap_fdatawrite */
+	if (!fop || !fop->fsync) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
 	 * livelocks in fsync_buffers_list().
 	 */
 	mutex_lock(&mapping->host->i_mutex);
-	err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+	err = fop->fsync(file, dentry, datasync);
 	if (!ret)
 		ret = err;
 	mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
 out:
 	return ret;
 }
+EXPORT_SYMBOL(vfs_fsync);
 
-static long __do_fsync(unsigned int fd, int datasync)
+static int do_fsync(unsigned int fd, int datasync)
 {
 	struct file *file;
 	int ret = -EBADF;
 
 	file = fget(fd);
 	if (file) {
-		ret = do_fsync(file, datasync);
+		ret = vfs_fsync(file, file->f_path.dentry, datasync);
 		fput(file);
 	}
 	return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
 
 asmlinkage long sys_fsync(unsigned int fd)
 {
-	return __do_fsync(fd, 0);
+	return do_fsync(fd, 0);
 }
 
 asmlinkage long sys_fdatasync(unsigned int fd)
 {
-	return __do_fsync(fd, 1);
+	return do_fsync(fd, 1);
 }
 
 /*
-- 
cgit v1.2.3


From d8e9650dff48055057253ca30933605bd7d0733b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 25 Dec 2008 13:32:15 +0800
Subject: vfs: remove duplicate code in get_fs_type()

save 14 bytes:

   text    data     bss     dec     hex filename
   1354      32       4    1390     56e fs/filesystems.o.before
   text    data     bss     dec     hex filename
   1340      32       4    1376     560 fs/filesystems.o

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/filesystems.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62d..d488dcd7f2b 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
 module_init(proc_filesystems_init);
 #endif
 
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
 {
 	struct file_system_type *fs;
-	const char *dot = strchr(name, '.');
-	unsigned len = dot ? dot - name : strlen(name);
 
 	read_lock(&file_systems_lock);
 	fs = *(find_filesystem(name, len));
 	if (fs && !try_module_get(fs->owner))
 		fs = NULL;
 	read_unlock(&file_systems_lock);
-	if (!fs && (request_module("%.*s", len, name) == 0)) {
-		read_lock(&file_systems_lock);
-		fs = *(find_filesystem(name, len));
-		if (fs && !try_module_get(fs->owner))
-			fs = NULL;
-		read_unlock(&file_systems_lock);
-	}
+	return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+	struct file_system_type *fs;
+	const char *dot = strchr(name, '.');
+	int len = dot ? dot - name : strlen(name);
+
+	fs = __get_fs_type(name, len);
+	if (!fs && (request_module("%.*s", len, name) == 0))
+		fs = __get_fs_type(name, len);
 
 	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
 		put_filesystem(fs);
-- 
cgit v1.2.3


From 5b45d96bf963afeb931a75faf02fb424e446e5a9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 29 Dec 2008 07:40:31 -0500
Subject: fix the treatment of jfs special inodes

We used to put them on a single list, without any locking.  Racy.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/jfs_imap.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d..0f94381ca6d 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
 
 /*
  * __mark_inode_dirty expects inodes to be hashed.  Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put on any lists.
  */
-static HLIST_HEAD(aggregate_hash);
 
 /*
  * imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
 	/* release the page */
 	release_metapage(mp);
 
-	hlist_add_head(&ip->i_hash, &aggregate_hash);
+	/*
+	 * that will look hashed, but won't be on any list; hlist_del()
+	 * will work fine and require no locking.
+	 */
+	ip->i_hash.pprev = &ip->i_hash.next;
 
 	return (ip);
 }
-- 
cgit v1.2.3


From 2f1169e2dc0c70e213f79ada88a10912cc2fbe94 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 2 Jan 2009 08:16:51 -0500
Subject: fix breakage in reiserfs_new_inode()

now that we use ih.key earlier, we need to do all its setup early enough

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/inode.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 145c2d3e5e0..1306d4f0f44 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 		goto out_bad_inode;
 	}
 	args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
 	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
 	args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
 	if (insert_inode_locked4(inode, args.objectid,
@@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	reiserfs_init_acl_default(inode);
 	reiserfs_init_xattr_rwsem(inode);
 
-	if (old_format_only(sb))
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-	else
-		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
-				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
 	/* key to search for correct place for new stat data */
 	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
 		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
-- 
cgit v1.2.3


From 4ae8978cf92a96257cd8998a49e781be83571d64 Mon Sep 17 00:00:00 2001
From: Michael Kerrisk <mtk.manpages@googlemail.com>
Date: Mon, 5 Jan 2009 07:19:16 -0500
Subject: inotify: fix type errors in interfaces

The problems lie in the types used for some inotify interfaces, both at the kernel level and at the glibc level. This mail addresses the kernel problem. I will follow up with some suggestions for glibc changes.

For the sys_inotify_rm_watch() interface, the type of the 'wd' argument is
currently 'u32'; it should be '__s32'.  That is Robert's suggestion, and
is consistent with the other declarations of watch descriptors in the
kernel source, in particular, the inotify_event structure in
include/linux/inotify.h:

struct inotify_event {
        __s32           wd;             /* watch descriptor */
        __u32           mask;           /* watch mask */
        __u32           cookie;         /* cookie to synchronize two events */
        __u32           len;            /* length (including nulls) of name */
        char            name[0];        /* stub for possible name */
};

The patch makes the changes needed for inotify_rm_watch().

Signed-off-by: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Robert Love <rlove@google.com>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/notify/inotify/inotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 400f8064a54..81b8644b013 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -704,7 +704,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
-- 
cgit v1.2.3


From e441d54de4fd97dd381f3e73636f5ba51ff4c7d9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 16:57:23 -0500
Subject: Btrfs: add permission checks to the ioctls

Only root can add/remove devices
Only root can defrag subtrees
Only files open for writing can be defragged
Only files open for writing can be the destination for a clone

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 25 +++++++++++++++++++++++--
 fs/btrfs/super.c |  3 +++
 2 files changed, 26 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ab429fe0fa0..150784e936e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -453,6 +453,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
 	if (!vol_args)
@@ -638,16 +641,24 @@ static int btrfs_ioctl_defrag(struct file *file)
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
+		if (!capable(CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out;
+		}
 		btrfs_defrag_root(root, 0);
 		btrfs_defrag_root(root->fs_info->extent_root, 0);
 		break;
 	case S_IFREG:
+		if (!(file->f_mode & FMODE_WRITE)) {
+			ret = -EINVAL;
+			goto out;
+		}
 		btrfs_defrag_file(file);
 		break;
 	}
-
+out:
 	mnt_drop_write(file->f_path.mnt);
-	return 0;
+	return ret;
 }
 
 static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
@@ -655,6 +666,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
 
 	if (!vol_args)
@@ -677,6 +691,9 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
@@ -726,6 +743,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	 *   they don't overlap)?
 	 */
 
+	/* the destination must be opened for writing */
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EINVAL;
+
 	ret = mnt_want_write(file->f_path.mnt);
 	if (ret)
 		return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 84c3b66564d..3814238d6eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -589,6 +589,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	int ret = 0;
 	int len;
 
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
 	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
 		ret = -EFAULT;
-- 
cgit v1.2.3


From 52c2617990fed072220708d6b771dc10f37547b0 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 5 Jan 2009 15:43:43 -0500
Subject: Btrfs: update directory's size when creating subvol/snapshot

Make sure the directory's size is properly updated when creating a
subvol/snapshot.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ioctl.c       | 4 ++++
 fs/btrfs/transaction.c | 4 ++++
 2 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 150784e936e..ba484aac1b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -147,6 +147,10 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (ret)
 		goto fail;
 
+	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
+	ret = btrfs_update_inode(trans, root, dir);
+	BUG_ON(ret);
+
 	/* add the backref first */
 	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
 				 objectid, BTRFS_ROOT_BACKREF_KEY,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4604178a43a..4e7b56e9d3a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -819,6 +819,10 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 	if (ret)
 		goto fail;
 
+	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+	ret = btrfs_update_inode(trans, parent_root, parent_inode);
+	BUG_ON(ret);
+
 	/* add the backref first */
 	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
 				 pending->root_key.objectid,
-- 
cgit v1.2.3


From 9aead43588f4bdb1bb61e348ad0f33794bbddc0f Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Mon, 5 Jan 2009 15:49:11 -0500
Subject: Btrfs: Fix memset length in btrfs_file_write

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 507081059d9..5908521922f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1094,7 +1094,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 					PAGE_CACHE_SHIFT;
 
 		WARN_ON(num_pages > nrptrs);
-		memset(pages, 0, sizeof(pages));
+		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
 		ret = btrfs_check_free_space(root, write_bytes, 0);
 		if (ret)
-- 
cgit v1.2.3


From c584482b47f47b051cdc1d5236b99ad18f1b1cfb Mon Sep 17 00:00:00 2001
From: Liu Hui <onlyflyer@gmail.com>
Date: Mon, 5 Jan 2009 15:49:55 -0500
Subject: Btrfs: Fix typo in clear_state_cb

In clear_state_cb, we should check 'tree->ops->clear_bit_hook' instead
of 'tree->ops->set_bit_hook'.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 25ce2d18e5b..0bf7684207a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -340,7 +340,7 @@ static void clear_state_cb(struct extent_io_tree *tree,
 			   struct extent_state *state,
 			   unsigned long bits)
 {
-	if (tree->ops && tree->ops->set_bit_hook) {
+	if (tree->ops && tree->ops->clear_bit_hook) {
 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
 					  state->end, state->state, bits);
 	}
-- 
cgit v1.2.3


From 1f48366084a7b046bcb7741ed4e607774f96e3da Mon Sep 17 00:00:00 2001
From: Shen Feng <shen@cn.fujitsu.com>
Date: Mon, 5 Jan 2009 15:43:42 -0500
Subject: Btrfs: fix a memory leak in btrfs_get_sb

subvol_name should be freed if an error occurs.

Signed-off-by: Shen Feng <shen@cn.fujitsu.com>
---
 fs/btrfs/super.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3814238d6eb..ccdcb7bb7ad 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -429,7 +429,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	error = btrfs_parse_early_options(data, mode, fs_type,
 					  &subvol_name, &fs_devices);
 	if (error)
-		goto error;
+		return error;
 
 	error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
 	if (error)
@@ -468,7 +468,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
-			goto error;
+			goto error_free_subvol_name;
 		}
 
 		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
@@ -485,14 +485,14 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			up_write(&s->s_umount);
 			deactivate_super(s);
 			error = PTR_ERR(root);
-			goto error;
+			goto error_free_subvol_name;
 		}
 		if (!root->d_inode) {
 			dput(root);
 			up_write(&s->s_umount);
 			deactivate_super(s);
 			error = -ENXIO;
-			goto error;
+			goto error_free_subvol_name;
 		}
 	}
 
@@ -508,7 +508,6 @@ error_close_devices:
 	btrfs_close_devices(fs_devices);
 error_free_subvol_name:
 	kfree(subvol_name);
-error:
 	return error;
 }
 
-- 
cgit v1.2.3


From dd3fd8bdf7238f99561ee236060b04d5b9a84953 Mon Sep 17 00:00:00 2001
From: Shen Feng <shen@cn.fujitsu.com>
Date: Mon, 5 Jan 2009 15:43:42 -0500
Subject: Btrfs: do not call kfree if kmalloc failed in btrfs_sysfs_add_super

Signed-off-by: Shen Feng <shen@cn.fujitsu.com>
---
 fs/btrfs/sysfs.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 300076e6676..04087c02084 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -212,14 +212,13 @@ int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
 	fs->super_kobj.kset = btrfs_kset;
 	error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
 				     NULL, "%s", name);
+	kfree(name);
 	if (error)
 		goto fail;
 
-	kfree(name);
 	return 0;
 
 fail:
-	kfree(name);
 	printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
 	return error;
 }
-- 
cgit v1.2.3


From 2d69a0f88459fae35df3ddef4934a2dad67e2765 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 5 Jan 2009 15:43:42 -0500
Subject: Btrfs: avoid potential super block corruption

The data in fs_info->super_for_commit is all zeros before the
first transaction commit. If a tree log sync and a system crash
both occur before the first transaction commit, the super block
will get corrupted.

This fixes it by properly filling in the super_for_commit field at
open time.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 40a540f3116..dae25e78a6b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1609,6 +1609,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_iput;
 
 	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
+	       sizeof(fs_info->super_for_commit));
 	brelse(bh);
 
 	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
@@ -1790,7 +1792,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_read_block_groups(extent_root);
 
-	fs_info->generation = generation + 1;
+	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
-- 
cgit v1.2.3


From ec051c0f929afe5c42c24bb07abf577c616c208c Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 5 Jan 2009 15:43:42 -0500
Subject: Btrfs: avoid orphan inode caused by log replay

drop_one_dir_item does not properly update the inode's link count. It can
be reproduced by executing the following commands:

#touch test
#sync
#rm -f test
#dd if=/dev/zero bs=4k count=1 of=test conv=fsync
#echo b > /proc/sysrq-trigger

This fixes it by adding a BTRFS_ORPHAN_ITEM_KEY for the inode.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/tree-log.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 33eee256ee8..b1c2921f5be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -50,6 +50,9 @@
 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
 			     int inode_only);
+static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
+			     struct btrfs_path *path, u64 objectid);
 
 /*
  * tree logging is a special write ahead log used to make sure that
@@ -638,8 +641,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	inode = read_one_inode(root, location.objectid);
 	BUG_ON(!inode);
 
-	btrfs_inc_nlink(inode);
+	ret = link_to_fixup_dir(trans, root, path, location.objectid);
+	BUG_ON(ret);
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+	BUG_ON(ret);
 	kfree(name);
 
 	iput(inode);
-- 
cgit v1.2.3


From 1f3c79a28c8837e8572b98f6d14142d9a6133c56 Mon Sep 17 00:00:00 2001
From: Liu Hui <onlyflyer@gmail.com>
Date: Mon, 5 Jan 2009 15:57:51 -0500
Subject: Btrfs: Fix free block discard calls down to the block layer

This patch fixes the discard semantics so that Btrfs works correctly with
FTLs and SSDs. We can improve an FTL's performance by telling it which sectors
have been freed by the file system. But if we do not tell the FTL about free
sectors at the proper time, the transaction mechanism of Btrfs is broken and
Btrfs cannot roll back to the previous transaction after a power loss.

There are some problems in the old implementation:
1, In __free_extent(), the pinned down extents should not be discarded.
2, In free_extents(), the freed extents are all pinned, so they need to
be discarded at transaction commit time instead of in free_extents().
3, The reserved extents used by the log tree should be discarded too.

This patch changes the discard behavior as follows:
1, For the extents which need to be freed at once,
   we discard them in update_block_group().
2, Delay discarding the pinned extents until btrfs_finish_extent_commit(),
   when committing the transaction.
3, Remove discarding from free_extents() and __free_extent().
4, Add a discard call to btrfs_free_reserved_extent().
5, Discard sectors before updating the free space cache; otherwise, the
   FTL will destroy file system data.
---
 fs/btrfs/extent-tree.c | 99 ++++++++++++++++++++++++--------------------------
 1 file changed, 48 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe0e59ab33c..780c1eeb829 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -876,6 +876,38 @@ static void btrfs_issue_discard(struct block_device *bdev,
 }
 #endif
 
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+				u64 num_bytes)
+{
+#ifdef BIO_RW_DISCARD
+	int ret;
+	u64 map_length = num_bytes;
+	struct btrfs_multi_bio *multi = NULL;
+
+	/* Tell the block device(s) that the sectors can be discarded */
+	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+			      bytenr, &map_length, &multi, 0);
+	if (!ret) {
+		struct btrfs_bio_stripe *stripe = multi->stripes;
+		int i;
+
+		if (map_length > num_bytes)
+			map_length = num_bytes;
+
+		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+			btrfs_issue_discard(stripe->dev->bdev,
+					    stripe->physical,
+					    map_length);
+		}
+		kfree(multi);
+	}
+
+	return ret;
+#else
+	return 0;
+#endif
+}
+
 static int noinline free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
@@ -1069,10 +1101,6 @@ search:
 		for (pos = cur, n = pos->next; pos != end;
 		     pos = n, n = pos->next) {
 			struct pending_extent_op *tmp;
-#ifdef BIO_RW_DISCARD
-			u64 map_length;
-			struct btrfs_multi_bio *multi = NULL;
-#endif
 			tmp = list_entry(pos, struct pending_extent_op, list);
 
 			/*
@@ -1084,28 +1112,6 @@ search:
 						 tmp->del);
 			BUG_ON(ret);
 
-#ifdef BIO_RW_DISCARD
-			map_length = tmp->num_bytes;
-			ret = btrfs_map_block(&info->mapping_tree, READ,
-					      tmp->bytenr, &map_length, &multi,
-					      0);
-			if (!ret) {
-				struct btrfs_bio_stripe *stripe;
-				int i;
-
-				stripe = multi->stripes;
-
-				if (map_length > tmp->num_bytes)
-					map_length = tmp->num_bytes;
-
-				for (i = 0; i < multi->num_stripes;
-				     i++, stripe++)
-					btrfs_issue_discard(stripe->dev->bdev,
-							    stripe->physical,
-							    map_length);
-				kfree(multi);
-			}
-#endif
 			list_del_init(&tmp->list);
 			unlock_extent(&info->extent_ins, tmp->bytenr,
 				      tmp->bytenr + tmp->num_bytes - 1,
@@ -1965,6 +1971,11 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&cache->space_info->lock);
 			if (mark_free) {
 				int ret;
+
+				ret = btrfs_discard_extent(root, bytenr,
+							   num_bytes);
+				WARN_ON(ret);
+
 				ret = btrfs_add_free_space(cache, bytenr,
 							   num_bytes);
 				WARN_ON(ret);
@@ -2104,8 +2115,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
+
+		ret = btrfs_discard_extent(root, start, end + 1 - start);
+
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
+
 		if (need_resched()) {
 			mutex_unlock(&root->fs_info->pinned_mutex);
 			cond_resched();
@@ -2113,7 +2128,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		}
 	}
 	mutex_unlock(&root->fs_info->pinned_mutex);
-	return 0;
+	return ret;
 }
 
 static int finish_current_insert(struct btrfs_trans_handle *trans,
@@ -2458,10 +2473,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (refs == 0) {
 		u64 super_used;
 		u64 root_used;
-#ifdef BIO_RW_DISCARD
-		u64 map_length = num_bytes;
-		struct btrfs_multi_bio *multi = NULL;
-#endif
 
 		if (pin) {
 			mutex_lock(&root->fs_info->pinned_mutex);
@@ -2496,25 +2507,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
-#ifdef BIO_RW_DISCARD
-		/* Tell the block device(s) that the sectors can be discarded */
-		ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-				      bytenr, &map_length, &multi, 0);
-		if (!ret) {
-			struct btrfs_bio_stripe *stripe = multi->stripes;
-			int i;
-
-			if (map_length > num_bytes)
-				map_length = num_bytes;
-
-			for (i = 0; i < multi->num_stripes; i++, stripe++) {
-				btrfs_issue_discard(stripe->dev->bdev,
-						    stripe->physical,
-						     map_length);
-			}
-			kfree(multi);
-		}
-#endif
 	}
 	btrfs_free_path(path);
 	finish_current_insert(trans, extent_root, 0);
@@ -3112,16 +3104,21 @@ again:
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	struct btrfs_block_group_cache *cache;
+	int ret = 0;
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
 		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
 		return -ENOSPC;
 	}
+
+	ret = btrfs_discard_extent(root, start, len);
+
 	btrfs_add_free_space(cache, start, len);
 	put_block_group(cache);
 	update_reserved_extents(root, start, len, 0);
-	return 0;
+
+	return ret;
 }
 
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 21:25:51 -0500
Subject: Btrfs: Fix checkpatch.pl warnings

There were many, most are fixed now.  struct-funcs.c generates some warnings
but these are bogus.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c              |   5 +-
 fs/btrfs/async-thread.c     |   6 +-
 fs/btrfs/compat.h           |   4 +-
 fs/btrfs/compression.c      |  12 +--
 fs/btrfs/ctree.c            | 217 ++++++++++++++++--------------------------
 fs/btrfs/ctree.h            |  30 +++---
 fs/btrfs/dir-item.c         |   2 +-
 fs/btrfs/disk-io.c          | 187 +++++++++++++------------------------
 fs/btrfs/export.c           |   8 +-
 fs/btrfs/extent-tree.c      | 223 +++++++++++++++++++++++---------------------
 fs/btrfs/extent_io.c        | 213 ++++++++++++++++++++----------------------
 fs/btrfs/extent_map.c       |  14 +--
 fs/btrfs/file-item.c        |  18 ++--
 fs/btrfs/file.c             |  49 +++++-----
 fs/btrfs/free-space-cache.c |  37 ++++----
 fs/btrfs/inode-map.c        |   1 -
 fs/btrfs/inode.c            | 173 +++++++++++++++++-----------------
 fs/btrfs/ioctl.c            |  37 ++++----
 fs/btrfs/locking.c          |   5 +-
 fs/btrfs/ordered-data.c     |  34 +++----
 fs/btrfs/print-tree.c       |  73 +++++++++------
 fs/btrfs/ref-cache.c        |  12 +--
 fs/btrfs/root-tree.c        |  17 ++--
 fs/btrfs/struct-funcs.c     |   4 +-
 fs/btrfs/super.c            |  25 +++--
 fs/btrfs/sysfs.c            |   6 +-
 fs/btrfs/transaction.c      |  45 ++++-----
 fs/btrfs/transaction.h      |   6 +-
 fs/btrfs/tree-defrag.c      |   9 +-
 fs/btrfs/tree-log.c         |  70 +++++++-------
 fs/btrfs/volumes.c          |  78 ++++++++--------
 fs/btrfs/xattr.c            |   3 +-
 fs/btrfs/zlib.c             |  45 ++++-----
 33 files changed, 770 insertions(+), 898 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 867eaf1f8ef..1d53b62dbba 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -161,8 +161,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	ret = __btrfs_setxattr(inode, name, value, size, 0);
 
 out:
-	if (value)
-		kfree(value);
+	kfree(value);
 
 	if (!ret)
 		btrfs_update_cached_acl(inode, p_acl, acl);
@@ -213,7 +212,7 @@ static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 }
 
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
-				       const void *value, size_t size, int flags)
+			       const void *value, size_t size, int flags)
 {
 	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4229450b759..8e2fec05dbe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -104,7 +104,7 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while(!list_empty(&workers->order_list)) {
+	while (!list_empty(&workers->order_list)) {
 		work = list_entry(workers->order_list.next,
 				  struct btrfs_work, order_list);
 
@@ -143,7 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
-		while(!list_empty(&worker->pending)) {
+		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
@@ -188,7 +188,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 
 	list_splice_init(&workers->idle_list, &workers->worker_list);
-	while(!list_empty(&workers->worker_list)) {
+	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index 75e4426d6fb..594d60bdd3c 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -4,7 +4,7 @@
 #define btrfs_drop_nlink(inode) drop_nlink(inode)
 #define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 27)
 static inline struct dentry *d_obtain_alias(struct inode *inode)
 {
 	struct dentry *d;
@@ -21,7 +21,7 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
 # define  __pagevec_lru_add_file __pagevec_lru_add
 # define open_bdev_exclusive open_bdev_excl
 # define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2436163d543..ee848d8585d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -137,7 +137,8 @@ static int check_compressed_csum(struct inode *inode,
 		kunmap_atomic(kaddr, KM_USER0);
 
 		if (csum != *cb_sum) {
-			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			printk(KERN_INFO "btrfs csum failed ino %lu "
+			       "extent %llu csum %u "
 			       "wanted %u mirror %d\n", inode->i_ino,
 			       (unsigned long long)disk_start,
 			       csum, *cb_sum, cb->mirror_num);
@@ -217,7 +218,7 @@ csum_failed:
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
-		while(bio_index < cb->orig_bio->bi_vcnt) {
+		while (bio_index < cb->orig_bio->bi_vcnt) {
 			SetPageChecked(bvec->bv_page);
 			bvec++;
 			bio_index++;
@@ -246,7 +247,7 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
 	int i;
 	int ret;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -463,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
-	while(last_offset < compressed_end) {
+	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
 		if (page_index > end_index)
@@ -697,9 +698,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_flag(inode, NODATASUM)) {
+	if (!btrfs_test_flag(inode, NODATASUM))
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
-	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7fad2e3ad6f..9e46c077681 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -67,7 +67,7 @@ void btrfs_free_path(struct btrfs_path *p)
  *
  * It is safe to call this on paths that no locks or extent buffers held.
  */
-void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 
@@ -112,7 +112,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
-	while(1) {
+	while (1) {
 		eb = btrfs_root_node(root);
 		btrfs_tree_lock(eb);
 
@@ -202,22 +202,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 }
 
 /*
- * does the dirty work in cow of a single block.  The parent block
- * (if supplied) is updated to point to the new cow copy.  The new
- * buffer is marked dirty and returned locked.  If you modify the block
- * it needs to be marked dirty again.
+ * does the dirty work in cow of a single block.  The parent block (if
+ * supplied) is updated to point to the new cow copy.  The new buffer is marked
+ * dirty and returned locked.  If you modify the block it needs to be marked
+ * dirty again.
  *
  * search_start -- an allocation hint for the new block
  *
- * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
- * the allocator should try to find free next to the block it returns.  This is
- * just a hint and may be ignored by the allocator.
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
  *
  * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
- * is used to finish the allocation.
+ * this uses that block instead of allocating a new one.
+ * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
-static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
@@ -366,7 +366,7 @@ static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
  * This version of it has extra checks so that a block isn't cow'd more than
  * once per transaction, as long as it hasn't been written yet
  */
-int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
@@ -375,13 +375,16 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)
 		       root->fs_info->running_transaction->transid);
 		WARN_ON(1);
 	}
 	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 
@@ -489,16 +492,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->running_transaction->transid);
+	if (trans->transaction != root->fs_info->running_transaction)
 		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+	if (trans->transid != root->fs_info->generation)
 		WARN_ON(1);
-	}
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -681,51 +678,18 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_bytenr(leaf));
 	}
-#if 0
-	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
-		btrfs_item_key(leaf, &leaf_key, i);
-		if (comp_keys(&leaf_key, &cpukey) >= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", i);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, i) !=
-			btrfs_item_end_nr(leaf, i + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", i);
-			BUG_ON(1);
-		}
-		if (i == 0) {
-			if (btrfs_item_offset_nr(leaf, i) +
-			       btrfs_item_size_nr(leaf, i) !=
-			       BTRFS_LEAF_DATA_SIZE(root)) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d first offset bad\n", i);
-				BUG_ON(1);
-			}
-		}
-	}
-	if (nritems > 0) {
-		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d bad size \n", nritems - 1);
-				BUG_ON(1);
-		}
-	}
-#endif
 	if (slot != 0 && slot < nritems - 1) {
 		btrfs_item_key(leaf, &leaf_key, slot);
 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", slot);
+			printk(KERN_CRIT "slot %d offset bad key\n", slot);
 			BUG_ON(1);
 		}
 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
 		       btrfs_item_end_nr(leaf, slot)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -736,7 +700,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		if (btrfs_item_offset_nr(leaf, slot) !=
 			btrfs_item_end_nr(leaf, slot + 1)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -745,30 +709,10 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
-static int noinline check_block(struct btrfs_root *root,
+static noinline int check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	u64 found_start;
 	return 0;
-	if (btrfs_header_level(path->nodes[level]) != level)
-	    printk("warning: bad level %Lu wanted %d found %d\n",
-		   path->nodes[level]->start, level,
-		   btrfs_header_level(path->nodes[level]));
-	found_start = btrfs_header_bytenr(path->nodes[level]);
-	if (found_start != path->nodes[level]->start) {
-	    printk("warning: bad bytentr %Lu found %Lu\n",
-		   path->nodes[level]->start, found_start);
-	}
-#if 0
-	struct extent_buffer *buf = path->nodes[level];
-
-	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
-				 (unsigned long)btrfs_header_fsid(buf),
-				 BTRFS_FSID_SIZE)) {
-		printk("warning bad block %Lu\n", buf->start);
-		return 1;
-	}
-#endif
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -802,7 +746,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	unsigned long map_len = 0;
 	int err;
 
-	while(low < high) {
+	while (low < high) {
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
 
@@ -1130,7 +1074,7 @@ enospc:
  * when they are completely full.  This is also done top down, so we
  * have to be pessimistic.
  */
-static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, int level)
 {
@@ -1296,7 +1240,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
-	while(1) {
+	while (1) {
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1322,7 +1266,8 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		nscan++;
 		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
 			break;
-		if(nread > (256 * 1024) || nscan > 128)
+
+		if (nread > (256 * 1024) || nscan > 128)
 			break;
 
 		if (search < lowest_read)
@@ -1333,17 +1278,17 @@ static noinline void reada_for_search(struct btrfs_root *root,
 }
 
 /*
- * when we walk down the tree, it is usually safe to unlock the higher layers in
- * the tree.  The exceptions are when our path goes through slot 0, because operations
- * on the tree might require changing key pointers higher up in the tree.
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree.  The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
  *
- * callers might also have set path->keep_locks, which tells this code to
- * keep the lock if the path points to the last slot in the block.  This is
- * part of walking through the tree, and selecting the next slot in the higher
- * block.
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block.  This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
  *
- * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
- * so if lowest_unlock is 1, level 0 won't be unlocked
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
+ * if lowest_unlock is 1, level 0 won't be unlocked
  */
 static noinline void unlock_up(struct btrfs_path *path, int level,
 			       int lowest_unlock)
@@ -1832,9 +1777,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	if (!empty && src_nritems <= 8)
 		return 1;
 
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
 	if (empty) {
 		push_items = min(src_nritems, push_items);
@@ -1854,7 +1798,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
 			   btrfs_node_key_ptr_offset(0),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	if (push_items < src_nritems) {
 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
@@ -1899,19 +1843,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
-	if (src_nritems < 4) {
+	if (src_nritems < 4)
 		return 1;
-	}
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
-	if (max_push >= src_nritems) {
+	if (max_push >= src_nritems)
 		return 1;
-	}
 
 	if (max_push < push_items)
 		push_items = max_push;
@@ -1924,7 +1865,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	btrfs_set_header_nritems(src, src_nritems - push_items);
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
@@ -1945,7 +1886,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  *
  * returns zero on success or < 0 on failure.
  */
-static int noinline insert_new_root(struct btrfs_trans_handle *trans,
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
@@ -2176,14 +2117,15 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int noinline btrfs_leaf_free_space(struct btrfs_root *root,
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 				   struct extent_buffer *leaf)
 {
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
 	if (ret < 0) {
-		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
+		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+		       "used %d nritems %d\n",
 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
 		       leaf_space_used(leaf, 0, nritems), nritems);
 	}
@@ -2219,9 +2161,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 
 	slot = path->slots[1];
-	if (!path->nodes[1]) {
+	if (!path->nodes[1])
 		return 1;
-	}
+
 	upper = path->nodes[1];
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
@@ -2418,9 +2360,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 
 	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0) {
+	if (right_nritems == 0)
 		return 1;
-	}
 
 	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
 
@@ -2502,7 +2443,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			   push_items * sizeof(struct btrfs_item));
 
 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
-		     btrfs_item_offset_nr(right, push_items -1);
+		     btrfs_item_offset_nr(right, push_items - 1);
 
 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
 		     leaf_data_end(root, left) - push_space,
@@ -2537,7 +2478,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* fixup right node */
 	if (push_items > right_nritems) {
-		printk("push items %d nr %u\n", push_items, right_nritems);
+		printk(KERN_CRIT "push items %d nr %u\n", push_items,
+		       right_nritems);
 		WARN_ON(1);
 	}
 
@@ -2640,9 +2582,8 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
-		if (wret < 0) {
+		if (wret < 0)
 			return wret;
-		}
 		if (wret) {
 			wret = push_leaf_left(trans, root, path, data_size, 0);
 			if (wret < 0)
@@ -2665,7 +2606,7 @@ again:
 	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
-	mid = (nritems + 1)/ 2;
+	mid = (nritems + 1) / 2;
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					path->nodes[1]->start,
@@ -2734,7 +2675,7 @@ again:
 				path->slots[0] = 0;
 				if (path->slots[1] == 0) {
 					wret = fixup_low_keys(trans, root,
-					           path, &disk_key, 1);
+						      path, &disk_key, 1);
 					if (wret)
 						ret = wret;
 				}
@@ -3033,8 +2974,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			    BTRFS_FILE_EXTENT_INLINE) {
 				ptr = btrfs_item_ptr_offset(leaf, slot);
 				memmove_extent_buffer(leaf, ptr,
-				        (unsigned long)fi,
-				        offsetof(struct btrfs_file_extent_item,
+				      (unsigned long)fi,
+				      offsetof(struct btrfs_file_extent_item,
 						 disk_bytenr));
 			}
 		}
@@ -3096,7 +3037,8 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
 		btrfs_print_leaf(root, leaf);
-		printk("slot %d too large, nritems %d\n", slot, nritems);
+		printk(KERN_CRIT "slot %d too large, nritems %d\n",
+		       slot, nritems);
 		BUG_ON(1);
 	}
 
@@ -3218,7 +3160,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3317,9 +3259,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++)
 		total_data += data_size[i];
-	}
 
 	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
@@ -3336,7 +3277,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
 		btrfs_print_leaf(root, leaf);
-		printk("not enough freespace need %u have %d\n",
+		printk(KERN_CRIT "not enough freespace need %u have %d\n",
 		       total_size, btrfs_leaf_free_space(root, leaf));
 		BUG();
 	}
@@ -3349,7 +3290,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3457,7 +3398,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int wret;
 
 	nritems = btrfs_header_nritems(parent);
-	if (slot != nritems -1) {
+	if (slot != nritems - 1) {
 		memmove_extent_buffer(parent,
 			      btrfs_node_key_ptr_offset(slot),
 			      btrfs_node_key_ptr_offset(slot + 1),
@@ -3614,7 +3555,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path, leaf->start);
+				ret = btrfs_del_leaf(trans, root, path,
+						     leaf->start);
 				BUG_ON(ret);
 				free_extent_buffer(leaf);
 			} else {
@@ -3717,7 +3659,7 @@ again:
 		ret = 1;
 		goto out;
 	}
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
 		sret = bin_search(cur, min_key, level, &slot);
@@ -3738,7 +3680,7 @@ again:
 		 * min_trans parameters.  If it isn't in cache or is too
 		 * old, skip to the next one.
 		 */
-		while(slot < nritems) {
+		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
 			struct extent_buffer *tmp;
@@ -3830,7 +3772,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 	struct extent_buffer *c;
 
 	WARN_ON(!path->keep_locks);
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3839,9 +3781,8 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 next:
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 		if (level == 0)
@@ -3889,9 +3830,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int ret;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
-	if (nritems == 0) {
+	if (nritems == 0)
 		return 1;
-	}
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 
@@ -3915,7 +3855,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		goto done;
 	}
 
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3923,9 +3863,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 
@@ -3946,7 +3885,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		break;
 	}
 	path->slots[level] = slot;
-	while(1) {
+	while (1) {
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
@@ -3986,7 +3925,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 	u32 nritems;
 	int ret;
 
-	while(1) {
+	while (1) {
 		if (path->slots[0] == 0) {
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ccea0648e10..eee060f8811 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -126,7 +126,6 @@ struct btrfs_ordered_sum;
 static int btrfs_csum_sizes[] = { 4, 0 };
 
 /* four bytes for CRC32 */
-//#define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
@@ -283,8 +282,8 @@ struct btrfs_header {
 } __attribute__ ((__packed__));
 
 #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
-			        sizeof(struct btrfs_header)) / \
-			        sizeof(struct btrfs_key_ptr))
+				      sizeof(struct btrfs_header)) / \
+				     sizeof(struct btrfs_key_ptr))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
@@ -1512,7 +1511,7 @@ static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
 
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
 {
-	return (btrfs_header_level(eb) == 0);
+	return btrfs_header_level(eb) == 0;
 }
 
 /* struct btrfs_root_item */
@@ -1597,8 +1596,8 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
 
-static inline unsigned long btrfs_file_extent_inline_start(struct
-						   btrfs_file_extent_item *e)
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
 {
 	unsigned long offset = (unsigned long)e;
 	offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
@@ -1660,20 +1659,20 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 				      const char *name, int len)
 {
 	/* if we already have a name just free it */
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 
 	root->name = kmalloc(len+1, GFP_KERNEL);
 	if (!root->name)
 		return -ENOMEM;
 
 	memcpy(root->name, name, len);
-	root->name[len] ='\0';
+	root->name[len] = '\0';
 
 	return 0;
 }
 
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
 	if (level == 0)
 		return root->leafsize;
 	return root->nodesize;
@@ -1707,9 +1706,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -1908,8 +1907,9 @@ int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest_root);
 /* dir-item.c */
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, const char *name,
+			  int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 5040b71f190..926a0b287a7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -333,7 +333,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
-	while(cur < total_len) {
+	while (cur < total_len) {
 		this_len = sizeof(*dir_item) +
 			btrfs_dir_name_len(leaf, dir_item) +
 			btrfs_dir_data_len(leaf, dir_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dae25e78a6b..81a313874ae 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h> // for block_sync_page
+#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -40,19 +40,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-#if 0
-static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
-{
-	if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
-		printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
-		       (unsigned long long)extent_buffer_blocknr(buf),
-		       (unsigned long long)btrfs_header_blocknr(buf));
-		return 1;
-	}
-	return 0;
-}
-#endif
-
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
@@ -128,23 +115,13 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		u64 failed_start = em->start;
 		u64 failed_len = em->len;
 
-		printk("failed to insert %Lu %Lu -> %Lu into tree\n",
-		       em->start, em->len, em->block_start);
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (em) {
-			printk("after failing, found %Lu %Lu %Lu\n",
-			       em->start, em->len, em->block_start);
 			ret = 0;
 		} else {
 			em = lookup_extent_mapping(em_tree, failed_start,
 						   failed_len);
-			if (em) {
-				printk("double failure lookup gives us "
-				       "%Lu %Lu -> %Lu\n", em->start,
-				       em->len, em->block_start);
-				free_extent_map(em);
-			}
 			ret = -EIO;
 		}
 	} else if (ret) {
@@ -191,15 +168,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long inline_result;
 
 	len = buf->len - offset;
-	while(len > 0) {
+	while (len > 0) {
 		err = map_private_extent_buffer(buf, offset, 32,
 					&map_token, &kaddr,
 					&map_start, &map_len, KM_USER0);
-		if (err) {
-			printk("failed to map extent buffer! %lu\n",
-			       offset);
+		if (err)
 			return 1;
-		}
 		cur_len = min(len, map_len - (offset - map_start));
 		crc = btrfs_csum_data(root, kaddr + offset - map_start,
 				      crc, cur_len);
@@ -218,15 +192,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
-		/* FIXME, this is not good */
 		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
 			u32 found = 0;
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk("btrfs: %s checksum verify failed on %llu "
-			       "wanted %X found %X level %d\n",
+			printk(KERN_INFO "btrfs: %s checksum verify failed "
+			       "on %llu wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
 			       buf->start, val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
@@ -293,7 +266,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
-printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -307,9 +280,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
 }
 
 /*
- * checksum a dirty tree block before IO.  This has extra checks to make
- * sure we only fill in the checksum field in the first page of a multi-page block
+ * checksum a dirty tree block before IO.  This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
  */
+
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -327,28 +301,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (!page->private)
 		goto out;
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
-		       start, found_start, len);
 		WARN_ON(1);
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
 		WARN_ON(1);
 		goto err;
 	}
 	if (!PageUptodate(page)) {
-		printk("csum not up to date page %lu\n", page->index);
 		WARN_ON(1);
 		goto err;
 	}
@@ -396,29 +364,30 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto out;
 	if (!page->private)
 		goto out;
+
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("bad tree block start %llu %llu\n",
+		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
 		       (unsigned long long)found_start,
 		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
+		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+		       eb->first_page->index, page->index);
 		WARN_ON(1);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk("bad fsid on block %Lu\n", eb->start);
+		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -578,7 +547,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   HZ/10);
 	}
 #endif
-	while(atomic_read(&fs_info->async_submit_draining) &&
+	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) == 0));
@@ -594,7 +563,7 @@ static int btree_csum_one_bio(struct bio *bio)
 	struct btrfs_root *root;
 
 	WARN_ON(bio->bi_vcnt <= 0);
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		csum_dirty_buffer(root, bvec->bv_page);
 		bio_index++;
@@ -680,9 +649,8 @@ static int btree_writepages(struct address_space *mapping,
 
 		num_dirty = count_range_bits(tree, &start, (u64)-1,
 					     thresh, EXTENT_DIRTY);
-		if (num_dirty < thresh) {
+		if (num_dirty < thresh)
 			return 0;
-		}
 	}
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
@@ -701,15 +669,14 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	int ret;
 
 	if (PageWriteback(page) || PageDirty(page))
-	    return 0;
+		return 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 
 	ret = try_release_extent_state(map, tree, page, gfp_flags);
-	if (!ret) {
+	if (!ret)
 		return 0;
-	}
 
 	ret = try_release_extent_buffer(tree, page);
 	if (ret == 1) {
@@ -728,8 +695,8 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		printk("warning page private not zero on page %Lu\n",
-		       page_offset(page));
+		printk(KERN_WARNING "btrfs warning page private not zero "
+		       "on page %llu\n", (unsigned long long)page_offset(page));
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -813,7 +780,7 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
 	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-				  buf->start, buf->start + buf->len -1);
+				  buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -832,11 +799,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	} else {
+	else
 		WARN_ON(1);
-	}
 	return buf;
 
 }
@@ -944,7 +910,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	if (!log_root_tree)
 		return 0;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -1165,24 +1131,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	root->in_sysfs = 1;
 	return root;
 }
-#if 0
-static int add_hasher(struct btrfs_fs_info *info, char *type) {
-	struct btrfs_hasher *hasher;
-
-	hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
-	if (!hasher)
-		return -ENOMEM;
-	hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
-	if (!hasher->hash_tfm) {
-		kfree(hasher);
-		return -EINVAL;
-	}
-	spin_lock(&info->hash_lock);
-	list_add(&hasher->list, &info->hashers);
-	spin_unlock(&info->hash_lock);
-	return 0;
-}
-#endif
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
@@ -1226,9 +1174,8 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 			continue;
 
 		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn) {
+		if (bdi->unplug_io_fn)
 			bdi->unplug_io_fn(bdi, page);
-		}
 	}
 }
 
@@ -1420,8 +1367,9 @@ static int transaction_kthread(void *arg)
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk("btrfs: total reference cache size %Lu\n",
-				root->fs_info->total_ref_cache_size);
+			printk(KERN_INFO "btrfs: total reference cache "
+			       "size %llu\n",
+			       root->fs_info->total_ref_cache_size);
 		}
 
 		mutex_lock(&root->fs_info->trans_mutex);
@@ -1592,14 +1540,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->tree_log_writers, 0);
 	fs_info->tree_log_transid = 0;
 
-#if 0
-	ret = add_hasher(fs_info, "crc32c");
-	if (ret) {
-		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
-		err = -ENOMEM;
-		goto fail_iput;
-	}
-#endif
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -1720,7 +1660,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
-		printk("btrfs: valid FS not found on %s\n", sb->s_id);
+		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
@@ -1728,8 +1668,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read the system array on %s\n",
-		       sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read the system "
+		       "array on %s\n", sb->s_id);
 		goto fail_sys_array;
 	}
 
@@ -1746,14 +1686,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-		 BTRFS_UUID_SIZE);
+	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+	   BTRFS_UUID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+		       sb->s_id);
 		goto fail_chunk_root;
 	}
 
@@ -1812,7 +1753,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
-			printk("Btrfs log replay required on RO media\n");
+			printk(KERN_WARNING "Btrfs log replay required "
+			       "on RO media\n");
 			err = -EIO;
 			goto fail_trans_kthread;
 		}
@@ -2097,7 +2039,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 
@@ -2114,7 +2057,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 	return 0;
@@ -2137,16 +2081,11 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
-#if 0
-	if (root->in_sysfs)
-		btrfs_sysfs_del_root(root);
-#endif
 	if (root->node)
 		free_extent_buffer(root->node);
 	if (root->commit_root)
 		free_extent_buffer(root->commit_root);
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 	kfree(root);
 	return 0;
 }
@@ -2157,7 +2096,7 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *gang[8];
 	int i;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					     (void **)gang, 0,
 					     ARRAY_SIZE(gang));
@@ -2228,18 +2167,17 @@ int close_ctree(struct btrfs_root *root)
 
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
-		if (ret) {
-			printk("btrfs: commit super returns %d\n", ret);
-		}
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
 	if (fs_info->delalloc_bytes) {
-		printk("btrfs: at unmount delalloc count %Lu\n",
+		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
-		printk("btrfs: at umount reference cache size %Lu\n",
-			fs_info->total_ref_cache_size);
+		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+		       (unsigned long long)fs_info->total_ref_cache_size);
 	}
 
 	if (fs_info->extent_root->node)
@@ -2248,13 +2186,13 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
 
-	if (root->fs_info->chunk_root->node);
+	if (root->fs_info->chunk_root->node)
 		free_extent_buffer(root->fs_info->chunk_root->node);
 
-	if (root->fs_info->dev_root->node);
+	if (root->fs_info->dev_root->node)
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	if (root->fs_info->csum_root->node);
+	if (root->fs_info->csum_root->node)
 		free_extent_buffer(root->fs_info->csum_root->node);
 
 	btrfs_free_block_groups(root->fs_info);
@@ -2273,7 +2211,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 #if 0
-	while(!list_empty(&fs_info->hashers)) {
+	while (!list_empty(&fs_info->hashers)) {
 		struct btrfs_hasher *hasher;
 		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
 				    hashers);
@@ -2324,9 +2262,11 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
-		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
+		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+		       "found %llu running %llu\n",
 			(unsigned long long)buf->start,
-			transid, root->fs_info->generation);
+			(unsigned long long)transid,
+			(unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
@@ -2361,9 +2301,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 48b82cd7583..85315d2c90d 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,9 +7,11 @@
 #include "export.h"
 #include "compat.h"
 
-#define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+						 parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+					     parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
 
 static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 			   int connectable)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 780c1eeb829..ec43fa526d7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,10 +49,10 @@ struct pending_extent_op {
 	int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans, struct
-				 btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all);
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *extent_root, int all);
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -247,7 +247,7 @@ static int cache_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto err;
 
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -292,9 +292,8 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
-						       btrfs_fs_info *info,
-							 u64 bytenr)
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -306,9 +305,9 @@ static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 /*
  * return the block group that contains teh given bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr)
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -492,7 +491,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  * to the key objectid.
  */
 
-static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -537,7 +536,7 @@ out:
  * updates all the backrefs that are pending on update_list for the
  * extent_root
  */
-static int noinline update_backrefs(struct btrfs_trans_handle *trans,
+static noinline int update_backrefs(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct list_head *update_list)
@@ -573,9 +572,11 @@ loop:
 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
 	    (ref_objectid != op->level &&
 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
-		       "owner %u\n", op->bytenr, op->orig_parent,
-		       ref_root, op->level);
+		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
+		       "root %llu, owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)op->orig_parent,
+		       (unsigned long long)ref_root, op->level);
 		btrfs_print_leaf(extent_root, leaf);
 		BUG();
 	}
@@ -620,7 +621,7 @@ out:
 	return 0;
 }
 
-static int noinline insert_extents(struct btrfs_trans_handle *trans,
+static noinline int insert_extents(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *extent_root,
 				   struct btrfs_path *path,
 				   struct list_head *insert_list, int nr)
@@ -781,7 +782,7 @@ static int noinline insert_extents(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -840,7 +841,7 @@ out:
 	return ret;
 }
 
-static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path)
 {
@@ -868,7 +869,7 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
 #else
 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
@@ -908,7 +909,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static int noinline free_extents(struct btrfs_trans_handle *trans,
+static noinline int free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
 {
@@ -937,10 +938,11 @@ search:
 				    extent_root->root_key.objectid,
 				    op->orig_generation, op->level, 1);
 	if (ret) {
-		printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
-		       "owner %u\n", op->bytenr,
-		       extent_root->root_key.objectid, op->orig_generation,
-		       op->level);
+		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
+		       "root %llu gen %llu owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)extent_root->root_key.objectid,
+		       (unsigned long long)op->orig_generation, op->level);
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		goto out;
@@ -1282,7 +1284,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 	if (key.objectid != bytenr) {
 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
-		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
+		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)key.objectid);
 		BUG();
 	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
@@ -1353,7 +1357,8 @@ int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("failed to find block number %Lu\n", bytenr);
+		printk(KERN_INFO "btrfs failed to find block number %llu\n",
+		       (unsigned long long)bytenr);
 		BUG();
 	}
 	l = path->nodes[0];
@@ -1738,7 +1743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
@@ -1921,10 +1926,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	spin_unlock(&space_info->lock);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
-	if (ret) {
-printk("space info full %Lu\n", flags);
+	if (ret)
 		space_info->full = 1;
-	}
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -1941,7 +1944,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	while(total) {
+	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
 			return -1;
@@ -2089,7 +2092,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2110,7 +2113,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2400,7 +2403,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
-		while(extent_slot > 0) {
+		while (extent_slot > 0) {
 			extent_slot--;
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 					      extent_slot);
@@ -2422,8 +2425,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 						&key, path, -1, 1);
 			if (ret) {
 				printk(KERN_ERR "umm, got %d back from search"
-				       ", was looking for %Lu\n", ret,
-				       bytenr);
+				       ", was looking for %llu\n", ret,
+				       (unsigned long long)bytenr);
 				btrfs_print_leaf(extent_root, path->nodes[0]);
 			}
 			BUG_ON(ret);
@@ -2432,9 +2435,12 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
-		printk("Unable to find ref byte nr %Lu root %Lu "
-		       "gen %Lu owner %Lu\n", bytenr,
-		       root_objectid, ref_generation, owner_objectid);
+		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+		       "root %llu gen %llu owner %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)root_objectid,
+		       (unsigned long long)ref_generation,
+		       (unsigned long long)owner_objectid);
 	}
 
 	leaf = path->nodes[0];
@@ -2517,8 +2523,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all)
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all)
 {
 	int ret;
 	int err = 0;
@@ -2539,7 +2545,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 
 again:
 	mutex_lock(&info->extent_ins_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pending_del, search, &start, &end,
 					    EXTENT_WRITEBACK);
 		if (ret) {
@@ -2753,7 +2759,7 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int noinline find_free_extent(struct btrfs_trans_handle *trans,
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 search_start, u64 search_end,
@@ -2762,7 +2768,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     int data)
 {
 	int ret = 0;
-	struct btrfs_root * root = orig_root->fs_info->extent_root;
+	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
 	u64 last_wanted = 0;
@@ -2995,8 +3001,10 @@ loop_check:
 			*last_ptr = ins->objectid + ins->offset;
 		ret = 0;
 	} else if (!ret) {
-		printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
-		       " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
+		printk(KERN_ERR "btrfs searching for %llu bytes, "
+		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
+		       (unsigned long long)total_needed,
+		       (unsigned long long)num_bytes,
 		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
@@ -3012,19 +3020,22 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	struct btrfs_block_group_cache *cache;
 	struct list_head *l;
 
-	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
-	       info->total_bytes - info->bytes_used - info->bytes_pinned -
-	       info->bytes_reserved, (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+	       (unsigned long long)(info->total_bytes - info->bytes_used -
+				    info->bytes_pinned - info->bytes_reserved),
+	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
-		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
-		       "%Lu pinned %Lu reserved\n",
-		       cache->key.objectid, cache->key.offset,
-		       btrfs_block_group_used(&cache->item),
-		       cache->pinned, cache->reserved);
+		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+		       "%llu pinned %llu reserved\n",
+		       (unsigned long long)cache->key.objectid,
+		       (unsigned long long)cache->key.offset,
+		       (unsigned long long)btrfs_block_group_used(&cache->item),
+		       (unsigned long long)cache->pinned,
+		       (unsigned long long)cache->reserved);
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -3045,15 +3056,15 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
-			        info->data_alloc_profile;
+			info->data_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
 	} else if (root == root->fs_info->chunk_root) {
 		alloc_profile = info->avail_system_alloc_bits &
-			        info->system_alloc_profile;
+			info->system_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
 	} else {
 		alloc_profile = info->avail_metadata_alloc_bits &
-			        info->metadata_alloc_profile;
+			info->metadata_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
@@ -3092,8 +3103,9 @@ again:
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
-		printk("allocation failed flags %Lu, wanted %Lu\n",
-		       data, num_bytes);
+		printk(KERN_ERR "btrfs allocation failed flags %llu, "
+		       "wanted %llu\n", (unsigned long long)data,
+		       (unsigned long long)num_bytes);
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
@@ -3108,7 +3120,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
+		printk(KERN_ERR "Unable to find block group for %llu\n",
+		       (unsigned long long)start);
 		return -ENOSPC;
 	}
 
@@ -3235,10 +3248,12 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	}
 
 update_block:
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid,
+				 ins->offset, 1, 0);
 	if (ret) {
-		printk("update block group failed for %Lu %Lu\n",
-		       ins->objectid, ins->offset);
+		printk(KERN_ERR "btrfs update block group failed for %llu "
+		       "%llu\n", (unsigned long long)ins->objectid,
+		       (unsigned long long)ins->offset);
 		BUG();
 	}
 out:
@@ -3420,7 +3435,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
 {
@@ -3445,15 +3460,15 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
-			      u32 *refs)
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+				     u64 len, u32 *refs)
 {
 	int ret;
 
 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
 	BUG_ON(ret);
 
-#if 0 // some debugging code in case we see problems here
+#if 0 /* some debugging code in case we see problems here */
 	/* if the refs count is one, it won't get increased again.  But
 	 * if the ref count is > 1, someone may be decreasing it at
 	 * the same time we are.
@@ -3474,8 +3489,8 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
 			free_extent_buffer(eb);
 		}
 		if (*refs == 1) {
-			printk("block %llu went down to one during drop_snap\n",
-			       (unsigned long long)start);
+			printk(KERN_ERR "btrfs block %llu went down to one "
+			       "during drop_snap\n", (unsigned long long)start);
 		}
 
 	}
@@ -3489,7 +3504,7 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
-static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level)
 {
@@ -3516,7 +3531,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	/*
 	 * walk down to the last node level and free all the leaves
 	 */
-	while(*level >= 0) {
+	while (*level >= 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -3576,10 +3591,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
-			if (printk_ratelimit()) {
-				printk("leaf ref miss for bytenr %llu\n",
-				       (unsigned long long)bytenr);
-			}
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
@@ -3641,7 +3652,7 @@ out:
  * walk_down_tree. The main difference is that it checks reference
  * counts while tree blocks are locked.
  */
-static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path, int *level)
 {
@@ -3730,7 +3741,7 @@ out:
  * to find the first node higher up where we haven't yet gone through
  * all the slots
  */
-static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 int *level, int max_level)
@@ -3839,7 +3850,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			}
 		}
 	}
-	while(1) {
+	while (1) {
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -3920,7 +3931,7 @@ static unsigned long calc_ra(unsigned long start, unsigned long last,
 	return min(last, start + nr - 1);
 }
 
-static int noinline relocate_inode_pages(struct inode *inode, u64 start,
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 					 u64 len)
 {
 	u64 page_start;
@@ -4011,7 +4022,7 @@ out_unlock:
 	return ret;
 }
 
-static int noinline relocate_data_extent(struct inode *reloc_inode,
+static noinline int relocate_data_extent(struct inode *reloc_inode,
 					 struct btrfs_key *extent_key,
 					 u64 offset)
 {
@@ -4087,7 +4098,7 @@ static int is_cowonly_root(u64 root_objectid)
 	return 0;
 }
 
-static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_ref_path *ref_path,
 				    int first_time)
@@ -4119,11 +4130,10 @@ walk_down:
 		if (level < ref_path->lowest_level)
 			break;
 
-		if (level >= 0) {
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
 		BUG_ON(bytenr == 0);
 
 		parent = ref_path->nodes[level + 1];
@@ -4170,11 +4180,12 @@ walk_up:
 	level = ref_path->current_level;
 	while (level < BTRFS_MAX_LEVEL - 1) {
 		u64 ref_objectid;
-		if (level >= 0) {
+
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
+
 		BUG_ON(bytenr == 0);
 
 		key.objectid = bytenr;
@@ -4299,7 +4310,7 @@ static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
 	return __next_ref_path(trans, extent_root, ref_path, 0);
 }
 
-static int noinline get_new_locations(struct inode *reloc_inode,
+static noinline int get_new_locations(struct inode *reloc_inode,
 				      struct btrfs_key *extent_key,
 				      u64 offset, int no_fragment,
 				      struct disk_extent **extents,
@@ -4420,7 +4431,7 @@ out:
 	return ret;
 }
 
-static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
@@ -4778,7 +4789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline invalidate_extent_cache(struct btrfs_root *root,
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
 					struct btrfs_root *target_root)
@@ -4826,7 +4837,7 @@ static int noinline invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
@@ -5035,7 +5046,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 	return 0;
 }
 
-static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root)
 {
 	struct btrfs_root *reloc_root;
@@ -5102,7 +5113,7 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
  * tree blocks are shared between reloc trees, so they are also shared
  * between subvols.
  */
-static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      struct btrfs_key *first_key,
@@ -5199,7 +5210,7 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *first_key,
@@ -5217,7 +5228,7 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct btrfs_key *extent_key)
@@ -5233,7 +5244,7 @@ out:
 	return ret;
 }
 
-static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
 						struct btrfs_ref_path *ref_path)
 {
 	struct btrfs_key root_key;
@@ -5248,7 +5259,7 @@ static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
 }
 
-static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
 					struct btrfs_block_group_cache *group,
@@ -5276,8 +5287,8 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 
 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
 	if (!ref_path) {
-	       ret = -ENOMEM;
-	       goto out;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	for (loops = 0; ; loops++) {
@@ -5497,7 +5508,7 @@ out:
 	return ret;
 }
 
-static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
+static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 					struct btrfs_block_group_cache *group)
 {
 	struct inode *inode = NULL;
@@ -5617,7 +5628,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	block_group = btrfs_lookup_block_group(info, group_start);
 	BUG_ON(!block_group);
 
-	printk("btrfs relocating block group %llu flags %llu\n",
+	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
 	       (unsigned long long)block_group->key.objectid,
 	       (unsigned long long)block_group->flags);
 
@@ -5649,7 +5660,7 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -5712,7 +5723,7 @@ next:
 	}
 
 	if (total_found > 0) {
-		printk("btrfs found %llu extents in pass %d\n",
+		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
 		       (unsigned long long)total_found, pass);
 		pass++;
 		if (total_found == skipped && pass > 2) {
@@ -5754,7 +5765,7 @@ static int find_first_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 
-	while(1) {
+	while (1) {
 		slot = path->slots[0];
 		leaf = path->nodes[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -5825,7 +5836,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
 			ret = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0bf7684207a..39edb551dca 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -32,7 +32,7 @@ static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
 #ifdef LEAK_DEBUG
-static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(leak_lock);
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -81,7 +81,11 @@ void extent_io_exit(void)
 
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
+		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+		       "state %lu in tree %p refs %d\n",
+		       (unsigned long long)state->start,
+		       (unsigned long long)state->end,
+		       state->state, state->tree, atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
 
@@ -89,7 +93,9 @@ void extent_io_exit(void)
 
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-		printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
+		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+		       "refs %d\n", (unsigned long long)eb->start,
+		       eb->len, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -158,11 +164,11 @@ EXPORT_SYMBOL(free_extent_state);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct tree_entry *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct tree_entry, rb_node);
 
@@ -185,13 +191,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 				     struct rb_node **next_ret)
 {
 	struct rb_root *root = &tree->state;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct tree_entry, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -200,14 +206,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 			n = n->rb_left;
 		else if (offset > entry->end)
 			n = n->rb_right;
-		else {
+		else
 			return n;
-		}
 	}
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset > prev_entry->end) {
+		while (prev && offset > prev_entry->end) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -217,7 +222,7 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -233,9 +238,8 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	struct rb_node *ret;
 
 	ret = __etree_search(tree, offset, &prev, NULL);
-	if (!ret) {
+	if (!ret)
 		return prev;
-	}
 	return ret;
 }
 
@@ -243,11 +247,11 @@ static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
 					  u64 offset, struct rb_node *node)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_buffer *eb;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		eb = rb_entry(parent, struct extent_buffer, rb_node);
 
@@ -268,10 +272,10 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
 					   u64 offset)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct extent_buffer *eb;
 
-	while(n) {
+	while (n) {
 		eb = rb_entry(n, struct extent_buffer, rb_node);
 		if (offset < eb->start)
 			n = n->rb_left;
@@ -363,7 +367,9 @@ static int insert_state(struct extent_io_tree *tree,
 	struct rb_node *node;
 
 	if (end < start) {
-		printk("end < start %Lu %Lu\n", end, start);
+		printk(KERN_ERR "btrfs end < start %llu %llu\n",
+		       (unsigned long long)end,
+		       (unsigned long long)start);
 		WARN_ON(1);
 	}
 	if (bits & EXTENT_DIRTY)
@@ -376,7 +382,10 @@ static int insert_state(struct extent_io_tree *tree,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+		       "%llu %llu\n", (unsigned long long)found->start,
+		       (unsigned long long)found->end,
+		       (unsigned long long)start, (unsigned long long)end);
 		free_extent_state(state);
 		return -EEXIST;
 	}
@@ -412,7 +421,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -661,8 +669,9 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
-		   int exclusive, u64 *failed_start, gfp_t mask)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			  int bits, int exclusive, u64 *failed_start,
+			  gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -763,7 +772,7 @@ again:
 		if (end < last_start)
 			this_end = end;
 		else
-			this_end = last_start -1;
+			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
 		prealloc = NULL;
@@ -891,8 +900,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			  gfp_t mask)
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+				 u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
@@ -904,8 +913,8 @@ static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			      0, NULL, mask);
 }
 
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-			   gfp_t mask)
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
@@ -1025,11 +1034,10 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->end >= start && (state->state & bits)) {
 			*start_ret = state->start;
@@ -1062,15 +1070,14 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
+		if (state->end >= start && (state->state & bits))
 			return state;
-		}
+
 		node = rb_next(node);
 		if (!node)
 			break;
@@ -1108,7 +1115,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 		goto out;
 	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (found && (state->start != cur_start ||
 			      (state->state & EXTENT_BOUNDARY))) {
@@ -1150,7 +1157,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 	if (index == locked_page->index && end_index == index)
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long, nr_pages,
 				     ARRAY_SIZE(pages)), pages);
@@ -1186,7 +1193,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 
 	/* skip the page at the start index */
 	nrpages = end_index - index + 1;
-	while(nrpages > 0) {
+	while (nrpages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nrpages, ARRAY_SIZE(pages)), pages);
@@ -1263,17 +1270,16 @@ again:
 	 * pages in order, so we can't process delalloc bytes before
 	 * locked_page
 	 */
-	if (delalloc_start < *start) {
+	if (delalloc_start < *start)
 		delalloc_start = *start;
-	}
 
 	/*
 	 * make sure to limit the number of pages we try to lock down
 	 * if we're looping.
 	 */
-	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
-	}
+
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
 				  delalloc_start, delalloc_end);
@@ -1341,7 +1347,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -1384,7 +1390,6 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	int found = 0;
 
 	if (search_end <= cur_start) {
-		printk("search_end %Lu start %Lu\n", search_end, cur_start);
 		WARN_ON(1);
 		return 0;
 	}
@@ -1399,11 +1404,10 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, cur_start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->start > search_end)
 			break;
@@ -1927,19 +1931,15 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		nr = bio_get_nr_vecs(bdev);
 
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
-	if (!bio) {
-		printk("failed to allocate bio nr %d\n", nr);
-	}
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
-	if (bio_ret) {
+	if (bio_ret)
 		*bio_ret = bio;
-	} else {
+	else
 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
-	}
 
 	return ret;
 }
@@ -2028,13 +2028,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			break;
 		}
 		extent_offset = cur - em->start;
-		if (extent_map_end(em) <= cur) {
-printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
-		}
 		BUG_ON(extent_map_end(em) <= cur);
-		if (end < cur) {
-printk("2bad mapping end %Lu cur %Lu\n", end, cur);
-		}
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
@@ -2199,7 +2193,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
-		while(delalloc_end < page_end) {
+		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
 						       &delalloc_start,
@@ -2242,9 +2236,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	nr_written++;
 
 	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after lock_extent\n");
-	}
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
@@ -2297,7 +2290,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
 
-			unlock_extent(tree, unlock_start, cur + iosize -1,
+			unlock_extent(tree, unlock_start, cur + iosize - 1,
 				      GFP_NOFS);
 
 			/*
@@ -2344,9 +2337,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
-				printk("warning page %lu not writeback, "
-				       "cur %llu end %llu\n", page->index,
-				       (unsigned long long)cur,
+				printk(KERN_ERR "btrfs warning page %lu not "
+				       "writeback, cur %llu end %llu\n",
+				       page->index, (unsigned long long)cur,
 				       (unsigned long long)end);
 			}
 
@@ -2430,8 +2423,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 retry:
 	while (!done && (index <= end) &&
 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+			      PAGECACHE_TAG_DIRTY, min(end - index,
+				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
@@ -2536,9 +2529,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_full_page);
@@ -2568,7 +2560,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.range_end	= end + 1,
 	};
 
-	while(start <= end) {
+	while (start <= end) {
 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
 		if (clear_page_dirty_for_io(page))
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
@@ -2606,9 +2598,8 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_writepages);
@@ -2666,7 +2657,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-	start += (offset + blocksize -1) & ~(blocksize - 1);
+	start += (offset + blocksize - 1) & ~(blocksize - 1);
 	if (start > end)
 		return 0;
 
@@ -2727,12 +2718,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
 	orig_block_start = block_start;
 
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
-	while(block_start <= block_end) {
+	while (block_start <= block_end) {
 		em = get_extent(inode, page, page_offset, block_start,
 				block_end - block_start + 1, 1);
-		if (IS_ERR(em) || !em) {
+		if (IS_ERR(em) || !em)
 			goto err;
-		}
+
 		cur_end = min(block_end, extent_map_end(em) - 1);
 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
 		block_off_end = block_off_start + blocksize;
@@ -3170,7 +3161,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
 		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE -1,
+				 page_offset(page) + PAGE_CACHE_SIZE - 1,
 				 GFP_NOFS);
 		unlock_page(page);
 	}
@@ -3235,7 +3226,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
 	if (ret)
 		return 1;
-	while(start <= end) {
+	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
 		uptodate = PageUptodate(page);
@@ -3321,16 +3312,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			lock_page(page);
 		}
 		locked_pages++;
-		if (!PageUptodate(page)) {
+		if (!PageUptodate(page))
 			all_uptodate = 0;
-		}
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
 			eb->flags |= EXTENT_UPTODATE;
-		if (ret) {
-			printk("all up to date but ret is %d\n", ret);
-		}
 		goto unlock_exit;
 	}
 
@@ -3345,10 +3332,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
 						      mirror_num, &bio_flags);
-			if (err) {
+			if (err)
 				ret = err;
-				printk("err %d from __extent_read_full_page\n", ret);
-			}
 		} else {
 			unlock_page(page);
 		}
@@ -3357,26 +3342,23 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait) {
-		if (ret)
-			printk("ret %d wait %d returning\n", ret, wait);
+	if (ret || !wait)
 		return ret;
-	}
+
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
-		if (!PageUptodate(page)) {
-			printk("page not uptodate after wait_on_page_locked\n");
+		if (!PageUptodate(page))
 			ret = -EIO;
-		}
 	}
+
 	if (!ret)
 		eb->flags |= EXTENT_UPTODATE;
 	return ret;
 
 unlock_exit:
 	i = start_i;
-	while(locked_pages > 0) {
+	while (locked_pages > 0) {
 		page = extent_buffer_page(eb, i);
 		i++;
 		unlock_page(page);
@@ -3403,7 +3385,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3442,8 +3424,11 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = 0;
 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
 	}
+
 	if (start + min_len > eb->len) {
-printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
+		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		       "wanted %lu %lu\n", (unsigned long long)eb->start,
+		       eb->len, start, min_len);
 		WARN_ON(1);
 	}
 
@@ -3506,7 +3491,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3542,7 +3527,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3574,7 +3559,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3607,7 +3592,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	offset = (start_offset + dst_offset) &
 		((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(dst, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3674,17 +3659,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu dst len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 
-	while(len > 0) {
+	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
@@ -3722,20 +3707,20 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset < src_offset) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
 		return;
 	}
-	while(len > 0) {
+	while (len > 0) {
 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fd3ebfb8c3c..4a83e33ada3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -89,11 +89,11 @@ EXPORT_SYMBOL(free_extent_map);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_map *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
@@ -122,13 +122,13 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct extent_map *entry;
 	struct extent_map *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct extent_map, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -145,7 +145,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset >= extent_map_end(prev_entry)) {
+		while (prev && offset >= extent_map_end(prev_entry)) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
@@ -155,7 +155,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct extent_map, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cc6e0b6de94..b11abfad81a 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,7 +24,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r,size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@@ -166,7 +166,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
@@ -192,8 +192,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 						offset + bvec->bv_len - 1,
 						EXTENT_NODATASUM, GFP_NOFS);
 				} else {
-					printk("no csum found for inode %lu "
-					       "start %llu\n", inode->i_ino,
+					printk(KERN_INFO "btrfs no csum found "
+					       "for inode %lu start %llu\n",
+					       inode->i_ino,
 					       (unsigned long long)offset);
 				}
 				item = NULL;
@@ -373,7 +374,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	BUG_ON(!ordered);
 	sums->bytenr = ordered->start;
 
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
@@ -507,7 +508,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 		key.offset = end_byte - 1;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -715,9 +716,8 @@ again:
 			goto csum;
 
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size) {
+		if (diff != csum_size)
 			goto insert;
-		}
 
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
@@ -732,7 +732,7 @@ insert:
 		u64 next_sector = sector_sum->bytenr;
 		struct btrfs_sector_sum *next = sector_sum + 1;
 
-		while(tmp < sums->len) {
+		while (tmp < sums->len) {
 			if (next_sector + root->sectorsize != next->bytenr)
 				break;
 			tmp += root->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5908521922f..0e3a13a4565 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -44,10 +44,10 @@
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
  */
-static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user * buf)
+					 const char __user *buf)
 {
 	long page_fault = 0;
 	int i;
@@ -78,7 +78,7 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -103,7 +103,7 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
 				   struct page **pages,
@@ -137,9 +137,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	hint_byte = 0;
 
-	if ((end_of_last_block & 4095) == 0) {
-		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
-	}
 	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* check for reserved extents on each page, we don't want
@@ -185,7 +182,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		len = (u64)-1;
 		testend = 0;
 	}
-	while(1) {
+	while (1) {
 		if (!split)
 			split = alloc_extent_map(GFP_NOFS);
 		if (!split2)
@@ -295,7 +292,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	path = btrfs_alloc_path();
 	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
 				       last_offset, 0);
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
@@ -314,8 +311,10 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		if (found_key.offset < last_offset) {
 			WARN_ON(1);
 			btrfs_print_leaf(root, leaf);
-			printk("inode %lu found offset %Lu expected %Lu\n",
-			       inode->i_ino, found_key.offset, last_offset);
+			printk(KERN_ERR "inode %lu found offset %llu "
+			       "expected %llu\n", inode->i_ino,
+			       (unsigned long long)found_key.offset,
+			       (unsigned long long)last_offset);
 			err = 1;
 			goto out;
 		}
@@ -331,7 +330,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			extent_end = found_key.offset +
 			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
-				~((u64)root->sectorsize -1 );
+				~((u64)root->sectorsize - 1);
 		}
 		last_offset = extent_end;
 		path->slots[0]++;
@@ -339,8 +338,9 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	if (0 && last_offset < inode->i_size) {
 		WARN_ON(1);
 		btrfs_print_leaf(root, leaf);
-		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
-		       last_offset, inode->i_size);
+		printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
+		       inode->i_ino, (unsigned long long)last_offset,
+		       (unsigned long long)inode->i_size);
 		err = 1;
 
 	}
@@ -362,7 +362,7 @@ out:
  * inline_limit is used to tell this code which offsets in the file to keep
  * if they contain inline extents.
  */
-int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
+noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
@@ -398,7 +398,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	while(1) {
+	while (1) {
 		recow = 0;
 		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
@@ -649,16 +649,15 @@ next_slot:
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
 						disk_bytenr, orig_parent,
-					        leaf->start,
+						leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
 
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
-			if (disk_bytenr != 0) {
+			if (disk_bytenr != 0)
 				inode_add_bytes(inode, extent_end - end);
-			}
 		}
 
 		if (found_extent && !keep) {
@@ -944,7 +943,7 @@ done:
  * waits for data=ordered extents to finish before allowing the pages to be
  * modified.
  */
-static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
@@ -979,7 +978,8 @@ again:
 		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
@@ -1085,7 +1085,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}
 
-	while(count > 0) {
+	while (count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(count, nrptrs *
 					(size_t)PAGE_CACHE_SIZE -
@@ -1178,7 +1178,7 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
-int btrfs_release_file(struct inode * inode, struct file * filp)
+int btrfs_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
@@ -1237,9 +1237,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0) {
+	if (ret < 0)
 		goto out;
-	}
 
 	/* we've logged all the items and now have a consistent
 	 * version of the file in the log.  It is possible that
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2e69b9c3043..d1e5f0e84c5 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,10 +213,13 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		info->offset = offset;
 		info->bytes += bytes;
 	} else if (right_info && right_info->offset != offset+bytes) {
-		printk(KERN_ERR "adding space in the middle of an existing "
-		       "free space area. existing: offset=%Lu, bytes=%Lu. "
-		       "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
-		       right_info->bytes, offset, bytes);
+		printk(KERN_ERR "btrfs adding space in the middle of an "
+		       "existing free space area. existing: "
+		       "offset=%llu, bytes=%llu. new: offset=%llu, "
+		       "bytes=%llu\n", (unsigned long long)right_info->offset,
+		       (unsigned long long)right_info->bytes,
+		       (unsigned long long)offset,
+		       (unsigned long long)bytes);
 		BUG();
 	}
 
@@ -225,11 +228,14 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 
 		if (unlikely((left_info->offset + left_info->bytes) !=
 			     offset)) {
-			printk(KERN_ERR "free space to the left of new free "
-			       "space isn't quite right. existing: offset=%Lu,"
-			       " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
-			       left_info->offset, left_info->bytes, offset,
-			       bytes);
+			printk(KERN_ERR "btrfs free space to the left "
+			       "of new free space isn't "
+			       "quite right. existing: offset=%llu, "
+			       "bytes=%llu. new: offset=%llu, bytes=%llu\n",
+			       (unsigned long long)left_info->offset,
+			       (unsigned long long)left_info->bytes,
+			       (unsigned long long)offset,
+			       (unsigned long long)bytes);
 			BUG();
 		}
 
@@ -265,8 +271,7 @@ out:
 			BUG();
 	}
 
-	if (alloc_info)
-		kfree(alloc_info);
+	kfree(alloc_info);
 
 	return ret;
 }
@@ -283,9 +288,11 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (info && info->offset == offset) {
 		if (info->bytes < bytes) {
-			printk(KERN_ERR "Found free space at %Lu, size %Lu,"
-			       "trying to use %Lu\n",
-			       info->offset, info->bytes, bytes);
+			printk(KERN_ERR "Found free space at %llu, size %llu,"
+			       "trying to use %llu\n",
+			       (unsigned long long)info->offset,
+			       (unsigned long long)info->bytes,
+			       (unsigned long long)bytes);
 			WARN_ON(1);
 			ret = -EINVAL;
 			goto out;
@@ -401,8 +408,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		//printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
-		//       info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 80038c5ef7c..2aa79873eb4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -129,7 +129,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		last_ino = key.objectid + 1;
 		path->slots[0]++;
 	}
-	// FIXME -ENOSPC
 	BUG_ON(1);
 found:
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 068bad46338..1b35ea63b6c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -124,7 +124,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
  * the btree.  The caller should have done a btrfs_drop_extents so that
  * no overlapping inline items exist in the btree
  */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 start, size_t size, size_t compressed_size,
 				struct page **compressed_pages)
@@ -148,7 +148,8 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 		cur_size = compressed_size;
 	}
 
-	path = btrfs_alloc_path(); if (!path)
+	path = btrfs_alloc_path();
+	if (!path)
 		return -ENOMEM;
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -165,7 +166,6 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	if (ret) {
 		err = ret;
-		printk("got bad ret %d\n", ret);
 		goto fail;
 	}
 	leaf = path->nodes[0];
@@ -181,7 +181,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (use_compress) {
 		struct page *cpage;
 		int i = 0;
-		while(compressed_size > 0) {
+		while (compressed_size > 0) {
 			cpage = compressed_pages[i];
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
@@ -519,8 +519,7 @@ free_pages_out:
 		WARN_ON(pages[i]->mapping);
 		page_cache_release(pages[i]);
 	}
-	if (pages)
-		kfree(pages);
+	kfree(pages);
 
 	goto out;
 }
@@ -549,7 +548,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
 	trans = btrfs_join_transaction(root, 1);
 
-	while(!list_empty(&async_cow->extents)) {
+	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
 					  struct async_extent, list);
 		list_del(&async_extent->list);
@@ -562,8 +561,8 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start + async_extent->ram_size - 1,
-				    GFP_NOFS);
+				    async_extent->start +
+				    async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			cow_file_range(inode, async_cow->locked_page,
@@ -581,7 +580,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			if (!page_started)
 				extent_write_locked_range(io_tree,
 						  inode, async_extent->start,
-					          async_extent->start +
+						  async_extent->start +
 						  async_extent->ram_size - 1,
 						  btrfs_get_extent,
 						  WB_SYNC_ALL);
@@ -618,7 +617,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -651,11 +650,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     NULL, 1, 1, 0, 1, 1, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
-				         async_extent->start,
-					 async_extent->ram_size,
-					 ins.objectid,
-					 ins.offset, async_extent->pages,
-					 async_extent->nr_pages);
+				    async_extent->start,
+				    async_extent->ram_size,
+				    ins.objectid,
+				    ins.offset, async_extent->pages,
+				    async_extent->nr_pages);
 
 		BUG_ON(ret);
 		trans = btrfs_join_transaction(root, 1);
@@ -735,14 +734,13 @@ static noinline int cow_file_range(struct inode *inode,
 
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
-	while(disk_num_bytes > 0) {
+	while (disk_num_bytes > 0) {
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
-		if (ret) {
-			BUG();
-		}
+		BUG_ON(ret);
+
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 		em->orig_start = em->start;
@@ -755,7 +753,7 @@ static noinline int cow_file_range(struct inode *inode,
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -779,11 +777,9 @@ static noinline int cow_file_range(struct inode *inode,
 			BUG_ON(ret);
 		}
 
-		if (disk_num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
-			       cur_alloc_size);
+		if (disk_num_bytes < cur_alloc_size)
 			break;
-		}
+
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
@@ -842,9 +838,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
-	if (async_cow->inode) {
+	if (async_cow->inode)
 		submit_compressed_extents(async_cow->inode, async_cow);
-	}
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -871,7 +866,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
-	while(start < end) {
+	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
 		async_cow->root = root;
@@ -904,7 +899,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 			    limit));
 		}
 
-		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		while (atomic_read(&root->fs_info->async_submit_draining) &&
 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
 			wait_event(root->fs_info->async_submit_wait,
 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
@@ -918,7 +913,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	return 0;
 }
 
-static int noinline csum_exist_in_range(struct btrfs_root *root,
+static noinline int csum_exist_in_range(struct btrfs_root *root,
 					u64 bytenr, u64 num_bytes)
 {
 	int ret;
@@ -1146,13 +1141,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 1, nr_written);
+					 page_started, 1, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 0, nr_written);
+					 page_started, 0, nr_written);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
-				     page_started, nr_written);
+					   page_started, nr_written);
 
 	return ret;
 }
@@ -1200,8 +1195,11 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		if (end - start + 1 > root->fs_info->delalloc_bytes) {
-			printk("warning: delalloc account %Lu %Lu\n",
-			       end - start + 1, root->fs_info->delalloc_bytes);
+			printk(KERN_INFO "btrfs warning: delalloc account "
+			       "%llu %llu\n",
+			       (unsigned long long)end - start + 1,
+			       (unsigned long long)
+			       root->fs_info->delalloc_bytes);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
@@ -1241,9 +1239,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	ret = btrfs_map_block(map_tree, READ, logical,
 			      &map_length, NULL, 0);
 
-	if (map_length < length + size) {
+	if (map_length < length + size)
 		return 1;
-	}
 	return 0;
 }
 
@@ -1255,8 +1252,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1341,9 +1339,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
+	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
-	}
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   GFP_NOFS);
 }
@@ -1755,14 +1752,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
-	if (ret) {
+	if (ret)
 		goto zeroit;
-	}
+
 	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != private) {
+	if (csum != private)
 		goto zeroit;
-	}
+
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
 good:
@@ -1773,9 +1770,10 @@ good:
 	return 0;
 
 zeroit:
-	printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
-	       page->mapping->host->i_ino, (unsigned long long)start, csum,
-	       private);
+	printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+	       "private %llu\n", page->mapping->host->i_ino,
+	       (unsigned long long)start, csum,
+	       (unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
@@ -2097,9 +2095,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 /*
  * copy everything in the in-memory inode into the btree.
  */
-int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *inode)
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_path *path;
@@ -2174,7 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 				  inode->i_ino,
 				  dir->i_ino, &index);
 	if (ret) {
-		printk("failed to delete reference to %.*s, "
+		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
 		       "inode %lu parent %lu\n", name_len, name,
 		       inode->i_ino, dir->i_ino);
 		goto err;
@@ -2280,9 +2277,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err) {
+	if (!err)
 		btrfs_i_size_write(inode, 0);
-	}
 
 fail_trans:
 	nr = trans->blocks_used;
@@ -2516,9 +2512,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
+	if (ret < 0)
 		goto error;
-	}
+
 	if (ret > 0) {
 		/* there are no items in the tree for us to truncate, we're
 		 * done
@@ -2530,7 +2526,7 @@ search_again:
 		path->slots[0]--;
 	}
 
-	while(1) {
+	while (1) {
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -2562,19 +2558,18 @@ search_again:
 			item_end--;
 		}
 		if (item_end < new_size) {
-			if (found_type == BTRFS_DIR_ITEM_KEY) {
+			if (found_type == BTRFS_DIR_ITEM_KEY)
 				found_type = BTRFS_INODE_ITEM_KEY;
-			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
+			else if (found_type == BTRFS_EXTENT_ITEM_KEY)
 				found_type = BTRFS_EXTENT_DATA_KEY;
-			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			else if (found_type == BTRFS_EXTENT_DATA_KEY)
 				found_type = BTRFS_XATTR_ITEM_KEY;
-			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
+			else if (found_type == BTRFS_XATTR_ITEM_KEY)
 				found_type = BTRFS_INODE_REF_KEY;
-			} else if (found_type) {
+			else if (found_type)
 				found_type--;
-			} else {
+			else
 				break;
-			}
 			btrfs_set_key_type(&key, found_type);
 			goto next;
 		}
@@ -2656,7 +2651,7 @@ delete:
 				pending_del_nr++;
 				pending_del_slot = path->slots[0];
 			} else {
-				printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
+				BUG();
 			}
 		} else {
 			break;
@@ -2938,9 +2933,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 				    namelen, 0);
 	if (IS_ERR(di))
 		ret = PTR_ERR(di);
-	if (!di || IS_ERR(di)) {
+
+	if (!di || IS_ERR(di))
 		goto out_err;
-	}
+
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
 	btrfs_free_path(path);
@@ -3020,8 +3016,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return (args->ino == inode->i_ino &&
-		args->root == BTRFS_I(inode)->root);
+	return args->ino == inode->i_ino &&
+		args->root == BTRFS_I(inode)->root;
 }
 
 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
@@ -3085,7 +3081,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct btrfs_inode *bi = BTRFS_I(dir);
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
@@ -3385,9 +3381,8 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
 
 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
 		ret = btrfs_set_inode_index_count(dir);
-		if (ret) {
+		if (ret)
 			return ret;
-		}
 	}
 
 	*index = BTRFS_I(dir)->index_cnt;
@@ -3879,12 +3874,13 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the
- * in-ram representation.  This gets more complex because of the data=ordered code,
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation.  This gets more complex because of the data=ordered code,
  * where the in-ram extents might be locked pending data=ordered completion.
  *
  * This also copies inline extents directly into the page.
  */
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -4081,7 +4077,7 @@ again:
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
 	} else {
-		printk("unkknown found_type %d\n", found_type);
+		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
 		WARN_ON(1);
 	}
 not_found:
@@ -4093,7 +4089,11 @@ not_found_em:
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
+		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+		       "[%llu %llu]\n", (unsigned long long)em->start,
+		       (unsigned long long)em->len,
+		       (unsigned long long)start,
+		       (unsigned long long)len);
 		err = -EIO;
 		goto out;
 	}
@@ -4130,8 +4130,6 @@ insert:
 				}
 			} else {
 				err = -EIO;
-				printk("failing to insert %Lu %Lu\n",
-				       start, len);
 				free_extent_map(em);
 				em = NULL;
 			}
@@ -4147,9 +4145,8 @@ out:
 		btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err) {
+		if (!err)
 			err = ret;
-		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -4482,13 +4479,15 @@ void btrfs_destroy_inode(struct inode *inode)
 	}
 	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
 			break;
 		else {
-			printk("found ordered extent %Lu %Lu\n",
-			       ordered->file_offset, ordered->len);
+			printk(KERN_ERR "btrfs found ordered "
+			       "extent %llu %llu on inode cleanup\n",
+			       (unsigned long long)ordered->file_offset,
+			       (unsigned long long)ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -4572,8 +4571,8 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	return 0;
 }
 
-static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -4663,7 +4662,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 		return -EROFS;
 
 	spin_lock(&root->fs_info->delalloc_lock);
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
@@ -4684,7 +4683,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	 * ordered extents get created before we return
 	 */
 	atomic_inc(&root->fs_info->async_submit_draining);
-	while(atomic_read(&root->fs_info->nr_async_submits) ||
+	while (atomic_read(&root->fs_info->nr_async_submits) ||
 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
 		wait_event(root->fs_info->async_submit_wait,
 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba484aac1b9..c2aa33e3feb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -311,7 +311,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 		 * to see if is references the subvolume where we are
 		 * placing this new snapshot.
 		 */
-		while(1) {
+		while (1) {
 			if (!test ||
 			    dir == snap_src->fs_info->sb->s_root ||
 			    test == snap_src->fs_info->sb->s_root ||
@@ -319,7 +319,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 				break;
 			}
 			if (S_ISLNK(test->d_inode->i_mode)) {
-				printk("Symlink in snapshot path, failed\n");
+				printk(KERN_INFO "Btrfs symlink in snapshot "
+				       "path, failed\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -329,7 +330,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 			ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
 				  path, test_oid, parent_oid);
 			if (ret == 0) {
-				printk("Snapshot creation failed, looping\n");
+				printk(KERN_INFO "Btrfs snapshot creation "
+				       "failed, looping\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -617,7 +619,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 
 		src_inode = src_file->f_path.dentry->d_inode;
 		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
-			printk("btrfs: Snapshot src from another FS\n");
+			printk(KERN_INFO "btrfs: Snapshot src from "
+			       "another FS\n");
 			ret = -EINVAL;
 			fput(src_file);
 			goto out;
@@ -810,9 +813,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	    ((off + len) & (bs-1)))
 		goto out_unlock;
 
-	printk("final src extent is %llu~%llu\n", off, len);
-	printk("final dst extent is %llu~%llu\n", destoff, len);
-
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
@@ -883,10 +883,13 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			comp = btrfs_file_extent_compression(leaf, extent);
 			type = btrfs_file_extent_type(leaf, extent);
 			if (type == BTRFS_FILE_EXTENT_REG) {
-				disko = btrfs_file_extent_disk_bytenr(leaf, extent);
-				diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+				disko = btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
 				datao = btrfs_file_extent_offset(leaf, extent);
-				datal = btrfs_file_extent_num_bytes(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf,
+								    extent);
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				/* take upper bound, may be compressed */
 				datal = btrfs_file_extent_ram_bytes(leaf,
@@ -916,8 +919,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
-				printk("  orig disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				if (off > key.offset) {
 					datao += off - key.offset;
@@ -929,8 +930,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
-				printk(" final disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				btrfs_set_file_extent_offset(leaf, extent,
 							     datao);
@@ -952,12 +951,11 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					skip = off - key.offset;
 					new_key.offset += skip;
 				}
+
 				if (key.offset + datal > off+len)
 					trim = key.offset + datal - (off+len);
-				printk("len %lld skip %lld trim %lld\n",
-				       datal, skip, trim);
+
 				if (comp && (skip || trim)) {
-					printk("btrfs clone_range can't split compressed inline extents yet\n");
 					ret = -EINVAL;
 					goto out;
 				}
@@ -969,7 +967,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					goto out;
 
 				if (skip) {
-					u32 start = btrfs_file_extent_calc_inline_size(0);
+					u32 start =
+					  btrfs_file_extent_calc_inline_size(0);
 					memmove(buf+start, buf+start+skip,
 						datal);
 				}
@@ -985,7 +984,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-	next:
+next:
 		btrfs_release_path(root, path);
 		key.offset++;
 	}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e30aa6e2958..39bae7761db 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -31,9 +31,10 @@
  * difference in almost every workload, but spinning for the right amount of
  * time needs some help.
  *
- * In general, we want to spin as long as the lock holder is doing btree searches,
- * and we should give up if they are in more expensive code.
+ * In general, we want to spin as long as the lock holder is doing btree
+ * searches, and we should give up if they are in more expensive code.
  */
+
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d9e232227da..a2094017027 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -39,11 +39,11 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_ordered_extent *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
@@ -67,13 +67,13 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 				     struct rb_node **prev_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *test;
 	struct btrfs_ordered_extent *entry;
 	struct btrfs_ordered_extent *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -88,7 +88,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && file_offset >= entry_end(prev_entry)) {
+	while (prev && file_offset >= entry_end(prev_entry)) {
 		test = rb_next(prev);
 		if (!test)
 			break;
@@ -102,7 +102,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (prev)
 		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
 				      rb_node);
-	while(prev && file_offset < entry_end(prev_entry)) {
+	while (prev && file_offset < entry_end(prev_entry)) {
 		test = rb_prev(prev);
 		if (!test)
 			break;
@@ -193,10 +193,8 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
-	if (node) {
-		printk("warning dup entry from add_ordered_extent\n");
-		BUG();
-	}
+	BUG_ON(node);
+
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
@@ -282,7 +280,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	struct btrfs_ordered_sum *sum;
 
 	if (atomic_dec_and_test(&entry->refs)) {
-		while(!list_empty(&entry->list)) {
+		while (!list_empty(&entry->list)) {
 			cur = entry->list.next;
 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
 			list_del(&sum->list);
@@ -432,11 +430,10 @@ again:
 					   orig_end >> PAGE_CACHE_SHIFT);
 
 	end = orig_end;
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
-		if (!ordered) {
+		if (!ordered)
 			break;
-		}
 		if (ordered->file_offset > orig_end) {
 			btrfs_put_ordered_extent(ordered);
 			break;
@@ -492,7 +489,7 @@ out:
  * if none is found
  */
 struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -553,7 +550,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * yet
 	 */
 	node = &ordered->rb_node;
-	while(1) {
+	while (1) {
 		node = rb_prev(node);
 		if (!node)
 			break;
@@ -581,9 +578,8 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 		 * between our ordered extent and the next one.
 		 */
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > entry_end(ordered)) {
+		if (test->file_offset > entry_end(ordered))
 			i_size_test = test->file_offset;
-		}
 	} else {
 		i_size_test = i_size_read(inode);
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 64725c13aa1..5f8f218c100 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -24,13 +24,14 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 {
 	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
 	int i;
-	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+	       "num_stripes %d\n",
 	       (unsigned long long)btrfs_chunk_length(eb, chunk),
 	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
 	       (unsigned long long)btrfs_chunk_type(eb, chunk),
 	       num_stripes);
 	for (i = 0 ; i < num_stripes ; i++) {
-		printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
+		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
 		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
 		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
 	}
@@ -38,8 +39,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 static void print_dev_item(struct extent_buffer *eb,
 			   struct btrfs_dev_item *dev_item)
 {
-	printk("\t\tdev item devid %llu "
-	       "total_bytes %llu bytes used %Lu\n",
+	printk(KERN_INFO "\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %llu\n",
 	       (unsigned long long)btrfs_device_id(eb, dev_item),
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
@@ -61,14 +62,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_dev_extent *dev_extent;
 	u32 type;
 
-	printk("leaf %llu total ptrs %d free space %d\n",
+	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
-		printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
+		printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+		       "itemsize %d\n",
 			i,
 			(unsigned long long)key.objectid, type,
 			(unsigned long long)key.offset,
@@ -76,33 +78,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-			printk("\t\tinode generation %llu size %llu mode %o\n",
-		              (unsigned long long)btrfs_inode_generation(l, ii),
+			printk(KERN_INFO "\t\tinode generation %llu size %llu "
+			       "mode %o\n",
+			       (unsigned long long)
+			       btrfs_inode_generation(l, ii),
 			      (unsigned long long)btrfs_inode_size(l, ii),
 			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
-			printk("\t\tdir oid %llu type %u\n",
+			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
 				(unsigned long long)found_key.objectid,
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printk("\t\troot data bytenr %llu refs %u\n",
-				(unsigned long long)btrfs_disk_root_bytenr(l, ri),
+			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+				(unsigned long long)
+				btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk("\t\textent data refs %u\n",
+			printk(KERN_INFO "\t\textent data refs %u\n",
 				btrfs_extent_refs(l, ei));
 			break;
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
-			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu num_refs %lu\n",
+			printk(KERN_INFO "\t\textent back ref root %llu "
+			       "gen %llu owner %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
@@ -114,26 +119,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
-				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, fi));
+				printk(KERN_INFO "\t\tinline extent data "
+				       "size %u\n",
+				       btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
-			printk("\t\textent data disk bytenr %llu nr %llu\n",
-			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
-			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu ram %llu\n",
-			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
-			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data disk bytenr %llu "
+			       "nr %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_disk_bytenr(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_disk_num_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data offset %llu "
+			       "nr %llu ram %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_offset(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_num_bytes(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-			printk("\t\tblock group used %llu\n",
-			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
+			printk(KERN_INFO "\t\tblock group used %llu\n",
+			       (unsigned long long)
+			       btrfs_disk_block_group_used(l, bi));
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
-			print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
+			print_chunk(l, btrfs_item_ptr(l, i,
+						      struct btrfs_chunk));
 			break;
 		case BTRFS_DEV_ITEM_KEY:
 			print_dev_item(l, btrfs_item_ptr(l, i,
@@ -142,7 +157,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_EXTENT_KEY:
 			dev_extent = btrfs_item_ptr(l, i,
 						    struct btrfs_dev_extent);
-			printk("\t\tdev extent chunk_tree %llu\n"
+			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
 			       "\t\tchunk objectid %llu chunk offset %llu "
 			       "length %llu\n",
 			       (unsigned long long)
@@ -171,13 +186,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		btrfs_print_leaf(root, c);
 		return;
 	}
-	printk("node %llu level %d total ptrs %d free spc %u\n",
+	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
 	       (unsigned long long)btrfs_header_bytenr(c),
 	       btrfs_header_level(c), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
-		printk("\tkey %d (%llu %u %llu) block %llu\n",
+		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
 		       i,
 		       (unsigned long long)key.objectid,
 		       key.type,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a50ebb67055..6f0acc4c9ea 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -74,11 +74,11 @@ void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_leaf_ref *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
 
@@ -98,10 +98,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 
 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct btrfs_leaf_ref *entry;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
@@ -127,7 +127,7 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!list_empty(&tree->list)) {
+	while (!list_empty(&tree->list)) {
 		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
 		BUG_ON(ref->tree != tree);
 		if (ref->root_gen > max_root_gen)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f99335a999d..b48650de447 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,8 +132,9 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("unable to update root key %Lu %u %Lu\n",
-		       key->objectid, key->type, key->offset);
+		printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+		       (unsigned long long)key->objectid, key->type,
+		       (unsigned long long)key->offset);
 		BUG_ON(1);
 	}
 
@@ -159,9 +160,9 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /*
  * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an offset
- * lower than the latest root.  They need to be queued for deletion to finish
- * what was happening when we crashed.
+ * the process of being deleted if we crashed.  This is any root item with an
+ * offset lower than the latest root.  They need to be queued for deletion to
+ * finish what was happening when we crashed.
  */
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest)
@@ -188,7 +189,7 @@ again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
@@ -258,11 +259,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-	if (ret) {
-btrfs_print_leaf(root, path->nodes[0]);
-printk("failed to del %Lu %u %Lu\n", key->objectid, key->type, key->offset);
 
-	}
 	BUG_ON(ret != 0);
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8d7f568009c..c0f7ecaf1e7 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -66,7 +66,7 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 		unsigned long map_len;					\
 		u##bits res;						\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
@@ -103,7 +103,7 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 		unsigned long map_start;				\
 		unsigned long map_len;					\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ccdcb7bb7ad..b4c101d9322 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,18 +55,12 @@
 
 static struct super_operations btrfs_super_ops;
 
-static void btrfs_put_super (struct super_block * sb)
+static void btrfs_put_super(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
 	ret = close_ctree(root);
-	if (ret) {
-		printk("close ctree returns %d\n", ret);
-	}
-#if 0
-	btrfs_sysfs_del_super(root->fs_info);
-#endif
 	sb->s_fs_info = NULL;
 }
 
@@ -299,12 +293,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	return error;
 }
 
-static int btrfs_fill_super(struct super_block * sb,
+static int btrfs_fill_super(struct super_block *sb,
 			    struct btrfs_fs_devices *fs_devices,
-			    void * data, int silent)
+			    void *data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root_dentry;
+	struct inode *inode;
+	struct dentry *root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
 	struct btrfs_inode *bi;
@@ -479,8 +473,10 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		root = dget(s->s_root);
 	else {
 		mutex_lock(&s->s_root->d_inode->i_mutex);
-		root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+		root = lookup_one_len(subvol_name, s->s_root,
+				      strlen(subvol_name));
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
+
 		if (IS_ERR(root)) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -557,8 +553,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
+
 	/* We treat it as constant endianness (it doesn't matter _which_)
-	   because we want the fsid to come out the same whether mounted 
+	   because we want the fsid to come out the same whether mounted
 	   on a big-endian or little-endian host */
 	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
 	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
@@ -658,7 +655,7 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk("misc_deregister failed for control device");
+		printk(KERN_INFO "misc_deregister failed for control device\n");
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 04087c02084..a240b6fa81d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -67,7 +67,8 @@ struct btrfs_root_attr {
 };
 
 #define ROOT_ATTR(name, mode, show, store) \
-static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+							      show, store)
 
 ROOT_ATTR(blocks_used,	0444,	root_blocks_used_show,	NULL);
 ROOT_ATTR(block_limit,	0644,	root_block_limit_show,	NULL);
@@ -86,7 +87,8 @@ struct btrfs_super_attr {
 };
 
 #define SUPER_ATTR(name, mode, show, store) \
-static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+								show, store)
 
 SUPER_ATTR(blocks_used,		0444,	super_blocks_used_show,		NULL);
 SUPER_ATTR(total_blocks,	0444,	super_total_blocks_show,	NULL);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e7b56e9d3a..56ab1f5ea11 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,9 +28,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
-
 #define BTRFS_ROOT_TRANS_TAG 0
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -85,10 +82,10 @@ static noinline int join_transaction(struct btrfs_root *root)
 }
 
 /*
- * this does all the record keeping required to make sure that a
- * reference counted root is properly recorded in a given transaction.
- * This is required to make sure the old root from before we joined the transaction
- * is deleted when the transaction commits
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction.  This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
  */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
@@ -144,7 +141,7 @@ static void wait_current_trans(struct btrfs_root *root)
 	if (cur_trans && cur_trans->blocked) {
 		DEFINE_WAIT(wait);
 		cur_trans->use_count++;
-		while(1) {
+		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
 			if (cur_trans->blocked) {
@@ -213,7 +210,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 {
 	DEFINE_WAIT(wait);
 	mutex_lock(&root->fs_info->trans_mutex);
-	while(!commit->commit_done) {
+	while (!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (commit->commit_done)
@@ -228,8 +225,8 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 }
 
 /*
- * rate limit against the drop_snapshot code.  This helps to slow down new operations
- * if the drop_snapshot code isn't able to keep up.
+ * rate limit against the drop_snapshot code.  This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
  */
 static void throttle_on_drops(struct btrfs_root *root)
 {
@@ -332,12 +329,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	u64 end;
 	unsigned long index;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		while(start <= end) {
+		while (start <= end) {
 			cond_resched();
 
 			index = start >> PAGE_CACHE_SHIFT;
@@ -368,14 +365,14 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
 
 		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
-		while(start <= end) {
+		while (start <= end) {
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 			page = find_get_page(btree_inode->i_mapping, index);
@@ -431,7 +428,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	btrfs_write_dirty_block_groups(trans, root);
 	btrfs_extent_post_op(trans, root);
 
-	while(1) {
+	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
 			break;
@@ -472,7 +469,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 	btrfs_extent_post_op(trans, fs_info->tree_root);
 
-	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
+	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
@@ -521,7 +518,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	int err = 0;
 	u32 refs;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 						 ARRAY_SIZE(gang),
 						 BTRFS_ROOT_TRANS_TAG);
@@ -653,7 +650,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	int ret = 0;
 	int err;
 
-	while(!list_empty(list)) {
+	while (!list_empty(list)) {
 		struct btrfs_root *root;
 
 		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
@@ -663,13 +660,12 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		root = dirty->latest_root;
 		atomic_inc(&root->fs_info->throttles);
 
-		while(1) {
+		while (1) {
 			trans = btrfs_start_transaction(tree_root, 1);
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
-			if (ret != -EAGAIN) {
+			if (ret != -EAGAIN)
 				break;
-			}
 			mutex_unlock(&root->fs_info->drop_mutex);
 
 			err = btrfs_update_root(trans,
@@ -874,7 +870,7 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 	struct list_head *head = &trans->transaction->pending_snapshots;
 	int ret;
 
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		pending = list_entry(head->next,
 				     struct btrfs_pending_snapshot, list);
 		ret = finish_pending_snapshot(fs_info, pending);
@@ -1076,9 +1072,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (root->fs_info->closing) {
+	if (root->fs_info->closing)
 		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ffe7f639732..ea292117f88 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -66,9 +66,9 @@ static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 	trans->block_group = BTRFS_I(inode)->block_group;
 }
 
-static inline void btrfs_update_inode_block_group(struct
-						  btrfs_trans_handle *trans,
-						  struct inode *inode)
+static inline void btrfs_update_inode_block_group(
+					  struct btrfs_trans_handle *trans,
+					  struct inode *inode)
 {
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a6a3956cedf..3e8358c3616 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,10 +23,11 @@
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read things
- * from disk, otherwise read all the leaves and try to get key order to
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
  * better reflect disk order
  */
+
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
@@ -65,9 +66,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(root->node);
 	orig_level = level;
 
-	if (level == 0) {
+	if (level == 0)
 		goto out;
-	}
+
 	if (root->defrag_progress.objectid == 0) {
 		struct extent_buffer *root_node;
 		u32 nritems;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b1c2921f5be..3a72a1b6c24 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -829,7 +829,7 @@ conflict_again:
 		 */
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			victim_ref = (struct btrfs_inode_ref *)ptr;
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
 								   victim_ref);
@@ -938,9 +938,8 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
-	if (!sums) {
+	if (!sums)
 		return -ENOMEM;
-	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
@@ -952,7 +951,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	sector_sum = sums->sums;
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
-	while(item_size > 0) {
+	while (item_size > 0) {
 		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
@@ -995,7 +994,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
@@ -1012,7 +1011,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
 						   path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			struct btrfs_inode_ref *ref;
 
 			ref = (struct btrfs_inode_ref *)ptr;
@@ -1048,7 +1047,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = (u64)-1;
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0)
 			break;
@@ -1206,8 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	if (key->type == BTRFS_DIR_ITEM_KEY) {
 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
 				       name, name_len, 1);
-	}
-	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
 						     key->objectid,
 						     key->offset, name,
@@ -1282,7 +1280,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
@@ -1408,7 +1406,7 @@ again:
 	item_size = btrfs_item_size_nr(eb, slot);
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		name = kmalloc(name_len, GFP_NOFS);
@@ -1513,14 +1511,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 again:
 	range_start = 0;
 	range_end = 0;
-	while(1) {
+	while (1) {
 		ret = find_dir_range(log, path, dirid, key_type,
 				     &range_start, &range_end);
 		if (ret != 0)
 			break;
 
 		dir_key.offset = range_start;
-		while(1) {
+		while (1) {
 			int nritems;
 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
 						0, 0);
@@ -1676,7 +1674,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	return 0;
 }
 
-static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level,
 				   struct walk_control *wc)
@@ -1694,7 +1692,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	while(*level > 0) {
+	while (*level > 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -1753,11 +1751,11 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	if (path->nodes[*level] == root->node) {
+	if (path->nodes[*level] == root->node)
 		parent = path->nodes[*level];
-	} else {
+	else
 		parent = path->nodes[*level + 1];
-	}
+
 	bytenr = path->nodes[*level]->start;
 
 	blocksize = btrfs_level_size(root, *level);
@@ -1790,7 +1788,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path, int *level,
 				 struct walk_control *wc)
@@ -1801,7 +1799,7 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 	int slot;
 	int ret;
 
-	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
@@ -1875,7 +1873,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	extent_buffer_get(log->node);
 	path->slots[level] = 0;
 
-	while(1) {
+	while (1) {
 		wret = walk_down_log_tree(trans, log, path, &level, wc);
 		if (wret > 0)
 			break;
@@ -1941,7 +1939,7 @@ static int wait_log_commit(struct btrfs_root *log)
 			schedule();
 		finish_wait(&log->fs_info->tree_log_wait, &wait);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while(transid == log->fs_info->tree_log_transid &&
+	} while (transid == log->fs_info->tree_log_transid &&
 		atomic_read(&log->fs_info->tree_log_commit));
 	return 0;
 }
@@ -1965,13 +1963,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
-	while(1) {
+	while (1) {
 		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
 
-		while(atomic_read(&log->fs_info->tree_log_writers)) {
+		while (atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
@@ -2030,7 +2028,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2287,9 +2285,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			struct btrfs_key tmp;
 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
 					      path->slots[0]);
-			if (key_type == tmp.type) {
+			if (key_type == tmp.type)
 				first_offset = max(min_offset, tmp.offset) + 1;
-			}
 		}
 		goto done;
 	}
@@ -2319,7 +2316,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	 * we have a block from this transaction, log every item in it
 	 * from our directory
 	 */
-	while(1) {
+	while (1) {
 		struct btrfs_key tmp;
 		src = path->nodes[0];
 		nritems = btrfs_header_nritems(src);
@@ -2396,7 +2393,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 again:
 	min_key = 0;
 	max_key = 0;
-	while(1) {
+	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
 				    dst_path, key_type, min_key,
 				    &max_key);
@@ -2432,7 +2429,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	key.type = max_key_type;
 	key.offset = (u64)-1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
 
 		if (ret != 1)
@@ -2481,7 +2478,7 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 	list_add_tail(&sums->list, list);
 
 	path = btrfs_alloc_path();
-	while(disk_bytenr < end) {
+	while (disk_bytenr < end) {
 		if (!item || disk_bytenr < item_start_offset ||
 		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
@@ -2496,7 +2493,8 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 				if (ret == -ENOENT || ret == -EFBIG)
 					ret = 0;
 				sum = 0;
-				printk("log no csum found for byte %llu\n",
+				printk(KERN_INFO "log no csum found for "
+				       "byte %llu\n",
 				       (unsigned long long)disk_bytenr);
 				item = NULL;
 				btrfs_release_path(root, path);
@@ -2643,7 +2641,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	 * we have to do this after the loop above to avoid changing the
 	 * log tree while trying to change the log tree.
 	 */
-	while(!list_empty(&ordered_sums)) {
+	while (!list_empty(&ordered_sums)) {
 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
 						   struct btrfs_ordered_sum,
 						   list);
@@ -2736,7 +2734,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
 					   path, 0, trans->transid);
@@ -2848,7 +2846,7 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 
 	start_log_trans(trans, root);
 	sb = dentry->d_inode->i_sb;
-	while(1) {
+	while (1) {
 		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
 					inode_only);
 		BUG_ON(ret);
@@ -2919,7 +2917,7 @@ again:
 	key.offset = (u64)-1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 		if (ret < 0)
 			break;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6672adcec9f..b187b537888 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-static int noinline run_scheduled_bios(struct btrfs_device *device)
+static noinline int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -187,7 +187,7 @@ loop:
 	}
 	spin_unlock(&device->io_lock);
 
-	while(pending) {
+	while (pending) {
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -458,7 +458,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk("open %s failed\n", device->name);
+			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
 		}
 		set_blocksize(bdev, 4096);
@@ -570,14 +570,15 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
-		printk("device label %s ", disk_super->label);
+		printk(KERN_INFO "device label %s ", disk_super->label);
 	else {
 		/* FIXME, make a readl uuid parser */
-		printk("device fsid %llx-%llx ",
+		printk(KERN_INFO "device fsid %llx-%llx ",
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
+	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 	brelse(bh);
@@ -683,9 +684,8 @@ no_more_items:
 				goto check_pending;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
 			goto next;
-		}
 
 		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1001,14 +1001,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
 	    root->fs_info->fs_devices->rw_devices <= 4) {
-		printk("btrfs: unable to go below four devices on raid10\n");
+		printk(KERN_ERR "btrfs: unable to go below four devices "
+		       "on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
 	    root->fs_info->fs_devices->rw_devices <= 2) {
-		printk("btrfs: unable to go below two devices on raid1\n");
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1031,7 +1033,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk("btrfs: no missing devices found to remove\n");
+			printk(KERN_ERR "btrfs: no missing devices found to "
+			       "remove\n");
 			goto out;
 		}
 	} else {
@@ -1060,7 +1063,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk("btrfs: unable to remove the only writeable device\n");
+		printk(KERN_ERR "btrfs: unable to remove the only writeable "
+		       "device\n");
 		ret = -EINVAL;
 		goto error_brelse;
 	}
@@ -1286,9 +1290,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EINVAL;
 
 	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-	if (!bdev) {
+	if (!bdev)
 		return -EIO;
-	}
 
 	if (root->fs_info->fs_devices->seeding) {
 		seeding_dev = 1;
@@ -1401,8 +1404,8 @@ error:
 	goto out;
 }
 
-static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
-				 struct btrfs_device *device)
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+					struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1563,7 +1566,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk("btrfs relocating chunk %llu\n",
+	printk(KERN_INFO "btrfs relocating chunk %llu\n",
 	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
@@ -1748,7 +1751,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -1916,7 +1919,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
 					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
@@ -2041,7 +2044,7 @@ again:
 		min_free += 1024 * 1024;
 
 	INIT_LIST_HEAD(&private_devs);
-	while(index < num_stripes) {
+	while (index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
@@ -2242,7 +2245,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root,
 					 struct btrfs_device *device)
 {
@@ -2338,7 +2341,7 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 {
 	struct extent_map *em;
 
-	while(1) {
+	while (1) {
 		spin_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
@@ -2413,9 +2416,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW))) {
+	if (multi_ret && !(rw & (1 << BIO_RW)))
 		stripes_allocated = 1;
-	}
 again:
 	if (multi_ret) {
 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
@@ -2434,7 +2436,9 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu len %Lu\n", logical, *length);
+		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		       (unsigned long long)logical,
+		       (unsigned long long)*length);
 		BUG();
 	}
 
@@ -2541,9 +2545,8 @@ again:
 			device = map->stripes[stripe_index].dev;
 			if (device->bdev) {
 				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn) {
+				if (bdi->unplug_io_fn)
 					bdi->unplug_io_fn(bdi, unplug_page);
-				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -2717,7 +2720,7 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static int noinline schedule_bio(struct btrfs_root *root,
+static noinline int schedule_bio(struct btrfs_root *root,
 				 struct btrfs_device *device,
 				 int rw, struct bio *bio)
 {
@@ -2785,8 +2788,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = multi->num_stripes;
 	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu "
-		       "len %Lu\n", logical, length, map_length);
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
 		BUG();
 	}
 	multi->end_io = first_bio->bi_end_io;
@@ -2794,7 +2799,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
-	while(dev_nr < total_devs) {
+	while (dev_nr < total_devs) {
 		if (total_devs > 1) {
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
@@ -3058,7 +3063,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			printk("warning devid %Lu missing\n", devid);
+			printk(KERN_WARNING "warning devid %llu missing\n",
+			       (unsigned long long)devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -3078,12 +3084,6 @@ static int read_one_dev(struct btrfs_root *root,
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
-#if 0
-	ret = btrfs_open_device(device);
-	if (ret) {
-		kfree(device);
-	}
-#endif
 	return ret;
 }
 
@@ -3174,7 +3174,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 4146f0710e6..7f332e27089 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -264,7 +264,8 @@ struct xattr_handler *btrfs_xattr_handlers[] = {
  */
 static bool btrfs_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	return !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c4617cde6c7..ecfbce836d3 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -152,7 +152,7 @@ static int free_workspace(struct workspace *workspace)
 static void free_workspaces(void)
 {
 	struct workspace *workspace;
-	while(!list_empty(&idle_workspace)) {
+	while (!list_empty(&idle_workspace)) {
 		workspace = list_entry(idle_workspace.next, struct workspace,
 				       list);
 		list_del(&workspace->list);
@@ -397,12 +397,10 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		ret = -1;
 		goto out;
 	}
-	while(workspace->inf_strm.total_in < srclen) {
+	while (workspace->inf_strm.total_in < srclen) {
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
-
 		/*
 		 * buf start is the byte offset we're of the start of
 		 * our workspace buffer
@@ -424,16 +422,14 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 			/* we didn't make progress in this inflate
 			 * call, we're done
 			 */
-			if (ret != Z_STREAM_END) {
+			if (ret != Z_STREAM_END)
 				ret = -1;
-			}
 			break;
 		}
 
 		/* we haven't yet hit data corresponding to this page */
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
 		/*
 		 * the start of the data we care about is offset into
@@ -448,7 +444,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		current_buf_start = buf_start;
 
 		/* copy bytes from the working buffer into the pages */
-		while(working_bytes > 0) {
+		while (working_bytes > 0) {
 			bytes = min(PAGE_CACHE_SIZE - pg_offset,
 				    PAGE_CACHE_SIZE - buf_offset);
 			bytes = min(bytes, working_bytes);
@@ -471,6 +467,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 					ret = 0;
 					goto done;
 				}
+
 				page_out = bvec[page_out_index].bv_page;
 				pg_offset = 0;
 				page_bytes_left = PAGE_CACHE_SIZE;
@@ -480,9 +477,8 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 				 * make sure our new page is covered by this
 				 * working buffer
 				 */
-				if (total_out <= start_byte) {
+				if (total_out <= start_byte)
 					goto next;
-				}
 
 				/* the next page in the biovec might not
 				 * be adjacent to the last page, but it
@@ -517,11 +513,10 @@ next:
 							   PAGE_CACHE_SIZE);
 		}
 	}
-	if (ret != Z_STREAM_END) {
+	if (ret != Z_STREAM_END)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
 done:
 	zlib_inflateEnd(&workspace->inf_strm);
 	if (data_in)
@@ -579,16 +574,15 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 		goto out;
 	}
 
-	while(bytes_left > 0) {
+	while (bytes_left > 0) {
 		unsigned long buf_start;
 		unsigned long buf_offset;
 		unsigned long bytes;
 		unsigned long pg_offset = 0;
 
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
 
 		buf_start = total_out;
 		total_out = workspace->inf_strm.total_out;
@@ -598,15 +592,13 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 			break;
 		}
 
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
-		if (total_out > start_byte && buf_start < start_byte) {
+		if (total_out > start_byte && buf_start < start_byte)
 			buf_offset = start_byte - buf_start;
-		} else {
+		else
 			buf_offset = 0;
-		}
 
 		bytes = min(PAGE_CACHE_SIZE - pg_offset,
 			    PAGE_CACHE_SIZE - buf_offset);
@@ -622,11 +614,12 @@ next:
 		workspace->inf_strm.next_out = workspace->buf;
 		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 	}
-	if (ret != Z_STREAM_END && bytes_left != 0) {
+
+	if (ret != Z_STREAM_END && bytes_left != 0)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
+
 	zlib_inflateEnd(&workspace->inf_strm);
 out:
 	free_workspace(workspace);
-- 
cgit v1.2.3


From 43b774ba1378dc5e952aadb829048bbe90ba33e4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 22:05:48 -0500
Subject: Btrfs: drop EXPORT symbols from extent_io.c

They should stay out until this is turned into generic code.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 56 ----------------------------------------------------
 1 file changed, 56 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 39edb551dca..e086d407f1f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -116,7 +116,6 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 	spin_lock_init(&tree->buffer_lock);
 	tree->mapping = mapping;
 }
-EXPORT_SYMBOL(extent_io_tree_init);
 
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
@@ -140,7 +139,6 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	init_waitqueue_head(&state->wq);
 	return state;
 }
-EXPORT_SYMBOL(alloc_extent_state);
 
 static void free_extent_state(struct extent_state *state)
 {
@@ -159,7 +157,6 @@ static void free_extent_state(struct extent_state *state)
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
-EXPORT_SYMBOL(free_extent_state);
 
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
@@ -580,7 +577,6 @@ search_again:
 		cond_resched();
 	goto again;
 }
-EXPORT_SYMBOL(clear_extent_bit);
 
 static int wait_on_state(struct extent_io_tree *tree,
 			 struct extent_state *state)
@@ -644,7 +640,6 @@ out:
 	spin_unlock(&tree->lock);
 	return 0;
 }
-EXPORT_SYMBOL(wait_extent_bit);
 
 static void set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
@@ -821,7 +816,6 @@ search_again:
 		cond_resched();
 	goto again;
 }
-EXPORT_SYMBOL(set_extent_bit);
 
 /* wrappers around set/clear extent bit */
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -830,14 +824,12 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
 			      mask);
 }
-EXPORT_SYMBOL(set_extent_dirty);
 
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
 }
-EXPORT_SYMBOL(set_extent_ordered);
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
@@ -845,14 +837,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 	return set_extent_bit(tree, start, end, bits, 0, NULL,
 			      mask);
 }
-EXPORT_SYMBOL(set_extent_bits);
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_bits);
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
@@ -861,7 +851,6 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			      EXTENT_DELALLOC | EXTENT_DIRTY,
 			      0, NULL, mask);
 }
-EXPORT_SYMBOL(set_extent_delalloc);
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
@@ -869,14 +858,12 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 	return clear_extent_bit(tree, start, end,
 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_dirty);
 
 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 			 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_ordered);
 
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
@@ -884,7 +871,6 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
 			      mask);
 }
-EXPORT_SYMBOL(set_extent_new);
 
 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
@@ -898,7 +884,6 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
 			      mask);
 }
-EXPORT_SYMBOL(set_extent_uptodate);
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
 				 u64 end, gfp_t mask)
@@ -923,7 +908,6 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
 }
-EXPORT_SYMBOL(wait_on_extent_writeback);
 
 /*
  * either insert or lock state struct between start and end use mask to tell
@@ -946,7 +930,6 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 	}
 	return err;
 }
-EXPORT_SYMBOL(lock_extent);
 
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask)
@@ -964,14 +947,12 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 	}
 	return 1;
 }
-EXPORT_SYMBOL(try_lock_extent);
 
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
 }
-EXPORT_SYMBOL(unlock_extent);
 
 /*
  * helper function to set pages and extents in the tree dirty
@@ -992,7 +973,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
 	set_extent_dirty(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(set_range_dirty);
 
 /*
  * helper function to set both pages and extents in the tree writeback
@@ -1053,7 +1033,6 @@ out:
 	spin_unlock(&tree->lock);
 	return ret;
 }
-EXPORT_SYMBOL(find_first_extent_bit);
 
 /* find the first state struct with 'bits' set after 'start', and
  * return it.  tree->lock must be held.  NULL will returned if
@@ -1085,7 +1064,6 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 out:
 	return NULL;
 }
-EXPORT_SYMBOL(find_first_extent_bit_state);
 
 /*
  * find a contiguous range of bytes in the file marked as delalloc, not
@@ -1372,7 +1350,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(extent_clear_unlock_delalloc);
 
 /*
  * count the number of bytes in the tree that have a given bit(s)
@@ -1598,7 +1575,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	spin_unlock(&tree->lock);
 	return bitset;
 }
-EXPORT_SYMBOL(test_range_bit);
 
 /*
  * helper function to set a given page up to date if all the
@@ -1952,7 +1928,6 @@ void set_page_extent_mapped(struct page *page)
 		set_page_private(page, EXTENT_PAGE_PRIVATE);
 	}
 }
-EXPORT_SYMBOL(set_page_extent_mapped);
 
 static void set_page_extent_head(struct page *page, unsigned long len)
 {
@@ -2128,7 +2103,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
-EXPORT_SYMBOL(extent_read_full_page);
 
 /*
  * the writepage semantics are similar to regular writepage.  extent
@@ -2533,7 +2507,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		submit_one_bio(WRITE, epd.bio, 0, 0);
 	return ret;
 }
-EXPORT_SYMBOL(extent_write_full_page);
 
 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 			      u64 start, u64 end, get_extent_t *get_extent,
@@ -2579,8 +2552,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		submit_one_bio(WRITE, epd.bio, 0, 0);
 	return ret;
 }
-EXPORT_SYMBOL(extent_write_locked_range);
-
 
 int extent_writepages(struct extent_io_tree *tree,
 		      struct address_space *mapping,
@@ -2602,7 +2573,6 @@ int extent_writepages(struct extent_io_tree *tree,
 		submit_one_bio(WRITE, epd.bio, 0, 0);
 	return ret;
 }
-EXPORT_SYMBOL(extent_writepages);
 
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
@@ -2643,7 +2613,6 @@ int extent_readpages(struct extent_io_tree *tree,
 		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
-EXPORT_SYMBOL(extent_readpages);
 
 /*
  * basic invalidatepage code, this waits on any locked or writeback
@@ -2668,7 +2637,6 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 			 1, 1, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(extent_invalidatepage);
 
 /*
  * simple commit_write call, set_range_dirty is used to mark both
@@ -2689,7 +2657,6 @@ int extent_commit_write(struct extent_io_tree *tree,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(extent_commit_write);
 
 int extent_prepare_write(struct extent_io_tree *tree,
 			 struct inode *inode, struct page *page,
@@ -2787,7 +2754,6 @@ err:
 	/* FIXME, zero out newly allocated blocks on error */
 	return err;
 }
-EXPORT_SYMBOL(extent_prepare_write);
 
 /*
  * a helper for releasepage, this tests for areas of the page that
@@ -2813,7 +2779,6 @@ int try_release_extent_state(struct extent_map_tree *map,
 	}
 	return ret;
 }
-EXPORT_SYMBOL(try_release_extent_state);
 
 /*
  * a helper for releasepage.  As long as there are no locked extents
@@ -2863,7 +2828,6 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	}
 	return try_release_extent_state(map, tree, page, mask);
 }
-EXPORT_SYMBOL(try_release_extent_mapping);
 
 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent)
@@ -3042,7 +3006,6 @@ free_eb:
 	__free_extent_buffer(eb);
 	return exists;
 }
-EXPORT_SYMBOL(alloc_extent_buffer);
 
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					 u64 start, unsigned long len,
@@ -3061,7 +3024,6 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 
 	return eb;
 }
-EXPORT_SYMBOL(find_extent_buffer);
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
@@ -3073,7 +3035,6 @@ void free_extent_buffer(struct extent_buffer *eb)
 
 	WARN_ON(1);
 }
-EXPORT_SYMBOL(free_extent_buffer);
 
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb)
@@ -3128,7 +3089,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(clear_extent_buffer_dirty);
 
 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
 				    struct extent_buffer *eb)
@@ -3136,7 +3096,6 @@ int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
 	return wait_on_extent_writeback(tree, eb->start,
 					eb->start + eb->len - 1);
 }
-EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
 
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb)
@@ -3167,7 +3126,6 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(set_extent_buffer_dirty);
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 				struct extent_buffer *eb)
@@ -3212,7 +3170,6 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(set_extent_buffer_uptodate);
 
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end)
@@ -3267,7 +3224,6 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	}
 	return pg_uptodate;
 }
-EXPORT_SYMBOL(extent_buffer_uptodate);
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb,
@@ -3366,7 +3322,6 @@ unlock_exit:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(read_extent_buffer_pages);
 
 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 			unsigned long start,
@@ -3399,7 +3354,6 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 		i++;
 	}
 }
-EXPORT_SYMBOL(read_extent_buffer);
 
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 			       unsigned long min_len, char **token, char **map,
@@ -3439,7 +3393,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	*map_len = PAGE_CACHE_SIZE - offset;
 	return 0;
 }
-EXPORT_SYMBOL(map_private_extent_buffer);
 
 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		      unsigned long min_len,
@@ -3465,13 +3418,11 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	}
 	return err;
 }
-EXPORT_SYMBOL(map_extent_buffer);
 
 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
 {
 	kunmap_atomic(token, km);
 }
-EXPORT_SYMBOL(unmap_extent_buffer);
 
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
@@ -3509,7 +3460,6 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 	}
 	return ret;
 }
-EXPORT_SYMBOL(memcmp_extent_buffer);
 
 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 			 unsigned long start, unsigned long len)
@@ -3542,7 +3492,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 		i++;
 	}
 }
-EXPORT_SYMBOL(write_extent_buffer);
 
 void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len)
@@ -3573,7 +3522,6 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 		i++;
 	}
 }
-EXPORT_SYMBOL(memset_extent_buffer);
 
 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 			unsigned long dst_offset, unsigned long src_offset,
@@ -3608,7 +3556,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 		i++;
 	}
 }
-EXPORT_SYMBOL(copy_extent_buffer);
 
 static void move_pages(struct page *dst_page, struct page *src_page,
 		       unsigned long dst_off, unsigned long src_off,
@@ -3692,7 +3639,6 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		len -= cur;
 	}
 }
-EXPORT_SYMBOL(memcpy_extent_buffer);
 
 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 			   unsigned long src_offset, unsigned long len)
@@ -3741,7 +3687,6 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		len -= cur;
 	}
 }
-EXPORT_SYMBOL(memmove_extent_buffer);
 
 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
 {
@@ -3770,4 +3715,3 @@ out:
 	spin_unlock(&tree->buffer_lock);
 	return ret;
 }
-EXPORT_SYMBOL(try_release_extent_buffer);
-- 
cgit v1.2.3


From b3881f74b31b7d47d0f1c4d89ac3e7f0b9c05e3e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 5 Jan 2009 22:46:26 -0500
Subject: ext4: Add mount option to set kjournald's I/O priority

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 fs/ext4/super.c | 29 +++++++++++++++++++++++++----
 fs/ioprio.c     |  3 ++-
 2 files changed, 27 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8036392b212..8ff8709828f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1013,7 +1013,7 @@ enum {
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
-	Opt_inode_readahead_blks
+	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
 static const match_table_t tokens = {
@@ -1074,6 +1074,7 @@ static const match_table_t tokens = {
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
+	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_err, NULL},
 };
 
@@ -1098,8 +1099,11 @@ static ext4_fsblk_t get_sb_block(void **data)
 	return sb_block;
 }
 
+#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
+			 unsigned int *journal_ioprio,
 			 ext4_fsblk_t *n_blocks_count, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1492,6 +1496,14 @@ set_qf_format:
 				return 0;
 			sbi->s_inode_readahead_blks = option;
 			break;
+		case Opt_journal_ioprio:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 7)
+				break;
+			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
+							    option);
+			break;
 		default:
 			printk(KERN_ERR
 			       "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -2035,6 +2047,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	int features;
 	__u64 blocks_count;
 	int err;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -2141,7 +2154,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
 
-	if (!parse_options((char *) data, sb, &journal_devnum, NULL, 0))
+	if (!parse_options((char *) data, sb, &journal_devnum,
+			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
 
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -2506,6 +2520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	default:
 		break;
 	}
+	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
 
@@ -3127,6 +3142,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
 	ext4_group_t g;
+	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -3145,11 +3161,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	for (i = 0; i < MAXQUOTAS; i++)
 		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
 #endif
+	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
+		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) {
+	if (!parse_options(data, sb, NULL, &journal_ioprio,
+			   &n_blocks_count, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -3162,8 +3181,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 	es = sbi->s_es;
 
-	if (sbi->s_journal)
+	if (sbi->s_journal) {
 		ext4_init_journal_params(sb, sbi->s_journal);
+		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+	}
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > ext4_blocks_count(es)) {
diff --git a/fs/ioprio.c b/fs/ioprio.c
index 3569e0ad86a..1a39ac37094 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -27,7 +27,7 @@
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
 
-static int set_task_ioprio(struct task_struct *task, int ioprio)
+int set_task_ioprio(struct task_struct *task, int ioprio)
 {
 	int err;
 	struct io_context *ioc;
@@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	task_unlock(task);
 	return err;
 }
+EXPORT_SYMBOL_GPL(set_task_ioprio);
 
 asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 {
-- 
cgit v1.2.3


From 4ec110281379826c5cf6ed14735e47027c3c5765 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 6 Jan 2009 14:53:26 -0500
Subject: ext4: Add sanity checks for the superblock before mounting the
 filesystem

This avoids insane superblock configurations that could lead to kernel
oops due to null pointer dereferences.

http://bugzilla.kernel.org/show_bug.cgi?id=12371

Thanks to David Maciejak at Fortinet's FortiGuard Global Security
Research Team who discovered this bug independently (but at
approximately the same time) as Thiemo Nagel, who submitted the patch.

Signed-off-by: Thiemo Nagel <thiemo.nagel@ph.tum.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/super.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8ff8709828f..517c90ad25b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2041,8 +2041,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	const char *descr;
 	int ret = -EINVAL;
 	int blocksize;
-	int db_count;
-	int i;
+	unsigned int db_count;
+	unsigned int i;
 	int needs_recovery, has_huge_files;
 	int features;
 	__u64 blocks_count;
@@ -2331,20 +2331,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 
-	/* ensure blocks_count calculation below doesn't sign-extend */
-	if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
-	    le32_to_cpu(es->s_first_data_block) + 1) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
-		       "first data block %u, blocks per group %lu\n",
-			ext4_blocks_count(es),
-			le32_to_cpu(es->s_first_data_block),
-			EXT4_BLOCKS_PER_GROUP(sb));
+        /*
+         * It makes no sense for the first data block to be beyond the end
+         * of the filesystem.
+         */
+        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+		       "block %u is beyond end of filesystem (%llu)\n",
+		       le32_to_cpu(es->s_first_data_block),
+		       ext4_blocks_count(es));
 		goto failed_mount;
 	}
 	blocks_count = (ext4_blocks_count(es) -
 			le32_to_cpu(es->s_first_data_block) +
 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
+	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
+		printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+		       "(block count %llu, first data block %u, "
+		       "blocks per group %lu)\n", sbi->s_groups_count,
+		       ext4_blocks_count(es),
+		       le32_to_cpu(es->s_first_data_block),
+		       EXT4_BLOCKS_PER_GROUP(sb));
+		goto failed_mount;
+	}
 	sbi->s_groups_count = blocks_count;
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
-- 
cgit v1.2.3


From abda14189251563a50f56da5ea2e37e904ac4cba Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 6 Jan 2009 00:20:32 -0500
Subject: ext4: Make printk's consistently prefixed with "EXT4-fs: "

Previously, some were "ext4: ", and some were "EXT4: "; change them to
be consistent with most ext4 printk's, which is to use "EXT4-fs: ".

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c   |  2 +-
 fs/ext4/super.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index cf3ccf4a94b..2df2e40b01a 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent,
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "ext4: call_filldir: called with "
+		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
 		       "null fname?!?\n");
 		return 0;
 	}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 517c90ad25b..b69d0920386 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -505,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n",
+	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -2485,7 +2485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext4_blocks_count(es) > 0xffffffffULL &&
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
-		printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n");
+		printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
 		goto failed_mount4;
 	}
 
@@ -2766,7 +2766,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 
 	if (bd_claim(bdev, sb)) {
 		printk(KERN_ERR
-			"EXT4: failed to claim external journal device.\n");
+			"EXT4-fs: failed to claim external journal device.\n");
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
@@ -2949,7 +2949,7 @@ static void ext4_commit_super(struct super_block *sb,
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "ext4: previous I/O error to "
+		printk(KERN_ERR "EXT4-fs: previous I/O error to "
 		       "superblock detected for %s.\n", sb->s_id);
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
@@ -2965,7 +2965,7 @@ static void ext4_commit_super(struct super_block *sb,
 	if (sync) {
 		sync_dirty_buffer(sbh);
 		if (buffer_write_io_error(sbh)) {
-			printk(KERN_ERR "ext4: I/O error while writing "
+			printk(KERN_ERR "EXT4-fs: I/O error while writing "
 			       "superblock for %s.\n", sb->s_id);
 			clear_buffer_write_io_error(sbh);
 			set_buffer_uptodate(sbh);
-- 
cgit v1.2.3


From 025dfdafe77f20b3890981a394774baab7b9c827 Mon Sep 17 00:00:00 2001
From: Frederik Schwarzer <schwarzerf@gmail.com>
Date: Thu, 16 Oct 2008 19:02:37 +0200
Subject: trivial: fix then -> than typos in comments and documentation

- (better, more, bigger ...) then -> (...) than

Signed-off-by: Frederik Schwarzer <schwarzerf@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 fs/proc/task_nommu.c         | 2 +-
 fs/ubifs/Kconfig             | 2 +-
 fs/ubifs/budget.c            | 4 ++--
 fs/ubifs/gc.c                | 2 +-
 fs/ubifs/journal.c           | 2 +-
 fs/ubifs/shrinker.c          | 2 +-
 fs/xfs/linux-2.6/xfs_super.c | 2 +-
 8 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 6ebaa58e2c0..04697ba7f73 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -854,7 +854,7 @@ static int o2hb_thread(void *data)
 
 	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
 		/* We track the time spent inside
-		 * o2hb_do_disk_heartbeat so that we avoid more then
+		 * o2hb_do_disk_heartbeat so that we avoid more than
 		 * hr_timeout_ms between disk writes. On busy systems
 		 * this should result in a heartbeat which is less
 		 * likely to time itself out. */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 219bd79ea89..d4a8be32b90 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -9,7 +9,7 @@
 
 /*
  * Logic: we've got two memory sums for each process, "shared", and
- * "non-shared". Shared memory may get counted more then once, for
+ * "non-shared". Shared memory may get counted more than once, for
  * each process that owns it. Non-shared memory is counted
  * accurately.
  */
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 91ceeda7e5b..e35b54d5059 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB
 	depends on UBIFS_FS
 	default y
 	help
-	  Zlib copresses better then LZO but it is slower. Say 'Y' if unsure.
+	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
 
 # Debugging-related stuff
 config UBIFS_FS_DEBUG
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 0e5e54d8292..175f9c590b7 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c)
  *
  * This function is called when an operation cannot be budgeted because there
  * is supposedly no free space. But in most cases there is some free space:
- *   o budgeting is pessimistic, so it always budgets more then it is actually
+ *   o budgeting is pessimistic, so it always budgets more than it is actually
  *     needed, so shrinking the liability is one way to make free space - the
  *     cached data will take less space then it was budgeted for;
  *   o GC may turn some dark space into free space (budgeting treats dark space
@@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
  * @c: UBIFS file-system description object
  *
  * This function converts budget which was allocated for a new page of data to
- * the budget of changing an existing page of data. The latter is smaller then
+ * the budget of changing an existing page of data. The latter is smaller than
  * the former, so this function only does simple re-calculation and does not
  * involve any write-back.
  */
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 0bef6501d58..9832f9abe28 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -45,7 +45,7 @@
 #define SMALL_NODE_WM  UBIFS_MAX_DENT_NODE_SZ
 
 /*
- * GC may need to move more then one LEB to make progress. The below constants
+ * GC may need to move more than one LEB to make progress. The below constants
  * define "soft" and "hard" limits on the number of LEBs the garbage collector
  * may move.
  */
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 10ae25b7d1d..9b7c54e0cd2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -191,7 +191,7 @@ again:
 	if (wbuf->lnum != -1 && avail >= len) {
 		/*
 		 * Someone else has switched the journal head and we have
-		 * enough space now. This happens when more then one process is
+		 * enough space now. This happens when more than one process is
 		 * trying to write to the same journal head at the same time.
 		 */
 		dbg_jnl("return LEB %d back, already have LEB %d:%d",
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f248533841a..e7bab52a141 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
  * @contention: if any contention, this is set to %1
  *
  * This function walks the list of mounted UBIFS file-systems and frees clean
- * znodes which are older then @age, until at least @nr znodes are freed.
+ * znodes which are older than @age, until at least @nr znodes are freed.
  * Returns the number of freed znodes.
  */
 static int shrink_tnc_trees(int nr, int age, int *contention)
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 36f6cc703ef..be846d606ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1348,7 +1348,7 @@ xfs_finish_flags(
 {
 	int			ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
 
-	/* Fail a mount where the logbuf is smaller then the log stripe */
+	/* Fail a mount where the logbuf is smaller than the log stripe */
 	if (xfs_sb_version_haslogv2(&mp->m_sb)) {
 		if (mp->m_logbsize <= 0 &&
 		    mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
-- 
cgit v1.2.3


From 0211a9c8508b2183e0e539509aad60414f1c3813 Mon Sep 17 00:00:00 2001
From: Frederik Schwarzer <schwarzerf@gmail.com>
Date: Mon, 29 Dec 2008 22:14:56 +0100
Subject: trivial: fix an -> a typos in documentation and comments

It is always "an" if there is a vowel _spoken_ (not written).
So it is:
"an hour" (spoken vowel)
but
"a uniform" (spoken 'j')

Signed-off-by: Frederik Schwarzer <schwarzerf@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 fs/ncpfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 6d04e050c74..f54360f50a9 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl
 {
 	s32		auth_type;
 	u32		object_name_len;
-	compat_caddr_t	object_name;	/* an userspace data, in most cases user name */
+	compat_caddr_t	object_name;	/* a userspace data, in most cases user name */
 };
 
 struct compat_ncp_fs_info_v2 {
-- 
cgit v1.2.3


From 9ca03b997f71787e345951e6267fbd8eba14d49f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Jan 2009 09:38:55 -0500
Subject: Btrfs: drop remaining LINUX_KERNEL_VERSION checks and compat code

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h      | 25 -------------------------
 fs/btrfs/extent-tree.c |  4 ----
 2 files changed, 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index 594d60bdd3c..7c4503ef6ef 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -4,29 +4,4 @@
 #define btrfs_drop_nlink(inode) drop_nlink(inode)
 #define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 27)
-static inline struct dentry *d_obtain_alias(struct inode *inode)
-{
-	struct dentry *d;
-
-	if (!inode)
-		return NULL;
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	d = d_alloc_anon(inode);
-	if (!d)
-		iput(inode);
-	return d;
-}
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
-# define  __pagevec_lru_add_file __pagevec_lru_add
-# define open_bdev_exclusive open_bdev_excl
-# define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
-typedef unsigned __bitwise__ fmode_t;
-#endif
-
-
 #endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ec43fa526d7..171ca30a375 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -869,11 +869,7 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
-#else
-	blkdev_issue_discard(bdev, start >> 9, len >> 9);
-#endif
 }
 #endif
 
-- 
cgit v1.2.3


From 180591bcfed1a2cec048abb21d3dab840625caab Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 6 Jan 2009 09:58:06 -0500
Subject: Btrfs: Use btrfs_join_transaction to avoid deadlocks during snapshot
 creation

Snapshot creation happens at a specific time during transaction commit.  We
need to make sure the code called by snapshot creation doesn't wait
for the running transaction to commit.

This changes btrfs_delete_inode and finish_pending_snaps to use
btrfs_join_transaction instead of btrfs_start_transaction to avoid deadlocks.

It would be better if btrfs_delete_inode didn't use the join, but the
call path that triggers it is:

btrfs_commit_transaction->create_pending_snapshots->
create_pending_snapshot->btrfs_lookup_dentry->
fixup_tree_root_location->btrfs_read_fs_root->
btrfs_read_fs_root_no_name->btrfs_orphan_cleanup->iput

This will be fixed in a later patch by moving the orphan cleanup to the
cleaner thread.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c       | 2 +-
 fs/btrfs/transaction.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b35ea63b6c..c0ca9c3723c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2886,7 +2886,7 @@ void btrfs_delete_inode(struct inode *inode)
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	btrfs_i_size_write(inode, 0);
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
 	ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 56ab1f5ea11..8a08f944334 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -800,7 +800,7 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
 
 	parent_inode = pending->dentry->d_parent->d_inode;
 	parent_root = BTRFS_I(parent_inode)->root;
-	trans = btrfs_start_transaction(parent_root, 1);
+	trans = btrfs_join_transaction(parent_root, 1);
 
 	/*
 	 * insert the directory item
-- 
cgit v1.2.3


From 1ba12553f3600ffebad226c5204ab0e46df98161 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 6 Jan 2009 09:58:02 -0500
Subject: Btrfs: don't change file extent's ram_bytes in btrfs_drop_extents

btrfs_drop_extents doesn't change a file extent's ram_bytes
in the case of a bookend extent. To be consistent, we should
also not change ram_bytes when truncating an existing extent.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/file.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e3a13a4565..90268334145 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -556,10 +556,6 @@ next_slot:
 					inode_sub_bytes(inode, old_num -
 							new_num);
 				}
-				if (!compression && !encryption) {
-					btrfs_set_file_extent_ram_bytes(leaf,
-							extent, new_num);
-				}
 				btrfs_set_file_extent_num_bytes(leaf,
 							extent, new_num);
 				btrfs_mark_buffer_dirty(leaf);
-- 
cgit v1.2.3


From 07d400a6df4767a90d49a153fdb7f4cfa1e3f23e Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Tue, 6 Jan 2009 11:42:00 -0500
Subject: Btrfs: tree logging checksum fixes

This patch contains the following things.

1) Limit the max size of btrfs_ordered_sum structure to PAGE_SIZE.  This
struct is kmalloced so we want to keep it reasonable.

2) Replace copy_extent_csums by btrfs_lookup_csums_range.  This was
duplicated code in tree-log.c

3) Remove replay_one_csum. csum items are replayed at the same time as
   replaying file extents. This guarantees we only replay useful csums.

4) nbytes accounting fix.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/extent-tree.c |   2 +-
 fs/btrfs/file-item.c   |  62 ++++++-----
 fs/btrfs/inode.c       |   5 +-
 fs/btrfs/tree-log.c    | 293 +++++++++++++++----------------------------------
 4 files changed, 130 insertions(+), 232 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 171ca30a375..293da650873 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5579,7 +5579,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
 
 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
-	ret = btrfs_lookup_csums_range(root, disk_bytenr,
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
 				       disk_bytenr + len - 1, &list);
 
 	while (!list_empty(&list)) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b11abfad81a..964652435fd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -27,6 +27,12 @@
 #define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
+
+#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
+				   sizeof(struct btrfs_ordered_sum)) / \
+				   sizeof(struct btrfs_sector_sum) * \
+				   (r)->sectorsize - (r)->sectorsize)
+
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -259,8 +265,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	key.offset = start;
 	key.type = BTRFS_EXTENT_CSUM_KEY;
 
-	ret = btrfs_search_slot(NULL, root->fs_info->csum_root,
-				&key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto fail;
 	if (ret > 0 && path->slots[0] > 0) {
@@ -279,7 +284,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	while (start <= end) {
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
-			ret = btrfs_next_leaf(root->fs_info->csum_root, path);
+			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto fail;
 			if (ret > 0)
@@ -306,33 +311,38 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			continue;
 		}
 
-		size = min(csum_end, end + 1) - start;
-		sums = kzalloc(btrfs_ordered_sum_size(root, size), GFP_NOFS);
-		BUG_ON(!sums);
+		csum_end = min(csum_end, end + 1);
+		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				      struct btrfs_csum_item);
+		while (start < csum_end) {
+			size = min_t(size_t, csum_end - start,
+					MAX_ORDERED_SUM_BYTES(root));
+			sums = kzalloc(btrfs_ordered_sum_size(root, size),
+					GFP_NOFS);
+			BUG_ON(!sums);
 
-		sector_sum = sums->sums;
-		sums->bytenr = start;
-		sums->len = size;
+			sector_sum = sums->sums;
+			sums->bytenr = start;
+			sums->len = size;
 
-		offset = (start - key.offset) >>
-			 root->fs_info->sb->s_blocksize_bits;
-		offset *= csum_size;
+			offset = (start - key.offset) >>
+				root->fs_info->sb->s_blocksize_bits;
+			offset *= csum_size;
 
-		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				      struct btrfs_csum_item);
-		while (size > 0) {
-			read_extent_buffer(path->nodes[0], &sector_sum->sum,
-					   ((unsigned long)item) + offset,
-					   csum_size);
-			sector_sum->bytenr = start;
-
-			size -= root->sectorsize;
-			start += root->sectorsize;
-			offset += csum_size;
-			sector_sum++;
+			while (size > 0) {
+				read_extent_buffer(path->nodes[0],
+						&sector_sum->sum,
+						((unsigned long)item) +
+						offset, csum_size);
+				sector_sum->bytenr = start;
+
+				size -= root->sectorsize;
+				start += root->sectorsize;
+				offset += csum_size;
+				sector_sum++;
+			}
+			list_add_tail(&sums->list, list);
 		}
-		list_add_tail(&sums->list, list);
-
 		path->slots[0]++;
 	}
 	ret = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0ca9c3723c..4e57fe68e4b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -157,7 +157,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 	key.objectid = inode->i_ino;
 	key.offset = start;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-	inode_add_bytes(inode, size);
 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
 
 	inode_add_bytes(inode, size);
@@ -920,8 +919,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
 	struct btrfs_ordered_sum *sums;
 	LIST_HEAD(list);
 
-	ret = btrfs_lookup_csums_range(root, bytenr, bytenr + num_bytes - 1,
-				       &list);
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+				       bytenr + num_bytes - 1, &list);
 	if (ret == 0 && list_empty(&list))
 		return 0;
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3a72a1b6c24..332ec35d2c0 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -433,49 +433,6 @@ insert:
 						   trans->transid);
 		}
 	}
-
-	if (overwrite_root &&
-	    key->type == BTRFS_EXTENT_DATA_KEY) {
-		int extent_type;
-		struct btrfs_file_extent_item *fi;
-
-		fi = (struct btrfs_file_extent_item *)dst_ptr;
-		extent_type = btrfs_file_extent_type(path->nodes[0], fi);
-		if (extent_type == BTRFS_FILE_EXTENT_REG ||
-		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
-			struct btrfs_key ins;
-			ins.objectid = btrfs_file_extent_disk_bytenr(
-							path->nodes[0], fi);
-			ins.offset = btrfs_file_extent_disk_num_bytes(
-							path->nodes[0], fi);
-			ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-			/*
-			 * is this extent already allocated in the extent
-			 * allocation tree?  If so, just add a reference
-			 */
-			ret = btrfs_lookup_extent(root, ins.objectid,
-						  ins.offset);
-			if (ret == 0) {
-				ret = btrfs_inc_extent_ref(trans, root,
-						ins.objectid, ins.offset,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid);
-			} else {
-				/*
-				 * insert the extent pointer in the extent
-				 * allocation tree
-				 */
-				ret = btrfs_alloc_logged_extent(trans, root,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						&ins);
-				BUG_ON(ret);
-			}
-		}
-	}
 no_copy:
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_release_path(root, path);
@@ -530,6 +487,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	u64 extent_end;
 	u64 alloc_hint;
 	u64 start = key->offset;
+	u64 saved_nbytes;
 	struct btrfs_file_extent_item *item;
 	struct inode *inode = NULL;
 	unsigned long size;
@@ -591,17 +549,95 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(root, path);
 
+	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
 	ret = btrfs_drop_extents(trans, root, inode,
 			 start, extent_end, start, &alloc_hint);
 	BUG_ON(ret);
 
-	/* insert the extent */
-	ret = overwrite_item(trans, root, path, eb, slot, key);
-	BUG_ON(ret);
+	if (found_type == BTRFS_FILE_EXTENT_REG ||
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		unsigned long dest_offset;
+		struct btrfs_key ins;
+
+		ret = btrfs_insert_empty_item(trans, root, path, key,
+					      sizeof(*item));
+		BUG_ON(ret);
+		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
+						    path->slots[0]);
+		copy_extent_buffer(path->nodes[0], eb, dest_offset,
+				(unsigned long)item,  sizeof(*item));
+
+		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+		if (ins.objectid > 0) {
+			u64 csum_start;
+			u64 csum_end;
+			LIST_HEAD(ordered_sums);
+			/*
+			 * is this extent already allocated in the extent
+			 * allocation tree?  If so, just add a reference
+			 */
+			ret = btrfs_lookup_extent(root, ins.objectid,
+						ins.offset);
+			if (ret == 0) {
+				ret = btrfs_inc_extent_ref(trans, root,
+						ins.objectid, ins.offset,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid);
+			} else {
+				/*
+				 * insert the extent pointer in the extent
+				 * allocation tree
+				 */
+				ret = btrfs_alloc_logged_extent(trans, root,
+						path->nodes[0]->start,
+						root->root_key.objectid,
+						trans->transid, key->objectid,
+						&ins);
+				BUG_ON(ret);
+			}
+			btrfs_release_path(root, path);
+
+			if (btrfs_file_extent_compression(eb, item)) {
+				csum_start = ins.objectid;
+				csum_end = csum_start + ins.offset;
+			} else {
+				csum_start = ins.objectid +
+					btrfs_file_extent_offset(eb, item);
+				csum_end = csum_start +
+					btrfs_file_extent_num_bytes(eb, item);
+			}
+
+			ret = btrfs_lookup_csums_range(root->log_root,
+						csum_start, csum_end - 1,
+						&ordered_sums);
+			BUG_ON(ret);
+			while (!list_empty(&ordered_sums)) {
+				struct btrfs_ordered_sum *sums;
+				sums = list_entry(ordered_sums.next,
+						struct btrfs_ordered_sum,
+						list);
+				ret = btrfs_csum_file_blocks(trans,
+						root->fs_info->csum_root,
+						sums);
+				BUG_ON(ret);
+				list_del(&sums->list);
+				kfree(sums);
+			}
+		} else {
+			btrfs_release_path(root, path);
+		}
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+		/* inline extents are easy, we just overwrite them */
+		ret = overwrite_item(trans, root, path, eb, slot, key);
+		BUG_ON(ret);
+	}
 
-	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
-	inode_add_bytes(inode, extent_end - start);
+	inode_set_bytes(inode, saved_nbytes);
 	btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
@@ -902,70 +938,6 @@ out_nowrite:
 	return 0;
 }
 
-/*
- * replay one csum item from the log tree into the subvolume 'root'
- * eb, slot and key all refer to the log tree
- * path is for temp use by this function and should be released on return
- *
- * This copies the checksums out of the log tree and inserts them into
- * the subvolume.  Any existing checksums for this range in the file
- * are overwritten, and new items are added where required.
- *
- * We keep this simple by reusing the btrfs_ordered_sum code from
- * the data=ordered mode.  This basically means making a copy
- * of all the checksums in ram, which we have to do anyway for kmap
- * rules.
- *
- * The copy is then sent down to btrfs_csum_file_blocks, which
- * does all the hard work of finding existing items in the file
- * or adding new ones.
- */
-static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct extent_buffer *eb, int slot,
-				      struct btrfs_key *key)
-{
-	int ret;
-	u32 item_size = btrfs_item_size_nr(eb, slot);
-	u64 cur_offset;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
-	unsigned long file_bytes;
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	unsigned long ptr;
-
-	file_bytes = (item_size / csum_size) * root->sectorsize;
-	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
-	if (!sums)
-		return -ENOMEM;
-
-	INIT_LIST_HEAD(&sums->list);
-	sums->len = file_bytes;
-	sums->bytenr = key->offset;
-
-	/*
-	 * copy all the sums into the ordered sum struct
-	 */
-	sector_sum = sums->sums;
-	cur_offset = key->offset;
-	ptr = btrfs_item_ptr_offset(eb, slot);
-	while (item_size > 0) {
-		sector_sum->bytenr = cur_offset;
-		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
-		sector_sum++;
-		item_size -= csum_size;
-		ptr += csum_size;
-		cur_offset += root->sectorsize;
-	}
-
-	/* let btrfs_csum_file_blocks add them into the file */
-	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
-	BUG_ON(ret);
-	kfree(sums);
-	return 0;
-}
 /*
  * There are a few corners where the link count of the file can't
  * be properly maintained during replay.  So, instead of adding
@@ -1659,10 +1631,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
 			BUG_ON(ret);
-		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
-			ret = replay_one_csum(wc->trans, root, path,
-					      eb, i, &key);
-			BUG_ON(ret);
 		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
 			   key.type == BTRFS_DIR_INDEX_KEY) {
 			ret = replay_one_dir_item(wc->trans, root, path,
@@ -2021,7 +1989,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 		.process_func = process_one_buffer
 	};
 
-	if (!root->log_root)
+	if (!root->log_root || root->fs_info->log_root_recovering)
 		return 0;
 
 	log = root->log_root;
@@ -2453,86 +2421,6 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
-				      struct list_head *list,
-				      struct btrfs_root *root,
-				      u64 disk_bytenr, u64 len)
-{
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	int ret;
-	struct btrfs_path *path;
-	struct btrfs_csum_item *item = NULL;
-	u64 end = disk_bytenr + len;
-	u64 item_start_offset = 0;
-	u64 item_last_offset = 0;
-	u32 diff;
-	u32 sum;
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
-
-	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
-
-	sector_sum = sums->sums;
-	sums->bytenr = disk_bytenr;
-	sums->len = len;
-	list_add_tail(&sums->list, list);
-
-	path = btrfs_alloc_path();
-	while (disk_bytenr < end) {
-		if (!item || disk_bytenr < item_start_offset ||
-		    disk_bytenr >= item_last_offset) {
-			struct btrfs_key found_key;
-			u32 item_size;
-
-			if (item)
-				btrfs_release_path(root, path);
-			item = btrfs_lookup_csum(NULL, root, path,
-						 disk_bytenr, 0);
-			if (IS_ERR(item)) {
-				ret = PTR_ERR(item);
-				if (ret == -ENOENT || ret == -EFBIG)
-					ret = 0;
-				sum = 0;
-				printk(KERN_INFO "log no csum found for "
-				       "byte %llu\n",
-				       (unsigned long long)disk_bytenr);
-				item = NULL;
-				btrfs_release_path(root, path);
-				goto found;
-			}
-			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-					      path->slots[0]);
-
-			item_start_offset = found_key.offset;
-			item_size = btrfs_item_size_nr(path->nodes[0],
-						       path->slots[0]);
-			item_last_offset = item_start_offset +
-				(item_size / csum_size) *
-				root->sectorsize;
-			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					      struct btrfs_csum_item);
-		}
-		/*
-		 * this byte range must be able to fit inside
-		 * a single leaf so it will also fit inside a u32
-		 */
-		diff = disk_bytenr - item_start_offset;
-		diff = diff / root->sectorsize;
-		diff = diff * csum_size;
-
-		read_extent_buffer(path->nodes[0], &sum,
-				   ((unsigned long)item) + diff,
-				   csum_size);
-found:
-		sector_sum->bytenr = disk_bytenr;
-		sector_sum->sum = sum;
-		disk_bytenr += root->sectorsize;
-		sector_sum++;
-	}
-	btrfs_free_path(path);
-	return 0;
-}
-
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *log,
 			       struct btrfs_path *dst_path,
@@ -2622,10 +2510,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   trans->transid,
 						   ins_keys[i].objectid);
 					BUG_ON(ret);
-					ret = copy_extent_csums(trans,
-						&ordered_sums,
-						log->fs_info->csum_root,
-						ds + cs, cl);
+					ret = btrfs_lookup_csums_range(
+						   log->fs_info->csum_root,
+						   ds + cs, ds + cs + cl - 1,
+						   &ordered_sums);
 					BUG_ON(ret);
 				}
 			}
@@ -2942,9 +2830,9 @@ again:
 		tmp_key.offset = (u64)-1;
 
 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
-
 		BUG_ON(!wc.replay_dest);
 
+		wc.replay_dest->log_root = log;
 		btrfs_record_root_in_trans(wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
@@ -2961,6 +2849,7 @@ again:
 		}
 
 		key.offset = found_key.offset - 1;
+		wc.replay_dest->log_root = NULL;
 		free_extent_buffer(log->node);
 		kfree(log);
 
-- 
cgit v1.2.3


From b3d47676d474ecd914c72049c87e71e5f0ffe040 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 20 Oct 2008 13:01:59 -0400
Subject: nfsd: update fh_verify description

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsfh.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index f0da7d9c3a9..019a8a20184 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -258,14 +258,32 @@ out:
 	return error;
 }
 
-/*
- * Perform sanity checks on the dentry in a client's file handle.
+/**
+ * fh_verify - filehandle lookup and access checking
+ * @rqstp: pointer to current rpc request
+ * @fhp: filehandle to be verified
+ * @type: expected type of object pointed to by filehandle
+ * @access: type of access needed to object
+ *
+ * Look up a dentry from the on-the-wire filehandle, check the client's
+ * access to the export, and set the current task's credentials.
+ *
+ * Regardless of success or failure of fh_verify(), fh_put() should be
+ * called on @fhp when the caller is finished with the filehandle.
+ *
+ * fh_verify() may be called multiple times on a given filehandle, for
+ * example, when processing an NFSv4 compound.  The first call will look
+ * up a dentry using the on-the-wire filehandle.  Subsequent calls will
+ * skip the lookup and just perform the other checks and possibly change
+ * the current task's credentials.
  *
- * Note that the file handle dentry may need to be freed even after
- * an error return.
+ * @type specifies the type of object expected using one of the S_IF*
+ * constants defined in include/linux/stat.h.  The caller may use zero
+ * to indicate that it doesn't care, or a negative integer to indicate
+ * that it expects something not of the given type.
  *
- * This is only called at the start of an nfsproc call, so fhp points to
- * a svc_fh which is all 0 except for the over-the-wire file handle.
+ * @access is formed from the NFSD_MAY_* constants defined in
+ * include/linux/nfsd/nfsd.h.
  */
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
-- 
cgit v1.2.3


From 9346eff0dea1e5855fba25c9fe639d92a4db3135 Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 20 Oct 2008 11:44:28 +0530
Subject: nfsd: Minor cleanup of find_stateid

Minor cleanup/rewrite of find_stateid. Compile tested.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 13e0e074dbb..06b89df9221 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2435,13 +2435,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
 static struct nfs4_stateid *
 find_stateid(stateid_t *stid, int flags)
 {
-	struct nfs4_stateid *local = NULL;
+	struct nfs4_stateid *local;
 	u32 st_id = stid->si_stateownerid;
 	u32 f_id = stid->si_fileid;
 	unsigned int hashval;
 
 	dprintk("NFSD: find_stateid flags 0x%x\n",flags);
-	if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+	if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) {
 		hashval = stateid_hashval(st_id, f_id);
 		list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) {
 			if ((local->st_stateid.si_stateownerid == st_id) &&
@@ -2449,7 +2449,8 @@ find_stateid(stateid_t *stid, int flags)
 				return local;
 		}
 	} 
-	if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) {
+
+	if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) {
 		hashval = stateid_hashval(st_id, f_id);
 		list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) {
 			if ((local->st_stateid.si_stateownerid == st_id) &&
-- 
cgit v1.2.3


From 2bd9e7b62e6e1da3f881c40c73d93e9a212ce6de Mon Sep 17 00:00:00 2001
From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Mon, 20 Oct 2008 11:47:09 +0530
Subject: nfsd: Fix leaked memory in nfs4_make_rec_clidname

cksum.data is not freed up in one error case. Compile tested.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 0f9d6efaa62..74f7b67567f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 
 	md5_to_hex(dname, cksum.data);
 
-	kfree(cksum.data);
 	status = nfs_ok;
 out:
+	kfree(cksum.data);
 	crypto_free_hash(desc.tfm);
 out_no_tfm:
 	return status;
-- 
cgit v1.2.3


From c72a476b4b7ecadb80185de31236edb303c1a5d0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 20 Oct 2008 11:51:58 -0400
Subject: lockd: set svc_serv->sv_maxconn to a more reasonable value (try #3)

The default method for calculating the number of connections allowed
per RPC service arbitrarily limits single-threaded services to 80
connections. This is too low for services like lockd and artificially
limits the number of TCP clients that it can support.

Have lockd set a default sv_maxconn value to 1024 (which is the typical
default value for RLIMIT_NOFILE. Also add a module parameter to allow an
admin to set this to an arbitrary value.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Neil Brown <neilb@suse.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 252d80163d0..bc3c3cb62db 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -62,6 +62,9 @@ static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
 static int			nlm_udpport, nlm_tcpport;
 int				nsm_use_hostnames = 0;
 
+/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
+static unsigned int		nlm_max_connections = 1024;
+
 /*
  * Constants needed for the sysctl interface.
  */
@@ -143,6 +146,9 @@ lockd(void *vrqstp)
 		long timeout = MAX_SCHEDULE_TIMEOUT;
 		RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
 
+		/* update sv_maxconn if it has changed */
+		rqstp->rq_server->sv_maxconn = nlm_max_connections;
+
 		if (signalled()) {
 			flush_signals(current);
 			if (nlmsvc_ops) {
@@ -276,6 +282,7 @@ int lockd_up(void)
 	}
 
 	svc_sock_update_bufs(serv);
+	serv->sv_maxconn = nlm_max_connections;
 
 	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
@@ -485,6 +492,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int,
 module_param_call(nlm_tcpport, param_set_port, param_get_int,
 		  &nlm_tcpport, 0644);
 module_param(nsm_use_hostnames, bool, 0644);
+module_param(nlm_max_connections, uint, 0644);
 
 /*
  * Initialising and terminating the module.
-- 
cgit v1.2.3


From 1df40b609ad5a622904eb652109c287fe9c93ec5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:19:53 -0500
Subject: NLM: Remove address eye-catcher buffers from nlm_host

The h_name field in struct nlm_host is just a copy of
h_nsmhandle->sm_name.  Likewise, the contents of the h_addrbuf field
should be identical to the sm_addrbuf field.

The h_srcaddrbuf field is used only in one place for debugging.  We can
live without this until we get %pI formatting for printk().

Currently these buffers are 48 bytes, but we need to support scope IDs
in IPv6 presentation addresses, which means making the buffers even
larger.  Instead, let's find ways to eliminate them to save space.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index abdebf76b82..33bf67af7ab 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -206,6 +206,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		goto out;
 	}
 	host->h_name	   = nsm->sm_name;
+	host->h_addrbuf    = nsm->sm_addrbuf;
 	memcpy(nlm_addr(host), ni->sap, ni->salen);
 	host->h_addrlen = ni->salen;
 	nlm_clear_port(nlm_addr(host));
@@ -232,11 +233,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 
 	nrhosts++;
 
-	nlm_display_address((struct sockaddr *)&host->h_addr,
-				host->h_addrbuf, sizeof(host->h_addrbuf));
-	nlm_display_address((struct sockaddr *)&host->h_srcaddr,
-				host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
-
 	dprintk("lockd: nlm_lookup_host created host %s\n",
 			host->h_name);
 
@@ -378,8 +374,8 @@ nlm_bind_host(struct nlm_host *host)
 {
 	struct rpc_clnt	*clnt;
 
-	dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
-			host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
+	dprintk("lockd: nlm_bind_host %s (%s)\n",
+			host->h_name, host->h_addrbuf);
 
 	/* Lock host handle */
 	mutex_lock(&host->h_mutex);
-- 
cgit v1.2.3


From 6999fb4016b2604c2f8a65586bba4a62a4b24ce7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:01 -0500
Subject: NLM: Remove AF_UNSPEC arm in nlm_display_address()

AF_UNSPEC support is no longer needed in nlm_display_address() now
that a presentation address is no longer generated for the h_srcaddr
field.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 33bf67af7ab..beb5da81016 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -112,9 +112,6 @@ static void nlm_display_address(const struct sockaddr *sap,
 	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
 
 	switch (sap->sa_family) {
-	case AF_UNSPEC:
-		snprintf(buf, len, "unspecified");
-		break;
 	case AF_INET:
 		snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
 		break;
-- 
cgit v1.2.3


From bc995801a09d1fead0bec1356bfd836911c8eed7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:08 -0500
Subject: NLM: Support IPv6 scope IDs in nlm_display_address()

Scope ID support is needed since the kernel's NSM implementation is
about to use these displayed addresses as a mon_name in some cases.

When nsm_use_hostnames is zero, without scope ID support NSM will fail
to handle peers that contact us via a link-local address.  Link-local
addresses do not work without an interface ID, which is stored in the
sockaddr's sin6_scope_id field.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index beb5da81016..012e49aaecd 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -105,22 +105,31 @@ static void nlm_clear_port(struct sockaddr *sap)
 	}
 }
 
+static void nlm_display_ipv6_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
+	else if (sin6->sin6_scope_id != 0)
+		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
+				sin6->sin6_scope_id);
+	else
+		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+}
+
 static void nlm_display_address(const struct sockaddr *sap,
 				char *buf, const size_t len)
 {
 	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
 
 	switch (sap->sa_family) {
 	case AF_INET:
 		snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
 		break;
 	case AF_INET6:
-		if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-			snprintf(buf, len, "%pI4",
-				 &sin6->sin6_addr.s6_addr32[3]);
-		else
-			snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+		nlm_display_ipv6_address(sap, buf, len);
 		break;
 	default:
 		snprintf(buf, len, "unsupported address family");
-- 
cgit v1.2.3


From afb03699dc0a920aed3322ad0e6895533941fb1e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:16 -0500
Subject: NLM: Add helper to handle IPv4 addresses

Clean up: introduce a helper function to generate IPv4 addresses using
the same style as the IPv6 helper function we just added.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 012e49aaecd..780918acd6f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -105,6 +105,13 @@ static void nlm_clear_port(struct sockaddr *sap)
 	}
 }
 
+static void nlm_display_ipv4_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+}
+
 static void nlm_display_ipv6_address(const struct sockaddr *sap, char *buf,
 				     const size_t len)
 {
@@ -122,11 +129,9 @@ static void nlm_display_ipv6_address(const struct sockaddr *sap, char *buf,
 static void nlm_display_address(const struct sockaddr *sap,
 				char *buf, const size_t len)
 {
-	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-
 	switch (sap->sa_family) {
 	case AF_INET:
-		snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+		nlm_display_ipv4_address(sap, buf, len);
 		break;
 	case AF_INET6:
 		nlm_display_ipv6_address(sap, buf, len);
-- 
cgit v1.2.3


From a4846750f090702e2fb848ac4fe5827bcef34060 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:23 -0500
Subject: NSM: Use C99 structure initializer to initialize nsm_args

Clean up: Use a C99 structure initializer instead of open-coding the
initialization of nsm_args.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index ffd3461f75e..6f6ff410341 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -37,7 +37,13 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 {
 	struct rpc_clnt	*clnt;
 	int		status;
-	struct nsm_args	args;
+	struct nsm_args args = {
+		.addr		= nsm_addr_in(nsm)->sin_addr.s_addr,
+		.prog		= NLM_PROGRAM,
+		.vers		= 3,
+		.proc		= NLMPROC_NSM_NOTIFY,
+		.mon_name	= nsm->sm_name,
+	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
 		.rpc_resp	= res,
@@ -49,12 +55,6 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 		goto out;
 	}
 
-	memset(&args, 0, sizeof(args));
-	args.mon_name = nsm->sm_name;
-	args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
-	args.prog = NLM_PROGRAM;
-	args.vers = 3;
-	args.proc = NLMPROC_NSM_NOTIFY;
 	memset(res, 0, sizeof(*res));
 
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
-- 
cgit v1.2.3


From 5acf43155d1bcc412d892c73f64044f9a826cde6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:31 -0500
Subject: NSM: convert printk(KERN_DEBUG) to a dprintk()

Clean up: make the printk(KERN_DEBUG) in nsm_mon_unmon() a dprintk,
and add another dprintk to note if creating an RPC client for the
upcall failed.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 6f6ff410341..497dfea02e8 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -52,6 +52,8 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	clnt = nsm_create();
 	if (IS_ERR(clnt)) {
 		status = PTR_ERR(clnt);
+		dprintk("lockd: failed to create NSM upcall transport, "
+				"status=%d\n", status);
 		goto out;
 	}
 
@@ -60,8 +62,8 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	msg.rpc_proc = &clnt->cl_procinfo[proc];
 	status = rpc_call_sync(clnt, &msg, 0);
 	if (status < 0)
-		printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n",
-			status);
+		dprintk("lockd: NSM upcall RPC failed, status=%d\n",
+				status);
 	else
 		status = 0;
 	rpc_shutdown_client(clnt);
-- 
cgit v1.2.3


From 29ed1407ed81086b778ebf12145b048ac3f7e10e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:46 -0500
Subject: NSM: Support IPv6 version of mon_name

The "mon_name" argument of the NSMPROC_MON and NSMPROC_UNMON upcalls
is a string that contains the hostname or IP address of the remote peer
to be notified when this host has rebooted.  The sm-notify command uses
this identifier to contact the peer when we reboot, so it must be
either a well-qualified DNS hostname or a presentation format IP
address string.

When the "nsm_use_hostnames" sysctl is set to zero, the kernel's NSM
provides a presentation format IP address in the "mon_name" argument.
Otherwise, the "caller_name" argument from NLM requests is used,
which is usually just the DNS hostname of the peer.

To support IPv6 addresses for the mon_name argument, we use the
nsm_handle's address eye-catcher, which already contains an appropriate
presentation format address string.  Using the eye-catcher string
obviates the need to use a large buffer on the stack to form the
presentation address string for the upcall.

This patch also addresses a subtle bug.

An NSMPROC_MON request and the subsequent NSMPROC_UNMON request for the
same peer are required to use the same value for the "mon_name"
argument.  Otherwise, rpc.statd's NSMPROC_UNMON processing cannot
locate the database entry for that peer and remove it.

If the setting of nsm_use_hostnames is changed between the time the
kernel sends an NSMPROC_MON request and the time it sends the
NSMPROC_UNMON request for the same peer, the "mon_name" argument for
these two requests may not be the same.  This is because the value of
"mon_name" is currently chosen at the moment the call is made based on
the setting of nsm_use_hostnames.

To ensure both requests pass identical contents in the "mon_name"
argument, we now select which string to use for the argument in the
nsm_monitor() function.  A pointer to this string is saved in the
nsm_handle so it can be used for a subsequent NSMPROC_UNMON upcall.

NB: There are other potential problems, such as how nlm_host_rebooted()
might behave if nsm_use_hostnames were changed while hosts are still
being monitored.  This patch does not attempt to address those
problems.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 497dfea02e8..a606fbbf804 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -18,8 +18,6 @@
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 
-#define XDR_ADDRBUF_LEN		(20)
-
 static struct rpc_clnt *	nsm_create(void);
 
 static struct rpc_program	nsm_program;
@@ -42,7 +40,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 		.prog		= NLM_PROGRAM,
 		.vers		= 3,
 		.proc		= NLMPROC_NSM_NOTIFY,
-		.mon_name	= nsm->sm_name,
+		.mon_name	= nsm->sm_mon_name,
 	};
 	struct rpc_message msg = {
 		.rpc_argp	= &args,
@@ -87,6 +85,12 @@ nsm_monitor(struct nlm_host *host)
 	if (nsm->sm_monitored)
 		return 0;
 
+	/*
+	 * Choose whether to record the caller_name or IP address of
+	 * this peer in the local rpc.statd's database.
+	 */
+	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
+
 	status = nsm_mon_unmon(nsm, SM_MON, &res);
 
 	if (status < 0 || res.status != 0)
@@ -167,25 +171,10 @@ static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
 
 /*
  * "mon_name" specifies the host to be monitored.
- *
- * Linux uses a text version of the IP address of the remote
- * host as the host identifier (the "mon_name" argument).
- *
- * Linux statd always looks up the canonical hostname first for
- * whatever remote hostname it receives, so this works alright.
  */
 static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
 {
-	char	buffer[XDR_ADDRBUF_LEN + 1];
-	char	*name = argp->mon_name;
-
-	if (!nsm_use_hostnames) {
-		snprintf(buffer, XDR_ADDRBUF_LEN,
-			 "%pI4", &argp->addr);
-		name = buffer;
-	}
-
-	return xdr_encode_nsm_string(p, name);
+	return xdr_encode_nsm_string(p, argp->mon_name);
 }
 
 /*
-- 
cgit v1.2.3


From 9fee49024ed19d849413df4ab6ec1a1a60aaae94 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:20:53 -0500
Subject: NSM: Use sm_name instead of h_name in nsm_monitor() and
 nsm_unmonitor()

Clean up: Use the sm_name field for reporting the hostname in nsm_monitor()
and nsm_unmonitor(), just as the other functions in fs/lockd/mon.c do.

The h_name field is just a copy of the sm_name pointer.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index a606fbbf804..697bdcdd20c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -79,7 +79,7 @@ nsm_monitor(struct nlm_host *host)
 	struct nsm_res	res;
 	int		status;
 
-	dprintk("lockd: nsm_monitor(%s)\n", host->h_name);
+	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
 	BUG_ON(nsm == NULL);
 
 	if (nsm->sm_monitored)
@@ -94,7 +94,7 @@ nsm_monitor(struct nlm_host *host)
 	status = nsm_mon_unmon(nsm, SM_MON, &res);
 
 	if (status < 0 || res.status != 0)
-		printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name);
+		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
 	else
 		nsm->sm_monitored = 1;
 	return status;
@@ -116,12 +116,12 @@ nsm_unmonitor(struct nlm_host *host)
 
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
-		dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name);
+		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
 		status = nsm_mon_unmon(nsm, SM_UNMON, &res);
 		if (status < 0)
 			printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
-					host->h_name);
+					nsm->sm_name);
 		else
 			nsm->sm_monitored = 0;
 	}
-- 
cgit v1.2.3


From 501c1ed3fb5c2648ba1709282c71617910917f66 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:01 -0500
Subject: NLM: Remove redundant printk() in nlmclnt_lock()

The nsm_monitor() function already generates a printk(KERN_NOTICE) if
the SM_MON upcall fails, so the similar printk() in the nlmclnt_lock()
function is redundant.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/clntproc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 31668b690e0..5ce42e0ed4a 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -518,11 +518,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 	unsigned char fl_type;
 	int status = -ENOLCK;
 
-	if (nsm_monitor(host) < 0) {
-		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
-					host->h_name);
+	if (nsm_monitor(host) < 0)
 		goto out;
-	}
+
 	fl->fl_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
 	fl->fl_flags = fl_flags;
-- 
cgit v1.2.3


From 5bc74bef7c9b652f0f2aa9c5a8d5ac86881aba79 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:08 -0500
Subject: NSM: Remove BUG_ON() in nsm_monitor()

Clean up: Remove the BUG_ON() invocation in nsm_monitor().  It's not
likely that nsm_monitor() is ever called with a NULL host pointer, and
the code will die anyway if host is NULL.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 697bdcdd20c..bb5fc1bb37f 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -80,7 +80,6 @@ nsm_monitor(struct nlm_host *host)
 	int		status;
 
 	dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
-	BUG_ON(nsm == NULL);
 
 	if (nsm->sm_monitored)
 		return 0;
-- 
cgit v1.2.3


From 5d254b119823658cc318f88589c6c426b3d0a153 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:15 -0500
Subject: NSM: Make sure to return an error if the SM_MON call result is not
 zero

The nsm_monitor() function reports an error and does not set sm_monitored
if the SM_MON upcall reply has a non-zero result code, but nsm_monitor()
does not return an error to its caller in this case.

Since sm_monitored is not set, the upcall is retried when the next NLM
request invokes nsm_monitor().  However, that may not come for a while.
In the meantime, at least one NLM request will potentially proceed
without the peer being monitored properly.

Have nsm_monitor() return an error if the result code is non-zero.
This will cause all NLM requests to fail immediately if the upcall
completed successfully but rpc.statd returned an error.

This may be inconvenient in some cases (for example if rpc.statd
cannot complete a proper DNS reverse lookup of the hostname), but will
make the reboot monitoring service more robust by forcing such issues
to be corrected by an admin.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index bb5fc1bb37f..07e16b81498 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -91,8 +91,9 @@ nsm_monitor(struct nlm_host *host)
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
 	status = nsm_mon_unmon(nsm, SM_MON, &res);
-
-	if (status < 0 || res.status != 0)
+	if (res.status != 0)
+		status = -EIO;
+	if (status < 0)
 		printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name);
 	else
 		nsm->sm_monitored = 1;
-- 
cgit v1.2.3


From 1e49323c4ab044d05bbc68cf13cadcbd4372468c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:24 -0500
Subject: NLM: Move the public declaration of nsm_monitor() to lockd.h

Clean up.

Make the nlm_host argument "const," and move the public declaration to
lockd.h with other NSM public functions (nsm_release, e.g.) and global
variable declarations.

Add a documenting comment.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 07e16b81498..aaaa08e7ae7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -69,11 +69,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	return status;
 }
 
-/*
- * Set up monitoring of a remote host
+/**
+ * nsm_monitor - Notify a peer in case we reboot
+ * @host: pointer to nlm_host of peer to notify
+ *
+ * If this peer is not already monitored, this function sends an
+ * upcall to the local rpc.statd to record the name/address of
+ * the peer to notify in case we reboot.
+ *
+ * Returns zero if the peer is monitored by the local rpc.statd;
+ * otherwise a negative errno value is returned.
  */
-int
-nsm_monitor(struct nlm_host *host)
+int nsm_monitor(const struct nlm_host *host)
 {
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
-- 
cgit v1.2.3


From c8c23c423dec49cb439697d3dc714e1500ff1610 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:31 -0500
Subject: NSM: Release nsmhandle in nlm_destroy_host

The nsm_handle's reference count is bumped in nlm_lookup_host().  It
should be decremented in nlm_destroy_host() to make it easier to see
the balance of these two operations.

Move the nsm_release() call to fs/lockd/host.c.

The h_nsmhandle pointer is set in nlm_lookup_host(), and never cleared.
The nlm_destroy_host() function is never called for the same nlm_host
twice, so h_nsmhandle won't ever be NULL when nsm_unmonitor() is
called.

All references to the nlm_host are gone before it is freed.  We can
skip making h_nsmhandle NULL just before the nlm_host is deallocated.

It's also likely we can remove the h_nsmhandle NULL check in
nlmsvc_is_client(), but we can do that later when rearchitecting
the nlm_host cache.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 8 +++-----
 fs/lockd/mon.c  | 5 -----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 780918acd6f..1d523c1a7b6 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -37,6 +37,7 @@ static struct nsm_handle	*nsm_find(const struct sockaddr *sap,
 						const char *hostname,
 						const size_t hostname_len,
 						const int create);
+static void			nsm_release(struct nsm_handle *nsm);
 
 struct nlm_lookup_host_info {
 	const int		server;		/* search for server|client */
@@ -263,10 +264,8 @@ nlm_destroy_host(struct nlm_host *host)
 	BUG_ON(!list_empty(&host->h_lockowners));
 	BUG_ON(atomic_read(&host->h_count));
 
-	/*
-	 * Release NSM handle and unmonitor host.
-	 */
 	nsm_unmonitor(host);
+	nsm_release(host->h_nsmhandle);
 
 	clnt = host->h_rpcclnt;
 	if (clnt != NULL)
@@ -711,8 +710,7 @@ found:
 /*
  * Release an NSM handle
  */
-void
-nsm_release(struct nsm_handle *nsm)
+static void nsm_release(struct nsm_handle *nsm)
 {
 	if (!nsm)
 		return;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index aaaa08e7ae7..15fab22db02 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -117,10 +117,6 @@ nsm_unmonitor(struct nlm_host *host)
 	struct nsm_res	res;
 	int		status = 0;
 
-	if (nsm == NULL)
-		return 0;
-	host->h_nsmhandle = NULL;
-
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
@@ -132,7 +128,6 @@ nsm_unmonitor(struct nlm_host *host)
 		else
 			nsm->sm_monitored = 0;
 	}
-	nsm_release(nsm);
 	return status;
 }
 
-- 
cgit v1.2.3


From 356c3eb466fd1a12afd6448d90fba3922836e5f1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:38 -0500
Subject: NLM: Move the public declaration of nsm_unmonitor() to lockd.h

Clean up.

Make the nlm_host argument "const," and move the public declaration to
lockd.h.  Add a documenting comment.

Bruce observed that nsm_unmonitor()'s only caller doesn't care about
its return code, so make nsm_unmonitor() return void.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 15fab22db02..d61cdc61cb5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -107,15 +107,19 @@ int nsm_monitor(const struct nlm_host *host)
 	return status;
 }
 
-/*
- * Cease to monitor remote host
+/**
+ * nsm_unmonitor - Unregister peer notification
+ * @host: pointer to nlm_host of peer to stop monitoring
+ *
+ * If this peer is monitored, this function sends an upcall to
+ * tell the local rpc.statd not to send this peer a notification
+ * when we reboot.
  */
-int
-nsm_unmonitor(struct nlm_host *host)
+void nsm_unmonitor(const struct nlm_host *host)
 {
 	struct nsm_handle *nsm = host->h_nsmhandle;
 	struct nsm_res	res;
-	int		status = 0;
+	int status;
 
 	if (atomic_read(&nsm->sm_count) == 1
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
@@ -128,7 +132,6 @@ nsm_unmonitor(struct nlm_host *host)
 		else
 			nsm->sm_monitored = 0;
 	}
-	return status;
 }
 
 /*
-- 
cgit v1.2.3


From 0c7aef4569f8680951b7dee01dddffb9d2f809ff Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 4 Dec 2008 14:21:46 -0500
Subject: NSM: Check result of SM_UNMON upcall

Make sure any error returned by rpc.statd during an SM_UNMON call is
reported rather than ignored completely.  There isn't much to do with
such an error, but we should log it in any case.

Similar to a recent change to nsm_monitor().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index d61cdc61cb5..3bb71e1b1e1 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -126,6 +126,8 @@ void nsm_unmonitor(const struct nlm_host *host)
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
 		status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+		if (res.status != 0)
+			status = -EIO;
 		if (status < 0)
 			printk(KERN_NOTICE "lockd: cannot unmonitor %s\n",
 					nsm->sm_name);
-- 
cgit v1.2.3


From 9c1bfd037f7ff8badaecb47418f109148d88bf45 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:01:59 -0500
Subject: NSM: Move NSM-related XDR data structures to lockd's xdr.h

Clean up: NSM's XDR data structures are used only in fs/lockd/mon.c,
so move them there.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 3bb71e1b1e1..81308832e99 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -18,6 +18,20 @@
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 
+struct nsm_args {
+	__be32			addr;		/* remote address */
+	u32			prog;		/* RPC callback info */
+	u32			vers;
+	u32			proc;
+
+	char			*mon_name;
+};
+
+struct nsm_res {
+	u32			status;
+	u32			state;
+};
+
 static struct rpc_clnt *	nsm_create(void);
 
 static struct rpc_program	nsm_program;
-- 
cgit v1.2.3


From 36e8e668d3e6a61848a8921ddeb663b417299fa5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:02:07 -0500
Subject: NSM: Move NSM program and procedure numbers to fs/lockd/mon.c

Clean up: Move the RPC program and procedure numbers for NSM into the
one source file that needs them: fs/lockd/mon.c.

And, as with NLM, NFS, and rpcbind calls, use NSMPROC_FOO instead of
SM_FOO for NSM procedure numbers.

Finally, make a couple of comments more precise: what is referred to
here as SM_NOTIFY is really the NLM (lockd) NLMPROC_SM_NOTIFY downcall,
not NSMPROC_NOTIFY.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 42 +++++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 81308832e99..0fc9836db4e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -17,6 +17,18 @@
 
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
+#define NSM_PROGRAM		100024
+#define NSM_VERSION		1
+
+enum {
+	NSMPROC_NULL,
+	NSMPROC_STAT,
+	NSMPROC_MON,
+	NSMPROC_UNMON,
+	NSMPROC_UNMON_ALL,
+	NSMPROC_SIMU_CRASH,
+	NSMPROC_NOTIFY,
+};
 
 struct nsm_args {
 	__be32			addr;		/* remote address */
@@ -42,7 +54,7 @@ static struct rpc_program	nsm_program;
 int				nsm_local_state;
 
 /*
- * Common procedure for SM_MON/SM_UNMON calls
+ * Common procedure for NSMPROC_MON/NSMPROC_UNMON calls
  */
 static int
 nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
@@ -111,7 +123,7 @@ int nsm_monitor(const struct nlm_host *host)
 	 */
 	nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
 
-	status = nsm_mon_unmon(nsm, SM_MON, &res);
+	status = nsm_mon_unmon(nsm, NSMPROC_MON, &res);
 	if (res.status != 0)
 		status = -EIO;
 	if (status < 0)
@@ -139,7 +151,7 @@ void nsm_unmonitor(const struct nlm_host *host)
 	 && nsm->sm_monitored && !nsm->sm_sticky) {
 		dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
 
-		status = nsm_mon_unmon(nsm, SM_UNMON, &res);
+		status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res);
 		if (res.status != 0)
 			status = -EIO;
 		if (status < 0)
@@ -167,7 +179,7 @@ nsm_create(void)
 		.addrsize	= sizeof(sin),
 		.servername	= "localhost",
 		.program	= &nsm_program,
-		.version	= SM_VERSION,
+		.version	= NSM_VERSION,
 		.authflavor	= RPC_AUTH_NULL,
 	};
 
@@ -201,7 +213,7 @@ static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
 /*
  * The "my_id" argument specifies the hostname and RPC procedure
  * to be called when the status manager receives notification
- * (via the SM_NOTIFY call) that the state of host "mon_name"
+ * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
  * has changed.
  */
 static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
@@ -219,7 +231,7 @@ static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
 
 /*
  * The "mon_id" argument specifies the non-private arguments
- * of an SM_MON or SM_UNMON call.
+ * of an NSMPROC_MON or NSMPROC_UNMON call.
  */
 static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
 {
@@ -232,8 +244,8 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
 
 /*
  * The "priv" argument may contain private information required
- * by the SM_MON call. This information will be supplied in the
- * SM_NOTIFY call.
+ * by the NSMPROC_MON call. This information will be supplied in the
+ * NLMPROC_SM_NOTIFY call.
  *
  * Linux provides the raw IP address of the monitored host,
  * left in network byte order.
@@ -300,22 +312,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 #define SM_unmonres_sz	1
 
 static struct rpc_procinfo	nsm_procedures[] = {
-[SM_MON] = {
-		.p_proc		= SM_MON,
+[NSMPROC_MON] = {
+		.p_proc		= NSMPROC_MON,
 		.p_encode	= (kxdrproc_t) xdr_encode_mon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
 		.p_arglen	= SM_mon_sz,
 		.p_replen	= SM_monres_sz,
-		.p_statidx	= SM_MON,
+		.p_statidx	= NSMPROC_MON,
 		.p_name		= "MONITOR",
 	},
-[SM_UNMON] = {
-		.p_proc		= SM_UNMON,
+[NSMPROC_UNMON] = {
+		.p_proc		= NSMPROC_UNMON,
 		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
 		.p_decode	= (kxdrproc_t) xdr_decode_stat,
 		.p_arglen	= SM_mon_id_sz,
 		.p_replen	= SM_unmonres_sz,
-		.p_statidx	= SM_UNMON,
+		.p_statidx	= NSMPROC_UNMON,
 		.p_name		= "UNMONITOR",
 	},
 };
@@ -334,7 +346,7 @@ static struct rpc_stat		nsm_stats;
 
 static struct rpc_program	nsm_program = {
 		.name		= "statd",
-		.number		= SM_PROGRAM,
+		.number		= NSM_PROGRAM,
 		.nrvers		= ARRAY_SIZE(nsm_version),
 		.version	= nsm_version,
 		.stats		= &nsm_stats
-- 
cgit v1.2.3


From 03eb1dcbb799304b58730f4dba65812f49fb305e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:02:15 -0500
Subject: NSM: move to xdr_stream-based XDR encoders and decoders

Introduce xdr_stream-based XDR encoder and decoder functions, which are
more careful about preventing RPC buffer overflows.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 130 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 0fc9836db4e..81e1cc14246 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -193,21 +193,26 @@ nsm_create(void)
  * Status Monitor wire protocol.
  */
 
-static __be32 *xdr_encode_nsm_string(__be32 *p, char *string)
+static int encode_nsm_string(struct xdr_stream *xdr, const char *string)
 {
-	size_t len = strlen(string);
-
-	if (len > SM_MAXSTRLEN)
-		len = SM_MAXSTRLEN;
-	return xdr_encode_opaque(p, string, len);
+	const u32 len = strlen(string);
+	__be32 *p;
+
+	if (unlikely(len > SM_MAXSTRLEN))
+		return -EIO;
+	p = xdr_reserve_space(xdr, sizeof(u32) + len);
+	if (unlikely(p == NULL))
+		return -EIO;
+	xdr_encode_opaque(p, string, len);
+	return 0;
 }
 
 /*
  * "mon_name" specifies the host to be monitored.
  */
-static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
+static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	return xdr_encode_nsm_string(p, argp->mon_name);
+	return encode_nsm_string(xdr, argp->mon_name);
 }
 
 /*
@@ -216,30 +221,35 @@ static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp)
  * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name"
  * has changed.
  */
-static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp)
+static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	p = xdr_encode_nsm_string(p, utsname()->nodename);
-	if (!p)
-		return ERR_PTR(-EIO);
-
+	int status;
+	__be32 *p;
+
+	status = encode_nsm_string(xdr, utsname()->nodename);
+	if (unlikely(status != 0))
+		return status;
+	p = xdr_reserve_space(xdr, 3 * sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
 	*p++ = htonl(argp->prog);
 	*p++ = htonl(argp->vers);
 	*p++ = htonl(argp->proc);
-
-	return p;
+	return 0;
 }
 
 /*
  * The "mon_id" argument specifies the non-private arguments
  * of an NSMPROC_MON or NSMPROC_UNMON call.
  */
-static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
+static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_name(p, argp);
-	if (!p)
-		return ERR_PTR(-EIO);
+	int status;
 
-	return xdr_encode_my_id(p, argp);
+	status = encode_mon_name(xdr, argp);
+	if (unlikely(status != 0))
+		return status;
+	return encode_my_id(xdr, argp);
 }
 
 /*
@@ -250,55 +260,71 @@ static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp)
  * Linux provides the raw IP address of the monitored host,
  * left in network byte order.
  */
-static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp)
+static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
+	__be32 *p;
+
+	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
+	if (unlikely(p == NULL))
+		return -EIO;
 	*p++ = argp->addr;
 	*p++ = 0;
 	*p++ = 0;
 	*p++ = 0;
-
-	return p;
+	return 0;
 }
 
-static int
-xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p,
+		       const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_id(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
-
-	p = xdr_encode_priv(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
+	struct xdr_stream xdr;
+	int status;
 
-	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
-	return 0;
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	status = encode_mon_id(&xdr, argp);
+	if (unlikely(status))
+		return status;
+	return encode_priv(&xdr, argp);
 }
 
-static int
-xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp)
+static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p,
+			 const struct nsm_args *argp)
 {
-	p = xdr_encode_mon_id(p, argp);
-	if (IS_ERR(p))
-		return PTR_ERR(p);
-	rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p);
-	return 0;
+	struct xdr_stream xdr;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	return encode_mon_id(&xdr, argp);
 }
 
-static int
-xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p,
+			    struct nsm_res *resp)
 {
+	struct xdr_stream xdr;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	p = xdr_inline_decode(&xdr, 2 * sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
 	resp->status = ntohl(*p++);
-	resp->state = ntohl(*p++);
-	dprintk("nsm: xdr_decode_stat_res status %d state %d\n",
+	resp->state = ntohl(*p);
+
+	dprintk("lockd: xdr_dec_stat_res status %d state %d\n",
 			resp->status, resp->state);
 	return 0;
 }
 
-static int
-xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
+static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p,
+			struct nsm_res *resp)
 {
-	resp->state = ntohl(*p++);
+	struct xdr_stream xdr;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	p = xdr_inline_decode(&xdr, sizeof(u32));
+	if (unlikely(p == NULL))
+		return -EIO;
+	resp->state = ntohl(*p);
+
+	dprintk("lockd: xdr_dec_stat state %d\n", resp->state);
 	return 0;
 }
 
@@ -314,8 +340,8 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp)
 static struct rpc_procinfo	nsm_procedures[] = {
 [NSMPROC_MON] = {
 		.p_proc		= NSMPROC_MON,
-		.p_encode	= (kxdrproc_t) xdr_encode_mon,
-		.p_decode	= (kxdrproc_t) xdr_decode_stat_res,
+		.p_encode	= (kxdrproc_t)xdr_enc_mon,
+		.p_decode	= (kxdrproc_t)xdr_dec_stat_res,
 		.p_arglen	= SM_mon_sz,
 		.p_replen	= SM_monres_sz,
 		.p_statidx	= NSMPROC_MON,
@@ -323,8 +349,8 @@ static struct rpc_procinfo	nsm_procedures[] = {
 	},
 [NSMPROC_UNMON] = {
 		.p_proc		= NSMPROC_UNMON,
-		.p_encode	= (kxdrproc_t) xdr_encode_unmon,
-		.p_decode	= (kxdrproc_t) xdr_decode_stat,
+		.p_encode	= (kxdrproc_t)xdr_enc_unmon,
+		.p_decode	= (kxdrproc_t)xdr_dec_stat,
 		.p_arglen	= SM_mon_id_sz,
 		.p_replen	= SM_unmonres_sz,
 		.p_statidx	= NSMPROC_UNMON,
-- 
cgit v1.2.3


From 67c6d107a689243979a2b5f15244b5261634a924 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:02:45 -0500
Subject: NSM: Move nsm_find() to fs/lockd/mon.c

The nsm_find() function sets up fresh nsm_handle entries.  This is
where we will store the "priv" cookie used to look up nsm_handles during
reboot recovery.  The cookie will be constructed when nsm_find()
creates a new nsm_handle.

As much as possible, I would like to keep everything that handles a
"priv" cookie in fs/lockd/mon.c so that all the smarts are in one
source file.  That organization should make it pretty simple to see how
all this works.

To me, it makes more sense than the current arrangement to keep
nsm_find() with nsm_monitor() and nsm_unmonitor().

So, start reorganizing by moving nsm_find() into fs/lockd/mon.c.  The
nsm_release() function comes along too, since it shares the nsm_lock
global variable.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 128 -----------------------------------------------------
 fs/lockd/mon.c  | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 128 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 1d523c1a7b6..dbdeaa88d2f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -32,12 +32,6 @@ static int			nrhosts;
 static DEFINE_MUTEX(nlm_host_mutex);
 
 static void			nlm_gc_hosts(void);
-static struct nsm_handle	*nsm_find(const struct sockaddr *sap,
-						const size_t salen,
-						const char *hostname,
-						const size_t hostname_len,
-						const int create);
-static void			nsm_release(struct nsm_handle *nsm);
 
 struct nlm_lookup_host_info {
 	const int		server;		/* search for server|client */
@@ -106,43 +100,6 @@ static void nlm_clear_port(struct sockaddr *sap)
 	}
 }
 
-static void nlm_display_ipv4_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
-	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
-}
-
-static void nlm_display_ipv6_address(const struct sockaddr *sap, char *buf,
-				     const size_t len)
-{
-	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
-
-	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
-		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
-	else if (sin6->sin6_scope_id != 0)
-		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
-				sin6->sin6_scope_id);
-	else
-		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
-}
-
-static void nlm_display_address(const struct sockaddr *sap,
-				char *buf, const size_t len)
-{
-	switch (sap->sa_family) {
-	case AF_INET:
-		nlm_display_ipv4_address(sap, buf, len);
-		break;
-	case AF_INET6:
-		nlm_display_ipv6_address(sap, buf, len);
-		break;
-	default:
-		snprintf(buf, len, "unsupported address family");
-		break;
-	}
-}
-
 /*
  * Common host lookup routine for server & client
  */
@@ -635,88 +592,3 @@ nlm_gc_hosts(void)
 
 	next_gc = jiffies + NLM_HOST_COLLECT;
 }
-
-
-/*
- * Manage NSM handles
- */
-static LIST_HEAD(nsm_handles);
-static DEFINE_SPINLOCK(nsm_lock);
-
-static struct nsm_handle *nsm_find(const struct sockaddr *sap,
-				   const size_t salen,
-				   const char *hostname,
-				   const size_t hostname_len,
-				   const int create)
-{
-	struct nsm_handle *nsm = NULL;
-	struct nsm_handle *pos;
-
-	if (!sap)
-		return NULL;
-
-	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
-		if (printk_ratelimit()) {
-			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
-					    "in NFS lock request\n",
-				(int)hostname_len, hostname);
-		}
-		return NULL;
-	}
-
-retry:
-	spin_lock(&nsm_lock);
-	list_for_each_entry(pos, &nsm_handles, sm_link) {
-
-		if (hostname && nsm_use_hostnames) {
-			if (strlen(pos->sm_name) != hostname_len
-			 || memcmp(pos->sm_name, hostname, hostname_len))
-				continue;
-		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
-			continue;
-		atomic_inc(&pos->sm_count);
-		kfree(nsm);
-		nsm = pos;
-		goto found;
-	}
-	if (nsm) {
-		list_add(&nsm->sm_link, &nsm_handles);
-		goto found;
-	}
-	spin_unlock(&nsm_lock);
-
-	if (!create)
-		return NULL;
-
-	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
-	if (nsm == NULL)
-		return NULL;
-
-	memcpy(nsm_addr(nsm), sap, salen);
-	nsm->sm_addrlen = salen;
-	nsm->sm_name = (char *) (nsm + 1);
-	memcpy(nsm->sm_name, hostname, hostname_len);
-	nsm->sm_name[hostname_len] = '\0';
-	nlm_display_address((struct sockaddr *)&nsm->sm_addr,
-				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
-	atomic_set(&nsm->sm_count, 1);
-	goto retry;
-
-found:
-	spin_unlock(&nsm_lock);
-	return nsm;
-}
-
-/*
- * Release an NSM handle
- */
-static void nsm_release(struct nsm_handle *nsm)
-{
-	if (!nsm)
-		return;
-	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
-		list_del(&nsm->sm_link);
-		spin_unlock(&nsm_lock);
-		kfree(nsm);
-	}
-}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 81e1cc14246..8e68e799293 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -47,12 +47,51 @@ struct nsm_res {
 static struct rpc_clnt *	nsm_create(void);
 
 static struct rpc_program	nsm_program;
+static				LIST_HEAD(nsm_handles);
+static				DEFINE_SPINLOCK(nsm_lock);
 
 /*
  * Local NSM state
  */
 int				nsm_local_state;
 
+static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+	snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
+}
+
+static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
+				     const size_t len)
+{
+	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+	if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+		snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
+	else if (sin6->sin6_scope_id != 0)
+		snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
+				sin6->sin6_scope_id);
+	else
+		snprintf(buf, len, "%pI6", &sin6->sin6_addr);
+}
+
+static void nsm_display_address(const struct sockaddr *sap,
+				char *buf, const size_t len)
+{
+	switch (sap->sa_family) {
+	case AF_INET:
+		nsm_display_ipv4_address(sap, buf, len);
+		break;
+	case AF_INET6:
+		nsm_display_ipv6_address(sap, buf, len);
+		break;
+	default:
+		snprintf(buf, len, "unsupported address family");
+		break;
+	}
+}
+
 /*
  * Common procedure for NSMPROC_MON/NSMPROC_UNMON calls
  */
@@ -162,6 +201,100 @@ void nsm_unmonitor(const struct nlm_host *host)
 	}
 }
 
+/**
+ * nsm_find - Find or create a cached nsm_handle
+ * @sap: pointer to socket address of handle to find
+ * @salen: length of socket address
+ * @hostname: pointer to C string containing hostname to find
+ * @hostname_len: length of C string
+ * @create: one means create new handle if not found in cache
+ *
+ * Behavior is modulated by the global nsm_use_hostnames variable
+ * and by the @create argument.
+ *
+ * Returns a cached nsm_handle after bumping its ref count, or if
+ * @create is set, returns a fresh nsm_handle if a handle that
+ * matches @sap and/or @hostname cannot be found in the handle cache.
+ * Returns NULL if an error occurs.
+ */
+struct nsm_handle *nsm_find(const struct sockaddr *sap, const size_t salen,
+			    const char *hostname, const size_t hostname_len,
+			    const int create)
+{
+	struct nsm_handle *nsm = NULL;
+	struct nsm_handle *pos;
+
+	if (!sap)
+		return NULL;
+
+	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
+		if (printk_ratelimit()) {
+			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
+					    "in NFS lock request\n",
+				(int)hostname_len, hostname);
+		}
+		return NULL;
+	}
+
+retry:
+	spin_lock(&nsm_lock);
+	list_for_each_entry(pos, &nsm_handles, sm_link) {
+
+		if (hostname && nsm_use_hostnames) {
+			if (strlen(pos->sm_name) != hostname_len
+			 || memcmp(pos->sm_name, hostname, hostname_len))
+				continue;
+		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
+			continue;
+		atomic_inc(&pos->sm_count);
+		kfree(nsm);
+		nsm = pos;
+		goto found;
+	}
+	if (nsm) {
+		list_add(&nsm->sm_link, &nsm_handles);
+		goto found;
+	}
+	spin_unlock(&nsm_lock);
+
+	if (!create)
+		return NULL;
+
+	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
+	if (nsm == NULL)
+		return NULL;
+
+	memcpy(nsm_addr(nsm), sap, salen);
+	nsm->sm_addrlen = salen;
+	nsm->sm_name = (char *) (nsm + 1);
+	memcpy(nsm->sm_name, hostname, hostname_len);
+	nsm->sm_name[hostname_len] = '\0';
+	nsm_display_address((struct sockaddr *)&nsm->sm_addr,
+				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
+	atomic_set(&nsm->sm_count, 1);
+	goto retry;
+
+found:
+	spin_unlock(&nsm_lock);
+	return nsm;
+}
+
+/**
+ * nsm_release - Release an NSM handle
+ * @nsm: pointer to handle to be released
+ *
+ */
+void nsm_release(struct nsm_handle *nsm)
+{
+	if (!nsm)
+		return;
+	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
+		list_del(&nsm->sm_link);
+		spin_unlock(&nsm_lock);
+		kfree(nsm);
+	}
+}
+
 /*
  * Create NSM client for the local host
  */
-- 
cgit v1.2.3


From 5cf1c4b19db99d21d44c2ab457cfd44eb86b4439 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:02:53 -0500
Subject: NSM: Add dprintk() calls in nsm_find and nsm_release

Introduce some dprintk() calls in fs/lockd/mon.c that are enabled by
the NLMDBG_MONITOR flag.  These report when we find, create, and
release nsm_handles.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 8e68e799293..38255455563 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -249,10 +249,15 @@ retry:
 		atomic_inc(&pos->sm_count);
 		kfree(nsm);
 		nsm = pos;
+		dprintk("lockd: found nsm_handle for %s (%s), cnt %d\n",
+				pos->sm_name, pos->sm_addrbuf,
+				atomic_read(&pos->sm_count));
 		goto found;
 	}
 	if (nsm) {
 		list_add(&nsm->sm_link, &nsm_handles);
+		dprintk("lockd: created nsm_handle for %s (%s)\n",
+				nsm->sm_name, nsm->sm_addrbuf);
 		goto found;
 	}
 	spin_unlock(&nsm_lock);
@@ -291,6 +296,8 @@ void nsm_release(struct nsm_handle *nsm)
 	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
 		list_del(&nsm->sm_link);
 		spin_unlock(&nsm_lock);
+		dprintk("lockd: destroyed nsm_handle for %s (%s)\n",
+				nsm->sm_name, nsm->sm_addrbuf);
 		kfree(nsm);
 	}
 }
-- 
cgit v1.2.3


From bc1cc6c4e476b60df48227165990c87a22db6bb7 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:01 -0500
Subject: NSM: Remove NULL pointer check from nsm_find()

The nsm_find() function should never be called with a NULL IP address
pointer.  If it is, that's a bug.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 38255455563..0a066a13478 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -224,9 +224,6 @@ struct nsm_handle *nsm_find(const struct sockaddr *sap, const size_t salen,
 	struct nsm_handle *nsm = NULL;
 	struct nsm_handle *pos;
 
-	if (!sap)
-		return NULL;
-
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
 			printk(KERN_WARNING "Invalid hostname \"%.*s\" "
-- 
cgit v1.2.3


From 05f3a9af58180d24a9decedd71d4587935782d70 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:09 -0500
Subject: NSM: Remove !nsm check from nsm_release()

The nsm_release() function should never be called with a NULL handle
pointer.  If it is, that's a bug.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 0a066a13478..0792900b628 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -288,8 +288,6 @@ found:
  */
 void nsm_release(struct nsm_handle *nsm)
 {
-	if (!nsm)
-		return;
 	if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) {
 		list_del(&nsm->sm_link);
 		spin_unlock(&nsm_lock);
-- 
cgit v1.2.3


From 7e44d3bea21fbb9494930d1cd35ca92a9a4a3279 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:16 -0500
Subject: NSM: Generate NSMPROC_MON's "priv" argument when nsm_handle is
 created

Introduce a new data type, used by both the in-kernel NLM and NSM
implementations, that is used to manage the opaque "priv" argument
for the NSMPROC_MON and NLMPROC_SM_NOTIFY calls.

Construct the "priv" cookie when the nsm_handle is created.

The nsm_init_private() function may look a little strange, but it is
roughly equivalent to how the XDR encoder formed the "priv" argument.
It's going to go away soon.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 0792900b628..c8d18cd22b8 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -201,6 +201,21 @@ void nsm_unmonitor(const struct nlm_host *host)
 	}
 }
 
+/*
+ * Construct a unique cookie to match this nsm_handle to this monitored
+ * host.  It is passed to the local rpc.statd via NSMPROC_MON, and
+ * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
+ * requests.
+ *
+ * Linux provides the raw IP address of the monitored host,
+ * left in network byte order.
+ */
+static void nsm_init_private(struct nsm_handle *nsm)
+{
+	__be32 *p = (__be32 *)&nsm->sm_priv.data;
+	*p = nsm_addr_in(nsm)->sin_addr.s_addr;
+}
+
 /**
  * nsm_find - Find or create a cached nsm_handle
  * @sap: pointer to socket address of handle to find
@@ -271,6 +286,7 @@ retry:
 	nsm->sm_name = (char *) (nsm + 1);
 	memcpy(nsm->sm_name, hostname, hostname_len);
 	nsm->sm_name[hostname_len] = '\0';
+	nsm_init_private(nsm);
 	nsm_display_address((struct sockaddr *)&nsm->sm_addr,
 				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
 	atomic_set(&nsm->sm_count, 1);
-- 
cgit v1.2.3


From cab2d3c99165abbba2943f1b269003b17fd3b1cb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:24 -0500
Subject: NSM: Encode the new "priv" cookie for NSMPROC_MON requests

Pass the new "priv" cookie to NSMPROC_MON's XDR encoder, instead of
creating the "priv" argument in the encoder at call time.

This patch should not cause a behavioral change: the contents of the
cookie remain the same for the time being.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index c8d18cd22b8..4424b0a5a51 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -31,7 +31,7 @@ enum {
 };
 
 struct nsm_args {
-	__be32			addr;		/* remote address */
+	struct nsm_private	*priv;
 	u32			prog;		/* RPC callback info */
 	u32			vers;
 	u32			proc;
@@ -101,7 +101,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 	struct rpc_clnt	*clnt;
 	int		status;
 	struct nsm_args args = {
-		.addr		= nsm_addr_in(nsm)->sin_addr.s_addr,
+		.priv		= &nsm->sm_priv,
 		.prog		= NLM_PROGRAM,
 		.vers		= 3,
 		.proc		= NLMPROC_NSM_NOTIFY,
@@ -407,9 +407,6 @@ static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp)
  * The "priv" argument may contain private information required
  * by the NSMPROC_MON call. This information will be supplied in the
  * NLMPROC_SM_NOTIFY call.
- *
- * Linux provides the raw IP address of the monitored host,
- * left in network byte order.
  */
 static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 {
@@ -418,10 +415,7 @@ static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp)
 	p = xdr_reserve_space(xdr, SM_PRIV_SIZE);
 	if (unlikely(p == NULL))
 		return -EIO;
-	*p++ = argp->addr;
-	*p++ = 0;
-	*p++ = 0;
-	*p++ = 0;
+	xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 7fefc9cb9d5f129c238d93166f705c96ca2e7e51 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:31 -0500
Subject: NLM: Change nlm_host_rebooted() to take a single nlm_reboot argument

Pass the nlm_reboot data structure directly from the NLMPROC_SM_NOTIFY
XDR decoders to nlm_host_rebooted().  This eliminates some packing and
unpacking of the NLMPROC_SM_NOTIFY results, and prepares for passing
these results, including the "priv" cookie, directly to a lookup
routine in fs/lockd/mon.c.

This patch changes code organization but should not cause any
behavioral change.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c     | 31 +++++++++++++++++--------------
 fs/lockd/svc4proc.c | 11 +----------
 fs/lockd/svcproc.c  | 11 +----------
 3 files changed, 19 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index dbdeaa88d2f..ed103387964 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -444,31 +444,34 @@ void nlm_release_host(struct nlm_host *host)
 	}
 }
 
-/*
- * We were notified that the host indicated by address &sin
- * has rebooted.
- * Release all resources held by that peer.
+/**
+ * nlm_host_rebooted - Release all resources held by rebooted host
+ * @info: pointer to decoded results of NLM_SM_NOTIFY call
+ *
+ * We were notified that the specified host has rebooted.  Release
+ * all resources held by that peer.
  */
-void nlm_host_rebooted(const struct sockaddr_in *sin,
-				const char *hostname,
-				unsigned int hostname_len,
-				u32 new_state)
+void nlm_host_rebooted(const struct nlm_reboot *info)
 {
+	const struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= info->addr,
+	};
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
-			hostname, hostname_len, 0);
+	nsm = nsm_find((struct sockaddr *)&sin, sizeof(sin),
+			info->mon, info->len, 0);
 	if (nsm == NULL) {
 		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
-				hostname_len, hostname);
+				info->len, info->mon);
 		return;
 	}
 
 	dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
-			hostname_len, hostname, nsm->sm_addrbuf);
+			info->len, info->mon, nsm->sm_addrbuf);
 
 	/* When reclaiming locks on this peer, make sure that
 	 * we set up a new notification */
@@ -483,8 +486,8 @@ again:	mutex_lock(&nlm_host_mutex);
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
 		hlist_for_each_entry(host, pos, chain, h_hash) {
 			if (host->h_nsmhandle == nsm
-			 && host->h_nsmstate != new_state) {
-				host->h_nsmstate = new_state;
+			 && host->h_nsmstate != info->state) {
+				host->h_nsmstate = info->state;
 				host->h_state++;
 
 				nlm_get_host(host);
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4dfdcbc6bf6..bb79a53e060 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -419,8 +419,6 @@ static __be32
 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
-	struct sockaddr_in	saddr;
-
 	dprintk("lockd: SM_NOTIFY     called\n");
 
 	if (!nlm_privileged_requester(rqstp)) {
@@ -430,14 +428,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	/* Obtain the host pointer for this NFS server and try to
-	 * reclaim all locks we hold on this server.
-	 */
-	memset(&saddr, 0, sizeof(saddr));
-	saddr.sin_family = AF_INET;
-	saddr.sin_addr.s_addr = argp->addr;
-	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+	nlm_host_rebooted(argp);
 	return rpc_success;
 }
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 3ca89e2a938..e44310c0211 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -451,8 +451,6 @@ static __be32
 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 					      void	        *resp)
 {
-	struct sockaddr_in	saddr;
-
 	dprintk("lockd: SM_NOTIFY     called\n");
 
 	if (!nlm_privileged_requester(rqstp)) {
@@ -462,14 +460,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
 		return rpc_system_err;
 	}
 
-	/* Obtain the host pointer for this NFS server and try to
-	 * reclaim all locks we hold on this server.
-	 */
-	memset(&saddr, 0, sizeof(saddr));
-	saddr.sin_family = AF_INET;
-	saddr.sin_addr.s_addr = argp->addr;
-	nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state);
-
+	nlm_host_rebooted(argp);
 	return rpc_success;
 }
 
-- 
cgit v1.2.3


From 576df4634e37e46b441fefb91915184edb13bb94 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:39 -0500
Subject: NLM: Decode "priv" argument of NLMPROC_SM_NOTIFY as an opaque

The NLM XDR decoders for the NLMPROC_SM_NOTIFY procedure should treat
their "priv" argument truly as an opaque, as defined by the protocol,
and let the upper layers figure out what is in it.

This will make it easier to modify the contents and interpretation of
the "priv" argument, and keep knowledge about what's in "priv" local
to fs/lockd/mon.c.

For now, the NLM and NSM implementations should behave exactly as they
did before.

The formation of the address of the rebooted host in
nlm_host_rebooted() may look a little strange, but it is the inverse
of how nsm_init_private() forms the private cookie.  Plus, it's
going away soon anyway.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 3 ++-
 fs/lockd/xdr.c  | 4 ++--
 fs/lockd/xdr4.c | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ed103387964..dc41e46ef74 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -453,9 +453,10 @@ void nlm_release_host(struct nlm_host *host)
  */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
+	__be32 *p = (__be32 *)&info->priv.data;
 	const struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
-		.sin_addr.s_addr	= info->addr,
+		.sin_addr.s_addr	= *p,
 	};
 	struct hlist_head *chain;
 	struct hlist_node *pos;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 1f226290c67..4cc7d01a1eb 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -349,8 +349,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
 	argp->state = ntohl(*p++);
-	/* Preserve the address in network byte order */
-	argp->addr = *p++;
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
 	return xdr_argsize_check(rqstp, p);
 }
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 50c493a8ad8..61d1714a470 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -356,8 +356,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
 	if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
 		return 0;
 	argp->state = ntohl(*p++);
-	/* Preserve the address in network byte order */
-	argp->addr  = *p++;
+	memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+	p += XDR_QUADLEN(SM_PRIV_SIZE);
 	return xdr_argsize_check(rqstp, p);
 }
 
-- 
cgit v1.2.3


From 3420a8c4359a189f7d854ed7075d151257415447 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:46 -0500
Subject: NSM: Add nsm_reboot_lookup() function

Introduce a new API to fs/lockd/mon.c that allows nlm_host_rebooted()
to look up nsm_handles via the contents of an nlm_reboot struct.

The new function is equivalent to calling nsm_find() with @create set
to zero, but it takes a struct nlm_reboot instead of separate
arguments.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 4424b0a5a51..e46903995c9 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -201,6 +201,29 @@ void nsm_unmonitor(const struct nlm_host *host)
 	}
 }
 
+static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
+					      const size_t len)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (strlen(nsm->sm_name) == len &&
+		    memcmp(nsm->sm_name, hostname, len) == 0)
+			return nsm;
+	return NULL;
+}
+
+static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (memcmp(nsm->sm_priv.data, priv->data,
+					sizeof(priv->data)) == 0)
+			return nsm;
+	return NULL;
+}
+
 /*
  * Construct a unique cookie to match this nsm_handle to this monitored
  * host.  It is passed to the local rpc.statd via NSMPROC_MON, and
@@ -297,6 +320,47 @@ found:
 	return nsm;
 }
 
+/**
+ * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @info: pointer to NLMPROC_SM_NOTIFY arguments
+ *
+ * Returns a matching nsm_handle if found in the nsm cache; the returned
+ * nsm_handle's reference count is bumped and sm_monitored is cleared.
+ * Otherwise returns NULL if some error occurred.
+ */
+struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+{
+	struct nsm_handle *cached;
+
+	spin_lock(&nsm_lock);
+
+	if (nsm_use_hostnames && info->mon != NULL)
+		cached = nsm_lookup_hostname(info->mon, info->len);
+	else
+		cached = nsm_lookup_priv(&info->priv);
+
+	if (unlikely(cached == NULL)) {
+		spin_unlock(&nsm_lock);
+		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+				info->len, info->mon);
+		return cached;
+	}
+
+	atomic_inc(&cached->sm_count);
+	spin_unlock(&nsm_lock);
+
+	/*
+	 * During subsequent lock activity, force a fresh
+	 * notification to be set up for this host.
+	 */
+	cached->sm_monitored = 0;
+
+	dprintk("lockd: host %s (%s) rebooted, cnt %d\n",
+			cached->sm_name, cached->sm_addrbuf,
+			atomic_read(&cached->sm_count));
+	return cached;
+}
+
 /**
  * nsm_release - Release an NSM handle
  * @nsm: pointer to handle to be released
-- 
cgit v1.2.3


From 8c7378fd2a5f22016542931b887a2ae98d146eaf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:03:54 -0500
Subject: NLM: Call nsm_reboot_lookup() instead of nsm_find()

Invoke the newly introduced nsm_reboot_lookup() function in
nlm_host_rebooted() instead of nsm_find().

This introduces just one behavioral change: debugging messages
produced during reboot notification will now appear when the
NLMDBG_MONITOR flag is set, but not when the NLMDBG_HOSTCACHE flag
is set.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c | 20 ++------------------
 1 file changed, 2 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index dc41e46ef74..230de93fc04 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -453,30 +453,14 @@ void nlm_release_host(struct nlm_host *host)
  */
 void nlm_host_rebooted(const struct nlm_reboot *info)
 {
-	__be32 *p = (__be32 *)&info->priv.data;
-	const struct sockaddr_in sin = {
-		.sin_family		= AF_INET,
-		.sin_addr.s_addr	= *p,
-	};
 	struct hlist_head *chain;
 	struct hlist_node *pos;
 	struct nsm_handle *nsm;
 	struct nlm_host	*host;
 
-	nsm = nsm_find((struct sockaddr *)&sin, sizeof(sin),
-			info->mon, info->len, 0);
-	if (nsm == NULL) {
-		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
-				info->len, info->mon);
+	nsm = nsm_reboot_lookup(info);
+	if (unlikely(nsm == NULL))
 		return;
-	}
-
-	dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
-			info->len, info->mon, nsm->sm_addrbuf);
-
-	/* When reclaiming locks on this peer, make sure that
-	 * we set up a new notification */
-	nsm->sm_monitored = 0;
 
 	/* Mark all hosts tied to this NSM state as having rebooted.
 	 * We run the loop repeatedly, because we drop the host table
-- 
cgit v1.2.3


From 92fd91b998a5216a6d6606704e71d541a180216c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 5 Dec 2008 19:04:01 -0500
Subject: NLM: Remove "create" argument from nsm_find()

Clean up: nsm_find() now has only one caller, and that caller
unconditionally sets the @create argument. Thus the @create
argument is no longer needed.

Since nsm_find() now has a more specific purpose, pick a more
appropriate name for it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/host.c |  4 ++--
 fs/lockd/mon.c  | 23 +++++++++--------------
 2 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 230de93fc04..e5a65df4c0c 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -159,8 +159,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
 		atomic_inc(&nsm->sm_count);
 	else {
 		host = NULL;
-		nsm = nsm_find(ni->sap, ni->salen,
-				ni->hostname, ni->hostname_len, 1);
+		nsm = nsm_get_handle(ni->sap, ni->salen,
+					ni->hostname, ni->hostname_len);
 		if (!nsm) {
 			dprintk("lockd: nlm_lookup_host failed; "
 				"no nsm handle\n");
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e46903995c9..74070221604 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -240,24 +240,22 @@ static void nsm_init_private(struct nsm_handle *nsm)
 }
 
 /**
- * nsm_find - Find or create a cached nsm_handle
+ * nsm_get_handle - Find or create a cached nsm_handle
  * @sap: pointer to socket address of handle to find
  * @salen: length of socket address
  * @hostname: pointer to C string containing hostname to find
  * @hostname_len: length of C string
- * @create: one means create new handle if not found in cache
  *
- * Behavior is modulated by the global nsm_use_hostnames variable
- * and by the @create argument.
+ * Behavior is modulated by the global nsm_use_hostnames variable.
  *
- * Returns a cached nsm_handle after bumping its ref count, or if
- * @create is set, returns a fresh nsm_handle if a handle that
- * matches @sap and/or @hostname cannot be found in the handle cache.
- * Returns NULL if an error occurs.
+ * Returns a cached nsm_handle after bumping its ref count, or
+ * returns a fresh nsm_handle if a handle that matches @sap and/or
+ * @hostname cannot be found in the handle cache.  Returns NULL if
+ * an error occurs.
  */
-struct nsm_handle *nsm_find(const struct sockaddr *sap, const size_t salen,
-			    const char *hostname, const size_t hostname_len,
-			    const int create)
+struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+				  const size_t salen, const char *hostname,
+				  const size_t hostname_len)
 {
 	struct nsm_handle *nsm = NULL;
 	struct nsm_handle *pos;
@@ -297,9 +295,6 @@ retry:
 	}
 	spin_unlock(&nsm_lock);
 
-	if (!create)
-		return NULL;
-
 	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
 	if (nsm == NULL)
 		return NULL;
-- 
cgit v1.2.3


From b39b897c259fc1fd1998505f2b1d4ec1f115bce1 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:55:52 -0500
Subject: NSM: Refactor nsm_handle creation into a helper function

Clean up.  Refactor the creation of nsm_handles into a helper.  Fields
are initialized in increasing address order to make efficient use of
CPU caches.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 38 ++++++++++++++++++++++++++------------
 1 file changed, 26 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 74070221604..315ca07715c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -239,6 +239,30 @@ static void nsm_init_private(struct nsm_handle *nsm)
 	*p = nsm_addr_in(nsm)->sin_addr.s_addr;
 }
 
+static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
+					    const size_t salen,
+					    const char *hostname,
+					    const size_t hostname_len)
+{
+	struct nsm_handle *new;
+
+	new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
+	if (unlikely(new == NULL))
+		return NULL;
+
+	atomic_set(&new->sm_count, 1);
+	new->sm_name = (char *)(new + 1);
+	memcpy(nsm_addr(new), sap, salen);
+	new->sm_addrlen = salen;
+	nsm_init_private(new);
+	nsm_display_address((const struct sockaddr *)&new->sm_addr,
+				new->sm_addrbuf, sizeof(new->sm_addrbuf));
+	memcpy(new->sm_name, hostname, hostname_len);
+	new->sm_name[hostname_len] = '\0';
+
+	return new;
+}
+
 /**
  * nsm_get_handle - Find or create a cached nsm_handle
  * @sap: pointer to socket address of handle to find
@@ -295,19 +319,9 @@ retry:
 	}
 	spin_unlock(&nsm_lock);
 
-	nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL);
-	if (nsm == NULL)
+	nsm = nsm_create_handle(sap, salen, hostname, hostname_len);
+	if (unlikely(nsm == NULL))
 		return NULL;
-
-	memcpy(nsm_addr(nsm), sap, salen);
-	nsm->sm_addrlen = salen;
-	nsm->sm_name = (char *) (nsm + 1);
-	memcpy(nsm->sm_name, hostname, hostname_len);
-	nsm->sm_name[hostname_len] = '\0';
-	nsm_init_private(nsm);
-	nsm_display_address((struct sockaddr *)&nsm->sm_addr,
-				nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
-	atomic_set(&nsm->sm_count, 1);
 	goto retry;
 
 found:
-- 
cgit v1.2.3


From 77a3ef33e2de6fc8aabd7cb1700bfef81757c28a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:55:59 -0500
Subject: NSM: More clean up of nsm_get_handle()

Clean up: refactor nsm_get_handle() so it is organized the same way that
nsm_reboot_lookup() is.

There is an additional micro-optimization here.  This change moves the
"hostname & nsm_use_hostnames" test out of the list_for_each_entry()
clause in nsm_get_handle(), since it is loop-invariant.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 62 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 315ca07715c..99aec744474 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -213,6 +213,16 @@ static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
 	return NULL;
 }
 
+static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+{
+	struct nsm_handle *nsm;
+
+	list_for_each_entry(nsm, &nsm_handles, sm_link)
+		if (nlm_cmp_addr(nsm_addr(nsm), sap))
+			return nsm;
+	return NULL;
+}
+
 static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
 {
 	struct nsm_handle *nsm;
@@ -281,8 +291,7 @@ struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
 				  const size_t salen, const char *hostname,
 				  const size_t hostname_len)
 {
-	struct nsm_handle *nsm = NULL;
-	struct nsm_handle *pos;
+	struct nsm_handle *cached, *new = NULL;
 
 	if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
 		if (printk_ratelimit()) {
@@ -295,38 +304,37 @@ struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
 
 retry:
 	spin_lock(&nsm_lock);
-	list_for_each_entry(pos, &nsm_handles, sm_link) {
-
-		if (hostname && nsm_use_hostnames) {
-			if (strlen(pos->sm_name) != hostname_len
-			 || memcmp(pos->sm_name, hostname, hostname_len))
-				continue;
-		} else if (!nlm_cmp_addr(nsm_addr(pos), sap))
-			continue;
-		atomic_inc(&pos->sm_count);
-		kfree(nsm);
-		nsm = pos;
-		dprintk("lockd: found nsm_handle for %s (%s), cnt %d\n",
-				pos->sm_name, pos->sm_addrbuf,
-				atomic_read(&pos->sm_count));
-		goto found;
+
+	if (nsm_use_hostnames && hostname != NULL)
+		cached = nsm_lookup_hostname(hostname, hostname_len);
+	else
+		cached = nsm_lookup_addr(sap);
+
+	if (cached != NULL) {
+		atomic_inc(&cached->sm_count);
+		spin_unlock(&nsm_lock);
+		kfree(new);
+		dprintk("lockd: found nsm_handle for %s (%s), "
+				"cnt %d\n", cached->sm_name,
+				cached->sm_addrbuf,
+				atomic_read(&cached->sm_count));
+		return cached;
 	}
-	if (nsm) {
-		list_add(&nsm->sm_link, &nsm_handles);
+
+	if (new != NULL) {
+		list_add(&new->sm_link, &nsm_handles);
+		spin_unlock(&nsm_lock);
 		dprintk("lockd: created nsm_handle for %s (%s)\n",
-				nsm->sm_name, nsm->sm_addrbuf);
-		goto found;
+				new->sm_name, new->sm_addrbuf);
+		return new;
 	}
+
 	spin_unlock(&nsm_lock);
 
-	nsm = nsm_create_handle(sap, salen, hostname, hostname_len);
-	if (unlikely(nsm == NULL))
+	new = nsm_create_handle(sap, salen, hostname, hostname_len);
+	if (unlikely(new == NULL))
 		return NULL;
 	goto retry;
-
-found:
-	spin_unlock(&nsm_lock);
-	return nsm;
 }
 
 /**
-- 
cgit v1.2.3


From 94da7663db26530a8377f7219f8be8bd4d4822c2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:07 -0500
Subject: NSM: Replace IP address as our nlm_reboot lookup key

NLM provides file locking services for NFS files.  Part of this service
includes a second protocol, known as NSM, which is a reboot
notification service.  NLM uses this service to determine when to
reclaim locks or enter a grace period after a client or server reboots.

The NLM service (implemented by lockd in the Linux kernel) contacts
the local NSM service (implemented by rpc.statd in Linux user space)
via NSM protocol upcalls to register a callback when a particular
remote peer reboots.

To match the callback to the correct remote peer, the NLM service
constructs a cookie that it passes in the request.  The NSM service
passes that cookie back to the NLM service when it is notified that
the given remote peer has indeed rebooted.

Currently on Linux, the cookie is the raw 32-bit IPv4 address of the
remote peer.  To support IPv6 addresses, which are larger, we could
use all 16 bytes of the cookie to represent a full IPv6 address,
although we still can't represent an IPv6 address with a scope ID in
just 16 bytes.

Instead, to avoid the need for future changes to support additional
address types, we'll use a manufactured value for the cookie, and use
that to find the corresponding nsm_handle struct in the kernel during
the NLMPROC_SM_NOTIFY callback.

This should provide complete support in the kernel's NSM
implementation for IPv6 hosts, while remaining backwards compatible
with older rpc.statd implementations.

Note we also deal with another case where nsm_use_hostnames can change
while there are outstanding notifications, possibly resulting in the
loss of reboot notifications.  After this patch, the priv cookie is
always used to look up rebooted hosts in the kernel.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 99aec744474..8ae4c02d7df 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -9,6 +9,8 @@
 #include <linux/types.h>
 #include <linux/utsname.h>
 #include <linux/kernel.h>
+#include <linux/ktime.h>
+
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
@@ -240,13 +242,25 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
  * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these
  * requests.
  *
- * Linux provides the raw IP address of the monitored host,
- * left in network byte order.
+ * The NSM protocol requires that these cookies be unique while the
+ * system is running.  We prefer a stronger requirement of making them
+ * unique across reboots.  If user space bugs cause a stale cookie to
+ * be sent to the kernel, it could cause the wrong host to lose its
+ * lock state if cookies were not unique across reboots.
+ *
+ * The cookies are exposed only to local user space via loopback.  They
+ * do not appear on the physical network.  If we want greater security
+ * for some reason, nsm_init_private() could perform a one-way hash to
+ * obscure the contents of the cookie.
  */
 static void nsm_init_private(struct nsm_handle *nsm)
 {
-	__be32 *p = (__be32 *)&nsm->sm_priv.data;
-	*p = nsm_addr_in(nsm)->sin_addr.s_addr;
+	u64 *p = (u64 *)&nsm->sm_priv.data;
+	struct timespec ts;
+
+	ktime_get_ts(&ts);
+	*p++ = timespec_to_ns(&ts);
+	*p = (unsigned long)nsm;
 }
 
 static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
@@ -351,11 +365,7 @@ struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
 
 	spin_lock(&nsm_lock);
 
-	if (nsm_use_hostnames && info->mon != NULL)
-		cached = nsm_lookup_hostname(info->mon, info->len);
-	else
-		cached = nsm_lookup_priv(&info->priv);
-
+	cached = nsm_lookup_priv(&info->priv);
 	if (unlikely(cached == NULL)) {
 		spin_unlock(&nsm_lock);
 		dprintk("lockd: never saw rebooted peer '%.*s' before\n",
-- 
cgit v1.2.3


From e6765b83977f07983c7a10e6bbb19d6c7bbfc3a4 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:14 -0500
Subject: NSM: Remove include/linux/lockd/sm_inter.h

Clean up: The include/linux/lockd/sm_inter.h header is nearly empty
now.  Remove it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/clntproc.c | 1 -
 fs/lockd/host.c     | 1 -
 fs/lockd/mon.c      | 2 --
 fs/lockd/svc.c      | 1 -
 fs/lockd/svc4proc.c | 2 --
 fs/lockd/svcproc.c  | 2 --
 fs/lockd/svcsubs.c  | 1 -
 fs/lockd/xdr.c      | 1 -
 fs/lockd/xdr4.c     | 1 -
 9 files changed, 12 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5ce42e0ed4a..dd7957064a8 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index e5a65df4c0c..99d737bd432 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -15,7 +15,6 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/mutex.h>
 
 #include <net/ipv6.h>
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 8ae4c02d7df..dfa9d80efcb 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -15,8 +15,6 @@
 #include <linux/sunrpc/xprtsock.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_MONITOR
 #define NSM_PROGRAM		100024
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index bc3c3cb62db..0b13392931a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -35,7 +35,6 @@
 #include <linux/sunrpc/svcsock.h>
 #include <net/ip.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/nfs.h>
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bb79a53e060..1725037374c 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index e44310c0211..3688e55901f 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -16,8 +16,6 @@
 #include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
-
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 34c2766e27c..9e4d6aab611 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -17,7 +17,6 @@
 #include <linux/nfsd/export.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
-#include <linux/lockd/sm_inter.h>
 #include <linux/module.h>
 #include <linux/mount.h>
 
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 4cc7d01a1eb..0336f2beacd 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,7 +16,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_XDR
 
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 61d1714a470..e1d52865319 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -17,7 +17,6 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/lockd/lockd.h>
-#include <linux/lockd/sm_inter.h>
 
 #define NLMDBG_FACILITY		NLMDBG_XDR
 
-- 
cgit v1.2.3


From 8529bc51d30b8f001734b29b21a51b579c260f5b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:22 -0500
Subject: NSM: Move nsm_addr() to fs/lockd/mon.c

Clean up: nsm_addr_in() is no longer used, and nsm_addr() is used only in
fs/lockd/mon.c, so move it there.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index dfa9d80efcb..43be31c4a2d 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -55,6 +55,11 @@ static				DEFINE_SPINLOCK(nsm_lock);
  */
 int				nsm_local_state;
 
+static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
+{
+	return (struct sockaddr *)&nsm->sm_addr;
+}
+
 static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
 				     const size_t len)
 {
-- 
cgit v1.2.3


From b7ba597fb964dfa44284904b3b3d74d44b8e1c42 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:29 -0500
Subject: NSM: Move nsm_use_hostnames to mon.c

Clean up.

Treat the nsm_use_hostnames global variable like nsm_local_state.
Note that the default value of nsm_use_hostnames is still zero.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 3 ++-
 fs/lockd/svc.c | 1 -
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 43be31c4a2d..fafa0ea7193 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -53,7 +53,8 @@ static				DEFINE_SPINLOCK(nsm_lock);
 /*
  * Local NSM state
  */
-int				nsm_local_state;
+int	__read_mostly		nsm_local_state;
+int	__read_mostly		nsm_use_hostnames;
 
 static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
 {
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0b13392931a..7ac7d72e3b5 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -59,7 +59,6 @@ unsigned long			nlmsvc_timeout;
 static unsigned long		nlm_grace_period;
 static unsigned long		nlm_timeout = LOCKD_DFLT_TIMEO;
 static int			nlm_udpport, nlm_tcpport;
-int				nsm_use_hostnames = 0;
 
 /* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */
 static unsigned int		nlm_max_connections = 1024;
-- 
cgit v1.2.3


From 49b5699b3fc22b363534c509c1b7dba06bc677bf Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:37 -0500
Subject: NSM: Move nsm_create()

Clean up: one last thing... relocate nsm_create() to eliminate the forward
declaration and group it near the only function that actually uses it.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/mon.c | 51 ++++++++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index fafa0ea7193..5e2c4d5ac82 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -44,8 +44,6 @@ struct nsm_res {
 	u32			state;
 };
 
-static struct rpc_clnt *	nsm_create(void);
-
 static struct rpc_program	nsm_program;
 static				LIST_HEAD(nsm_handles);
 static				DEFINE_SPINLOCK(nsm_lock);
@@ -98,11 +96,26 @@ static void nsm_display_address(const struct sockaddr *sap,
 	}
 }
 
-/*
- * Common procedure for NSMPROC_MON/NSMPROC_UNMON calls
- */
-static int
-nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
+static struct rpc_clnt *nsm_create(void)
+{
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_LOOPBACK),
+	};
+	struct rpc_create_args args = {
+		.protocol		= XPRT_TRANSPORT_UDP,
+		.address		= (struct sockaddr *)&sin,
+		.addrsize		= sizeof(sin),
+		.servername		= "rpc.statd",
+		.program		= &nsm_program,
+		.version		= NSM_VERSION,
+		.authflavor		= RPC_AUTH_NULL,
+	};
+
+	return rpc_create(&args);
+}
+
+static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
 {
 	struct rpc_clnt	*clnt;
 	int		status;
@@ -408,30 +421,6 @@ void nsm_release(struct nsm_handle *nsm)
 	}
 }
 
-/*
- * Create NSM client for the local host
- */
-static struct rpc_clnt *
-nsm_create(void)
-{
-	struct sockaddr_in	sin = {
-		.sin_family	= AF_INET,
-		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
-		.sin_port	= 0,
-	};
-	struct rpc_create_args args = {
-		.protocol	= XPRT_TRANSPORT_UDP,
-		.address	= (struct sockaddr *)&sin,
-		.addrsize	= sizeof(sin),
-		.servername	= "localhost",
-		.program	= &nsm_program,
-		.version	= NSM_VERSION,
-		.authflavor	= RPC_AUTH_NULL,
-	};
-
-	return rpc_create(&args);
-}
-
 /*
  * XDR functions for NSM.
  *
-- 
cgit v1.2.3


From b064ec038a6180b13e5f89b6a30b42cb5ce8febc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 11 Dec 2008 17:56:59 -0500
Subject: lockd: Enable NLM use of AF_INET6

If the kernel is configured to support IPv6 and the RPC server can register
services via rpcbindv4, we are all set to enable IPv6 support for lockd.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: Aime Le Rouzic <aime.le-rouzic@bull.net>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 7ac7d72e3b5..3e5f9f07911 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -52,6 +52,17 @@ static struct task_struct	*nlmsvc_task;
 static struct svc_rqst		*nlmsvc_rqst;
 unsigned long			nlmsvc_timeout;
 
+/*
+ * If the kernel has IPv6 support available, always listen for
+ * both AF_INET and AF_INET6 requests.
+ */
+#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
+	defined(CONFIG_SUNRPC_REGISTER_V4)
+static const sa_family_t	nlmsvc_family = AF_INET6;
+#else	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+static const sa_family_t	nlmsvc_family = AF_INET;
+#endif	/* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
+
 /*
  * These can be set at insmod time (useful for NFS as root filesystem),
  * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
@@ -256,7 +267,7 @@ int lockd_up(void)
 			"lockd_up: no pid, %d users??\n", nlmsvc_users);
 
 	error = -ENOMEM;
-	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
+	serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
 	if (!serv) {
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		goto out;
-- 
cgit v1.2.3


From b046ccdc1f8171f6d0129dcc2a28d49187b4bf69 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 12 Dec 2008 16:57:13 -0500
Subject: NFSD: clean up failover sysctl function naming

Clean up: Rename recently-added failover functions to match the naming
convention in fs/nfsd/nfsctl.c.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 77d7b8c531a..173c4dd3d7a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size);
 static ssize_t write_getfd(struct file *file, char *buf, size_t size);
 static ssize_t write_getfs(struct file *file, char *buf, size_t size);
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size);
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size);
 static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_versions(struct file *file, char *buf, size_t size);
@@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 #endif
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size);
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size);
-
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Svc] = write_svc,
 	[NFSD_Add] = write_add,
@@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Getfd] = write_getfd,
 	[NFSD_Getfs] = write_getfs,
 	[NFSD_Fh] = write_filehandle,
-	[NFSD_FO_UnlockIP] = failover_unlock_ip,
-	[NFSD_FO_UnlockFS] = failover_unlock_fs,
+	[NFSD_FO_UnlockIP] = write_unlock_ip,
+	[NFSD_FO_UnlockFS] = write_unlock_fs,
 	[NFSD_Threads] = write_threads,
 	[NFSD_Pool_Threads] = write_pool_threads,
 	[NFSD_Versions] = write_versions,
@@ -309,7 +308,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 	return err;
 }
 
-static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
+static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
 	struct sockaddr_in sin = {
 		.sin_family	= AF_INET,
@@ -339,7 +338,7 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
 	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
-static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
+static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
 	struct path path;
 	char *fo_path;
-- 
cgit v1.2.3


From 54224f04ae95d86b27c0673cd773ebb120d86876 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 12 Dec 2008 16:57:20 -0500
Subject: NFSD: Fix a handful of coding style issues in write_filehandle()

Clean up: follow kernel coding style.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 173c4dd3d7a..498d763b932 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -390,11 +390,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 
 	dname = mesg;
 	len = qword_get(&mesg, dname, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 	
 	path = dname+len+1;
 	len = qword_get(&mesg, path, size);
-	if (len <= 0) return -EINVAL;
+	if (len <= 0)
+		return -EINVAL;
 
 	len = get_int(&mesg, &maxsize);
 	if (len)
@@ -418,7 +420,8 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	if (len)
 		return len;
 	
-	mesg = buf; len = SIMPLE_TRANSACTION_LIMIT;
+	mesg = buf;
+	len = SIMPLE_TRANSACTION_LIMIT;
 	qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
 	mesg[-1] = '\n';
 	return mesg - buf;	
-- 
cgit v1.2.3


From 9e074856caf13ba83363f73759f5e395f74ccf41 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 12 Dec 2008 16:57:27 -0500
Subject: NFSD: Replace open-coded integer with macro

Clean up: Instead of open-coding 2049, use the NFS_PORT macro.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 498d763b932..856b8646a48 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -439,9 +439,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 		rv = get_int(&mesg, &newthreads);
 		if (rv)
 			return rv;
-		if (newthreads <0)
+		if (newthreads < 0)
 			return -EINVAL;
-		rv = nfsd_svc(2049, newthreads);
+		rv = nfsd_svc(NFS_PORT, newthreads);
 		if (rv)
 			return rv;
 	}
-- 
cgit v1.2.3


From 262a09823bb07c6aafb6c1d312cde613d0b90c85 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 12 Dec 2008 16:57:35 -0500
Subject: NFSD: Add documenting comments for nfsctl interface

Document the NFSD sysctl interface laid out in fs/nfsd/nfsctl.c.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsctl.c | 453 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 437 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 856b8646a48..3d93b2064ce 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -175,10 +175,24 @@ static const struct file_operations exports_operations = {
 /*----------------------------------------------------------------------------*/
 /*
  * payload - write methods
- * If the method has a response, the response should be put in buf,
- * and the length returned.  Otherwise return 0 or and -error.
  */
 
+/**
+ * write_svc - Start kernel's NFSD server
+ *
+ * Deprecated.  /proc/fs/nfsd/threads is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_svc
+ *				svc_port:	port number of this
+ *						server's listener
+ *				svc_nthreads:	number of threads to start
+ *			size:	size in bytes of passed in nfsctl_svc
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_svc(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_svc *data;
@@ -188,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size)
 	return nfsd_svc(data->svc_port, data->svc_nthreads);
 }
 
+/**
+ * write_add - Add or modify client entry in auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_client
+ *				cl_ident:	'\0'-terminated C string
+ *						containing domain name
+ *						of client
+ *				cl_naddr:	no. of items in cl_addrlist
+ *				cl_addrlist:	array of client addresses
+ *				cl_fhkeytype:	ignored
+ *				cl_fhkeylen:	ignored
+ *				cl_fhkey:	ignored
+ *			size:	size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_add(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -197,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size)
 	return exp_addclient(data);
 }
 
+/**
+ * write_del - Remove client from auth unix cache
+ *
+ * Deprecated.  /proc/net/rpc/auth.unix.ip is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_client
+ *				cl_ident:	'\0'-terminated C string
+ *						containing domain name
+ *						of client
+ *				cl_naddr:	ignored
+ *				cl_addrlist:	ignored
+ *				cl_fhkeytype:	ignored
+ *				cl_fhkeylen:	ignored
+ *				cl_fhkey:	ignored
+ *			size:	size in bytes of passed in nfsctl_client
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since
+ * nfsctl_client.cl_addrlist contains only in_addr fields for addresses.
+ */
 static ssize_t write_del(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_client *data;
@@ -206,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size)
 	return exp_delclient(data);
 }
 
+/**
+ * write_export - Export part or all of a local file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_export
+ *				ex_client:	'\0'-terminated C string
+ *						containing domain name
+ *						of client allowed to access
+ *						this export
+ *				ex_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				ex_dev:		fsid to use for this export
+ *				ex_ino:		ignored
+ *				ex_flags:	export flags for this export
+ *				ex_anon_uid:	UID to use for anonymous
+ *						requests
+ *				ex_anon_gid:	GID to use for anonymous
+ *						requests
+ *			size:	size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_export(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -215,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size)
 	return exp_export(data);
 }
 
+/**
+ * write_unexport - Unexport a previously exported file system
+ *
+ * Deprecated.  /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_export
+ *				ex_client:	'\0'-terminated C string
+ *						containing domain name
+ *						of client no longer allowed
+ *						to access this export
+ *				ex_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				ex_dev:		ignored
+ *				ex_ino:		ignored
+ *				ex_flags:	ignored
+ *				ex_anon_uid:	ignored
+ *				ex_anon_gid:	ignored
+ *			size:	size in bytes of passed in nfsctl_export
+ * Output:
+ *	On success:	returns zero
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_export *data;
@@ -225,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size)
 	return exp_unexport(data);
 }
 
+/**
+ * write_getfs - Get a variable-length NFS file handle by path
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_fsparm
+ *				gd_addr:	socket address of client
+ *				gd_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				gd_maxlen:	maximum size of returned file
+ *						handle
+ *			size:	size in bytes of passed in nfsctl_fsparm
+ * Output:
+ *	On success:	passed-in buffer filled with a knfsd_fh structure
+ *			(a variable-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fsparm *data;
@@ -264,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size)
 	return err;
 }
 
+/**
+ * write_getfd - Get a fixed-length NFS file handle by path (used by mountd)
+ *
+ * Deprecated.  /proc/fs/nfsd/filehandle is preferred.
+ * Function remains to support old versions of nfs-utils.
+ *
+ * Input:
+ *			buf:	struct nfsctl_fdparm
+ *				gd_addr:	socket address of client
+ *				gd_path:	'\0'-terminated C string
+ *						containing pathname of
+ *						directory in local file system
+ *				gd_version:	fdparm structure version
+ *			size:	size in bytes of passed in nfsctl_fdparm
+ * Output:
+ *	On success:	passed-in buffer filled with nfsctl_res
+ *			(a fixed-length raw NFS file handle);
+ *			return code is the size in bytes of the file handle
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in, since gd_addr
+ * is the same size as a struct sockaddr_in.
+ */
 static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 {
 	struct nfsctl_fdparm *data;
@@ -308,6 +469,22 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
 	return err;
 }
 
+/**
+ * write_unlock_ip - Release all locks used by a client
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing a
+ *				presentation format IPv4 address
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ *
+ * Note: Only AF_INET client addresses are passed in
+ */
 static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 {
 	struct sockaddr_in sin = {
@@ -338,6 +515,20 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
 	return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
 }
 
+/**
+ * write_unlock_fs - Release all locks on a local file system
+ *
+ * Experimental.
+ *
+ * Input:
+ *			buf:	'\n'-terminated C string containing the
+ *				absolute pathname of a local file system
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	returns zero if all specified locks were released;
+ *			returns one if one or more locks were not released
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 {
 	struct path path;
@@ -359,21 +550,44 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 	if (error)
 		return error;
 
+	/*
+	 * XXX: Needs better sanity checking.  Otherwise we could end up
+	 * releasing locks on the wrong file system.
+	 *
+	 * For example:
+	 * 1.  Does the path refer to a directory?
+	 * 2.  Is that directory a mount point, or
+	 * 3.  Is that directory the root of an exported file system?
+	 */
 	error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb);
 
 	path_put(&path);
 	return error;
 }
 
+/**
+ * write_filehandle - Get a variable-length NFS file handle by path
+ *
+ * On input, the buffer contains a '\n'-terminated C string comprised of
+ * three alphanumeric words separated by whitespace.  The string may
+ * contain escape sequences.
+ *
+ * Input:
+ *			buf:
+ *				domain:		client domain name
+ *				path:		export pathname
+ *				maxsize:	numeric maximum size of
+ *						@buf
+ *			size:	length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing an ASCII hex text version
+ *			of the NFS file handle;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is negative errno value
+ */
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 {
-	/* request is:
-	 *   domain path maxsize
-	 * response is
-	 *   filehandle
-	 *
-	 * qword quoting is used, so filehandle will be \x....
-	 */
 	char *dname, *path;
 	int uninitialized_var(maxsize);
 	char *mesg = buf;
@@ -427,11 +641,36 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
 	return mesg - buf;	
 }
 
+/**
+ * write_threads - Start NFSD, or report the current number of running threads
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the
+ *					number of NFSD threads to start
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with '\n'-terminated C
+ *			string numeric value representing the number of
+ *			running NFSD threads;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_threads(struct file *file, char *buf, size_t size)
 {
-	/* if size > 0, look for a number of threads and call nfsd_svc
-	 * then write out number of threads as reply
-	 */
 	char *mesg = buf;
 	int rv;
 	if (size > 0) {
@@ -449,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_pool_threads - Set or report the current number of threads per pool
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated unsigned integer values
+ *					representing the number of NFSD
+ *					threads to start in each pool
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing integer values representing the
+ *			number of NFSD threads in each pool;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
 {
 	/* if size > 0, look for an array of number of threads per node
@@ -519,10 +780,6 @@ out_free:
 
 static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 {
-	/*
-	 * Format:
-	 *   [-/+]vers [-/+]vers ...
-	 */
 	char *mesg = buf;
 	char *vers, sign;
 	int len, num;
@@ -580,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
 	return len;
 }
 
+/**
+ * write_versions - Set or report the available NFS protocol versions
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing positive or negative integer
+ *			values representing the current status of each
+ *			protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing whitespace-
+ * 					separated positive or negative
+ * 					integer values representing NFS
+ * 					protocol versions to enable ("+n")
+ * 					or disable ("-n")
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	status of zero or more protocol versions has
+ *			been updated; passed-in buffer filled with
+ *			'\n'-terminated C string containing positive
+ *			or negative integer values representing the
+ *			current status of each protocol version;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_versions(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -689,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
 	return -EINVAL;
 }
 
+/**
+ * write_ports - Pass a socket file descriptor or transport name to listen on
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ * Output:
+ *	On success:	passed-in buffer filled with a '\n'-terminated C
+ *			string containing a whitespace-separated list of
+ *			named NFSD listeners;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing a bound
+ *					but unconnected socket that is to be
+ *					used as an NFSD listener
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service is started;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique alphanumeric name of
+ *			the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by an integer value representing a
+ *					previously passed in socket file
+ *					descriptor
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	NFS service no longer listens on that socket;
+ *			passed-in buffer filled with a '\n'-terminated C
+ *			string containing a unique name of the listener;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a transport
+ *					name and an unsigned integer value
+ *					representing the port to listen on,
+ *					separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service is started
+ *	On error:	return code is a negative errno value
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing a "-" followed
+ *					by a transport name and an unsigned
+ *					integer value representing the port
+ *					to listen on, separated by whitespace
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	returns zero; NFS service no longer listens
+ *			on that transport
+ *	On error:	return code is a negative errno value
+ */
 static ssize_t write_ports(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -702,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
 
 int nfsd_max_blksize;
 
+/**
+ * write_maxblksize - Set or report the current NFS blksize
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ * 			buf:		C string containing an unsigned
+ * 					integer value representing the new
+ * 					NFS blksize
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing numeric value of the current NFS blksize
+ *			setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
 {
 	char *mesg = buf;
@@ -754,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_leasetime - Set or report the current NFSv4 lease time
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing an unsigned
+ *					integer value representing the new
+ *					NFSv4 lease expiry time
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C
+ *			string containing unsigned integer value of the
+ *			current lease expiry time;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
@@ -790,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+/**
+ * write_recoverydir - Set or report the pathname of the recovery directory
+ *
+ * Input:
+ *			buf:		ignored
+ *			size:		zero
+ *
+ * OR
+ *
+ * Input:
+ *			buf:		C string containing the pathname
+ *					of the directory on a local file
+ *					system containing permanent NFSv4
+ *					recovery data
+ *			size:		non-zero length of C string in @buf
+ * Output:
+ *	On success:	passed-in buffer filled with '\n'-terminated C string
+ *			containing the current recovery pathname setting;
+ *			return code is the size in bytes of the string
+ *	On error:	return code is zero or a negative errno value
+ */
 static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
 {
 	ssize_t rv;
-- 
cgit v1.2.3


From cc7172defcf253335b16cf703fe4ac1ade15e1b1 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 6 Jan 2009 13:26:40 -0500
Subject: Btrfs: Don't use kmap_atomic(..., KM_IRQ0) during checksum verifies

Checksum verification happens in a helper thread, and there is no
need to mess with interrupts.  This switches to kmap() instead.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4e57fe68e4b..cdb701165a0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1727,7 +1727,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u32 csum = ~(u32)0;
-	unsigned long flags;
 
 	if (PageChecked(page)) {
 		ClearPageChecked(page);
@@ -1749,8 +1748,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	} else {
 		ret = get_state_private(io_tree, start, &private);
 	}
-	local_irq_save(flags);
-	kaddr = kmap_atomic(page, KM_IRQ0);
+	kaddr = kmap(page);
 	if (ret)
 		goto zeroit;
 
@@ -1759,8 +1757,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (csum != private)
 		goto zeroit;
 
-	kunmap_atomic(kaddr, KM_IRQ0);
-	local_irq_restore(flags);
+	kunmap(page);
 good:
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
@@ -1775,8 +1772,7 @@ zeroit:
 	       (unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_IRQ0);
-	local_irq_restore(flags);
+	kunmap(page);
 	if (private == 0)
 		return 0;
 	return -EIO;
-- 
cgit v1.2.3


From 3ada8b7e980dac7cc42937d42d90ee51b19204fe Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Tue, 6 Jan 2009 10:44:43 -0800
Subject: block: struct device - replace bus_id with dev_name(), dev_set_name()

Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/partitions/check.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d5b213b8a9..5198ada6739 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -384,9 +384,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	dname = dev_name(ddev);
 	if (isdigit(dname[strlen(dname) - 1]))
-		snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
+		dev_set_name(pdev, "%sp%d", dname, partno);
 	else
-		snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
+		dev_set_name(pdev, "%s%d", dname, partno);
 
 	device_initialize(pdev);
 	pdev->class = &block_class;
@@ -447,16 +447,11 @@ void register_disk(struct gendisk *disk)
 	struct block_device *bdev;
 	struct disk_part_iter piter;
 	struct hd_struct *part;
-	char *s;
 	int err;
 
 	ddev->parent = disk->driverfs_dev;
 
-	strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
-	/* ewww... some of these buggers have / in the name... */
-	s = strchr(ddev->bus_id, '/');
-	if (s)
-		*s = '!';
+	dev_set_name(ddev, disk->disk_name);
 
 	/* delay uevents, until we scanned partition table */
 	ddev->uevent_suppress = 1;
-- 
cgit v1.2.3


From 83982b6f47201c4c7767210d24d7d8c99567a0b3 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 6 Jan 2009 14:53:16 -0500
Subject: ext4: Remove "extents" mount option

This mount option is largely superfluous, and in fact the way it was
implemented was buggy; if a filesystem which did not have the extents
feature flag was mounted -o extents, the filesystem would attempt to
create and use extents-based files even though the extents feature flag
was not enabled.  The simplest thing to do is to nuke the mount option
entirely.  It's not all that useful to force the non-creation of new
extent-based files if the filesystem can support it.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h      |  1 -
 fs/ext4/ext4_jbd2.h |  4 ++--
 fs/ext4/extents.c   |  4 ++--
 fs/ext4/ialloc.c    |  2 +-
 fs/ext4/migrate.c   | 14 +++++++-------
 fs/ext4/super.c     | 48 ++----------------------------------------------
 6 files changed, 14 insertions(+), 59 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 695b45cc34e..db1718833f5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,7 +536,6 @@ do {									       \
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
-#define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 663197adae5..be2f426f680 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -32,8 +32,8 @@
  * 5 levels of tree + root which are stored in the inode. */
 
 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
-	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)	\
-		|| test_opt(sb, EXTENTS) ? 27U : 8U)
+	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)   \
+	 ? 27U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c64080e4949..240cf0daad4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2247,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb)
 	 * possible initialization would be here
 	 */
 
-	if (test_opt(sb, EXTENTS)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
@@ -2272,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb)
  */
 void ext4_ext_release(struct super_block *sb)
 {
-	if (!test_opt(sb, EXTENTS))
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
 		return;
 
 #ifdef EXTENTS_STATS
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 369c34c6429..4fb86a0061d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -917,7 +917,7 @@ got:
 	if (err)
 		goto fail_free_drop;
 
-	if (test_opt(sb, EXTENTS)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
 			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7cd488da4b..734abca25e3 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -459,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode)
 	struct list_blocks_struct lb;
 	unsigned long max_entries;
 
-	if (!test_opt(inode->i_sb, EXTENTS))
-		/*
-		 * if mounted with noextents we don't allow the migrate
-		 */
-		return -EINVAL;
-
-	if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	/*
+	 * If the filesystem does not support extents, or the inode
+	 * already is extent-based, error out.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b69d0920386..acb69c00fd4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -829,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",journal_async_commit");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
-	if (!test_opt(sb, EXTENTS))
-		seq_puts(seq, ",noextents");
 	if (test_opt(sb, I_VERSION))
 		seq_puts(seq, ",i_version");
 	if (!test_opt(sb, DELALLOC))
@@ -1011,7 +1009,7 @@ enum {
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
-	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
+	Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1066,8 +1064,6 @@ static const match_table_t tokens = {
 	{Opt_quota, "quota"},
 	{Opt_usrquota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
-	{Opt_extents, "extents"},
-	{Opt_noextents, "noextents"},
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
@@ -1115,7 +1111,6 @@ static int parse_options(char *options, struct super_block *sb,
 	int qtype, qfmt;
 	char *qname;
 #endif
-	ext4_fsblk_t last_block;
 
 	if (!options)
 		return 1;
@@ -1445,33 +1440,6 @@ set_qf_format:
 		case Opt_bh:
 			clear_opt(sbi->s_mount_opt, NOBH);
 			break;
-		case Opt_extents:
-			if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
-					EXT4_FEATURE_INCOMPAT_EXTENTS)) {
-				ext4_warning(sb, __func__,
-					"extents feature not enabled "
-					"on this filesystem, use tune2fs");
-				return 0;
-			}
-			set_opt(sbi->s_mount_opt, EXTENTS);
-			break;
-		case Opt_noextents:
-			/*
-			 * When e2fsprogs support resizing an already existing
-			 * ext3 file system to greater than 2**32 we need to
-			 * add support to block allocator to handle growing
-			 * already existing block  mapped inode so that blocks
-			 * allocated for them fall within 2**32
-			 */
-			last_block = ext4_blocks_count(sbi->s_es) - 1;
-			if (last_block  > 0xffffffffULL) {
-				printk(KERN_ERR "EXT4-fs: Filesystem too "
-						"large to mount with "
-						"-o noextents options\n");
-				return 0;
-			}
-			clear_opt(sbi->s_mount_opt, EXTENTS);
-			break;
 		case Opt_i_version:
 			set_opt(sbi->s_mount_opt, I_VERSION);
 			sb->s_flags |= MS_I_VERSION;
@@ -2135,18 +2103,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_opt(sbi->s_mount_opt, RESERVATION);
 	set_opt(sbi->s_mount_opt, BARRIER);
 
-	/*
-	 * turn on extents feature by default in ext4 filesystem
-	 * only if feature flag already set by mkfs or tune2fs.
-	 * Use -o noextents to turn it off
-	 */
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
-		set_opt(sbi->s_mount_opt, EXTENTS);
-	else
-		ext4_warning(sb, __func__,
-			"extents feature not enabled on this filesystem, "
-			"use tune2fs.");
-
 	/*
 	 * enable delayed allocation by default
 	 * Use -o nodelalloc to turn it off
@@ -3825,7 +3781,7 @@ static void __exit exit_ext4_fs(void)
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-MODULE_DESCRIPTION("Fourth Extended Filesystem with extents");
+MODULE_DESCRIPTION("Fourth Extended Filesystem");
 MODULE_LICENSE("GPL");
 module_init(init_ext4_fs)
 module_exit(exit_ext4_fs)
-- 
cgit v1.2.3


From 4b905671d2ea09fd48fed72c581df17e40823f39 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 6 Jan 2009 14:53:35 -0500
Subject: jbd2: Fix oops in jbd2_journal_init_inode() on corrupted fs

On 32-bit systems with CONFIG_LBD, getblk() can fail because the provided
block number is too big.  Add error checks so we fail gracefully if
getblk() returns NULL (which can also happen on memory allocation
failures).

Thanks to David Maciejak from Fortinet's FortiGuard Global Security
Research Team for reporting this bug.

http://bugzilla.kernel.org/show_bug.cgi?id=12370

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
cc: stable@kernel.org
---
 fs/jbd2/journal.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fe20e40ee7c..2932c8f5519 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -632,6 +632,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal)
 		return NULL;
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
+	if (!bh)
+		return NULL;
 	lock_buffer(bh);
 	memset(bh->b_data, 0, journal->j_blocksize);
 	set_buffer_uptodate(bh);
@@ -1021,15 +1023,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 
 	/* journal descriptor can store up to n blocks -bzzz */
 	journal->j_blocksize = blocksize;
+	jbd2_stats_proc_init(journal);
 	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 	journal->j_wbufsize = n;
 	journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		kfree(journal);
-		journal = NULL;
-		goto out;
+		goto out_err;
 	}
 	journal->j_dev = bdev;
 	journal->j_fs_dev = fs_dev;
@@ -1039,14 +1040,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
 	p = journal->j_devname;
 	while ((p = strchr(p, '/')))
 		*p = '!';
-	jbd2_stats_proc_init(journal);
 
 	bh = __getblk(journal->j_dev, start, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
-out:
+
 	return journal;
+out_err:
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
 }
 
 /**
@@ -1094,9 +1103,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (!journal->j_wbuf) {
 		printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n",
 			__func__);
-		jbd2_stats_proc_exit(journal);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	err = jbd2_journal_bmap(journal, 0, &blocknr);
@@ -1104,17 +1111,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	if (err) {
 		printk(KERN_ERR "%s: Cannnot locate journal superblock\n",
 		       __func__);
-		jbd2_stats_proc_exit(journal);
-		kfree(journal);
-		return NULL;
+		goto out_err;
 	}
 
 	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
-	J_ASSERT(bh != NULL);
+	if (!bh) {
+		printk(KERN_ERR
+		       "%s: Cannot get buffer for journal superblock\n",
+		       __func__);
+		goto out_err;
+	}
 	journal->j_sb_buffer = bh;
 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
 
 	return journal;
+out_err:
+	jbd2_stats_proc_exit(journal);
+	kfree(journal);
+	return NULL;
 }
 
 /*
-- 
cgit v1.2.3


From 08fba69986e20c1c9e5fe2e6064d146cc4f42480 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 6 Jan 2009 14:38:53 -0800
Subject: mm: report the pagesize backing a VMA in /proc/pid/smaps

It is useful to verify a hugepage-aware application is using the expected
pagesizes for its memory regions. This patch creates an entry called
KernelPageSize in /proc/pid/smaps that is the size of page used by the
kernel to back a VMA. The entry is not called PageSize as it is possible
the MMU uses a different size. This extension should not break any sensible
parser that skips lines containing unrecognised information.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: "KOSAKI Motohiro" <kosaki.motohiro@jp.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3a8bdd7f575..41ef5f23e77 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -396,7 +396,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Clean:  %8lu kB\n"
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
-		   "Swap:           %8lu kB\n",
+		   "Swap:           %8lu kB\n"
+		   "KernelPageSize: %8lu kB\n",
 		   (vma->vm_end - vma->vm_start) >> 10,
 		   mss.resident >> 10,
 		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -405,7 +406,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_clean >> 10,
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
-		   mss.swap >> 10);
+		   mss.swap >> 10,
+		   vma_kernel_pagesize(vma) >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
-- 
cgit v1.2.3


From 3340289ddf29ca75c3acfb3a6b72f234b2f74d5c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 6 Jan 2009 14:38:54 -0800
Subject: mm: report the MMU pagesize in /proc/pid/smaps

The KernelPageSize entry in /proc/pid/smaps is the pagesize used by the
kernel to back a VMA.  This matches the size used by the MMU in the
majority of cases.  However, one counter-example occurs on PPC64 kernels
whereby a kernel using 64K as a base pagesize may still use 4K pages for
the MMU on older processors.  To distinguish, this patch reports
MMUPageSize as the pagesize used by the MMU in /proc/pid/smaps.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: "KOSAKI Motohiro" <kosaki.motohiro@jp.fujitsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 41ef5f23e77..94063840832 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -397,7 +397,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
 		   "Swap:           %8lu kB\n"
-		   "KernelPageSize: %8lu kB\n",
+		   "KernelPageSize: %8lu kB\n"
+		   "MMUPageSize:    %8lu kB\n",
 		   (vma->vm_end - vma->vm_start) >> 10,
 		   mss.resident >> 10,
 		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
@@ -407,7 +408,8 @@ static int show_smap(struct seq_file *m, void *v)
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
 		   mss.swap >> 10,
-		   vma_kernel_pagesize(vma) >> 10);
+		   vma_kernel_pagesize(vma) >> 10,
+		   vma_mmu_pagesize(vma) >> 10);
 
 	if (m->count < m->size)  /* vma is copied successfully */
 		m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
-- 
cgit v1.2.3


From 38c8e6180939e5619140b2e9e479cb26029ff8b1 Mon Sep 17 00:00:00 2001
From: Miquel van Smoorenburg <mikevs@xs4all.net>
Date: Tue, 6 Jan 2009 14:39:02 -0800
Subject: do_mpage_readpage(): don't submit lots of small bios on boundary

While tracing I/O patterns with blktrace (a great tool) a few weeks ago I
identified a minor issue in fs/mpage.c

As the comment above mpage_readpages() says, a fs's get_block function
will set BH_Boundary when it maps a block just before a block for which
extra I/O is required.

Since get_block() can map a range of pages, for all these pages the
BH_Boundary flag will be set.  But we only need to push what I/O we have
accumulated at the last block of this range.

This makes do_mpage_readpage() send out the largest possible bio instead
of a bunch of page-sized ones in the BH_Boundary case.

Signed-off-by: Miquel van Smoorenburg <mikevs@xs4all.net>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/mpage.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index 552b80b3fac..46e977efd50 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -308,7 +308,10 @@ alloc_new:
 		goto alloc_new;
 	}
 
-	if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
+	relative_block = block_in_file - *first_logical_block;
+	nblocks = map_bh->b_size >> blkbits;
+	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
+	    (first_hole != blocks_per_page))
 		bio = mpage_bio_submit(READ, bio);
 	else
 		*last_block_in_bio = blocks[blocks_per_page - 1];
-- 
cgit v1.2.3


From ee53a891f47444c53318b98dac947ede963db400 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:39:12 -0800
Subject: mm: do_sync_mapping_range integrity fix

Chris Mason noticed that do_sync_mapping_range() didn't actually ask for
data integrity writeout.  Unfortunately, it is advertised as being usable
for data integrity operations.

This is a data integrity bug.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 0921d6d4b5e..ac02b56548b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -295,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
 		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-						WB_SYNC_NONE);
+						WB_SYNC_ALL);
 		if (ret < 0)
 			goto out;
 	}
-- 
cgit v1.2.3


From 39f0dee2d8abe902617622b71f8f6f73985ec71c Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Tue, 6 Jan 2009 14:39:21 -0800
Subject: do_mpage_readpage(): remove useless clear_buffer_mapped() call

It is known that buffer_mapped() is false in this code path.

Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/mpage.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/mpage.c b/fs/mpage.c
index 46e977efd50..16c3ef37eae 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 				first_hole = page_block;
 			page_block++;
 			block_in_file++;
-			clear_buffer_mapped(map_bh);
 			continue;
 		}
 
-- 
cgit v1.2.3


From 3c1d43787b48c798f44dc32a6e6deb5ca2da3e68 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 6 Jan 2009 14:39:23 -0800
Subject: mm: remove GFP_HIGHUSER_PAGECACHE

GFP_HIGHUSER_PAGECACHE is just an alias for GFP_HIGHUSER_MOVABLE, making
that harder to track down: remove it, and its out-of-work brothers
GFP_NOFS_PAGECACHE and GFP_USER_PAGECACHE.

Since we're making that improvement to hotremove_migrate_alloc(), I think
we can now also remove one of the "o"s from its comment.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index bd48e5e6d3e..a903860bc5a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,7 +166,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->a_ops = &empty_aops;
 	mapping->host = inode;
 	mapping->flags = 0;
-	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+	mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
 	mapping->assoc_mapping = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
@@ -601,7 +601,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists);
  *	@sb: superblock
  *
  *	Allocates a new inode for given superblock. The default gfp_mask
- *	for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE.
+ *	for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
  *	If HIGHMEM pages are unsuitable or it is known that pages allocated
  *	for the page cache are not reclaimable or migratable,
  *	mapping_set_gfp_mask() must be called with suitable flags on the
-- 
cgit v1.2.3


From 0f64415d42760379753e6088787ce3fd3e069509 Mon Sep 17 00:00:00 2001
From: Dmitri Monakhov <dmonakhov@openvz.org>
Date: Tue, 6 Jan 2009 14:40:04 -0800
Subject: fs: truncate blocks outside i_size after O_DIRECT write error

In case of error, an extending write may have instantiated a few blocks
outside i_size.  We need to trim these blocks.  We have to do it
*regardless* of blocksize.  At least ext2, ext3 and reiserfs interpret the
(i_size < biggest block) condition as an error.  Fsck will complain about
the wrong i_size and will then fix the error by changing i_size according
to the biggest block.  This is bad because these blocks contain garbage
from the previous write attempt, which results in data corruption.

####TESTCASE_BEGIN
$touch /mnt/test/BIG_FILE
## at this moment /mnt/test/BIG_FILE size and blocks equal to zero
open("/mnt/test/BIG_FILE", O_WRONLY|O_CREAT|O_DIRECT, 0666) = 3
write(3, "aaaaaaaaaaaa"..., 104857600) = -1 ENOSPC (No space left on device)
## size and blocks shouldn't be changed because the write op failed.
$stat /mnt/test/BIG_FILE
File: `/mnt/test/BIG_FILE'
Size: 0 Blocks: 110896 IO Block: 1024 regular empty file
<<<<<<<<^^^^^^^^^^^^^^^^^^^^^^^^^^^^^file size is less than biggest block idx
Device: fe07h/65031d Inode: 14 Links: 1
Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2007-01-24 20:03:38.000000000 +0300
Modify: 2007-01-24 20:03:38.000000000 +0300
Change: 2007-01-24 20:03:39.000000000 +0300

#fsck.ext3 -f /dev/VG/test
e2fsck 1.39 (29-May-2006)
Pass 1: Checking inodes, blocks, and sizes
Inode 14, i_size is 0, should be 56556544. Fix<y>? yes
Pass 2: Checking directory structure
....
####TESTCASE_END

[akpm@linux-foundation.org: use i_size_read()]
Signed-off-by: Dmitri Monakhov <dmonakhov@openvz.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/direct-io.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index af0558dbe8b..b6d43908ff7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	retval = direct_io_worker(rw, iocb, inode, iov, offset,
 				nr_segs, blkbits, get_block, end_io, dio);
 
+	/*
+	 * In case of error extending write may have instantiated a few
+	 * blocks outside i_size. Trim these off again for DIO_LOCKING.
+	 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+	 * it's own meaner.
+	 */
+	if (unlikely(retval < 0 && (rw & WRITE))) {
+		loff_t isize = i_size_read(inode);
+
+		if (end > isize && dio_lock_type == DIO_LOCKING)
+			vmtruncate(inode, isize);
+	}
+
 	if (rw == READ && dio_lock_type == DIO_LOCKING)
 		release_i_mutex = 0;
 
-- 
cgit v1.2.3


From 91bf189c3a766927694ce9de7d545e96b23f20fc Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Tue, 6 Jan 2009 14:40:14 -0800
Subject: hugetlb: unsigned ret cannot be negative

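ret is declared unsigned long, so the `ret < 0' check can never be true
even though ret may be assigned -EFAULT.  Track the error code in a
separate signed variable instead.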

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Ken Chen <kenchen@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0ab0c6f5f43..6903d37af03 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 	for (;;) {
 		struct page *page;
 		unsigned long nr, ret;
+		int ra;
 
 		/* nr is the maximum number of bytes to copy from this page */
 		nr = huge_page_size(h);
@@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
 			 */
 			ret = len < nr ? len : nr;
 			if (clear_user(buf, ret))
-				ret = -EFAULT;
+				ra = -EFAULT;
+			else
+				ra = 0;
 		} else {
 			/*
 			 * We have the page, copy it to user space buffer.
 			 */
-			ret = hugetlbfs_read_actor(page, offset, buf, len, nr);
+			ra = hugetlbfs_read_actor(page, offset, buf, len, nr);
+			ret = ra;
 		}
-		if (ret < 0) {
+		if (ra < 0) {
 			if (retval == 0)
-				retval = ret;
+				retval = ra;
 			if (page)
 				page_cache_release(page);
 			goto out;
-- 
cgit v1.2.3


From 69e9930993cfd70d82c8d9dd96fc3a88854d06fc Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Tue, 6 Jan 2009 14:40:19 -0800
Subject: block_write_begin(): remove useless goto

Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index a13f09b696f..c26da785938 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping,
 			if (pos + len > inode->i_size)
 				vmtruncate(inode, inode->i_size);
 		}
-		goto out;
 	}
 
 out:
-- 
cgit v1.2.3


From e8ea1759138d4279869f52bfb7dca8f02f8ccfe5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 6 Jan 2009 14:40:23 -0800
Subject: UBIFS: do not use WB_SYNC_HOLD

WB_SYNC_HOLD is going to be zapped so we should not use it. Use
%WB_SYNC_NONE instead. Here is what akpm said:

"I think I'll just switch that to WB_SYNC_NONE.  The `wait==0' mode is
just an advisory thing to help the fs shove lots of data into the
queues.  If some gets missed then it'll be picked up on the second
->sync_fs call, with wait==1."

Thanks to Randy Dunlap for catching this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ubifs/super.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 0d7564b95f8..89556ee7251 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
 	struct writeback_control wbc = {
-		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
 		.range_start = 0,
 		.range_end   = LLONG_MAX,
 		.nr_to_write = LONG_MAX,
 	};
 
+	/*
+	 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
+	 * advisory thing to help the file system shove lots of data into the
+	 * queues. If some gets missed then it'll be picked up on the second
+	 * '->sync_fs()' call, with non-zero @wait.
+	 */
+
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-- 
cgit v1.2.3


From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:40:25 -0800
Subject: fs: remove WB_SYNC_HOLD

Remove WB_SYNC_HOLD.  The primary motivation is the design of my
anti-starvation code for fsync.  It requires taking an inode lock over the
sync operation, so we could run into lock ordering problems with multiple
inodes.  It is possible to take a single global lock to solve the ordering
problem, but then that would prevent a future nice implementation of "sync
multiple inodes" based on lock order via inode address.

Seems like a backward step to remove this, but actually it is busted
anyway: we can't use the inode lists for data integrity wait: an inode can
be taken off the dirty lists but still be under writeback.  In order to
satisfy data integrity semantics, we should wait for it to finish
writeback, but if we only search the dirty lists, we'll miss it.

It would be possible to have a "writeback" list, for sys_sync, I suppose.
But why complicate things by optimising prematurely?  For unmounting, we
could avoid the "livelock avoidance" code, which would be easier, but
again premature IMO.

Fixing the existing data integrity problem will come next.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0ff0b8cf30..d99601af9e4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * If we're a pdlfush thread, then implement pdflush collision avoidance
  * against the entire list.
  *
- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
- * that it can be located for waiting on in __writeback_single_inode().
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
-		if (wbc->sync_mode == WB_SYNC_HOLD) {
-			inode->dirtied_when = jiffies;
-			list_move(&inode->i_list, &sb->s_dirty);
-		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
 		if (wbc->pages_skipped != pages_skipped) {
@@ -588,8 +581,7 @@ restart:
 
 /*
  * writeback and wait upon the filesystem's dirty inodes.  The caller will
- * do this in two passes - one to write, and one to wait.  WB_SYNC_HOLD is
- * used to park the written inodes on sb->s_dirty for the wait pass.
+ * do this in two passes - one to write, and one to wait.
  *
  * A finite limit is set on the number of pages which will be written.
  * To prevent infinite livelock of sys_sync().
@@ -600,7 +592,7 @@ restart:
 void sync_inodes_sb(struct super_block *sb, int wait)
 {
 	struct writeback_control wbc = {
-		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+		.sync_mode	= wait ? WB_SYNC_ALL : WB_SYNC_NONE,
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
-- 
cgit v1.2.3


From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:40:25 -0800
Subject: fs: sync_sb_inodes fix

Fix data integrity semantics required by sys_sync, by iterating over all
inodes and waiting for any writeback pages after the initial writeout.
Comments explain the exact problem.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 53 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d99601af9e4..a9ee474f969 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
 				struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
+	int sync = wbc->sync_mode == WB_SYNC_ALL;
 
 	spin_lock(&inode_lock);
 	if (!wbc->for_kupdate || list_empty(&sb->s_io))
@@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		if (!list_empty(&sb->s_more_io))
 			wbc->more_io = 1;
 	}
-	spin_unlock(&inode_lock);
+
+	if (sync) {
+		struct inode *inode, *old_inode = NULL;
+
+		/*
+		 * Data integrity sync. Must wait for all pages under writeback,
+		 * because there may have been pages dirtied before our sync
+		 * call, but which had writeout started before we write it out.
+		 * In which case, the inode may not be on the dirty list, but
+		 * we still have to wait for that writeout.
+		 */
+		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+			struct address_space *mapping;
+
+			if (inode->i_state & (I_FREEING|I_WILL_FREE))
+				continue;
+			mapping = inode->i_mapping;
+			if (mapping->nrpages == 0)
+				continue;
+			__iget(inode);
+			spin_unlock(&inode_lock);
+			/*
+			 * We hold a reference to 'inode' so it couldn't have
+			 * been removed from s_inodes list while we dropped the
+			 * inode_lock.  We cannot iput the inode now as we can
+			 * be holding the last reference and we cannot iput it
+			 * under inode_lock. So we keep the reference and iput
+			 * it later.
+			 */
+			iput(old_inode);
+			old_inode = inode;
+
+			filemap_fdatawait(mapping);
+
+			cond_resched();
+
+			spin_lock(&inode_lock);
+		}
+		spin_unlock(&inode_lock);
+		iput(old_inode);
+	} else
+		spin_unlock(&inode_lock);
+
 	return;		/* Leave any unwritten inodes on s_io */
 }
 EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
@@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 		.range_start	= 0,
 		.range_end	= LLONG_MAX,
 	};
-	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
-	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
 
-	wbc.nr_to_write = nr_dirty + nr_unstable +
-			(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
-			nr_dirty + nr_unstable;
-	wbc.nr_to_write += wbc.nr_to_write / 2;		/* Bit more for luck */
+	if (!wait) {
+		unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+		unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+
+		wbc.nr_to_write = nr_dirty + nr_unstable +
+			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	} else
+		wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
+
 	sync_sb_inodes(sb, &wbc);
 }
 
-- 
cgit v1.2.3


From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:40:26 -0800
Subject: fs: sys_sync fix

s_syncing livelock avoidance was breaking data integrity guarantee of
sys_sync, by allowing sys_sync to skip writing or waiting for superblocks
if there is a concurrent sys_sync happening.

This livelock avoidance is much less important now that we don't have the
get_super_to_sync() call after every sb that we sync.  This was replaced
by __put_super_and_need_restart.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs-writeback.c | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a9ee474f969..e5eaa62fd17 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 	sync_sb_inodes(sb, &wbc);
 }
 
-/*
- * Rather lame livelock avoidance.
- */
-static void set_sb_syncing(int val)
-{
-	struct super_block *sb;
-	spin_lock(&sb_lock);
-	list_for_each_entry_reverse(sb, &super_blocks, s_list)
-		sb->s_syncing = val;
-	spin_unlock(&sb_lock);
-}
-
 /**
  * sync_inodes - writes all inodes to disk
  * @wait: wait for completion
@@ -690,9 +678,6 @@ static void __sync_inodes(int wait)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_syncing)
-			continue;
-		sb->s_syncing = 1;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
@@ -710,13 +695,10 @@ restart:
 
 void sync_inodes(int wait)
 {
-	set_sb_syncing(0);
 	__sync_inodes(0);
 
-	if (wait) {
-		set_sb_syncing(0);
+	if (wait)
 		__sync_inodes(1);
-	}
 }
 
 /**
-- 
cgit v1.2.3


From eaccbfa564e48c87626594511f42dc8c0ad2daae Mon Sep 17 00:00:00 2001
From: "Luiz Fernando N. Capitulino" <lcapitulino@mandriva.com.br>
Date: Tue, 6 Jan 2009 14:40:44 -0800
Subject: fs/exec.c:__bprm_mm_init(): clean up error handling

Untangle the error unwinding in this function, saving a test of local
variable `vma'.

Signed-off-by: Luiz Fernando N. Capitulino <lcapitulino@mandriva.com.br>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 9c33f542dc7..6ca4e3d22a3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -232,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
 
 static int __bprm_mm_init(struct linux_binprm *bprm)
 {
-	int err = -ENOMEM;
+	int err;
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
 	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma)
-		goto err;
+		return -ENOMEM;
 
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
@@ -251,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	 */
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-
 	vma->vm_flags = VM_STACK_FLAGS;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	err = insert_vm_struct(mm, vma);
-	if (err) {
-		up_write(&mm->mmap_sem);
+	if (err)
 		goto err;
-	}
 
 	mm->stack_vm = mm->total_vm = 1;
 	up_write(&mm->mmap_sem);
-
 	bprm->p = vma->vm_end - sizeof(void *);
-
 	return 0;
-
 err:
-	if (vma) {
-		bprm->vma = NULL;
-		kmem_cache_free(vm_area_cachep, vma);
-	}
-
+	up_write(&mm->mmap_sem);
+	bprm->vma = NULL;
+	kmem_cache_free(vm_area_cachep, vma);
 	return err;
 }
 
-- 
cgit v1.2.3


From 67ec7d3ab779ad9001ef57a6b4cfdf80ac9f9acc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 6 Jan 2009 14:40:57 -0800
Subject: fs: use menuconfig to control the Misc. filesystems menu

Have one option to control Miscellaneous filesystems.  This makes it easy
to disable all of them at one time.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f9b6e2979aa..32883589ee5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -721,7 +721,20 @@ config CONFIGFS_FS
 
 endmenu
 
-menu "Miscellaneous filesystems"
+menuconfig MISC_FILESYSTEMS
+	bool "Miscellaneous filesystems"
+	default y
+	---help---
+	  Say Y here to get to see options for various miscellaneous
+	  filesystems, such as filesystems that came from other
+	  operating systems.
+
+	  This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and
+	  disabled; if unsure, say Y here.
+
+if MISC_FILESYSTEMS
 
 config ADFS_FS
 	tristate "ADFS file system support (EXPERIMENTAL)"
@@ -1091,7 +1104,7 @@ config UFS_DEBUG
 	  Y here.  This will result in _many_ additional debugging messages to be
 	  written to the system log.
 
-endmenu
+endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
 	bool "Network File Systems"
-- 
cgit v1.2.3


From 5f820f648c92a5ecc771a96b3c29aa6e90013bba Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Tue, 6 Jan 2009 14:40:59 -0800
Subject: poll: allow f_op->poll to sleep

f_op->poll is the only vfs operation which is not allowed to sleep.  It's
because poll and select implementation used task state to synchronize
against wake ups, which doesn't have to be the case anymore as wait/wake
interface can now use custom wake up functions.  The non-sleep restriction
can be a bit tricky because ->poll is not called from an atomic context
and the result of accidentally sleeping in ->poll only shows up as
temporary busy looping when the timing is right or rather wrong.

This patch converts poll/select to use custom wake up function and use
separate triggered variable to synchronize against wake up events.  The
only added overhead is an extra function call during wake up, which is
negligible.

This patch removes the one non-sleep exception from vfs locking rules and
is beneficial to userland filesystem implementations like FUSE, 9p or
peculiar fs like spufs as it's very difficult for those to implement
non-sleeping poll method.

While at it, make the following cosmetic changes to make poll.h and
select.c checkpatch friendly.

* s/type * symbol/type *symbol/		   : three places in poll.h
* remove blank line before EXPORT_SYMBOL() : two places in select.c

Oleg: spotted missing barrier in poll_schedule_timeout()
Davide: spotted missing write barrier in pollwake()

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Brad Boyer <flar@allandria.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Roland McGrath <roland@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 62 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 87df51eadcf..08b91beed80 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 void poll_initwait(struct poll_wqueues *pwq)
 {
 	init_poll_funcptr(&pwq->pt, __pollwait);
+	pwq->polling_task = current;
 	pwq->error = 0;
 	pwq->table = NULL;
 	pwq->inline_index = 0;
 }
-
 EXPORT_SYMBOL(poll_initwait);
 
 static void free_poll_entry(struct poll_table_entry *entry)
@@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq)
 		free_page((unsigned long) old);
 	}
 }
-
 EXPORT_SYMBOL(poll_freewait);
 
-static struct poll_table_entry *poll_get_entry(poll_table *_p)
+static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
 {
-	struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
 	struct poll_table_page *table = p->table;
 
 	if (p->inline_index < N_INLINE_POLL_ENTRIES)
@@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
 		if (!new_table) {
 			p->error = -ENOMEM;
-			__set_current_state(TASK_RUNNING);
 			return NULL;
 		}
 		new_table->entry = new_table->entries;
@@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p)
 	return table->entry++;
 }
 
+static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct poll_wqueues *pwq = wait->private;
+	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
+
+	/*
+	 * Although this function is called under waitqueue lock, LOCK
+	 * doesn't imply write barrier and the users expect write
+	 * barrier semantics on wakeup functions.  The following
+	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+	 * and is paired with set_mb() in poll_schedule_timeout.
+	 */
+	smp_wmb();
+	pwq->triggered = 1;
+
+	/*
+	 * Perform the default wake up operation using a dummy
+	 * waitqueue.
+	 *
+	 * TODO: This is hacky but there currently is no interface to
+	 * pass in @sync.  @sync is scheduled to be removed and once
+	 * that happens, wake_up_process() can be used directly.
+	 */
+	return default_wake_function(&dummy_wait, mode, sync, key);
+}
+
 /* Add a new entry */
 static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 				poll_table *p)
 {
-	struct poll_table_entry *entry = poll_get_entry(p);
+	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
+	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	init_waitqueue_entry(&entry->wait, current);
+	init_waitqueue_func_entry(&entry->wait, pollwake);
+	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
 }
 
+int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+			  ktime_t *expires, unsigned long slack)
+{
+	int rc = -EINTR;
+
+	set_current_state(state);
+	if (!pwq->triggered)
+		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
+	__set_current_state(TASK_RUNNING);
+
+	/*
+	 * Prepare for the next iteration.
+	 *
+	 * The following set_mb() serves two purposes.  First, it's
+	 * the counterpart rmb of the wmb in pollwake() such that data
+	 * written before wake up is always visible after wake up.
+	 * Second, the full barrier guarantees that triggered clearing
+	 * doesn't pass event check of the next iteration.  Note that
+	 * this problem doesn't exist for the first iteration as
+	 * add_wait_queue() has full barrier semantics.
+	 */
+	set_mb(pwq->triggered, 0);
+
+	return rc;
+}
+EXPORT_SYMBOL(poll_schedule_timeout);
+
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
  * @to:		pointer to timespec variable for the final timeout
@@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 	for (;;) {
 		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
 
-		set_current_state(TASK_INTERRUPTIBLE);
-
 		inp = fds->in; outp = fds->out; exp = fds->ex;
 		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
 
@@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+					   to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 
 	poll_freewait(&table);
 
@@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 	for (;;) {
 		struct poll_list *walk;
 
-		set_current_state(TASK_INTERRUPTIBLE);
 		for (walk = list; walk != NULL; walk = walk->next) {
 			struct pollfd * pfd, * pfd_end;
 
@@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds,  struct poll_list *list,
 			to = &expire;
 		}
 
-		if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
 			timed_out = 1;
 	}
-	__set_current_state(TASK_RUNNING);
 	return count;
 }
 
-- 
cgit v1.2.3


From 179f7ebff6be45738c6e2fa68c8d2cc5c2c6308e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Tue, 6 Jan 2009 14:41:04 -0800
Subject: percpu_counter: FBC_BATCH should be a variable

For NR_CPUS >= 16 values, FBC_BATCH is 2*NR_CPUS

Considering more and more distros are using high NR_CPUS values, it makes
sense to use a more sensible value for FBC_BATCH, and get rid of NR_CPUS.

A sensible value is 2*num_online_cpus(), with a minimum value of 32 (This
minimum value helps branch prediction in __percpu_counter_add())

We already have a hotcpu notifier, so we can adjust FBC_BATCH dynamically.

We rename FBC_BATCH to percpu_counter_batch since it's not a constant
anymore.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/ext4.h  | 6 +++---
 fs/ext4/inode.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b0537c82702..6c46c648430 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1225,11 +1225,11 @@ do {								\
 } while (0)
 
 #ifdef CONFIG_SMP
-/* Each CPU can accumulate FBC_BATCH blocks in their local
+/* Each CPU can accumulate percpu_counter_batch blocks in their local
  * counters. So we need to make sure we have free blocks more
- * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
+ * than percpu_counter_batch  * nr_cpu_ids. Also add a window of 4 times.
  */
-#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
 #else
 #define EXT4_FREEBLOCKS_WATERMARK 0
 #endif
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6702a49992a..98d3fe7057e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2498,7 +2498,7 @@ static int ext4_nonda_switch(struct super_block *sb)
 	/*
 	 * switch to non delalloc mode if we are running low
 	 * on free block. The free block accounting via percpu
-	 * counters can get slightly wrong with FBC_BATCH getting
+	 * counters can get slightly wrong with percpu_counter_batch getting
 	 * accumulated on each CPU without updating global counters
 	 * Delalloc need an accurate free block accounting. So switch
 	 * to non delalloc when we are near to error range.
-- 
cgit v1.2.3


From 8c4018884a49eb2c6c7ca90804f331b12983561c Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 6 Jan 2009 14:41:08 -0800
Subject: fs: fix name overwrite in __register_chrdev_region()

It's possible to register a chrdev with a name size exactly the same as
was allocated in the structure.  This does not seem to be intended
behaviour.

At least chrdev_show does not like it.

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/char_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/char_dev.c b/fs/char_dev.c
index 700697a7261..38f71222a55 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor,
 	cd->major = major;
 	cd->baseminor = baseminor;
 	cd->minorct = minorct;
-	strncpy(cd->name,name, 64);
+	strlcpy(cd->name, name, sizeof(cd->name));
 
 	i = major_to_index(major);
 
-- 
cgit v1.2.3


From ca8a5bd28226d62c045e0e55b42b9d10b146c205 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Tue, 6 Jan 2009 14:41:09 -0800
Subject: add missing accounting calls to compat_sys_{readv,writev}

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index d1ece79b641..30f2faa22f5 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign
 	ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos);
 
 out:
+	if (ret > 0)
+		add_rchar(current, ret);
+	inc_syscr(current);
 	fput(file);
 	return ret;
 }
@@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig
 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos);
 
 out:
+	if (ret > 0)
+		add_wchar(current, ret);
+	inc_syscw(current);
 	fput(file);
 	return ret;
 }
-- 
cgit v1.2.3


From 350eaf791bebccb9ad5999351f3e328319545f03 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 6 Jan 2009 14:41:11 -0800
Subject: do_coredump(): check return from argv_split()

do_coredump() accesses helper_argv[0] without checking helper_argv !=
NULL.  helper_argv can be NULL if page allocation failed.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 6ca4e3d22a3..6b09d6fa4f7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1770,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 
  	if (ispipe) {
 		helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+		if (!helper_argv) {
+			printk(KERN_WARNING "%s failed to allocate memory\n",
+			       __func__);
+			goto fail_unlock;
+		}
 		/* Terminate the string before the first option */
 		delimit = strchr(corename, ' ');
 		if (delimit)
-- 
cgit v1.2.3


From 0bc02f3fa433a98631a932e77c4b1f873da35aee Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 6 Jan 2009 14:41:13 -0800
Subject: fs/inode: fix kernel-doc notation

Fix kernel-doc notation:

Warning(linux-2.6.28-git3//fs/inode.c:120): No description found for parameter 'sb'
Warning(linux-2.6.28-git3//fs/inode.c:120): No description found for parameter 'inode'
Warning(linux-2.6.28-git3//fs/inode.c:588): No description found for parameter 'sb'
Warning(linux-2.6.28-git3//fs/inode.c:588): No description found for parameter 'inode'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/inode.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index a903860bc5a..7a6e8c2ff7b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -110,8 +110,8 @@ static void wake_up_inode(struct inode *inode)
 
 /**
  * inode_init_always - perform inode structure intialisation
- * @sb		- superblock inode belongs to.
- * @inode	- inode to initialise
+ * @sb: superblock inode belongs to
+ * @inode: inode to initialise
  *
  * These are initializations that need to be done on every inode
  * allocation as the fields are not initialised by slab allocation.
@@ -576,8 +576,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
 
 /**
  * inode_add_to_lists - add a new inode to relevant lists
- * @sb		- superblock inode belongs to.
- * @inode	- inode to mark in use
+ * @sb: superblock inode belongs to
+ * @inode: inode to mark in use
  *
  * When an inode is allocated it needs to be accounted for, added to the in use
  * list, the owning superblock and the inode hash. This needs to be done under
-- 
cgit v1.2.3


From 94e2959e7a6a4ef0969932c30349ce6f4469a3cf Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Tue, 6 Jan 2009 14:41:15 -0800
Subject: fs: fix function param name in kernel-doc

Fix function parameter name in kernel-doc:

Warning(linux-2.6.28-git5//fs/block_dev.c:1272): No description found for parameter 'pathname'
Warning(linux-2.6.28-git5//fs/block_dev.c:1272): Excess function parameter 'path' description in 'lookup_bdev'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 349a26c1000..b957717e25a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1262,7 +1262,7 @@ EXPORT_SYMBOL(ioctl_by_bdev);
 
 /**
  * lookup_bdev  - lookup a struct block_device by name
- * @path:	special file representing the block device
+ * @pathname:	special file representing the block device
  *
  * Get a reference to the blockdevice at @pathname in the current
  * namespace if possible and return it.  Return ERR_PTR(error)
-- 
cgit v1.2.3


From 87113e806a9ee48c6c989513ef3e9c1d31e06ac4 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Tue, 6 Jan 2009 14:41:38 -0800
Subject: fs/binfmt_misc.c: add terminating newline to
 /proc/sys/fs/binfmt_misc/status

The following is what it looks like before patching.
It is not very readable.

user@ubuntu:/proc/sys/fs/binfmt_misc$ cat status
enableduser@ubuntu:/proc/sys/fs/binfmt_misc$

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index e1158cb4fbd..c4e83537ead 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -649,7 +649,7 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled" : "disabled";
+	char *s = enabled ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
-- 
cgit v1.2.3


From ee9ef6b778fbe1cacbec4fcd18a93f322ff93354 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Tue, 6 Jan 2009 14:41:39 -0800
Subject: fs/ncpfs/getopt.c: cleanup kerneldoc

There is no argument named @flag in ncp_getopt(), so remove it.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Petr Vandrovec <VANDROVE@vc.cvut.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ncpfs/getopt.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 335b003dddf..0af3349de85 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -16,7 +16,6 @@
  *	@opts: an array of &struct option entries controlling parser operations
  *	@optopt: output; will contain the current option
  *	@optarg: output; will contain the value (if one exists)
- *	@flag: output; may be NULL; should point to a long for or'ing flags
  *	@value: output; may be NULL; will be overwritten with the integer value
  *		of the current argument.
  *
-- 
cgit v1.2.3


From 9c79f34f7ee71cd28272332b424ca64b2be006ab Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:41:57 -0800
Subject: eCryptfs: Filename Encryption: Tag 70 packets

This patchset implements filename encryption via a passphrase-derived
mount-wide Filename Encryption Key (FNEK) specified as a mount parameter.
Each encrypted filename has a fixed prefix indicating that eCryptfs should
try to decrypt the filename.  When eCryptfs encounters this prefix, it
decodes the filename into a tag 70 packet and then decrypts the packet
contents using the FNEK, setting the filename to the decrypted filename.
Both unencrypted and encrypted filenames can reside in the same lower
filesystem.

Because filename encryption expands the length of the filename during the
encoding stage, eCryptfs will not properly handle filenames that are
already near the maximum filename length.

In the present implementation, eCryptfs must be able to produce a match
against the lower encrypted and encoded filename representation when given
a plaintext filename.  Therefore, two files having the same plaintext name
will encrypt and encode into the same lower filename if they are both
encrypted using the same FNEK.  This can be changed by finding a way to
replace the prepended bytes in the block-aligned filename with random
characters; they are hashes of the FNEK right now, so that it is possible
to deterministically map from a plaintext filename to an encrypted and
encoded filename in the lower filesystem.  An implementation using random
characters will have to decode and decrypt every single directory entry in
any given directory any time an event occurs wherein the VFS needs to
determine whether a particular file exists in the lower directory and the
decrypted and decoded filenames have not yet been extracted for that
directory.

Thanks to Tyler Hicks and David Kleikamp for assistance in the development
of this patchset.

This patch:

A tag 70 packet contains a filename encrypted with a Filename Encryption
Key (FNEK).  This patch implements functions for writing and parsing tag
70 packets.  This patch also adds definitions and extends structures to
support filename encryption.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c          |  11 +-
 fs/ecryptfs/ecryptfs_kernel.h |  38 ++-
 fs/ecryptfs/keystore.c        | 634 +++++++++++++++++++++++++++++++++++++-----
 3 files changed, 613 insertions(+), 70 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 6046239465a..485732751f0 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1149,19 +1149,20 @@ ecryptfs_cipher_code_str_map[] = {
 
 /**
  * ecryptfs_code_for_cipher_string
- * @crypt_stat: The cryptographic context
+ * @cipher_name: The string alias for the cipher
+ * @key_bytes: Length of key in bytes; used for AES code selection
  *
  * Returns zero on no match, or the cipher code on match
  */
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes)
 {
 	int i;
 	u8 code = 0;
 	struct ecryptfs_cipher_code_str_map_elem *map =
 		ecryptfs_cipher_code_str_map;
 
-	if (strcmp(crypt_stat->cipher, "aes") == 0) {
-		switch (crypt_stat->key_size) {
+	if (strcmp(cipher_name, "aes") == 0) {
+		switch (key_bytes) {
 		case 16:
 			code = RFC2440_CIPHER_AES_128;
 			break;
@@ -1173,7 +1174,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat)
 		}
 	} else {
 		for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++)
-			if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){
+			if (strcmp(cipher_name, map[i].cipher_str) == 0) {
 				code = map[i].cipher_code;
 				break;
 			}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a75026d35d1..76a95bd8819 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -199,6 +199,7 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_DEFAULT_CIPHER "aes"
 #define ECRYPTFS_DEFAULT_KEY_BYTES 16
 #define ECRYPTFS_DEFAULT_HASH "md5"
+#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH
 #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01
 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C
 #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED
@@ -206,7 +207,25 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41
 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42
 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43
+#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename
+					  * as dentry name */
+#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in
+					  * metadata */
+#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as
+					  * dentry name */
+#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
+					  * metadata */
+/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
+ * ECRYPTFS_MAX_IV_BYTES */
+#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
+#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */
 #define MD5_DIGEST_SIZE 16
+#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED."
+#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED."
+#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24
+#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32)
 
 struct ecryptfs_key_sig {
 	struct list_head crypt_stat_list;
@@ -332,13 +351,20 @@ struct ecryptfs_mount_crypt_stat {
 #define ECRYPTFS_XATTR_METADATA_ENABLED        0x00000002
 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED        0x00000004
 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED  0x00000008
+#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES      0x00000010
+#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
+#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
 	u32 flags;
 	struct list_head global_auth_tok_list;
 	struct mutex global_auth_tok_list_mutex;
 	size_t num_global_auth_toks;
 	size_t global_default_cipher_key_size;
+	size_t global_default_fn_cipher_key_bytes;
 	unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
 						 + 1];
+	unsigned char global_default_fn_cipher_name[
+		ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+	char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1];
 };
 
 /* superblock private data. */
@@ -599,7 +625,7 @@ int ecryptfs_read_and_validate_header_region(char *data,
 					     struct inode *ecryptfs_inode);
 int ecryptfs_read_and_validate_xattr_region(char *page_virt,
 					    struct dentry *ecryptfs_dentry);
-u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat);
+u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
 void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
 int ecryptfs_generate_key_packet_set(char *dest_base,
@@ -694,5 +720,15 @@ int ecryptfs_privileged_open(struct file **lower_file,
 			     struct vfsmount *lower_mnt,
 			     const struct cred *cred);
 int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size);
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 0d713b69194..dafceb5560e 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -402,6 +402,569 @@ out:
 	return rc;
 }
 
+/**
+ * ecryptfs_find_global_auth_tok_for_sig
+ * @global_auth_tok: Set to the list entry whose sig matches; NULL if
+ *                   no entry matches
+ * @mount_crypt_stat: Mount-wide crypto context whose
+ *                    global_auth_tok_list is searched
+ * @sig: Signature to look for; the first ECRYPTFS_SIG_SIZE_HEX bytes
+ *       are compared
+ *
+ * The list is walked with global_auth_tok_list_mutex held.
+ *
+ * Returns zero when a matching entry was found; -EINVAL otherwise
+ */
+static int
+ecryptfs_find_global_auth_tok_for_sig(
+	struct ecryptfs_global_auth_tok **global_auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
+{
+	struct ecryptfs_global_auth_tok *entry;
+	int rc = -EINVAL;
+
+	(*global_auth_tok) = NULL;
+	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	list_for_each_entry(entry,
+			    &mount_crypt_stat->global_auth_tok_list,
+			    mount_crypt_stat_list) {
+		if (memcmp(entry->sig, sig, ECRYPTFS_SIG_SIZE_HEX) != 0)
+			continue;
+		/* First match wins */
+		(*global_auth_tok) = entry;
+		rc = 0;
+		break;
+	}
+	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
+	return rc;
+}
+
+/**
+ * ecryptfs_find_auth_tok_for_sig
+ * @auth_tok: Set to the matching auth_tok; cleared to NULL before the
+ *            lookup starts
+ * @mount_crypt_stat: Mount-wide crypto context whose registered global
+ *                    auth_toks are searched first
+ * @sig: Sig of auth_tok to find
+ *
+ * The auth_toks registered on the mount_crypt_stat are consulted
+ * first. When none of them carries @sig, the lookup falls back to
+ * ecryptfs_keyring_auth_tok_for_sig(). This function could
+ * potentially try a lot harder to find auth_tok's (e.g., by calling
+ * out to ecryptfsd to dynamically retrieve an auth_tok object) so
+ * that static registration of auth_tok's will no longer be necessary.
+ *
+ * NOTE(review): the struct key handed back by the keyring lookup is
+ * not used or released here -- confirm whether a reference is held.
+ *
+ * Returns zero on no error; non-zero on error
+ */
+static int
+ecryptfs_find_auth_tok_for_sig(
+	struct ecryptfs_auth_tok **auth_tok,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	char *sig)
+{
+	struct ecryptfs_global_auth_tok *global_auth_tok;
+	struct key *auth_tok_key;
+
+	(*auth_tok) = NULL;
+	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
+						  mount_crypt_stat, sig) == 0) {
+		/* Registered on the mount: use it directly */
+		(*auth_tok) = global_auth_tok->global_auth_tok;
+		return 0;
+	}
+	/* Not registered on the mount: ask the keyring instead */
+	return ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
+						 sig);
+}
+
+/**
+ * write_tag_70_packet can gobble a lot of stack space. We stuff most
+ * of the function's working variables in a kmalloc'd struct to help
+ * reduce eCryptfs' overall stack usage.
+ */
+struct ecryptfs_write_tag_70_packet_silly_stack {
+	/* Code for the mount-wide filename cipher; stored in the packet */
+	u8 cipher_code;
+	/* Largest size the packet can take for the given filename */
+	size_t max_packet_size;
+	/* Octets used by the encoded packet-length field */
+	size_t packet_size_len;
+	/* Prefix + separator + filename, padded to the cipher block size */
+	size_t block_aligned_filename_size;
+	/* Block size of the filename cipher */
+	size_t block_size;
+	/* Write offset into the destination buffer */
+	size_t i;
+	/* Index used while filling the prefix bytes */
+	size_t j;
+	/* Length of the prefix, including the \0 separator */
+	size_t num_rand_bytes;
+	/* Mutex obtained together with desc.tfm; held while the tfm is used */
+	struct mutex *tfm_mutex;
+	/* kzalloc'd plaintext buffer: prefix, \0 separator, filename */
+	char *block_aligned_filename;
+	/* Auth tok located for the mount-wide FNEK signature */
+	struct ecryptfs_auth_tok *auth_tok;
+	/* Scatterlist over block_aligned_filename (cipher input) */
+	struct scatterlist src_sg;
+	/* Scatterlist over the destination buffer (cipher output) */
+	struct scatterlist dst_sg;
+	/* Block cipher descriptor used to encrypt the filename */
+	struct blkcipher_desc desc;
+	/* IV handed to the cipher; filled with zeros */
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	/* Current digest from which the prefix bytes are taken */
+	char hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	/* Scratch digest used when re-hashing hash[] */
+	char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE];
+	/* Hash descriptor for the ECRYPTFS_TAG_70_DIGEST transform */
+	struct hash_desc hash_desc;
+	/* Scatterlist over the data currently being hashed */
+	struct scatterlist hash_sg;
+};
+
+/**
+ * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK
+ * @dest: Buffer that receives the packet. When NULL, nothing is
+ *        written and only the maximum packet size is reported through
+ *        @packet_size
+ * @remaining_bytes: Space available in @dest; reduced by the number of
+ *                   bytes written on success. Not dereferenced when
+ *                   @dest is NULL
+ * @packet_size: Set to the number of bytes written to @dest, or to the
+ *               maximum packet size when @dest is NULL
+ * @mount_crypt_stat: Mount-wide crypto context supplying the filename
+ *                    cipher name, the filename key size and the FNEK
+ *                    signature
+ * @filename: Plaintext filename; @filename_size bytes are copied from it
+ * @filename_size: Number of bytes in @filename
+ *
+ * This is the simplest mechanism for achieving filename encryption in
+ * eCryptfs. It encrypts the given filename with the mount-wide
+ * filename encryption key (FNEK) and stores it in a packet to @dest,
+ * which the caller will encode and write directly into the dentry
+ * name.
+ *
+ * Returns zero on success and a negative errno on most failures. The
+ * two scatterlist-conversion failure paths hand back the return value
+ * of virt_to_scatterlist() unchanged.
+ */
+int
+ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *filename, size_t filename_size)
+{
+	struct ecryptfs_write_tag_70_packet_silly_stack *s;
+	int rc = 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%Zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		/* Without this the caller would see success */
+		rc = -ENOMEM;
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	(*packet_size) = 0;
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
+		&s->desc.tfm,
+		&s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       mount_crypt_stat->global_default_fn_cipher_name, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	s->block_size = crypto_blkcipher_blocksize(s->desc.tfm);
+	/* Plus one for the \0 separator between the random prefix
+	 * and the plaintext filename */
+	s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1);
+	s->block_aligned_filename_size = (s->num_rand_bytes + filename_size);
+	/* Grow the prefix until prefix + filename fills whole blocks */
+	if ((s->block_aligned_filename_size % s->block_size) != 0) {
+		s->num_rand_bytes += (s->block_size
+				      - (s->block_aligned_filename_size
+					 % s->block_size));
+		s->block_aligned_filename_size = (s->num_rand_bytes
+						  + filename_size);
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random characters, a \0
+	 *    separator, and then the filename */
+	s->max_packet_size = (1                   /* Tag 70 identifier */
+			      + 3                 /* Max Tag 70 packet size */
+			      + ECRYPTFS_SIG_SIZE /* FNEK sig */
+			      + 1                 /* Cipher identifier */
+			      + s->block_aligned_filename_size);
+	/* Size query only: report the upper bound and write nothing */
+	if (dest == NULL) {
+		(*packet_size) = s->max_packet_size;
+		goto out_unlock;
+	}
+	if (s->max_packet_size > (*remaining_bytes)) {
+		printk(KERN_WARNING "%s: Require [%Zd] bytes to write; only "
+		       "[%Zd] available\n", __func__, s->max_packet_size,
+		       (*remaining_bytes));
+		rc = -EINVAL;
+		goto out_unlock;
+	}
+	/* Zeroed, so the byte after the prefix is already the \0
+	 * separator */
+	s->block_aligned_filename = kzalloc(s->block_aligned_filename_size,
+					    GFP_KERNEL);
+	if (!s->block_aligned_filename) {
+		printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
+		       "kzalloc [%Zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	s->i = 0;
+	dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE;
+	rc = ecryptfs_write_packet_length(&dest[s->i],
+					  (ECRYPTFS_SIG_SIZE
+					   + 1 /* Cipher code */
+					   + s->block_aligned_filename_size),
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error generating tag 70 packet "
+		       "header; cannot generate packet length; rc = [%d]\n",
+		       __func__, rc);
+		goto out_free_unlock;
+	}
+	s->i += s->packet_size_len;
+	ecryptfs_from_hex(&dest[s->i],
+			  mount_crypt_stat->global_default_fnek_sig,
+			  ECRYPTFS_SIG_SIZE);
+	s->i += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = ecryptfs_code_for_cipher_string(
+		mount_crypt_stat->global_default_fn_cipher_name,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (s->cipher_code == 0) {
+		printk(KERN_WARNING "%s: Unable to generate code for "
+		       "cipher [%s] with key bytes [%Zd]\n", __func__,
+		       mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	dest[s->i++] = s->cipher_code;
+	rc = ecryptfs_find_auth_tok_for_sig(
+		&s->auth_tok, mount_crypt_stat,
+		mount_crypt_stat->global_default_fnek_sig);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__,
+		       mount_crypt_stat->global_default_fnek_sig, rc);
+		goto out_free_unlock;
+	}
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	/* The prefix is not random: it is a digest chain seeded with
+	 * the session key encryption key */
+	sg_init_one(
+		&s->hash_sg,
+		(u8 *)s->auth_tok->token.password.session_key_encryption_key,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0,
+					     CRYPTO_ALG_ASYNC);
+	if (IS_ERR(s->hash_desc.tfm)) {
+		rc = PTR_ERR(s->hash_desc.tfm);
+		printk(KERN_ERR "%s: Error attempting to "
+		       "allocate hash crypto context; rc = [%d]\n",
+		       __func__, rc);
+		goto out_free_unlock;
+	}
+	rc = crypto_hash_init(&s->hash_desc);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error initializing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_update(
+		&s->hash_desc, &s->hash_sg,
+		s->auth_tok->token.password.session_key_encryption_key_bytes);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error updating crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_hash_final(&s->hash_desc, s->hash);
+	if (rc) {
+		printk(KERN_ERR
+		       "%s: Error finalizing crypto hash; rc = [%d]\n",
+		       __func__, rc);
+		goto out_release_free_unlock;
+	}
+	/* Fill every prefix byte except the last, which stays \0 and
+	 * acts as the separator */
+	for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) {
+		s->block_aligned_filename[s->j] =
+			s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)];
+		/* Digest used up: hash it again for the next bytes */
+		if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)
+		    == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) {
+			sg_init_one(&s->hash_sg, (u8 *)s->hash,
+				    ECRYPTFS_TAG_70_DIGEST_SIZE);
+			rc = crypto_hash_init(&s->hash_desc);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error initializing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_update(&s->hash_desc, &s->hash_sg,
+						ECRYPTFS_TAG_70_DIGEST_SIZE);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error updating crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			rc = crypto_hash_final(&s->hash_desc, s->tmp_hash);
+			if (rc) {
+				printk(KERN_ERR
+				       "%s: Error finalizing crypto hash; "
+				       "rc = [%d]\n", __func__, rc);
+				goto out_release_free_unlock;
+			}
+			memcpy(s->hash, s->tmp_hash,
+			       ECRYPTFS_TAG_70_DIGEST_SIZE);
+		}
+		/* A \0 inside the prefix would be mistaken for the
+		 * separator when the packet is parsed */
+		if (s->block_aligned_filename[s->j] == '\0')
+			s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL;
+	}
+	memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename,
+	       filename_size);
+	rc = virt_to_scatterlist(s->block_aligned_filename,
+				 s->block_aligned_filename_size, &s->src_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%Zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size,
+				 &s->dst_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%Zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_release_free_unlock;
+	}
+	/* The characters in the first block effectively do the job
+	 * of the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%Zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_release_free_unlock;
+	}
+	rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_release_free_unlock;
+	}
+	s->i += s->block_aligned_filename_size;
+	(*packet_size) = s->i;
+	(*remaining_bytes) -= (*packet_size);
+out_release_free_unlock:
+	crypto_free_hash(s->hash_desc.tfm);
+out_free_unlock:
+	/* Wipe the plaintext filename before releasing the buffer */
+	memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
+	kfree(s->block_aligned_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	kfree(s);
+	return rc;
+}
+
+/*
+ * Working variables of ecryptfs_parse_tag_70_packet(), kept in a
+ * kmalloc'd struct rather than on the stack.
+ */
+struct ecryptfs_parse_tag_70_packet_silly_stack {
+	/* Cipher code read from the packet */
+	u8 cipher_code;
+	size_t max_packet_size;
+	/* Octets used by the encoded packet-length field */
+	size_t packet_size_len;
+	/* Packet size as decoded from the packet-length field */
+	size_t parsed_tag_70_packet_size;
+	/* Size of the encrypted filename portion of the packet */
+	size_t block_aligned_filename_size;
+	size_t block_size;
+	/* Index used to locate the \0 separator after decryption */
+	size_t i;
+	/* Mutex obtained together with desc.tfm; held while the tfm is used */
+	struct mutex *tfm_mutex;
+	/* kmalloc'd buffer receiving prefix, \0 separator and filename */
+	char *decrypted_filename;
+	/* Auth tok located for the FNEK signature found in the packet */
+	struct ecryptfs_auth_tok *auth_tok;
+	/* Scatterlist over the encrypted filename inside the packet */
+	struct scatterlist src_sg;
+	/* Scatterlist over decrypted_filename */
+	struct scatterlist dst_sg;
+	/* Block cipher descriptor used to decrypt the filename */
+	struct blkcipher_desc desc;
+	/* FNEK signature from the packet, as a \0-terminated hex string */
+	char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
+	/* IV handed to the cipher; filled with zeros */
+	char iv[ECRYPTFS_MAX_IV_BYTES];
+	/* Cipher name for cipher_code; the +1 leaves room for the \0
+	 * terminator, like the other cipher name buffers */
+	char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
+};
+
+/**
+ * parse_tag_70_packet - Parse and process FNEK-encrypted filename packet
+ * @filename: This function kmalloc's the memory for the filename and
+ *            \0-terminates it; set to NULL on error. The caller is
+ *            presumably expected to kfree() it
+ * @filename_size: Set to the length of the decrypted filename, not
+ *                 counting the terminator; 0 on error
+ * @packet_size: Set to the number of bytes of @data that make up the
+ *               packet; 0 on error
+ * @mount_crypt_stat: Mount-wide crypto context used to find the auth
+ *                    tok and the filename key size
+ * @data: Start of the tag 70 packet
+ * @max_packet_size: Number of bytes available at @data
+ *
+ * Returns zero on success; non-zero on error
+ */
+int
+ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
+			     size_t *packet_size,
+			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+			     char *data, size_t max_packet_size)
+{
+	struct ecryptfs_parse_tag_70_packet_silly_stack *s;
+	int rc = 0;
+
+	(*packet_size) = 0;
+	(*filename_size) = 0;
+	(*filename) = NULL;
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s) {
+		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
+		       "[%Zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		/* Without this the caller would see success and a NULL
+		 * filename */
+		rc = -ENOMEM;
+		goto out;
+	}
+	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+	if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
+		printk(KERN_WARNING "%s: max_packet_size is [%Zd]; it must be "
+		       "at least [%d]\n", __func__, max_packet_size,
+			(1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
+		rc = -EINVAL;
+		goto out;
+	}
+	/* Octet 0: Tag 70 identifier
+	 * Octets 1-N1: Tag 70 packet size (includes cipher identifier
+	 *              and block-aligned encrypted filename size)
+	 * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE)
+	 * Octet N2-N3: Cipher identifier (1 octet)
+	 * Octets N3-N4: Block-aligned encrypted filename
+	 *  - Consists of a minimum number of random numbers, a \0
+	 *    separator, and then the filename */
+	if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) {
+		printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be "
+		       "tag [0x%.2x]\n", __func__,
+		       data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE);
+		rc = -EINVAL;
+		goto out;
+	}
+	rc = ecryptfs_parse_packet_length(&data[(*packet_size)],
+					  &s->parsed_tag_70_packet_size,
+					  &s->packet_size_len);
+	if (rc) {
+		printk(KERN_WARNING "%s: Error parsing packet length; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out;
+	}
+	/* The packet must hold the FNEK sig, the cipher code and at
+	 * least one octet of encrypted filename; anything shorter would
+	 * make the unsigned subtraction below wrap around. */
+	if (s->parsed_tag_70_packet_size < (ECRYPTFS_SIG_SIZE + 1 + 1)) {
+		printk(KERN_WARNING "%s: Parsed tag 70 packet size is [%Zd]; "
+		       "it must be at least [%d]\n", __func__,
+		       s->parsed_tag_70_packet_size,
+		       (ECRYPTFS_SIG_SIZE + 1 + 1));
+		rc = -EINVAL;
+		goto out;
+	}
+	s->block_aligned_filename_size = (s->parsed_tag_70_packet_size
+					  - ECRYPTFS_SIG_SIZE - 1);
+	if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
+	    > max_packet_size) {
+		printk(KERN_WARNING "%s: max_packet_size is [%Zd]; real packet "
+		       "size is [%Zd]\n", __func__, max_packet_size,
+		       (1 + s->packet_size_len
+			+ s->parsed_tag_70_packet_size));
+		rc = -EINVAL;
+		goto out;
+	}
+	(*packet_size) += s->packet_size_len;
+	ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)],
+			ECRYPTFS_SIG_SIZE);
+	s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0';
+	(*packet_size) += ECRYPTFS_SIG_SIZE;
+	s->cipher_code = data[(*packet_size)++];
+	rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code);
+	if (rc) {
+		printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n",
+		       __func__, s->cipher_code);
+		goto out;
+	}
+	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
+							&s->tfm_mutex,
+							s->cipher_string);
+	if (unlikely(rc)) {
+		printk(KERN_ERR "Internal error whilst attempting to get "
+		       "tfm and mutex for cipher name [%s]; rc = [%d]\n",
+		       s->cipher_string, rc);
+		goto out;
+	}
+	mutex_lock(s->tfm_mutex);
+	rc = virt_to_scatterlist(&data[(*packet_size)],
+				 s->block_aligned_filename_size, &s->src_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert encrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%Zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_unlock;
+	}
+	(*packet_size) += s->block_aligned_filename_size;
+	s->decrypted_filename = kmalloc(s->block_aligned_filename_size,
+					GFP_KERNEL);
+	if (!s->decrypted_filename) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%Zd] bytes\n", __func__,
+		       s->block_aligned_filename_size);
+		rc = -ENOMEM;
+		goto out_unlock;
+	}
+	rc = virt_to_scatterlist(s->decrypted_filename,
+				 s->block_aligned_filename_size, &s->dst_sg, 1);
+	if (rc != 1) {
+		printk(KERN_ERR "%s: Internal error whilst attempting to "
+		       "convert decrypted filename memory to scatterlist; "
+		       "expected rc = 1; got rc = [%d]. "
+		       "block_aligned_filename_size = [%Zd]\n", __func__, rc,
+		       s->block_aligned_filename_size);
+		goto out_free_unlock;
+	}
+	/* The characters in the first block effectively do the job of
+	 * the IV here, so we just use 0's for the IV. Note the
+	 * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES
+	 * >= ECRYPTFS_MAX_IV_BYTES. */
+	memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
+	s->desc.info = s->iv;
+	rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat,
+					    s->fnek_sig_hex);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to find auth tok for "
+		       "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
+		       rc);
+		goto out_free_unlock;
+	}
+	/* TODO: Support other key modules than passphrase for
+	 * filename encryption */
+	BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD);
+	rc = crypto_blkcipher_setkey(
+		s->desc.tfm,
+		s->auth_tok->token.password.session_key_encryption_key,
+		mount_crypt_stat->global_default_fn_cipher_key_bytes);
+	if (rc < 0) {
+		printk(KERN_ERR "%s: Error setting key for crypto context; "
+		       "rc = [%d]. s->auth_tok->token.password.session_key_"
+		       "encryption_key = [0x%p]; mount_crypt_stat->"
+		       "global_default_fn_cipher_key_bytes = [%Zd]\n", __func__,
+		       rc,
+		       s->auth_tok->token.password.session_key_encryption_key,
+		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		goto out_free_unlock;
+	}
+	rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg,
+					 s->block_aligned_filename_size);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decrypt filename; "
+		       "rc = [%d]\n", __func__, rc);
+		goto out_free_unlock;
+	}
+	/* Find the \0 separator. The bound is tested first so that the
+	 * byte just past the buffer is never read. */
+	s->i = 0;
+	while (s->i < s->block_aligned_filename_size
+	       && s->decrypted_filename[s->i] != '\0')
+		s->i++;
+	if (s->i == s->block_aligned_filename_size) {
+		printk(KERN_WARNING "%s: Invalid tag 70 packet; could not "
+		       "find valid separator between random characters and "
+		       "the filename\n", __func__);
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	/* Step over the separator; the filename is everything after it */
+	s->i++;
+	(*filename_size) = (s->block_aligned_filename_size - s->i);
+	if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
+		printk(KERN_WARNING "%s: Filename size is [%Zd], which is "
+		       "invalid\n", __func__, (*filename_size));
+		rc = -EINVAL;
+		goto out_free_unlock;
+	}
+	(*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
+	if (!(*filename)) {
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%Zd] bytes\n", __func__,
+		       ((*filename_size) + 1));
+		rc = -ENOMEM;
+		goto out_free_unlock;
+	}
+	memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size));
+	(*filename)[(*filename_size)] = '\0';
+out_free_unlock:
+	kfree(s->decrypted_filename);
+out_unlock:
+	mutex_unlock(s->tfm_mutex);
+out:
+	/* No failure path is reached after *filename has been
+	 * allocated, so resetting the outputs here leaks nothing */
+	if (rc) {
+		(*packet_size) = 0;
+		(*filename_size) = 0;
+		(*filename) = NULL;
+	}
+	kfree(s);
+	return rc;
+}
+
 static int
 ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok)
 {
@@ -897,30 +1460,6 @@ out:
 	return rc;
 }
 
-static int
-ecryptfs_find_global_auth_tok_for_sig(
-	struct ecryptfs_global_auth_tok **global_auth_tok,
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
-{
-	struct ecryptfs_global_auth_tok *walker;
-	int rc = 0;
-
-	(*global_auth_tok) = NULL;
-	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
-	list_for_each_entry(walker,
-			    &mount_crypt_stat->global_auth_tok_list,
-			    mount_crypt_stat_list) {
-		if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) {
-			(*global_auth_tok) = walker;
-			goto out;
-		}
-	}
-	rc = -EINVAL;
-out:
-	mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
-	return rc;
-}
-
 /**
  * ecryptfs_verify_version
  * @version: The version number to confirm
@@ -989,43 +1528,6 @@ out:
 	return rc;
 }
 
-/**
- * ecryptfs_find_auth_tok_for_sig
- * @auth_tok: Set to the matching auth_tok; NULL if not found
- * @crypt_stat: inode crypt_stat crypto context
- * @sig: Sig of auth_tok to find
- *
- * For now, this function simply looks at the registered auth_tok's
- * linked off the mount_crypt_stat, so all the auth_toks that can be
- * used must be registered at mount time. This function could
- * potentially try a lot harder to find auth_tok's (e.g., by calling
- * out to ecryptfsd to dynamically retrieve an auth_tok object) so
- * that static registration of auth_tok's will no longer be necessary.
- *
- * Returns zero on no error; non-zero on error
- */
-static int
-ecryptfs_find_auth_tok_for_sig(
-	struct ecryptfs_auth_tok **auth_tok,
-	struct ecryptfs_crypt_stat *crypt_stat, char *sig)
-{
-	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-		crypt_stat->mount_crypt_stat;
-	struct ecryptfs_global_auth_tok *global_auth_tok;
-	int rc = 0;
-
-	(*auth_tok) = NULL;
-	if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok,
-						  mount_crypt_stat, sig)) {
-		struct key *auth_tok_key;
-
-		rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok,
-						       sig);
-	} else
-		(*auth_tok) = global_auth_tok->global_auth_tok;
-	return rc;
-}
-
 /**
  * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok.
  * @auth_tok: The passphrase authentication token to use to encrypt the FEK
@@ -1256,7 +1758,8 @@ find_next_matching_auth_tok:
 			rc = -EINVAL;
 			goto out_wipe_list;
 		}
-		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat,
+		ecryptfs_find_auth_tok_for_sig(&matching_auth_tok,
+					       crypt_stat->mount_crypt_stat,
 					       candidate_auth_tok_sig);
 		if (matching_auth_tok) {
 			found_auth_tok = 1;
@@ -1336,7 +1839,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok,
 	int rc;
 
 	rc = write_tag_66_packet(auth_tok->token.private_key.signature,
-				 ecryptfs_code_for_cipher_string(crypt_stat),
+				 ecryptfs_code_for_cipher_string(
+					 crypt_stat->cipher,
+					 crypt_stat->key_size),
 				 crypt_stat, &payload, &payload_len);
 	if (rc) {
 		ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n");
@@ -1696,7 +2201,8 @@ encrypted_session_key_set:
 	dest[(*packet_size)++] = 0x04; /* version 4 */
 	/* TODO: Break from RFC2440 so that arbitrary ciphers can be
 	 * specified with strings */
-	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat);
+	cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher,
+						      crypt_stat->key_size);
 	if (cipher_code == 0) {
 		ecryptfs_printk(KERN_WARNING, "Unable to generate code for "
 				"cipher [%s]\n", crypt_stat->cipher);
-- 
cgit v1.2.3


From a34f60f748c6fe5d791e9b54cffe442201428254 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:41:58 -0800
Subject: eCryptfs: Filename Encryption: Header updates

Extensions to the header file to support filename encryption.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c          |  4 +--
 fs/ecryptfs/ecryptfs_kernel.h | 61 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 51 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 485732751f0..c9839df37c7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -175,8 +175,8 @@ out:
  *
  * Returns zero on success; non-zero on error.
  */
-static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
-			      loff_t offset)
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset)
 {
 	int rc = 0;
 	char dst[MD5_DIGEST_SIZE];
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 76a95bd8819..b648175a44c 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -51,12 +51,16 @@
 #define ECRYPTFS_VERSIONING_XATTR                 0x00000010
 #define ECRYPTFS_VERSIONING_MULTKEY               0x00000020
 #define ECRYPTFS_VERSIONING_DEVMISC               0x00000040
+#define ECRYPTFS_VERSIONING_HMAC                  0x00000080
+#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION   0x00000100
+#define ECRYPTFS_VERSIONING_GCM                   0x00000200
 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \
 				  | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \
 				  | ECRYPTFS_VERSIONING_PUBKEY \
 				  | ECRYPTFS_VERSIONING_XATTR \
 				  | ECRYPTFS_VERSIONING_MULTKEY \
-				  | ECRYPTFS_VERSIONING_DEVMISC)
+				  | ECRYPTFS_VERSIONING_DEVMISC \
+				  | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION)
 #define ECRYPTFS_MAX_PASSWORD_LENGTH 64
 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH
 #define ECRYPTFS_SALT_SIZE 8
@@ -232,23 +236,39 @@ struct ecryptfs_key_sig {
 	char keysig[ECRYPTFS_SIG_SIZE_HEX];
 };
 
+/*
+ * Bundles a filename with its encrypted form.
+ * ecryptfs_encrypt_filename() reads filename/filename_size and fills
+ * in encrypted_filename/encrypted_filename_size.
+ */
+struct ecryptfs_filename {
+	/* NOTE(review): presumably links this entry into a per-crypt_stat
+	 * list -- confirm against the users of this struct */
+	struct list_head crypt_stat_list;
+#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001
+	u32 flags;
+	u32 seq_no;
+	/* Filename to encrypt; filename_size bytes */
+	char *filename;
+	/* kmalloc'd tag 70 packet produced for filename; NULL until the
+	 * packet has been generated successfully */
+	char *encrypted_filename;
+	size_t filename_size;
+	/* Size of the buffer behind encrypted_filename */
+	size_t encrypted_filename_size;
+	/* Sized for the hex signature only; no room for a \0 terminator */
+	char fnek_sig[ECRYPTFS_SIG_SIZE_HEX];
+	char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1];
+};
+
 /**
  * This is the primary struct associated with each encrypted file.
  *
  * TODO: cache align/pack?
  */
 struct ecryptfs_crypt_stat {
-#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
-#define ECRYPTFS_POLICY_APPLIED     0x00000002
-#define ECRYPTFS_NEW_FILE           0x00000004
-#define ECRYPTFS_ENCRYPTED          0x00000008
-#define ECRYPTFS_SECURITY_WARNING   0x00000010
-#define ECRYPTFS_ENABLE_HMAC        0x00000020
-#define ECRYPTFS_ENCRYPT_IV_PAGES   0x00000040
-#define ECRYPTFS_KEY_VALID          0x00000080
-#define ECRYPTFS_METADATA_IN_XATTR  0x00000100
-#define ECRYPTFS_VIEW_AS_ENCRYPTED  0x00000200
-#define ECRYPTFS_KEY_SET            0x00000400
+#define ECRYPTFS_STRUCT_INITIALIZED   0x00000001
+#define ECRYPTFS_POLICY_APPLIED       0x00000002
+#define ECRYPTFS_NEW_FILE             0x00000004
+#define ECRYPTFS_ENCRYPTED            0x00000008
+#define ECRYPTFS_SECURITY_WARNING     0x00000010
+#define ECRYPTFS_ENABLE_HMAC          0x00000020
+#define ECRYPTFS_ENCRYPT_IV_PAGES     0x00000040
+#define ECRYPTFS_KEY_VALID            0x00000080
+#define ECRYPTFS_METADATA_IN_XATTR    0x00000100
+#define ECRYPTFS_VIEW_AS_ENCRYPTED    0x00000200
+#define ECRYPTFS_KEY_SET              0x00000400
+#define ECRYPTFS_ENCRYPT_FILENAMES    0x00000800
+#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000
+#define ECRYPTFS_ENCFN_USE_FEK        0x00002000
 	u32 flags;
 	unsigned int file_version;
 	size_t iv_bytes;
@@ -597,6 +617,15 @@ struct ecryptfs_open_req {
 int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       struct dentry *this_dentry, struct super_block *sb,
 		       u32 flags);
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+					struct dentry *lower_dentry,
+					struct ecryptfs_crypt_stat *crypt_stat,
+					struct inode *ecryptfs_dir_inode,
+					struct nameidata *ecryptfs_nd);
+int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
+					 size_t *decrypted_name_size,
+					 struct dentry *ecryptfs_dentry,
+					 const char *name, size_t name_size);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
 int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
 			     const char *name, int length,
@@ -604,6 +633,12 @@ int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
 int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
 			     const char *name, int length,
 			     char **encoded_name);
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size);
 struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry);
 void ecryptfs_dump_hex(char *data, int bytes);
 int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
@@ -730,5 +765,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 			     size_t *packet_size,
 			     struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 			     char *data, size_t max_packet_size);
+int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat,
+		       loff_t offset);
 
 #endif /* #ifndef ECRYPTFS_KERNEL_H */
-- 
cgit v1.2.3


From 51ca58dcc9f0d6b1e78954d08bd4954fb6a1421c Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:41:59 -0800
Subject: eCryptfs: Filename Encryption: Encoding and encryption functions

These functions support encrypting and encoding the filename contents.
The encrypted filename contents may consist of any ASCII characters.  This
patch includes a custom encoding mechanism to map the ASCII characters to
a reduced character set that is appropriate for filenames.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 433 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 433 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c9839df37c7..18c78abba68 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1720,6 +1720,98 @@ out:
 	return error;
 }
 
+/**
+ * ecryptfs_encrypt_filename - encrypt filename
+ *
+ * CBC-encrypts the filename. We do not want to encrypt the same
+ * filename with the same key and IV, which may happen with hard
+ * links, so we prepend random bits to each filename.
+ * On success @filename->encrypted_filename is kmalloc'ed here; caller frees.
+ * Returns zero on success; non-zero otherwise
+ */
+static int
+ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
+			  struct ecryptfs_crypt_stat *crypt_stat,
+			  struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
+{
+	int rc = 0;
+
+	filename->encrypted_filename = NULL;
+	filename->encrypted_filename_size = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+		size_t packet_size;
+		size_t remaining_bytes;
+
+		rc = ecryptfs_write_tag_70_packet(
+			NULL, NULL,	/* no buffer: size query only */
+			&filename->encrypted_filename_size,
+			mount_crypt_stat, NULL,
+			filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to get packet "
+			       "size for tag 70; rc = [%d]\n", __func__,
+			       rc);
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename =
+			kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
+		if (!filename->encrypted_filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%Zd] bytes\n", __func__,
+			       filename->encrypted_filename_size);
+			rc = -ENOMEM;
+			filename->encrypted_filename_size = 0; /* as other errors */
+			goto out;
+		}
+		remaining_bytes = filename->encrypted_filename_size;
+		rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename,
+						  &remaining_bytes,
+						  &packet_size,
+						  mount_crypt_stat,
+						  filename->filename,
+						  filename->filename_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to generate "
+			       "tag 70 packet; rc = [%d]\n", __func__,
+			       rc);
+			kfree(filename->encrypted_filename);
+			filename->encrypted_filename = NULL;
+			filename->encrypted_filename_size = 0;
+			goto out;
+		}
+		filename->encrypted_filename_size = packet_size;
+	} else {
+		printk(KERN_ERR "%s: No support for requested filename "
+		       "encryption method in this release\n", __func__);
+		rc = -ENOTSUPP;
+	}
+out:
+	return rc;
+}
+
+static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
+				  const char *name, size_t name_size)
+{
+	char *copy;
+
+	/* The result pointer is stored even when the allocation fails, so
+	 * the caller then finds NULL in *copied_name. */
+	copy = kmalloc((name_size + 2), GFP_KERNEL);
+	(*copied_name) = copy;
+	if (!copy)
+		return -ENOMEM;
+	memcpy((void *)copy, (void *)name, name_size);
+	/* The terminator is only for convenience in printing out the
+	 * string in debug messages */
+	copy[name_size] = '\0';
+	/* The reported size counts the terminator as well */
+	(*copied_name_size) = (name_size + 1);
+	return 0;
+}
+
 /**
  * ecryptfs_process_key_cipher - Perform key cipher initialization.
  * @key_tfm: Crypto context for key material, set by this function
@@ -1911,3 +2003,344 @@ out:
 	mutex_unlock(&key_tfm_list_mutex);
 	return rc;
 }
+
+/* 64 characters forming a 6-bit target field */
+static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
+						 "EFGHIJKLMNOPQRST"
+						 "UVWXYZabcdefghij"
+						 "klmnopqrstuvwxyz");
+
+/* Reverse of portable_filename_chars, indexed by character value. Sized
+ * for every unsigned char value so no input byte can index past the end */
+static const unsigned char filename_rev_map[256] = {
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */
+	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */
+	0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */
+	0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */
+	0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */
+	0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */
+	0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */
+	0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */
+	0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */
+	0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */
+	0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */
+	0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */
+};
+
+/**
+ * ecryptfs_encode_for_filename - map 3-byte blocks to 4 portable characters
+ * @dst: Destination for the encoded filename; if NULL only @dst_size is set
+ * @dst_size: Set to the encoded size in bytes (4 per 3-byte block, or 0)
+ * @src: Source location for the filename to encode
+ * @src_size: Size of the source in bytes; a short last block is zero-padded
+ */
+void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size,
+				  unsigned char *src, size_t src_size)
+{
+	size_t num_blocks;
+	size_t block_num = 0;
+	size_t dst_offset = 0;
+	unsigned char last_block[3];	/* final block, zero-padded if short */
+
+	if (src_size == 0) {
+		(*dst_size) = 0;
+		goto out;
+	}
+	num_blocks = (src_size / 3);
+	if ((src_size % 3) == 0) {
+		memcpy(last_block, (&src[src_size - 3]), 3);
+	} else {
+		num_blocks++;	/* the partial trailing block counts too */
+		last_block[2] = 0x00;
+		switch (src_size % 3) {
+		case 1:
+			last_block[0] = src[src_size - 1];
+			last_block[1] = 0x00;
+			break;
+		case 2:
+			last_block[0] = src[src_size - 2];
+			last_block[1] = src[src_size - 1];
+		}
+	}
+	(*dst_size) = (num_blocks * 4);
+	if (!dst)
+		goto out;	/* caller only asked for the size */
+	while (block_num < num_blocks) {
+		unsigned char *src_block;
+		unsigned char dst_block[4];	/* four 6-bit values */
+
+		if (block_num == (num_blocks - 1))
+			src_block = last_block;
+		else
+			src_block = &src[block_num * 3];
+		dst_block[0] = ((src_block[0] >> 2) & 0x3F);
+		dst_block[1] = (((src_block[0] << 4) & 0x30)
+				| ((src_block[1] >> 4) & 0x0F));
+		dst_block[2] = (((src_block[1] << 2) & 0x3C)
+				| ((src_block[2] >> 6) & 0x03));
+		dst_block[3] = (src_block[2] & 0x3F);
+		dst[dst_offset++] = portable_filename_chars[dst_block[0]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[1]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[2]];
+		dst[dst_offset++] = portable_filename_chars[dst_block[3]];
+		block_num++;
+	}
+out:
+	return;
+}
+
+/* Decode @src_size portable filename characters at @src into raw bytes at
+ * @dst. With a NULL @dst, only an upper bound is stored in @dst_size. */
+int ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+				  const unsigned char *src, size_t src_size)
+{
+	u8 current_bit_offset = 0;	/* bits already set in current dst byte */
+	size_t src_byte_offset = 0;
+	size_t dst_byte_offset = 0;
+	int rc = 0;
+
+	if (dst == NULL) {
+		/* Not exact; conservatively long */
+		(*dst_size) = (((src_size + 1) * 3) / 4);
+		goto out;
+	}
+	while (src_byte_offset < src_size) {
+		unsigned char src_byte = filename_rev_map[src[src_byte_offset]];
+
+		switch (current_bit_offset) {
+		case 0:
+			dst[dst_byte_offset] = (src_byte << 2);
+			current_bit_offset = 6;
+			break;
+		case 6:
+			dst[dst_byte_offset++] |= (src_byte >> 4);
+			dst[dst_byte_offset] = ((src_byte & 0xF) << 4);
+			current_bit_offset = 4;
+			break;
+		case 4:
+			dst[dst_byte_offset++] |= (src_byte >> 2);
+			dst[dst_byte_offset] = (src_byte << 6);
+			current_bit_offset = 2;
+			break;
+		case 2:
+			/* No write to the next byte: it may lie past @dst */
+			dst[dst_byte_offset++] |= (src_byte);
+			current_bit_offset = 0;
+			break;
+		}
+		src_byte_offset++;
+	}
+	(*dst_size) = dst_byte_offset;
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text
+ * @encoded_name: Set to the kmalloc'ed, NUL-terminated result; NULL on error
+ * @encoded_name_size: Set to the result size, counting the NUL; 0 on error
+ * @crypt_stat: Per-file crypt_stat; its flags are read only if non-NULL
+ * @mount_crypt_stat: Mount-wide crypt_stat; flags are read only if non-NULL
+ * @name: The plaintext name
+ * @name_size: The length of the plaintext name in bytes
+ *
+ * Encrypts and encodes a filename into something that constitutes a
+ * valid filename for a filesystem, with printable characters. If neither
+ * flag set requests filename encryption, the name is copied unchanged.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_encrypt_and_encode_filename(
+	char **encoded_name,
+	size_t *encoded_name_size,
+	struct ecryptfs_crypt_stat *crypt_stat,
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
+	const char *name, size_t name_size)
+{
+	size_t encoded_name_no_prefix_size;
+	int rc = 0;
+
+	(*encoded_name) = NULL;
+	(*encoded_name_size) = 0;
+	if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    || (mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) {
+		struct ecryptfs_filename *filename;
+
+		filename = kzalloc(sizeof(*filename), GFP_KERNEL);
+		if (!filename) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kzalloc [%zu] bytes\n", __func__,
+			       sizeof(*filename));
+			rc = -ENOMEM;
+			goto out;
+		}
+		filename->filename = (char *)name;
+		filename->filename_size = name_size;
+		rc = ecryptfs_encrypt_filename(filename, crypt_stat,
+					       mount_crypt_stat);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encrypt "
+			       "filename; rc = [%d]\n", __func__, rc);
+			kfree(filename);
+			goto out;
+		}
+		ecryptfs_encode_for_filename(
+			NULL, &encoded_name_no_prefix_size,
+			filename->encrypted_filename,
+			filename->encrypted_filename_size);
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)))
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		else
+			(*encoded_name_size) =
+				(ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+		(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
+		if (!(*encoded_name)) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%zu] bytes\n", __func__,
+			       (*encoded_name_size));
+			rc = -ENOMEM;
+			kfree(filename->encrypted_filename);
+			kfree(filename);
+			goto out;
+		}
+		if ((crypt_stat && (crypt_stat->flags
+				    & ECRYPTFS_ENCFN_USE_MOUNT_FNEK))
+		    || (mount_crypt_stat
+			&& (mount_crypt_stat->flags
+			    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) {
+			memcpy((*encoded_name),
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			       ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE);
+			ecryptfs_encode_for_filename(
+			    ((*encoded_name)
+			     + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE),
+			    &encoded_name_no_prefix_size,
+			    filename->encrypted_filename,
+			    filename->encrypted_filename_size);
+			(*encoded_name_size) =
+				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
+				 + encoded_name_no_prefix_size);
+			(*encoded_name)[(*encoded_name_size)] = '\0';
+			(*encoded_name_size)++;
+		} else {
+			rc = -ENOTSUPP;
+		}
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to encode "
+			       "encrypted filename; rc = [%d]\n", __func__,
+			       rc);
+			kfree((*encoded_name));
+			(*encoded_name) = NULL;
+			(*encoded_name_size) = 0;
+		}
+		kfree(filename->encrypted_filename);
+		kfree(filename);
+	} else {
+		rc = ecryptfs_copy_filename(encoded_name,
+					    encoded_name_size,
+					    name, name_size);
+	}
+out:
+	return rc;
+}
+
+/**
+ * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
+ * @plaintext_name: Set to the plaintext name
+ * @plaintext_name_size: Set to the plaintext name size
+ * @ecryptfs_dir_dentry: eCryptfs directory dentry; its d_sb gives mount_crypt_stat
+ * @name: The filename in cipher text
+ * @name_size: The cipher text name size
+ *
+ * Decodes, then decrypts; names that lack the FNEK prefix are copied as-is.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
+					 size_t *plaintext_name_size,
+					 struct dentry *ecryptfs_dir_dentry,
+					 const char *name, size_t name_size)
+{
+	char *decoded_name;
+	size_t decoded_name_size;
+	size_t packet_size;
+	int rc = 0;
+
+	if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
+			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
+		struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+			&ecryptfs_superblock_to_private(
+				ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
+		const char *orig_name = name;	/* kept for the fallback copies */
+		size_t orig_name_size = name_size;
+
+		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
+		rc = ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+						   name, name_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to decode "
+			       "filename; rc = [%d]\n", __func__, rc);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out;
+		}
+		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
+		if (!decoded_name) {
+			printk(KERN_ERR "%s: Out of memory whilst attempting "
+			       "to kmalloc [%Zd] bytes\n", __func__,
+			       decoded_name_size);
+			rc = -ENOMEM;
+			goto out;
+		}
+		rc = ecryptfs_decode_from_filename(decoded_name,
+						   &decoded_name_size,
+						   name, name_size);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to decode "
+			       "filename; rc = [%d]\n", __func__, rc);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out_free;
+		}
+		rc = ecryptfs_parse_tag_70_packet(plaintext_name,
+						  plaintext_name_size,
+						  &packet_size,
+						  mount_crypt_stat,
+						  decoded_name,
+						  decoded_name_size);
+		if (rc) {
+			printk(KERN_INFO "%s: Could not parse tag 70 packet "
+			       "from filename; copying through filename "
+			       "as-is\n", __func__);
+			rc = ecryptfs_copy_filename(plaintext_name,
+						    plaintext_name_size,
+						    orig_name, orig_name_size);
+			goto out_free;
+		}
+	} else {
+		rc = ecryptfs_copy_filename(plaintext_name,
+					    plaintext_name_size,
+					    name, name_size);
+		goto out;	/* decoded_name was never allocated here */
+	}
+out_free:
+	kfree(decoded_name);
+out:
+	return rc;
+}
-- 
cgit v1.2.3


From addd65ad8d19a7d7982130b16f957d5d01d3f8df Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:00 -0800
Subject: eCryptfs: Filename Encryption: filldir, lookup, and readlink

Make the requisite modifications to ecryptfs_filldir(), ecryptfs_lookup(),
and ecryptfs_readlink() to call out to filename encryption functions.
Propagate filename encryption policy flags from mount-wide crypt_stat to
inode crypt_stat.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c          | 107 ++-------------
 fs/ecryptfs/ecryptfs_kernel.h |   6 -
 fs/ecryptfs/file.c            |  30 ++---
 fs/ecryptfs/inode.c           | 294 ++++++++++++++++++++++++------------------
 4 files changed, 195 insertions(+), 242 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 18c78abba68..ea2afd2ce22 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags(
 		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
 		crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED;
+	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+		crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES;
+		if (mount_crypt_stat->flags
+		    & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK;
+		else if (mount_crypt_stat->flags
+			 & ECRYPTFS_GLOBAL_ENCFN_USE_FEK)
+			crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK;
+	}
 }
 
 static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
@@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem {
 static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = {
 	{0x00000001, ECRYPTFS_ENABLE_HMAC},
 	{0x00000002, ECRYPTFS_ENCRYPTED},
-	{0x00000004, ECRYPTFS_METADATA_IN_XATTR}
+	{0x00000004, ECRYPTFS_METADATA_IN_XATTR},
+	{0x00000008, ECRYPTFS_ENCRYPT_FILENAMES}
 };
 
 /**
@@ -1213,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data,
 		&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
 	int rc;
 
+	if (crypt_stat->extent_size == 0)
+		crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
 	rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
 				 ecryptfs_inode);
 	if (rc) {
@@ -1222,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data,
 	}
 	if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
 		rc = -EINVAL;
-		ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n");
 	}
 out:
 	return rc;
@@ -1628,98 +1639,6 @@ out:
 	return rc;
 }
 
-/**
- * ecryptfs_encode_filename - converts a plaintext file name to cipher text
- * @crypt_stat: The crypt_stat struct associated with the file anem to encode
- * @name: The plaintext name
- * @length: The length of the plaintext
- * @encoded_name: The encypted name
- *
- * Encrypts and encodes a filename into something that constitutes a
- * valid filename for a filesystem, with printable characters.
- *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of encoded filename; negative if error
- */
-int
-ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			 const char *name, int length, char **encoded_name)
-{
-	int error = 0;
-
-	(*encoded_name) = kmalloc(length + 2, GFP_KERNEL);
-	if (!(*encoded_name)) {
-		error = -ENOMEM;
-		goto out;
-	}
-	/* TODO: Filename encryption is a scheduled feature for a
-	 * future version of eCryptfs. This function is here only for
-	 * the purpose of providing a framework for other developers
-	 * to easily implement filename encryption. Hint: Replace this
-	 * memcpy() with a call to encrypt and encode the
-	 * filename, the set the length accordingly. */
-	memcpy((void *)(*encoded_name), (void *)name, length);
-	(*encoded_name)[length] = '\0';
-	error = length + 1;
-out:
-	return error;
-}
-
-/**
- * ecryptfs_decode_filename - converts the cipher text name to plaintext
- * @crypt_stat: The crypt_stat struct associated with the file
- * @name: The filename in cipher text
- * @length: The length of the cipher text name
- * @decrypted_name: The plaintext name
- *
- * Decodes and decrypts the filename.
- *
- * We assume that we have a properly initialized crypto context,
- * pointed to by crypt_stat->tfm.
- *
- * TODO: Implement filename decoding and decryption here, in place of
- * memcpy. We are keeping the framework around for now to (1)
- * facilitate testing of the components needed to implement filename
- * encryption and (2) to provide a code base from which other
- * developers in the community can easily implement this feature.
- *
- * Returns the length of decoded filename; negative if error
- */
-int
-ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			 const char *name, int length, char **decrypted_name)
-{
-	int error = 0;
-
-	(*decrypted_name) = kmalloc(length + 2, GFP_KERNEL);
-	if (!(*decrypted_name)) {
-		error = -ENOMEM;
-		goto out;
-	}
-	/* TODO: Filename encryption is a scheduled feature for a
-	 * future version of eCryptfs. This function is here only for
-	 * the purpose of providing a framework for other developers
-	 * to easily implement filename encryption. Hint: Replace this
-	 * memcpy() with a call to decode and decrypt the
-	 * filename, the set the length accordingly. */
-	memcpy((void *)(*decrypted_name), (void *)name, length);
-	(*decrypted_name)[length + 1] = '\0';	/* Only for convenience
-						 * in printing out the
-						 * string in debug
-						 * messages */
-	error = length;
-out:
-	return error;
-}
-
 /**
  * ecryptfs_encrypt_filename - encrypt filename
  *
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index b648175a44c..c11fc95714a 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -627,12 +627,6 @@ int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
 					 struct dentry *ecryptfs_dentry,
 					 const char *name, size_t name_size);
 int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
-int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			     const char *name, int length,
-			     char **decrypted_name);
-int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat,
-			     const char *name, int length,
-			     char **encoded_name);
 int ecryptfs_encrypt_and_encode_filename(
 	char **encoded_name,
 	size_t *encoded_name_size,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 71383437122..567eb4bee1b 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback {
 
 /* Inspired by generic filldir in fs/readdir.c */
 static int
-ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset,
-		 u64 ino, unsigned int d_type)
+ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
+		 loff_t offset, u64 ino, unsigned int d_type)
 {
-	struct ecryptfs_crypt_stat *crypt_stat;
 	struct ecryptfs_getdents_callback *buf =
 	    (struct ecryptfs_getdents_callback *)dirent;
+	int name_size;
+	char *name;
 	int rc;
-	int decoded_length;
-	char *decoded_name;
 
-	crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat;
 	buf->filldir_called++;
-	decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen,
-						  &decoded_name);
-	if (decoded_length < 0) {
-		rc = decoded_length;
+	rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size,
+						  buf->dentry, lower_name,
+						  lower_namelen);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to decode and decrypt "
+		       "filename [%s]; rc = [%d]\n", __func__, lower_name,
+		       rc);
 		goto out;
 	}
-	rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset,
-			  ino, d_type);
-	kfree(decoded_name);
+	rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type);
+	kfree(name);
 	if (rc >= 0)
 		buf->entries_written++;
 out:
@@ -106,8 +106,8 @@ out:
 
 /**
  * ecryptfs_readdir
- * @file: The ecryptfs file struct
- * @dirent: Directory entry
+ * @file: The eCryptfs directory file
+ * @dirent: Directory entry handle
  * @filldir: The filldir callback function
  */
 static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0111906a887..38309ce94d7 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -228,8 +228,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 {
 	int rc;
 
-	/* ecryptfs_do_create() calls ecryptfs_interpose(), which opens
-	 * the crypt_stat->lower_file (persistent file) */
+	/* ecryptfs_do_create() calls ecryptfs_interpose() */
 	rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd);
 	if (unlikely(rc)) {
 		ecryptfs_printk(KERN_WARNING, "Failed to create file in"
@@ -244,141 +243,91 @@ out:
 }
 
 /**
- * ecryptfs_lookup
- * @dir: inode
- * @dentry: The dentry
- * @nd: nameidata, may be NULL
- *
- * Find a file on disk. If the file does not exist, then we'll add it to the
- * dentry cache and continue on to read it from the disk.
+ * ecryptfs_lookup_and_interpose_lower - Perform a lookup
  */
-static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
-				      struct nameidata *nd)
+int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+					struct dentry *lower_dentry,
+					struct ecryptfs_crypt_stat *crypt_stat,
+					struct inode *ecryptfs_dir_inode,
+					struct nameidata *ecryptfs_nd)
 {
-	int rc = 0;
 	struct dentry *lower_dir_dentry;
-	struct dentry *lower_dentry;
 	struct vfsmount *lower_mnt;
-	char *encoded_name;
-	int encoded_namelen;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct inode *lower_inode;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
 	char *page_virt = NULL;
-	struct inode *lower_inode;
 	u64 file_size;
+	int rc = 0;
 
-	lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
-	dentry->d_op = &ecryptfs_dops;
-	if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, "."))
-	    || (dentry->d_name.len == 2
-		&& !strcmp(dentry->d_name.name, ".."))) {
-		d_drop(dentry);
-		goto out;
-	}
-	encoded_namelen = ecryptfs_encode_filename(crypt_stat,
-						   dentry->d_name.name,
-						   dentry->d_name.len,
-						   &encoded_name);
-	if (encoded_namelen < 0) {
-		rc = encoded_namelen;
-		d_drop(dentry);
-		goto out;
-	}
-	ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen "
-			"= [%d]\n", encoded_name, encoded_namelen);
-	lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry,
-				      encoded_namelen - 1);
-	kfree(encoded_name);
-	if (IS_ERR(lower_dentry)) {
-		ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n");
-		rc = PTR_ERR(lower_dentry);
-		d_drop(dentry);
-		goto out;
-	}
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
-	ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->"
-       		"d_name.name = [%s]\n", lower_dentry,
-		lower_dentry->d_name.name);
+	lower_dir_dentry = lower_dentry->d_parent;
+	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
+				   ecryptfs_dentry->d_parent));
 	lower_inode = lower_dentry->d_inode;
-	fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode);
+	fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
 	BUG_ON(!atomic_read(&lower_dentry->d_count));
-	ecryptfs_set_dentry_private(dentry,
+	ecryptfs_set_dentry_private(ecryptfs_dentry,
 				    kmem_cache_alloc(ecryptfs_dentry_info_cache,
 						     GFP_KERNEL));
-	if (!ecryptfs_dentry_to_private(dentry)) {
+	if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
 		rc = -ENOMEM;
-		ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting "
-				"to allocate ecryptfs_dentry_info struct\n");
+		printk(KERN_ERR "%s: Out of memory whilst attempting "
+		       "to allocate ecryptfs_dentry_info struct\n",
+			__func__);
 		goto out_dput;
 	}
-	ecryptfs_set_dentry_lower(dentry, lower_dentry);
-	ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
+	ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
+	ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
 	if (!lower_dentry->d_inode) {
 		/* We want to add because we couldn't find in lower */
-		d_add(dentry, NULL);
+		d_add(ecryptfs_dentry, NULL);
 		goto out;
 	}
-	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
-				ECRYPTFS_INTERPOSE_FLAG_D_ADD);
+	rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
+				ecryptfs_dir_inode->i_sb, 1);
 	if (rc) {
-		ecryptfs_printk(KERN_ERR, "Error interposing\n");
+		printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
+		       __func__, rc);
 		goto out;
 	}
-	if (S_ISDIR(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
+	if (S_ISDIR(lower_inode->i_mode))
 		goto out;
-	}
-	if (S_ISLNK(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n");
+	if (S_ISLNK(lower_inode->i_mode))
 		goto out;
-	}
-	if (special_file(lower_inode->i_mode)) {
-		ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n");
+	if (special_file(lower_inode->i_mode))
 		goto out;
-	}
-	if (!nd) {
-		ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave"
-				"as we *think* we are about to unlink\n");
+	if (!ecryptfs_nd)
 		goto out;
-	}
 	/* Released in this function */
-	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2,
-				      GFP_USER);
+	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
 	if (!page_virt) {
+		printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
+		       __func__);
 		rc = -ENOMEM;
-		ecryptfs_printk(KERN_ERR,
-				"Cannot ecryptfs_kmalloc a page\n");
 		goto out;
 	}
-	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
-	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-		ecryptfs_set_default_sizes(crypt_stat);
-	if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
-		rc = ecryptfs_init_persistent_file(dentry);
+	if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+		rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
 		if (rc) {
 			printk(KERN_ERR "%s: Error attempting to initialize "
 			       "the persistent file for the dentry with name "
 			       "[%s]; rc = [%d]\n", __func__,
-			       dentry->d_name.name, rc);
-			goto out;
+			       ecryptfs_dentry->d_name.name, rc);
+			goto out_free_kmem;
 		}
 	}
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
-						      dentry->d_inode);
+						      ecryptfs_dentry->d_inode);
 	if (rc) {
-		rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry);
+		rc = ecryptfs_read_and_validate_xattr_region(page_virt,
+							     ecryptfs_dentry);
 		if (rc) {
-			printk(KERN_DEBUG "Valid metadata not found in header "
-			       "region or xattr region; treating file as "
-			       "unencrypted\n");
 			rc = 0;
-			kmem_cache_free(ecryptfs_header_cache_2, page_virt);
-			goto out;
+			goto out_free_kmem;
 		}
 		crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
 	}
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
-		dentry->d_sb)->mount_crypt_stat;
+		ecryptfs_dentry->d_sb)->mount_crypt_stat;
 	if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 			file_size = (crypt_stat->num_header_bytes_at_front
@@ -388,14 +337,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
 	} else {
 		file_size = get_unaligned_be64(page_virt);
 	}
-	i_size_write(dentry->d_inode, (loff_t)file_size);
+	i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size);
+out_free_kmem:
 	kmem_cache_free(ecryptfs_header_cache_2, page_virt);
 	goto out;
-
 out_dput:
 	dput(lower_dentry);
-	d_drop(dentry);
+	d_drop(ecryptfs_dentry);
 out:
+	return rc;
+}
+
+/**
+ * ecryptfs_lookup
+ * @ecryptfs_dir_inode: The eCryptfs directory inode
+ * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
+ * @ecryptfs_nd: nameidata; may be NULL
+ *
+ * Find a file on disk. If the file does not exist, then we'll add it to the
+ * dentry cache and continue on to read it from the disk.
+ */
+static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
+				      struct dentry *ecryptfs_dentry,
+				      struct nameidata *ecryptfs_nd)
+{
+	char *encrypted_and_encoded_name = NULL;
+	size_t encrypted_and_encoded_name_size;	/* written via a size_t * */
+	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
+	struct ecryptfs_inode_info *inode_info;
+	struct dentry *lower_dir_dentry, *lower_dentry;
+	int rc = 0;
+
+	ecryptfs_dentry->d_op = &ecryptfs_dops;
+	if ((ecryptfs_dentry->d_name.len == 1
+	     && !strcmp(ecryptfs_dentry->d_name.name, "."))
+	    || (ecryptfs_dentry->d_name.len == 2
+		&& !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
+		goto out_d_drop;
+	}
+	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
+	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+				      lower_dir_dentry,
+				      ecryptfs_dentry->d_name.len);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+		       "lower_dentry = [%s]\n", __func__, rc,
+		       ecryptfs_dentry->d_name.name);
+		goto out_d_drop;
+	}
+	if (lower_dentry->d_inode)
+		goto lookup_and_interpose;
+	inode_info =  ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+	if (inode_info) {
+		crypt_stat = &inode_info->crypt_stat;
+		/* TODO: lock for crypt_stat comparison */
+		if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+			ecryptfs_set_default_sizes(crypt_stat);
+	}
+	if (crypt_stat)
+		mount_crypt_stat = crypt_stat->mount_crypt_stat;
+	else
+		mount_crypt_stat = &ecryptfs_superblock_to_private(
+			ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
+	    && !(mount_crypt_stat && (mount_crypt_stat->flags
+				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+		goto lookup_and_interpose;
+	dput(lower_dentry);
+	rc = ecryptfs_encrypt_and_encode_filename(
+		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
+		crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+		ecryptfs_dentry->d_name.len);
+	if (rc) {
+		printk(KERN_ERR "%s: Error attempting to encrypt and encode "
+		       "filename; rc = [%d]\n", __func__, rc);
+		goto out_d_drop;
+	}
+	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+				      lower_dir_dentry,
+				      encrypted_and_encoded_name_size - 1);
+	if (IS_ERR(lower_dentry)) {
+		rc = PTR_ERR(lower_dentry);
+		printk(KERN_ERR "%s: lookup_one_len() returned [%d] on "
+		       "lower_dentry = [%s]\n", __func__, rc,
+		       encrypted_and_encoded_name);
+		goto out_d_drop;
+	}
+lookup_and_interpose:
+	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
+						 crypt_stat, ecryptfs_dir_inode,
+						 ecryptfs_nd);
+	goto out;
+out_d_drop:
+	d_drop(ecryptfs_dentry);
+out:
+	kfree(encrypted_and_encoded_name);
 	return ERR_PTR(rc);
 }
 
@@ -466,19 +504,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
 	char *encoded_symname;
-	int encoded_symlen;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
+	size_t encoded_symlen;
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
-	encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
-						  strlen(symname),
-						  &encoded_symname);
-	if (encoded_symlen < 0) {
-		rc = encoded_symlen;
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+		dir->i_sb)->mount_crypt_stat;
+	rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
+						  &encoded_symlen,
+						  NULL,
+						  mount_crypt_stat, symname,
+						  strlen(symname));
+	if (rc)
 		goto out_lock;
-	}
 	rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
 			 encoded_symname);
 	kfree(encoded_symname);
@@ -602,52 +642,54 @@ out_lock:
 }
 
 static int
-ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
+ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 {
-	int rc;
-	struct dentry *lower_dentry;
-	char *decoded_name;
 	char *lower_buf;
-	mm_segment_t old_fs;
+	struct dentry *lower_dentry;
 	struct ecryptfs_crypt_stat *crypt_stat;
+	char *plaintext_name;
+	size_t plaintext_name_size;
+	mm_segment_t old_fs;
+	int rc;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	if (!lower_dentry->d_inode->i_op->readlink) {
 		rc = -EINVAL;
 		goto out;
 	}
+	crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
 	/* Released in this function */
 	lower_buf = kmalloc(bufsiz, GFP_KERNEL);
 	if (lower_buf == NULL) {
-		ecryptfs_printk(KERN_ERR, "Out of memory\n");
+		printk(KERN_ERR "%s: Out of memory whilst attempting to "
+		       "kmalloc [%d] bytes\n", __func__, bufsiz);
 		rc = -ENOMEM;
 		goto out;
 	}
 	old_fs = get_fs();
 	set_fs(get_ds());
-	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-			"lower_dentry->d_name.name = [%s]\n",
-			lower_dentry->d_name.name);
 	rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
 						   (char __user *)lower_buf,
 						   bufsiz);
 	set_fs(old_fs);
 	if (rc >= 0) {
-		crypt_stat = NULL;
-		rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc,
-					      &decoded_name);
-		if (rc == -ENOMEM)
+		rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name,
+							  &plaintext_name_size,
+							  dentry, lower_buf,
+							  rc);
+		if (rc) {
+			printk(KERN_ERR "%s: Error attempting to decode and "
+			       "decrypt filename; rc = [%d]\n", __func__,
+				rc);
 			goto out_free_lower_buf;
-		if (rc > 0) {
-			ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes "
-					"to userspace: [%*s]\n", rc,
-					decoded_name);
-			if (copy_to_user(buf, decoded_name, rc))
-				rc = -EFAULT;
 		}
-		kfree(decoded_name);
-		fsstack_copy_attr_atime(dentry->d_inode,
-					lower_dentry->d_inode);
+		rc = copy_to_user(buf, plaintext_name, plaintext_name_size);
+		if (rc)
+			rc = -EFAULT;
+		else
+			rc = plaintext_name_size;
+		kfree(plaintext_name);
+		fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
 	}
 out_free_lower_buf:
 	kfree(lower_buf);
@@ -669,8 +711,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	}
 	old_fs = get_fs();
 	set_fs(get_ds());
-	ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
-			"dentry->d_name.name = [%s]\n", dentry->d_name.name);
 	rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
 	set_fs(old_fs);
 	if (rc < 0)
-- 
cgit v1.2.3


From 87c94c4df0149786ad91d8a03c738a03369ee9c8 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:01 -0800
Subject: eCryptfs: Filename Encryption: mount option

Enable mount-wide filename encryption by providing the Filename Encryption
Key (FNEK) signature as a mount option.  Note that the ecryptfs-utils
userspace package versions 61 or later support this option.

When mounting with ecryptfs-utils version 61 or later, the mount helper
will detect the availability of the passphrase-based filename encryption
in the kernel (via the eCryptfs sysfs handle) and query the user
interactively as to whether or not he wants to enable the feature for the
mount.  If the user enables filename encryption, the mount helper will
then prompt for the FNEK signature that the user wishes to use, suggesting
by default the signature for the mount passphrase that the user has
already entered for encrypting the file contents.

When not using the mount helper, the user can specify the signature for
the passphrase key with the ecryptfs_fnek_sig= mount option.  This key
must be available in the user's keyring.  The mount helper usually takes
care of this step.  If, however, the user is not mounting with the mount
helper, then he will need to enter the passphrase key into his keyring
with some other utility prior to mounting, such as ecryptfs-manager.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/main.c | 126 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 99 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index fd630713c5c..789cf2e1be1 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
        ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
        ecryptfs_opt_ecryptfs_key_bytes,
        ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
-       ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
+       ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig,
+       ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes,
+       ecryptfs_opt_err };
 
 static const match_table_t tokens = {
 	{ecryptfs_opt_sig, "sig=%s"},
@@ -217,6 +219,9 @@ static const match_table_t tokens = {
 	{ecryptfs_opt_passthrough, "ecryptfs_passthrough"},
 	{ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"},
 	{ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"},
+	{ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"},
+	{ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"},
+	{ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"},
 	{ecryptfs_opt_err, NULL}
 };
 
@@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	int rc = 0;
 	int sig_set = 0;
 	int cipher_name_set = 0;
+	int fn_cipher_name_set = 0;
 	int cipher_key_bytes;
 	int cipher_key_bytes_set = 0;
+	int fn_cipher_key_bytes;
+	int fn_cipher_key_bytes_set = 0;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
 		&ecryptfs_superblock_to_private(sb)->mount_crypt_stat;
 	substring_t args[MAX_OPT_ARGS];
@@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 	char *sig_src;
 	char *cipher_name_dst;
 	char *cipher_name_src;
+	char *fn_cipher_name_dst;
+	char *fn_cipher_name_src;
+	char *fnek_dst;
+	char *fnek_src;
 	char *cipher_key_bytes_src;
+	char *fn_cipher_key_bytes_src;
 
 	if (!options) {
 		rc = -EINVAL;
@@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				global_default_cipher_name;
 			strncpy(cipher_name_dst, cipher_name_src,
 				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-			ecryptfs_printk(KERN_DEBUG,
-					"The mount_crypt_stat "
-					"global_default_cipher_name set to: "
-					"[%s]\n", cipher_name_dst);
+			cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
 			cipher_name_set = 1;
 			break;
 		case ecryptfs_opt_ecryptfs_key_bytes:
@@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 						   &cipher_key_bytes_src, 0);
 			mount_crypt_stat->global_default_cipher_key_size =
 				cipher_key_bytes;
-			ecryptfs_printk(KERN_DEBUG,
-					"The mount_crypt_stat "
-					"global_default_cipher_key_size "
-					"set to: [%d]\n", mount_crypt_stat->
-					global_default_cipher_key_size);
 			cipher_key_bytes_set = 1;
 			break;
 		case ecryptfs_opt_passthrough:
@@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 			mount_crypt_stat->flags |=
 				ECRYPTFS_ENCRYPTED_VIEW_ENABLED;
 			break;
+		case ecryptfs_opt_fnek_sig:
+			fnek_src = args[0].from;
+			fnek_dst =
+				mount_crypt_stat->global_default_fnek_sig;
+			strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX);
+			mount_crypt_stat->global_default_fnek_sig[
+				ECRYPTFS_SIG_SIZE_HEX] = '\0';
+			rc = ecryptfs_add_global_auth_tok(
+				mount_crypt_stat,
+				mount_crypt_stat->global_default_fnek_sig);
+			if (rc) {
+				printk(KERN_ERR "Error attempting to register "
+				       "global fnek sig [%s]; rc = [%d]\n",
+				       mount_crypt_stat->global_default_fnek_sig,
+				       rc);
+				goto out;
+			}
+			mount_crypt_stat->flags |=
+				(ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES
+				 | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK);
+			break;
+		case ecryptfs_opt_fn_cipher:
+			fn_cipher_name_src = args[0].from;
+			fn_cipher_name_dst =
+				mount_crypt_stat->global_default_fn_cipher_name;
+			strncpy(fn_cipher_name_dst, fn_cipher_name_src,
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+			mount_crypt_stat->global_default_fn_cipher_name[
+				ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0';
+			fn_cipher_name_set = 1;
+			break;
+		case ecryptfs_opt_fn_cipher_key_bytes:
+			fn_cipher_key_bytes_src = args[0].from;
+			fn_cipher_key_bytes =
+				(int)simple_strtol(fn_cipher_key_bytes_src,
+						   &fn_cipher_key_bytes_src, 0);
+			mount_crypt_stat->global_default_fn_cipher_key_bytes =
+				fn_cipher_key_bytes;
+			fn_cipher_key_bytes_set = 1;
+			break;
 		case ecryptfs_opt_err:
 		default:
-			ecryptfs_printk(KERN_WARNING,
-					"eCryptfs: unrecognized option '%s'\n",
-					p);
+			printk(KERN_WARNING
+			       "%s: eCryptfs: unrecognized option [%s]\n",
+			       __func__, p);
 		}
 	}
 	if (!sig_set) {
@@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
 
 		BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
-
 		strcpy(mount_crypt_stat->global_default_cipher_name,
 		       ECRYPTFS_DEFAULT_CIPHER);
 	}
-	if (!cipher_key_bytes_set) {
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_name_set)
+		strcpy(mount_crypt_stat->global_default_fn_cipher_name,
+		       mount_crypt_stat->global_default_cipher_name);
+	if (!cipher_key_bytes_set)
 		mount_crypt_stat->global_default_cipher_key_size = 0;
-	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !fn_cipher_key_bytes_set)
+		mount_crypt_stat->global_default_fn_cipher_key_bytes =
+			mount_crypt_stat->global_default_cipher_key_size;
 	mutex_lock(&key_tfm_list_mutex);
 	if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
-				 NULL))
+				 NULL)) {
 		rc = ecryptfs_add_new_key_tfm(
 			NULL, mount_crypt_stat->global_default_cipher_name,
 			mount_crypt_stat->global_default_cipher_key_size);
-	mutex_unlock(&key_tfm_list_mutex);
-	if (rc) {
-		printk(KERN_ERR "Error attempting to initialize cipher with "
-		       "name = [%s] and key size = [%td]; rc = [%d]\n",
-		       mount_crypt_stat->global_default_cipher_name,
-		       mount_crypt_stat->global_default_cipher_key_size, rc);
-		rc = -EINVAL;
-		goto out;
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_cipher_name,
+			       mount_crypt_stat->global_default_cipher_key_size,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
 	}
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !ecryptfs_tfm_exists(
+		    mount_crypt_stat->global_default_fn_cipher_name, NULL)) {
+		rc = ecryptfs_add_new_key_tfm(
+			NULL, mount_crypt_stat->global_default_fn_cipher_name,
+			mount_crypt_stat->global_default_fn_cipher_key_bytes);
+		if (rc) {
+			printk(KERN_ERR "Error attempting to initialize "
+			       "cipher with name = [%s] and key size = [%td]; "
+			       "rc = [%d]\n",
+			       mount_crypt_stat->global_default_fn_cipher_name,
+			       mount_crypt_stat->global_default_fn_cipher_key_bytes,
+			       rc);
+			rc = -EINVAL;
+			mutex_unlock(&key_tfm_list_mutex);
+			goto out;
+		}
+	}
+	mutex_unlock(&key_tfm_list_mutex);
 	rc = ecryptfs_init_global_auth_toks(mount_crypt_stat);
-	if (rc) {
+	if (rc)
 		printk(KERN_WARNING "One or more global auth toks could not "
 		       "properly register; rc = [%d]\n", rc);
-	}
 out:
 	return rc;
 }
-- 
cgit v1.2.3


From df261c52abdef147084c76ecf14473184e907547 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:02 -0800
Subject: eCryptfs: Replace %Z with %z

%Z is a gcc-ism. Using %z instead.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c    |  8 ++++----
 fs/ecryptfs/keystore.c  | 18 +++++++++---------
 fs/ecryptfs/messaging.c |  4 ++--
 fs/ecryptfs/miscdev.c   | 18 +++++++++---------
 4 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index ea2afd2ce22..490b129311e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1679,7 +1679,7 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename,
 			kmalloc(filename->encrypted_filename_size, GFP_KERNEL);
 		if (!filename->encrypted_filename) {
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc [%Zd] bytes\n", __func__,
+			       "to kmalloc [%zd] bytes\n", __func__,
 			       filename->encrypted_filename_size);
 			rc = -ENOMEM;
 			goto out;
@@ -1752,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	*key_tfm = NULL;
 	if (*key_size > ECRYPTFS_MAX_KEY_BYTES) {
 		rc = -EINVAL;
-		printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum "
+		printk(KERN_ERR "Requested key size is [%zd] bytes; maximum "
 		      "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES);
 		goto out;
 	}
@@ -1777,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm,
 	get_random_bytes(dummy_key, *key_size);
 	rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size);
 	if (rc) {
-		printk(KERN_ERR "Error attempting to set key of size [%Zd] for "
+		printk(KERN_ERR "Error attempting to set key of size [%zd] for "
 		       "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc);
 		rc = -EINVAL;
 		goto out;
@@ -2221,7 +2221,7 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
 		if (!decoded_name) {
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc [%Zd] bytes\n", __func__,
+			       "to kmalloc [%zd] bytes\n", __func__,
 			       decoded_name_size);
 			rc = -ENOMEM;
 			goto out;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index dafceb5560e..e6a96e8f5e6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
 	/* verify that everything through the encrypted FEK size is present */
 	if (message_len < 4) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable "
+		printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable "
 		       "message length is [%d]\n", __func__, message_len, 4);
 		goto out;
 	}
@@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec,
 	i += data_len;
 	if (message_len < (i + key_rec->enc_key_size)) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n",
+		printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n",
 		       __func__, message_len, (i + key_rec->enc_key_size));
 		goto out;
 	}
 	if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) {
 		rc = -EIO;
-		printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than "
+		printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than "
 		       "the maximum key size [%d]\n", __func__,
 		       key_rec->enc_key_size,
 		       ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES);
@@ -511,7 +511,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
-		       "[%d] bytes of kernel memory\n", __func__, sizeof(*s));
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -566,7 +566,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 					    GFP_KERNEL);
 	if (!s->block_aligned_filename) {
 		printk(KERN_ERR "%s: Out of kernel memory whilst attempting to "
-		       "kzalloc [%Zd] bytes\n", __func__,
+		       "kzalloc [%zd] bytes\n", __func__,
 		       s->block_aligned_filename_size);
 		rc = -ENOMEM;
 		goto out_unlock;
@@ -721,7 +721,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		printk(KERN_ERR "%s: Error setting key for crypto context; "
 		       "rc = [%d]. s->auth_tok->token.password.session_key_"
 		       "encryption_key = [0x%p]; mount_crypt_stat->"
-		       "global_default_fn_cipher_key_bytes = [%Zd]\n", __func__,
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
 		       rc,
 		       s->auth_tok->token.password.session_key_encryption_key,
 		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
@@ -792,7 +792,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 	if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) {
-		printk(KERN_WARNING "%s: max_packet_size is [%Zd]; it must be "
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be "
 		       "at least [%d]\n", __func__, max_packet_size,
 			(1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1));
 		rc = -EINVAL;
@@ -909,7 +909,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		printk(KERN_ERR "%s: Error setting key for crypto context; "
 		       "rc = [%d]. s->auth_tok->token.password.session_key_"
 		       "encryption_key = [0x%p]; mount_crypt_stat->"
-		       "global_default_fn_cipher_key_bytes = [%Zd]\n", __func__,
+		       "global_default_fn_cipher_key_bytes = [%zd]\n", __func__,
 		       rc,
 		       s->auth_tok->token.password.session_key_encryption_key,
 		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
@@ -936,7 +936,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	s->i++;
 	(*filename_size) = (s->block_aligned_filename_size - s->i);
 	if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) {
-		printk(KERN_WARNING "%s: Filename size is [%Zd], which is "
+		printk(KERN_WARNING "%s: Filename size is [%zd], which is "
 		       "invalid\n", __func__, (*filename_size));
 		rc = -EINVAL;
 		goto out_free_unlock;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 6913f727624..96ef51489e0 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
 	(*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL);
 	if (!(*daemon)) {
 		rc = -ENOMEM;
-		printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
 		       "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
 		goto out;
 	}
@@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
 	msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL);
 	if (!msg_ctx->msg) {
 		rc = -ENOMEM;
-		printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of "
+		printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of "
 		       "GFP_KERNEL memory\n", __func__, msg_size);
 		goto unlock;
 	}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index efd95a0ed1e..a67fea655f4 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 		if (!msg_ctx->msg) {
 			rc = -ENOMEM;
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kmalloc(%Zd, GFP_KERNEL)\n", __func__,
+			       "to kmalloc(%zd, GFP_KERNEL)\n", __func__,
 			       (sizeof(*msg_ctx->msg) + data_size));
 			goto out_unlock;
 		}
@@ -322,7 +322,7 @@ check_list:
 	if (count < total_length) {
 		rc = 0;
 		printk(KERN_WARNING "%s: Only given user buffer of "
-		       "size [%Zd], but we need [%Zd] to read the "
+		       "size [%zd], but we need [%zd] to read the "
 		       "pending message\n", __func__, count, total_length);
 		goto out_unlock_msg_ctx;
 	}
@@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
 
 	if ((sizeof(*msg) + msg->data_len) != data_size) {
 		printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = "
-		       "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__,
+		       "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__,
 		       (sizeof(*msg) + msg->data_len), data_size);
 		rc = -EINVAL;
 		goto out;
@@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	data = kmalloc(count, GFP_KERNEL);
 	if (!data) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count);
+		       "kmalloc([%zd], GFP_KERNEL)\n", __func__, count);
 		goto out;
 	}
 	rc = copy_from_user(data, buf, count);
@@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 	case ECRYPTFS_MSG_RESPONSE:
 		if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) {
 			printk(KERN_WARNING "%s: Minimum acceptable packet "
-			       "size is [%Zd], but amount of data written is "
-			       "only [%Zd]. Discarding response packet.\n",
+			       "size is [%zd], but amount of data written is "
+			       "only [%zd]. Discarding response packet.\n",
 			       __func__,
 			       (1 + 4 + 1 + sizeof(struct ecryptfs_message)),
 			       count);
@@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
 		}
 		i += packet_size_length;
 		if ((1 + 4 + packet_size_length + packet_size) != count) {
-			printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])"
-			       " + packet_size([%Zd]))([%Zd]) != "
-			       "count([%Zd]). Invalid packet format.\n",
+			printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
+			       " + packet_size([%zd]))([%zd]) != "
+			       "count([%zd]). Invalid packet format.\n",
 			       __func__, packet_size_length, packet_size,
 			       (1 + packet_size_length + packet_size), count);
 			goto out_free;
-- 
cgit v1.2.3


From a8f12864c52f8ab8520568dc97969c1749ae60bf Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:03 -0800
Subject: eCryptfs: Fix data types (int/size_t)

Correct several format string data type specifiers.  Correct filename size
data types; they should be size_t rather than int when passed as
parameters to some other functions (although note that filename sizes will
never exceed the range of an int).

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c   |  4 ++--
 fs/ecryptfs/file.c     |  2 +-
 fs/ecryptfs/inode.c    |  2 +-
 fs/ecryptfs/keystore.c | 24 ++++++++++++------------
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 490b129311e..e935a222498 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -2093,7 +2093,7 @@ int ecryptfs_encrypt_and_encode_filename(
 		filename = kzalloc(sizeof(*filename), GFP_KERNEL);
 		if (!filename) {
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kzalloc [%d] bytes\n", __func__,
+			       "to kzalloc [%zd] bytes\n", __func__,
 			       sizeof(*filename));
 			rc = -ENOMEM;
 			goto out;
@@ -2127,7 +2127,7 @@ int ecryptfs_encrypt_and_encode_filename(
 		(*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL);
 		if (!(*encoded_name)) {
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
-			       "to kzalloc [%d] bytes\n", __func__,
+			       "to kzalloc [%zd] bytes\n", __func__,
 			       (*encoded_name_size));
 			rc = -ENOMEM;
 			kfree(filename->encrypted_filename);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 567eb4bee1b..9e944057001 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -82,7 +82,7 @@ ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen,
 {
 	struct ecryptfs_getdents_callback *buf =
 	    (struct ecryptfs_getdents_callback *)dirent;
-	int name_size;
+	size_t name_size;
 	char *name;
 	int rc;
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 38309ce94d7..7168a88cdbc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -362,7 +362,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 				      struct nameidata *ecryptfs_nd)
 {
 	char *encrypted_and_encoded_name = NULL;
-	int encrypted_and_encoded_name_size;
+	size_t encrypted_and_encoded_name_size;
 	struct ecryptfs_crypt_stat *crypt_stat = NULL;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 	struct ecryptfs_inode_info *inode_info;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e6a96e8f5e6..c90ca5dfc50 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -556,8 +556,8 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		goto out_unlock;
 	}
 	if (s->max_packet_size > (*remaining_bytes)) {
-		printk(KERN_WARNING "%s: Require [%d] bytes to write; only "
-		       "[%d] available\n", __func__, s->max_packet_size,
+		printk(KERN_WARNING "%s: Require [%zd] bytes to write; only "
+		       "[%zd] available\n", __func__, s->max_packet_size,
 		       (*remaining_bytes));
 		rc = -EINVAL;
 		goto out_unlock;
@@ -594,7 +594,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		mount_crypt_stat->global_default_fn_cipher_key_bytes);
 	if (s->cipher_code == 0) {
 		printk(KERN_WARNING "%s: Unable to generate code for "
-		       "cipher [%s] with key bytes [%d]\n", __func__,
+		       "cipher [%s] with key bytes [%zd]\n", __func__,
 		       mount_crypt_stat->global_default_fn_cipher_name,
 		       mount_crypt_stat->global_default_fn_cipher_key_bytes);
 		rc = -EINVAL;
@@ -693,7 +693,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert filename memory to scatterlist; "
 		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%d]\n", __func__, rc,
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
 		       s->block_aligned_filename_size);
 		goto out_release_free_unlock;
 	}
@@ -703,7 +703,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert encrypted filename memory to scatterlist; "
 		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%d]\n", __func__, rc,
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
 		       s->block_aligned_filename_size);
 		goto out_release_free_unlock;
 	}
@@ -787,7 +787,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
-		       "[%d] bytes of kernel memory\n", __func__, sizeof(*s));
+		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -825,8 +825,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 					  - ECRYPTFS_SIG_SIZE - 1);
 	if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size)
 	    > max_packet_size) {
-		printk(KERN_WARNING "%s: max_packet_size is [%d]; real packet "
-		       "size is [%d]\n", __func__, max_packet_size,
+		printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet "
+		       "size is [%zd]\n", __func__, max_packet_size,
 		       (1 + s->packet_size_len + 1
 			+ s->block_aligned_filename_size));
 		rc = -EINVAL;
@@ -860,7 +860,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert encrypted filename memory to scatterlist; "
 		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%d]\n", __func__, rc,
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
 		       s->block_aligned_filename_size);
 		goto out_unlock;
 	}
@@ -869,7 +869,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 					GFP_KERNEL);
 	if (!s->decrypted_filename) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%d] bytes\n", __func__,
+		       "kmalloc [%zd] bytes\n", __func__,
 		       s->block_aligned_filename_size);
 		rc = -ENOMEM;
 		goto out_unlock;
@@ -880,7 +880,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 		printk(KERN_ERR "%s: Internal error whilst attempting to "
 		       "convert decrypted filename memory to scatterlist; "
 		       "expected rc = 1; got rc = [%d]. "
-		       "block_aligned_filename_size = [%d]\n", __func__, rc,
+		       "block_aligned_filename_size = [%zd]\n", __func__, rc,
 		       s->block_aligned_filename_size);
 		goto out_free_unlock;
 	}
@@ -944,7 +944,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	(*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL);
 	if (!(*filename)) {
 		printk(KERN_ERR "%s: Out of memory whilst attempting to "
-		       "kmalloc [%d] bytes\n", __func__,
+		       "kmalloc [%zd] bytes\n", __func__,
 		       ((*filename_size) + 1));
 		rc = -ENOMEM;
 		goto out_free_unlock;
-- 
cgit v1.2.3


From 7d8bc2be51706152828164b305e969b4a8471041 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:04 -0800
Subject: eCryptfs: kerneldoc for ecryptfs_parse_tag_70_packet()

Kerneldoc updates for ecryptfs_parse_tag_70_packet().

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/keystore.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c90ca5dfc50..ff539420cc6 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -771,6 +771,17 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
 /**
  * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet
  * @filename: This function kmalloc's the memory for the filename
+ * @filename_size: This function sets this to the amount of memory
+ *                 kmalloc'd for the filename
+ * @packet_size: This function sets this to the the number of octets
+ *               in the packet parsed
+ * @mount_crypt_stat: The mount-wide cryptographic context
+ * @data: The memory location containing the start of the tag 70
+ *        packet
+ * @max_packet_size: The maximum legal size of the packet to be parsed
+ *                   from @data
+ *
+ * Returns zero on success; non-zero otherwise
  */
 int
 ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
-- 
cgit v1.2.3


From 71c11c378f46e42ca67c1e227646ce23bf43a8c6 Mon Sep 17 00:00:00 2001
From: Michael Halcrow <mhalcrow@us.ibm.com>
Date: Tue, 6 Jan 2009 14:42:05 -0800
Subject: eCryptfs: Clean up ecryptfs_decode_from_filename()

Flesh out the comments for ecryptfs_decode_from_filename(). Remove the
return condition, since it is always 0.

Signed-off-by: Michael Halcrow <mhalcrow@us.ibm.com>
Cc: Dustin Kirkland <dustin.kirkland@gmail.com>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Tyler Hicks <tchicks@us.ibm.com>
Cc: David Kleikamp <shaggy@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/crypto.c | 51 ++++++++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e935a222498..c01e043670e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1931,7 +1931,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD"
 
 /* We could either offset on every reverse map or just pad some 0x00's
  * at the front here */
-static unsigned char filename_rev_map[] = {
+static const unsigned char filename_rev_map[] = {
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */
@@ -2012,16 +2012,30 @@ out:
 	return;
 }
 
-int ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
-				  const unsigned char *src, size_t src_size)
+/**
+ * ecryptfs_decode_from_filename
+ * @dst: If NULL, this function only sets @dst_size and returns. If
+ *       non-NULL, this function decodes the encoded octets in @src
+ *       into the memory that @dst points to.
+ * @dst_size: Set to the size of the decoded string.
+ * @src: The encoded set of octets to decode.
+ * @src_size: The size of the encoded set of octets to decode.
+ */
+static void
+ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
+			      const unsigned char *src, size_t src_size)
 {
 	u8 current_bit_offset = 0;
 	size_t src_byte_offset = 0;
 	size_t dst_byte_offset = 0;
-	int rc = 0;
 
 	if (dst == NULL) {
-		/* Not exact; conservatively long */
+		/* Not exact; conservatively long. Every block of 4
+		 * encoded characters decodes into a block of 3
+		 * decoded characters. This segment of code provides
+		 * the caller with the maximum amount of allocated
+		 * space that @dst will need to point to in a
+		 * subsequent call. */
 		(*dst_size) = (((src_size + 1) * 3) / 4);
 		goto out;
 	}
@@ -2055,7 +2069,7 @@ int ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size,
 	}
 	(*dst_size) = dst_byte_offset;
 out:
-	return rc;
+	return;
 }
 
 /**
@@ -2208,16 +2222,8 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 
 		name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
 		name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE;
-		rc = ecryptfs_decode_from_filename(NULL, &decoded_name_size,
-						   name, name_size);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to decode "
-			       "filename; rc = [%d]\n", __func__, rc);
-			rc = ecryptfs_copy_filename(plaintext_name,
-						    plaintext_name_size,
-						    orig_name, orig_name_size);
-			goto out;
-		}
+		ecryptfs_decode_from_filename(NULL, &decoded_name_size,
+					      name, name_size);
 		decoded_name = kmalloc(decoded_name_size, GFP_KERNEL);
 		if (!decoded_name) {
 			printk(KERN_ERR "%s: Out of memory whilst attempting "
@@ -2226,17 +2232,8 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 			rc = -ENOMEM;
 			goto out;
 		}
-		rc = ecryptfs_decode_from_filename(decoded_name,
-						   &decoded_name_size,
-						   name, name_size);
-		if (rc) {
-			printk(KERN_ERR "%s: Error attempting to decode "
-			       "filename; rc = [%d]\n", __func__, rc);
-			rc = ecryptfs_copy_filename(plaintext_name,
-						    plaintext_name_size,
-						    orig_name, orig_name_size);
-			goto out_free;
-		}
+		ecryptfs_decode_from_filename(decoded_name, &decoded_name_size,
+					      name, name_size);
 		rc = ecryptfs_parse_tag_70_packet(plaintext_name,
 						  plaintext_name_size,
 						  &packet_size,
-- 
cgit v1.2.3


From f70f582f0072f37790d2984647198deb3e7782a3 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Tue, 6 Jan 2009 14:42:05 -0800
Subject: fs/ecryptfs/inode.c: cleanup kerneldoc

Arguments lower_dentry and ecryptfs_dentry in ecryptfs_create_underlying_file()
have been merged into dentry, now fix it.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Michael Halcrow <mhalcrow@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ecryptfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 7168a88cdbc..5697899a168 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir)
 /**
  * ecryptfs_create_underlying_file
  * @lower_dir_inode: inode of the parent in the lower fs of the new file
- * @lower_dentry: New file's dentry in the lower fs
- * @ecryptfs_dentry: New file's dentry in ecryptfs
+ * @dentry: New file's dentry
  * @mode: The mode of the new file
  * @nd: nameidata of ecryptfs' parent's dentry & vfsmount
  *
-- 
cgit v1.2.3


From 730c9eeca9808fc2cfb506cc68c90aa330da17b0 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 6 Jan 2009 14:42:06 -0800
Subject: autofs4: improve parameter usage

The parameter usage in the device node ioctl code uses arg1 and arg2 as
parameter names.  This patch redefines the parameter names to reflect what
they actually are in an effort to make the code more readable.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 54 +++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 63b7c7afe8d..054d6d9ad9b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -180,7 +180,7 @@ static int autofs_dev_ioctl_protover(struct file *fp,
 				     struct autofs_sb_info *sbi,
 				     struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->version;
+	param->protover.version = sbi->version;
 	return 0;
 }
 
@@ -189,7 +189,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 					struct autofs_sb_info *sbi,
 					struct autofs_dev_ioctl *param)
 {
-	param->arg1 = sbi->sub_version;
+	param->protosubver.sub_version = sbi->sub_version;
 	return 0;
 }
 
@@ -335,13 +335,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	int err, fd;
 
 	/* param->path has already been checked */
-	if (!param->arg1)
+	if (!param->openmount.devid)
 		return -EINVAL;
 
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->arg1;
+	devid = param->openmount.devid;
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -373,7 +373,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
 {
 	autofs_wqt_t token;
 
-	token = (autofs_wqt_t) param->arg1;
+	token = (autofs_wqt_t) param->ready.token;
 	return autofs4_wait_release(sbi, token, 0);
 }
 
@@ -388,8 +388,8 @@ static int autofs_dev_ioctl_fail(struct file *fp,
 	autofs_wqt_t token;
 	int status;
 
-	token = (autofs_wqt_t) param->arg1;
-	status = param->arg2 ? param->arg2 : -ENOENT;
+	token = (autofs_wqt_t) param->fail.token;
+	status = param->fail.status ? param->fail.status : -ENOENT;
 	return autofs4_wait_release(sbi, token, status);
 }
 
@@ -412,10 +412,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 	int pipefd;
 	int err = 0;
 
-	if (param->arg1 == -1)
+	if (param->setpipefd.pipefd == -1)
 		return -EINVAL;
 
-	pipefd = param->arg1;
+	pipefd = param->setpipefd.pipefd;
 
 	mutex_lock(&sbi->wq_mutex);
 	if (!sbi->catatonic) {
@@ -457,8 +457,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
 {
 	unsigned long timeout;
 
-	timeout = param->arg1;
-	param->arg1 = sbi->exp_timeout / HZ;
+	timeout = param->timeout.timeout;
+	param->timeout.timeout = sbi->exp_timeout / HZ;
 	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
@@ -489,7 +489,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	path = param->path;
 	devid = sbi->sb->s_dev;
 
-	param->arg1 = param->arg2 = -1;
+	param->requester.uid = param->requester.gid = -1;
 
 	/* Get nameidata of the parent directory */
 	err = path_lookup(path, LOOKUP_PARENT, &nd);
@@ -505,8 +505,8 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(nd.path.dentry);
 		spin_lock(&sbi->fs_lock);
-		param->arg1 = ino->uid;
-		param->arg2 = ino->gid;
+		param->requester.uid = ino->uid;
+		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
 
@@ -529,7 +529,7 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	int err = -EAGAIN;
 	int how;
 
-	how = param->arg1;
+	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
 	if (sbi->type & AUTOFS_TYPE_TRIGGER)
@@ -565,9 +565,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp,
 				      struct autofs_sb_info *sbi,
 				      struct autofs_dev_ioctl *param)
 {
-	param->arg1 = 0;
+	param->askumount.may_umount = 0;
 	if (may_umount(fp->f_path.mnt))
-		param->arg1 = 1;
+		param->askumount.may_umount = 1;
 	return 0;
 }
 
@@ -600,6 +600,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	struct nameidata nd;
 	const char *path;
 	unsigned int type;
+	unsigned int devid, magic;
 	int err = -ENOENT;
 
 	if (param->size <= sizeof(*param)) {
@@ -608,10 +609,10 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	}
 
 	path = param->path;
-	type = param->arg1;
+	type = param->ismountpoint.in.type;
 
-	param->arg1 = 0;
-	param->arg2 = 0;
+	param->ismountpoint.out.devid = devid = 0;
+	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
 		if (type == AUTOFS_TYPE_ANY) {
@@ -622,7 +623,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out;
 
 			sb = nd.path.dentry->d_sb;
-			param->arg1 = new_encode_dev(sb->s_dev);
+			devid = new_encode_dev(sb->s_dev);
 		} else {
 			struct autofs_info *ino;
 
@@ -635,14 +636,14 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 				goto out_release;
 
 			ino = autofs4_dentry_ino(nd.path.dentry);
-			param->arg1 = autofs4_get_dev(ino->sbi);
+			devid = autofs4_get_dev(ino->sbi);
 		}
 
 		err = 0;
 		if (nd.path.dentry->d_inode &&
 		    nd.path.mnt->mnt_root == nd.path.dentry) {
 			err = 1;
-			param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
 		dev_t devid = new_encode_dev(sbi->sb->s_dev);
@@ -655,18 +656,21 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		if (err)
 			goto out_release;
 
-		param->arg1 = autofs4_get_dev(sbi);
+		devid = autofs4_get_dev(sbi);
 
 		err = have_submounts(nd.path.dentry);
 
 		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
 			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
 				struct inode *inode = nd.path.dentry->d_inode;
-				param->arg2 = inode->i_sb->s_magic;
+				magic = inode->i_sb->s_magic;
 			}
 		}
 	}
 
+	param->ismountpoint.out.devid = devid;
+	param->ismountpoint.out.magic = magic;
+
 out_release:
 	path_put(&nd.path);
 out:
-- 
cgit v1.2.3


From 41cfef2eb87694a8d64105c059b39f7bd6b7d4fe Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 6 Jan 2009 14:42:07 -0800
Subject: autofs4: fix var shadowed by local declaration

A local definition of devid in autofs_dev_ioctl_ismountpoint() shadows
the function-wide definition.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 054d6d9ad9b..0566ff8db4c 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -646,17 +646,17 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 			magic = nd.path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t devid = new_encode_dev(sbi->sb->s_dev);
+		dev_t dev = autofs4_get_dev(sbi);
 
 		err = path_lookup(path, LOOKUP_PARENT, &nd);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, devid);
+		err = autofs_dev_ioctl_find_super(&nd, dev);
 		if (err)
 			goto out_release;
 
-		devid = autofs4_get_dev(sbi);
+		devid = dev;
 
 		err = have_submounts(nd.path.dentry);
 
-- 
cgit v1.2.3


From a92daf6ba1f9ace8584edc8eb557a77aa7c2c71d Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 6 Jan 2009 14:42:08 -0800
Subject: autofs4: make autofs type usage explicit

- the type assigned at mount when no type is given is changed
  from 0 to AUTOFS_TYPE_INDIRECT. This was done because 0 and
  AUTOFS_TYPE_INDIRECT were being treated implicitly as the same
  type.

- previously, an offset mount had its type set to
  AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET but the mount control
  re-implementation needs to be able to distinguish all three types.
  So this was changed to make the type setting explicit.

- a type AUTOFS_TYPE_ANY was added for use by the re-implementation
  when checking if a given path is a mountpoint. It's not really a
  type as we use this to ask if a given path is a mountpoint in the
  autofs_dev_ioctl_ismountpoint() function.

- functions to set and test the autofs mount types have been added to
  improve readability and make the type usage explicit.

- the mount type is used from user space for the mount control
  re-implementation so, for consistency, all the definitions have
  been moved to the user space include file include/linux/auto_fs4.h.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/autofs_i.h  |  2 --
 fs/autofs4/dev-ioctl.c |  4 ++--
 fs/autofs4/expire.c    |  4 ++--
 fs/autofs4/inode.c     | 14 +++++++-------
 fs/autofs4/waitq.c     |  8 ++++----
 5 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index e0f16da00e5..a76803108d0 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -25,8 +25,6 @@
 #define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
 #define AUTOFS_DEV_IOCTL_IOC_COUNT	(AUTOFS_IOC_COUNT - 11)
 
-#define AUTOFS_TYPE_TRIGGER	(AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)
-
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/time.h>
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 0566ff8db4c..aa3f47293bb 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -532,7 +532,7 @@ static int autofs_dev_ioctl_expire(struct file *fp,
 	how = param->expire.how;
 	mnt = fp->f_path.mnt;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
 	else
 		dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
@@ -615,7 +615,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (type == AUTOFS_TYPE_ANY) {
+		if (autofs_type_any(type)) {
 			struct super_block *sb;
 
 			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4b6fb3f628c..e3bd50776f9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
-		if (sbi->type == AUTOFS_TYPE_INDIRECT)
+		if (autofs_type_indirect(sbi->type))
 			goto done;
 
 		/*
@@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	if (arg && get_user(do_now, arg))
 		return -EFAULT;
 
-	if (sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (autofs_type_trigger(sbi->type))
 		dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
 	else
 		dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index cfc23e53b6f..716e12b627b 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt)
 	seq_printf(m, ",minproto=%d", sbi->min_proto);
 	seq_printf(m, ",maxproto=%d", sbi->max_proto);
 
-	if (sbi->type & AUTOFS_TYPE_OFFSET)
+	if (autofs_type_offset(sbi->type))
 		seq_printf(m, ",offset");
-	else if (sbi->type & AUTOFS_TYPE_DIRECT)
+	else if (autofs_type_direct(sbi->type))
 		seq_printf(m, ",direct");
 	else
 		seq_printf(m, ",indirect");
@@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid,
 			*maxproto = option;
 			break;
 		case Opt_indirect:
-			*type = AUTOFS_TYPE_INDIRECT;
+			set_autofs_type_indirect(type);
 			break;
 		case Opt_direct:
-			*type = AUTOFS_TYPE_DIRECT;
+			set_autofs_type_direct(type);
 			break;
 		case Opt_offset:
-			*type = AUTOFS_TYPE_OFFSET;
+			set_autofs_type_offset(type);
 			break;
 		default:
 			return 1;
@@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->sb = s;
 	sbi->version = 0;
 	sbi->sub_version = 0;
-	sbi->type = AUTOFS_TYPE_INDIRECT;
+	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
@@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	}
 
 	root_inode->i_fop = &autofs4_root_operations;
-	root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ?
+	root_inode->i_op = autofs_type_trigger(sbi->type) ?
 			&autofs4_direct_root_inode_operations :
 			&autofs4_indirect_root_inode_operations;
 
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e02cc8ae5eb..eeb24684590 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		 * is very similar for indirect mounts except only dentrys
 		 * in the root of the autofs file system may be negative.
 		 */
-		if (sbi->type & AUTOFS_TYPE_TRIGGER)
+		if (autofs_type_trigger(sbi->type))
 			return -ENOENT;
 		else if (!IS_ROOT(dentry->d_parent))
 			return -ENOENT;
@@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		return -ENOMEM;
 
 	/* If this is a direct mount request create a dummy name */
-	if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER)
+	if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
 		qstr.len = sprintf(name, "%p", dentry);
 	else {
 		qstr.len = autofs4_getpath(sbi, dentry, &name);
@@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 				type = autofs_ptype_expire_multi;
 		} else {
 			if (notify == NFY_MOUNT)
-				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+				type = autofs_type_trigger(sbi->type) ?
 					autofs_ptype_missing_direct :
 					 autofs_ptype_missing_indirect;
 			else
-				type = (sbi->type & AUTOFS_TYPE_TRIGGER) ?
+				type = autofs_type_trigger(sbi->type) ?
 					autofs_ptype_expire_direct :
 					autofs_ptype_expire_indirect;
 		}
-- 
cgit v1.2.3


From bae8ec66554b27967f057a4b7888b09481ff1b8b Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 6 Jan 2009 14:42:09 -0800
Subject: autofs4: fix string validation check order

In function validate_dev_ioctl() we check that the string we've been sent
is a valid path.  The function that does this check assumes the string is
NULL terminated but our NULL termination check isn't done until after this
call.  This patch changes the order of the check.

Signed-off-by: Ian Kent <raven@themaw.net>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index aa3f47293bb..025e105bffe 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 
 /*
  * Check sanity of parameter control fields and if a path is present
- * check that it has a "/" and is terminated.
+ * check that it is terminated and contains at least one "/".
  */
 static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 {
@@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 	}
 
 	if (param->size > sizeof(*param)) {
-		err = check_name(param->path);
+		err = invalid_str(param->path,
+				 (void *) ((size_t) param + param->size));
 		if (err) {
-			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
-				    cmd);
+			AUTOFS_WARN(
+			  "path string terminator missing for cmd(0x%08x)",
+			  cmd);
 			goto out;
 		}
 
-		err = invalid_str(param->path,
-				 (void *) ((size_t) param + param->size));
+		err = check_name(param->path);
 		if (err) {
 			AUTOFS_WARN("invalid path supplied for cmd(0x%08x)",
 				    cmd);
-- 
cgit v1.2.3


From d6b54841f4ddd836c886d1e6ac381cf309ee98a3 Mon Sep 17 00:00:00 2001
From: Evgeniy Dushistov <dushistov@mail.ru>
Date: Tue, 6 Jan 2009 14:42:38 -0800
Subject: minix: fix add link's wrong position calculation

Fix the add link method.  The position in the directory was calculated in
the wrong way - it had the incorrect shift direction.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Evgeniy Dushistov <dushistov@mail.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: <stable@kernel.org>		[2.6.lots]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/minix/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index f70433816a3..d4946c4c90e 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
 	return -EINVAL;
 
 got_it:
-	pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page);
+	pos = page_offset(page) + p - (char *)page_address(page);
 	err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize,
 					AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
 	if (err)
-- 
cgit v1.2.3


From 8cd3ac3aca3f2afe8570708066d64d893da468e8 Mon Sep 17 00:00:00 2001
From: WANG Cong <wangcong@zeuux.org>
Date: Tue, 6 Jan 2009 14:42:48 -0800
Subject: fs/exec.c: make do_coredump() void

No one cares about do_coredump()'s return value, and it also seems that
it is not necessary. So make it void.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: WANG Cong <wangcong@zeuux.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 6b09d6fa4f7..71a6efe5d8b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1686,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm)
 	return (ret >= 2) ? 2 : ret;
 }
 
-int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
 	struct core_state core_state;
 	char corename[CORENAME_MAX_SIZE + 1];
@@ -1842,5 +1842,5 @@ fail_unlock:
 	put_cred(cred);
 	coredump_finish(mm);
 fail:
-	return retval;
+	return;
 }
-- 
cgit v1.2.3


From e1f89ec95bd28b0927e76c46a7cc0927b7521c1d Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Tue, 6 Jan 2009 14:43:12 -0800
Subject: bfs: add some basic sanity checks

bfs_fill_super() already touches all inodes, so we can easily add some
cheap sanity checks and check if the inode start and end blocks are
smaller than the maximum number of blocks, the inode start block lies
behind the end block or the file end offset is behind the end of the
filesystem.  Also check if the start of data offset in the super block
fits the filesystem.

The added sanity checks catch softlockup issues early when we try to
sb_bread() lots of blocks in a loop in bfs_readdir() and bfs_find_entry().
 In addition an oom issue in bfs_fill_super() is prevented by this when
s_start is corrupted, which influences imap_len and we try to allocate a
huge info->si_imap.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Acked-by: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/inode.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'fs')

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 0ed57b5ee01..1d2bfafcad7 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
 
+	if (!info)
+		return;
+
 	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
@@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 	unsigned i, imap_len;
 	struct bfs_sb_info *info;
 	long ret = -EINVAL;
+	unsigned long i_sblock, i_eblock, i_eoff, s_size;
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 	s->s_magic = BFS_MAGIC;
 	info->si_sbh = bh;
+
+	if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) {
+		printf("Superblock is corrupted\n");
+		goto out;
+	}
+
 	info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) /
 					sizeof(struct bfs_inode)
 					+ BFS_ROOT_INO - 1;
@@ -397,6 +407,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 
 		di = (struct bfs_inode *)bh->b_data + off;
 
+		/* test if filesystem is not corrupted */
+
+		i_eoff = le32_to_cpu(di->i_eoffset);
+		i_sblock = le32_to_cpu(di->i_sblock);
+		i_eblock = le32_to_cpu(di->i_eblock);
+		s_size = le32_to_cpu(bfs_sb->s_end);
+
+		if (i_sblock > info->si_blocks ||
+			i_eblock > info->si_blocks ||
+			i_sblock > i_eblock ||
+			i_eoff > s_size ||
+			i_sblock * BFS_BSIZE > i_eoff) {
+
+			printf("Inode 0x%08x corrupted\n", i);
+
+			brelse(bh);
+			s->s_root = NULL;
+			kfree(info->si_imap);
+			kfree(info);
+			s->s_fs_info = NULL;
+			return -EIO;
+		}
+
 		if (!di->i_ino) {
 			info->si_freei++;
 			continue;
-- 
cgit v1.2.3


From 50682bb4de35544466c264c017030de826614367 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Tue, 6 Jan 2009 14:43:13 -0800
Subject: bfs: check that filesystem fits on the blockdevice

Since all sanity checks rely on the validity of s_start which gets only
checked to be smaller than s_end, we should also check if s_end is sane.
Now we also try to retrieve the last block of the filesystem, which is
computed by s_end.  If this fails, something is bogus.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Acked-by: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/bfs/inode.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1d2bfafcad7..cc4062d12ca 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -390,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
 			- le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS;
 	info->si_freei = 0;
 	info->si_lf_eblk = 0;
+
+	/* can we read the last block? */
+	bh = sb_bread(s, info->si_blocks - 1);
+	if (!bh) {
+		printf("Last block not available: %lu\n", info->si_blocks - 1);
+		iput(inode);
+		ret = -EIO;
+		kfree(info->si_imap);
+		goto out;
+	}
+	brelse(bh);
+
 	bh = NULL;
 	for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) {
 		struct bfs_inode *di;
-- 
cgit v1.2.3


From e4fefbac6c5bcb0388d95e83801210e7d81a071b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 6 Jan 2009 10:08:33 +0000
Subject: GFS2: Set GFP_NOFS when allocating page on write

We need to ensure that we always set GFP_NOFS in this one
particular case when allocating pages for write.

Reported-by: Fabio M. Di Nitto <fdinitto@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 6e4ea36c660..4ddab67867e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -675,6 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		goto out_trans_fail;
 
 	error = -ENOMEM;
+	flags |= AOP_FLAG_NOFS;
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	*pagep = page;
 	if (unlikely(!page))
-- 
cgit v1.2.3


From 0027ce681e3cd49fa34dab023574611f4384291d Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 6 Jan 2009 14:56:26 +0000
Subject: GFS2: LSF and LBD are now one and the same

As a result of this recent patch:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=b3a6ffe16b5cc48abe7db8d04882dc45280eb693
We only need to depend on LBD.

Reported-by: Fabio M. Di Nitto <fdinitto@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index ab2f57e3fb8..e563a644981 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
+	depends on EXPERIMENTAL && (64BIT || LBD)
 	select FS_POSIX_ACL
 	select CRC32
 	help
-- 
cgit v1.2.3


From c8f554b947e80a90e1b43bbd4fd26c27765b5f96 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski <bmarzins@redhat.com>
Date: Tue, 6 Jan 2009 10:47:50 -0600
Subject: GFS2: Fix typo in gfs2_page_mkwrite()

There is a typo in gfs2_page_mkwrite()

gfs2_write_alloc_required() expects pos to be the offset in bytes. However,
instead of the page index being shifted by PAGE_CACHE_SHIFT, it was shifted
by (PAGE_CACHE_SIZE - inode->i_blkbits).  This patch simply shifts the page
index by the proper amount.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 289c5f54ba5..93fe41b67f9 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -342,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned long last_index;
-	u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits);
+	u64 pos = page->index << PAGE_CACHE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
 	int alloc_required = 0;
 	struct gfs2_holder gh;
-- 
cgit v1.2.3


From 9ab86c8e01c3f298dba0cbf2502c635b7f6fc6f9 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Jan 2009 09:48:51 -0500
Subject: Btrfs: kmap_atomic(KM_USER0) is safe for btrfs_readpage_end_io_hook

None of the checksum verification code schedules, so we can use the faster
kmap_atomic

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cdb701165a0..8adfe059ab4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1748,7 +1748,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	} else {
 		ret = get_state_private(io_tree, start, &private);
 	}
-	kaddr = kmap(page);
+	kaddr = kmap_atomic(page, KM_USER0);
 	if (ret)
 		goto zeroit;
 
@@ -1757,7 +1757,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (csum != private)
 		goto zeroit;
 
-	kunmap(page);
+	kunmap_atomic(kaddr, KM_USER0);
 good:
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
@@ -1772,7 +1772,7 @@ zeroit:
 	       (unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
-	kunmap(page);
+	kunmap_atomic(kaddr, KM_USER0);
 	if (private == 0)
 		return 0;
 	return -EIO;
-- 
cgit v1.2.3


From 709ac06a148a33493d3e2f9391bb746b067d96d6 Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Wed, 7 Jan 2009 09:54:24 -0500
Subject: Btrfs: Add Documentation/filesystem/btrfs.txt, remove old COPYING

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/COPYING | 356 -------------------------------------------------------
 fs/btrfs/INSTALL |  48 --------
 2 files changed, 404 deletions(-)
 delete mode 100644 fs/btrfs/COPYING
 delete mode 100644 fs/btrfs/INSTALL

(limited to 'fs')

diff --git a/fs/btrfs/COPYING b/fs/btrfs/COPYING
deleted file mode 100644
index ca442d313d8..00000000000
--- a/fs/btrfs/COPYING
+++ /dev/null
@@ -1,356 +0,0 @@
-
-   NOTE! This copyright does *not* cover user programs that use kernel
- services by normal system calls - this is merely considered normal use
- of the kernel, and does *not* fall under the heading of "derived work".
- Also note that the GPL below is copyrighted by the Free Software
- Foundation, but the instance of code that it refers to (the Linux
- kernel) is copyrighted by me and others who actually wrote it.
-
- Also note that the only valid version of the GPL as far as the kernel
- is concerned is _this_ particular version of the license (ie v2, not
- v2.2 or v3.x or whatever), unless explicitly otherwise stated.
-
-			Linus Torvalds
-
-----------------------------------------
-
-		    GNU GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
-
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-                       51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users.  This
-General Public License applies to most of the Free Software
-Foundation's software and to any other program whose authors commit to
-using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
-your programs, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if you
-distribute copies of the software, or if you modify it.
-
-  For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must give the recipients all the rights that
-you have.  You must make sure that they, too, receive or can get the
-source code.  And you must show them these terms so they know their
-rights.
-
-  We protect your rights with two steps: (1) copyright the software, and
-(2) offer you this license which gives you legal permission to copy,
-distribute and/or modify the software.
-
-  Also, for each author's protection and ours, we want to make certain
-that everyone understands that there is no warranty for this free
-software.  If the software is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original, so
-that any problems introduced by others will not reflect on the original
-authors' reputations.
-
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that redistributors of a free
-program will individually obtain patent licenses, in effect making the
-program proprietary.  To prevent this, we have made it clear that any
-patent must be licensed for everyone's free use or not licensed at all.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.
-
-		    GNU GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License applies to any program or other work which contains
-a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License.  The "Program", below,
-refers to any such program or work, and a "work based on the Program"
-means either the Program or any derivative work under copyright law:
-that is to say, a work containing the Program or a portion of it,
-either verbatim or with modifications and/or translated into another
-language.  (Hereinafter, translation is included without limitation in
-the term "modification".)  Each licensee is addressed as "you".
-
-Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running the Program is not restricted, and the output from the Program
-is covered only if its contents constitute a work based on the
-Program (independent of having been made by running the Program).
-Whether that is true depends on what the Program does.
-
-  1. You may copy and distribute verbatim copies of the Program's
-source code as you receive it, in any medium, provided that you
-conspicuously and appropriately publish on each copy an appropriate
-copyright notice and disclaimer of warranty; keep intact all the
-notices that refer to this License and to the absence of any warranty;
-and give any other recipients of the Program a copy of this License
-along with the Program.
-
-You may charge a fee for the physical act of transferring a copy, and
-you may at your option offer warranty protection in exchange for a fee.
-
-  2. You may modify your copy or copies of the Program or any portion
-of it, thus forming a work based on the Program, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) You must cause the modified files to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    b) You must cause any work that you distribute or publish, that in
-    whole or in part contains or is derived from the Program or any
-    part thereof, to be licensed as a whole at no charge to all third
-    parties under the terms of this License.
-
-    c) If the modified program normally reads commands interactively
-    when run, you must cause it, when started running for such
-    interactive use in the most ordinary way, to print or display an
-    announcement including an appropriate copyright notice and a
-    notice that there is no warranty (or else, saying that you provide
-    a warranty) and that users may redistribute the program under
-    these conditions, and telling the user how to view a copy of this
-    License.  (Exception: if the Program itself is interactive but
-    does not normally print such an announcement, your work based on
-    the Program is not required to print an announcement.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Program,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Program, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Program.
-
-In addition, mere aggregation of another work not based on the Program
-with the Program (or with a work based on the Program) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may copy and distribute the Program (or a work based on it,
-under Section 2) in object code or executable form under the terms of
-Sections 1 and 2 above provided that you also do one of the following:
-
-    a) Accompany it with the complete corresponding machine-readable
-    source code, which must be distributed under the terms of Sections
-    1 and 2 above on a medium customarily used for software interchange; or,
-
-    b) Accompany it with a written offer, valid for at least three
-    years, to give any third party, for a charge no more than your
-    cost of physically performing source distribution, a complete
-    machine-readable copy of the corresponding source code, to be
-    distributed under the terms of Sections 1 and 2 above on a medium
-    customarily used for software interchange; or,
-
-    c) Accompany it with the information you received as to the offer
-    to distribute corresponding source code.  (This alternative is
-    allowed only for noncommercial distribution and only if you
-    received the program in object code or executable form with such
-    an offer, in accord with Subsection b above.)
-
-The source code for a work means the preferred form of the work for
-making modifications to it.  For an executable work, complete source
-code means all the source code for all modules it contains, plus any
-associated interface definition files, plus the scripts used to
-control compilation and installation of the executable.  However, as a
-special exception, the source code distributed need not include
-anything that is normally distributed (in either source or binary
-form) with the major components (compiler, kernel, and so on) of the
-operating system on which the executable runs, unless that component
-itself accompanies the executable.
-
-If distribution of executable or object code is made by offering
-access to copy from a designated place, then offering equivalent
-access to copy the source code from the same place counts as
-distribution of the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  4. You may not copy, modify, sublicense, or distribute the Program
-except as expressly provided under this License.  Any attempt
-otherwise to copy, modify, sublicense or distribute the Program is
-void, and will automatically terminate your rights under this License.
-However, parties who have received copies, or rights, from you under
-this License will not have their licenses terminated so long as such
-parties remain in full compliance.
-
-  5. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Program or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Program (or any work based on the
-Program), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Program or works based on it.
-
-  6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the
-original licensor to copy, distribute or modify the Program subject to
-these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-
-  7. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Program at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Program by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Program.
-
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply and the section as a whole is intended to apply in other
-circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system, which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  8. If the distribution and/or use of the Program is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Program under this License
-may add an explicit geographical distribution limitation excluding
-those countries, so that distribution is permitted only in or among
-countries not thus excluded.  In such case, this License incorporates
-the limitation as if written in the body of this License.
-
-  9. The Free Software Foundation may publish revised and/or new versions
-of the General Public License from time to time.  Such new versions will
-be similar in spirit to the present version, but may differ in detail to
-address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Program
-specifies a version number of this License which applies to it and "any
-later version", you have the option of following the terms and conditions
-either of that version or of any later version published by the Free
-Software Foundation.  If the Program does not specify a version number of
-this License, you may choose any version ever published by the Free Software
-Foundation.
-
-  10. If you wish to incorporate parts of the Program into other free
-programs whose distribution conditions are different, write to the author
-to ask for permission.  For software which is copyrighted by the Free
-Software Foundation, write to the Free Software Foundation; we sometimes
-make exceptions for this.  Our decision will be guided by the two goals
-of preserving the free status of all derivatives of our free software and
-of promoting the sharing and reuse of software generally.
-
-			    NO WARRANTY
-
-  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
-FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
-OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
-OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
-TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
-PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
-REPAIR OR CORRECTION.
-
-  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-	    How to Apply These Terms to Your New Programs
-
-  If you develop a new program, and you want it to be of the greatest
-possible use to the public, the best way to achieve this is to make it
-free software which everyone can redistribute and change under these terms.
-
-  To do so, attach the following notices to the program.  It is safest
-to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least
-the "copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the program's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This program is free software; you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-
-Also add information on how to contact you by electronic and paper mail.
-
-If the program is interactive, make it output a short notice like this
-when it starts in an interactive mode:
-
-    Gnomovision version 69, Copyright (C) year name of author
-    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
-    This is free software, and you are welcome to redistribute it
-    under certain conditions; type `show c' for details.
-
-The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, the commands you use may
-be called something other than `show w' and `show c'; they could even be
-mouse-clicks or menu items--whatever suits your program.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the program, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
-  `Gnomovision' (which makes passes at compilers) written by James Hacker.
-
-  <signature of Ty Coon>, 1 April 1989
-  Ty Coon, President of Vice
-
-This General Public License does not permit incorporating your program into
-proprietary programs.  If your program is a subroutine library, you may
-consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
-Public License instead of this License.
diff --git a/fs/btrfs/INSTALL b/fs/btrfs/INSTALL
deleted file mode 100644
index 16b45a56878..00000000000
--- a/fs/btrfs/INSTALL
+++ /dev/null
@@ -1,48 +0,0 @@
-Install Instructions
-
-Btrfs puts snapshots and subvolumes into the root directory of the FS.  This
-directory can only be changed by btrfsctl right now, and normal filesystem
-operations do not work on it.  The default subvolume is called 'default',
-and you can create files and directories in mount_point/default
-
-Btrfs uses libcrc32c in the kernel for file and metadata checksums.  You need
-to compile the kernel with:
-
-CONFIG_LIBCRC32C=m
-
-libcrc32c can be static as well.  Once your kernel is setup, typing make in the
-btrfs module sources will build against the running kernel.  When the build is
-complete:
-
-modprobe libcrc32c
-insmod btrfs.ko
-
-The Btrfs utility programs require libuuid to build.  This can be found
-in the e2fsprogs sources, and is usually available as libuuid or
-e2fsprogs-devel from various distros.
-
-Building the utilities is just make ; make install.  The programs go
-into /usr/local/bin.  The commands available are:
-
-mkfs.btrfs: create a filesystem
-
-btrfsctl: control program to create snapshots and subvolumes:
-
-	mount /dev/sda2 /mnt
-	btrfsctl -s new_subvol_name /mnt
-	btrfsctl -s snapshot_of_default /mnt/default
-	btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
-	btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
-	ls /mnt
-	default snapshot_of_a_snapshot snapshot_of_new_subvol
-	new_subvol_name snapshot_of_default
-
-	Snapshots and subvolumes cannot be deleted right now, but you can
-	rm -rf all the files and directories inside them.
-
-btrfsck: do a limited check of the FS extent trees.</li>
-
-debug-tree: print all of the FS metadata in text form.  Example:
-
-	debug-tree /dev/sda2 >& big_output_file
-
-- 
cgit v1.2.3


From efaee192063a54749c56b7383803e16fe553630e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 6 Jan 2009 07:20:54 -0800
Subject: async: make the final inode deletion an asynchronous event

This makes "rm -rf" on a kernel tree (with names cached) go from
11.6 to 8.6 seconds on an ext3 filesystem.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/inode.c | 20 +++++++++++++-------
 fs/super.c | 10 ++++++++++
 2 files changed, 23 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 7a6e8c2ff7b..0013ac1af8e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
 #include <linux/mount.h>
+#include <linux/async.h>
 
 /*
  * This is needed for the following functions:
@@ -1138,16 +1139,11 @@ EXPORT_SYMBOL(remove_inode_hash);
  * I_FREEING is set so that no-one will take a new reference to the inode while
  * it is being deleted.
  */
-void generic_delete_inode(struct inode *inode)
+static void generic_delete_inode_async(void *data, async_cookie_t cookie)
 {
+	struct inode *inode = data;
 	const struct super_operations *op = inode->i_sb->s_op;
 
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-
 	security_inode_delete(inode);
 
 	if (op->delete_inode) {
@@ -1171,6 +1167,16 @@ void generic_delete_inode(struct inode *inode)
 	destroy_inode(inode);
 }
 
+void generic_delete_inode(struct inode *inode)
+{
+	list_del_init(&inode->i_list);
+	list_del_init(&inode->i_sb_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+	spin_unlock(&inode_lock);
+	async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list);
+}
+
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
diff --git a/fs/super.c b/fs/super.c
index ddba069d7a9..cb20744ec78 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,6 +38,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
+		INIT_LIST_HEAD(&s->s_async_list);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb)
 {
 	const struct super_operations *sop = sb->s_op;
 
+
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
 		fsync_super(sb);
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
+
+		/*
+		 * wait for asynchronous fs operations to finish before going further
+		 */
+		async_synchronize_full_special(&sb->s_async_list);
+
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
 		lock_kernel();
@@ -449,6 +458,7 @@ void sync_filesystems(int wait)
 		if (sb->s_flags & MS_RDONLY)
 			continue;
 		sb->s_need_sync_fs = 1;
+		async_synchronize_full_special(&sb->s_async_list);
 	}
 
 restart:
-- 
cgit v1.2.3


From 5e07878787ad07361571150230cc3a8d522ae046 Mon Sep 17 00:00:00 2001
From: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Date: Sat, 20 Dec 2008 16:57:39 -0800
Subject: debugfs: add helpers for exporting a size_t simple value
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In the same spirit as debugfs_create_*(), introduce helpers for
exporting size_t values over debugfs.

The only trick done is that the format verifier is kept at %llu
instead of %zu; otherwise type warnings would pop up:

format ‘%zu’ expects type ‘size_t’, but argument 2 has type ‘long long unsigned int’

There is no real way to fix this one--however, we can consider %llu
and %zu to be compatible if we consider that we are using the same for
validating in debugfs_create_{x,u}{8,16,32}().

Signed-off-by: Inaky Perez-Gonzalez <inaky@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/debugfs/file.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'fs')

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 159a5efd6a8..33a90120f6a 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
 
+
+static int debugfs_size_t_set(void *data, u64 val)
+{
+	*(size_t *)data = val;
+	return 0;
+}
+static int debugfs_size_t_get(void *data, u64 *val)
+{
+	*val = *(size_t *)data;
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
+			"%llu\n");	/* %llu and %zu are more or less the same */
+
+/**
+ * debugfs_create_size_t - create a debugfs file that is used to read and write a size_t value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and write
+ *         to.
+ */
+struct dentry *debugfs_create_size_t(const char *name, mode_t mode,
+				     struct dentry *parent, size_t *value)
+{
+	return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_size_t);
+
+
 static ssize_t read_file_bool(struct file *file, char __user *user_buf,
 			      size_t count, loff_t *ppos)
 {
-- 
cgit v1.2.3


From 55ef1274dddd4de387c54d110e354ffbb6cdc706 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sat, 20 Dec 2008 11:58:38 -0800
Subject: nfsd: Ensure nfsv4 calls the underlying filesystem on LOCKT

Since nfsv4 allows LOCKT without an open, but the ->lock() method is a
file method, we fake up a struct file in the nfsv4 code with just the
fields we need initialized.  But we forgot to initialize the file
operations, with the result that LOCKT never results in a call to the
filesystem's ->lock() method (if it exists).

We could just add that one more initialization.  But this hack of faking
up a struct file with only some fields initialized seems the kind of
thing that might cause more problems in the future.  We should either do
an open and get a real struct file, or make lock-testing an inode (not a
file) method.

This patch does the former.

Reported-by: Marc Eshel <eshel@almaden.ibm.com>
Tested-by: Marc Eshel <eshel@almaden.ibm.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 06b89df9221..e62d0e3df8b 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2781,6 +2781,25 @@ out:
 	return status;
 }
 
+/*
+ * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN,
+ * so we do a temporary open here just to get an open file to pass to
+ * vfs_test_lock.  (Arguably perhaps test_lock should be done with an
+ * inode operation.)
+ */
+static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+{
+	struct file *file;
+	int err;
+
+	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (err)
+		return err;
+	err = vfs_test_lock(file, lock);
+	nfsd_close(file);
+	return err;
+}
+
 /*
  * LOCKT operation
  */
@@ -2789,7 +2808,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_lockt *lockt)
 {
 	struct inode *inode;
-	struct file file;
 	struct file_lock file_lock;
 	int error;
 	__be32 status;
@@ -2847,16 +2865,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_transform_lock_offset(&file_lock);
 
-	/* vfs_test_lock uses the struct file _only_ to resolve the inode.
-	 * since LOCKT doesn't require an OPEN, and therefore a struct
-	 * file may not exist, pass vfs_test_lock a struct file with
-	 * only the dentry:inode set.
-	 */
-	memset(&file, 0, sizeof (struct file));
-	file.f_path.dentry = cstate->current_fh.fh_dentry;
-
 	status = nfs_ok;
-	error = vfs_test_lock(&file, &file_lock);
+	error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
 	if (error) {
 		status = nfserrno(error);
 		goto out;
-- 
cgit v1.2.3


From d3fe5ea7cf815c037c90b1f1464ffc1ab5e8601b Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 31 Dec 2008 16:06:04 -0500
Subject: NLM: Refactor make_socks() function

Clean up: extract common logic in NLM's make_socks() function
into a helper.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 3e5f9f07911..cf3899aec37 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -204,6 +204,19 @@ lockd(void *vrqstp)
 	return 0;
 }
 
+static int create_lockd_listener(struct svc_serv *serv, char *name,
+				 unsigned short port)
+{
+	struct svc_xprt *xprt;
+
+	xprt = svc_find_xprt(serv, name, 0, 0);
+	if (xprt == NULL)
+		return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+
+	svc_xprt_put(xprt);
+	return 0;
+}
+
 /*
  * Ensure there are active UDP and TCP listeners for lockd.
  *
@@ -217,23 +230,11 @@ lockd(void *vrqstp)
 static int make_socks(struct svc_serv *serv)
 {
 	static int warned;
-	struct svc_xprt *xprt;
 	int err = 0;
 
-	xprt = svc_find_xprt(serv, "udp", 0, 0);
-	if (!xprt)
-		err = svc_create_xprt(serv, "udp", nlm_udpport,
-				      SVC_SOCK_DEFAULTS);
-	else
-		svc_xprt_put(xprt);
-	if (err >= 0) {
-		xprt = svc_find_xprt(serv, "tcp", 0, 0);
-		if (!xprt)
-			err = svc_create_xprt(serv, "tcp", nlm_tcpport,
-					      SVC_SOCK_DEFAULTS);
-		else
-			svc_xprt_put(xprt);
-	}
+	err = create_lockd_listener(serv, "udp", nlm_udpport);
+	if (err >= 0)
+		err = create_lockd_listener(serv, "tcp", nlm_tcpport);
 	if (err >= 0) {
 		warned = 0;
 		err = 0;
-- 
cgit v1.2.3


From 0dba7c2a9ed3d4a1e58f5d94fffa9f44dbe012e6 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Wed, 31 Dec 2008 16:06:11 -0500
Subject: NLM: Clean up flow of control in make_socks() function

Clean up: Use Bruce's preferred control flow style in make_socks().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/lockd/svc.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index cf3899aec37..64f1c31b585 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -230,17 +230,23 @@ static int create_lockd_listener(struct svc_serv *serv, char *name,
 static int make_socks(struct svc_serv *serv)
 {
 	static int warned;
-	int err = 0;
+	int err;
 
 	err = create_lockd_listener(serv, "udp", nlm_udpport);
-	if (err >= 0)
-		err = create_lockd_listener(serv, "tcp", nlm_tcpport);
-	if (err >= 0) {
-		warned = 0;
-		err = 0;
-	} else if (warned++ == 0)
+	if (err < 0)
+		goto out_err;
+
+	err = create_lockd_listener(serv, "tcp", nlm_tcpport);
+	if (err < 0)
+		goto out_err;
+
+	warned = 0;
+	return 0;
+
+out_err:
+	if (warned++ == 0)
 		printk(KERN_WARNING
-		       "lockd_up: makesock failed, error=%d\n", err);
+			"lockd_up: makesock failed, error=%d\n", err);
 	return err;
 }
 
-- 
cgit v1.2.3


From f05ef8db1abe68e3f6fc272efee51bc54ce528c5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 5 Jan 2009 17:19:37 +0000
Subject: CRED: Fix NFSD regression

Fix a regression in NFSD's permission checking introduced by the credentials
patches.  There are two parts to the problem, both in nfsd_setuser():

 (1) The return value of set_groups() is -ve if in error, not 0, and should be
     checked appropriately.  0 indicates success.

 (2) The UID to use for fs accesses is in new->fsuid, not new->uid (which is
     0).  This causes CAP_DAC_OVERRIDE to always be set, rather than being
     cleared if the UID is anything other than 0 after squashing.

Reported-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 0184fe9b514..c903e04aa21 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 
 	ret = set_groups(new, gi);
 	put_group_info(gi);
-	if (!ret)
+	if (ret < 0)
 		goto error;
 
-	if (new->uid)
+	if (new->fsuid)
 		new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
 	else
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
-- 
cgit v1.2.3


From 9a8d248e2d2e9c880ac4561f27fea5dc200655bd Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 6 Jan 2009 13:37:03 -0500
Subject: nfsd: fix double-locks of directory mutex

A number of nfsd operations depend on the i_mutex to cover more code
than just the fsync, so the approach of 4c728ef583b3d8 "add a vfs_fsync
helper" doesn't work for nfsd.  Revert the parts of those patches that
touch nfsd.

Note: we can't, however, remove the logic from vfs_fsync that was needed
only for the special case of nfsd, because a vfs_fsync(NULL,...) call
can still result indirectly from a stackable filesystem that was called
by nfsd.  (Thanks to Christoph Hellwig for pointing this out.)

Reported-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 44aa92aba89..6e50aaa56ca 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -744,16 +744,44 @@ nfsd_close(struct file *filp)
 	fput(filp);
 }
 
+/*
+ * Sync a file
+ * As this calls fsync (not fdatasync) there is no need for a write_inode
+ * after it.
+ */
+static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
+			      const struct file_operations *fop)
+{
+	struct inode *inode = dp->d_inode;
+	int (*fsync) (struct file *, struct dentry *, int);
+	int err;
+
+	err = filemap_fdatawrite(inode->i_mapping);
+	if (err == 0 && fop && (fsync = fop->fsync))
+		err = fsync(filp, dp, 0);
+	if (err == 0)
+		err = filemap_fdatawait(inode->i_mapping);
+
+	return err;
+}
+
 static int
 nfsd_sync(struct file *filp)
 {
-	return vfs_fsync(filp, filp->f_path.dentry, 0);
+        int err;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
+	mutex_lock(&inode->i_mutex);
+	err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
 }
 
 int
-nfsd_sync_dir(struct dentry *dentry)
+nfsd_sync_dir(struct dentry *dp)
 {
-	return vfs_fsync(NULL, dentry, 0);
+	return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
 }
 
 /*
-- 
cgit v1.2.3


From 30fa8c0157e4591ee2227aaa0b17cd3b0da5e6cb Mon Sep 17 00:00:00 2001
From: Steve Dickson <SteveD@redhat.com>
Date: Wed, 7 Jan 2009 16:54:30 -0500
Subject: NFSD: FIDs need to take precedence over UUIDs

When determining the fsid_type in fh_compose(), the setting of the FID
via fsid= export option needs to take precedence over using the UUID
device id.

Signed-off-by: Steve Dickson <steved@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsfh.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 019a8a20184..9f1ca17293d 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -484,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 				goto retry;
 			break;
 		}
+	} else if (exp->ex_flags & NFSEXP_FSID) {
+		fsid_type = FSID_NUM;
 	} else if (exp->ex_uuid) {
 		if (fhp->fh_maxsize >= 64) {
 			if (root_export)
@@ -496,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 			else
 				fsid_type = FSID_UUID4_INUM;
 		}
-	} else if (exp->ex_flags & NFSEXP_FSID)
-		fsid_type = FSID_NUM;
-	else if (!old_valid_dev(ex_dev))
+	} else if (!old_valid_dev(ex_dev))
 		/* for newer device numbers, we must use a newer fsid format */
 		fsid_type = FSID_ENCODE_DEV;
 	else
-- 
cgit v1.2.3


From b7aeda40d3010666d2c024c80557b6aa92a1a1ad Mon Sep 17 00:00:00 2001
From: Dean Hildebrand <dhildeb@us.ibm.com>
Date: Mon, 15 Dec 2008 19:40:15 +0200
Subject: nfsd: add etoosmall to nfserrno

Signed-off-by: Dean Hildebrand <dhildeb@us.ibm.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfsproc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 5cffeca7ace..6f7f2635122 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -622,6 +622,7 @@ nfserrno (int errno)
 		{ nfserr_badname, -ESRCH },
 		{ nfserr_io, -ETXTBSY },
 		{ nfserr_notsupp, -EOPNOTSUPP },
+		{ nfserr_toosmall, -ETOOSMALL },
 	};
 	int	i;
 
-- 
cgit v1.2.3


From 0407717d8587f60003f4904bff27650cd836c00c Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Mon, 15 Dec 2008 19:40:49 +0200
Subject: nfsd: dprint each op status in nfsd4_proc_compound

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4proc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 669461e291a..9fa60a3ad48 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -946,6 +946,11 @@ encode_op:
 			nfsd4_encode_operation(resp, op);
 			status = op->status;
 		}
+
+		dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n",
+			args->ops, args->opcnt, resp->opcnt, op->opnum,
+			be32_to_cpu(status));
+
 		if (cstate->replay_owner) {
 			nfs4_put_stateowner(cstate->replay_owner);
 			cstate->replay_owner = NULL;
-- 
cgit v1.2.3


From df96fcf02a5fd2ae4e9b09e079dd6ef12d10ecd7 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Mon, 15 Dec 2008 19:41:10 +0200
Subject: nfsd: get rid of nfs4_cb_null_ops declaration

There's no use for nfs4_cb_null_ops's declaration in fs/nfsd/nfs4callback.c

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4callback.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 6d7d8c02c19..c464181b599 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -53,9 +53,6 @@
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
 
-/* declarations */
-static const struct rpc_call_ops nfs4_cb_null_ops;
-
 /* Index of predefined Linux callback client operations */
 
 enum {
-- 
cgit v1.2.3


From 4e65ebf08951326709817e654c149d0a94982e01 Mon Sep 17 00:00:00 2001
From: Marc Eshel <eshel@almaden.ibm.com>
Date: Mon, 15 Dec 2008 19:41:31 +0200
Subject: nfsd: delete wrong file comment from nfsd/nfs4xdr.c

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4xdr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index afcdf4b7684..f65953be39c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfs/nfs4xdr.c
- *
  *  Server-side XDR for NFSv4
  *
  *  Copyright (c) 2002 The Regents of the University of Michigan.
-- 
cgit v1.2.3


From 87df4de8073f922a1f643b9fa6ba0412d5529ecf Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Mon, 15 Dec 2008 19:42:03 +0200
Subject: nfsd: last_byte_offset

refactor the nfs4 server lock code to use last_byte_offset
to compute the last byte covered by the lock.  Check for overflow
so that the last byte is set to NFS4_MAX_UINT64 if offset + len
wraps around.

Also, use NFS4_MAX_UINT64 for ~(u64)0 where appropriate.

Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/nfs4state.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e62d0e3df8b..88db7d3ec12 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2416,6 +2416,26 @@ out:
 #define LOCK_HASH_SIZE             (1 << LOCK_HASH_BITS)
 #define LOCK_HASH_MASK             (LOCK_HASH_SIZE - 1)
 
+static inline u64
+end_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	end = start + len;
+	return end >= start ? end: NFS4_MAX_UINT64;
+}
+
+/* last octet in a range */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+	u64 end;
+
+	BUG_ON(!len);
+	end = start + len;
+	return end > start ? end - 1: NFS4_MAX_UINT64;
+}
+
 #define lockownerid_hashval(id) \
         ((id) & LOCK_HASH_MASK)
 
@@ -2519,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 		deny->ld_clientid.cl_id = 0;
 	}
 	deny->ld_start = fl->fl_start;
-	deny->ld_length = ~(u64)0;
-	if (fl->fl_end != ~(u64)0)
+	deny->ld_length = NFS4_MAX_UINT64;
+	if (fl->fl_end != NFS4_MAX_UINT64)
 		deny->ld_length = fl->fl_end - fl->fl_start + 1;        
 	deny->ld_type = NFS4_READ_LT;
 	if (fl->fl_type != F_RDLCK)
@@ -2617,7 +2637,7 @@ out:
 static int
 check_lock_length(u64 offset, u64 length)
 {
-	return ((length == 0)  || ((length != ~(u64)0) &&
+	return ((length == 0)  || ((length != NFS4_MAX_UINT64) &&
 	     LOFF_OVERFLOW(offset, length)));
 }
 
@@ -2737,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lock->lk_offset;
-	if ((lock->lk_length == ~(u64)0) || 
-			LOFF_OVERFLOW(lock->lk_offset, lock->lk_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = lock->lk_offset + lock->lk_length - 1;
+	file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
 	nfs4_transform_lock_offset(&file_lock);
 
 	/*
@@ -2858,10 +2874,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lockt->lt_offset;
-	if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1;
+	file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
 
 	nfs4_transform_lock_offset(&file_lock);
 
@@ -2917,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 	file_lock.fl_start = locku->lu_offset;
 
-	if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
-		file_lock.fl_end = ~(u64)0;
-	else
-		file_lock.fl_end = locku->lu_offset + locku->lu_length - 1;
+	file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length);
 	nfs4_transform_lock_offset(&file_lock);
 
 	/*
-- 
cgit v1.2.3


From 755efdc3c4d3b42d5ffcef0f4d6e5b37ecd3bf21 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Jan 2009 19:56:59 -0500
Subject: Btrfs: Drop the hardware crc32c asm code

This is already in the arch specific directories in mainline and
shouldn't be copied into btrfs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/crc32c.h | 97 ++-----------------------------------------------------
 1 file changed, 3 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index 1eaf11d334f..6e1b3de3670 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -18,103 +18,12 @@
 
 #ifndef __BTRFS_CRC32C__
 #define __BTRFS_CRC32C__
-#include <asm/byteorder.h>
 #include <linux/crc32c.h>
-#include <linux/version.h>
 
-/* #define CONFIG_BTRFS_HW_SUM 1 */
-
-#ifdef CONFIG_BTRFS_HW_SUM
-#ifdef CONFIG_X86
 /*
- * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal.
- * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
- * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
- * http://www.intel.com/products/processor/manuals/
- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
- * Volume 2A: Instruction Set Reference, A-M
- */
-
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-
-#define X86_FEATURE_XMM4_2     (4*32+20) /* Streaming SIMD Extensions-4.2 */
-#define cpu_has_xmm4_2         boot_cpu_has(X86_FEATURE_XMM4_2)
-
-#ifdef CONFIG_X86_64
-#define REX_PRE	"0x48, "
-#define SCALE_F	8
-#else
-#define REX_PRE
-#define SCALE_F	4
-#endif
-
-static inline u32 btrfs_crc32c_le_hw_byte(u32 crc, unsigned char const *data,
-				   size_t length)
-{
-	while (length--) {
-		__asm__ __volatile__(
-			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
-			:"=S"(crc)
-			:"0"(crc), "c"(*data)
-		);
-		data++;
-	}
-
-	return crc;
-}
-
-static inline u32 __pure btrfs_crc32c_le_hw(u32 crc, unsigned char const *p,
-				     size_t len)
-{
-	unsigned int iquotient = len / SCALE_F;
-	unsigned int iremainder = len % SCALE_F;
-#ifdef CONFIG_X86_64
-	u64 *ptmp = (u64 *)p;
-#else
-	u32 *ptmp = (u32 *)p;
-#endif
-
-	while (iquotient--) {
-		__asm__ __volatile__(
-			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
-			:"=S"(crc)
-			:"0"(crc), "c"(*ptmp)
-		);
-		ptmp++;
-	}
-
-	if (iremainder)
-		crc = btrfs_crc32c_le_hw_byte(crc, (unsigned char *)ptmp,
-					      iremainder);
-
-	return crc;
-}
-#endif /* CONFIG_BTRFS_HW_SUM */
-
-static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
-				 size_t len)
-{
-#ifdef CONFIG_BTRFS_HW_SUM
-	if (cpu_has_xmm4_2)
-		return btrfs_crc32c_le_hw(crc, address, len);
-#endif
-	return crc32c_le(crc, address, len);
-}
-
-#else
-
-#define __btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
-
-#endif /* CONFIG_X86 */
-
-/**
- * implementation of crc32c_le() changed in linux-2.6.23,
- * has of v0.13 btrfs-progs is using the latest version.
- * We must workaround older implementations of crc32c_le()
- * found on older kernel versions.
+ * this file used to do more for selecting the HW version of crc32c,
+ * perhaps it will one day again soon.
  */
-#define btrfs_crc32c(seed, data, length) \
-	__btrfs_crc32c(seed, (unsigned char const *)data, length)
+#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
 #endif
 
-- 
cgit v1.2.3


From 0e8f989a253b1bf85ea1c8d7987d67c054f4af91 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:46 +0000
Subject: NOMMU: Fix cleanup handling in ramfs_nommu_get_unmapped_area()

Fix cleanup handling in ramfs_nommu_get_unmapped_area() by only freeing the
number of pages that find_get_pages() said it had returned (nr) rather than
attempting to free the number of pages we asked for (lpages) - thus avoiding
the situation whereby put_page() may be handed NULL pointers if
find_get_pages() returned fewer pages than were requested.

Also avoid a warning about nr being uninitialised and the need for an
if-statement in the cleanup path by using appropriate gotos.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/ramfs/file-nommu.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 76acdbc3461..b9b567a2837 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	ret = -ENOMEM;
 	pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL);
 	if (!pages)
-		goto out;
+		goto out_free;
 
 	nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
 	if (nr != lpages)
-		goto out; /* leave if some pages were missing */
+		goto out_free_pages; /* leave if some pages were missing */
 
 	/* check the pages for physical adjacency */
 	ptr = pages;
@@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
 	page++;
 	for (loop = lpages; loop > 1; loop--)
 		if (*ptr++ != page++)
-			goto out;
+			goto out_free_pages;
 
 	/* okay - all conditions fulfilled */
 	ret = (unsigned long) page_address(pages[0]);
 
- out:
-	if (pages) {
-		ptr = pages;
-		for (loop = lpages; loop > 0; loop--)
-			put_page(*ptr++);
-		kfree(pages);
-	}
-
+out_free_pages:
+	ptr = pages;
+	for (loop = nr; loop > 0; loop--)
+		put_page(*ptr++);
+out_free:
+	kfree(pages);
+out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From 8feae13110d60cc6287afabc2887366b0eb226c2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Make VMAs per MM as for MMU-mode linux

Make VMAs per mm_struct as for MMU-mode linux.  This solves two problems:

 (1) In SYSV SHM where nattch for a segment does not reflect the number of
     shmat's (and forks) done.

 (2) In mmap() where the VMA's vm_mm is set to point to the parent mm by an
     exec'ing process when VM_EXECUTABLE is specified, regardless of the fact
     that a VMA might be shared and already have its vm_mm assigned to another
     process or a dead process.

A new struct (vm_region) is introduced to track a mapped region and to remember
the circumstances under which it may be shared and the vm_list_struct structure
is discarded as it's no longer required.

This patch makes the following additional changes:

 (1) Regions are now allocated with alloc_pages() rather than kmalloc() and
     with no recourse to __GFP_COMP, so the pages are not composite.  Instead,
     each page has a reference on it held by the region.  Anything else that is
     interested in such a page will have to get a reference on it to retain it.
     When the pages are released due to unmapping, each page is passed to
     put_page() and will be freed when the page usage count reaches zero.

 (2) Excess pages are trimmed after an allocation as the allocation must be
     made as a power-of-2 quantity of pages.

 (3) VMAs are added to the parent MM's R/B tree and mmap lists.  As an MM may
     end up with overlapping VMAs within the tree, the VMA struct address is
     appended to the sort key.

 (4) Non-anonymous VMAs are now added to the backing inode's prio list.

 (5) Holes may be punched in anonymous VMAs with munmap(), releasing parts of
     the backing region.  The VMA and region structs will be split if
     necessary.

 (6) sys_shmdt() only releases one attachment to a SYSV IPC shared memory
     segment instead of all the attachments at that address.  Multiple
     shmat()'s return the same address under NOMMU-mode instead of different
     virtual addresses as under MMU-mode.

 (7) Core dumping for ELF-FDPIC requires fewer exceptions for NOMMU-mode.

 (8) /proc/maps is now the global list of mapped regions, and may list bits
     that aren't actually mapped anywhere.

 (9) /proc/meminfo gains a line (tagged "MmapCopy") that indicates the amount
     of RAM currently allocated by mmap to hold mappable regions that can't be
     mapped directly.  These are copies of the backing device or file if not
     anonymous.

These changes make NOMMU mode more similar to MMU mode.  The downside is that
NOMMU mode requires some extra memory to track things over NOMMU without this
patch (VMAs are no longer shared, and there are now region structs).

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 fs/binfmt_elf_fdpic.c |  27 ++-----------
 fs/proc/internal.h    |   2 -
 fs/proc/meminfo.c     |   6 +++
 fs/proc/nommu.c       |  71 +++++++++++++++------------------
 fs/proc/task_nommu.c  | 108 +++++++++++++++++++++++++++++++++++---------------
 5 files changed, 116 insertions(+), 98 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index aa5b43205e3..22baf1b1349 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1567,11 +1567,9 @@ end_coredump:
 static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 			   unsigned long *limit, unsigned long mm_flags)
 {
-	struct vm_list_struct *vml;
-
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next) {
-	struct vm_area_struct *vma = vml->vma;
+	struct vm_area_struct *vma;
 
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		if (!maydump(vma, mm_flags))
 			continue;
 
@@ -1617,9 +1615,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	elf_fpxregset_t *xfpu = NULL;
 #endif
 	int thread_status_size = 0;
-#ifndef CONFIG_MMU
-	struct vm_list_struct *vml;
-#endif
 	elf_addr_t *auxv;
 	unsigned long mm_flags;
 
@@ -1685,13 +1680,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	fill_prstatus(prstatus, current, signr);
 	elf_core_copy_regs(&prstatus->pr_reg, regs);
 
-#ifdef CONFIG_MMU
 	segs = current->mm->map_count;
-#else
-	segs = 0;
-	for (vml = current->mm->context.vmlist; vml; vml = vml->next)
-	    segs++;
-#endif
 #ifdef ELF_CORE_EXTRA_PHDRS
 	segs += ELF_CORE_EXTRA_PHDRS;
 #endif
@@ -1766,20 +1755,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
 	mm_flags = current->mm->flags;
 
 	/* write program headers for segments dump */
-	for (
-#ifdef CONFIG_MMU
-		vma = current->mm->mmap; vma; vma = vma->vm_next
-#else
-			vml = current->mm->context.vmlist; vml; vml = vml->next
-#endif
-	     ) {
+	for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
 		struct elf_phdr phdr;
 		size_t sz;
 
-#ifndef CONFIG_MMU
-		vma = vml->vma;
-#endif
-
 		sz = vma->vm_end - vma->vm_start;
 
 		phdr.p_type = PT_LOAD;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3e8aeb8b61c..cd53ff83849 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -41,8 +41,6 @@ do {						\
 	(vmi)->used = 0;			\
 	(vmi)->largest_chunk = 0;		\
 } while(0)
-
-extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
 #endif
 
 extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b1675c4e66d..43d23948384 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -73,6 +73,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"HighFree:       %8lu kB\n"
 		"LowTotal:       %8lu kB\n"
 		"LowFree:        %8lu kB\n"
+#endif
+#ifndef CONFIG_MMU
+		"MmapCopy:       %8lu kB\n"
 #endif
 		"SwapTotal:      %8lu kB\n"
 		"SwapFree:       %8lu kB\n"
@@ -115,6 +118,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freehigh),
 		K(i.totalram-i.totalhigh),
 		K(i.freeram-i.freehigh),
+#endif
+#ifndef CONFIG_MMU
+		K((unsigned long) atomic_read(&mmap_pages_allocated)),
 #endif
 		K(i.totalswap),
 		K(i.freeswap),
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 3f87d263294..b446d7ad0b0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -33,33 +33,33 @@
 #include "internal.h"
 
 /*
- * display a single VMA to a sequenced file
+ * display a single region to a sequenced file
  */
-int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 {
 	unsigned long ino = 0;
 	struct file *file;
 	dev_t dev = 0;
 	int flags, len;
 
-	flags = vma->vm_flags;
-	file = vma->vm_file;
+	flags = region->vm_flags;
+	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		struct inode *inode = region->vm_file->f_path.dentry->d_inode;
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
 
 	seq_printf(m,
 		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
-		   vma->vm_start,
-		   vma->vm_end,
+		   region->vm_start,
+		   region->vm_end,
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
+		   ((loff_t)region->vm_pgoff) << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
@@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 }
 
 /*
- * display a list of all the VMAs the kernel knows about
+ * display a list of all the REGIONs the kernel knows about
  * - nommu kernals have a single flat list
  */
-static int nommu_vma_list_show(struct seq_file *m, void *v)
+static int nommu_region_list_show(struct seq_file *m, void *_p)
 {
-	struct vm_area_struct *vma;
+	struct rb_node *p = _p;
 
-	vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb);
-	return nommu_vma_show(m, vma);
+	return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb));
 }
 
-static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos)
+static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos)
 {
-	struct rb_node *_rb;
+	struct rb_node *p;
 	loff_t pos = *_pos;
-	void *next = NULL;
 
-	down_read(&nommu_vma_sem);
+	down_read(&nommu_region_sem);
 
-	for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) {
-		if (pos == 0) {
-			next = _rb;
-			break;
-		}
-		pos--;
-	}
-
-	return next;
+	for (p = rb_first(&nommu_region_tree); p; p = rb_next(p))
+		if (pos-- == 0)
+			return p;
+	return NULL;
 }
 
-static void nommu_vma_list_stop(struct seq_file *m, void *v)
+static void nommu_region_list_stop(struct seq_file *m, void *v)
 {
-	up_read(&nommu_vma_sem);
+	up_read(&nommu_region_sem);
 }
 
-static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos)
+static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
 	return rb_next((struct rb_node *) v);
 }
 
-static const struct seq_operations proc_nommu_vma_list_seqop = {
-	.start	= nommu_vma_list_start,
-	.next	= nommu_vma_list_next,
-	.stop	= nommu_vma_list_stop,
-	.show	= nommu_vma_list_show
+static struct seq_operations proc_nommu_region_list_seqop = {
+	.start	= nommu_region_list_start,
+	.next	= nommu_region_list_next,
+	.stop	= nommu_region_list_stop,
+	.show	= nommu_region_list_show
 };
 
-static int proc_nommu_vma_list_open(struct inode *inode, struct file *file)
+static int proc_nommu_region_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &proc_nommu_vma_list_seqop);
+	return seq_open(file, &proc_nommu_region_list_seqop);
 }
 
-static const struct file_operations proc_nommu_vma_list_operations = {
-	.open    = proc_nommu_vma_list_open,
+static const struct file_operations proc_nommu_region_list_operations = {
+	.open    = proc_nommu_region_list_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release,
@@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = {
 
 static int __init proc_nommu_init(void)
 {
-	proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations);
+	proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations);
 	return 0;
 }
 
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index d4a8be32b90..ca4a48d0d31 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -15,25 +15,25 @@
  */
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	struct vm_list_struct *vml;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long bytes = 0, sbytes = 0, slack = 0;
         
 	down_read(&mm->mmap_sem);
-	for (vml = mm->context.vmlist; vml; vml = vml->next) {
-		if (!vml->vma)
-			continue;
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
 
-		bytes += kobjsize(vml);
+		bytes += kobjsize(vma);
 		if (atomic_read(&mm->mm_count) > 1 ||
-		    atomic_read(&vml->vma->vm_usage) > 1
-		    ) {
-			sbytes += kobjsize((void *) vml->vma->vm_start);
-			sbytes += kobjsize(vml->vma);
+		    vma->vm_region ||
+		    vma->vm_flags & VM_MAYSHARE) {
+			sbytes += kobjsize((void *) vma->vm_start);
+			if (vma->vm_region)
+				sbytes += kobjsize(vma->vm_region);
 		} else {
-			bytes += kobjsize((void *) vml->vma->vm_start);
-			bytes += kobjsize(vml->vma);
-			slack += kobjsize((void *) vml->vma->vm_start) -
-				(vml->vma->vm_end - vml->vma->vm_start);
+			bytes += kobjsize((void *) vma->vm_start);
+			slack += kobjsize((void *) vma->vm_start) -
+				(vma->vm_end - vma->vm_start);
 		}
 	}
 
@@ -70,13 +70,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 
 unsigned long task_vsize(struct mm_struct *mm)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	unsigned long vsize = 0;
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		if (tbp->vma)
-			vsize += kobjsize((void *) tbp->vma->vm_start);
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		vsize += vma->vm_region->vm_end - vma->vm_region->vm_start;
 	}
 	up_read(&mm->mmap_sem);
 	return vsize;
@@ -85,16 +86,15 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	struct vm_list_struct *tbp;
+	struct vm_area_struct *vma;
+	struct rb_node *p;
 	int size = kobjsize(mm);
 
 	down_read(&mm->mmap_sem);
-	for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) {
-		size += kobjsize(tbp);
-		if (tbp->vma) {
-			size += kobjsize(tbp->vma);
-			size += kobjsize((void *) tbp->vma->vm_start);
-		}
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
+		vma = rb_entry(p, struct vm_area_struct, vm_rb);
+		size += kobjsize(vma);
+		size += kobjsize((void *) vma->vm_start);
 	}
 
 	size += (*text = mm->end_code - mm->start_code);
@@ -104,21 +104,63 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	return size;
 }
 
+/*
+ * display a single VMA to a sequenced file
+ */
+static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
+{
+	unsigned long ino = 0;
+	struct file *file;
+	dev_t dev = 0;
+	int flags, len;
+
+	flags = vma->vm_flags;
+	file = vma->vm_file;
+
+	if (file) {
+		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		dev = inode->i_sb->s_dev;
+		ino = inode->i_ino;
+	}
+
+	seq_printf(m,
+		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   vma->vm_start,
+		   vma->vm_end,
+		   flags & VM_READ ? 'r' : '-',
+		   flags & VM_WRITE ? 'w' : '-',
+		   flags & VM_EXEC ? 'x' : '-',
+		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
+		   vma->vm_pgoff << PAGE_SHIFT,
+		   MAJOR(dev), MINOR(dev), ino, &len);
+
+	if (file) {
+		len = 25 + sizeof(void *) * 6 - len;
+		if (len < 1)
+			len = 1;
+		seq_printf(m, "%*c", len, ' ');
+		seq_path(m, &file->f_path, "");
+	}
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
 /*
  * display mapping lines for a particular process's /proc/pid/maps
  */
-static int show_map(struct seq_file *m, void *_vml)
+static int show_map(struct seq_file *m, void *_p)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
-	return nommu_vma_show(m, vml->vma);
+	return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
 }
 
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_maps_private *priv = m->private;
-	struct vm_list_struct *vml;
 	struct mm_struct *mm;
+	struct rb_node *p;
 	loff_t n = *pos;
 
 	/* pin the task and mm whilst we play with them */
@@ -134,9 +176,9 @@ static void *m_start(struct seq_file *m, loff_t *pos)
 	}
 
 	/* start from the Nth VMA */
-	for (vml = mm->context.vmlist; vml; vml = vml->next)
+	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
 		if (n-- == 0)
-			return vml;
+			return p;
 	return NULL;
 }
 
@@ -152,12 +194,12 @@ static void m_stop(struct seq_file *m, void *_vml)
 	}
 }
 
-static void *m_next(struct seq_file *m, void *_vml, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
 {
-	struct vm_list_struct *vml = _vml;
+	struct rb_node *p = _p;
 
 	(*pos)++;
-	return vml ? vml->next : NULL;
+	return p ? rb_next(p) : NULL;
 }
 
 static const struct seq_operations proc_pid_maps_ops = {
-- 
cgit v1.2.3


From 38f714795b7cf4103c54152200ca66b524f8ed6e Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: NOMMU: Improve procfs output using per-MM VMAs

Improve procfs output using per-MM VMAs for process memory accounting.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 fs/proc/task_nommu.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index ca4a48d0d31..343ea1216bc 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -16,24 +16,31 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
+	struct vm_region *region;
 	struct rb_node *p;
-	unsigned long bytes = 0, sbytes = 0, slack = 0;
+	unsigned long bytes = 0, sbytes = 0, slack = 0, size;
         
 	down_read(&mm->mmap_sem);
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
 		vma = rb_entry(p, struct vm_area_struct, vm_rb);
 
 		bytes += kobjsize(vma);
+
+		region = vma->vm_region;
+		if (region) {
+			size = kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		} else {
+			size = vma->vm_end - vma->vm_start;
+		}
+
 		if (atomic_read(&mm->mm_count) > 1 ||
-		    vma->vm_region ||
 		    vma->vm_flags & VM_MAYSHARE) {
-			sbytes += kobjsize((void *) vma->vm_start);
-			if (vma->vm_region)
-				sbytes += kobjsize(vma->vm_region);
+			sbytes += size;
 		} else {
-			bytes += kobjsize((void *) vma->vm_start);
-			slack += kobjsize((void *) vma->vm_start) -
-				(vma->vm_end - vma->vm_start);
+			bytes += size;
+			if (region)
+				slack = region->vm_end - vma->vm_end;
 		}
 	}
 
@@ -77,7 +84,7 @@ unsigned long task_vsize(struct mm_struct *mm)
 	down_read(&mm->mmap_sem);
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
 		vma = rb_entry(p, struct vm_area_struct, vm_rb);
-		vsize += vma->vm_region->vm_end - vma->vm_region->vm_start;
+		vsize += vma->vm_end - vma->vm_start;
 	}
 	up_read(&mm->mmap_sem);
 	return vsize;
@@ -87,6 +94,7 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
 	struct vm_area_struct *vma;
+	struct vm_region *region;
 	struct rb_node *p;
 	int size = kobjsize(mm);
 
@@ -94,7 +102,11 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
 	for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) {
 		vma = rb_entry(p, struct vm_area_struct, vm_rb);
 		size += kobjsize(vma);
-		size += kobjsize((void *) vma->vm_start);
+		region = vma->vm_region;
+		if (region) {
+			size += kobjsize(region);
+			size += region->vm_end - region->vm_start;
+		}
 	}
 
 	size += (*text = mm->end_code - mm->start_code);
-- 
cgit v1.2.3


From f4bbf51050a1e1dd485e9cd89eef4619a7453d71 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: FDPIC: Don't attempt to expand the userspace stack to fill the space
 allocated

Stop the ELF-FDPIC binfmt from attempting to expand the userspace stack and brk
segments to fill the space actually allocated for it.  The space allocated may
be rounded up by mmap(), and may be wasted.

However, finding out how much space we actually obtained uses the contentious
kobjsize() function which we'd like to get rid of as it doesn't necessarily
work for all slab allocators.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 fs/binfmt_elf_fdpic.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 22baf1b1349..f3e72c5c19f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	struct elf_fdpic_params exec_params, interp_params;
 	struct elf_phdr *phdr;
 	unsigned long stack_size, entryaddr;
-#ifndef CONFIG_MMU
-	unsigned long fullsize;
-#endif
 #ifdef ELF_FDPIC_PLAT_INIT
 	unsigned long dynaddr;
 #endif
@@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 		goto error_kill;
 	}
 
-	/* expand the stack mapping to use up the entire allocation granule */
-	fullsize = kobjsize((char *) current->mm->start_brk);
-	if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
-				    fullsize, 0, 0)))
-		stack_size = fullsize;
 	up_write(&current->mm->mmap_sem);
 
 	current->mm->brk = current->mm->start_brk;
-- 
cgit v1.2.3


From 0f3e442a403a344a5d0a49af9ecd7632b7e7343a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 8 Jan 2009 12:04:47 +0000
Subject: FLAT: Don't attempt to expand the userspace stack to fill the space
 allocated

Stop the FLAT binfmt from attempting to expand the userspace stack and brk
segments to fill the space actually allocated for it.  The space allocated may
be rounded up by mmap(), and may be wasted.

However, finding out how much space we actually obtained uses the contentious
kobjsize() function which we'd like to get rid of as it doesn't necessarily
work for all slab allocators.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Mike Frysinger <vapier.adi@gmail.com>
Acked-by: Paul Mundt <lethal@linux-sh.org>
---
 fs/binfmt_flat.c | 34 +++++++++++-----------------------
 1 file changed, 11 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7bbd5c6b372..5cebf0b3779 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 	unsigned long textpos = 0, datapos = 0, result;
 	unsigned long realdatastart = 0;
 	unsigned long text_len, data_len, bss_len, stack_len, flags;
-	unsigned long len, reallen, memp = 0;
-	unsigned long extra, rlim;
+	unsigned long len, memp = 0;
+	unsigned long memp_size, extra, rlim;
 	unsigned long *reloc = 0, *rp;
 	struct inode *inode;
 	int i, rev, relocs = 0;
@@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm,
 		}
 
 		len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
 		down_write(&current->mm->mmap_sem);
 		realdatastart = do_mmap(0, 0, len,
 			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
-		/* Remap to use all availabe slack region space */
-		if (realdatastart && (realdatastart < (unsigned long)-4096)) {
-			reallen = kobjsize((void *)realdatastart);
-			if (reallen > len) {
-				realdatastart = do_mremap(realdatastart, len,
-					reallen, MREMAP_FIXED, realdatastart);
-			}
-		}
 		up_write(&current->mm->mmap_sem);
 
 		if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) {
@@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 		reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
 		memp = realdatastart;
-
+		memp_size = len;
 	} else {
 
 		len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+		len = PAGE_ALIGN(len);
 		down_write(&current->mm->mmap_sem);
 		textpos = do_mmap(0, 0, len,
 			PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
-		/* Remap to use all availabe slack region space */
-		if (textpos && (textpos < (unsigned long) -4096)) {
-			reallen = kobjsize((void *)textpos);
-			if (reallen > len) {
-				textpos = do_mremap(textpos, len, reallen,
-					MREMAP_FIXED, textpos);
-			}
-		}
 		up_write(&current->mm->mmap_sem);
 
 		if (!textpos  || textpos >= (unsigned long) -4096) {
@@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 		reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) +
 				MAX_SHARED_LIBS * sizeof(unsigned long));
 		memp = textpos;
-
+		memp_size = len;
 #ifdef CONFIG_BINFMT_ZFLAT
 		/*
 		 * load it all in and treat it like a RAM load from now on
@@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm,
 		 * set up the brk stuff, uses any slack left in data/bss/stack
 		 * allocation.  We put the brk after the bss (between the bss
 		 * and stack) like other platforms.
+		 * Userspace code relies on the stack pointer starting out at
+		 * an address right at the end of a page.
 		 */
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
-		current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
+		current->mm->context.end_brk = memp + memp_size - stack_len;
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
@@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 	/* zero the BSS,  BRK and stack areas */
 	memset((void*)(datapos + data_len), 0, bss_len + 
-			(memp + kobjsize((void *) memp) - stack_len -	/* end brk */
-			libinfo->lib_list[id].start_brk) +		/* start brk */
+			(memp + memp_size - stack_len -		/* end brk */
+			libinfo->lib_list[id].start_brk) +	/* start brk */
 			stack_len);
 
 	return 0;
-- 
cgit v1.2.3


From 96777fe7b042e5a5d0fe5fb861fcd6cd80ef9634 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Thu, 8 Jan 2009 09:46:31 -0600
Subject: async: Don't call async_synchronize_full_special() while holding
 sb_lock

sync_filesystems() shouldn't be calling async_synchronize_full_special
while holding a spinlock.  The second while loop in that function is the
right place for this anyway.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Reported-by: Grissiom <chaos.proton@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/super.c b/fs/super.c
index cb20744ec78..7d67387496c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -458,7 +458,6 @@ void sync_filesystems(int wait)
 		if (sb->s_flags & MS_RDONLY)
 			continue;
 		sb->s_need_sync_fs = 1;
-		async_synchronize_full_special(&sb->s_async_list);
 	}
 
 restart:
@@ -471,6 +470,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
+		async_synchronize_full_special(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
-- 
cgit v1.2.3


From 22d613d13445de9dea6edc3289c304237eb191f6 Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Wed, 7 Jan 2009 18:07:18 -0800
Subject: ext2: fix ext2_splice_branch() comments

There is no argument named @chain in ext2_splice_branch, remove references
to it.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 02b39a5deb7..23fff2f8778 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode,
  * ext2_splice_branch - splice the allocated branch onto inode.
  * @inode: owner
  * @block: (logical) number of block we are adding
- * @chain: chain of indirect blocks (with a missing link - see
- *	ext2_alloc_branch)
  * @where: location of missing link
  * @num:   number of indirect blocks we are adding
  * @blks:  number of direct blocks we are adding
-- 
cgit v1.2.3


From 18a82eb9f980b5e02cea651e4ecda26265d98933 Mon Sep 17 00:00:00 2001
From: Pekka J Enberg <penberg@cs.helsinki.fi>
Date: Wed, 7 Jan 2009 18:07:19 -0800
Subject: ext2: allocate ->s_blockgroup_lock separately

As spotted by kmemtrace, struct ext2_sb_info is 17024 bytes on 64-bit
which makes it a very bad fit for SLAB allocators.  The culprit of the
wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when
NR_CPUS >= 32.

To fix that, allocate ->s_blockgroup_lock, which fits nicely in an order 2
page in the worst case, separately.  This shrinks down struct ext2_sb_info
enough to fit a 1 KB slab cache so now we allocate 16 KB + 1 KB instead of
32 KB, saving 15 KB of memory.

Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/super.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 647cd888ac8..da8bdeaa2e6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb)
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 
 	return;
@@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
 
@@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		printk ("EXT2-fs: not enough memory\n");
 		goto failed_mount;
 	}
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 	sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL);
 	if (!sbi->s_debts) {
 		printk ("EXT2-fs: not enough memory\n");
-- 
cgit v1.2.3


From 0e090f1e05a563cc9acdda442767176bf1616001 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 7 Jan 2009 18:07:20 -0800
Subject: ext2: don't inherit inappropriate inode flags from parent

At present BTREE/INDEX is the only flag that new ext2 inodes do NOT
inherit from their parent.  In addition prevent the flags DIRTY, ECOMPR,
INDEX, IMAGIC and TOPDIR from being inherited.  List inheritable flags
explicitly to prevent future flags from accidentally being inherited.

This fixes the TOPDIR flag inheritance bug reported at
http://bugzilla.kernel.org/show_bug.cgi?id=9866.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index c454d5db28a..b5598e1393d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,7 +565,7 @@ got:
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	memset(ei->i_data, 0, sizeof(ei->i_data));
-	ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL;
+	ei->i_flags = EXT2_I(dir)->i_flags & EXT2_FL_INHERITED;
 	if (S_ISLNK(mode))
 		ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
 	/* dirsync is only applied to directories */
-- 
cgit v1.2.3


From ef8b646183868b2d042fa6cde0eef2a31263ff85 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 7 Jan 2009 18:07:21 -0800
Subject: ext2: tighten restrictions on inode flags

At the moment there are few restrictions on which flags may be set on
which inodes.  Specifically, DIRSYNC may only be set on directories, and
IMMUTABLE and APPEND may not be set on links.  Tighten that to disallow
TOPDIR being set on non-directories, and to allow only NODUMP and NOATIME
to be set on inodes that are neither regular files nor directories.

Introduce a flags masking function which masks flags based on mode, and
use it during inode creation and when flags are set via the ioctl, to
facilitate future consistency.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/ialloc.c | 8 ++------
 fs/ext2/ioctl.c  | 3 +--
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index b5598e1393d..66321a877e7 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -565,12 +565,8 @@ got:
 	inode->i_blocks = 0;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
 	memset(ei->i_data, 0, sizeof(ei->i_data));
-	ei->i_flags = EXT2_I(dir)->i_flags & EXT2_FL_INHERITED;
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL);
-	/* dirsync is only applied to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT2_DIRSYNC_FL;
+	ei->i_flags =
+		ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
 	ei->i_faddr = 0;
 	ei->i_frag_no = 0;
 	ei->i_frag_size = 0;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index de876fa793e..7cb4badef92 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			goto setflags_out;
 		}
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT2_DIRSYNC_FL;
+		flags = ext2_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
 		/* Is it quota file? Do not allow user to mess with it */
-- 
cgit v1.2.3


From f420d4dc4272fd223986762df2ad06056ddebada Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 7 Jan 2009 18:07:24 -0800
Subject: jbd: improve fsync batching

There is a flaw with the way jbd handles fsync batching.  If we fsync() a
file and we were not the last person to run fsync() on this fs then we
automatically sleep for 1 jiffy in order to wait for new writers to join
into the transaction before forcing the commit.  The problem with this is
that with really fast storage (e.g. a Clariion) the time it takes to commit
a transaction to disk is way faster than 1 jiffy in most cases, so
sleeping means waiting longer with nothing to do than if we just committed
the transaction and kept going.  Ric Wheeler noticed this when using
fs_mark with more than 1 thread: the throughput would plummet as he added
more threads.

This patch attempts to fix this problem by recording the average time in
nanoseconds that it takes to commit a transaction to disk, and what time
we started the transaction.  If we run an fsync() and we have been running
for less time than it takes to commit the transaction to disk, we sleep
for the delta amount of time and then commit to disk.  We achieve
sub-jiffy sleeping using schedule_hrtimeout.  This means that the wait
time is auto-tuned to the speed of the underlying disk, instead of having
this static timeout.  I weighted the average according to somebody's
comments (Andreas Dilger I think) in order to help normalize random
outliers where we take way longer or way less time to commit than the
average.  I also have a min() check in there to make sure we don't sleep
longer than a jiffy in case our storage is super slow; this was requested
by Andrew.

I unfortunately do not have access to a Clariion, so I had to use a
ramdisk to represent a super fast array.  I tested with a SATA drive with
barrier=1 to make sure there was no regression with local disks, I tested
with a 4 way multipathed Apple Xserve RAID array and of course the
ramdisk.  I ran the following command

fs_mark -d /mnt/ext3-test -s 4096 -n 2000 -D 64 -t $i

where $i was 2, 4, 8, 16 and 32.  I mkfs'ed the fs each time.  Here are my
results

type	threads		with patch	without patch
sata	2		24.6		26.3
sata	4		49.2		48.1
sata	8		70.1		67.0
sata	16		104.0		94.1
sata	32		153.6		142.7

xserve	2		246.4		222.0
xserve	4		480.0		440.8
xserve	8		829.5		730.8
xserve	16		1172.7		1026.9
xserve	32		1816.3		1650.5

ramdisk	2		2538.3		1745.6
ramdisk	4		2942.3		661.9
ramdisk	8		2882.5		999.8
ramdisk	16		2738.7		1801.9
ramdisk	32		2541.9		2394.0

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Cc: Andreas Dilger <adilger@sun.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Ric Wheeler <rwheeler@redhat.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/commit.c      | 15 +++++++++++++++
 fs/jbd/transaction.c | 38 +++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 25719d902c5..3fbffb1ea71 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal)
 	int flags;
 	int err;
 	unsigned long blocknr;
+	ktime_t start_time;
+	u64 commit_time;
 	char *tagp = NULL;
 	journal_header_t *header;
 	journal_block_tag_t *tag = NULL;
@@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal)
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
 	journal->j_running_transaction = NULL;
+	start_time = ktime_get();
 	commit_transaction->t_log_start = journal->j_head;
 	wake_up(&journal->j_wait_transaction_locked);
 	spin_unlock(&journal->j_state_lock);
@@ -913,6 +916,18 @@ restart_loop:
 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
 	journal->j_commit_sequence = commit_transaction->t_tid;
 	journal->j_committing_transaction = NULL;
+	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
+
+	/*
+	 * weight the commit time higher than the average time so we don't
+	 * react too strongly to vast changes in commit time
+	 */
+	if (likely(journal->j_average_commit_time))
+		journal->j_average_commit_time = (commit_time*3 +
+				journal->j_average_commit_time) / 4;
+	else
+		journal->j_average_commit_time = commit_time;
+
 	spin_unlock(&journal->j_state_lock);
 
 	if (commit_transaction->t_checkpoint_list == NULL &&
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d4c32c880..b51fbd4b291 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -25,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/hrtimer.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 {
 	transaction->t_journal = journal;
 	transaction->t_state = T_RUNNING;
+	transaction->t_start_time = ktime_get();
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
 	spin_lock_init(&transaction->t_handle_lock);
@@ -1370,7 +1372,7 @@ int journal_stop(handle_t *handle)
 {
 	transaction_t *transaction = handle->h_transaction;
 	journal_t *journal = transaction->t_journal;
-	int old_handle_count, err;
+	int err;
 	pid_t pid;
 
 	J_ASSERT(journal_current_handle() == handle);
@@ -1399,6 +1401,17 @@ int journal_stop(handle_t *handle)
 	 * on IO anyway.  Speeds up many-threaded, many-dir operations
 	 * by 30x or more...
 	 *
+	 * We try and optimize the sleep time against what the underlying disk
+	 * can do, instead of having a static sleep time.  This is usefull for
+	 * the case where our storage is so fast that it is more optimal to go
+	 * ahead and force a flush and wait for the transaction to be committed
+	 * than it is to wait for an arbitrary amount of time for new writers to
+	 * join the transaction.  We acheive this by measuring how long it takes
+	 * to commit a transaction, and compare it with how long this
+	 * transaction has been running, and if run time < commit time then we
+	 * sleep for the delta and commit.  This greatly helps super fast disks
+	 * that would see slowdowns as more threads started doing fsyncs.
+	 *
 	 * But don't do this if this process was the most recent one to
 	 * perform a synchronous write.  We do this to detect the case where a
 	 * single process is doing a stream of sync writes.  No point in waiting
@@ -1406,11 +1419,26 @@ int journal_stop(handle_t *handle)
 	 */
 	pid = current->pid;
 	if (handle->h_sync && journal->j_last_sync_writer != pid) {
+		u64 commit_time, trans_time;
+
 		journal->j_last_sync_writer = pid;
-		do {
-			old_handle_count = transaction->t_handle_count;
-			schedule_timeout_uninterruptible(1);
-		} while (old_handle_count != transaction->t_handle_count);
+
+		spin_lock(&journal->j_state_lock);
+		commit_time = journal->j_average_commit_time;
+		spin_unlock(&journal->j_state_lock);
+
+		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
+						   transaction->t_start_time));
+
+		commit_time = min_t(u64, commit_time,
+				    1000*jiffies_to_usecs(1));
+
+		if (trans_time < commit_time) {
+			ktime_t expires = ktime_add_ns(ktime_get(),
+						       commit_time);
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
+		}
 	}
 
 	current->journal_info = NULL;
-- 
cgit v1.2.3


From 5df096d67ec2b6578518caed7d57317a4b807aa1 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 7 Jan 2009 18:07:25 -0800
Subject: ext3: allocate ->s_blockgroup_lock separately

As spotted by kmemtrace, struct ext3_sb_info is 17152 bytes on 64-bit
which makes it a very bad fit for SLAB allocators.  The culprit of the
wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when
NR_CPUS >= 32.

To fix that, allocate ->s_blockgroup_lock, which fits nicely in an order 2
page in the worst case, separately.  This shrinks down struct ext3_sb_info
enough to fit a 1 KB slab cache so now we allocate 16 KB + 1 KB instead of
32 KB, saving 15 KB of memory.

Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/super.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c22d01467bd..01c235bc205 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb)
 		ext3_blkdev_remove(sbi);
 	}
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return;
 }
@@ -1546,6 +1547,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock) {
+		kfree(sbi);
+		return -ENOMEM;
+	}
 	sb->s_fs_info = sbi;
 	sbi->s_mount_opt = 0;
 	sbi->s_resuid = EXT3_DEF_RESUID;
@@ -1786,7 +1794,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	bgl_lock_init(&sbi->s_blockgroup_lock);
+	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
-- 
cgit v1.2.3


From 2e8671cb566da993425d324fc355af31edc6e7f1 Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 7 Jan 2009 18:07:26 -0800
Subject: ext3: don't inherit inappropriate inode flags from parent

At present INDEX is the only flag that new ext3 inodes do NOT inherit from
their parent.  In addition prevent the flags DIRTY, ECOMPR, IMAGIC and
TOPDIR from being inherited.  List inheritable flags explicitly to prevent
future flags from accidentally being inherited.

This fixes the TOPDIR flag inheritance bug reported at
http://bugzilla.kernel.org/show_bug.cgi?id=9866.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/ialloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 5655fbcbd11..ba9186a21d1 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,7 +559,7 @@ got:
 	ei->i_dir_start_lookup = 0;
 	ei->i_disksize = 0;
 
-	ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL;
+	ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED;
 	if (S_ISLNK(mode))
 		ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
 	/* dirsync only applies to directories */
-- 
cgit v1.2.3


From 04143e2fb9d512c21e1dcfb561dbb0445dcfdc8c Mon Sep 17 00:00:00 2001
From: Duane Griffin <duaneg@dghda.com>
Date: Wed, 7 Jan 2009 18:07:26 -0800
Subject: ext3: tighten restrictions on inode flags

At the moment there are few restrictions on which flags may be set on
which inodes.  Specifically DIRSYNC may only be set on directories and
IMMUTABLE and APPEND may not be set on links.  Tighten that to disallow
TOPDIR being set on non-directories and only NODUMP and NOATIME to be set
on non-regular file, non-directories.

Introduce a flags masking function which masks flags based on mode and
use it during inode creation and when flags are set via the ioctl to
facilitate future consistency.

Signed-off-by: Duane Griffin <duaneg@dghda.com>
Acked-by: Andreas Dilger <adilger@sun.com>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/ialloc.c | 8 ++------
 fs/ext3/ioctl.c  | 3 +--
 2 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index ba9186a21d1..8de6c720e51 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -559,12 +559,8 @@ got:
 	ei->i_dir_start_lookup = 0;
 	ei->i_disksize = 0;
 
-	ei->i_flags = EXT3_I(dir)->i_flags & EXT3_FL_INHERITED;
-	if (S_ISLNK(mode))
-		ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL);
-	/* dirsync only applies to directories */
-	if (!S_ISDIR(mode))
-		ei->i_flags &= ~EXT3_DIRSYNC_FL;
+	ei->i_flags =
+		ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED);
 #ifdef EXT3_FRAGMENTS
 	ei->i_faddr = 0;
 	ei->i_frag_no = 0;
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index b7394d05ee8..5e86ce9a86e 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 			goto flags_out;
 		}
 
-		if (!S_ISDIR(inode->i_mode))
-			flags &= ~EXT3_DIRSYNC_FL;
+		flags = ext3_mask_flags(inode->i_mode, flags);
 
 		mutex_lock(&inode->i_mutex);
 		/* Is it quota file? Do not allow user to mess with it */
-- 
cgit v1.2.3


From 1579c3a15c06055713b42b077b805f818638302c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Wed, 7 Jan 2009 18:07:27 -0800
Subject: jbd: remove excess kernel-doc notation

Remove excess kernel-doc from fs/jbd/transaction.c:

Warning(linux-2.6.28-git5//fs/jbd/transaction.c:764): Excess function parameter 'credits' description in 'journal_get_write_access'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jbd/transaction.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index b51fbd4b291..e6a11743127 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -754,7 +754,6 @@ out:
  * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
  * @handle: transaction to add buffer modifications to
  * @bh:     bh to be used for metadata writes
- * @credits: variable that will receive credits for the buffer
  *
  * Returns an error code or 0 on success.
  *
-- 
cgit v1.2.3


From 87d1fda5e2ff6527740604a7918fc273b6f9ae13 Mon Sep 17 00:00:00 2001
From: "Richard A. Holden III" <aciddeath@gmail.com>
Date: Wed, 7 Jan 2009 18:07:28 -0800
Subject: coda: fix fs/coda/sysctl.c build warnings when !CONFIG_SYSCTL

Fix
fs/coda/sysctl.c:14: warning: 'fs_table_header' defined but not used
fs/coda/sysctl.c:44: warning: 'fs_table' defined but not used

these are only used when CONFIG_SYSCTL is defined.

Signed-off-by: Richard A. Holden III <aciddeath@gmail.com>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/sysctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index 81b7771c646..43c96ce2961 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -11,7 +11,9 @@
 
 #include "coda_int.h"
 
+#ifdef CONFIG_SYSCTL
 static struct ctl_table_header *fs_table_header;
+#endif
 
 static ctl_table coda_table[] = {
 	{
@@ -41,6 +43,7 @@ static ctl_table coda_table[] = {
 	{}
 };
 
+#ifdef CONFIG_SYSCTL
 static ctl_table fs_table[] = {
 	{
 		.ctl_name	= CTL_UNNUMBERED,
@@ -50,7 +53,7 @@ static ctl_table fs_table[] = {
 	},
 	{}
 };
-
+#endif
 
 void coda_sysctl_init(void)
 {
-- 
cgit v1.2.3


From e04a88a920ff36d03641e1b9c01b7960d94209f1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 7 Jan 2009 18:07:29 -0800
Subject: quota: don't set grace time when user isn't above softlimit

do_set_dqblk() allowed SETDQBLK quotactl to set user's grace time even if
user was not above his softlimit.  This does not make much sense and by
coincidence causes quota code to omit softlimit warning when user really
exceeds softlimit.  This patch makes do_set_dqblk() reset user's grace
time if he has not exceeded softlimit.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dquot.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 61bfff64e5a..48c0571f831 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -2090,10 +2090,12 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	}
 	if (di->dqb_valid & QIF_BTIME) {
 		dm->dqb_btime = di->dqb_btime;
+		check_blim = 1;
 		__set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
 	}
 	if (di->dqb_valid & QIF_ITIME) {
 		dm->dqb_itime = di->dqb_itime;
+		check_ilim = 1;
 		__set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
 	}
 
-- 
cgit v1.2.3


From 08e552c69c6930d64722de3ec18c51844d06ee28 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Wed, 7 Jan 2009 18:08:01 -0800
Subject: memcg: synchronized LRU

A big patch for changing memcg's LRU semantics.

Now,
  - page_cgroup is linked to its mem_cgroup's own LRU (per zone).

  - LRU of page_cgroup is not synchronous with global LRU.

  - page and page_cgroup are one-to-one and statically allocated.

  - To find which LRU a page_cgroup is on, you have to check pc->mem_cgroup as
    - lru = page_cgroup_zoneinfo(pc, nid_of_pc, zid_of_pc);

  - SwapCache is handled.

And, when we handle the LRU list of page_cgroup, we do the following.

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc); .....................(1)
	mz = page_cgroup_zoneinfo(pc);
	spin_lock(&mz->lru_lock);
	.....add to LRU
	spin_unlock(&mz->lru_lock);
	unlock_page_cgroup(pc);

But (1) is spin_lock and we have to be afraid of dead-lock with zone->lru_lock.
So, trylock() is used at (1), now. Without (1), we can't trust "mz" is correct.

This is a trial to remove this dirty nesting of locks.
This patch changes mz->lru_lock to be zone->lru_lock.
Then, above sequence will be written as

        spin_lock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU
	mem_cgroup_add/remove/etc_lru() {
		pc = lookup_page_cgroup(page);
		mz = page_cgroup_zoneinfo(pc);
		if (PageCgroupUsed(pc)) {
			....add to LRU
		}
	}
        spin_unlock(&zone->lru_lock); # in vmscan.c or swap.c via global LRU

This is much simpler.
(*) We're safe even if we don't take lock_page_cgroup(pc). Because..
    1. When pc->mem_cgroup can be modified.
       - at charge.
       - at account_move().
    2. at charge
       the PCG_USED bit is not set before pc->mem_cgroup is fixed.
    3. at account_move()
       the page is isolated and not on LRU.

Pros.
  - easy for maintenance.
  - memcg can make use of laziness of pagevec.
  - we don't have to duplicate LRU/Active/Unevictable bit in page_cgroup.
  - LRU status of memcg will be synchronized with global LRU's one.
  - # of locks are reduced.
  - account_move() is simplified very much.
Cons.
  - may increase cost of LRU rotation.
    (no impact if memcg is not configured.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/splice.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 1abab5cee4b..a54b3e3f10a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,6 +21,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/splice.h>
+#include <linux/memcontrol.h>
 #include <linux/mm_inline.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
-- 
cgit v1.2.3


From f06295b44c296c8fb08823a3118468ae343b60f2 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Wed, 7 Jan 2009 18:08:52 -0800
Subject: ELF: implement AT_RANDOM for glibc PRNG seeding

While discussing[1] the need for glibc to have access to random bytes
during program load, it seems that an earlier attempt to implement
AT_RANDOM got stalled.  This implements a random 16 byte string, available
to every ELF program via a new auxv AT_RANDOM vector.

[1] http://sourceware.org/ml/libc-alpha/2008-10/msg00006.html

Ulrich said:

glibc needs right after startup a bit of random data for internal
protections (stack canary etc).  What is now in upstream glibc is that we
always unconditionally open /dev/urandom, read some data, and use it.  For
every process startup.  That's slow.

...

The solution is to provide a limited amount of random data to the
starting process in the aux vector.  I suggested 16 bytes and this is
what the patch implements.  If we need only 16 bytes or less we use the
data directly.  If we need more we'll use the 16 bytes to seed a PRNG.
This avoids the costly /dev/urandom use and it allows the kernel to use
the most adequate source of random data for this purpose.  It might not
be the same pool as that for /dev/urandom.

Concerns were expressed about the depletion of the randomness pool.  But
this patch doesn't make the situation worse, it doesn't deplete entropy
more than happens now.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c41fa2af767..e3ff2b9e602 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	elf_addr_t __user *sp;
 	elf_addr_t __user *u_platform;
 	elf_addr_t __user *u_base_platform;
+	elf_addr_t __user *u_rand_bytes;
 	const char *k_platform = ELF_PLATFORM;
 	const char *k_base_platform = ELF_BASE_PLATFORM;
+	unsigned char k_rand_bytes[16];
 	int items;
 	elf_addr_t *elf_info;
 	int ei_index = 0;
@@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 			return -EFAULT;
 	}
 
+	/*
+	 * Generate 16 random bytes for userspace PRNG seeding.
+	 */
+	get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes));
+	u_rand_bytes = (elf_addr_t __user *)
+		       STACK_ALLOC(p, sizeof(k_rand_bytes));
+	if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes)))
+		return -EFAULT;
+
 	/* Create the ELF interpreter info */
 	elf_info = (elf_addr_t *)current->mm->saved_auxv;
 	/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
 	NEW_AUX_ENT(AT_GID, cred->gid);
 	NEW_AUX_ENT(AT_EGID, cred->egid);
  	NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+	NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
 	NEW_AUX_ENT(AT_EXECFN, bprm->exec);
 	if (k_platform) {
 		NEW_AUX_ENT(AT_PLATFORM,
-- 
cgit v1.2.3


From 921d58c0e6992f74d3a48180604a298f115d2dee Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 7 Jan 2009 18:09:06 -0800
Subject: vmcore: remove saved_max_pfn check

Remove the saved_max_pfn check from the /proc/vmcore function
read_from_oldmem().  No need to verify, we should be able to just trust
that "elfcorehdr=" is correctly passed to the crash kernel on the kernel
command line like we do with other parameters.

The read_from_oldmem() function in fs/proc/vmcore.c is quite similar to
read_from_oldmem() in drivers/char/mem.c, but only in the latter it makes
sense to use saved_max_pfn.  For oldmem it is used to determine when to
stop reading.  For vmcore we already have the elf header info pointing out
the physical memory regions, no need to pass the end-of-old-memory twice.

Removing the saved_max_pfn check from vmcore makes it possible for
architectures to skip oldmem but still support crash dump through vmcore -
without the need for the old saved_max_pfn cruft.

Architectures that want to play safe can do the saved_max_pfn check in
copy_oldmem_page().  Not sure why anyone would want to do that, but that's
even safer than today - the saved_max_pfn check in vmcore removed by this
patch only checks the first page.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Acked-by: Simon Horman <horms@verge.net.au>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/vmcore.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 03ec5950490..5edcc3f92ba 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 
 	offset = (unsigned long)(*ppos % PAGE_SIZE);
 	pfn = (unsigned long)(*ppos / PAGE_SIZE);
-	if (pfn > saved_max_pfn)
-		return -EINVAL;
 
 	do {
 		if (count > (PAGE_SIZE - offset))
-- 
cgit v1.2.3


From f15659628b43b27c20447c731456c39cbec973e9 Mon Sep 17 00:00:00 2001
From: roel kluin <roel.kluin@gmail.com>
Date: Wed, 7 Jan 2009 18:09:08 -0800
Subject: romfs: romfs_iget() - unsigned ino >= 0 is always true

romfs_strnlen() returns int
unsigned X >= 0 is always true

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: roel kluin <roel.kluin@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/romfs/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index c97d4c93171..98a232f7196 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -490,7 +490,7 @@ static mode_t romfs_modemap[] =
 static struct inode *
 romfs_iget(struct super_block *sb, unsigned long ino)
 {
-	int nextfh;
+	int nextfh, ret;
 	struct romfs_inode ri;
 	struct inode *i;
 
@@ -526,11 +526,11 @@ romfs_iget(struct super_block *sb, unsigned long ino)
 	i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
 
         /* Precalculate the data offset */
-        ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
-        if (ino >= 0)
-                ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK);
-        else
-                ino = 0;
+	ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
+	if (ret >= 0)
+		ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
+	else
+		ino = 0;
 
         ROMFS_I(i)->i_metasize = ino;
         ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
-- 
cgit v1.2.3


From c19a28e1193a6c854738d609ae9b2fe2f6e6bea4 Mon Sep 17 00:00:00 2001
From: Fernando Carrijo <fcarrijo@yahoo.com.br>
Date: Wed, 7 Jan 2009 18:09:08 -0800
Subject: remove lots of double-semicolons

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: James Morris <jmorris@namei.org>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/alloc.c | 2 +-
 fs/ocfs2/file.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 54ff4c77aaa..d861096c9d8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3868,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode,
 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
 	struct ocfs2_extent_rec *rec, *tmprec;
 
-	right_el = path_leaf_el(right_path);;
+	right_el = path_leaf_el(right_path);
 	if (left_path)
 		left_el = path_leaf_el(left_path);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e8f795f978a..a5887df2cd8 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1605,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 			    struct ocfs2_space_resv *sr)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
 	    !ocfs2_writes_unwritten_extents(osb))
-- 
cgit v1.2.3


From be857df1dd8d8e1491e60d999caf3b8446ccd475 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Jan 2009 18:09:12 -0800
Subject: generic swap(): ext3: remove local swap() macro

Use the new generic implementation.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext3/namei.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1dd2abe6313..8d6f965e502 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
 
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
-- 
cgit v1.2.3


From 97e133b4543c5c677e768a8538d6d704c4218ff2 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Jan 2009 18:09:13 -0800
Subject: generic swap(): ext4: remove local swap() macro

Use the new generic implementation.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/namei.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9fd2a5e1be4..4b8d431d7df 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle,
 #define assert(test) J_ASSERT(test)
 #endif
 
-#ifndef swap
-#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
-#endif
-
 #ifdef DX_DEBUG
 #define dxtrace(command) command
 #else
-- 
cgit v1.2.3


From 9a8d5bb4ad829e66ab5428ccdce2cbc8ab0ac96c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Jan 2009 18:09:14 -0800
Subject: generic swap(): dcache: use swap() instead of private do_switch()

Use the new generic implementation.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index e88c23b85a3..4547f66884a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry)
 	spin_unlock(&dcache_lock);
 }
 
-#define do_switch(x,y) do { \
-	__typeof__ (x) __tmp = x; \
-	x = y; y = __tmp; } while (0)
-
 /*
  * When switching names, the actual string doesn't strictly have to
  * be preserved in the target - because we're dropping the target
@@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			/*
 			 * Both external: swap the pointers
 			 */
-			do_switch(target->d_name.name, dentry->d_name.name);
+			swap(target->d_name.name, dentry->d_name.name);
 		} else {
 			/*
 			 * dentry:internal, target:external.  Steal target's
@@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
 			return;
 		}
 	}
-	do_switch(dentry->d_name.len, target->d_name.len);
+	swap(dentry->d_name.len, target->d_name.len);
 }
 
 /*
@@ -1680,7 +1676,7 @@ already_unhashed:
 
 	/* Switch the names.. */
 	switch_names(dentry, target);
-	do_switch(dentry->d_name.hash, target->d_name.hash);
+	swap(dentry->d_name.hash, target->d_name.hash);
 
 	/* ... and switch the parents */
 	if (IS_ROOT(dentry)) {
@@ -1688,7 +1684,7 @@ already_unhashed:
 		target->d_parent = target;
 		INIT_LIST_HEAD(&target->d_u.d_child);
 	} else {
-		do_switch(dentry->d_parent, target->d_parent);
+		swap(dentry->d_parent, target->d_parent);
 
 		/* And add them back to the (new) parent lists */
 		list_add(&target->d_u.d_child, &target->d_parent->d_subdirs);
@@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	struct dentry *dparent, *aparent;
 
 	switch_names(dentry, anon);
-	do_switch(dentry->d_name.hash, anon->d_name.hash);
+	swap(dentry->d_name.hash, anon->d_name.hash);
 
 	dparent = dentry->d_parent;
 	aparent = anon->d_parent;
-- 
cgit v1.2.3


From 73ac36ea14fd18ea3dc057e41b16ff31a3c0bd5a Mon Sep 17 00:00:00 2001
From: Coly Li <coyli@suse.de>
Date: Wed, 7 Jan 2009 18:09:16 -0800
Subject: fix similar typos to successfull

While reviewing ocfs2 code, I found 2 instances of the typo "successfull".
After grepping for "successfull " in the kernel tree, 22 such typos were
found in total -- great minds always think alike :)

This patch fixes all the similar typos. Thanks to Randy for his ack and comments.

Signed-off-by: Coly Li <coyli@suse.de>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Roland Dreier <rolandd@cisco.com>
Cc: Jeremy Kerr <jk@ozlabs.org>
Cc: Jeff Garzik <jeff@garzik.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Vlad Yasevich <vladislav.yasevich@hp.com>
Cc: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext4/extents.c  | 2 +-
 fs/ocfs2/dlmglue.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea2ce3c0ae6..3f54db31cdc 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2536,7 +2536,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 */
 		newdepth = ext_depth(inode);
 		/*
-		 * update the extent length after successfull insert of the
+		 * update the extent length after successful insert of the
 		 * split extent
 		 */
 		orig_ex.ee_len = cpu_to_le16(ee_len -
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index f731ab49179..b0c4cadd4c4 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1324,7 +1324,7 @@ again:
 			goto out;
 		}
 
-		mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
+		mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n",
 		     lockres->l_name);
 
 		/* At this point we've gone inside the dlm and need to
@@ -2951,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 		ocfs2_dlm_dump_lksb(&lockres->l_lksb);
 		BUG();
 	}
-	mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
+	mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n",
 	     lockres->l_name);
 
 	ocfs2_wait_on_busy_lock(lockres);
-- 
cgit v1.2.3


From 892c4467e335e9050c95e0d8409c136c4dadaca2 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Jan 2009 16:48:52 -0600
Subject: dlm: fix seq_file usage in debugfs lock dump

The old code would leak iterators and leave reference counts on
rsbs because it was ignoring the "stop" seq callback.  The code
followed an example that used the seq operations differently.
This new code is based on actually understanding how the seq
operations work.  It also improves things by saving the hash bucket
in the position to avoid cycling through completed buckets in start.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c | 696 ++++++++++++++++++++++++++----------------------------
 1 file changed, 337 insertions(+), 359 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 2f107d1a6a4..bc4af3ef65a 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -1,7 +1,7 @@
 /******************************************************************************
 *******************************************************************************
 **
-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2005-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -25,19 +25,6 @@ static struct mutex debug_buf_lock;
 
 static struct dentry *dlm_root;
 
-struct rsb_iter {
-	int entry;
-	int format;
-	int header;
-	struct dlm_ls *ls;
-	struct list_head *next;
-	struct dlm_rsb *rsb;
-};
-
-/*
- * dump all rsb's in the lockspace hash table
- */
-
 static char *print_lockmode(int mode)
 {
 	switch (mode) {
@@ -60,13 +47,13 @@ static char *print_lockmode(int mode)
 	}
 }
 
-static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       struct dlm_rsb *res)
+static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      struct dlm_rsb *res)
 {
 	seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode));
 
-	if (lkb->lkb_status == DLM_LKSTS_CONVERT
-	    || lkb->lkb_status == DLM_LKSTS_WAITING)
+	if (lkb->lkb_status == DLM_LKSTS_CONVERT ||
+	    lkb->lkb_status == DLM_LKSTS_WAITING)
 		seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode));
 
 	if (lkb->lkb_nodeid) {
@@ -80,33 +67,42 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	if (lkb->lkb_wait_type)
 		seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
 
-	seq_printf(s, "\n");
+	return seq_printf(s, "\n");
 }
 
 static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
 	int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list;
+	int rv;
 
 	lock_rsb(res);
 
-	seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length);
+	rv = seq_printf(s, "\nResource %p Name (len=%d) \"",
+			res, res->res_length);
+	if (rv)
+		goto out;
+
 	for (i = 0; i < res->res_length; i++) {
 		if (isprint(res->res_name[i]))
 			seq_printf(s, "%c", res->res_name[i]);
 		else
 			seq_printf(s, "%c", '.');
 	}
+
 	if (res->res_nodeid > 0)
-		seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
-			   res->res_nodeid);
+		rv = seq_printf(s, "\"  \nLocal Copy, Master is node %d\n",
+				res->res_nodeid);
 	else if (res->res_nodeid == 0)
-		seq_printf(s, "\"  \nMaster Copy\n");
+		rv = seq_printf(s, "\"  \nMaster Copy\n");
 	else if (res->res_nodeid == -1)
-		seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
-			   res->res_first_lkid);
+		rv = seq_printf(s, "\"  \nLooking up master (lkid %x)\n",
+			   	res->res_first_lkid);
 	else
-		seq_printf(s, "\"  \nInvalid master %d\n", res->res_nodeid);
+		rv = seq_printf(s, "\"  \nInvalid master %d\n",
+				res->res_nodeid);
+	if (rv)
+		goto out;
 
 	/* Print the LVB: */
 	if (res->res_lvbptr) {
@@ -119,52 +115,66 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
 		}
 		if (rsb_flag(res, RSB_VALNOTVALID))
 			seq_printf(s, " (INVALID)");
-		seq_printf(s, "\n");
+		rv = seq_printf(s, "\n");
+		if (rv)
+			goto out;
 	}
 
 	root_list = !list_empty(&res->res_root_list);
 	recover_list = !list_empty(&res->res_recover_list);
 
 	if (root_list || recover_list) {
-		seq_printf(s, "Recovery: root %d recover %d flags %lx "
-			   "count %d\n", root_list, recover_list,
-			   res->res_flags, res->res_recover_locks_count);
+		rv = seq_printf(s, "Recovery: root %d recover %d flags %lx "
+				"count %d\n", root_list, recover_list,
+			   	res->res_flags, res->res_recover_locks_count);
+		if (rv)
+			goto out;
 	}
 
 	/* Print the locks attached to this resource */
 	seq_printf(s, "Granted Queue\n");
-	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	seq_printf(s, "Conversion Queue\n");
-	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	seq_printf(s, "Waiting Queue\n");
-	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue)
-		print_format1_lock(s, lkb, res);
+	list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
+		rv = print_format1_lock(s, lkb, res);
+		if (rv)
+			goto out;
+	}
 
 	if (list_empty(&res->res_lookup))
 		goto out;
 
 	seq_printf(s, "Lookup Queue\n");
 	list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
-		seq_printf(s, "%08x %s", lkb->lkb_id,
-			   print_lockmode(lkb->lkb_rqmode));
+		rv = seq_printf(s, "%08x %s", lkb->lkb_id,
+				print_lockmode(lkb->lkb_rqmode));
 		if (lkb->lkb_wait_type)
 			seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
-		seq_printf(s, "\n");
+		rv = seq_printf(s, "\n");
 	}
  out:
 	unlock_rsb(res);
-	return 0;
+	return rv;
 }
 
-static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       struct dlm_rsb *r)
+static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      struct dlm_rsb *r)
 {
 	u64 xid = 0;
 	u64 us;
+	int rv;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
@@ -177,69 +187,82 @@ static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb,
 	/* id nodeid remid pid xid exflags flags sts grmode rqmode time_us
 	   r_nodeid r_len r_name */
 
-	seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
-		   lkb->lkb_id,
-		   lkb->lkb_nodeid,
-		   lkb->lkb_remid,
-		   lkb->lkb_ownpid,
-		   (unsigned long long)xid,
-		   lkb->lkb_exflags,
-		   lkb->lkb_flags,
-		   lkb->lkb_status,
-		   lkb->lkb_grmode,
-		   lkb->lkb_rqmode,
-		   (unsigned long long)us,
-		   r->res_nodeid,
-		   r->res_length,
-		   r->res_name);
+	rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n",
+			lkb->lkb_id,
+			lkb->lkb_nodeid,
+			lkb->lkb_remid,
+			lkb->lkb_ownpid,
+			(unsigned long long)xid,
+			lkb->lkb_exflags,
+			lkb->lkb_flags,
+			lkb->lkb_status,
+			lkb->lkb_grmode,
+			lkb->lkb_rqmode,
+			(unsigned long long)us,
+			r->res_nodeid,
+			r->res_length,
+			r->res_name);
+	return rv;
 }
 
 static int print_format2(struct dlm_rsb *r, struct seq_file *s)
 {
 	struct dlm_lkb *lkb;
+	int rv = 0;
 
 	lock_rsb(r);
 
-	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
-
-	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
 
-	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_format2_lock(s, lkb, r);
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
 
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		rv = print_format2_lock(s, lkb, r);
+		if (rv)
+			goto out;
+	}
+ out:
 	unlock_rsb(r);
-	return 0;
+	return rv;
 }
 
-static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
-			       int rsb_lookup)
+static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
+			      int rsb_lookup)
 {
 	u64 xid = 0;
+	int rv;
 
 	if (lkb->lkb_flags & DLM_IFL_USER) {
 		if (lkb->lkb_ua)
 			xid = lkb->lkb_ua->xid;
 	}
 
-	seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
-		   lkb->lkb_id,
-		   lkb->lkb_nodeid,
-		   lkb->lkb_remid,
-		   lkb->lkb_ownpid,
-		   (unsigned long long)xid,
-		   lkb->lkb_exflags,
-		   lkb->lkb_flags,
-		   lkb->lkb_status,
-		   lkb->lkb_grmode,
-		   lkb->lkb_rqmode,
-		   lkb->lkb_highbast,
-		   rsb_lookup,
-		   lkb->lkb_wait_type,
-		   lkb->lkb_lvbseq,
-		   (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
-		   (unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+	rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n",
+			lkb->lkb_id,
+			lkb->lkb_nodeid,
+			lkb->lkb_remid,
+			lkb->lkb_ownpid,
+			(unsigned long long)xid,
+			lkb->lkb_exflags,
+			lkb->lkb_flags,
+			lkb->lkb_status,
+			lkb->lkb_grmode,
+			lkb->lkb_rqmode,
+			lkb->lkb_highbast,
+			rsb_lookup,
+			lkb->lkb_wait_type,
+			lkb->lkb_lvbseq,
+			(unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
+			(unsigned long long)ktime_to_ns(lkb->lkb_time_bast));
+	return rv;
 }
 
 static int print_format3(struct dlm_rsb *r, struct seq_file *s)
@@ -247,18 +270,21 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 	struct dlm_lkb *lkb;
 	int i, lvblen = r->res_ls->ls_lvblen;
 	int print_name = 1;
+	int rv;
 
 	lock_rsb(r);
 
-	seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
-		   r,
-		   r->res_nodeid,
-		   r->res_first_lkid,
-		   r->res_flags,
-		   !list_empty(&r->res_root_list),
-		   !list_empty(&r->res_recover_list),
-		   r->res_recover_locks_count,
-		   r->res_length);
+	rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ",
+			r,
+			r->res_nodeid,
+			r->res_first_lkid,
+			r->res_flags,
+			!list_empty(&r->res_root_list),
+			!list_empty(&r->res_recover_list),
+			r->res_recover_locks_count,
+			r->res_length);
+	if (rv)
+		goto out;
 
 	for (i = 0; i < r->res_length; i++) {
 		if (!isascii(r->res_name[i]) || !isprint(r->res_name[i]))
@@ -273,7 +299,9 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 		else
 			seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
 	}
-	seq_printf(s, "\n");
+	rv = seq_printf(s, "\n");
+	if (rv)
+		goto out;
 
 	if (!r->res_lvbptr)
 		goto do_locks;
@@ -282,344 +310,294 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
 
 	for (i = 0; i < lvblen; i++)
 		seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
-	seq_printf(s, "\n");
+	rv = seq_printf(s, "\n");
+	if (rv)
+		goto out;
 
  do_locks:
-	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
-		print_format3_lock(s, lkb, 0);
-
-	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
-		print_format3_lock(s, lkb, 1);
-
-	unlock_rsb(r);
-	return 0;
-}
-
-static int rsb_iter_next(struct rsb_iter *ri)
-{
-	struct dlm_ls *ls = ri->ls;
-	int i;
-
-	if (!ri->next) {
- top:
-		/* Find the next non-empty hash bucket */
-		for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) {
-			read_lock(&ls->ls_rsbtbl[i].lock);
-			if (!list_empty(&ls->ls_rsbtbl[i].list)) {
-				ri->next = ls->ls_rsbtbl[i].list.next;
-				ri->rsb = list_entry(ri->next, struct dlm_rsb,
-							res_hashchain);
-				dlm_hold_rsb(ri->rsb);
-				read_unlock(&ls->ls_rsbtbl[i].lock);
-				break;
-			}
-			read_unlock(&ls->ls_rsbtbl[i].lock);
-		}
-		ri->entry = i;
-
-		if (ri->entry >= ls->ls_rsbtbl_size)
-			return 1;
-	} else {
-		struct dlm_rsb *old = ri->rsb;
-		i = ri->entry;
-		read_lock(&ls->ls_rsbtbl[i].lock);
-		ri->next = ri->next->next;
-		if (ri->next->next == ls->ls_rsbtbl[i].list.next) {
-			/* End of list - move to next bucket */
-			ri->next = NULL;
-			ri->entry++;
-			read_unlock(&ls->ls_rsbtbl[i].lock);
-			dlm_put_rsb(old);
-			goto top;
-		}
-		ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain);
-		dlm_hold_rsb(ri->rsb);
-		read_unlock(&ls->ls_rsbtbl[i].lock);
-		dlm_put_rsb(old);
+	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return 0;
-}
-
-static void rsb_iter_free(struct rsb_iter *ri)
-{
-	kfree(ri);
-}
-
-static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls)
-{
-	struct rsb_iter *ri;
-
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
-	if (!ri)
-		return NULL;
-
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 1;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return ri;
-}
-
-static void *rsb_seq_start(struct seq_file *file, loff_t *pos)
-{
-	struct rsb_iter *ri;
-	loff_t n = *pos;
-
-	ri = rsb_iter_init(file->private);
-	if (!ri)
-		return NULL;
-
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
-			return NULL;
-		}
+	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+		rv = print_format3_lock(s, lkb, 0);
+		if (rv)
+			goto out;
 	}
 
-	return ri;
-}
-
-static void *rsb_seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos)
-{
-	struct rsb_iter *ri = iter_ptr;
-
-	(*pos)++;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) {
+		rv = print_format3_lock(s, lkb, 1);
+		if (rv)
+			goto out;
 	}
-
-	return ri;
+ out:
+	unlock_rsb(r);
+	return rv;
 }
 
-static void rsb_seq_stop(struct seq_file *file, void *iter_ptr)
-{
-	/* nothing for now */
-}
+struct rsbtbl_iter {
+	struct dlm_rsb *rsb;
+	unsigned bucket;
+	int format;
+	int header;
+};
 
-static int rsb_seq_show(struct seq_file *file, void *iter_ptr)
+/* seq_printf returns -1 if the buffer is full, and 0 otherwise.
+   If the buffer is full, seq_printf can be called again, but it
+   does nothing and just returns -1.  So, these printing routines
+   periodically check the return value to avoid wasting too much time
+   trying to print to a full buffer. */
+
+static int table_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-	struct rsb_iter *ri = iter_ptr;
+	struct rsbtbl_iter *ri = iter_ptr;
+	int rv = 0;
 
 	switch (ri->format) {
 	case 1:
-		print_format1(ri->rsb, file);
+		rv = print_format1(ri->rsb, seq);
 		break;
 	case 2:
 		if (ri->header) {
-			seq_printf(file, "id nodeid remid pid xid exflags "
-					 "flags sts grmode rqmode time_ms "
-					 "r_nodeid r_len r_name\n");
+			seq_printf(seq, "id nodeid remid pid xid exflags "
+					"flags sts grmode rqmode time_ms "
+					"r_nodeid r_len r_name\n");
 			ri->header = 0;
 		}
-		print_format2(ri->rsb, file);
+		rv = print_format2(ri->rsb, seq);
 		break;
 	case 3:
 		if (ri->header) {
-			seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+			seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
 			ri->header = 0;
 		}
-		print_format3(ri->rsb, file);
+		rv = print_format3(ri->rsb, seq);
 		break;
 	}
 
-	return 0;
+	return rv;
 }
 
-static struct seq_operations rsb_seq_ops = {
-	.start = rsb_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
-};
+static struct seq_operations format1_seq_ops;
+static struct seq_operations format2_seq_ops;
+static struct seq_operations format3_seq_ops;
 
-static int rsb_open(struct inode *inode, struct file *file)
+static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct seq_file *seq;
-	int ret;
-
-	ret = seq_open(file, &rsb_seq_ops);
-	if (ret)
-		return ret;
-
-	seq = file->private_data;
-	seq->private = inode->i_private;
-
-	return 0;
-}
-
-static const struct file_operations rsb_fops = {
-	.owner   = THIS_MODULE,
-	.open    = rsb_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri;
+	struct dlm_rsb *r;
+	loff_t n = *pos;
+	unsigned bucket, entry;
 
-/*
- * Dump state in compact per-lock listing
- */
+	bucket = n >> 32;
+	entry = n & ((1LL << 32) - 1);
 
-static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos)
-{
-	struct rsb_iter *ri;
+	if (bucket >= ls->ls_rsbtbl_size)
+		return NULL;
 
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
+	ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
 	if (!ri)
 		return NULL;
-
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 2;
-
-	if (*pos == 0)
+	if (n == 0)
 		ri->header = 1;
-
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+	if (seq->op == &format1_seq_ops)
+		ri->format = 1;
+	if (seq->op == &format2_seq_ops)
+		ri->format = 2;
+	if (seq->op == &format3_seq_ops)
+		ri->format = 3;
+
+	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+		list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
+				    res_hashchain) {
+			if (!entry--) {
+				dlm_hold_rsb(r);
+				ri->rsb = r;
+				ri->bucket = bucket;
+				read_unlock(&ls->ls_rsbtbl[bucket].lock);
+				return ri;
+			}
+		}
 	}
+	read_unlock(&ls->ls_rsbtbl[bucket].lock);
 
-	return ri;
-}
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
 
-static void *locks_seq_start(struct seq_file *file, loff_t *pos)
-{
-	struct rsb_iter *ri;
-	loff_t n = *pos;
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
 
-	ri = locks_iter_init(file->private, pos);
-	if (!ri)
-		return NULL;
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
 
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
 			return NULL;
 		}
-	}
 
-	return ri;
+		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+					     struct dlm_rsb, res_hashchain);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	}
 }
 
-static struct seq_operations locks_seq_ops = {
-	.start = locks_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
-};
-
-static int locks_open(struct inode *inode, struct file *file)
+static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 {
-	struct seq_file *seq;
-	int ret;
-
-	ret = seq_open(file, &locks_seq_ops);
-	if (ret)
-		return ret;
-
-	seq = file->private_data;
-	seq->private = inode->i_private;
-
-	return 0;
-}
-
-static const struct file_operations locks_fops = {
-	.owner   = THIS_MODULE,
-	.open    = locks_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = seq_release
-};
-
-/*
- * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks
- * This can replace both formats 1 and 2 eventually.
- */
+	struct dlm_ls *ls = seq->private;
+	struct rsbtbl_iter *ri = iter_ptr;
+	struct list_head *next;
+	struct dlm_rsb *r, *rp;
+	loff_t n = *pos;
+	unsigned bucket;
+
+	bucket = n >> 32;
+
+	/*
+	 * move to the next rsb in the same bucket
+	 */
+
+	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	rp = ri->rsb;
+	next = rp->res_hashchain.next;
+
+	if (next != &ls->ls_rsbtbl[bucket].list) {
+		r = list_entry(next, struct dlm_rsb, res_hashchain);
+		dlm_hold_rsb(r);
+		ri->rsb = r;
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		dlm_put_rsb(rp);
+		++*pos;
+		return ri;
+	}
+	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	dlm_put_rsb(rp);
 
-static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos)
-{
-	struct rsb_iter *ri;
+	/*
+	 * move to the first rsb in the next non-empty bucket
+	 */
 
-	ri = kzalloc(sizeof *ri, GFP_KERNEL);
-	if (!ri)
-		return NULL;
+	/* zero the entry */
+	n &= ~((1LL << 32) - 1);
 
-	ri->ls = ls;
-	ri->entry = 0;
-	ri->next = NULL;
-	ri->format = 3;
+	while (1) {
+		bucket++;
+		n += 1LL << 32;
 
-	if (*pos == 0)
-		ri->header = 1;
+		if (bucket >= ls->ls_rsbtbl_size) {
+			kfree(ri);
+			return NULL;
+		}
 
-	if (rsb_iter_next(ri)) {
-		rsb_iter_free(ri);
-		return NULL;
+		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
+			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
+					     struct dlm_rsb, res_hashchain);
+			dlm_hold_rsb(r);
+			ri->rsb = r;
+			ri->bucket = bucket;
+			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			*pos = n;
+			return ri;
+		}
+		read_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
-
-	return ri;
 }
 
-static void *all_seq_start(struct seq_file *file, loff_t *pos)
+static void table_seq_stop(struct seq_file *seq, void *iter_ptr)
 {
-	struct rsb_iter *ri;
-	loff_t n = *pos;
-
-	ri = all_iter_init(file->private, pos);
-	if (!ri)
-		return NULL;
+	struct rsbtbl_iter *ri = iter_ptr;
 
-	while (n--) {
-		if (rsb_iter_next(ri)) {
-			rsb_iter_free(ri);
-			return NULL;
-		}
+	if (ri) {
+		dlm_put_rsb(ri->rsb);
+		kfree(ri);
 	}
-
-	return ri;
 }
 
-static struct seq_operations all_seq_ops = {
-	.start = all_seq_start,
-	.next  = rsb_seq_next,
-	.stop  = rsb_seq_stop,
-	.show  = rsb_seq_show,
+static struct seq_operations format1_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
 };
 
-static int all_open(struct inode *inode, struct file *file)
+static struct seq_operations format2_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static struct seq_operations format3_seq_ops = {
+	.start = table_seq_start,
+	.next  = table_seq_next,
+	.stop  = table_seq_stop,
+	.show  = table_seq_show,
+};
+
+static const struct file_operations format1_fops;
+static const struct file_operations format2_fops;
+static const struct file_operations format3_fops;
+
+static int table_open(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq;
-	int ret;
+	int ret = -1;
+
+	if (file->f_op == &format1_fops)
+		ret = seq_open(file, &format1_seq_ops);
+	else if (file->f_op == &format2_fops)
+		ret = seq_open(file, &format2_seq_ops);
+	else if (file->f_op == &format3_fops)
+		ret = seq_open(file, &format3_seq_ops);
 
-	ret = seq_open(file, &all_seq_ops);
 	if (ret)
 		return ret;
 
 	seq = file->private_data;
-	seq->private = inode->i_private;
-
+	seq->private = inode->i_private; /* the dlm_ls */
 	return 0;
 }
 
-static const struct file_operations all_fops = {
+static const struct file_operations format1_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format2_fops = {
+	.owner   = THIS_MODULE,
+	.open    = table_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static const struct file_operations format3_fops = {
 	.owner   = THIS_MODULE,
-	.open    = all_open,
+	.open    = table_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
 	.release = seq_release
@@ -689,7 +667,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
-						      &rsb_fops);
+						      &format1_fops);
 	if (!ls->ls_debug_rsb_dentry)
 		goto fail;
 
@@ -702,7 +680,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 							S_IFREG | S_IRUGO,
 							dlm_root,
 							ls,
-							&locks_fops);
+							&format2_fops);
 	if (!ls->ls_debug_locks_dentry)
 		goto fail;
 
@@ -715,7 +693,7 @@ int dlm_create_debug_file(struct dlm_ls *ls)
 						      S_IFREG | S_IRUGO,
 						      dlm_root,
 						      ls,
-						      &all_fops);
+						      &format3_fops);
 	if (!ls->ls_debug_all_dentry)
 		goto fail;
 
-- 
cgit v1.2.3


From c7be761a8163d2f1ac0b606c21e4316b7abc5af7 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 7 Jan 2009 16:50:41 -0600
Subject: dlm: change rsbtbl rwlock to spinlock

The rwlock is almost always used in write mode, so there's no reason
to not use a spinlock instead.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/debug_fs.c     | 24 ++++++++++++------------
 fs/dlm/dlm_internal.h |  2 +-
 fs/dlm/lock.c         | 26 +++++++++++++-------------
 fs/dlm/lockspace.c    |  2 +-
 fs/dlm/recover.c      | 10 +++++-----
 5 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index bc4af3ef65a..1d1d2744223 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -416,7 +416,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 	if (seq->op == &format3_seq_ops)
 		ri->format = 3;
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 		list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list,
 				    res_hashchain) {
@@ -424,12 +424,12 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 				dlm_hold_rsb(r);
 				ri->rsb = r;
 				ri->bucket = bucket;
-				read_unlock(&ls->ls_rsbtbl[bucket].lock);
+				spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 				return ri;
 			}
 		}
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 
 	/*
 	 * move to the first rsb in the next non-empty bucket
@@ -447,18 +447,18 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
 			return NULL;
 		}
 
-		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
 		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
 					     struct dlm_rsb, res_hashchain);
 			dlm_hold_rsb(r);
 			ri->rsb = r;
 			ri->bucket = bucket;
-			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 			*pos = n;
 			return ri;
 		}
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
 }
 
@@ -477,7 +477,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 	 * move to the next rsb in the same bucket
 	 */
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	rp = ri->rsb;
 	next = rp->res_hashchain.next;
 
@@ -485,12 +485,12 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 		r = list_entry(next, struct dlm_rsb, res_hashchain);
 		dlm_hold_rsb(r);
 		ri->rsb = r;
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 		dlm_put_rsb(rp);
 		++*pos;
 		return ri;
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	dlm_put_rsb(rp);
 
 	/*
@@ -509,18 +509,18 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
 			return NULL;
 		}
 
-		read_lock(&ls->ls_rsbtbl[bucket].lock);
+		spin_lock(&ls->ls_rsbtbl[bucket].lock);
 		if (!list_empty(&ls->ls_rsbtbl[bucket].list)) {
 			r = list_first_entry(&ls->ls_rsbtbl[bucket].list,
 					     struct dlm_rsb, res_hashchain);
 			dlm_hold_rsb(r);
 			ri->rsb = r;
 			ri->bucket = bucket;
-			read_unlock(&ls->ls_rsbtbl[bucket].lock);
+			spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 			*pos = n;
 			return ri;
 		}
-		read_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	}
 }
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index ef2f1e35396..076e86f38bc 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -105,7 +105,7 @@ struct dlm_dirtable {
 struct dlm_rsbtable {
 	struct list_head	list;
 	struct list_head	toss;
-	rwlock_t		lock;
+	spinlock_t		lock;
 };
 
 struct dlm_lkbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 6cfe65bbf4a..01e7d39c5fb 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -412,9 +412,9 @@ static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
 		      unsigned int flags, struct dlm_rsb **r_ret)
 {
 	int error;
-	write_lock(&ls->ls_rsbtbl[b].lock);
+	spin_lock(&ls->ls_rsbtbl[b].lock);
 	error = _search_rsb(ls, name, len, b, flags, r_ret);
-	write_unlock(&ls->ls_rsbtbl[b].lock);
+	spin_unlock(&ls->ls_rsbtbl[b].lock);
 	return error;
 }
 
@@ -478,16 +478,16 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
 		r->res_nodeid = nodeid;
 	}
 
-	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
 	if (!error) {
-		write_unlock(&ls->ls_rsbtbl[bucket].lock);
+		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 		dlm_free_rsb(r);
 		r = tmp;
 		goto out;
 	}
 	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
-	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	error = 0;
  out:
 	*r_ret = r;
@@ -530,9 +530,9 @@ static void put_rsb(struct dlm_rsb *r)
 	struct dlm_ls *ls = r->res_ls;
 	uint32_t bucket = r->res_bucket;
 
-	write_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	kref_put(&r->res_ref, toss_rsb);
-	write_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 }
 
 void dlm_put_rsb(struct dlm_rsb *r)
@@ -967,7 +967,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 
 	for (;;) {
 		found = 0;
-		write_lock(&ls->ls_rsbtbl[b].lock);
+		spin_lock(&ls->ls_rsbtbl[b].lock);
 		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
 					    res_hashchain) {
 			if (!time_after_eq(jiffies, r->res_toss_time +
@@ -978,20 +978,20 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
 		}
 
 		if (!found) {
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 			break;
 		}
 
 		if (kref_put(&r->res_ref, kill_rsb)) {
 			list_del(&r->res_hashchain);
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 
 			if (is_master(r))
 				dir_remove(r);
 			dlm_free_rsb(r);
 			count++;
 		} else {
-			write_unlock(&ls->ls_rsbtbl[b].lock);
+			spin_unlock(&ls->ls_rsbtbl[b].lock);
 			log_error(ls, "tossed rsb in use %s", r->res_name);
 		}
 	}
@@ -4224,7 +4224,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
 {
 	struct dlm_rsb *r, *r_ret = NULL;
 
-	read_lock(&ls->ls_rsbtbl[bucket].lock);
+	spin_lock(&ls->ls_rsbtbl[bucket].lock);
 	list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) {
 		if (!rsb_flag(r, RSB_LOCKS_PURGED))
 			continue;
@@ -4233,7 +4233,7 @@ static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
 		r_ret = r;
 		break;
 	}
-	read_unlock(&ls->ls_rsbtbl[bucket].lock);
+	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 	return r_ret;
 }
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8d86b7960f0..aa32e5f0249 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -464,7 +464,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 	for (i = 0; i < size; i++) {
 		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list);
 		INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss);
-		rwlock_init(&ls->ls_rsbtbl[i].lock);
+		spin_lock_init(&ls->ls_rsbtbl[i].lock);
 	}
 
 	size = dlm_config.ci_lkbtbl_size;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 80aba5bdd4a..eda43f36261 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -726,7 +726,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 	}
 
 	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-		read_lock(&ls->ls_rsbtbl[i].lock);
+		spin_lock(&ls->ls_rsbtbl[i].lock);
 		list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
 			list_add(&r->res_root_list, &ls->ls_root_list);
 			dlm_hold_rsb(r);
@@ -737,7 +737,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 		   but no other recovery steps should do anything with them. */
 
 		if (dlm_no_directory(ls)) {
-			read_unlock(&ls->ls_rsbtbl[i].lock);
+			spin_unlock(&ls->ls_rsbtbl[i].lock);
 			continue;
 		}
 
@@ -745,7 +745,7 @@ int dlm_create_root_list(struct dlm_ls *ls)
 			list_add(&r->res_root_list, &ls->ls_root_list);
 			dlm_hold_rsb(r);
 		}
-		read_unlock(&ls->ls_rsbtbl[i].lock);
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
 	}
  out:
 	up_write(&ls->ls_root_sem);
@@ -775,7 +775,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 	int i;
 
 	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
-		write_lock(&ls->ls_rsbtbl[i].lock);
+		spin_lock(&ls->ls_rsbtbl[i].lock);
 		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
 					 res_hashchain) {
 			if (dlm_no_directory(ls) || !is_master(r)) {
@@ -783,7 +783,7 @@ void dlm_clear_toss_list(struct dlm_ls *ls)
 				dlm_free_rsb(r);
 			}
 		}
-		write_unlock(&ls->ls_rsbtbl[i].lock);
+		spin_unlock(&ls->ls_rsbtbl[i].lock);
 	}
 }
 
-- 
cgit v1.2.3


From d3374825ce57ba2214d375023979f6197ccc1385 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 9 Jan 2009 08:31:10 +1100
Subject: md: make devices disappear when they are no longer needed.

Currently md devices, once created, never disappear until the module
is unloaded.  This is essentially because the gendisk holds a
reference to the mddev, and the mddev holds a reference to the
gendisk; this is a circular reference.

If we drop the reference from mddev to gendisk, then we need to ensure
that the mddev is destroyed when the gendisk is destroyed.  However it
is not possible to hook into the gendisk destruction process to enable
this.

So we drop the reference from the gendisk to the mddev and destroy the
gendisk when the mddev gets destroyed.  However this has a
complication.
Between the call
   __blkdev_get->get_gendisk->kobj_lookup->md_probe
and the call
   __blkdev_get->md_open

there is no obvious way to hold a reference on the mddev any more, so
unless something is done, it will disappear and gendisk will be
destroyed prematurely.

Also, once we decide to destroy the mddev, there will be an unlockable
moment before the gendisk is unlinked (blk_unregister_region) during
which a new reference to the gendisk can be created.  We need to
ensure that this reference can not be used.  i.e. the ->open must
fail.

So:
 1/  in md_probe we set a flag in the mddev (hold_active) which
     indicates that the array should be treated as active, even
     though there are no references, and no appearance of activity.
     This is cleared by md_release when the device is closed if it
     is no longer needed.
     This ensures that the gendisk will survive between md_probe and
     md_open.

 2/  In md_open we check if the mddev we expect to open matches
     the gendisk that we did open.
     If there is a mismatch we return -ERESTARTSYS and modify
     __blkdev_get to retry from the top in that case.
     In the -ERESTARTSYS case we make sure to wait until
     the old gendisk (that we succeeded in opening) is really gone so
     we loop at most once.

Some udev configurations will always open an md device when it first
appears.   If we allow an md device that was just created by an open
to disappear on an immediate close, then this can race with such udev
configurations and result in an infinite loop of the device being opened
and closed, then re-opened due to the 'ADD' event from the first open,
and then closed and so on.
So we make sure an md device, once created by an open, remains active
at least until some md 'ioctl' has been made on it.  This means that
all normal usage of md devices will allow them to disappear promptly
when not needed, but the worst that an incorrect usage will do is
cause an inactive md device to be left in existence (it can easily be
removed).

As an array can be stopped by writing to a sysfs attribute
  echo clear > /sys/block/mdXXX/md/array_state
we need to use scheduled work for deleting the gendisk and other
kobjects.  This allows us to wait for any pending gendisk deletion to
complete by simply calling flush_scheduled_work().


Signed-off-by: NeilBrown <neilb@suse.de>
---
 fs/block_dev.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index b957717e25a..8ebbfdf708c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	}
 
 	lock_kernel();
+ restart:
 
 	ret = -ENXIO;
 	disk = get_gendisk(bdev->bd_dev, &partno);
@@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 			if (disk->fops->open) {
 				ret = disk->fops->open(bdev, mode);
+				if (ret == -ERESTARTSYS) {
+					/* Lost a race with 'disk' being
+					 * deleted, try again.
+					 * See md.c
+					 */
+					disk_put_part(bdev->bd_part);
+					bdev->bd_part = NULL;
+					module_put(disk->fops->owner);
+					put_disk(disk);
+					bdev->bd_disk = NULL;
+					mutex_unlock(&bdev->bd_mutex);
+					goto restart;
+				}
 				if (ret)
 					goto out_clear;
 			}
-- 
cgit v1.2.3


From c9a98553d513dfc82cdce869970d5662c1f22c68 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 1 Jan 2009 14:21:16 -0500
Subject: [XFS] pass XFS_IGET_BULKSTAT to xfs_iget for handle operations

NFS clients or users of the handle ioctls can pass us arbitrary inode
numbers through the exportfs interface.  Make sure we use the
XFS_IGET_BULKSTAT so that these don't cause shutdowns due to the corruption
checks.  Also translate the EINVAL we get back for invalid inode clusters
into an ESTALE which is more appropriate, and remove the useless check
for a NULL inode on a successful xfs_iget return.

I have a testcase to reproduce this using the handle interface which
I will submit to xfsqa.

Reported-by: Mario Becroft <mb@gem.win.co.nz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_export.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 595751f7835..87b8cbd23d4 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -126,11 +126,26 @@ xfs_nfs_get_inode(
 	if (ino == 0)
 		return ERR_PTR(-ESTALE);
 
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
+	/*
+	 * The XFS_IGET_BULKSTAT means that an invalid inode number is just
+	 * fine and not an indication of a corrupted filesystem.  Because
+	 * clients can send any kind of invalid file handle, e.g. after
+	 * a restore on the server we have to deal with this case gracefully.
+	 */
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_BULKSTAT,
+			 XFS_ILOCK_SHARED, &ip, 0);
+	if (error) {
+		/*
+		 * EINVAL means the inode cluster doesn't exist anymore.
+		 * This implies the filehandle is stale, so we should
+		 * translate it here.
+		 * We don't use ESTALE directly down the chain to not
+		 * confuse applications using bulkstat that expect EINVAL.
+		 */
+		if (error == EINVAL)
+			error = ESTALE;
 		return ERR_PTR(-error);
-	if (!ip)
-		return ERR_PTR(-EIO);
+	}
 
 	if (ip->i_d.di_gen != generation) {
 		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-- 
cgit v1.2.3


From 9800b550355e99c9bcaba7ec6540751dce0823d7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 1 Jan 2009 16:40:10 -0600
Subject: [XFS] Remove several unused typedefs.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.h | 2 --
 fs/xfs/xfs_acl.h            | 1 -
 fs/xfs/xfs_types.h          | 2 --
 3 files changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 7b26f5ff969..1dd52884975 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -21,8 +21,6 @@
 extern struct workqueue_struct *xfsdatad_workqueue;
 extern mempool_t *xfs_ioend_pool;
 
-typedef void (*xfs_ioend_func_t)(void *);
-
 /*
  * xfs_ioend struct manages large extent writes for XFS.
  * It can manage several multi-page bio's at once.
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index a4e293b93ef..642f1db4def 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -22,7 +22,6 @@
  * Access Control Lists
  */
 typedef __uint16_t	xfs_acl_perm_t;
-typedef __int32_t	xfs_acl_type_t;
 typedef __int32_t	xfs_acl_tag_t;
 typedef __int32_t	xfs_acl_id_t;
 
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 0f5191644ab..baedbd14dc2 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -111,8 +111,6 @@ typedef __uint64_t	xfs_fileoff_t;	/* block number in a file */
 typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
 typedef __uint64_t	xfs_filblks_t;	/* number of blocks in a file */
 
-typedef __uint8_t	xfs_arch_t;	/* architecture of an xfs fs */
-
 /*
  * Null values for the types.
  */
-- 
cgit v1.2.3


From c9fb86a917640d66ba2e0613a12f3a76eda8a30f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Thu, 1 Jan 2009 16:40:11 -0600
Subject: [XFS] Remove macro-to-function indirections in attr code

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c | 72 +++++++++++++++++++++++++-------------------------
 fs/xfs/xfs_attr_leaf.h | 12 ---------
 2 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 79da6b2ea99..6c323f8a4cd 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -736,7 +736,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp)
 			continue;		/* don't copy partial entries */
 		if (!(entry->flags & XFS_ATTR_LOCAL))
 			return(0);
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		name_loc = xfs_attr_leaf_name_local(leaf, i);
 		if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
 			return(0);
 		if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
@@ -823,7 +823,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
 		if (!entry->nameidx)
 			continue;
 		ASSERT(entry->flags & XFS_ATTR_LOCAL);
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+		name_loc = xfs_attr_leaf_name_local(leaf, i);
 		nargs.name = (char *)name_loc->nameval;
 		nargs.namelen = name_loc->namelen;
 		nargs.value = (char *)&name_loc->nameval[nargs.namelen];
@@ -1141,14 +1141,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 	 * as part of this transaction (a split operation for example).
 	 */
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		name_loc->namelen = args->namelen;
 		name_loc->valuelen = cpu_to_be16(args->valuelen);
 		memcpy((char *)name_loc->nameval, args->name, args->namelen);
 		memcpy((char *)&name_loc->nameval[args->namelen], args->value,
 				   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->namelen = args->namelen;
 		memcpy((char *)name_rmt->name, args->name, args->namelen);
 		entry->flags |= XFS_ATTR_INCOMPLETE;
@@ -1159,7 +1159,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
 		args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen);
 	}
 	xfs_da_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
 				   xfs_attr_leaf_entsize(leaf, args->index)));
 
 	/*
@@ -1749,10 +1749,10 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args)
 	/*
 	 * Compress the remaining entries and zero out the removed stuff.
 	 */
-	memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize);
+	memset(xfs_attr_leaf_name(leaf, args->index), 0, entsize);
 	be16_add_cpu(&hdr->usedbytes, -entsize);
 	xfs_da_log_buf(args->trans, bp,
-	     XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index),
+	     XFS_DA_LOGRANGE(leaf, xfs_attr_leaf_name(leaf, args->index),
 				   entsize));
 
 	tmp = (be16_to_cpu(hdr->count) - args->index)
@@ -1985,7 +1985,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			continue;
 		}
 		if (entry->flags & XFS_ATTR_LOCAL) {
-			name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe);
+			name_loc = xfs_attr_leaf_name_local(leaf, probe);
 			if (name_loc->namelen != args->namelen)
 				continue;
 			if (memcmp(args->name, (char *)name_loc->nameval, args->namelen) != 0)
@@ -1995,7 +1995,7 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args)
 			args->index = probe;
 			return(XFS_ERROR(EEXIST));
 		} else {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, probe);
 			if (name_rmt->namelen != args->namelen)
 				continue;
 			if (memcmp(args->name, (char *)name_rmt->name,
@@ -2035,7 +2035,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
 
 	entry = &leaf->entries[args->index];
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		ASSERT(name_loc->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0);
 		valuelen = be16_to_cpu(name_loc->valuelen);
@@ -2050,7 +2050,7 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args)
 		args->valuelen = valuelen;
 		memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
 		valuelen = be32_to_cpu(name_rmt->valuelen);
@@ -2143,7 +2143,7 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 		 * off for 6.2, should be revisited later.
 		 */
 		if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */
-			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
 			be16_add_cpu(&hdr_s->usedbytes, -tmp);
 			be16_add_cpu(&hdr_s->count, -1);
 			entry_d--;	/* to compensate for ++ in loop hdr */
@@ -2160,11 +2160,11 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s,
 			entry_d->flags = entry_s->flags;
 			ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti),
-				XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp);
+			memmove(xfs_attr_leaf_name(leaf_d, desti),
+				xfs_attr_leaf_name(leaf_s, start_s + i), tmp);
 			ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
 							<= XFS_LBSIZE(mp));
-			memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp);
+			memset(xfs_attr_leaf_name(leaf_s, start_s + i), 0, tmp);
 			be16_add_cpu(&hdr_s->usedbytes, -tmp);
 			be16_add_cpu(&hdr_d->usedbytes, tmp);
 			be16_add_cpu(&hdr_s->count, -1);
@@ -2276,12 +2276,12 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
 
 	ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC);
 	if (leaf->entries[index].flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index);
-		size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen,
+		name_loc = xfs_attr_leaf_name_local(leaf, index);
+		size = xfs_attr_leaf_entsize_local(name_loc->namelen,
 						   be16_to_cpu(name_loc->valuelen));
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index);
-		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, index);
+		size = xfs_attr_leaf_entsize_remote(name_rmt->namelen);
 	}
 	return(size);
 }
@@ -2297,13 +2297,13 @@ xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
 {
 	int size;
 
-	size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen);
-	if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) {
+	size = xfs_attr_leaf_entsize_local(namelen, valuelen);
+	if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
 		if (local) {
 			*local = 1;
 		}
 	} else {
-		size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen);
+		size = xfs_attr_leaf_entsize_remote(namelen);
 		if (local) {
 			*local = 0;
 		}
@@ -2372,7 +2372,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 
 		if (entry->flags & XFS_ATTR_LOCAL) {
 			xfs_attr_leaf_name_local_t *name_loc =
-				XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
+				xfs_attr_leaf_name_local(leaf, i);
 
 			retval = context->put_listent(context,
 						entry->flags,
@@ -2384,7 +2384,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
 				return retval;
 		} else {
 			xfs_attr_leaf_name_remote_t *name_rmt =
-				XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+				xfs_attr_leaf_name_remote(leaf, i);
 
 			int valuelen = be32_to_cpu(name_rmt->valuelen);
 
@@ -2468,11 +2468,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 
 #ifdef DEBUG
 	if (entry->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf, args->index);
 		namelen = name_loc->namelen;
 		name = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		namelen = name_rmt->namelen;
 		name = (char *)name_rmt->name;
 	}
@@ -2487,7 +2487,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 
 	if (args->rmtblkno) {
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_da_log_buf(args->trans, bp,
@@ -2534,7 +2534,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	xfs_da_log_buf(args->trans, bp,
 			XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry)));
 	if ((entry->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_da_log_buf(args->trans, bp,
@@ -2607,20 +2607,20 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 
 #ifdef DEBUG
 	if (entry1->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index);
+		name_loc = xfs_attr_leaf_name_local(leaf1, args->index);
 		namelen1 = name_loc->namelen;
 		name1 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
 		namelen1 = name_rmt->namelen;
 		name1 = (char *)name_rmt->name;
 	}
 	if (entry2->flags & XFS_ATTR_LOCAL) {
-		name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2);
+		name_loc = xfs_attr_leaf_name_local(leaf2, args->index2);
 		namelen2 = name_loc->namelen;
 		name2 = (char *)name_loc->nameval;
 	} else {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
 		namelen2 = name_rmt->namelen;
 		name2 = (char *)name_rmt->name;
 	}
@@ -2637,7 +2637,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 			  XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1)));
 	if (args->rmtblkno) {
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index);
+		name_rmt = xfs_attr_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
 		name_rmt->valuelen = cpu_to_be32(args->valuelen);
 		xfs_da_log_buf(args->trans, bp1,
@@ -2648,7 +2648,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	xfs_da_log_buf(args->trans, bp2,
 			  XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2)));
 	if ((entry2->flags & XFS_ATTR_LOCAL) == 0) {
-		name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2);
+		name_rmt = xfs_attr_leaf_name_remote(leaf2, args->index2);
 		name_rmt->valueblk = 0;
 		name_rmt->valuelen = 0;
 		xfs_da_log_buf(args->trans, bp2,
@@ -2855,7 +2855,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
 	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk)
 				count++;
 		}
@@ -2883,7 +2883,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
 	for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
 		if (be16_to_cpu(entry->nameidx) &&
 		    ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
-			name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i);
+			name_rmt = xfs_attr_leaf_name_remote(leaf, i);
 			if (name_rmt->valueblk) {
 				lp->valueblk = be32_to_cpu(name_rmt->valueblk);
 				lp->valuelen = XFS_B_TO_FSB(dp->i_mount,
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 83e9af417ca..9c7d22fdcf4 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -151,8 +151,6 @@ typedef struct xfs_attr_leafblock {
 /*
  * Cast typed pointers for "local" and "remote" name/value structs.
  */
-#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx)	\
-	xfs_attr_leaf_name_remote(leafp,idx)
 static inline xfs_attr_leaf_name_remote_t *
 xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -160,8 +158,6 @@ xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
 		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
 
-#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx)	\
-	xfs_attr_leaf_name_local(leafp,idx)
 static inline xfs_attr_leaf_name_local_t *
 xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 {
@@ -169,8 +165,6 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
 		&((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
 }
 
-#define XFS_ATTR_LEAF_NAME(leafp,idx)		\
-	xfs_attr_leaf_name(leafp,idx)
 static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
 {
 	return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)];
@@ -181,24 +175,18 @@ static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
  * a "local" name/value structure, a "remote" name/value structure, and
  * a pointer which might be either.
  */
-#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen)	\
-	xfs_attr_leaf_entsize_remote(nlen)
 static inline int xfs_attr_leaf_entsize_remote(int nlen)
 {
 	return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
 		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
 
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen)	\
-	xfs_attr_leaf_entsize_local(nlen,vlen)
 static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
 {
 	return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
 		XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
 }
 
-#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize)	\
-	xfs_attr_leaf_entsize_local_max(bsize)
 static inline int xfs_attr_leaf_entsize_local_max(int bsize)
 {
 	return (((bsize) >> 1) + ((bsize) >> 2));
-- 
cgit v1.2.3


From fb82557f16f3700ae4961a4ce599bdaff6a10b1c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 9 Jan 2009 15:53:54 +1100
Subject: [XFS] Remove macro-to-function indirections in the mask code

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_ag.h         |  2 +-
 fs/xfs/xfs_bit.h        | 10 +-----
 fs/xfs/xfs_bmap_btree.c | 84 ++++++++++++++++++++++++-------------------------
 3 files changed, 44 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f2e21817a22..d3b3cf74299 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -231,7 +231,7 @@ typedef struct xfs_perag
 #define	XFS_FSB_TO_AGNO(mp,fsbno)	\
 	((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog))
 #define	XFS_FSB_TO_AGBNO(mp,fsbno)	\
-	((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog)))
+	((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog)))
 #define	XFS_AGB_TO_DADDR(mp,agno,agbno)	\
 	((xfs_daddr_t)XFS_FSB_TO_BB(mp, \
 		(xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno)))
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index bca7b243c31..f1e3c907044 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -23,24 +23,16 @@
  */
 
 /*
- * masks with n high/low bits set, 32-bit values & 64-bit values
+ * masks with n high/low bits set, 64-bit values
  */
-#define	XFS_MASK32HI(n)		xfs_mask32hi(n)
-static inline __uint32_t xfs_mask32hi(int n)
-{
-	return (__uint32_t)-1 << (32 - (n));
-}
-#define	XFS_MASK64HI(n)		xfs_mask64hi(n)
 static inline __uint64_t xfs_mask64hi(int n)
 {
 	return (__uint64_t)-1 << (64 - (n));
 }
-#define	XFS_MASK32LO(n)		xfs_mask32lo(n)
 static inline __uint32_t xfs_mask32lo(int n)
 {
 	return ((__uint32_t)1 << (n)) - 1;
 }
-#define	XFS_MASK64LO(n)		xfs_mask64lo(n)
 static inline __uint64_t xfs_mask64lo(int n)
 {
 	return ((__uint64_t)1 << (n)) - 1;
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 8f1ec73725d..ba6b08c2fb0 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -110,16 +110,16 @@ __xfs_bmbt_get_all(
 
 	ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
 	s->br_startoff = ((xfs_fileoff_t)l0 &
-			   XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+			   xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 #if XFS_BIG_BLKNOS
-	s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) |
+	s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
 			   (((xfs_fsblock_t)l1) >> 21);
 #else
 #ifdef DEBUG
 	{
 		xfs_dfsbno_t	b;
 
-		b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) |
+		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
 		    (((xfs_dfsbno_t)l1) >> 21);
 		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
 		s->br_startblock = (xfs_fsblock_t)b;
@@ -128,7 +128,7 @@ __xfs_bmbt_get_all(
 	s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
 #endif	/* DEBUG */
 #endif	/* XFS_BIG_BLKNOS */
-	s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21));
+	s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
 	/* This is xfs_extent_state() in-line */
 	if (ext_flag) {
 		ASSERT(s->br_blockcount != 0);	/* saved for DMIG */
@@ -153,7 +153,7 @@ xfs_filblks_t
 xfs_bmbt_get_blockcount(
 	xfs_bmbt_rec_host_t	*r)
 {
-	return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21));
+	return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21));
 }
 
 /*
@@ -164,13 +164,13 @@ xfs_bmbt_get_startblock(
 	xfs_bmbt_rec_host_t	*r)
 {
 #if XFS_BIG_BLKNOS
-	return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+	return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	       (((xfs_fsblock_t)r->l1) >> 21);
 #else
 #ifdef DEBUG
 	xfs_dfsbno_t	b;
 
-	b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) |
+	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	    (((xfs_dfsbno_t)r->l1) >> 21);
 	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
 	return (xfs_fsblock_t)b;
@@ -188,7 +188,7 @@ xfs_bmbt_get_startoff(
 	xfs_bmbt_rec_host_t	*r)
 {
 	return ((xfs_fileoff_t)r->l0 &
-		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
 xfs_exntst_t
@@ -219,7 +219,7 @@ xfs_filblks_t
 xfs_bmbt_disk_get_blockcount(
 	xfs_bmbt_rec_t	*r)
 {
-	return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
+	return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21));
 }
 
 /*
@@ -230,7 +230,7 @@ xfs_bmbt_disk_get_startoff(
 	xfs_bmbt_rec_t	*r)
 {
 	return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-		 XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
+		 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
 
@@ -248,33 +248,33 @@ xfs_bmbt_set_allf(
 	int		extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
 
 	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 
 #if XFS_BIG_BLKNOS
-	ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 		((xfs_bmbt_rec_base_t)startoff << 9) |
 		((xfs_bmbt_rec_base_t)startblock >> 43);
 	r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
 		((xfs_bmbt_rec_base_t)blockcount &
-		(xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(startblock)) {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9) |
-			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
-		r->l1 = XFS_MASK64HI(11) |
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)startblock << 21) |
 			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	} else {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9);
 		r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
 			 ((xfs_bmbt_rec_base_t)blockcount &
-			 (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -306,11 +306,11 @@ xfs_bmbt_disk_set_allf(
 	int			extent_flag = (state == XFS_EXT_NORM) ? 0 : 1;
 
 	ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
-	ASSERT((startoff & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0);
-	ASSERT((blockcount & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
+	ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
+	ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
 
 #if XFS_BIG_BLKNOS
-	ASSERT((startblock & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0);
+	ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
 
 	r->l0 = cpu_to_be64(
 		((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -319,17 +319,17 @@ xfs_bmbt_disk_set_allf(
 	r->l1 = cpu_to_be64(
 		((xfs_bmbt_rec_base_t)startblock << 21) |
 		 ((xfs_bmbt_rec_base_t)blockcount &
-		  (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(startblock)) {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			 ((xfs_bmbt_rec_base_t)startoff << 9) |
-			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
-		r->l1 = cpu_to_be64(XFS_MASK64HI(11) |
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
+		r->l1 = cpu_to_be64(xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)startblock << 21) |
 			  ((xfs_bmbt_rec_base_t)blockcount &
-			   (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+			   (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 	} else {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -337,7 +337,7 @@ xfs_bmbt_disk_set_allf(
 		r->l1 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)startblock << 21) |
 			 ((xfs_bmbt_rec_base_t)blockcount &
-			  (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)));
+			  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -362,9 +362,9 @@ xfs_bmbt_set_blockcount(
 	xfs_bmbt_rec_host_t *r,
 	xfs_filblks_t	v)
 {
-	ASSERT((v & XFS_MASK64HI(43)) == 0);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) |
-		  (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21));
+	ASSERT((v & xfs_mask64hi(43)) == 0);
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) |
+		  (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21));
 }
 
 /*
@@ -376,21 +376,21 @@ xfs_bmbt_set_startblock(
 	xfs_fsblock_t	v)
 {
 #if XFS_BIG_BLKNOS
-	ASSERT((v & XFS_MASK64HI(12)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) |
+	ASSERT((v & xfs_mask64hi(12)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
 		  (xfs_bmbt_rec_base_t)(v >> 43);
-	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) |
+	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
 		  (xfs_bmbt_rec_base_t)(v << 21);
 #else	/* !XFS_BIG_BLKNOS */
 	if (ISNULLSTARTBLOCK(v)) {
-		r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
-		r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) |
+		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
+		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	} else {
-		r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9);
+		r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
 		r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
-			  (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21));
+			  (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 	}
 #endif	/* XFS_BIG_BLKNOS */
 }
@@ -403,10 +403,10 @@ xfs_bmbt_set_startoff(
 	xfs_bmbt_rec_host_t *r,
 	xfs_fileoff_t	v)
 {
-	ASSERT((v & XFS_MASK64HI(9)) == 0);
-	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) |
+	ASSERT((v & xfs_mask64hi(9)) == 0);
+	r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) |
 		((xfs_bmbt_rec_base_t)v << 9) |
-		  (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9));
+		  (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
 }
 
 /*
@@ -419,9 +419,9 @@ xfs_bmbt_set_state(
 {
 	ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN);
 	if (v == XFS_EXT_NORM)
-		r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN);
+		r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN);
 	else
-		r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN);
+		r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN);
 }
 
 /*
-- 
cgit v1.2.3


From e6edbd1c1cbef278d58cdd8b046599ba8ac90cfc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 13:42:23 -0500
Subject: [XFS] fix compile of xfs_btree_readahead_lblock on m68k

Change the left/right variables to the proper, always 64-bit xfs_dfsbno_t
type because otherwise compilation fails for Geert on m68k without
CONFIG_LBD:

| fs/xfs/xfs_btree.c: In function 'xfs_btree_readahead_lblock':
| fs/xfs/xfs_btree.c:736: warning: comparison is always true due to limited range of data type
| fs/xfs/xfs_btree.c:741: warning: comparison is always true due to limited range of data type

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_btree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7ed59267420..2c3ef20f884 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -730,8 +730,8 @@ xfs_btree_readahead_lblock(
 	struct xfs_btree_block	*block)
 {
 	int			rval = 0;
-	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
 		xfs_btree_reada_bufl(cur->bc_mp, left, 1);
-- 
cgit v1.2.3


From 15440319767942a363f282d6585303d3d75088ba Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 14:00:00 -0500
Subject: [XFS] truncate readdir offsets to signed 32 bit values

John Stanley reported EOVERFLOW errors in readdir from his self-built
glibc.  I traced this down to glibc enabling d_off overflow checks
in one of the about five million different getdents implementations.

In 2.6.28 Dave Woodhouse moved our readdir double buffering required
for NFS4 readdirplus into nfsd and at that point we lost the capping
of the directory offsets to 32 bit signed values.  John's glibc used
getdents64 to even implement readdir for normal 32 bit offset dirents,
and failed with EOVERFLOW only if this happens on the first dirent in
a getdents call.  I managed to come up with a testcase that uses
raw getdents and does the EOVERFLOW check manually.  We always hit
it with our last entry due to the special end of directory marker.

The patch below is a dumb version of just putting back the masking,
to make sure we have the same behavior as in 2.6.27 and earlier.

I will work on a better and cleaner fix for 2.6.30.

Reported-by: John Stanley <jpsinthemix@verizon.net>
Tested-by: John Stanley <jpsinthemix@verizon.net>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_dir2_block.c |  7 ++++---
 fs/xfs/xfs_dir2_leaf.c  |  6 +++---
 fs/xfs/xfs_dir2_sf.c    | 15 ++++++++-------
 3 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e2fa0a1d8e9..e1f0a06aaf0 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -517,9 +517,9 @@ xfs_dir2_block_getdents(
 		/*
 		 * If it didn't fit, set the final offset to here & return.
 		 */
-		if (filldir(dirent, dep->name, dep->namelen, cook,
+		if (filldir(dirent, dep->name, dep->namelen, cook & 0x7fffffff,
 			    ino, DT_UNKNOWN)) {
-			*offset = cook;
+			*offset = cook & 0x7fffffff;
 			xfs_da_brelse(NULL, bp);
 			return 0;
 		}
@@ -529,7 +529,8 @@ xfs_dir2_block_getdents(
 	 * Reached the end of the block.
 	 * Set the offset to a non-existent block 1 and return.
 	 */
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+			0x7fffffff;
 	xfs_da_brelse(NULL, bp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 93535992cb6..ef805a374ee 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1092,7 +1092,7 @@ xfs_dir2_leaf_getdents(
 		 * Won't fit.  Return to caller.
 		 */
 		if (filldir(dirent, dep->name, dep->namelen,
-			    xfs_dir2_byte_to_dataptr(mp, curoff),
+			    xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff,
 			    ino, DT_UNKNOWN))
 			break;
 
@@ -1108,9 +1108,9 @@ xfs_dir2_leaf_getdents(
 	 * All done.  Set output offset value to current offset.
 	 */
 	if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
-		*offset = XFS_DIR2_MAX_DATAPTR;
+		*offset = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
-		*offset = xfs_dir2_byte_to_dataptr(mp, curoff);
+		*offset = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
 	kmem_free(map);
 	if (bp)
 		xfs_da_brelse(NULL, bp);
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index b46af0013ec..a8a8a6efad5 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -752,8 +752,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, ".", 1, dot_offset, ino, DT_DIR)) {
-			*offset = dot_offset;
+		if (filldir(dirent, ".", 1, dot_offset & 0x7fffffff, ino, DT_DIR)) {
+			*offset = dot_offset & 0x7fffffff;
 			return 0;
 		}
 	}
@@ -766,8 +766,8 @@ xfs_dir2_sf_getdents(
 #if XFS_BIG_INUMS
 		ino += mp->m_inoadd;
 #endif
-		if (filldir(dirent, "..", 2, dotdot_offset, ino, DT_DIR)) {
-			*offset = dotdot_offset;
+		if (filldir(dirent, "..", 2, dotdot_offset & 0x7fffffff, ino, DT_DIR)) {
+			*offset = dotdot_offset & 0x7fffffff;
 			return 0;
 		}
 	}
@@ -791,14 +791,15 @@ xfs_dir2_sf_getdents(
 #endif
 
 		if (filldir(dirent, sfep->name, sfep->namelen,
-					    off, ino, DT_UNKNOWN)) {
-			*offset = off;
+			    off & 0x7fffffff, ino, DT_UNKNOWN)) {
+			*offset = off & 0x7fffffff;
 			return 0;
 		}
 		sfep = xfs_dir2_sf_nextentry(sfp, sfep);
 	}
 
-	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0);
+	*offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
+			0x7fffffff;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 058652a37dd9eac18d6b8c1a311137c679de9dae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 8 Jan 2009 13:42:25 -0500
Subject: [XFS] make xfs_ino_t an unsigned long long

Currently xfs_ino_t is defined as a u64 which can either be an unsigned
long long or, on some 64 bit platforms, an unsigned long.  Just making
it an unsigned long long means it's still always 64 bits wide, but we
don't need to resort to casts to print it.

Fixes a warning regression on 64 bit powerpc in current git.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/xfs_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index baedbd14dc2..b2f724502f1 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -45,7 +45,7 @@ typedef __uint32_t		prid_t;		/* project ID */
 typedef __uint32_t		inst_t;		/* an instruction */
 
 typedef __s64			xfs_off_t;	/* <file offset> type */
-typedef __u64			xfs_ino_t;	/* <inode> type */
+typedef unsigned long long	xfs_ino_t;	/* <inode> type */
 typedef __s64			xfs_daddr_t;	/* <disk address> type */
 typedef char *			xfs_caddr_t;	/* <core address> type */
 typedef __u32			xfs_dev_t;
-- 
cgit v1.2.3


From 958f8c0e4fc311e23a40635a530c01aec366a6e8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:40:44 +1100
Subject: [XFS] remove old vmap cache

XFS's vmap batching simply defers a number (up to 64) of vunmaps, and keeps
track of them in a list. To purge the batch, it just goes through the list and
calls vunmap on each one. This is pretty poor: a global TLB flush is generally
still performed on each vunmap, with the most expensive parts of the operation
being the broadcast IPIs and locking involved in the SMP callouts, and the
locking involved in the vmap management -- none of these are avoided by just
batching up the calls. I'm actually surprised it ever made much difference.
(Now that the lazy vmap allocator is upstream, this description is not quite
right, but the vunmap batching still doesn't seem to do much)

Rip all this logic out of XFS completely. I will improve vmap performance
and scalability directly in a subsequent patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 75 +---------------------------------------------
 1 file changed, 1 insertion(+), 74 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index cb329edc925..0b2177a9fbd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -165,75 +165,6 @@ test_page_region(
 	return (mask && (page_private(page) & mask) == mask);
 }
 
-/*
- *	Mapping of multi-page buffers into contiguous virtual space
- */
-
-typedef struct a_list {
-	void		*vm_addr;
-	struct a_list	*next;
-} a_list_t;
-
-static a_list_t		*as_free_head;
-static int		as_list_len;
-static DEFINE_SPINLOCK(as_lock);
-
-/*
- *	Try to batch vunmaps because they are costly.
- */
-STATIC void
-free_address(
-	void		*addr)
-{
-	a_list_t	*aentry;
-
-#ifdef CONFIG_XEN
-	/*
-	 * Xen needs to be able to make sure it can get an exclusive
-	 * RO mapping of pages it wants to turn into a pagetable.  If
-	 * a newly allocated page is also still being vmap()ed by xfs,
-	 * it will cause pagetable construction to fail.  This is a
-	 * quick workaround to always eagerly unmap pages so that Xen
-	 * is happy.
-	 */
-	vunmap(addr);
-	return;
-#endif
-
-	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
-	if (likely(aentry)) {
-		spin_lock(&as_lock);
-		aentry->next = as_free_head;
-		aentry->vm_addr = addr;
-		as_free_head = aentry;
-		as_list_len++;
-		spin_unlock(&as_lock);
-	} else {
-		vunmap(addr);
-	}
-}
-
-STATIC void
-purge_addresses(void)
-{
-	a_list_t	*aentry, *old;
-
-	if (as_free_head == NULL)
-		return;
-
-	spin_lock(&as_lock);
-	aentry = as_free_head;
-	as_free_head = NULL;
-	as_list_len = 0;
-	spin_unlock(&as_lock);
-
-	while ((old = aentry) != NULL) {
-		vunmap(aentry->vm_addr);
-		aentry = aentry->next;
-		kfree(old);
-	}
-}
-
 /*
  *	Internal xfs_buf_t object manipulation
  */
@@ -333,7 +264,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-			free_address(bp->b_addr - bp->b_offset);
+                       vunmap(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -455,8 +386,6 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		if (as_list_len > 64)
-			purge_addresses();
 		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
 					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
@@ -1743,8 +1672,6 @@ xfsbufd(
 			count++;
 		}
 
-		if (as_list_len > 0)
-			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
-- 
cgit v1.2.3


From 0087167c9d5b1273e7e6bbe39a9ab13bdb9a39bb Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 6 Jan 2009 14:43:09 +1100
Subject: [XFS] use scalable vmap API

Implement XFS's large buffer support with the new vmap APIs. See the vmap
rewrite (db64fe02) for some numbers. The biggest improvement that comes from
using the new APIs is avoiding the global KVA allocation lock on every call.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/linux-2.6/xfs_buf.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0b2177a9fbd..d71dc44e21e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -264,7 +264,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vunmap(bp->b_addr - bp->b_offset);
+                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,8 +386,8 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
-					VM_MAP, PAGE_KERNEL);
+               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+                                       -1, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
-- 
cgit v1.2.3


From e293e97e363e419d8a3628a927321e3f75206a0b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 Jan 2009 13:14:17 -0500
Subject: Btrfs: explicitly mark the tree log root for writeback

Each subvolume has an extent_state_tree used to mark metadata
that needs to be sent to disk while syncing the tree.  This is
used in addition to the dirty bits on the pages themselves so that
a single subvolume can be sent to disk efficiently in disk order.

Normally this marking happens in btrfs_alloc_free_block, which also does
special recording of dirty tree blocks for the tree log roots.

Yan Zheng noticed that when the root of the log tree is allocated, it is added
to the wrong writeback list.  The fix used here is to explicitly set
it dirty as part of tree log creation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 332ec35d2c0..d81cda2e077 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -158,6 +158,19 @@ static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	new_root->ref_cows = 0;
 	new_root->last_trans = trans->transid;
+
+	/*
+	 * we need to make sure the root block for this new tree
+	 * is marked as dirty in the dirty_log_pages tree.  This
+	 * is how it gets flushed down to disk at tree log commit time.
+	 *
+	 * the tree logging mutex keeps others from coming in and changing
+	 * the new_root->node, so we can safely access it here
+	 */
+	set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
+			 new_root->node->start + new_root->node->len - 1,
+			 GFP_NOFS);
+
 fail:
 	return ret;
 }
-- 
cgit v1.2.3


From 54b0d127696aba2ef1ec5430301c968ab539fa0d Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 7 Jan 2009 08:55:39 +0100
Subject: block: fix bug in ptbl lookup cache

Neil writes:

   Hi Jens,

    I've found a little bug for you.  It was introduced by
        a6f23657d3072bde6844055bbc2290e497f33fbc

        block: add one-hit cache for disk partition lookup

    and has the effect of killing my machine whenever I try to assemble
    an md array :-(
    One of the devices in the array has partitions, and mdadm always
    deletes partitions before putting a whole-device in an array (as it
    can cause confusion).  The next IO to that device locks the machine.
    I don't really understand exactly why it locks up, but it happens in
    disk_map_sector_rcu().  This patch fixes it.

Which is due to a missing clear of the (now) stale partition lookup
data. So clear that when we delete a partition.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 5198ada6739..6d720243f5f 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno)
 
 	blk_free_devt(part_devt(part));
 	rcu_assign_pointer(ptbl->part[partno], NULL);
+	rcu_assign_pointer(ptbl->last_lookup, NULL);
 	kobject_put(part->holder_dir);
 	device_del(part_to_dev(part));
 
-- 
cgit v1.2.3


From ab5610b434645518aca6e4de5ad851f9fef006f3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 8 Jan 2009 19:38:07 +0200
Subject: [JFFS2] remove junk prototypes

'rb_prev()', 'rb_next()' and 'rb_replace_node()' are declared in
include/linux/rbtree.h, no need for JFFS2 to re-declare them. I
believe these are left-overs from the old days when the common
RB tree code did not have those calls and JFFS2 had a private
implementation.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 fs/jffs2/nodelist.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 1750445556c..507ed6ec184 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c);
 void jffs2_free_raw_node_refs(struct jffs2_sb_info *c);
 struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset);
 void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
-struct rb_node *rb_next(struct rb_node *);
-struct rb_node *rb_prev(struct rb_node *);
-void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c,
-- 
cgit v1.2.3


From b32714ba29358a688ef337d5297bf4bdc9f596dc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 9 Jan 2009 07:04:15 -0800
Subject: partial revert of asynchronous inode delete

let the core of this one bake in -next as well, but leave
some of the infrastructure in place.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/inode.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/inode.c b/fs/inode.c
index 0013ac1af8e..913ab2d9a5d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1139,11 +1139,16 @@ EXPORT_SYMBOL(remove_inode_hash);
  * I_FREEING is set so that no-one will take a new reference to the inode while
  * it is being deleted.
  */
-static void generic_delete_inode_async(void *data, async_cookie_t cookie)
+void generic_delete_inode(struct inode *inode)
 {
-	struct inode *inode = data;
 	const struct super_operations *op = inode->i_sb->s_op;
 
+	list_del_init(&inode->i_list);
+	list_del_init(&inode->i_sb_list);
+	inode->i_state |= I_FREEING;
+	inodes_stat.nr_inodes--;
+	spin_unlock(&inode_lock);
+
 	security_inode_delete(inode);
 
 	if (op->delete_inode) {
@@ -1167,16 +1172,6 @@ static void generic_delete_inode_async(void *data, async_cookie_t cookie)
 	destroy_inode(inode);
 }
 
-void generic_delete_inode(struct inode *inode)
-{
-	list_del_init(&inode->i_list);
-	list_del_init(&inode->i_sb_list);
-	inode->i_state |= I_FREEING;
-	inodes_stat.nr_inodes--;
-	spin_unlock(&inode_lock);
-	async_schedule_special(generic_delete_inode_async, inode, &inode->i_sb->s_async_list);
-}
-
 EXPORT_SYMBOL(generic_delete_inode);
 
 static void generic_forget_inode(struct inode *inode)
-- 
cgit v1.2.3


From 2d96d1053d97cf0db832c4545bfb02a337043e09 Mon Sep 17 00:00:00 2001
From: David Brownell <dbrownell@users.sourceforge.net>
Date: Fri, 9 Jan 2009 16:40:52 -0800
Subject: CORE_DUMP_DEFAULT_ELF_HEADERS depends on ELF_CORE

Kernels that don't support ELF coredumps at all surely can't be supporting
new partial-segment flavored ELF coredumps ...  don't make folk answer
Kconfig questions about that flavor.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/Kconfig.binfmt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index ce9fb3fbfae..bb4cc5b8abc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -43,7 +43,7 @@ config BINFMT_ELF_FDPIC
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default n
-	depends on BINFMT_ELF
+	depends on BINFMT_ELF && ELF_CORE
 	help
 	  ELF core dump files describe each memory mapping of the crashed
 	  process, and can contain or omit the memory contents of each one.
-- 
cgit v1.2.3


From c4be0c1dc4cdc37b175579be1460f15ac6495e9a Mon Sep 17 00:00:00 2001
From: Takashi Sato <t-sato@yk.jp.nec.com>
Date: Fri, 9 Jan 2009 16:40:58 -0800
Subject: filesystem freeze: add error handling of write_super_lockfs/unlockfs

Currently, ext3 in mainline Linux doesn't have the freeze feature which
suspends write requests.  So, we cannot take a backup which keeps the
filesystem's consistency with the storage device's features (snapshot and
replication) while it is mounted.

In many cases, a commercial filesystem (e.g.  VxFS) has the freeze feature
and it would be used to get the consistent backup.

If Linux's standard filesystem ext3 has the freeze feature, we can do it
without a commercial filesystem.

So I have implemented the ioctls of the freeze feature.
I think we can take the consistent backup with the following steps.
1. Freeze the filesystem with the freeze ioctl.
2. Separate the replication volume or create the snapshot
   with the storage device's feature.
3. Unfreeze the filesystem with the unfreeze ioctl.
4. Take the backup from the separated replication volume
   or the snapshot.

This patch:

VFS:
Changed the type of write_super_lockfs and unlockfs from "void"
to "int" so that they can return an error.
Renamed the write_super_lockfs and unlockfs super block operations to
freeze_fs and unfreeze_fs to avoid confusion.

ext3, ext4, xfs, gfs2, jfs:
Changed the type of write_super_lockfs and unlockfs from "void"
to "int" so that write_super_lockfs returns an error if needed,
and unlockfs always returns 0.

reiserfs:
Changed the type of write_super_lockfs and unlockfs from "void"
to "int" so that they always return 0 (success) to keep the current behavior.

Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com>
Cc: <xfs-masters@oss.sgi.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c                  |  8 ++++----
 fs/ext3/super.c              | 45 ++++++++++++++++++++++++++++----------------
 fs/ext4/super.c              | 45 ++++++++++++++++++++++++++++++--------------
 fs/gfs2/ops_super.c          | 16 +++++++++-------
 fs/jfs/super.c               | 10 ++++++----
 fs/reiserfs/super.c          | 10 ++++++----
 fs/xfs/linux-2.6/xfs_super.c |  8 ++++----
 fs/xfs/xfs_fsops.c           | 11 +++++++----
 fs/xfs/xfs_fsops.h           |  2 +-
 9 files changed, 97 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index c26da785938..87f9e537b8c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -221,8 +221,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 
 		sync_blockdev(sb->s_bdev);
 
-		if (sb->s_op->write_super_lockfs)
-			sb->s_op->write_super_lockfs(sb);
+		if (sb->s_op->freeze_fs)
+			sb->s_op->freeze_fs(sb);
 	}
 
 	sync_blockdev(bdev);
@@ -242,8 +242,8 @@ void thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	if (sb) {
 		BUG_ON(sb->s_bdev != bdev);
 
-		if (sb->s_op->unlockfs)
-			sb->s_op->unlockfs(sb);
+		if (sb->s_op->unfreeze_fs)
+			sb->s_op->unfreeze_fs(sb);
 		sb->s_frozen = SB_UNFROZEN;
 		smp_wmb();
 		wake_up(&sb->s_wait_unfrozen);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 5d047a030a7..b70d90e08a3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -48,8 +48,8 @@ static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
 			       unsigned int);
-static void ext3_commit_super (struct super_block * sb,
-			       struct ext3_super_block * es,
+static int ext3_commit_super(struct super_block *sb,
+			       struct ext3_super_block *es,
 			       int sync);
 static void ext3_mark_recovery_complete(struct super_block * sb,
 					struct ext3_super_block * es);
@@ -60,9 +60,9 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 				     char nbuf[16]);
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
-static void ext3_unlockfs(struct super_block *sb);
+static int ext3_unfreeze(struct super_block *sb);
 static void ext3_write_super (struct super_block * sb);
-static void ext3_write_super_lockfs(struct super_block *sb);
+static int ext3_freeze(struct super_block *sb);
 
 /*
  * Wrappers for journal_start/end.
@@ -759,8 +759,8 @@ static const struct super_operations ext3_sops = {
 	.put_super	= ext3_put_super,
 	.write_super	= ext3_write_super,
 	.sync_fs	= ext3_sync_fs,
-	.write_super_lockfs = ext3_write_super_lockfs,
-	.unlockfs	= ext3_unlockfs,
+	.freeze_fs	= ext3_freeze,
+	.unfreeze_fs	= ext3_unfreeze,
 	.statfs		= ext3_statfs,
 	.remount_fs	= ext3_remount,
 	.clear_inode	= ext3_clear_inode,
@@ -2311,21 +2311,23 @@ static int ext3_create_journal(struct super_block * sb,
 	return 0;
 }
 
-static void ext3_commit_super (struct super_block * sb,
-			       struct ext3_super_block * es,
+static int ext3_commit_super(struct super_block *sb,
+			       struct ext3_super_block *es,
 			       int sync)
 {
 	struct buffer_head *sbh = EXT3_SB(sb)->s_sbh;
+	int error = 0;
 
 	if (!sbh)
-		return;
+		return error;
 	es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync)
-		sync_dirty_buffer(sbh);
+		error = sync_dirty_buffer(sbh);
+	return error;
 }
 
 
@@ -2439,12 +2441,14 @@ static int ext3_sync_fs(struct super_block *sb, int wait)
  * LVM calls this function before a (read-only) snapshot is created.  This
  * gives us a chance to flush the journal completely and mark the fs clean.
  */
-static void ext3_write_super_lockfs(struct super_block *sb)
+static int ext3_freeze(struct super_block *sb)
 {
+	int error = 0;
+	journal_t *journal;
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		journal_t *journal = EXT3_SB(sb)->s_journal;
+		journal = EXT3_SB(sb)->s_journal;
 
 		/* Now we set up the journal barrier. */
 		journal_lock_updates(journal);
@@ -2453,20 +2457,28 @@ static void ext3_write_super_lockfs(struct super_block *sb)
 		 * We don't want to clear needs_recovery flag when we failed
 		 * to flush the journal.
 		 */
-		if (journal_flush(journal) < 0)
-			return;
+		error = journal_flush(journal);
+		if (error < 0)
+			goto out;
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+		error = ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);
+		if (error)
+			goto out;
 	}
+	return 0;
+
+out:
+	journal_unlock_updates(journal);
+	return error;
 }
 
 /*
  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
  * flag here, even though the filesystem is not technically dirty yet.
  */
-static void ext3_unlockfs(struct super_block *sb)
+static int ext3_unfreeze(struct super_block *sb)
 {
 	if (!(sb->s_flags & MS_RDONLY)) {
 		lock_super(sb);
@@ -2476,6 +2488,7 @@ static void ext3_unlockfs(struct super_block *sb)
 		unlock_super(sb);
 		journal_unlock_updates(EXT3_SB(sb)->s_journal);
 	}
+	return 0;
 }
 
 static int ext3_remount (struct super_block * sb, int * flags, char * data)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f7e0be8ab1..e5f06a5f045 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -51,7 +51,7 @@ struct proc_dir_entry *ext4_proc_root;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static void ext4_commit_super(struct super_block *sb,
+static int ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
@@ -62,9 +62,9 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
 static int ext4_remount(struct super_block *sb, int *flags, char *data);
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
-static void ext4_unlockfs(struct super_block *sb);
+static int ext4_unfreeze(struct super_block *sb);
 static void ext4_write_super(struct super_block *sb);
-static void ext4_write_super_lockfs(struct super_block *sb);
+static int ext4_freeze(struct super_block *sb);
 
 
 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -978,8 +978,8 @@ static const struct super_operations ext4_sops = {
 	.put_super	= ext4_put_super,
 	.write_super	= ext4_write_super,
 	.sync_fs	= ext4_sync_fs,
-	.write_super_lockfs = ext4_write_super_lockfs,
-	.unlockfs	= ext4_unlockfs,
+	.freeze_fs	= ext4_freeze,
+	.unfreeze_fs	= ext4_unfreeze,
 	.statfs		= ext4_statfs,
 	.remount_fs	= ext4_remount,
 	.clear_inode	= ext4_clear_inode,
@@ -2888,13 +2888,14 @@ static int ext4_load_journal(struct super_block *sb,
 	return 0;
 }
 
-static void ext4_commit_super(struct super_block *sb,
+static int ext4_commit_super(struct super_block *sb,
 			      struct ext4_super_block *es, int sync)
 {
 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
+	int error = 0;
 
 	if (!sbh)
-		return;
+		return error;
 	if (buffer_write_io_error(sbh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the
@@ -2918,14 +2919,19 @@ static void ext4_commit_super(struct super_block *sb,
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
-		sync_dirty_buffer(sbh);
-		if (buffer_write_io_error(sbh)) {
+		error = sync_dirty_buffer(sbh);
+		if (error)
+			return error;
+
+		error = buffer_write_io_error(sbh);
+		if (error) {
 			printk(KERN_ERR "EXT4-fs: I/O error while writing "
 			       "superblock for %s.\n", sb->s_id);
 			clear_buffer_write_io_error(sbh);
 			set_buffer_uptodate(sbh);
 		}
 	}
+	return error;
 }
 
 
@@ -3058,12 +3064,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
  * LVM calls this function before a (read-only) snapshot is created.  This
  * gives us a chance to flush the journal completely and mark the fs clean.
  */
-static void ext4_write_super_lockfs(struct super_block *sb)
+static int ext4_freeze(struct super_block *sb)
 {
+	int error = 0;
+	journal_t *journal;
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		journal_t *journal = EXT4_SB(sb)->s_journal;
+		journal = EXT4_SB(sb)->s_journal;
 
 		if (journal) {
 			/* Now we set up the journal barrier. */
@@ -3073,21 +3081,29 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 			 * We don't want to clear needs_recovery flag when we
 			 * failed to flush the journal.
 			 */
-			if (jbd2_journal_flush(journal) < 0)
-				return;
+			error = jbd2_journal_flush(journal);
+			if (error < 0)
+				goto out;
 		}
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		if (error)
+			goto out;
 	}
+	return 0;
+out:
+	jbd2_journal_unlock_updates(journal);
+	return error;
 }
 
 /*
  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
  * flag here, even though the filesystem is not technically dirty yet.
  */
-static void ext4_unlockfs(struct super_block *sb)
+static int ext4_unfreeze(struct super_block *sb)
 {
 	if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
 		lock_super(sb);
@@ -3097,6 +3113,7 @@ static void ext4_unlockfs(struct super_block *sb)
 		unlock_super(sb);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	}
+	return 0;
 }
 
 static int ext4_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 777783deddc..320323d0347 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -211,18 +211,18 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 }
 
 /**
- * gfs2_write_super_lockfs - prevent further writes to the filesystem
+ * gfs2_freeze - prevent further writes to the filesystem
  * @sb: the VFS structure for the filesystem
  *
  */
 
-static void gfs2_write_super_lockfs(struct super_block *sb)
+static int gfs2_freeze(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
 	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return;
+		return -EINVAL;
 
 	for (;;) {
 		error = gfs2_freeze_fs(sdp);
@@ -242,17 +242,19 @@ static void gfs2_write_super_lockfs(struct super_block *sb)
 		fs_err(sdp, "retrying...\n");
 		msleep(1000);
 	}
+	return 0;
 }
 
 /**
- * gfs2_unlockfs - reallow writes to the filesystem
+ * gfs2_unfreeze - reallow writes to the filesystem
  * @sb: the VFS structure for the filesystem
  *
  */
 
-static void gfs2_unlockfs(struct super_block *sb)
+static int gfs2_unfreeze(struct super_block *sb)
 {
 	gfs2_unfreeze_fs(sb->s_fs_info);
+	return 0;
 }
 
 /**
@@ -688,8 +690,8 @@ const struct super_operations gfs2_super_ops = {
 	.put_super		= gfs2_put_super,
 	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
-	.write_super_lockfs 	= gfs2_write_super_lockfs,
-	.unlockfs		= gfs2_unlockfs,
+	.freeze_fs 		= gfs2_freeze,
+	.unfreeze_fs		= gfs2_unfreeze,
 	.statfs			= gfs2_statfs,
 	.remount_fs		= gfs2_remount_fs,
 	.clear_inode		= gfs2_clear_inode,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 0dae345e481..b37d1f78b85 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -543,7 +543,7 @@ out_kfree:
 	return ret;
 }
 
-static void jfs_write_super_lockfs(struct super_block *sb)
+static int jfs_freeze(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
@@ -553,9 +553,10 @@ static void jfs_write_super_lockfs(struct super_block *sb)
 		lmLogShutdown(log);
 		updateSuper(sb, FM_CLEAN);
 	}
+	return 0;
 }
 
-static void jfs_unlockfs(struct super_block *sb)
+static int jfs_unfreeze(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
@@ -568,6 +569,7 @@ static void jfs_unlockfs(struct super_block *sb)
 		else
 			txResume(sb);
 	}
+	return 0;
 }
 
 static int jfs_get_sb(struct file_system_type *fs_type,
@@ -735,8 +737,8 @@ static const struct super_operations jfs_super_operations = {
 	.delete_inode	= jfs_delete_inode,
 	.put_super	= jfs_put_super,
 	.sync_fs	= jfs_sync_fs,
-	.write_super_lockfs = jfs_write_super_lockfs,
-	.unlockfs       = jfs_unlockfs,
+	.freeze_fs	= jfs_freeze,
+	.unfreeze_fs	= jfs_unfreeze,
 	.statfs		= jfs_statfs,
 	.remount_fs	= jfs_remount,
 	.show_options	= jfs_show_options,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index c55651f1407..f3c820b7582 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -83,7 +83,7 @@ static void reiserfs_write_super(struct super_block *s)
 	reiserfs_sync_fs(s, 1);
 }
 
-static void reiserfs_write_super_lockfs(struct super_block *s)
+static int reiserfs_freeze(struct super_block *s)
 {
 	struct reiserfs_transaction_handle th;
 	reiserfs_write_lock(s);
@@ -101,11 +101,13 @@ static void reiserfs_write_super_lockfs(struct super_block *s)
 	}
 	s->s_dirt = 0;
 	reiserfs_write_unlock(s);
+	return 0;
 }
 
-static void reiserfs_unlockfs(struct super_block *s)
+static int reiserfs_unfreeze(struct super_block *s)
 {
 	reiserfs_allow_writes(s);
+	return 0;
 }
 
 extern const struct in_core_key MAX_IN_CORE_KEY;
@@ -613,8 +615,8 @@ static const struct super_operations reiserfs_sops = {
 	.put_super = reiserfs_put_super,
 	.write_super = reiserfs_write_super,
 	.sync_fs = reiserfs_sync_fs,
-	.write_super_lockfs = reiserfs_write_super_lockfs,
-	.unlockfs = reiserfs_unlockfs,
+	.freeze_fs = reiserfs_freeze,
+	.unfreeze_fs = reiserfs_unfreeze,
 	.statfs = reiserfs_statfs,
 	.remount_fs = reiserfs_remount,
 	.show_options = generic_show_options,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index be846d606ae..95a97108036 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1269,14 +1269,14 @@ xfs_fs_remount(
  * need to take care of the metadata. Once that's done write a dummy
  * record to dirty the log in case of a crash while frozen.
  */
-STATIC void
-xfs_fs_lockfs(
+STATIC int
+xfs_fs_freeze(
 	struct super_block	*sb)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_quiesce_attr(mp);
-	xfs_fs_log_dummy(mp);
+	return -xfs_fs_log_dummy(mp);
 }
 
 STATIC int
@@ -1557,7 +1557,7 @@ static struct super_operations xfs_super_operations = {
 	.put_super		= xfs_fs_put_super,
 	.write_super		= xfs_fs_write_super,
 	.sync_fs		= xfs_fs_sync_super,
-	.write_super_lockfs	= xfs_fs_lockfs,
+	.freeze_fs		= xfs_fs_freeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
 	.show_options		= xfs_fs_show_options,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 852b6d32e8d..680d0e0ec93 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -595,17 +595,19 @@ out:
 	return 0;
 }
 
-void
+int
 xfs_fs_log_dummy(
 	xfs_mount_t	*mp)
 {
 	xfs_trans_t	*tp;
 	xfs_inode_t	*ip;
+	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-	if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) {
+	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return;
+		return error;
 	}
 
 	ip = mp->m_rootip;
@@ -615,9 +617,10 @@ xfs_fs_log_dummy(
 	xfs_trans_ihold(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_set_sync(tp);
-	xfs_trans_commit(tp, 0);
+	error = xfs_trans_commit(tp, 0);
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
 }
 
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 300d0c9d61a..88435e0a77c 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern void xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp);
 
 #endif	/* __XFS_FSOPS_H__ */
-- 
cgit v1.2.3


From fcccf502540e3d752d33b2d8e976034dee81f9f7 Mon Sep 17 00:00:00 2001
From: Takashi Sato <t-sato@yk.jp.nec.com>
Date: Fri, 9 Jan 2009 16:40:59 -0800
Subject: filesystem freeze: implement generic freeze feature

The ioctls for the generic freeze feature are below.
o Freeze the filesystem
  int ioctl(int fd, int FIFREEZE, arg)
    fd: The file descriptor of the mountpoint
    FIFREEZE: request code for the freeze
    arg: Ignored
    Return value: 0 if the operation succeeds. Otherwise, -1

o Unfreeze the filesystem
  int ioctl(int fd, int FITHAW, arg)
    fd: The file descriptor of the mountpoint
    FITHAW: request code for unfreeze
    arg: Ignored
    Return value: 0 if the operation succeeds. Otherwise, -1
    Error number: If the filesystem has already been unfrozen,
                  errno is set to EINVAL.

[akpm@linux-foundation.org: fix CONFIG_BLOCK=n]
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
Signed-off-by: Masayuki Hamaguchi <m-hamaguchi@ys.jp.nec.com>
Cc: <xfs-masters@oss.sgi.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c |  2 ++
 fs/buffer.c    | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/ioctl.c     | 46 ++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index ac7031f12ea..b3c1efff5e1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -285,6 +285,8 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&bdev->bd_holder_list);
 #endif
 	inode_init_once(&ei->vfs_inode);
+	/* Initialize mutex for freeze. */
+	mutex_init(&bdev->bd_fsfreeze_mutex);
 }
 
 static inline void __bd_forget(struct inode *inode)
diff --git a/fs/buffer.c b/fs/buffer.c
index 87f9e537b8c..b6e8b8632e2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -203,10 +203,25 @@ int fsync_bdev(struct block_device *bdev)
  * happen on bdev until thaw_bdev() is called.
  * If a superblock is found on this device, we take the s_umount semaphore
  * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
  */
 struct super_block *freeze_bdev(struct block_device *bdev)
 {
 	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		bdev->bd_fsfreeze_count++;
+		sb = get_super(bdev);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+	bdev->bd_fsfreeze_count++;
 
 	down(&bdev->bd_mount_sem);
 	sb = get_super(bdev);
@@ -221,11 +236,24 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 
 		sync_blockdev(sb->s_bdev);
 
-		if (sb->s_op->freeze_fs)
-			sb->s_op->freeze_fs(sb);
+		if (sb->s_op->freeze_fs) {
+			error = sb->s_op->freeze_fs(sb);
+			if (error) {
+				printk(KERN_ERR
+					"VFS:Filesystem freeze failed\n");
+				sb->s_frozen = SB_UNFROZEN;
+				drop_super(sb);
+				up(&bdev->bd_mount_sem);
+				bdev->bd_fsfreeze_count--;
+				mutex_unlock(&bdev->bd_fsfreeze_mutex);
+				return ERR_PTR(error);
+			}
+		}
 	}
 
 	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
 	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -237,20 +265,48 @@ EXPORT_SYMBOL(freeze_bdev);
  *
  * Unlocks the filesystem and marks it writeable again after freeze_bdev().
  */
-void thaw_bdev(struct block_device *bdev, struct super_block *sb)
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EINVAL;
+	}
+
+	bdev->bd_fsfreeze_count--;
+	if (bdev->bd_fsfreeze_count > 0) {
+		if (sb)
+			drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
 	if (sb) {
 		BUG_ON(sb->s_bdev != bdev);
-
-		if (sb->s_op->unfreeze_fs)
-			sb->s_op->unfreeze_fs(sb);
-		sb->s_frozen = SB_UNFROZEN;
-		smp_wmb();
-		wake_up(&sb->s_wait_unfrozen);
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (sb->s_op->unfreeze_fs) {
+				error = sb->s_op->unfreeze_fs(sb);
+				if (error) {
+					printk(KERN_ERR
+						"VFS:Filesystem thaw failed\n");
+					sb->s_frozen = SB_FREEZE_TRANS;
+					bdev->bd_fsfreeze_count++;
+					mutex_unlock(&bdev->bd_fsfreeze_mutex);
+					return error;
+				}
+			}
+			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
+		}
 		drop_super(sb);
 	}
 
 	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
 }
 EXPORT_SYMBOL(thaw_bdev);
 
diff --git a/fs/ioctl.c b/fs/ioctl.c
index cc3f1aa1cf7..20b0a8a24c6 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -439,6 +439,43 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 	return error;
 }
 
+static int ioctl_fsfreeze(struct file *filp)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support freeze feature, return. */
+	if (sb->s_op->freeze_fs == NULL)
+		return -EOPNOTSUPP;
+
+	/* If a blockdevice-backed filesystem isn't specified, return. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	/* Freeze */
+	sb = freeze_bdev(sb->s_bdev);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+	return 0;
+}
+
+static int ioctl_fsthaw(struct file *filp)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	/* Thaw */
+	return thaw_bdev(sb->s_bdev, sb);
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -486,6 +523,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		} else
 			error = -ENOTTY;
 		break;
+
+	case FIFREEZE:
+		error = ioctl_fsfreeze(filp);
+		break;
+
+	case FITHAW:
+		error = ioctl_fsthaw(filp);
+		break;
+
 	default:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
-- 
cgit v1.2.3


From 8e961870bb9804110d5c8211d5d9d500451c4518 Mon Sep 17 00:00:00 2001
From: Takashi Sato <t-sato@yk.jp.nec.com>
Date: Fri, 9 Jan 2009 16:41:00 -0800
Subject: filesystem freeze: remove XFS specific ioctl interfaces for freeze
 feature

Remove the XFS-specific ioctl interfaces and request codes
for the freeze feature.

This patch has been supplied by David Chinner.

Signed-off-by: Dave Chinner <dgc@sgi.com>
Signed-off-by: Takashi Sato <t-sato@yk.jp.nec.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: <xfs-masters@oss.sgi.com>
Cc: <linux-ext4@vger.kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Kleikamp <shaggy@austin.ibm.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/xfs/linux-2.6/xfs_ioctl.c   | 15 ---------------
 fs/xfs/linux-2.6/xfs_ioctl32.c |  2 --
 fs/xfs/xfs_fs.h                |  4 ++--
 3 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 67205f6198b..e5be1e0be80 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -1546,21 +1546,6 @@ xfs_file_ioctl(
 		return -error;
 	}
 
-	case XFS_IOC_FREEZE:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (inode->i_sb->s_frozen == SB_UNFROZEN)
-			freeze_bdev(inode->i_sb->s_bdev);
-		return 0;
-
-	case XFS_IOC_THAW:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		if (inode->i_sb->s_frozen != SB_UNFROZEN)
-			thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
-		return 0;
-
 	case XFS_IOC_GOINGDOWN: {
 		__uint32_t in;
 
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0504cece9f6..50903ad3182 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -632,8 +632,6 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_SET_RESBLKS:
 	case XFS_IOC_GET_RESBLKS:
 	case XFS_IOC_FSGROWFSLOG:
-	case XFS_IOC_FREEZE:
-	case XFS_IOC_THAW:
 	case XFS_IOC_GOINGDOWN:
 	case XFS_IOC_ERROR_INJECTION:
 	case XFS_IOC_ERROR_CLEARALL:
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 589c41c3844..f7c06fac822 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -465,8 +465,8 @@ typedef struct xfs_handle {
 #define XFS_IOC_ERROR_INJECTION	     _IOW ('X', 116, struct xfs_error_injection)
 #define XFS_IOC_ERROR_CLEARALL	     _IOW ('X', 117, struct xfs_error_injection)
 /*	XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118	 */
-#define XFS_IOC_FREEZE		     _IOWR('X', 119, int)
-#define XFS_IOC_THAW		     _IOWR('X', 120, int)
+/*	XFS_IOC_FREEZE		  -- FIFREEZE   119	 */
+/*	XFS_IOC_THAW		  -- FITHAW     120	 */
 #define XFS_IOC_FSSETDM_BY_HANDLE    _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
 #define XFS_IOC_ATTRLIST_BY_HANDLE   _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
 #define XFS_IOC_ATTRMULTI_BY_HANDLE  _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
-- 
cgit v1.2.3


From 0176260fc30842e358cf34afa7dcd9413db44822 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 10 Jan 2009 06:09:52 -0800
Subject: btrfs: fix for write_super_lockfs/unlockfs error handling

Commit c4be0c1dc4cdc37b175579be1460f15ac6495e9a added the ability for
write_super_lockfs to return errors, and renamed them to match.  But
btrfs didn't get converted.

Do the minimal conversion to make it compile again.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/super.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b4c101d9322..0a14b495532 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -605,18 +605,20 @@ out:
 	return ret;
 }
 
-static void btrfs_write_super_lockfs(struct super_block *sb)
+static int btrfs_freeze(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 	mutex_lock(&root->fs_info->cleaner_mutex);
+	return 0;
 }
 
-static void btrfs_unlockfs(struct super_block *sb)
+static int btrfs_unfreeze(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+	return 0;
 }
 
 static struct super_operations btrfs_super_ops = {
@@ -631,8 +633,8 @@ static struct super_operations btrfs_super_ops = {
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
 	.remount_fs	= btrfs_remount,
-	.write_super_lockfs = btrfs_write_super_lockfs,
-	.unlockfs	= btrfs_unlockfs,
+	.freeze_fs	= btrfs_freeze,
+	.unfreeze_fs	= btrfs_unfreeze,
 };
 
 static const struct file_operations btrfs_ctl_fops = {
-- 
cgit v1.2.3


From c225aa57ff4ffe715df4692676b77c815a337236 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Holm=20Th=C3=B8gersen?= <odie@cs.aau.dk>
Date: Sun, 11 Jan 2009 22:34:01 -0500
Subject: ext4: fix wrong use of do_div
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following warning:

fs/jbd2/journal.c: In function ‘jbd2_seq_info_show’:
fs/jbd2/journal.c:850: warning: format ‘%lu’ expects type ‘long
unsigned int’, but argument 3 has type ‘uint32_t’

is caused by wrong usage of do_div that modifies the dividend in-place
and returns the quotient. So not only would an incorrect value be
displayed, but s->journal->j_average_commit_time would also be changed
to a wrong value!

Fix it by using div_u64 instead.

Signed-off-by: Simon Holm Thøgersen <odie@cs.aau.dk>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/journal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 56675306ed8..eb343008ede 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -37,10 +37,10 @@
 #include <linux/proc_fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
+#include <linux/math64.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
-#include <asm/div64.h>
 
 EXPORT_SYMBOL(jbd2_journal_start);
 EXPORT_SYMBOL(jbd2_journal_restart);
@@ -846,8 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
 	seq_printf(seq, "  %ums logging transaction\n",
 	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
-	seq_printf(seq, "  %luus average transaction commit time\n",
-		   do_div(s->journal->j_average_commit_time, 1000));
+	seq_printf(seq, "  %lluus average transaction commit time\n",
+		   div_u64(s->journal->j_average_commit_time, 1000));
 	seq_printf(seq, "  %lu handles per transaction\n",
 	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
 	seq_printf(seq, "  %lu blocks per transaction\n",
-- 
cgit v1.2.3


From 62568510b8e2679cbc331d7de10ea9ba81ae8b3d Mon Sep 17 00:00:00 2001
From: Bernd Schmidt <bernds_cb1@t-online.de>
Date: Tue, 13 Jan 2009 22:14:48 +0100
Subject: Fix timeouts in sys_pselect7

Since we (Analog Devices) updated our Blackfin kernel to 2.6.28, we've
seen occasional 5-second hangs from telnet.  telnetd calls select with a
NULL timeout, but with the new kernel, the system call occasionally
returns 0, which causes telnet to call sleep (5).  This did not happen
with earlier kernels.

The code in sys_pselect7 looks a bit strange, in particular the variable
"to" is initialized to NULL, then changed if a non-null timeout was
passed in, but not used further.  It needs to be passed to
core_sys_select instead of &end_time.

This bug was introduced by 8ff3e8e85fa6c312051134b3953e397feb639f51
("select: switch select() and poll() over to hrtimers").

Signed-off-by: Bernd Schmidt <bernd.schmidt@analog.com>
Reviewed-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Robin Getz <rgetz@blackfin.uclinux.org>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/select.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/select.c b/fs/select.c
index 08b91beed80..b0cf1f0896d 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -610,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
 	}
 
-	ret = core_sys_select(n, inp, outp, exp, &end_time);
+	ret = core_sys_select(n, inp, outp, exp, to);
 	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
 
 	if (ret == -ERESTARTNOHAND) {
-- 
cgit v1.2.3


From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:54 +0100
Subject: [CVE-2009-0029] Convert all system calls to return a long

Convert all system calls to return a long. This should be a NOP since all
converted types should have the same size anyway, with the exception of
sys_exit_group, which returned void. That doesn't matter, though, since the
system call doesn't return.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/read_write.c | 18 +++++++++---------
 fs/xattr.c      | 12 ++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 5cc6924eb15..940367f51f2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
+asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
 {
 	off_t retval;
 	struct file * file;
@@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 	file->f_pos = pos;
 }
 
-asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
+asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -386,7 +386,7 @@ asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
 	return ret;
 }
 
-asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
+asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -403,7 +403,7 @@ asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t co
 	return ret;
 }
 
-asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
+asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
 			     size_t count, loff_t pos)
 {
 	struct file *file;
@@ -424,7 +424,7 @@ asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
 	return ret;
 }
 
-asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
+asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
 			      size_t count, loff_t pos)
 {
 	struct file *file;
@@ -672,7 +672,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 
 EXPORT_SYMBOL(vfs_writev);
 
-asmlinkage ssize_t
+asmlinkage long
 sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 {
 	struct file *file;
@@ -693,7 +693,7 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 	return ret;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 {
 	struct file *file;
@@ -812,7 +812,7 @@ out:
 	return retval;
 }
 
-asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
+asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
 {
 	loff_t pos;
 	off_t off;
@@ -831,7 +831,7 @@ asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, siz
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 
-asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
+asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
 {
 	loff_t pos;
 	ssize_t ret;
diff --git a/fs/xattr.c b/fs/xattr.c
index 237804cd6b5..d049ae27aae 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -349,7 +349,7 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_getxattr(const char __user *pathname, const char __user *name,
 	     void __user *value, size_t size)
 {
@@ -364,7 +364,7 @@ sys_getxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
 	      size_t size)
 {
@@ -379,7 +379,7 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size)
 {
 	struct file *f;
@@ -424,7 +424,7 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 {
 	struct path path;
@@ -438,7 +438,7 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 {
 	struct path path;
@@ -452,7 +452,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage ssize_t
+asmlinkage long
 sys_flistxattr(int fd, char __user *list, size_t size)
 {
 	struct file *f;
-- 
cgit v1.2.3


From e55380edf68796d75bf41391a781c68ee678587d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:55 +0100
Subject: [CVE-2009-0029] Rename old_readdir to sys_old_readdir

This way it matches the generic system call name convention.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/readdir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index b318d9b5af2..8b4c2a0051a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,7 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
+asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
 {
 	int error;
 	struct file * file;
-- 
cgit v1.2.3


From 1134723e96f6e2abcf8bfd7a2d1c96fcc323ef35 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:56 +0100
Subject: [CVE-2009-0029] Remove __attribute__((weak)) from sys_pipe/sys_pipe2

Remove __attribute__((weak)) from the common code sys_pipe implementation.
IA64, ALPHA, SUPERH (32bit) and SPARC (32bit) have their own implementations
with the same name. Just rename them.
For sys_pipe2 there is no architecture specific implementation.

Cc: Richard Henderson <rth@twiddle.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Tony Luck <tony.luck@intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/pipe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 891697112f6..0c64db86c91 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1043,7 +1043,7 @@ int do_pipe(int *fd)
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
-asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
+asmlinkage long sys_pipe2(int __user *fildes, int flags)
 {
 	int fd[2];
 	int error;
@@ -1059,7 +1059,7 @@ asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
 	return error;
 }
 
-asmlinkage long __weak sys_pipe(int __user *fildes)
+asmlinkage long sys_pipe(int __user *fildes)
 {
 	return sys_pipe2(fildes, 0);
 }
-- 
cgit v1.2.3


From c9da9f2129d6a421c32e334a83770a9e67f7feac Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:13:57 +0100
Subject: [CVE-2009-0029] Make sys_pselect7 static

Not a single architecture has wired up sys_pselect7 plus it is the
only system call with seven parameters. Just make it static and
rename it to do_pselect which will do the work for sys_pselect6.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/compat.c | 6 +++---
 fs/select.c | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 30f2faa22f5..65a070e705a 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1709,7 +1709,7 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp,
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
 	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize)
@@ -1775,8 +1775,8 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
 				(compat_size_t __user *)(sig+sizeof(up))))
 			return -EFAULT;
 	}
-	return compat_sys_pselect7(n, inp, outp, exp, tsp, compat_ptr(up),
-					sigsetsize);
+	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+				 sigsetsize);
 }
 
 asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
diff --git a/fs/select.c b/fs/select.c
index b0cf1f0896d..d1651648be1 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -582,9 +582,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp,
-		fd_set __user *exp, struct timespec __user *tsp,
-		const sigset_t __user *sigmask, size_t sigsetsize)
+static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
+		       fd_set __user *exp, struct timespec __user *tsp,
+		       const sigset_t __user *sigmask, size_t sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
 	struct timespec ts, end_time, *to = NULL;
@@ -650,7 +650,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
 			return -EFAULT;
 	}
 
-	return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize);
+	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
 }
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
-- 
cgit v1.2.3


From 6673e0c3fbeaed2cd08e2fd4a4aa97382d6fedb0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:02 +0100
Subject: [CVE-2009-0029] System call wrapper special cases

System calls with an unsigned long long argument can't be converted with
the standard wrappers since that would include a cast to long, which in
turn means that we would lose the upper 32 bits on 32-bit architectures.
Also semctl can't use the standard wrapper since it has a 'union'
parameter.

So we handle them as special cases and add some extra wrappers instead.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/dcookies.c   | 10 ++++++++--
 fs/open.c       | 27 ++++++++++++++++++++++++---
 fs/read_write.c | 24 ++++++++++++++++++++----
 fs/sync.c       | 26 ++++++++++++++++++++++----
 4 files changed, 74 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/dcookies.c b/fs/dcookies.c
index 180e9fec4ad..a21cabdbd87 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -145,7 +145,7 @@ out:
 /* And here is where the userspace process can look up the cookie value
  * to retrieve the path.
  */
-asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user * buf, size_t len)
+SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len)
 {
 	unsigned long cookie = (unsigned long)cookie64;
 	int err = -EINVAL;
@@ -198,7 +198,13 @@ out:
 	mutex_unlock(&dcookie_mutex);
 	return err;
 }
-
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len)
+{
+	return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len);
+}
+SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie);
+#endif
 
 static int dcookie_init(void)
 {
diff --git a/fs/open.c b/fs/open.c
index d882fd2351d..e349013fc79 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -351,21 +351,35 @@ asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
 
 /* LFS versions of truncate are only needed on 32 bit machines */
 #if BITS_PER_LONG == 32
-asmlinkage long sys_truncate64(const char __user * path, loff_t length)
+SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length)
 {
 	return do_sys_truncate(path, length);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_truncate64(long path, loff_t length)
+{
+	return SYSC_truncate64((const char __user *) path, length);
+}
+SYSCALL_ALIAS(sys_truncate64, SyS_truncate64);
+#endif
 
-asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
+SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length)
 {
 	long ret = do_sys_ftruncate(fd, length, 0);
 	/* avoid REGPARM breakage on x86: */
 	asmlinkage_protect(2, ret, fd, length);
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_ftruncate64(long fd, loff_t length)
+{
+	return SYSC_ftruncate64((unsigned int) fd, length);
+}
+SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64);
 #endif
+#endif /* BITS_PER_LONG == 32 */
 
-asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
+SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 {
 	struct file *file;
 	struct inode *inode;
@@ -422,6 +436,13 @@ out_fput:
 out:
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len)
+{
+	return SYSC_fallocate((int)fd, (int)mode, offset, len);
+}
+SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
+#endif
 
 /*
  * access() needs to use the real uid/gid, not the effective uid/gid.
diff --git a/fs/read_write.c b/fs/read_write.c
index 940367f51f2..7a8326bc590 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -403,8 +403,8 @@ asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count
 	return ret;
 }
 
-asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
-			     size_t count, loff_t pos)
+SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
+			size_t count, loff_t pos)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -423,9 +423,17 @@ asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
 
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
+{
+	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
+			    (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pread64, SyS_pread64);
+#endif
 
-asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
-			      size_t count, loff_t pos)
+SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
+			 size_t count, loff_t pos)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -444,6 +452,14 @@ asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
 
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
+{
+	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
+			     (size_t) count, pos);
+}
+SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
+#endif
 
 /*
  * Reduce an iovec's length in-place.  Return the resulting number of segments
diff --git a/fs/sync.c b/fs/sync.c
index ac02b56548b..23ebbd72ecc 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -201,8 +201,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
  * already-instantiated disk blocks, there are no guarantees here that the data
  * will be available after a crash.
  */
-asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-					unsigned int flags)
+SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
+				unsigned int flags)
 {
 	int ret;
 	struct file *file;
@@ -262,14 +262,32 @@ out_put:
 out:
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes,
+				    long flags)
+{
+	return SYSC_sync_file_range((int) fd, offset, nbytes,
+				    (unsigned int) flags);
+}
+SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range);
+#endif
 
 /* It would be nice if people remember that not all the world's an i386
    when they introduce new system calls */
-asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
-				     loff_t offset, loff_t nbytes)
+SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags,
+				 loff_t offset, loff_t nbytes)
 {
 	return sys_sync_file_range(fd, offset, nbytes, flags);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_sync_file_range2(long fd, long flags,
+				     loff_t offset, loff_t nbytes)
+{
+	return SYSC_sync_file_range2((int) fd, (unsigned int) flags,
+				     offset, nbytes);
+}
+SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
+#endif
 
 /*
  * `endbyte' is inclusive
-- 
cgit v1.2.3


From a5f8fa9e9ba5ef3305e147f41ad6e1e84ac1f0bd Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:11 +0100
Subject: [CVE-2009-0029] System call wrappers part 09

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/sync.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/sync.c b/fs/sync.c
index 23ebbd72ecc..a16d53e5fe9 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -36,7 +36,7 @@ static void do_sync(unsigned long wait)
 		laptop_sync_completion();
 }
 
-asmlinkage long sys_sync(void)
+SYSCALL_DEFINE0(sync)
 {
 	do_sync(1);
 	return 0;
@@ -144,12 +144,12 @@ static int do_fsync(unsigned int fd, int datasync)
 	return ret;
 }
 
-asmlinkage long sys_fsync(unsigned int fd)
+SYSCALL_DEFINE1(fsync, unsigned int, fd)
 {
 	return do_fsync(fd, 0);
 }
 
-asmlinkage long sys_fdatasync(unsigned int fd)
+SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
 {
 	return do_fsync(fd, 1);
 }
-- 
cgit v1.2.3


From bdc480e3bef6eb0e7071770834cbdda7e30a5436 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:12 +0100
Subject: [CVE-2009-0029] System call wrappers part 10

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/buffer.c    |  2 +-
 fs/namespace.c |  9 ++++-----
 fs/open.c      | 12 +++++-------
 fs/stat.c      |  2 +-
 4 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index b6e8b8632e2..b58208f1640 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3243,7 +3243,7 @@ void block_sync_page(struct page *page)
  * Use of bdflush() is deprecated and will be removed in a future kernel.
  * The `pdflush' kernel threads fully replace bdflush daemons and this call.
  */
-asmlinkage long sys_bdflush(int func, long data)
+SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
 	static int msg_count;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index a40685d800a..3876a0fbaa6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1128,7 +1128,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
  * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
  */
 
-asmlinkage long sys_umount(char __user * name, int flags)
+SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
 {
 	struct path path;
 	int retval;
@@ -1160,7 +1160,7 @@ out:
 /*
  *	The 2.0 compatible umount. No flags.
  */
-asmlinkage long sys_oldumount(char __user * name)
+SYSCALL_DEFINE1(oldumount, char __user *, name)
 {
 	return sys_umount(name, 0);
 }
@@ -2045,9 +2045,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	return new_ns;
 }
 
-asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
-			  char __user * type, unsigned long flags,
-			  void __user * data)
+SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
+		char __user *, type, unsigned long, flags, void __user *, data)
 {
 	int retval;
 	unsigned long data_page;
diff --git a/fs/open.c b/fs/open.c
index e349013fc79..f6c2f5673ed 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -122,7 +122,7 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
 	return 0;
 }
 
-asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf)
+SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
 	struct path path;
 	int error;
@@ -138,8 +138,7 @@ asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * b
 	return error;
 }
 
-
-asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
+SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
 	struct path path;
 	long error;
@@ -157,8 +156,7 @@ asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct stat
 	return error;
 }
 
-
-asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf)
+SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
 	struct file * file;
 	struct statfs tmp;
@@ -289,7 +287,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_truncate(const char __user * path, unsigned long length)
+SYSCALL_DEFINE2(truncate, const char __user *, path, unsigned long, length)
 {
 	/* on 32-bit boxen it will cut the range 2^31--2^32-1 off */
 	return do_sys_truncate(path, (long)length);
@@ -341,7 +339,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
+SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
 {
 	long ret = do_sys_ftruncate(fd, length, 1);
 	/* avoid REGPARM breakage on x86: */
diff --git a/fs/stat.c b/fs/stat.c
index 7e12a6f8279..a1411648048 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -152,7 +152,7 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_stat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
-- 
cgit v1.2.3


From 257ac264d69017270fbc3cf5536953525db4076c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:13 +0100
Subject: [CVE-2009-0029] System call wrappers part 11

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c  |  2 +-
 fs/stat.c  | 20 ++++++++++++--------
 fs/super.c |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index f6c2f5673ed..322bb60d168 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -174,7 +174,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user *buf)
+SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
 	struct file * file;
 	struct statfs64 tmp;
diff --git a/fs/stat.c b/fs/stat.c
index a1411648048..f29c5fe4f8b 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -162,7 +162,8 @@ SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *
 
 	return error;
 }
-asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __user * statbuf)
+
+SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -172,7 +173,8 @@ asmlinkage long sys_lstat(char __user * filename, struct __old_kernel_stat __use
 
 	return error;
 }
-asmlinkage long sys_fstat(unsigned int fd, struct __old_kernel_stat __user * statbuf)
+
+SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -235,7 +237,7 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat_fd(AT_FDCWD, filename, &stat);
@@ -246,7 +248,7 @@ asmlinkage long sys_newstat(char __user *filename, struct stat __user *statbuf)
 	return error;
 }
 
-asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat_fd(AT_FDCWD, filename, &stat);
@@ -280,7 +282,7 @@ out:
 }
 #endif
 
-asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
+SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
@@ -365,7 +367,7 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
 	return copy_to_user(statbuf,&tmp,sizeof(tmp)) ? -EFAULT : 0;
 }
 
-asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbuf)
+SYSCALL_DEFINE2(stat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_stat(filename, &stat);
@@ -375,7 +377,8 @@ asmlinkage long sys_stat64(char __user * filename, struct stat64 __user * statbu
 
 	return error;
 }
-asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statbuf)
+
+SYSCALL_DEFINE2(lstat64, char __user *, filename, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_lstat(filename, &stat);
@@ -385,7 +388,8 @@ asmlinkage long sys_lstat64(char __user * filename, struct stat64 __user * statb
 
 	return error;
 }
-asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf)
+
+SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int error = vfs_fstat(fd, &stat);
diff --git a/fs/super.c b/fs/super.c
index ed080c41716..645e5403f2a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -544,7 +544,7 @@ rescan:
 	return NULL;
 }
 
-asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf)
+SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
         struct super_block *s;
         struct ustat tmp;
-- 
cgit v1.2.3


From 64fd1de3d821659ac0a3004fd5ee1de59e64af30 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:14 +0100
Subject: [CVE-2009-0029] System call wrappers part 12

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/xattr.c | 46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index d049ae27aae..0367a5dae2b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -251,9 +251,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 	return error;
 }
 
-asmlinkage long
-sys_setxattr(const char __user *pathname, const char __user *name,
-	     const void __user *value, size_t size, int flags)
+SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
 {
 	struct path path;
 	int error;
@@ -270,9 +270,9 @@ sys_setxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_lsetxattr(const char __user *pathname, const char __user *name,
-	      const void __user *value, size_t size, int flags)
+SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
+		const char __user *, name, const void __user *, value,
+		size_t, size, int, flags)
 {
 	struct path path;
 	int error;
@@ -289,9 +289,8 @@ sys_lsetxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_fsetxattr(int fd, const char __user *name, const void __user *value,
-	      size_t size, int flags)
+SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
+		const void __user *,value, size_t, size, int, flags)
 {
 	struct file *f;
 	struct dentry *dentry;
@@ -349,9 +348,8 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 	return error;
 }
 
-asmlinkage long
-sys_getxattr(const char __user *pathname, const char __user *name,
-	     void __user *value, size_t size)
+SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -364,9 +362,8 @@ sys_getxattr(const char __user *pathname, const char __user *name,
 	return error;
 }
 
-asmlinkage long
-sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
-	      size_t size)
+SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
+		const char __user *, name, void __user *, value, size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -379,8 +376,8 @@ sys_lgetxattr(const char __user *pathname, const char __user *name, void __user
 	return error;
 }
 
-asmlinkage long
-sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size)
+SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
+		void __user *, value, size_t, size)
 {
 	struct file *f;
 	ssize_t error = -EBADF;
@@ -424,8 +421,8 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_listxattr(const char __user *pathname, char __user *list, size_t size)
+SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -438,8 +435,8 @@ sys_listxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
+SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
+		size_t, size)
 {
 	struct path path;
 	ssize_t error;
@@ -452,8 +449,7 @@ sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
 	return error;
 }
 
-asmlinkage long
-sys_flistxattr(int fd, char __user *list, size_t size)
+SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
 	struct file *f;
 	ssize_t error = -EBADF;
@@ -485,8 +481,8 @@ removexattr(struct dentry *d, const char __user *name)
 	return vfs_removexattr(d, kname);
 }
 
-asmlinkage long
-sys_removexattr(const char __user *pathname, const char __user *name)
+SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
+		const char __user *, name)
 {
 	struct path path;
 	int error;
-- 
cgit v1.2.3


From 6a6160a7b5c27b3c38651baef92a14fa7072b3c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:15 +0100
Subject: [CVE-2009-0029] System call wrappers part 13

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/xattr.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/xattr.c b/fs/xattr.c
index 0367a5dae2b..197c4fcac03 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -499,8 +499,8 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
 	return error;
 }
 
-asmlinkage long
-sys_lremovexattr(const char __user *pathname, const char __user *name)
+SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
+		const char __user *, name)
 {
 	struct path path;
 	int error;
@@ -517,8 +517,7 @@ sys_lremovexattr(const char __user *pathname, const char __user *name)
 	return error;
 }
 
-asmlinkage long
-sys_fremovexattr(int fd, const char __user *name)
+SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
 	struct file *f;
 	struct dentry *dentry;
-- 
cgit v1.2.3


From 3480b25743cb7404928d57efeaa3d085708b04c2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:16 +0100
Subject: [CVE-2009-0029] System call wrappers part 14

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/namei.c     | 8 ++++----
 fs/namespace.c | 4 ++--
 fs/open.c      | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f05bed24242..43fa2525972 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2017,7 +2017,7 @@ out_unlock:
 	return error;
 }
 
-asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
+SYSCALL_DEFINE3(mknod, const char __user *, filename, int, mode, unsigned, dev)
 {
 	return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
@@ -2302,7 +2302,7 @@ asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
 	return do_unlinkat(dfd, pathname);
 }
 
-asmlinkage long sys_unlink(const char __user *pathname)
+SYSCALL_DEFINE1(unlink, const char __user *, pathname)
 {
 	return do_unlinkat(AT_FDCWD, pathname);
 }
@@ -2370,7 +2370,7 @@ out_putname:
 	return error;
 }
 
-asmlinkage long sys_symlink(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_symlinkat(oldname, AT_FDCWD, newname);
 }
@@ -2473,7 +2473,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_link(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 3876a0fbaa6..228d8c4bfd1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2171,8 +2171,8 @@ static void chroot_fs_refs(struct path *old_root, struct path *new_root)
  *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
  *    first.
  */
-asmlinkage long sys_pivot_root(const char __user * new_root,
-			       const char __user * put_old)
+SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
+		const char __user *, put_old)
 {
 	struct vfsmount *tmp;
 	struct path new, old, parent_path, root_parent, root;
diff --git a/fs/open.c b/fs/open.c
index 322bb60d168..9b926de6ed9 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -569,7 +569,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_chroot(const char __user * filename)
+SYSCALL_DEFINE1(chroot, const char __user *, filename)
 {
 	struct path path;
 	int error;
-- 
cgit v1.2.3


From a26eab2400f0477bfac0255600552394855016f7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:17 +0100
Subject: [CVE-2009-0029] System call wrappers part 15

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/fcntl.c | 11 ++++++-----
 fs/ioctl.c |  2 +-
 fs/namei.c |  2 +-
 fs/open.c  |  4 ++--
 4 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index cdc14194672..bd215cc791d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -50,7 +50,7 @@ static int get_close_on_exec(unsigned int fd)
 	return res;
 }
 
-asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 {
 	int err = -EBADF;
 	struct file * file, *tofree;
@@ -113,7 +113,7 @@ out_unlock:
 	return err;
 }
 
-asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
@@ -126,7 +126,7 @@ asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
 	return sys_dup3(oldfd, newfd, 0);
 }
 
-asmlinkage long sys_dup(unsigned int fildes)
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
 	struct file *file = fget(fildes);
@@ -335,7 +335,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
 	long err = -EBADF;
@@ -358,7 +358,8 @@ out:
 }
 
 #if BITS_PER_LONG == 32
-asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		unsigned long, arg)
 {	
 	struct file * filp;
 	long err;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 20b0a8a24c6..240ec63984c 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -542,7 +542,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 	return error;
 }
 
-asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
+SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {
 	struct file *filp;
 	int error = -EBADF;
diff --git a/fs/namei.c b/fs/namei.c
index 43fa2525972..00c4f37a039 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2718,7 +2718,7 @@ exit:
 	return error;
 }
 
-asmlinkage long sys_rename(const char __user *oldname, const char __user *newname)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
 {
 	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
 }
diff --git a/fs/open.c b/fs/open.c
index 9b926de6ed9..ecc75a2c262 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -594,7 +594,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
+SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 {
 	struct inode * inode;
 	struct dentry * dentry;
@@ -658,7 +658,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
+SYSCALL_DEFINE2(chmod, const char __user *, filename, mode_t, mode)
 {
 	return sys_fchmodat(AT_FDCWD, filename, mode);
 }
-- 
cgit v1.2.3


From 002c8976ee537724b20a5e179d9b349309438836 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:18 +0100
Subject: [CVE-2009-0029] System call wrappers part 16

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/aio.c        | 22 +++++++++++-----------
 fs/locks.c      |  2 +-
 fs/open.c       |  2 +-
 fs/read_write.c |  4 ++--
 fs/stat.c       |  4 ++--
 5 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index d6f89d3c15e..8fa77e23394 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1270,7 +1270,7 @@ static void io_destroy(struct kioctx *ioctx)
  *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
  *	implemented.
  */
-asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
+SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
 	struct kioctx *ioctx = NULL;
 	unsigned long ctx;
@@ -1308,7 +1308,7 @@ out:
  *	implemented.  May fail with -EFAULT if the context pointed to
  *	is invalid.
  */
-asmlinkage long sys_io_destroy(aio_context_t ctx)
+SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
@@ -1662,8 +1662,8 @@ out_put_req:
  *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
  *	fail with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
-			      struct iocb __user * __user *iocbpp)
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1737,8 +1737,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
  *	invalid.  May fail with -EAGAIN if the iocb specified was not
  *	cancelled.  Will fail with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
-			      struct io_event __user *result)
+SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
+		struct io_event __user *, result)
 {
 	int (*cancel)(struct kiocb *iocb, struct io_event *res);
 	struct kioctx *ctx;
@@ -1799,11 +1799,11 @@ asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
  *	will be updated if not NULL and the operation blocks.  Will fail
  *	with -ENOSYS if not implemented.
  */
-asmlinkage long sys_io_getevents(aio_context_t ctx_id,
-				 long min_nr,
-				 long nr,
-				 struct io_event __user *events,
-				 struct timespec __user *timeout)
+SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
+		long, min_nr,
+		long, nr,
+		struct io_event __user *, events,
+		struct timespec __user *, timeout)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx_id);
 	long ret = -EINVAL;
diff --git a/fs/locks.c b/fs/locks.c
index 46a2e12f7d4..ec3deea29e3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1564,7 +1564,7 @@ EXPORT_SYMBOL(flock_lock_file_wait);
  *	%LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other
  *	processes read and write access respectively.
  */
-asmlinkage long sys_flock(unsigned int fd, unsigned int cmd)
+SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
 	struct file *filp;
 	struct file_lock *lock;
diff --git a/fs/open.c b/fs/open.c
index ecc75a2c262..293408b1c16 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1081,7 +1081,7 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
  * For backward compatibility?  Maybe this should be moved
  * into arch/i386 instead?
  */
-asmlinkage long sys_creat(const char __user * pathname, int mode)
+SYSCALL_DEFINE2(creat, const char __user *, pathname, int, mode)
 {
 	return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode);
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index 7a8326bc590..0671aa016b6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -828,7 +828,7 @@ out:
 	return retval;
 }
 
-asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
 {
 	loff_t pos;
 	off_t off;
@@ -847,7 +847,7 @@ asmlinkage long sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 
-asmlinkage long sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
+SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
 {
 	loff_t pos;
 	ssize_t ret;
diff --git a/fs/stat.c b/fs/stat.c
index f29c5fe4f8b..d712a0dfb50 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -320,8 +320,8 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
 	return error;
 }
 
-asmlinkage long sys_readlink(const char __user *path, char __user *buf,
-				int bufsiz)
+SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
+		int, bufsiz)
 {
 	return sys_readlinkat(AT_FDCWD, path, buf, bufsiz);
 }
-- 
cgit v1.2.3


From ca013e945b1ba5828b151ee646946f1297b67a4c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:19 +0100
Subject: [CVE-2009-0029] System call wrappers part 17

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 293408b1c16..4a6d8006474 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -517,7 +517,7 @@ out:
 	return res;
 }
 
-asmlinkage long sys_access(const char __user *filename, int mode)
+SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 {
 	return sys_faccessat(AT_FDCWD, filename, mode);
 }
@@ -688,7 +688,7 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
 	return error;
 }
 
-asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
 	int error;
@@ -732,7 +732,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
+SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
 {
 	struct path path;
 	int error;
@@ -751,8 +751,7 @@ out:
 	return error;
 }
 
-
-asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
+SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 {
 	struct file * file;
 	int error = -EBADF;
@@ -1048,7 +1047,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 	return fd;
 }
 
-asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 {
 	long ret;
 
@@ -1117,7 +1116,7 @@ EXPORT_SYMBOL(filp_close);
  * releasing the fd. This ensures that one clone task can't release
  * an fd while another clone is opening it.
  */
-asmlinkage long sys_close(unsigned int fd)
+SYSCALL_DEFINE1(close, unsigned int, fd)
 {
 	struct file * filp;
 	struct files_struct *files = current->files;
@@ -1150,14 +1149,13 @@ out_unlock:
 	spin_unlock(&files->file_lock);
 	return -EBADF;
 }
-
 EXPORT_SYMBOL(sys_close);
 
 /*
  * This routine simulates a hangup on the tty, to arrange that users
  * are given clean terminals at login time.
  */
-asmlinkage long sys_vhangup(void)
+SYSCALL_DEFINE0(vhangup)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
 		tty_vhangup_self();
-- 
cgit v1.2.3


From 003d7ab479168132a2b2c6700fe682b08f08ab0c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:21 +0100
Subject: [CVE-2009-0029] System call wrappers part 19

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/read_write.c | 8 ++++----
 fs/utimes.c     | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/read_write.c b/fs/read_write.c
index 0671aa016b6..fad10af59d9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -147,7 +147,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(vfs_llseek);
 
-asmlinkage long sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
+SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 {
 	off_t retval;
 	struct file * file;
@@ -171,9 +171,9 @@ bad:
 }
 
 #ifdef __ARCH_WANT_SYS_LLSEEK
-asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
-			   unsigned long offset_low, loff_t __user * result,
-			   unsigned int origin)
+SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
+		unsigned long, offset_low, loff_t __user *, result,
+		unsigned int, origin)
 {
 	int retval;
 	struct file * file;
diff --git a/fs/utimes.c b/fs/utimes.c
index 6929e3e91d0..ee853615798 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -24,7 +24,7 @@
  * must be owner or have write permission.
  * Else, update from *times, must be owner or super user.
  */
-asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
+SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times)
 {
 	struct timespec tv[2];
 
@@ -214,7 +214,8 @@ asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __u
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0);
 }
 
-asmlinkage long sys_utimes(char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE2(utimes, char __user *, filename,
+		struct timeval __user *, utimes)
 {
 	return sys_futimesat(AT_FDCWD, filename, utimes);
 }
-- 
cgit v1.2.3


From 3cdad42884bbd95d5aa01297e8236ea1bad70053 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:22 +0100
Subject: [CVE-2009-0029] System call wrappers part 20

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/dcache.c     |  2 +-
 fs/namei.c      |  4 ++--
 fs/open.c       |  4 ++--
 fs/quota.c      |  3 ++-
 fs/read_write.c | 13 +++++++------
 5 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 4547f66884a..937df0fb0da 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2092,7 +2092,7 @@ Elong:
  *		return NULL;
  *	}
  */
-asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
+SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
 	int error;
 	struct path pwd, root;
diff --git a/fs/namei.c b/fs/namei.c
index 00c4f37a039..90520f05f99 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2081,7 +2081,7 @@ out_err:
 	return error;
 }
 
-asmlinkage long sys_mkdir(const char __user *pathname, int mode)
+SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
 {
 	return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
@@ -2195,7 +2195,7 @@ exit1:
 	return error;
 }
 
-asmlinkage long sys_rmdir(const char __user *pathname)
+SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
 {
 	return do_rmdir(AT_FDCWD, pathname);
 }
diff --git a/fs/open.c b/fs/open.c
index 4a6d8006474..bc49e3c388d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -522,7 +522,7 @@ SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
 	return sys_faccessat(AT_FDCWD, filename, mode);
 }
 
-asmlinkage long sys_chdir(const char __user * filename)
+SYSCALL_DEFINE1(chdir, const char __user *, filename)
 {
 	struct path path;
 	int error;
@@ -543,7 +543,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchdir(unsigned int fd)
+SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
 	struct file *file;
 	struct inode *inode;
diff --git a/fs/quota.c b/fs/quota.c
index 4a8c94f05f7..d76ada914f9 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -371,7 +371,8 @@ static inline struct super_block *quotactl_block(const char __user *special)
  * calls. Maybe we need to add the process quotas etc. in the future,
  * but we probably should use rlimits for that.
  */
-asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+		qid_t, id, void __user *, addr)
 {
 	uint cmds, type;
 	struct super_block *sb = NULL;
diff --git a/fs/read_write.c b/fs/read_write.c
index fad10af59d9..400fe81c973 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -369,7 +369,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 	file->f_pos = pos;
 }
 
-asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
+SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -386,7 +386,8 @@ asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
 	return ret;
 }
 
-asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t count)
+SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
+		size_t, count)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -688,8 +689,8 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
 
 EXPORT_SYMBOL(vfs_writev);
 
-asmlinkage long
-sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
@@ -709,8 +710,8 @@ sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
 	return ret;
 }
 
-asmlinkage long
-sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
+SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
+		unsigned long, vlen)
 {
 	struct file *file;
 	ssize_t ret = -EBADF;
-- 
cgit v1.2.3


From 20f37034fb966a1c35894f9fe529fda0b6440101 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:23 +0100
Subject: [CVE-2009-0029] System call wrappers part 21

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/readdir.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/readdir.c b/fs/readdir.c
index 8b4c2a0051a..cf6a0e39819 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -187,7 +187,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_getdents(unsigned int fd, struct linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct linux_dirent __user *, dirent, unsigned int, count)
 {
 	struct file * file;
 	struct linux_dirent __user * lastdirent;
@@ -268,7 +269,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_getdents64(unsigned int fd, struct linux_dirent64 __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
 	struct file * file;
 	struct linux_dirent64 __user * lastdirent;
-- 
cgit v1.2.3


From 5a8a82b1d306a325d899b67715618413657efda4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:25 +0100
Subject: [CVE-2009-0029] System call wrappers part 23

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/eventpoll.c | 18 +++++++++---------
 fs/select.c    |  8 ++++----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 96355d50534..ba2f9ec7119 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1110,7 +1110,7 @@ retry:
 /*
  * Open an eventpoll file descriptor.
  */
-asmlinkage long sys_epoll_create1(int flags)
+SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
 	int error, fd = -1;
 	struct eventpoll *ep;
@@ -1150,7 +1150,7 @@ error_return:
 	return fd;
 }
 
-asmlinkage long sys_epoll_create(int size)
+SYSCALL_DEFINE1(epoll_create, int, size)
 {
 	if (size < 0)
 		return -EINVAL;
@@ -1163,8 +1163,8 @@ asmlinkage long sys_epoll_create(int size)
  * the eventpoll file that enables the insertion/removal/change of
  * file descriptors inside the interest set.
  */
-asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
-			      struct epoll_event __user *event)
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
 {
 	int error;
 	struct file *file, *tfile;
@@ -1261,8 +1261,8 @@ error_return:
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
  */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
-			       int maxevents, int timeout)
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout)
 {
 	int error;
 	struct file *file;
@@ -1319,9 +1319,9 @@ error_return:
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
  */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
-		int maxevents, int timeout, const sigset_t __user *sigmask,
-		size_t sigsetsize)
+SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
+		int, maxevents, int, timeout, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
 {
 	int error;
 	sigset_t ksigmask, sigsaved;
diff --git a/fs/select.c b/fs/select.c
index d1651648be1..338f703403a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -557,8 +557,8 @@ out_nofds:
 	return ret;
 }
 
-asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			fd_set __user *exp, struct timeval __user *tvp)
+SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timeval __user *, tvp)
 {
 	struct timespec end_time, *to = NULL;
 	struct timeval tv;
@@ -854,8 +854,8 @@ static long do_restart_poll(struct restart_block *restart_block)
 	return ret;
 }
 
-asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
-			long timeout_msecs)
+SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
+		long, timeout_msecs)
 {
 	struct timespec end_time, *to = NULL;
 	int ret;
-- 
cgit v1.2.3


From 1e7bfb2134dfec37ce04fb3a4ca89299e892d10c Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:29 +0100
Subject: [CVE-2009-0029] System call wrappers part 27

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/exec.c        | 2 +-
 fs/filesystems.c | 2 +-
 fs/nfsctl.c      | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 71a6efe5d8b..0dd60a01f1b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -99,7 +99,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
  *
  * Also note that we take the address to load from from the file itself.
  */
-asmlinkage long sys_uselib(const char __user * library)
+SYSCALL_DEFINE1(uselib, const char __user *, library)
 {
 	struct file *file;
 	struct nameidata nd;
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d488dcd7f2b..1aa70260e6d 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -179,7 +179,7 @@ static int fs_maxindex(void)
 /*
  * Whee.. Weird sysv syscall. 
  */
-asmlinkage long sys_sysfs(int option, unsigned long arg1, unsigned long arg2)
+SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
 {
 	int retval = -EINVAL;
 
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b27451909df..8f9a20556f7 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -86,8 +86,8 @@ static struct {
 	},
 };
 
-long
-asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *res)
+SYSCALL_DEFINE3(nfsservctl, int, cmd, struct nfsctl_arg __user *, arg,
+		void __user *, res)
 {
 	struct file *file;
 	void __user *p = &arg->u;
-- 
cgit v1.2.3


From 938bb9f5e840eddbf54e4f62f6c5ba9b3ae12c9d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:30 +0100
Subject: [CVE-2009-0029] System call wrappers part 28

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/ioprio.c                      | 5 ++---
 fs/notify/inotify/inotify_user.c | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 1a39ac37094..c7c0b28d7d2 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -72,7 +72,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
 
-asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
+SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 {
 	int class = IOPRIO_PRIO_CLASS(ioprio);
 	int data = IOPRIO_PRIO_DATA(ioprio);
@@ -188,7 +188,7 @@ int ioprio_best(unsigned short aprio, unsigned short bprio)
 		return aprio;
 }
 
-asmlinkage long sys_ioprio_get(int which, int who)
+SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -252,4 +252,3 @@ asmlinkage long sys_ioprio_get(int which, int who)
 	read_unlock(&tasklist_lock);
 	return ret;
 }
-
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 81b8644b013..efef1ffca77 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -576,7 +576,7 @@ static const struct inotify_operations inotify_user_ops = {
 	.destroy_watch	= free_inotify_user_watch,
 };
 
-asmlinkage long sys_inotify_init1(int flags)
+SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
 	struct inotify_device *dev;
 	struct inotify_handle *ih;
@@ -655,7 +655,7 @@ out_put_fd:
 	return ret;
 }
 
-asmlinkage long sys_inotify_init(void)
+SYSCALL_DEFINE0(inotify_init)
 {
 	return sys_inotify_init1(0);
 }
-- 
cgit v1.2.3


From 2e4d0924eb0c403ce4014fa139d1d61bf2c44fee Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:31 +0100
Subject: [CVE-2009-0029] System call wrappers part 29

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/namei.c                       | 21 ++++++++++-----------
 fs/notify/inotify/inotify_user.c |  5 +++--
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 90520f05f99..bbc15c23755 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1962,8 +1962,8 @@ static int may_mknod(mode_t mode)
 	}
 }
 
-asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
-				unsigned dev)
+SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode,
+		unsigned, dev)
 {
 	int error;
 	char *tmp;
@@ -2044,7 +2044,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return error;
 }
 
-asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
+SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode)
 {
 	int error = 0;
 	char * tmp;
@@ -2291,7 +2291,7 @@ slashes:
 	goto exit2;
 }
 
-asmlinkage long sys_unlinkat(int dfd, const char __user *pathname, int flag)
+SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
 {
 	if ((flag & ~AT_REMOVEDIR) != 0)
 		return -EINVAL;
@@ -2328,8 +2328,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	return error;
 }
 
-asmlinkage long sys_symlinkat(const char __user *oldname,
-			      int newdfd, const char __user *newname)
+SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
 	int error;
 	char *from;
@@ -2422,9 +2422,8 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
  * with linux 2.0, and to avoid hard-linking to directories
  * and other special files.  --ADM
  */
-asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
-			   int newdfd, const char __user *newname,
-			   int flags)
+SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname, int, flags)
 {
 	struct dentry *new_dentry;
 	struct nameidata nd;
@@ -2624,8 +2623,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return error;
 }
 
-asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
-			     int newdfd, const char __user *newname)
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+		int, newdfd, const char __user *, newname)
 {
 	struct dentry *old_dir, *new_dir;
 	struct dentry *old_dentry, *new_dentry;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index efef1ffca77..d53a1838d6e 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -660,7 +660,8 @@ SYSCALL_DEFINE0(inotify_init)
 	return sys_inotify_init1(0);
 }
 
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
+SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
+		u32, mask)
 {
 	struct inode *inode;
 	struct inotify_device *dev;
@@ -704,7 +705,7 @@ fput_and_out:
 	return ret;
 }
 
-asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
+SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
 	struct file *filp;
 	struct inotify_device *dev;
-- 
cgit v1.2.3


From 6559eed8ca7db0531a207cd80be5e28cd6f213c5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:32 +0100
Subject: [CVE-2009-0029] System call wrappers part 30

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/open.c   | 13 ++++++-------
 fs/stat.c   | 12 ++++++------
 fs/utimes.c |  6 ++++--
 3 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index bc49e3c388d..a3a78ceb2a2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -447,7 +447,7 @@ SYSCALL_ALIAS(sys_fallocate, SyS_fallocate);
  * We do this by temporarily clearing all FS-related capabilities and
  * switching the fsuid/fsgid around to the real ones.
  */
-asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
+SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
 {
 	const struct cred *old_cred;
 	struct cred *override_cred;
@@ -628,8 +628,7 @@ out:
 	return err;
 }
 
-asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
-			     mode_t mode)
+SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, mode_t, mode)
 {
 	struct path path;
 	struct inode *inode;
@@ -707,8 +706,8 @@ out:
 	return error;
 }
 
-asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
-			     gid_t group, int flag)
+SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
+		gid_t, group, int, flag)
 {
 	struct path path;
 	int error = -EINVAL;
@@ -1060,8 +1059,8 @@ SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, int, mode)
 	return ret;
 }
 
-asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
-			   int mode)
+SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
+		int, mode)
 {
 	long ret;
 
diff --git a/fs/stat.c b/fs/stat.c
index d712a0dfb50..2db740a0cfb 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -260,8 +260,8 @@ SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf
 }
 
 #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
-asmlinkage long sys_newfstatat(int dfd, char __user *filename,
-				struct stat __user *statbuf, int flag)
+SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename,
+		struct stat __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error = -EINVAL;
@@ -293,8 +293,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
 	return error;
 }
 
-asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
-				char __user *buf, int bufsiz)
+SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
+		char __user *, buf, int, bufsiz)
 {
 	struct path path;
 	int error;
@@ -400,8 +400,8 @@ SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
 	return error;
 }
 
-asmlinkage long sys_fstatat64(int dfd, char __user *filename,
-			       struct stat64 __user *statbuf, int flag)
+SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
+		struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error = -EINVAL;
diff --git a/fs/utimes.c b/fs/utimes.c
index ee853615798..e4c75db5d37 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -170,7 +170,8 @@ out:
 	return error;
 }
 
-asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __user *utimes, int flags)
+SYSCALL_DEFINE4(utimensat, int, dfd, char __user *, filename,
+		struct timespec __user *, utimes, int, flags)
 {
 	struct timespec tstimes[2];
 
@@ -187,7 +188,8 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
 	return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags);
 }
 
-asmlinkage long sys_futimesat(int dfd, char __user *filename, struct timeval __user *utimes)
+SYSCALL_DEFINE3(futimesat, int, dfd, char __user *, filename,
+		struct timeval __user *, utimes)
 {
 	struct timeval times[2];
 	struct timespec tstimes[2];
-- 
cgit v1.2.3


From 836f92adf121f806e9beb5b6b88bd5c9c4ea3f24 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:33 +0100
Subject: [CVE-2009-0029] System call wrappers part 31

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/signalfd.c |  8 ++++----
 fs/splice.c   | 12 ++++++------
 fs/timerfd.c  |  8 ++++----
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9c39bc7f843..b07565c9438 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,8 +205,8 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };
 
-asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
-			      size_t sizemask, int flags)
+SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask, int, flags)
 {
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
@@ -259,8 +259,8 @@ asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
 	return ufd;
 }
 
-asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
-			     size_t sizemask)
+SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask,
+		size_t, sizemask)
 {
 	return sys_signalfd4(ufd, user_mask, sizemask, 0);
 }
diff --git a/fs/splice.c b/fs/splice.c
index a54b3e3f10a..4ed0ba44a96 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1435,8 +1435,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
  * Currently we punt and implement it as a normal copy, see pipe_to_user().
  *
  */
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
-			     unsigned long nr_segs, unsigned int flags)
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
+		unsigned long, nr_segs, unsigned int, flags)
 {
 	struct file *file;
 	long error;
@@ -1461,9 +1461,9 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
 	return error;
 }
 
-asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
-			   int fd_out, loff_t __user *off_out,
-			   size_t len, unsigned int flags)
+SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		size_t, len, unsigned int, flags)
 {
 	long error;
 	struct file *in, *out;
@@ -1685,7 +1685,7 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 	return ret;
 }
 
-asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 {
 	struct file *in;
 	int error, fput_in;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0862f0e49d0..c8c14f58b96 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -177,7 +177,7 @@ static struct file *timerfd_fget(int fd)
 	return file;
 }
 
-asmlinkage long sys_timerfd_create(int clockid, int flags)
+SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 {
 	int ufd;
 	struct timerfd_ctx *ctx;
@@ -208,9 +208,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
 	return ufd;
 }
 
-asmlinkage long sys_timerfd_settime(int ufd, int flags,
-				    const struct itimerspec __user *utmr,
-				    struct itimerspec __user *otmr)
+SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
+		const struct itimerspec __user *, utmr,
+		struct itimerspec __user *, otmr)
 {
 	struct file *file;
 	struct timerfd_ctx *ctx;
-- 
cgit v1.2.3


From d4e82042c4cfa87a7d51710b71f568fe80132551 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:34 +0100
Subject: [CVE-2009-0029] System call wrappers part 32

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/eventfd.c |  5 ++---
 fs/pipe.c    |  2 +-
 fs/readdir.c |  3 ++-
 fs/select.c  | 11 ++++++-----
 fs/timerfd.c |  2 +-
 5 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08bf558d040..5de2c2db3aa 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,7 +198,7 @@ struct file *eventfd_fget(int fd)
 	return file;
 }
 
-asmlinkage long sys_eventfd2(unsigned int count, int flags)
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
 	int fd;
 	struct eventfd_ctx *ctx;
@@ -228,8 +228,7 @@ asmlinkage long sys_eventfd2(unsigned int count, int flags)
 	return fd;
 }
 
-asmlinkage long sys_eventfd(unsigned int count)
+SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
 	return sys_eventfd2(count, 0);
 }
-
diff --git a/fs/pipe.c b/fs/pipe.c
index 0c64db86c91..b89c878588a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1043,7 +1043,7 @@ int do_pipe(int *fd)
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
-asmlinkage long sys_pipe2(int __user *fildes, int flags)
+SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 {
 	int fd[2];
 	int error;
diff --git a/fs/readdir.c b/fs/readdir.c
index cf6a0e39819..7723401f8d8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,7 +102,8 @@ efault:
 	return -EFAULT;
 }
 
-asmlinkage long sys_old_readdir(unsigned int fd, struct old_linux_dirent __user * dirent, unsigned int count)
+SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
 	struct file * file;
diff --git a/fs/select.c b/fs/select.c
index 338f703403a..0fe0e1469df 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -636,8 +636,9 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
  * which has a pointer to the sigset_t itself followed by a size_t containing
  * the sigset size.
  */
-asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp,
-	fd_set __user *exp, struct timespec __user *tsp, void __user *sig)
+SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
+		fd_set __user *, exp, struct timespec __user *, tsp,
+		void __user *, sig)
 {
 	size_t sigsetsize = 0;
 	sigset_t __user *up = NULL;
@@ -889,9 +890,9 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
 }
 
 #ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds,
-	struct timespec __user *tsp, const sigset_t __user *sigmask,
-	size_t sigsetsize)
+SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
+		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
+		size_t, sigsetsize)
 {
 	sigset_t ksigmask, sigsaved;
 	struct timespec ts, end_time, *to = NULL;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c8c14f58b96..6a123b8ff3f 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -265,7 +265,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	return 0;
 }
 
-asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr)
+SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 {
 	struct file *file;
 	struct timerfd_ctx *ctx;
-- 
cgit v1.2.3


From 2b66421995d2e93c9d1a0111acf2581f8529c6e5 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:35 +0100
Subject: [CVE-2009-0029] System call wrappers part 33

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index b89c878588a..3a48ba5179d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1059,7 +1059,7 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 	return error;
 }
 
-asmlinkage long sys_pipe(int __user *fildes)
+SYSCALL_DEFINE1(pipe, int __user *, fildes)
 {
 	return sys_pipe2(fildes, 0);
 }
-- 
cgit v1.2.3


From 1bcbf31337391a2f54ef6c1e8871c2de5944a7dc Mon Sep 17 00:00:00 2001
From: Qinghuang Feng <qhfeng.kernel@gmail.com>
Date: Thu, 15 Jan 2009 13:51:03 -0800
Subject: btrfs & squashfs: Move btrfs and squashfs's magic number to
 <linux/magic.h>

Use the standard magic.h for btrfs and squashfs.

Signed-off-by: Qinghuang Feng <qhfeng.kernel@gmail.com>
Cc: Phillip Lougher <phillip@lougher.demon.co.uk>
Cc: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/super.c          | 2 +-
 fs/squashfs/squashfs_fs.h | 1 -
 fs/squashfs/super.c       | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0a14b495532..7256cf242eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
 #include <linux/version.h>
+#include <linux/magic.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -51,7 +52,6 @@
 #include "export.h"
 #include "compression.h"
 
-#define BTRFS_SUPER_MAGIC 0x9123683E
 
 static struct super_operations btrfs_super_ops;
 
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 6840da1bf21..283daafc568 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -26,7 +26,6 @@
 #define SQUASHFS_CACHED_FRAGMENTS	CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE
 #define SQUASHFS_MAJOR			4
 #define SQUASHFS_MINOR			0
-#define SQUASHFS_MAGIC			0x73717368
 #define SQUASHFS_START			0
 
 /* size of metadata (inode and directory) blocks */
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index a0466d7467b..071df5b5b49 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/zlib.h>
+#include <linux/magic.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
-- 
cgit v1.2.3


From 6b7021ef7e1a703c7092daeceda063951b22b4f6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 15 Jan 2009 13:51:29 -0800
Subject: ext2: also update the inode on disk when dir is IS_DIRSYNC

We used to just write changed page for IS_DIRSYNC inodes.  But we also
have to update the directory inode itself just for the case that we've
allocated a new block and changed i_size.

[akpm@linux-foundation.org: still sync the data page]
Signed-off-by: Jan Kara <jack@suse.cz>
Tested-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ext2/dir.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 9a0fc400f91..2999d72153b 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -95,10 +95,13 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len)
 		mark_inode_dirty(dir);
 	}
 
-	if (IS_DIRSYNC(dir))
+	if (IS_DIRSYNC(dir)) {
 		err = write_one_page(page, 1);
-	else
+		if (!err)
+			err = ext2_sync_inode(dir);
+	} else {
 		unlock_page(page);
+	}
 
 	return err;
 }
-- 
cgit v1.2.3


From 06a279d636734da32bb62dd2f7b0ade666f65d7c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 17 Jan 2009 18:41:37 -0500
Subject: ext4: only use i_size_high for regular files

Directories are not allowed to be bigger than 2GB, so don't use
i_size_high for anything other than regular files.  E2fsck should
complain about these inodes, but the simplest thing to do for the
kernel is to only use i_size_high for regular files.

This prevents an intentionally corrupted filesystem from causing the
kernel to burn a huge amount of CPU and issuing error messages such
as:

EXT4-fs warning (device loop0): ext4_block_to_path: block 135090028 > max

Thanks to David Maciejak from Fortinet's FortiGuard Global Security
Research Team for reporting this issue.

http://bugzilla.kernel.org/show_bug.cgi?id=12375

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/ext4.h  | 7 +++++--
 fs/ext4/inode.c | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c668e4377d7..aafc9eba1c2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
 
 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
 {
-	return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
-		le32_to_cpu(raw_inode->i_size_lo);
+	if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
+		return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+			le32_to_cpu(raw_inode->i_size_lo);
+	else
+		return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
 }
 
 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a6444cee0c7..49484ba801c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -360,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode,
 		final = ptrs;
 	} else {
 		ext4_warning(inode->i_sb, "ext4_block_to_path",
-				"block %lu > max",
+				"block %lu > max in inode %lu",
 				i_block + direct_blocks +
-				indirect_blocks + double_blocks);
+				indirect_blocks + double_blocks, inode->i_ino);
 	}
 	if (boundary)
 		*boundary = final - 1 - (i_block & (ptrs - 1));
-- 
cgit v1.2.3


From e6b8bc09ba2075cd91fbffefcd2778b1a00bd76f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 16 Jan 2009 11:13:40 -0500
Subject: ext4: Add sanity check to make_indexed_dir

Make sure the rec_len field in the '..' entry is sane, lest we overrun
the directory block and cause a kernel oops on a purposefully
corrupted filesystem.

Thanks to Sami Liedes for reporting this bug.

http://bugzilla.kernel.org/show_bug.cgi?id=12430

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/namei.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index fec0b4c2f5f..ba702bd7910 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk(KERN_DEBUG "Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 	retval = ext4_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext4_std_error(dir->i_sb, retval);
@@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext4_dir_entry_2 *)((char *)fde +
+		ext4_rec_len_from_disk(fde->rec_len));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		ext4_error(dir->i_sb, __func__,
+			   "invalid rec_len for '..' in inode %lu",
+			   dir->i_ino);
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + blocksize - (char *) de;
+
+	/* Allocate new block for the 0th block's dirents */
 	bh2 = ext4_append(handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
@@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
 	data1 = bh2->b_data;
 
-	/* The 0th block becomes the root, move the dirents out */
-	fde = &root->dotdot;
-	de = (struct ext4_dir_entry_2 *)((char *)fde +
-		ext4_rec_len_from_disk(fde->rec_len));
-	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext4_dir_entry_2 *) data1;
 	top = data1 + len;
-- 
cgit v1.2.3


From a21102b55c4f8dfd3adb4a15a34cd62237b46039 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 16 Jan 2009 11:13:47 -0500
Subject: ext3: Add sanity check to make_indexed_dir

Make sure the rec_len field in the '..' entry is sane, lest we overrun
the directory block and cause a kernel oops on a purposefully
corrupted filesystem.

This fixes a bug related to a bug originally reported by Sami Liedes
for ext4 at:

http://bugzilla.kernel.org/show_bug.cgi?id=12430

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext3/namei.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 69a3d19ca9f..4db4ffa1eda 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	struct fake_dirent *fde;
 
 	blocksize =  dir->i_sb->s_blocksize;
-	dxtrace(printk("Creating index\n"));
+	dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 	retval = ext3_journal_get_write_access(handle, bh);
 	if (retval) {
 		ext3_std_error(dir->i_sb, retval);
@@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	}
 	root = (struct dx_root *) bh->b_data;
 
+	/* The 0th block becomes the root, move the dirents out */
+	fde = &root->dotdot;
+	de = (struct ext3_dir_entry_2 *)((char *)fde +
+			ext3_rec_len_from_disk(fde->rec_len));
+	if ((char *) de >= (((char *) root) + blocksize)) {
+		ext3_error(dir->i_sb, __func__,
+			   "invalid rec_len for '..' in inode %lu",
+			   dir->i_ino);
+		brelse(bh);
+		return -EIO;
+	}
+	len = ((char *) root) + blocksize - (char *) de;
+
 	bh2 = ext3_append (handle, dir, &block, &retval);
 	if (!(bh2)) {
 		brelse(bh);
@@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
 	data1 = bh2->b_data;
 
-	/* The 0th block becomes the root, move the dirents out */
-	fde = &root->dotdot;
-	de = (struct ext3_dir_entry_2 *)((char *)fde +
-			ext3_rec_len_from_disk(fde->rec_len));
-	len = ((char *) root) + blocksize - (char *) de;
 	memcpy (data1, de, len);
 	de = (struct ext3_dir_entry_2 *) data1;
 	top = data1 + len;
-- 
cgit v1.2.3


From 1d9e2ae949411c2f329f30e01ea0355cd02c4296 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:58:19 -0500
Subject: Btrfs: Clear the device->running_pending flag before bailing on
 congestion

Btrfs maintains a queue of async bio submissions so the checksumming
threads don't have to wait on get_request_wait.  In order to avoid
extra wakeups, this code has a running_pending flag that is used
to tell new submissions they don't need to wake the thread.

When the threads notice congestion on a single device, they
may decide to requeue the job and move on to other devices.  This
makes sure the running_pending flag is cleared before the
job is requeued.

It should help avoid IO stalls by making sure the task is woken up
when new submissions come in.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b187b537888..3451e1cca2b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -220,6 +220,7 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
+			device->running_pending = 0;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
-- 
cgit v1.2.3


From c071fcfdb60e7abbe95e02460005d6bca165bf24 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 Jan 2009 11:59:08 -0500
Subject: Btrfs: fix ioctl arg size (userland incompatible change!)

The structure used to send device in btrfs ioctl calls was not
properly aligned, and so 32 bit ioctls would not work properly on
64 bit kernels.

We could fix this with compat ioctls, but we're just one byte away
and it doesn't make sense to carry around the compat ioctls forever
at this stage in the project.

This patch brings the ioctl arg up to an evenly aligned 4k.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.h | 14 ++++++++------
 fs/btrfs/super.c |  3 ++-
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 78049ea208d..b320b103fa1 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,13 +22,20 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
-#define BTRFS_PATH_NAME_MAX 3072
+#define BTRFS_PATH_NAME_MAX 4087
 
+/* this should be 4k */
 struct btrfs_ioctl_vol_args {
 	__s64 fd;
 	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
+struct btrfs_ioctl_clone_range_args {
+  __s64 src_fd;
+  __u64 src_offset, src_length;
+  __u64 dest_offset;
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -52,11 +59,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
 				   struct btrfs_ioctl_vol_args)
-struct btrfs_ioctl_clone_range_args {
-  __s64 src_fd;
-  __u64 src_offset, src_length;
-  __u64 dest_offset;
-};
 
 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \
 				  struct btrfs_ioctl_clone_range_args)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b4c101d9322..92c9b543def 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -582,7 +582,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 {
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
-	int ret = 0;
+	int ret = -ENOTTY;
 	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -594,6 +594,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 		goto out;
 	}
 	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
+
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
-- 
cgit v1.2.3


From cc33412fb1f11613e20f9dfc2919a77ecd63fbc4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 17:23:05 +0100
Subject: quota: Improve locking

We implement dqget() and dqput() that need neither dqonoff_mutex nor dqptr_sem.
Then move dqget() and dqput() calls so that they are not called from under
dqptr_sem. This is important because filesystem callbacks are then no longer
called from under dqptr_sem, which used to cause *lots* of problems with lock
ranking (and with OCFS2 they became close to unsolvable).

The patch also removes two functions which were introduced solely because OCFS2
needed them to cope with the old locking scheme. As time showed, they were not
enough for OCFS2 anyway and it would be unnecessary work to adapt them to the
new locking scheme in which they aren't needed.  As a result OCFS2 needs the
following patch to compile properly with quotas.  Apologies in advance to
anyone bisecting who hits this.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/dquot.c | 218 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 122 insertions(+), 96 deletions(-)

(limited to 'fs')

diff --git a/fs/dquot.c b/fs/dquot.c
index 48c0571f831..bca3cac4bee 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -87,14 +87,17 @@
 #define __DQUOT_PARANOIA
 
 /*
- * There are two quota SMP locks. dq_list_lock protects all lists with quotas
- * and quota formats and also dqstats structure containing statistics about the
- * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures
- * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
+ * There are three quota SMP locks. dq_list_lock protects all lists with quotas
+ * and quota formats, dqstats structure containing statistics about the lists
+ * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and
+ * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
  * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly
- * in inode_add_bytes() and inode_sub_bytes().
+ * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects
+ * modifications of quota state (on quotaon and quotaoff) and readers who care
+ * about latest values take it as well.
  *
- * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock,
+ *   dq_list_lock > dq_state_lock
  *
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
@@ -103,12 +106,7 @@
  * operation is just reading pointers from inode (or not using them at all) the
  * read lock is enough. If pointers are altered function must hold write lock
  * (these locking rules also apply for S_NOQUOTA flag in the inode - note that
- * for altering the flag i_mutex is also needed).  If operation is holding
- * reference to dquot in other way (e.g. quotactl ops) it must be guarded by
- * dqonoff_mutex.
- * This locking assures that:
- *   a) update/access to dquot pointers in inode is serialized
- *   b) everyone is guarded against invalidate_dquots()
+ * for altering the flag i_mutex is also needed).
  *
  * Each dquot has its dq_lock mutex. Locked dquots might not be referenced
  * from inodes (dquot_alloc_space() and such don't check the dq_lock).
@@ -122,10 +120,17 @@
  * Lock ordering (including related VFS locks) is the following:
  *   i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock >
  *   dqio_mutex
+ * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
+ * dqptr_sem. But filesystem has to count with the fact that functions such as
+ * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
+ * from inside a transaction to keep filesystem consistency after a crash. Also
+ * filesystems usually want to do some IO on dquot from ->mark_dirty which is
+ * called with dqptr_sem held.
  * i_mutex on quota files is special (it's below dqio_mutex)
  */
 
 static DEFINE_SPINLOCK(dq_list_lock);
+static DEFINE_SPINLOCK(dq_state_lock);
 DEFINE_SPINLOCK(dq_data_lock);
 
 static char *quotatypes[] = INITQFNAMES;
@@ -428,7 +433,7 @@ static inline void do_destroy_dquot(struct dquot *dquot)
  * quota is disabled and pointers from inodes removed so there cannot be new
  * quota users. There can still be some users of quotas due to inodes being
  * just deleted or pruned by prune_icache() (those are not attached to any
- * list). We have to wait for such users.
+ * list) or parallel quotactl call. We have to wait for such users.
  */
 static void invalidate_dquots(struct super_block *sb, int type)
 {
@@ -600,7 +605,6 @@ static struct shrinker dqcache_shrinker = {
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with either dqptr_sem or dqonoff_mutex held
  */
 void dqput(struct dquot *dquot)
 {
@@ -696,37 +700,31 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 	return dquot;
 }
 
-/*
- * Check whether dquot is in memory.
- * MUST be called with either dqptr_sem or dqonoff_mutex held
- */
-int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
-{
-	unsigned int hashent = hashfn(sb, id, type);
-	int ret = 0;
-
-        if (!sb_has_quota_active(sb, type))
-		return 0;
-	spin_lock(&dq_list_lock);
-	if (find_dquot(hashent, sb, id, type) != NODQUOT)
-		ret = 1;
-	spin_unlock(&dq_list_lock);
-	return ret;
-}
-
 /*
  * Get reference to dquot
- * MUST be called with either dqptr_sem or dqonoff_mutex held
+ *
+ * Locking is slightly tricky here. We are guarded from parallel quotaoff()
+ * destroying our dquot by:
+ *   a) checking for quota flags under dq_list_lock and
+ *   b) getting a reference to dquot before we release dq_list_lock
  */
 struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
-	struct dquot *dquot, *empty = NODQUOT;
+	struct dquot *dquot = NODQUOT, *empty = NODQUOT;
 
         if (!sb_has_quota_active(sb, type))
 		return NODQUOT;
 we_slept:
 	spin_lock(&dq_list_lock);
+	spin_lock(&dq_state_lock);
+	if (!sb_has_quota_active(sb, type)) {
+		spin_unlock(&dq_state_lock);
+		spin_unlock(&dq_list_lock);
+		goto out;
+	}
+	spin_unlock(&dq_state_lock);
+
 	if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
 		if (empty == NODQUOT) {
 			spin_unlock(&dq_list_lock);
@@ -735,6 +733,7 @@ we_slept:
 			goto we_slept;
 		}
 		dquot = empty;
+		empty = NODQUOT;
 		dquot->dq_id = id;
 		/* all dquots go on the inuse_list */
 		put_inuse(dquot);
@@ -749,8 +748,6 @@ we_slept:
 		dqstats.cache_hits++;
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
-		if (empty)
-			do_destroy_dquot(empty);
 	}
 	/* Wait for dq_lock - after this we know that either dquot_release() is already
 	 * finished or it will be canceled due to dq_count > 1 test */
@@ -758,11 +755,15 @@ we_slept:
 	/* Read the dquot and instantiate it (everything done only if needed) */
 	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
 		dqput(dquot);
-		return NODQUOT;
+		dquot = NODQUOT;
+		goto out;
 	}
 #ifdef __DQUOT_PARANOIA
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
+out:
+	if (empty)
+		do_destroy_dquot(empty);
 
 	return dquot;
 }
@@ -1198,63 +1199,76 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
 }
 /*
  *	Initialize quota pointers in inode
- *	Transaction must be started at entry
+ *	We do things in a bit complicated way but by that we avoid calling
+ *	dqget() and thus filesystem callbacks under dqptr_sem.
  */
 int dquot_initialize(struct inode *inode, int type)
 {
 	unsigned int id = 0;
 	int cnt, ret = 0;
+	struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
+	struct super_block *sb = inode->i_sb;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return 0;
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	/* First get references to structures we might need. */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+		switch (cnt) {
+		case USRQUOTA:
+			id = inode->i_uid;
+			break;
+		case GRPQUOTA:
+			id = inode->i_gid;
+			break;
+		}
+		got[cnt] = dqget(sb, id, cnt);
+	}
+
+	down_write(&sb_dqopt(sb)->dqptr_sem);
 	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode))
 		goto out_err;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
+		/* Avoid races with quotaoff() */
+		if (!sb_has_quota_active(sb, cnt))
+			continue;
 		if (inode->i_dquot[cnt] == NODQUOT) {
-			switch (cnt) {
-				case USRQUOTA:
-					id = inode->i_uid;
-					break;
-				case GRPQUOTA:
-					id = inode->i_gid;
-					break;
-			}
-			inode->i_dquot[cnt] = dqget(inode->i_sb, id, cnt);
+			inode->i_dquot[cnt] = got[cnt];
+			got[cnt] = NODQUOT;
 		}
 	}
 out_err:
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+	/* Drop unused references */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		dqput(got[cnt]);
 	return ret;
 }
 
 /*
  * 	Release all quotas referenced by inode
- *	Transaction must be started at an entry
  */
-int dquot_drop_locked(struct inode *inode)
+int dquot_drop(struct inode *inode)
 {
 	int cnt;
+	struct dquot *put[MAXQUOTAS];
 
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] != NODQUOT) {
-			dqput(inode->i_dquot[cnt]);
-			inode->i_dquot[cnt] = NODQUOT;
-		}
+		put[cnt] = inode->i_dquot[cnt];
+		inode->i_dquot[cnt] = NODQUOT;
 	}
-	return 0;
-}
-
-int dquot_drop(struct inode *inode)
-{
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	dquot_drop_locked(inode);
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		dqput(put[cnt]);
 	return 0;
 }
 
@@ -1470,8 +1484,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	qsize_t space;
 	struct dquot *transfer_from[MAXQUOTAS];
 	struct dquot *transfer_to[MAXQUOTAS];
-	int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid,
-	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
+	int cnt, ret = QUOTA_OK;
+	int chuid = iattr->ia_valid & ATTR_UID && inode->i_uid != iattr->ia_uid,
+	    chgid = iattr->ia_valid & ATTR_GID && inode->i_gid != iattr->ia_gid;
 	char warntype_to[MAXQUOTAS];
 	char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
 
@@ -1479,21 +1494,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
-	/* Clear the arrays */
+	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
+		transfer_from[cnt] = NODQUOT;
+		transfer_to[cnt] = NODQUOT;
 		warntype_to[cnt] = QUOTA_NL_NOWARN;
-	}
-	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	/* Now recheck reliably when holding dqptr_sem */
-	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
-		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		return QUOTA_OK;
-	}
-	/* First build the transfer_to list - here we can block on
-	 * reading/instantiating of dquots.  We know that the transaction for
-	 * us was already started so we don't violate lock ranking here */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		switch (cnt) {
 			case USRQUOTA:
 				if (!chuid)
@@ -1507,6 +1512,13 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 				break;
 		}
 	}
+
+	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
+	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
+		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		goto put_all;
+	}
 	spin_lock(&dq_data_lock);
 	space = inode_get_bytes(inode);
 	/* Build the transfer_from list and check the limits */
@@ -1517,7 +1529,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
 		    NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
 		    warntype_to + cnt) == NO_QUOTA)
-			goto warn_put_all;
+			goto over_quota;
 	}
 
 	/*
@@ -1545,28 +1557,37 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 
 		inode->i_dquot[cnt] = transfer_to[cnt];
 	}
-	ret = QUOTA_OK;
-warn_put_all:
 	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+
 	/* Dirtify all the dquots - this can block when journalling */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (transfer_from[cnt])
 			mark_dquot_dirty(transfer_from[cnt]);
-		if (transfer_to[cnt])
+		if (transfer_to[cnt]) {
 			mark_dquot_dirty(transfer_to[cnt]);
+			/* The reference we got is transferred to the inode */
+			transfer_to[cnt] = NODQUOT;
+		}
 	}
+warn_put_all:
 	flush_warnings(transfer_to, warntype_to);
 	flush_warnings(transfer_from, warntype_from_inodes);
 	flush_warnings(transfer_from, warntype_from_space);
-	
+put_all:
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT)
-			dqput(transfer_from[cnt]);
-		if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
-			dqput(transfer_to[cnt]);
+		dqput(transfer_from[cnt]);
+		dqput(transfer_to[cnt]);
 	}
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
+over_quota:
+	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Clear dquot pointers we don't want to dqput() */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		transfer_from[cnt] = NODQUOT;
+	ret = NO_QUOTA;
+	goto warn_put_all;
 }
 
 /* Wrapper for transferring ownership of an inode */
@@ -1651,19 +1672,24 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 			continue;
 
 		if (flags & DQUOT_SUSPENDED) {
+			spin_lock(&dq_state_lock);
 			dqopt->flags |=
 				dquot_state_flag(DQUOT_SUSPENDED, cnt);
+			spin_unlock(&dq_state_lock);
 		} else {
+			spin_lock(&dq_state_lock);
 			dqopt->flags &= ~dquot_state_flag(flags, cnt);
 			/* Turning off suspended quotas? */
 			if (!sb_has_quota_loaded(sb, cnt) &&
 			    sb_has_quota_suspended(sb, cnt)) {
 				dqopt->flags &=	~dquot_state_flag(
 							DQUOT_SUSPENDED, cnt);
+				spin_unlock(&dq_state_lock);
 				iput(dqopt->files[cnt]);
 				dqopt->files[cnt] = NULL;
 				continue;
 			}
+			spin_unlock(&dq_state_lock);
 		}
 
 		/* We still have to keep quota loaded? */
@@ -1830,7 +1856,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	}
 	mutex_unlock(&dqopt->dqio_mutex);
 	mutex_unlock(&inode->i_mutex);
+	spin_lock(&dq_state_lock);
 	dqopt->flags |= dquot_state_flag(flags, type);
+	spin_unlock(&dq_state_lock);
 
 	add_dquot_ref(sb, type);
 	mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1872,9 +1900,11 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
 	}
 	inode = dqopt->files[type];
 	dqopt->files[type] = NULL;
+	spin_lock(&dq_state_lock);
 	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
 						DQUOT_LIMITS_ENABLED, type);
 	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
+	spin_unlock(&dq_state_lock);
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
 	flags = dquot_generic_flag(flags, type);
@@ -1952,7 +1982,9 @@ int vfs_quota_enable(struct inode *inode, int type, int format_id,
 			ret = -EBUSY;
 			goto out_lock;
 		}
+		spin_lock(&dq_state_lock);
 		sb_dqopt(sb)->flags |= dquot_state_flag(flags, type);
+		spin_unlock(&dq_state_lock);
 out_lock:
 		mutex_unlock(&dqopt->dqonoff_mutex);
 		return ret;
@@ -2039,14 +2071,12 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
-	if (!(dquot = dqget(sb, id, type))) {
-		mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+	dquot = dqget(sb, id, type);
+	if (dquot == NODQUOT)
 		return -ESRCH;
-	}
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
-	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+
 	return 0;
 }
 
@@ -2130,7 +2160,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	struct dquot *dquot;
 	int rc;
 
-	mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
 	dquot = dqget(sb, id, type);
 	if (!dquot) {
 		rc = -ESRCH;
@@ -2139,7 +2168,6 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 	rc = do_set_dqblk(dquot, di);
 	dqput(dquot);
 out:
-	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return rc;
 }
 
@@ -2370,11 +2398,9 @@ EXPORT_SYMBOL(dquot_release);
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 EXPORT_SYMBOL(dquot_initialize);
 EXPORT_SYMBOL(dquot_drop);
-EXPORT_SYMBOL(dquot_drop_locked);
 EXPORT_SYMBOL(vfs_dq_drop);
 EXPORT_SYMBOL(dqget);
 EXPORT_SYMBOL(dqput);
-EXPORT_SYMBOL(dquot_is_cached);
 EXPORT_SYMBOL(dquot_alloc_space);
 EXPORT_SYMBOL(dquot_alloc_inode);
 EXPORT_SYMBOL(dquot_free_space);
-- 
cgit v1.2.3


From dedb0d48a9d4d57086526b94a4b64da789a646e4 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 9 Jan 2009 21:02:37 +0200
Subject: UBIFS: do not commit twice

VFS calls '->sync_fs()' twice - first time with @wait = 0, second
time with @wait = 1. As a result, we may commit and synchronize
write-buffers twice. Avoid doing this by returning immediately if
@wait = 0.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee7251..a7fc97f4d9d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -432,18 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
 	struct writeback_control wbc = {
-		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sync_mode   = WB_SYNC_ALL,
 		.range_start = 0,
 		.range_end   = LLONG_MAX,
 		.nr_to_write = LONG_MAX,
 	};
 
 	/*
-	 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
-	 * advisory thing to help the file system shove lots of data into the
-	 * queues. If some gets missed then it'll be picked up on the second
+	 * Zero @wait is just an advisory thing to help the file system shove
+	 * lots of data into the queues, and there will be the second
 	 * '->sync_fs()' call, with non-zero @wait.
 	 */
+	if (!wait)
+		return 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
-- 
cgit v1.2.3


From e8b815663b1bfd9c255af5176604ec0eafdf6ed7 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 15 Jan 2009 17:43:23 +0200
Subject: UBIFS: constify operations

Mark super, file, and inode operation structures with 'const'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/dir.c   |  4 ++--
 fs/ubifs/file.c  |  8 ++++----
 fs/ubifs/super.c |  2 +-
 fs/ubifs/ubifs.h | 14 +++++++-------
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1f9c3..d29b771cce4 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1199,7 +1199,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-struct inode_operations ubifs_dir_inode_operations = {
+const struct inode_operations ubifs_dir_inode_operations = {
 	.lookup      = ubifs_lookup,
 	.create      = ubifs_create,
 	.link        = ubifs_link,
@@ -1219,7 +1219,7 @@ struct inode_operations ubifs_dir_inode_operations = {
 #endif
 };
 
-struct file_operations ubifs_dir_operations = {
+const struct file_operations ubifs_dir_operations = {
 	.llseek         = ubifs_dir_llseek,
 	.release        = ubifs_dir_release,
 	.read           = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374567f..17443d97e6f 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1541,7 +1541,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-struct address_space_operations ubifs_file_address_operations = {
+const struct address_space_operations ubifs_file_address_operations = {
 	.readpage       = ubifs_readpage,
 	.writepage      = ubifs_writepage,
 	.write_begin    = ubifs_write_begin,
@@ -1551,7 +1551,7 @@ struct address_space_operations ubifs_file_address_operations = {
 	.releasepage    = ubifs_releasepage,
 };
 
-struct inode_operations ubifs_file_inode_operations = {
+const struct inode_operations ubifs_file_inode_operations = {
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 #ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1562,14 @@ struct inode_operations ubifs_file_inode_operations = {
 #endif
 };
 
-struct inode_operations ubifs_symlink_inode_operations = {
+const struct inode_operations ubifs_symlink_inode_operations = {
 	.readlink    = generic_readlink,
 	.follow_link = ubifs_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 };
 
-struct file_operations ubifs_file_operations = {
+const struct file_operations ubifs_file_operations = {
 	.llseek         = generic_file_llseek,
 	.read           = do_sync_read,
 	.write          = do_sync_write,
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a7fc97f4d9d..53811e567a6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1778,7 +1778,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-struct super_operations ubifs_super_operations = {
+const struct super_operations ubifs_super_operations = {
 	.alloc_inode   = ubifs_alloc_inode,
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc66d0..0881897a420 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1405,13 +1405,13 @@ extern struct list_head ubifs_infos;
 extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
-extern struct super_operations ubifs_super_operations;
-extern struct address_space_operations ubifs_file_address_operations;
-extern struct file_operations ubifs_file_operations;
-extern struct inode_operations ubifs_file_inode_operations;
-extern struct file_operations ubifs_dir_operations;
-extern struct inode_operations ubifs_dir_inode_operations;
-extern struct inode_operations ubifs_symlink_inode_operations;
+extern const struct super_operations ubifs_super_operations;
+extern const struct address_space_operations ubifs_file_address_operations;
+extern const struct file_operations ubifs_file_operations;
+extern const struct inode_operations ubifs_file_inode_operations;
+extern const struct file_operations ubifs_dir_operations;
+extern const struct inode_operations ubifs_dir_inode_operations;
+extern const struct inode_operations ubifs_symlink_inode_operations;
 extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
-- 
cgit v1.2.3


From ab596ad8972f314ace538799734c7e1bdd1da2ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:02:57 +0100
Subject: xfs: fix dentry aliasing issues in open_by_handle

Open by handle just grabs an inode by handle and then creates itself
a dentry for it.  While this works for regular files it is horribly
broken for directories, where the VFS locking relies on the fact that
there is only just one single dentry for a given inode, and that
these are always connected to the root of the filesystem so that
its locking algorithms work (see Documentation/filesystems/Locking).

Remove all the existing open by handle code and replace it with a small
wrapper around the exportfs code which deals with all these issues.
At the same time we also make the checks for a valid handle strict
enough to reject all not perfectly well formed handles - given that
we never hand out others that's okay and simplifies the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/Kconfig                 |   1 +
 fs/xfs/linux-2.6/xfs_ioctl.c   | 305 ++++++++++++++++++-----------------------
 fs/xfs/linux-2.6/xfs_ioctl.h   |  15 +-
 fs/xfs/linux-2.6/xfs_ioctl32.c | 175 +++++++----------------
 4 files changed, 196 insertions(+), 300 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 3f53dd101f9..29228f5899c 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
 config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
+	select EXPORTFS
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index e5be1e0be80..4bd112313f3 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -50,12 +50,14 @@
 #include "xfs_vnodeops.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
+#include "xfs_export.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/exportfs.h>
 
 /*
  * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -164,97 +166,69 @@ xfs_find_handle(
 	return 0;
 }
 
-
 /*
- * Convert userspace handle data into inode.
- *
- * We use the fact that all the fsop_handlereq ioctl calls have a data
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
  */
 STATIC int
-xfs_vget_fsop_handlereq(
-	xfs_mount_t		*mp,
-	struct inode		*parinode,	/* parent inode pointer    */
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		**inode)
+xfs_handle_acceptable(
+	void			*context,
+	struct dentry		*dentry)
+{
+	return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen)
 {
-	void			__user *hanp;
-	size_t			hlen;
-	xfs_fid_t		*xfid;
-	xfs_handle_t		*handlep;
 	xfs_handle_t		handle;
-	xfs_inode_t		*ip;
-	xfs_ino_t		ino;
-	__u32			igen;
-	int			error;
+	struct xfs_fid64	fid;
 
 	/*
 	 * Only allow handle opens under a directory.
 	 */
-	if (!S_ISDIR(parinode->i_mode))
-		return XFS_ERROR(ENOTDIR);
-
-	hanp = hreq->ihandle;
-	hlen = hreq->ihandlen;
-	handlep = &handle;
-
-	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-		return XFS_ERROR(EINVAL);
-	if (copy_from_user(handlep, hanp, hlen))
-		return XFS_ERROR(EFAULT);
-	if (hlen < sizeof(*handlep))
-		memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-	if (hlen > sizeof(handlep->ha_fsid)) {
-		if (handlep->ha_fid.fid_len !=
-		    (hlen - sizeof(handlep->ha_fsid) -
-		            sizeof(handlep->ha_fid.fid_len)) ||
-		    handlep->ha_fid.fid_pad)
-			return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Crack the handle, obtain the inode # & generation #
-	 */
-	xfid = (struct xfs_fid *)&handlep->ha_fid;
-	if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-		ino  = xfid->fid_ino;
-		igen = xfid->fid_gen;
-	} else {
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Get the XFS inode, building a Linux inode to go with it.
-	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
-		return error;
-	if (ip == NULL)
-		return XFS_ERROR(EIO);
-	if (ip->i_d.di_gen != igen) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-		return XFS_ERROR(ENOENT);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+		return ERR_PTR(-ENOTDIR);
+
+	if (hlen != sizeof(xfs_handle_t))
+		return ERR_PTR(-EINVAL);
+	if (copy_from_user(&handle, uhandle, hlen))
+		return ERR_PTR(-EFAULT);
+	if (handle.ha_fid.fid_len !=
+	    sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+		return ERR_PTR(-EINVAL);
+
+	memset(&fid, 0, sizeof(struct fid));
+	fid.ino = handle.ha_fid.fid_ino;
+	fid.gen = handle.ha_fid.fid_gen;
+
+	return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+			FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+			xfs_handle_acceptable, NULL);
+}
 
-	*inode = VFS_I(ip);
-	return 0;
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
+{
+	return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
 }
 
 int
 xfs_open_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
 	struct file		*parfilp,
-	struct inode		*parinode)
+	xfs_fsop_handlereq_t	*hreq)
 {
 	const struct cred	*cred = current_cred();
 	int			error;
-	int			new_fd;
+	int			fd;
 	int			permflag;
 	struct file		*filp;
 	struct inode		*inode;
@@ -263,19 +237,21 @@ xfs_open_by_handle(
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	inode = dentry->d_inode;
 
 	/* Restrict xfs_open_by_handle to directories & regular files. */
 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-		iput(inode);
-		return -XFS_ERROR(EINVAL);
+		error = -XFS_ERROR(EPERM);
+		goto out_dput;
 	}
 
 #if BITS_PER_LONG != 32
 	hreq->oflags |= O_LARGEFILE;
 #endif
+
 	/* Put open permission in namei format. */
 	permflag = hreq->oflags;
 	if ((permflag+1) & O_ACCMODE)
@@ -285,50 +261,45 @@ xfs_open_by_handle(
 
 	if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
 	    (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EPERM);
+		error = -XFS_ERROR(EPERM);
+		goto out_dput;
 	}
 
 	if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-		iput(inode);
-		return -XFS_ERROR(EACCES);
+		error = -XFS_ERROR(EACCES);
+		goto out_dput;
 	}
 
 	/* Can't write directories. */
-	if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
-		iput(inode);
-		return -XFS_ERROR(EISDIR);
+	if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+		error = -XFS_ERROR(EISDIR);
+		goto out_dput;
 	}
 
-	if ((new_fd = get_unused_fd()) < 0) {
-		iput(inode);
-		return new_fd;
+	fd = get_unused_fd();
+	if (fd < 0) {
+		error = fd;
+		goto out_dput;
 	}
 
-	dentry = d_obtain_alias(inode);
-	if (IS_ERR(dentry)) {
-		put_unused_fd(new_fd);
-		return PTR_ERR(dentry);
-	}
-
-	/* Ensure umount returns EBUSY on umounts while this file is open. */
-	mntget(parfilp->f_path.mnt);
-
-	/* Create file pointer. */
-	filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
+	filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
+			   hreq->oflags, cred);
 	if (IS_ERR(filp)) {
-		put_unused_fd(new_fd);
-		return -XFS_ERROR(-PTR_ERR(filp));
+		put_unused_fd(fd);
+		return PTR_ERR(filp);
 	}
 
 	if (inode->i_mode & S_IFREG) {
-		/* invisible operation should not change atime */
 		filp->f_flags |= O_NOATIME;
 		filp->f_mode |= FMODE_NOCMTIME;
 	}
 
-	fd_install(new_fd, filp);
-	return new_fd;
+	fd_install(fd, filp);
+	return fd;
+
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 /*
@@ -359,11 +330,10 @@ do_readlink(
 
 int
 xfs_readlink_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq)
 {
-	struct inode		*inode;
+	struct dentry		*dentry;
 	__u32			olen;
 	void			*link;
 	int			error;
@@ -371,26 +341,28 @@ xfs_readlink_by_handle(
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	/* Restrict this handle operation to symlinks only. */
-	if (!S_ISLNK(inode->i_mode)) {
+	if (!S_ISLNK(dentry->d_inode->i_mode)) {
 		error = -XFS_ERROR(EINVAL);
-		goto out_iput;
+		goto out_dput;
 	}
 
 	if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
 		error = -XFS_ERROR(EFAULT);
-		goto out_iput;
+		goto out_dput;
 	}
 
 	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-	if (!link)
-		goto out_iput;
+	if (!link) {
+		error = -XFS_ERROR(ENOMEM);
+		goto out_dput;
+	}
 
-	error = -xfs_readlink(XFS_I(inode), link);
+	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (error)
 		goto out_kfree;
 	error = do_readlink(hreq->ohandle, olen, link);
@@ -399,32 +371,31 @@ xfs_readlink_by_handle(
 
  out_kfree:
 	kfree(link);
- out_iput:
-	iput(inode);
+ out_dput:
+	dput(dentry);
 	return error;
 }
 
 STATIC int
 xfs_fssetdm_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	struct fsdmidata	fsd;
 	xfs_fsop_setdm_handlereq_t dmhreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
 	if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &inode);
-	if (error)
-		return -error;
+	dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
 		error = -XFS_ERROR(EPERM);
 		goto out;
 	}
@@ -434,24 +405,23 @@ xfs_fssetdm_by_handle(
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
  out:
-	iput(inode);
+	dput(dentry);
 	return error;
 }
 
 STATIC int
 xfs_attrlist_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
-	int			error;
+	int			error = -ENOMEM;
 	attrlist_cursor_kern_t	*cursor;
 	xfs_fsop_attrlist_handlereq_t al_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -467,16 +437,16 @@ xfs_attrlist_by_handle(
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
 		return -XFS_ERROR(EINVAL);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode);
-	if (error)
-		goto out;
+	dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
 	if (!kbuf)
-		goto out_vn_rele;
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -486,10 +456,9 @@ xfs_attrlist_by_handle(
 
  out_kfree:
 	kfree(kbuf);
- out_vn_rele:
-	iput(inode);
- out:
-	return -error;
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 int
@@ -564,15 +533,13 @@ xfs_attrmulti_attr_remove(
 
 STATIC int
 xfs_attrmulti_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
 	struct file		*parfilp,
-	struct inode		*parinode)
+	void			__user *arg)
 {
 	int			error;
 	xfs_attr_multiop_t	*ops;
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	unsigned int		i, size;
 	char			*attr_name;
 
@@ -581,19 +548,19 @@ xfs_attrmulti_by_handle(
 	if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &inode);
-	if (error)
-		goto out;
+	dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	error = E2BIG;
 	size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = ENOMEM;
 	ops = kmalloc(size, GFP_KERNEL);
 	if (!ops)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = EFAULT;
 	if (copy_from_user(ops, am_hreq.ops, size))
@@ -615,25 +582,28 @@ xfs_attrmulti_by_handle(
 
 		switch (ops[i].am_opcode) {
 		case ATTR_OP_GET:
-			ops[i].am_error = xfs_attrmulti_attr_get(inode,
-					attr_name, ops[i].am_attrvalue,
-					&ops[i].am_length, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					dentry->d_inode, attr_name,
+					ops[i].am_attrvalue, &ops[i].am_length,
+					ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
 			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
 			if (ops[i].am_error)
 				break;
-			ops[i].am_error = xfs_attrmulti_attr_set(inode,
-					attr_name, ops[i].am_attrvalue,
-					ops[i].am_length, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					dentry->d_inode, attr_name,
+					ops[i].am_attrvalue, ops[i].am_length,
+					ops[i].am_flags);
 			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		case ATTR_OP_REMOVE:
 			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
 			if (ops[i].am_error)
 				break;
-			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
-					attr_name, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					dentry->d_inode, attr_name,
+					ops[i].am_flags);
 			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		default:
@@ -647,9 +617,8 @@ xfs_attrmulti_by_handle(
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
- out_vn_rele:
-	iput(inode);
- out:
+ out_dput:
+	dput(dentry);
 	return -error;
 }
 
@@ -1440,23 +1409,23 @@ xfs_file_ioctl(
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
 			return -XFS_ERROR(EFAULT);
-		return xfs_open_by_handle(mp, &hreq, filp, inode);
+		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_FSSETDM_BY_HANDLE:
-		return xfs_fssetdm_by_handle(mp, arg, inode);
+		return xfs_fssetdm_by_handle(filp, arg);
 
 	case XFS_IOC_READLINK_BY_HANDLE: {
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
 			return -XFS_ERROR(EFAULT);
-		return xfs_readlink_by_handle(mp, &hreq, inode);
+		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE:
-		return xfs_attrlist_by_handle(mp, arg, inode);
+		return xfs_attrlist_by_handle(filp, arg);
 
 	case XFS_IOC_ATTRMULTI_BY_HANDLE:
-		return xfs_attrmulti_by_handle(mp, arg, filp, inode);
+		return xfs_attrmulti_by_handle(filp, arg);
 
 	case XFS_IOC_SWAPEXT: {
 		struct xfs_swapext	sxp;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
index 8c16bf2d7e0..7bd7c6afc1e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -34,16 +34,13 @@ xfs_find_handle(
 
 extern int
 xfs_open_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
 	struct file		*parfilp,
-	struct inode		*parinode);
+	xfs_fsop_handlereq_t	*hreq);
 
 extern int
 xfs_readlink_by_handle(
-	xfs_mount_t		*mp,
-	xfs_fsop_handlereq_t	*hreq,
-	struct inode		*parinode);
+	struct file		*parfilp,
+	xfs_fsop_handlereq_t	*hreq);
 
 extern int
 xfs_attrmulti_attr_get(
@@ -67,6 +64,12 @@ xfs_attrmulti_attr_remove(
 	char			*name,
 	__uint32_t		flags);
 
+extern struct dentry *
+xfs_handle_to_dentry(
+	struct file		*parfilp,
+	void __user		*uhandle,
+	u32			hlen);
+
 extern long
 xfs_file_ioctl(
 	struct file		*filp,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 50903ad3182..fd4362063f2 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -340,96 +340,24 @@ xfs_compat_handlereq_copyin(
 	return 0;
 }
 
-/*
- * Convert userspace handle data into inode.
- *
- * We use the fact that all the fsop_handlereq ioctl calls have a data
- * structure argument whose first component is always a xfs_fsop_handlereq_t,
- * so we can pass that sub structure into this handy, shared routine.
- *
- * If no error, caller must always iput the returned inode.
- */
-STATIC int
-xfs_vget_fsop_handlereq_compat(
-	xfs_mount_t		*mp,
-	struct inode		*parinode,	/* parent inode pointer    */
-	compat_xfs_fsop_handlereq_t	*hreq,
-	struct inode		**inode)
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+	struct file		*parfilp,
+	compat_xfs_fsop_handlereq_t *hreq)
 {
-	void			__user *hanp;
-	size_t			hlen;
-	xfs_fid_t		*xfid;
-	xfs_handle_t		*handlep;
-	xfs_handle_t		handle;
-	xfs_inode_t		*ip;
-	xfs_ino_t		ino;
-	__u32			igen;
-	int			error;
-
-	/*
-	 * Only allow handle opens under a directory.
-	 */
-	if (!S_ISDIR(parinode->i_mode))
-		return XFS_ERROR(ENOTDIR);
-
-	hanp = compat_ptr(hreq->ihandle);
-	hlen = hreq->ihandlen;
-	handlep = &handle;
-
-	if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
-		return XFS_ERROR(EINVAL);
-	if (copy_from_user(handlep, hanp, hlen))
-		return XFS_ERROR(EFAULT);
-	if (hlen < sizeof(*handlep))
-		memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
-	if (hlen > sizeof(handlep->ha_fsid)) {
-		if (handlep->ha_fid.fid_len !=
-		    (hlen - sizeof(handlep->ha_fsid) -
-			    sizeof(handlep->ha_fid.fid_len)) ||
-		    handlep->ha_fid.fid_pad)
-			return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Crack the handle, obtain the inode # & generation #
-	 */
-	xfid = (struct xfs_fid *)&handlep->ha_fid;
-	if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
-		ino  = xfid->fid_ino;
-		igen = xfid->fid_gen;
-	} else {
-		return XFS_ERROR(EINVAL);
-	}
-
-	/*
-	 * Get the XFS inode, building a Linux inode to go with it.
-	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
-	if (error)
-		return error;
-	if (ip == NULL)
-		return XFS_ERROR(EIO);
-	if (ip->i_d.di_gen != igen) {
-		xfs_iput_new(ip, XFS_ILOCK_SHARED);
-		return XFS_ERROR(ENOENT);
-	}
-
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-	*inode = VFS_I(ip);
-	return 0;
+	return xfs_handle_to_dentry(parfilp,
+			compat_ptr(hreq->ihandle), hreq->ihandlen);
 }
 
 STATIC int
 xfs_compat_attrlist_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	attrlist_cursor_kern_t	*cursor;
 	compat_xfs_fsop_attrlist_handlereq_t al_hreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -446,17 +374,17 @@ xfs_compat_attrlist_by_handle(
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
 		return -XFS_ERROR(EINVAL);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
-					       &inode);
-	if (error)
-		goto out;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
+	error = -ENOMEM;
 	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
 	if (!kbuf)
-		goto out_vn_rele;
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
 					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -466,22 +394,20 @@ xfs_compat_attrlist_by_handle(
 
  out_kfree:
 	kfree(kbuf);
- out_vn_rele:
-	iput(inode);
- out:
-	return -error;
+ out_dput:
+	dput(dentry);
+	return error;
 }
 
 STATIC int
 xfs_compat_attrmulti_by_handle(
-	xfs_mount_t				*mp,
-	void					__user *arg,
-	struct inode				*parinode)
+	struct file				*parfilp,
+	void					__user *arg)
 {
 	int					error;
 	compat_xfs_attr_multiop_t		*ops;
 	compat_xfs_fsop_attrmulti_handlereq_t	am_hreq;
-	struct inode				*inode;
+	struct dentry				*dentry;
 	unsigned int				i, size;
 	char					*attr_name;
 
@@ -491,20 +417,19 @@ xfs_compat_attrmulti_by_handle(
 			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
-					       &inode);
-	if (error)
-		goto out;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
 	error = E2BIG;
 	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = ENOMEM;
 	ops = kmalloc(size, GFP_KERNEL);
 	if (!ops)
-		goto out_vn_rele;
+		goto out_dput;
 
 	error = EFAULT;
 	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
@@ -527,20 +452,21 @@ xfs_compat_attrmulti_by_handle(
 
 		switch (ops[i].am_opcode) {
 		case ATTR_OP_GET:
-			ops[i].am_error = xfs_attrmulti_attr_get(inode,
-					attr_name,
+			ops[i].am_error = xfs_attrmulti_attr_get(
+					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					&ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
-			ops[i].am_error = xfs_attrmulti_attr_set(inode,
-					attr_name,
+			ops[i].am_error = xfs_attrmulti_attr_set(
+					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_REMOVE:
-			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
-					attr_name, ops[i].am_flags);
+			ops[i].am_error = xfs_attrmulti_attr_remove(
+					dentry->d_inode, attr_name,
+					ops[i].am_flags);
 			break;
 		default:
 			ops[i].am_error = EINVAL;
@@ -553,22 +479,20 @@ xfs_compat_attrmulti_by_handle(
 	kfree(attr_name);
  out_kfree_ops:
 	kfree(ops);
- out_vn_rele:
-	iput(inode);
- out:
+ out_dput:
+	dput(dentry);
 	return -error;
 }
 
 STATIC int
 xfs_compat_fssetdm_by_handle(
-	xfs_mount_t		*mp,
-	void			__user *arg,
-	struct inode		*parinode)
+	struct file		*parfilp,
+	void			__user *arg)
 {
 	int			error;
 	struct fsdmidata	fsd;
 	compat_xfs_fsop_setdm_handlereq_t dmhreq;
-	struct inode		*inode;
+	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
 		return -XFS_ERROR(EPERM);
@@ -576,12 +500,11 @@ xfs_compat_fssetdm_by_handle(
 			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
 
-	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
-					       &inode);
-	if (error)
-		return -error;
+	dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
 
-	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
 		error = -XFS_ERROR(EPERM);
 		goto out;
 	}
@@ -591,11 +514,11 @@ xfs_compat_fssetdm_by_handle(
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
 				 fsd.fsd_dmstate);
 
 out:
-	iput(inode);
+	dput(dentry);
 	return error;
 }
 
@@ -722,21 +645,21 @@ xfs_file_compat_ioctl(
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
 			return -XFS_ERROR(EFAULT);
-		return xfs_open_by_handle(mp, &hreq, filp, inode);
+		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_READLINK_BY_HANDLE_32: {
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
 			return -XFS_ERROR(EFAULT);
-		return xfs_readlink_by_handle(mp, &hreq, inode);
+		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE_32:
-		return xfs_compat_attrlist_by_handle(mp, arg, inode);
+		return xfs_compat_attrlist_by_handle(filp, arg);
 	case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
-		return xfs_compat_attrmulti_by_handle(mp, arg, inode);
+		return xfs_compat_attrmulti_by_handle(filp, arg);
 	case XFS_IOC_FSSETDM_BY_HANDLE_32:
-		return xfs_compat_fssetdm_by_handle(mp, arg, inode);
+		return xfs_compat_fssetdm_by_handle(filp, arg);
 	default:
 		return -XFS_ERROR(ENOIOCTLCMD);
 	}
-- 
cgit v1.2.3


From 62e194ecdaf8a1935991c1f8704886328d96a391 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:03 +0100
Subject: xfs: use mnt_want_write in compat_attrmulti ioctl

The compat version of the attrmulti ioctl needs to ask for and then
later release write access to the mount just like the native version,
otherwise we could potentially write to read-only mounts.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_ioctl32.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index fd4362063f2..c70c4e3db79 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -17,6 +17,7 @@
  */
 #include <linux/compat.h>
 #include <linux/ioctl.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "xfs.h"
 #include "xfs_fs.h"
@@ -458,15 +459,23 @@ xfs_compat_attrmulti_by_handle(
 					&ops[i].am_length, ops[i].am_flags);
 			break;
 		case ATTR_OP_SET:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_set(
 					dentry->d_inode, attr_name,
 					compat_ptr(ops[i].am_attrvalue),
 					ops[i].am_length, ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		case ATTR_OP_REMOVE:
+			ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+			if (ops[i].am_error)
+				break;
 			ops[i].am_error = xfs_attrmulti_attr_remove(
 					dentry->d_inode, attr_name,
 					ops[i].am_flags);
+			mnt_drop_write(parfilp->f_path.mnt);
 			break;
 		default:
 			ops[i].am_error = EINVAL;
-- 
cgit v1.2.3


From 080dda7f5e8e8df95bcd17a5345c276e365a2054 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:11 +0100
Subject: xfs: add a separate lock class for the per-mount list of dquots

We can have both a quota hash chain and the per-mount list locked at
the same time.  But given that both use the same struct dqhash as list
head we have to tell lockdep that they are different lock classes.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_qm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 6b13960cf31..7a2beb64314 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1070,6 +1070,13 @@ xfs_qm_sync(
 	return 0;
 }
 
+/*
+ * The hash chains and the mplist use the same xfs_dqhash structure as
+ * their list head, but we can take the mplist qh_lock and one of the
+ * hash qh_locks at the same time without any problem as they aren't
+ * related.
+ */
+static struct lock_class_key xfs_quota_mplist_class;
 
 /*
  * This initializes all the quota information that's kept in the
@@ -1105,6 +1112,8 @@ xfs_qm_init_quotainfo(
 	}
 
 	xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
+	lockdep_set_class(&qinf->qi_dqlist.qh_lock, &xfs_quota_mplist_class);
+
 	qinf->qi_dqreclaims = 0;
 
 	/* mutex used to serialize quotaoffs */
-- 
cgit v1.2.3


From 4f2d4ac6e5eb7d72e8df7f3fbf67a78dab8b91cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:19 +0100
Subject: xfs: lockdep annotations for xfs_dqlock2

xfs_dqlock2 locks two xfs_dquots, which is fine as it always locks the
dquot with the lower id first.  Use mutex_lock_nested to tell lockdep
about this fact.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_dquot.c | 24 ++++++++++++++----------
 fs/xfs/quota/xfs_dquot.h | 10 ++++++++++
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 591ca6602bf..36d1bb6140d 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1383,6 +1383,12 @@ xfs_dqunlock_nonotify(
 	mutex_unlock(&(dqp->q_qlock));
 }
 
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowerd id first.
+ */
 void
 xfs_dqlock2(
 	xfs_dquot_t	*d1,
@@ -1392,18 +1398,16 @@ xfs_dqlock2(
 		ASSERT(d1 != d2);
 		if (be32_to_cpu(d1->q_core.d_id) >
 		    be32_to_cpu(d2->q_core.d_id)) {
-			xfs_dqlock(d2);
-			xfs_dqlock(d1);
+			mutex_lock(&d2->q_qlock);
+			mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
 		} else {
-			xfs_dqlock(d1);
-			xfs_dqlock(d2);
-		}
-	} else {
-		if (d1) {
-			xfs_dqlock(d1);
-		} else if (d2) {
-			xfs_dqlock(d2);
+			mutex_lock(&d1->q_qlock);
+			mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
 		}
+	} else if (d1) {
+		mutex_lock(&d1->q_qlock);
+	} else if (d2) {
+		mutex_lock(&d2->q_qlock);
 	}
 }
 
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 7e455337e2b..d443e93b433 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -97,6 +97,16 @@ typedef struct xfs_dquot {
 #define dq_hashlist	q_lists.dqm_hashlist
 #define dq_flags	q_lists.dqm_flags
 
+/*
+ * Lock hierachy for q_qlock:
+ *	XFS_QLOCK_NORMAL is the implicit default,
+ * 	XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+	XFS_QLOCK_NORMAL = 0,
+	XFS_QLOCK_NESTED,
+};
+
 #define XFS_DQHOLD(dqp)		((dqp)->q_nrefs++)
 
 #ifdef DEBUG
-- 
cgit v1.2.3


From 5aa2dc0a0697c762874241fa9ddbecd2d878b934 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:03:25 +0100
Subject: xfs: add a lock class for group/project dquots

We can have both a user and a group/project dquot locked at the same time,
as long as the user dquot is locked first.  Tell lockdep about that fact
by making the group/project dquots a different lock class.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/quota/xfs_dquot.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 36d1bb6140d..f0bc7846580 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -73,6 +73,8 @@ int xfs_dqreq_num;
 int xfs_dqerror_mod = 33;
 #endif
 
+static struct lock_class_key xfs_dquot_other_class;
+
 /*
  * Allocate and initialize a dquot. We don't always allocate fresh memory;
  * we try to reclaim a free dquot if the number of incore dquots are above
@@ -139,7 +141,15 @@ xfs_qm_dqinit(
 		 ASSERT(dqp->q_trace);
 		 xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT");
 #endif
-	 }
+	}
+
+	/*
+	 * In either case we need to make sure group quotas have a different
+	 * lock class than user quotas, to make sure lockdep knows we can
+	 * locks of one of each at the same time.
+	 */
+	if (!(type & XFS_DQ_USER))
+		lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
 
 	/*
 	 * log item gets initialized later
-- 
cgit v1.2.3


From 49739140e57a65114d9e1976c4c158d2145595fb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:04:07 +0100
Subject: xfs: fix bad_features2 fixups for the root filesystem

Currently the bad_features2 fixup and the alignment updates in the superblock
are skipped if we mount a filesystem read-only.  But for the root filesystem
the typical case is to mount read-only first and only later remount writeable
so we'll never perform this update at all.  It's not a big problem, but it
means the logs of people needing the fixup get spammed at every boot because
the fixups never make it to disk.

Reported-by: Arkadiusz Miskiewicz <arekm@maven.pl>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 17 ++++++++++++++++-
 fs/xfs/xfs_mount.c           | 26 +++++++++++++-------------
 fs/xfs/xfs_mount.h           |  3 +++
 3 files changed, 32 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 95a97108036..c71e226da7f 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1197,6 +1197,7 @@ xfs_fs_remount(
 	struct xfs_mount	*mp = XFS_M(sb);
 	substring_t		args[MAX_OPT_ARGS];
 	char			*p;
+	int			error;
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
@@ -1247,11 +1248,25 @@ xfs_fs_remount(
 		}
 	}
 
-	/* rw/ro -> rw */
+	/* ro -> rw */
 	if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
 		mp->m_flags &= ~XFS_MOUNT_RDONLY;
 		if (mp->m_flags & XFS_MOUNT_BARRIER)
 			xfs_mountfs_check_barriers(mp);
+
+		/*
+		 * If this is the first remount to writeable state we
+		 * might have some superblock changes to update.
+		 */
+		if (mp->m_update_flags) {
+			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+			if (error) {
+				cmn_err(CE_WARN,
+					"XFS: failed to write sb changes");
+				return error;
+			}
+			mp->m_update_flags = 0;
+		}
 	}
 
 	/* rw -> ro */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3c97c6463a4..35300250e86 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,6 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 
-STATIC int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 STATIC int	xfs_uuid_mount(xfs_mount_t *);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);
 
@@ -682,7 +681,7 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
  * Update alignment values based on mount options and sb values
  */
 STATIC int
-xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp)
 {
 	xfs_sb_t	*sbp = &(mp->m_sb);
 
@@ -736,11 +735,11 @@ xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
 		if (xfs_sb_version_hasdalign(sbp)) {
 			if (sbp->sb_unit != mp->m_dalign) {
 				sbp->sb_unit = mp->m_dalign;
-				*update_flags |= XFS_SB_UNIT;
+				mp->m_update_flags |= XFS_SB_UNIT;
 			}
 			if (sbp->sb_width != mp->m_swidth) {
 				sbp->sb_width = mp->m_swidth;
-				*update_flags |= XFS_SB_WIDTH;
+				mp->m_update_flags |= XFS_SB_WIDTH;
 			}
 		}
 	} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
@@ -905,7 +904,6 @@ xfs_mountfs(
 	xfs_sb_t	*sbp = &(mp->m_sb);
 	xfs_inode_t	*rip;
 	__uint64_t	resblks;
-	__int64_t	update_flags = 0LL;
 	uint		quotamount, quotaflags;
 	int		uuid_mounted = 0;
 	int		error = 0;
@@ -933,7 +931,7 @@ xfs_mountfs(
 			"XFS: correcting sb_features alignment problem");
 		sbp->sb_features2 |= sbp->sb_bad_features2;
 		sbp->sb_bad_features2 = sbp->sb_features2;
-		update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
+		mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
 
 		/*
 		 * Re-check for ATTR2 in case it was found in bad_features2
@@ -947,11 +945,11 @@ xfs_mountfs(
 	if (xfs_sb_version_hasattr2(&mp->m_sb) &&
 	   (mp->m_flags & XFS_MOUNT_NOATTR2)) {
 		xfs_sb_version_removeattr2(&mp->m_sb);
-		update_flags |= XFS_SB_FEATURES2;
+		mp->m_update_flags |= XFS_SB_FEATURES2;
 
 		/* update sb_versionnum for the clearing of the morebits */
 		if (!sbp->sb_features2)
-			update_flags |= XFS_SB_VERSIONNUM;
+			mp->m_update_flags |= XFS_SB_VERSIONNUM;
 	}
 
 	/*
@@ -960,7 +958,7 @@ xfs_mountfs(
 	 * allocator alignment is within an ag, therefore ag has
 	 * to be aligned at stripe boundary.
 	 */
-	error = xfs_update_alignment(mp, &update_flags);
+	error = xfs_update_alignment(mp);
 	if (error)
 		goto error1;
 
@@ -1137,10 +1135,12 @@ xfs_mountfs(
 	}
 
 	/*
-	 * If fs is not mounted readonly, then update the superblock changes.
+	 * If this is a read-only mount defer the superblock updates until
+	 * the next remount into writeable mode.  Otherwise we would never
+	 * perform the update e.g. for the root filesystem.
 	 */
-	if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		error = xfs_mount_log_sb(mp, update_flags);
+	if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		error = xfs_mount_log_sb(mp, mp->m_update_flags);
 		if (error) {
 			cmn_err(CE_WARN, "XFS: failed to write sb changes");
 			goto error4;
@@ -1820,7 +1820,7 @@ xfs_uuid_mount(
  * be altered by the mount options, as well as any potential sb_features2
  * fixup. Only the first superblock is updated.
  */
-STATIC int
+int
 xfs_mount_log_sb(
 	xfs_mount_t	*mp,
 	__int64_t	fields)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c1e02846732..e37eff6761e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -327,6 +327,8 @@ typedef struct xfs_mount {
 	spinlock_t		m_sync_lock;	/* work item list lock */
 	int			m_sync_seq;	/* sync thread generation no. */
 	wait_queue_head_t	m_wait_single_sync_task;
+	__int64_t		m_update_flags;	/* sb flags we need to update
+						   on the next remount,rw */
 } xfs_mount_t;
 
 /*
@@ -514,6 +516,7 @@ extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
 			int64_t, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
 			uint, int);
+extern int	xfs_mount_log_sb(xfs_mount_t *, __int64_t);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int	xfs_readsb(xfs_mount_t *, int);
 extern void	xfs_freesb(xfs_mount_t *);
-- 
cgit v1.2.3


From b828d8c33867dd6479644c06500975570bfd525c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 19 Jan 2009 02:04:16 +0100
Subject: xfs: sanity check attr fork size

Recently we have had quite a few kerneloops reports about dereferencing a
NULL if_data in the attribute fork.  From looking over the code this can only
happen if we pass a 0 size argument to xfs_iformat_local.  This implies some
sort of corruption, and in fact the only mailing list report about this from
earlier this year was after a power failure, presumably on a system with a
write cache and without barriers.

Add a quick sanity check for the attr fork size in xfs_iformat to catch
these early and without an oops.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_inode.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 5a5e035e5d3..323ecd76a12 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -424,6 +424,19 @@ xfs_iformat(
 	case XFS_DINODE_FMT_LOCAL:
 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 		size = be16_to_cpu(atp->hdr.totsize);
+
+		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
+			xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+				"corrupt inode %Lu "
+				"(bad attr fork size %Ld).",
+				(unsigned long long) ip->i_ino,
+				(long long) size);
+			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
+					     XFS_ERRLEVEL_LOW,
+					     ip->i_mount, dip);
+			return XFS_ERROR(EFSCORRUPTED);
+		}
+
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
-- 
cgit v1.2.3


From b6e3222732a3551e786aa47b90a8eab2a517711c Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 14 Jan 2009 23:22:07 -0600
Subject: [XFS] Remove the rest of the macro-to-function indirections.

Remove the last of the macros-defined-to-static-functions.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
---
 fs/xfs/quota/xfs_dquot.c  |   2 +-
 fs/xfs/xfs_ag.h           |   6 +-
 fs/xfs/xfs_alloc_btree.c  |   2 +-
 fs/xfs/xfs_attr.c         |  26 ++++----
 fs/xfs/xfs_bmap.c         | 166 +++++++++++++++++++++++-----------------------
 fs/xfs/xfs_bmap.h         |   2 -
 fs/xfs/xfs_bmap_btree.c   |  10 +--
 fs/xfs/xfs_bmap_btree.h   |   4 --
 fs/xfs/xfs_btree.c        |   6 +-
 fs/xfs/xfs_da_btree.c     |   8 +--
 fs/xfs/xfs_ialloc.c       |   6 +-
 fs/xfs/xfs_ialloc.h       |   2 -
 fs/xfs/xfs_ialloc_btree.h |   1 -
 fs/xfs/xfs_inode.c        |   6 +-
 fs/xfs/xfs_inode_item.h   |   4 --
 fs/xfs/xfs_iomap.c        |  10 +--
 fs/xfs/xfs_itable.c       |   6 +-
 fs/xfs/xfs_mount.h        |   6 +-
 fs/xfs/xfs_rename.c       |   2 +-
 fs/xfs/xfs_rtalloc.c      |   2 +-
 fs/xfs/xfs_rw.h           |   1 -
 fs/xfs/xfs_sb.h           |   2 +-
 fs/xfs/xfs_vnodeops.c     |  20 +++---
 23 files changed, 142 insertions(+), 158 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f0bc7846580..6543c0b2975 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -431,7 +431,7 @@ xfs_qm_dqalloc(
 	/*
 	 * Initialize the bmap freelist prior to calling bmapi code.
 	 */
-	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_bmap_init(&flist, &firstblock);
 	xfs_ilock(quotip, XFS_ILOCK_EXCL);
 	/*
 	 * Return if this type of quotas is turned off while we didn't
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index d3b3cf74299..143d63ecb20 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -244,8 +244,8 @@ typedef struct xfs_perag
 #define	XFS_AG_CHECK_DADDR(mp,d,len)	\
 	((len) == 1 ? \
 	    ASSERT((d) == XFS_SB_DADDR || \
-		   XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \
-	    ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \
-		   XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1)))
+		   xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \
+	    ASSERT(xfs_daddr_to_agno(mp, d) == \
+		   xfs_daddr_to_agno(mp, (d) + (len) - 1)))
 
 #endif	/* __XFS_AG_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 733cb75a8c5..c10c3a292d3 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -115,7 +115,7 @@ xfs_allocbt_free_block(
 	xfs_agblock_t		bno;
 	int			error;
 
-	bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
 	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index f7cdc28aff4..5fde1654b43 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -374,7 +374,7 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
 		 * It won't fit in the shortform, transform to a leaf block.
 		 * GROT: another possible req'mt for a double-split btree op.
 		 */
-		XFS_BMAP_INIT(args.flist, args.firstblock);
+		xfs_bmap_init(args.flist, args.firstblock);
 		error = xfs_attr_shortform_to_leaf(&args);
 		if (!error) {
 			error = xfs_bmap_finish(&args.trans, args.flist,
@@ -956,7 +956,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Commit that transaction so that the node_addname() call
 		 * can manage its own transactions.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr_leaf_to_node(args);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1057,7 +1057,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * If the result is small enough, shrink it all into the inode.
 		 */
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
@@ -1135,7 +1135,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	 * If the result is small enough, shrink it all into the inode.
 	 */
 	if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 		/* bp is gone due to xfs_da_shrink_inode */
 		if (!error) {
@@ -1290,7 +1290,7 @@ restart:
 			 * have been a b-tree.
 			 */
 			xfs_da_state_free(state);
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_node(args);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1331,7 +1331,7 @@ restart:
 		 * in the index/blkno/rmtblkno/rmtblkcnt fields and
 		 * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da_split(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1443,7 +1443,7 @@ restart:
 		 * Check to see if the tree needs to be collapsed.
 		 */
 		if (retval && (state->path.active > 1)) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_da_join(state);
 			if (!error) {
 				error = xfs_bmap_finish(&args->trans,
@@ -1579,7 +1579,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	 * Check to see if the tree needs to be collapsed.
 	 */
 	if (retval && (state->path.active > 1)) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da_join(state);
 		if (!error) {
 			error = xfs_bmap_finish(&args->trans, args->flist,
@@ -1630,7 +1630,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 						       == XFS_ATTR_LEAF_MAGIC);
 
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
-			XFS_BMAP_INIT(args->flist, args->firstblock);
+			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
 			if (!error) {
@@ -2069,7 +2069,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		/*
 		 * Allocate a single extent, up to the size of the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno,
 				  blkcnt,
@@ -2123,7 +2123,7 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 		/*
 		 * Try to remember where we decided to put the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno,
 				  args->rmtblkcnt,
@@ -2188,7 +2188,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 		/*
 		 * Try to remember where we decided to put the value.
 		 */
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno,
 					args->rmtblkcnt,
@@ -2229,7 +2229,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	blkcnt = args->rmtblkcnt;
 	done = 0;
 	while (!done) {
-		XFS_BMAP_INIT(args->flist, args->firstblock);
+		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA,
 				    1, args->firstblock, args->flist,
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 138308e70d1..c852cd65aae 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -595,9 +595,9 @@ xfs_bmap_add_extent(
 		xfs_iext_insert(ifp, 0, 1, new);
 		ASSERT(cur == NULL);
 		ifp->if_lastex = 0;
-		if (!ISNULLSTARTBLOCK(new->br_startblock)) {
+		if (!isnullstartblock(new->br_startblock)) {
 			XFS_IFORK_NEXT_SET(ip, whichfork, 1);
-			logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else
 			logflags = 0;
 		/* DELTA: single new extent */
@@ -613,7 +613,7 @@ xfs_bmap_add_extent(
 	/*
 	 * Any kind of new delayed allocation goes here.
 	 */
-	else if (ISNULLSTARTBLOCK(new->br_startblock)) {
+	else if (isnullstartblock(new->br_startblock)) {
 		if (cur)
 			ASSERT((cur->bc_private.b.flags &
 				XFS_BTCUR_BPRV_WASDEL) == 0);
@@ -644,11 +644,11 @@ xfs_bmap_add_extent(
 		 * in a delayed or unwritten allocation with a real one, or
 		 * converting real back to unwritten.
 		 */
-		if (!ISNULLSTARTBLOCK(new->br_startblock) &&
+		if (!isnullstartblock(new->br_startblock) &&
 		    new->br_startoff + new->br_blockcount > prev.br_startoff) {
 			if (prev.br_state != XFS_EXT_UNWRITTEN &&
-			    ISNULLSTARTBLOCK(prev.br_startblock)) {
-				da_old = STARTBLOCKVAL(prev.br_startblock);
+			    isnullstartblock(prev.br_startblock)) {
+				da_old = startblockval(prev.br_startblock);
 				if (cur)
 					ASSERT(cur->bc_private.b.flags &
 						XFS_BTCUR_BPRV_WASDEL);
@@ -803,7 +803,7 @@ xfs_bmap_add_extent_delay_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
 	}
 	STATE_SET(LEFT_CONTIG,
 		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -820,7 +820,7 @@ xfs_bmap_add_extent_delay_real(
 			idx <
 			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
 	}
 	STATE_SET(RIGHT_CONTIG,
 		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1019,8 +1019,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("LF|LC", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
@@ -1067,10 +1067,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock) -
+			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx + 1);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("LF", ip, idx + 1, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
@@ -1110,8 +1110,8 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock));
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			startblockval(PREV.br_startblock));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("RF|RC", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: The boundary between two in-core extents moved. */
@@ -1157,10 +1157,10 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 		}
 		temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
-			STARTBLOCKVAL(PREV.br_startblock) -
+			startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		ep = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("RF", ip, idx, XFS_DATA_FORK);
 		*dnew = temp;
 		/* DELTA: One in-core extent is split in two. */
@@ -1213,7 +1213,7 @@ xfs_bmap_add_extent_delay_real(
 		}
 		temp = xfs_bmap_worst_indlen(ip, temp);
 		temp2 = xfs_bmap_worst_indlen(ip, temp2);
-		diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) -
+		diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
 			(cur ? cur->bc_private.b.allocated : 0));
 		if (diff > 0 &&
 		    xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -((int64_t)diff), rsvd)) {
@@ -1241,11 +1241,11 @@ xfs_bmap_add_extent_delay_real(
 			}
 		}
 		ep = xfs_iext_get_ext(ifp, idx);
-		xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+		xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx, XFS_DATA_FORK);
 		XFS_BMAP_TRACE_PRE_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
-			NULLSTARTBLOCK((int)temp2));
+			nullstartblock((int)temp2));
 		XFS_BMAP_TRACE_POST_UPDATE("0", ip, idx + 2, XFS_DATA_FORK);
 		*dnew = temp + temp2;
 		/* DELTA: One in-core extent is split in three. */
@@ -1365,7 +1365,7 @@ xfs_bmap_add_extent_unwritten_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(LEFT.br_startblock));
 	}
 	STATE_SET(LEFT_CONTIG,
 		STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) &&
@@ -1382,7 +1382,7 @@ xfs_bmap_add_extent_unwritten_real(
 			idx <
 			ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(RIGHT.br_startblock));
 	}
 	STATE_SET(RIGHT_CONTIG,
 		STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) &&
@@ -1889,13 +1889,13 @@ xfs_bmap_add_extent_hole_delay(
 	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 	ep = xfs_iext_get_ext(ifp, idx);
 	state = 0;
-	ASSERT(ISNULLSTARTBLOCK(new->br_startblock));
+	ASSERT(isnullstartblock(new->br_startblock));
 	/*
 	 * Check and set flags if this segment has a left neighbor
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
 	}
 	/*
 	 * Check and set flags if the current (right) segment exists.
@@ -1905,7 +1905,7 @@ xfs_bmap_add_extent_hole_delay(
 			   idx <
 			   ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
 	}
 	/*
 	 * Set contiguity flags on the left and right neighbors.
@@ -1938,12 +1938,12 @@ xfs_bmap_add_extent_hole_delay(
 		XFS_BMAP_TRACE_PRE_UPDATE("LC|RC", ip, idx - 1,
 			XFS_DATA_FORK);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-		oldlen = STARTBLOCKVAL(left.br_startblock) +
-			STARTBLOCKVAL(new->br_startblock) +
-			STARTBLOCKVAL(right.br_startblock);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-			NULLSTARTBLOCK((int)newlen));
+			nullstartblock((int)newlen));
 		XFS_BMAP_TRACE_POST_UPDATE("LC|RC", ip, idx - 1,
 			XFS_DATA_FORK);
 		XFS_BMAP_TRACE_DELETE("LC|RC", ip, idx, 1, XFS_DATA_FORK);
@@ -1964,11 +1964,11 @@ xfs_bmap_add_extent_hole_delay(
 		XFS_BMAP_TRACE_PRE_UPDATE("LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
-		oldlen = STARTBLOCKVAL(left.br_startblock) +
-			STARTBLOCKVAL(new->br_startblock);
+		oldlen = startblockval(left.br_startblock) +
+			startblockval(new->br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
-			NULLSTARTBLOCK((int)newlen));
+			nullstartblock((int)newlen));
 		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1,
 			XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx - 1;
@@ -1985,11 +1985,11 @@ xfs_bmap_add_extent_hole_delay(
 		 */
 		XFS_BMAP_TRACE_PRE_UPDATE("RC", ip, idx, XFS_DATA_FORK);
 		temp = new->br_blockcount + right.br_blockcount;
-		oldlen = STARTBLOCKVAL(new->br_startblock) +
-			STARTBLOCKVAL(right.br_startblock);
+		oldlen = startblockval(new->br_startblock) +
+			startblockval(right.br_startblock);
 		newlen = xfs_bmap_worst_indlen(ip, temp);
 		xfs_bmbt_set_allf(ep, new->br_startoff,
-			NULLSTARTBLOCK((int)newlen), temp, right.br_state);
+			nullstartblock((int)newlen), temp, right.br_state);
 		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, XFS_DATA_FORK);
 		ip->i_df.if_lastex = idx;
 		/* DELTA: One in-core extent grew into a hole. */
@@ -2085,7 +2085,7 @@ xfs_bmap_add_extent_hole_real(
 	 */
 	if (STATE_SET_TEST(LEFT_VALID, idx > 0)) {
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
-		STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock));
+		STATE_SET(LEFT_DELAY, isnullstartblock(left.br_startblock));
 	}
 	/*
 	 * Check and set flags if this segment has a current value.
@@ -2095,7 +2095,7 @@ xfs_bmap_add_extent_hole_real(
 			   idx <
 			   ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) {
 		xfs_bmbt_get_all(ep, &right);
-		STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock));
+		STATE_SET(RIGHT_DELAY, isnullstartblock(right.br_startblock));
 	}
 	/*
 	 * We're inserting a real allocation between "left" and "right".
@@ -2143,7 +2143,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL) {
-			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2185,7 +2185,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_BMAP_TRACE_POST_UPDATE("LC", ip, idx - 1, whichfork);
 		ifp->if_lastex = idx - 1;
 		if (cur == NULL) {
-			rval = XFS_ILOG_FEXT(whichfork);
+			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2220,7 +2220,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_BMAP_TRACE_POST_UPDATE("RC", ip, idx, whichfork);
 		ifp->if_lastex = idx;
 		if (cur == NULL) {
-			rval = XFS_ILOG_FEXT(whichfork);
+			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2254,7 +2254,7 @@ xfs_bmap_add_extent_hole_real(
 		XFS_IFORK_NEXT_SET(ip, whichfork,
 			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL) {
-			rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
 			if ((error = xfs_bmbt_lookup_eq(cur,
@@ -2482,7 +2482,7 @@ xfs_bmap_adjacent(
 	 * try to use it's last block as our starting point.
 	 */
 	if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF &&
-	    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+	    !isnullstartblock(ap->prevp->br_startblock) &&
 	    ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount,
 		    ap->prevp->br_startblock)) {
 		ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount;
@@ -2511,7 +2511,7 @@ xfs_bmap_adjacent(
 		 * start block based on it.
 		 */
 		if (ap->prevp->br_startoff != NULLFILEOFF &&
-		    !ISNULLSTARTBLOCK(ap->prevp->br_startblock) &&
+		    !isnullstartblock(ap->prevp->br_startblock) &&
 		    (prevbno = ap->prevp->br_startblock +
 			       ap->prevp->br_blockcount) &&
 		    ISVALID(prevbno, ap->prevp->br_startblock)) {
@@ -2552,7 +2552,7 @@ xfs_bmap_adjacent(
 		 * If there's a following (right) block, select a requested
 		 * start block based on it.
 		 */
-		if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) {
+		if (!isnullstartblock(ap->gotp->br_startblock)) {
 			/*
 			 * Calculate gap to start of next block.
 			 */
@@ -3082,7 +3082,7 @@ xfs_bmap_btree_to_extents(
 	ASSERT(ifp->if_broot == NULL);
 	ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
-	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 	return 0;
 }
 
@@ -3136,8 +3136,8 @@ xfs_bmap_del_extent(
 	del_endoff = del->br_startoff + del->br_blockcount;
 	got_endoff = got.br_startoff + got.br_blockcount;
 	ASSERT(got_endoff >= del_endoff);
-	delay = ISNULLSTARTBLOCK(got.br_startblock);
-	ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
 	flags = 0;
 	qfield = 0;
 	error = 0;
@@ -3189,7 +3189,7 @@ xfs_bmap_del_extent(
 		}
 		da_old = da_new = 0;
 	} else {
-		da_old = STARTBLOCKVAL(got.br_startblock);
+		da_old = startblockval(got.br_startblock);
 		da_new = 0;
 		nblks = 0;
 		do_fx = 0;
@@ -3213,7 +3213,7 @@ xfs_bmap_del_extent(
 			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		flags |= XFS_ILOG_CORE;
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_btree_delete(cur, &i)))
@@ -3233,7 +3233,7 @@ xfs_bmap_del_extent(
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx,
 				whichfork);
 			da_new = temp;
@@ -3242,7 +3242,7 @@ xfs_bmap_del_extent(
 		xfs_bmbt_set_startblock(ep, del_endblock);
 		XFS_BMAP_TRACE_POST_UPDATE("2", ip, idx, whichfork);
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock,
@@ -3262,7 +3262,7 @@ xfs_bmap_del_extent(
 		if (delay) {
 			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
 				da_old);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx,
 				whichfork);
 			da_new = temp;
@@ -3270,7 +3270,7 @@ xfs_bmap_del_extent(
 		}
 		XFS_BMAP_TRACE_POST_UPDATE("1", ip, idx, whichfork);
 		if (!cur) {
-			flags |= XFS_ILOG_FEXT(whichfork);
+			flags |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_bmbt_update(cur, got.br_startoff,
@@ -3345,22 +3345,22 @@ xfs_bmap_del_extent(
 				}
 				XFS_WANT_CORRUPTED_GOTO(i == 1, done);
 			} else
-				flags |= XFS_ILOG_FEXT(whichfork);
+				flags |= xfs_ilog_fext(whichfork);
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		} else {
 			ASSERT(whichfork == XFS_DATA_FORK);
 			temp = xfs_bmap_worst_indlen(ip, temp);
-			xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp));
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
 			temp2 = xfs_bmap_worst_indlen(ip, temp2);
-			new.br_startblock = NULLSTARTBLOCK((int)temp2);
+			new.br_startblock = nullstartblock((int)temp2);
 			da_new = temp + temp2;
 			while (da_new > da_old) {
 				if (temp) {
 					temp--;
 					da_new--;
 					xfs_bmbt_set_startblock(ep,
-						NULLSTARTBLOCK((int)temp));
+						nullstartblock((int)temp));
 				}
 				if (da_new == da_old)
 					break;
@@ -3368,7 +3368,7 @@ xfs_bmap_del_extent(
 					temp2--;
 					da_new--;
 					new.br_startblock =
-						NULLSTARTBLOCK((int)temp2);
+						nullstartblock((int)temp2);
 				}
 			}
 		}
@@ -3545,7 +3545,7 @@ xfs_bmap_extents_to_btree(
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	for (cnt = i = 0; i < nextents; i++) {
 		ep = xfs_iext_get_ext(ifp, i);
-		if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) {
+		if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) {
 			arp->l0 = cpu_to_be64(ep->l0);
 			arp->l1 = cpu_to_be64(ep->l1);
 			arp++; cnt++;
@@ -3572,7 +3572,7 @@ xfs_bmap_extents_to_btree(
 	xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
 	ASSERT(*curp == NULL);
 	*curp = cur;
-	*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
+	*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
 	return 0;
 }
 
@@ -3676,7 +3676,7 @@ xfs_bmap_local_to_extents(
 		ip->i_d.di_nblocks = 1;
 		XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip,
 			XFS_TRANS_DQ_BCOUNT, 1L);
-		flags |= XFS_ILOG_FEXT(whichfork);
+		flags |= xfs_ilog_fext(whichfork);
 	} else {
 		ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
 		xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_add_attrfork(
 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
 	ip->i_afp->if_flags = XFS_IFEXTENTS;
 	logflags = 0;
-	XFS_BMAP_INIT(&flist, &firstblock);
+	xfs_bmap_init(&flist, &firstblock);
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_LOCAL:
 		error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist,
@@ -4162,7 +4162,7 @@ xfs_bmap_add_free(
 	ASSERT(bno != NULLFSBLOCK);
 	ASSERT(len > 0);
 	ASSERT(len <= MAXEXTLEN);
-	ASSERT(!ISNULLSTARTBLOCK(bno));
+	ASSERT(!isnullstartblock(bno));
 	agno = XFS_FSB_TO_AGNO(mp, bno);
 	agbno = XFS_FSB_TO_AGBNO(mp, bno);
 	ASSERT(agno < mp->m_sb.sb_agcount);
@@ -4909,7 +4909,7 @@ xfs_bmapi(
 			got.br_startoff = end;
 		inhole = eof || got.br_startoff > bno;
 		wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) &&
-			ISNULLSTARTBLOCK(got.br_startblock);
+			isnullstartblock(got.br_startblock);
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
@@ -5028,7 +5028,7 @@ xfs_bmapi(
 				}
 
 				ip->i_delayed_blks += alen;
-				abno = NULLSTARTBLOCK(indlen);
+				abno = nullstartblock(indlen);
 			} else {
 				/*
 				 * If first time, allocate and fill in
@@ -5144,8 +5144,8 @@ xfs_bmapi(
 				aoff + alen);
 #ifdef DEBUG
 			if (flags & XFS_BMAPI_DELAY) {
-				ASSERT(ISNULLSTARTBLOCK(got.br_startblock));
-				ASSERT(STARTBLOCKVAL(got.br_startblock) > 0);
+				ASSERT(isnullstartblock(got.br_startblock));
+				ASSERT(startblockval(got.br_startblock) > 0);
 			}
 			ASSERT(got.br_state == XFS_EXT_NORM ||
 			       got.br_state == XFS_EXT_UNWRITTEN);
@@ -5179,7 +5179,7 @@ xfs_bmapi(
 			ASSERT((bno >= obno) || (n == 0));
 			ASSERT(bno < end);
 			mval->br_startoff = bno;
-			if (ISNULLSTARTBLOCK(got.br_startblock)) {
+			if (isnullstartblock(got.br_startblock)) {
 				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
 				mval->br_startblock = DELAYSTARTBLOCK;
 			} else
@@ -5201,7 +5201,7 @@ xfs_bmapi(
 			ASSERT(mval->br_blockcount <= len);
 		} else {
 			*mval = got;
-			if (ISNULLSTARTBLOCK(mval->br_startblock)) {
+			if (isnullstartblock(mval->br_startblock)) {
 				ASSERT(!wr || (flags & XFS_BMAPI_DELAY));
 				mval->br_startblock = DELAYSTARTBLOCK;
 			}
@@ -5329,12 +5329,12 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~XFS_ILOG_FEXT(whichfork);
-	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+		logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log whatever the flags say, even if error.  Otherwise we might miss
 	 * detecting a case where the data is changed, there's an error,
@@ -5411,7 +5411,7 @@ xfs_bmapi_single(
 		*fsb = NULLFSBLOCK;
 		return 0;
 	}
-	ASSERT(!ISNULLSTARTBLOCK(got.br_startblock));
+	ASSERT(!isnullstartblock(got.br_startblock));
 	ASSERT(bno < got.br_startoff + got.br_blockcount);
 	*fsb = got.br_startblock + (bno - got.br_startoff);
 	ifp->if_lastex = lastx;
@@ -5543,7 +5543,7 @@ xfs_bunmapi(
 		 */
 		ASSERT(ep != NULL);
 		del = got;
-		wasdel = ISNULLSTARTBLOCK(del.br_startblock);
+		wasdel = isnullstartblock(del.br_startblock);
 		if (got.br_startoff < start) {
 			del.br_startoff = start;
 			del.br_blockcount -= start - got.br_startoff;
@@ -5638,7 +5638,7 @@ xfs_bunmapi(
 				xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
 						lastx - 1), &prev);
 				ASSERT(prev.br_state == XFS_EXT_NORM);
-				ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock));
+				ASSERT(!isnullstartblock(prev.br_startblock));
 				ASSERT(del.br_startblock ==
 				       prev.br_startblock + prev.br_blockcount);
 				if (prev.br_startoff < start) {
@@ -5666,7 +5666,7 @@ xfs_bunmapi(
 			}
 		}
 		if (wasdel) {
-			ASSERT(STARTBLOCKVAL(del.br_startblock) > 0);
+			ASSERT(startblockval(del.br_startblock) > 0);
 			/* Update realtime/data freespace, unreserve quota */
 			if (isrt) {
 				xfs_filblks_t rtexts;
@@ -5782,12 +5782,12 @@ error0:
 	 * Log everything.  Do this after conversion, there's no point in
 	 * logging the extent records if we've converted to btree format.
 	 */
-	if ((logflags & XFS_ILOG_FEXT(whichfork)) &&
+	if ((logflags & xfs_ilog_fext(whichfork)) &&
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
-		logflags &= ~XFS_ILOG_FEXT(whichfork);
-	else if ((logflags & XFS_ILOG_FBROOT(whichfork)) &&
+		logflags &= ~xfs_ilog_fext(whichfork);
+	else if ((logflags & xfs_ilog_fbroot(whichfork)) &&
 		 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
-		logflags &= ~XFS_ILOG_FBROOT(whichfork);
+		logflags &= ~xfs_ilog_fbroot(whichfork);
 	/*
 	 * Log inode even in the error case, if the transaction
 	 * is dirty we'll need to shut down the filesystem.
@@ -5838,7 +5838,7 @@ xfs_getbmapx_fix_eof_hole(
 		if (startblock == DELAYSTARTBLOCK)
 			out->bmv_block = -2;
 		else
-			out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+			out->bmv_block = xfs_fsb_to_db(ip, startblock);
 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
 		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
 		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
@@ -5979,7 +5979,7 @@ xfs_getbmap(
 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
 
-	bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
+	bmapi_flags = xfs_bmapi_aflag(whichfork) |
 			((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
 
 	/*
@@ -6098,7 +6098,7 @@ xfs_bmap_isaeof(
 	 */
 	*aeof = (off >= s.br_startoff &&
 		 off < s.br_startoff + s.br_blockcount &&
-		 ISNULLSTARTBLOCK(s.br_startblock)) ||
+		 isnullstartblock(s.br_startblock)) ||
 		off >= s.br_startoff + s.br_blockcount;
 	return 0;
 }
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 284571c05ed..be2979d88d3 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -95,7 +95,6 @@ typedef	struct xfs_bmap_free
 					/* need write cache flushing and no */
 					/* additional allocation alignments */
 
-#define	XFS_BMAPI_AFLAG(w)	xfs_bmapi_aflag(w)
 static inline int xfs_bmapi_aflag(int w)
 {
 	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
@@ -107,7 +106,6 @@ static inline int xfs_bmapi_aflag(int w)
 #define	DELAYSTARTBLOCK		((xfs_fsblock_t)-1LL)
 #define	HOLESTARTBLOCK		((xfs_fsblock_t)-2LL)
 
-#define	XFS_BMAP_INIT(flp,fbp)	xfs_bmap_init(flp,fbp)
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
 	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index ba6b08c2fb0..0760d352586 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -121,7 +121,7 @@ __xfs_bmbt_get_all(
 
 		b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
 		    (((xfs_dfsbno_t)l1) >> 21);
-		ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+		ASSERT((b >> 32) == 0 || isnulldstartblock(b));
 		s->br_startblock = (xfs_fsblock_t)b;
 	}
 #else	/* !DEBUG */
@@ -172,7 +172,7 @@ xfs_bmbt_get_startblock(
 
 	b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
 	    (((xfs_dfsbno_t)r->l1) >> 21);
-	ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b));
+	ASSERT((b >> 32) == 0 || isnulldstartblock(b));
 	return (xfs_fsblock_t)b;
 #else	/* !DEBUG */
 	return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
@@ -261,7 +261,7 @@ xfs_bmbt_set_allf(
 		((xfs_bmbt_rec_base_t)blockcount &
 		(xfs_bmbt_rec_base_t)xfs_mask64lo(21));
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(startblock)) {
+	if (isnullstartblock(startblock)) {
 		r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			((xfs_bmbt_rec_base_t)startoff << 9) |
 			 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
@@ -321,7 +321,7 @@ xfs_bmbt_disk_set_allf(
 		 ((xfs_bmbt_rec_base_t)blockcount &
 		  (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(startblock)) {
+	if (isnullstartblock(startblock)) {
 		r->l0 = cpu_to_be64(
 			((xfs_bmbt_rec_base_t)extent_flag << 63) |
 			 ((xfs_bmbt_rec_base_t)startoff << 9) |
@@ -382,7 +382,7 @@ xfs_bmbt_set_startblock(
 	r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
 		  (xfs_bmbt_rec_base_t)(v << 21);
 #else	/* !XFS_BIG_BLKNOS */
-	if (ISNULLSTARTBLOCK(v)) {
+	if (isnullstartblock(v)) {
 		r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
 		r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
 			  ((xfs_bmbt_rec_base_t)v << 21) |
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index a4555abb662..0e8df007615 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -76,26 +76,22 @@ typedef struct xfs_bmbt_rec_host {
 #define DSTARTBLOCKMASK		\
 	(((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
 
-#define ISNULLSTARTBLOCK(x)	isnullstartblock(x)
 static inline int isnullstartblock(xfs_fsblock_t x)
 {
 	return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
 }
 
-#define ISNULLDSTARTBLOCK(x)	isnulldstartblock(x)
 static inline int isnulldstartblock(xfs_dfsbno_t x)
 {
 	return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
 }
 
-#define NULLSTARTBLOCK(k)	nullstartblock(k)
 static inline xfs_fsblock_t nullstartblock(int k)
 {
 	ASSERT(k < (1 << STARTBLOCKVALBITS));
 	return STARTBLOCKMASK | (k);
 }
 
-#define STARTBLOCKVAL(x)	startblockval(x)
 static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
 {
 	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 2c3ef20f884..4681519ded9 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -960,7 +960,7 @@ xfs_btree_buf_to_ptr(
 		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
 					XFS_BUF_ADDR(bp)));
 	else {
-		ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
 					XFS_BUF_ADDR(bp)));
 	}
 }
@@ -2454,7 +2454,7 @@ xfs_btree_new_iroot(
 	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
 
 	*logflags |=
-		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
 	*stat = 1;
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 	return 0;
@@ -3048,7 +3048,7 @@ xfs_btree_kill_iroot(
 	cur->bc_bufs[level - 1] = NULL;
 	be16_add_cpu(&block->bb_level, -1);
 	xfs_trans_log_inode(cur->bc_tp, ip,
-		XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
 	cur->bc_nlevels--;
 out0:
 	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index a11a8390bf6..c45f74ff1a5 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1597,7 +1597,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 	nmap = 1;
 	ASSERT(args->firstblock != NULL);
 	if ((error = xfs_bmapi(tp, dp, bno, count,
-			XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
+			xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|
 			XFS_BMAPI_CONTIG,
 			args->firstblock, args->total, &map, &nmap,
 			args->flist, NULL))) {
@@ -1618,7 +1618,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
 			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
 			c = (int)(bno + count - b);
 			if ((error = xfs_bmapi(tp, dp, b, c,
-					XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|
+					xfs_bmapi_aflag(w)|XFS_BMAPI_WRITE|
 					XFS_BMAPI_METADATA,
 					args->firstblock, args->total,
 					&mapp[mapi], &nmap, args->flist,
@@ -1882,7 +1882,7 @@ xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 		 * the last block to the place we want to kill.
 		 */
 		if ((error = xfs_bunmapi(tp, dp, dead_blkno, count,
-				XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA,
+				xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
 				0, args->firstblock, args->flist, NULL,
 				&done)) == ENOSPC) {
 			if (w != XFS_DATA_FORK)
@@ -1987,7 +1987,7 @@ xfs_da_do_buf(
 			if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno,
 					nfsb,
 					XFS_BMAPI_METADATA |
-						XFS_BMAPI_AFLAG(whichfork),
+						xfs_bmapi_aflag(whichfork),
 					NULL, 0, mapp, &nmap, NULL, NULL)))
 				goto exit0;
 		}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index e6ebbaeb4dc..ab016e5ae7b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -357,7 +357,7 @@ xfs_ialloc_ag_alloc(
 			int	ioffset = i << args.mp->m_sb.sb_inodelog;
 			uint	isize = sizeof(struct xfs_dinode);
 
-			free = XFS_MAKE_IPTR(args.mp, fbuf, i);
+			free = xfs_make_iptr(args.mp, fbuf, i);
 			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 			free->di_version = version;
 			free->di_gen = cpu_to_be32(gen);
@@ -937,7 +937,7 @@ nextag:
 			}
 		}
 	}
-	offset = XFS_IALLOC_FIND_FREE(&rec.ir_free);
+	offset = xfs_ialloc_find_free(&rec.ir_free);
 	ASSERT(offset >= 0);
 	ASSERT(offset < XFS_INODES_PER_CHUNK);
 	ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
@@ -1279,7 +1279,7 @@ xfs_imap(
 		offset = XFS_INO_TO_OFFSET(mp, ino);
 		ASSERT(offset < mp->m_sb.sb_inopblock);
 
-		cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
+		cluster_agbno = xfs_daddr_to_agbno(mp, imap->im_blkno);
 		offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
 
 		imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 50f558a4e0a..aeee8278f92 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -39,7 +39,6 @@ struct xfs_trans;
 /*
  * Make an inode pointer out of the buffer/offset.
  */
-#define	XFS_MAKE_IPTR(mp,b,o)		xfs_make_iptr(mp,b,o)
 static inline struct xfs_dinode *
 xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 {
@@ -50,7 +49,6 @@ xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
 /*
  * Find a free (set) bit in the inode bitmask.
  */
-#define	XFS_IALLOC_FIND_FREE(fp)	xfs_ialloc_find_free(fp)
 static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
 {
 	return xfs_lowbit64(*fp);
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 37e5dd01a57..5580e255ff0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -36,7 +36,6 @@ typedef	__uint64_t	xfs_inofree_t;
 #define	XFS_INODES_PER_CHUNK_LOG	(XFS_NBBYLOG + 3)
 #define	XFS_INOBT_ALL_FREE	((xfs_inofree_t)-1)
 
-#define	XFS_INOBT_MASKN(i,n)		xfs_inobt_maskn(i,n)
 static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
 {
 	return (((n) >= XFS_INODES_PER_CHUNK ? \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 323ecd76a12..e7ae08d1df4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1614,10 +1614,10 @@ xfs_itruncate_finish(
 		 * in this file with garbage in them once recovery
 		 * runs.
 		 */
-		XFS_BMAP_INIT(&free_list, &first_block);
+		xfs_bmap_init(&free_list, &first_block);
 		error = xfs_bunmapi(ntp, ip,
 				    first_unmap_block, unmap_len,
-				    XFS_BMAPI_AFLAG(fork) |
+				    xfs_bmapi_aflag(fork) |
 				      (sync ? 0 : XFS_BMAPI_ASYNC),
 				    XFS_ITRUNC_MAX_EXTENTS,
 				    &first_block, &free_list,
@@ -2570,7 +2570,7 @@ xfs_iextents_copy(
 	for (i = 0; i < nrecs; i++) {
 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 		start_block = xfs_bmbt_get_startblock(ep);
-		if (ISNULLSTARTBLOCK(start_block)) {
+		if (isnullstartblock(start_block)) {
 			/*
 			 * It's a delayed allocation extent, so skip it.
 			 */
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 1ff04cc323a..9957d0602d5 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -111,20 +111,16 @@ typedef struct xfs_inode_log_format_64 {
 
 #define	XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
 
-
-#define	XFS_ILOG_FBROOT(w)	xfs_ilog_fbroot(w)
 static inline int xfs_ilog_fbroot(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
 }
 
-#define	XFS_ILOG_FEXT(w)	xfs_ilog_fext(w)
 static inline int xfs_ilog_fext(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
 }
 
-#define	XFS_ILOG_FDATA(w)	xfs_ilog_fdata(w)
 static inline int xfs_ilog_fdata(int w)
 {
 	return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 911062cf73a..08ce72316bf 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -155,7 +155,7 @@ xfs_imap_to_bmap(
 			iomapp->iomap_bn = IOMAP_DADDR_NULL;
 			iomapp->iomap_flags |= IOMAP_DELAY;
 		} else {
-			iomapp->iomap_bn = XFS_FSB_TO_DB(ip, start_block);
+			iomapp->iomap_bn = xfs_fsb_to_db(ip, start_block);
 			if (ISUNWRITTEN(imap))
 				iomapp->iomap_flags |= IOMAP_UNWRITTEN;
 		}
@@ -261,7 +261,7 @@ xfs_iomap(
 		xfs_iunlock(ip, lockmode);
 		lockmode = 0;
 
-		if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) {
+		if (nimaps && !isnullstartblock(imap.br_startblock)) {
 			xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, ip,
 					offset, count, iomapp, &imap, flags);
 			break;
@@ -491,7 +491,7 @@ xfs_iomap_write_direct(
 	/*
 	 * Issue the xfs_bmapi() call to allocate the blocks
 	 */
-	XFS_BMAP_INIT(&free_list, &firstfsb);
+	xfs_bmap_init(&free_list, &firstfsb);
 	nimaps = 1;
 	error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, bmapi_flag,
 		&firstfsb, 0, &imap, &nimaps, &free_list, NULL);
@@ -751,7 +751,7 @@ xfs_iomap_write_allocate(
 			xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 			xfs_trans_ihold(tp, ip);
 
-			XFS_BMAP_INIT(&free_list, &first_block);
+			xfs_bmap_init(&free_list, &first_block);
 
 			/*
 			 * it is possible that the extents have changed since
@@ -911,7 +911,7 @@ xfs_iomap_write_unwritten(
 		/*
 		 * Modify the unwritten extent state of the buffer.
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		nimaps = 1;
 		error = xfs_bmapi(tp, ip, offset_fsb, count_fsb,
 				  XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb,
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index e19d0a8d561..cf98a805ec9 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -453,7 +453,7 @@ xfs_bulkstat(
 			    (chunkidx = agino - gino + 1) <
 				    XFS_INODES_PER_CHUNK &&
 					/* there are some left allocated */
-			    XFS_INOBT_MASKN(chunkidx,
+			    xfs_inobt_maskn(chunkidx,
 				    XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) {
 				/*
 				 * Grab the chunk record.  Mark all the
@@ -464,7 +464,7 @@ xfs_bulkstat(
 					if (XFS_INOBT_MASK(i) & ~gfree)
 						gcnt++;
 				}
-				gfree |= XFS_INOBT_MASKN(0, chunkidx);
+				gfree |= xfs_inobt_maskn(0, chunkidx);
 				irbp->ir_startino = gino;
 				irbp->ir_freecount = gcnt;
 				irbp->ir_free = gfree;
@@ -535,7 +535,7 @@ xfs_bulkstat(
 				     chunkidx < XFS_INODES_PER_CHUNK;
 				     chunkidx += nicluster,
 				     agbno += nbcluster) {
-					if (XFS_INOBT_MASKN(chunkidx,
+					if (xfs_inobt_maskn(chunkidx,
 							    nicluster) & ~gfree)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e37eff6761e..f5e9937f9bd 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -44,9 +44,9 @@ typedef struct xfs_trans_reservations {
 
 #ifndef __KERNEL__
 
-#define XFS_DADDR_TO_AGNO(mp,d) \
+#define xfs_daddr_to_agno(mp,d) \
 	((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
-#define XFS_DADDR_TO_AGBNO(mp,d) \
+#define xfs_daddr_to_agbno(mp,d) \
 	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
 
 #else /* __KERNEL__ */
@@ -441,7 +441,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
  */
 #define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */
 
-#define XFS_DADDR_TO_AGNO(mp,d)         xfs_daddr_to_agno(mp,d)
 static inline xfs_agnumber_t
 xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 {
@@ -450,7 +449,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 	return (xfs_agnumber_t) ld;
 }
 
-#define XFS_DADDR_TO_AGBNO(mp,d)        xfs_daddr_to_agbno(mp,d)
 static inline xfs_agblock_t
 xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 {
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 86471bb40fd..58f85e9cd11 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -147,7 +147,7 @@ xfs_rename(
 	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
 				inodes, &num_inodes);
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
 	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index edf12c7b834..c5bb86f3ec0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -120,7 +120,7 @@ xfs_growfs_rt_alloc(
 		if ((error = xfs_trans_iget(mp, tp, ino, 0,
 						XFS_ILOCK_EXCL, &ip)))
 			goto error_cancel;
-		XFS_BMAP_INIT(&flist, &firstblock);
+		xfs_bmap_init(&flist, &firstblock);
 		/*
 		 * Allocate blocks to the bitmap file.
 		 */
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f87db5344ce..f76c003ec55 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -28,7 +28,6 @@ struct xfs_mount;
  * file is a real time file or not, because the bmap code
  * does.
  */
-#define	XFS_FSB_TO_DB(ip,fsb)	xfs_fsb_to_db(ip,fsb)
 static inline xfs_daddr_t
 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
 {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 1ed71916e4c..1b017c65749 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -505,7 +505,7 @@ static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
 
 #define	XFS_HDR_BLOCK(mp,d)	((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d))
 #define	XFS_DADDR_TO_FSB(mp,d)	XFS_AGB_TO_FSB(mp, \
-			XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d))
+			xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d))
 #define	XFS_FSB_TO_DADDR(mp,fsbno)	XFS_AGB_TO_DADDR(mp, \
 			XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno))
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f07bf8768c3..0e55c5d7db5 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -862,7 +862,7 @@ xfs_inactive_symlink_rmt(
 	 * Find the block(s) so we can inval and unmap them.
 	 */
 	done = 0;
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	nmaps = ARRAY_SIZE(mval);
 	if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size),
 			XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps,
@@ -1288,7 +1288,7 @@ xfs_inactive(
 	/*
 	 * Free the inode.
 	 */
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	error = xfs_ifree(tp, ip, &free_list);
 	if (error) {
 		/*
@@ -1461,7 +1461,7 @@ xfs_create(
 	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = B_TRUE;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	ASSERT(ip == NULL);
 
@@ -1879,7 +1879,7 @@ xfs_remove(
 		}
 	}
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
 					&first_block, &free_list, resblks);
 	if (error) {
@@ -2059,7 +2059,7 @@ xfs_link(
 	if (error)
 		goto error_return;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
 					&first_block, &free_list, resblks);
@@ -2231,7 +2231,7 @@ xfs_mkdir(
 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = B_FALSE;
 
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
 					&first_block, &free_list, resblks ?
@@ -2438,7 +2438,7 @@ xfs_symlink(
 	 * Initialize the bmap freelist prior to calling either
 	 * bmapi or the directory create code.
 	 */
-	XFS_BMAP_INIT(&free_list, &first_block);
+	xfs_bmap_init(&free_list, &first_block);
 
 	/*
 	 * Allocate an inode for the symlink.
@@ -2860,7 +2860,7 @@ retry:
 		/*
 		 * Issue the xfs_bmapi() call to allocate the blocks
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bmapi(tp, ip, startoffset_fsb,
 				  allocatesize_fsb, bmapi_flag,
 				  &firstfsb, 0, imapp, &nimaps,
@@ -2980,7 +2980,7 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
-		XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
+		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
 		xfsbdstrat(mp, bp);
 		error = xfs_iowait(bp);
 		if (error) {
@@ -3186,7 +3186,7 @@ xfs_free_file_space(
 		/*
 		 * issue the bunmapi() call to free the blocks
 		 */
-		XFS_BMAP_INIT(&free_list, &firstfsb);
+		xfs_bmap_init(&free_list, &firstfsb);
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
 				  0, 2, &firstfsb, &free_list, NULL, &done);
-- 
cgit v1.2.3


From a50412e3f8ce95d7ed558370d7dde5171fd04283 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 6 Jan 2009 19:54:02 +0200
Subject: UBIFS: do not treat all data as short term

UBIFS wrongly tells UBI that all data is short term. Use proper
hints instead. Thanks to Xiaochuan-Xu for noticing this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e0cd2..a11ca0958a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
 	offs = 0;
 
 out:
-	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
 	if (err)
 		goto out_unlock;
 
-- 
cgit v1.2.3


From 7078202e55b565582fcbd831a8dd3069bdc72610 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 19 Jan 2009 19:57:27 +0200
Subject: UBIFS: document dark_wm and dead_wm better

Just add more commentaries. Also some commentary fixes for
lprops flags.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c    | 20 ++++++++++++++++++++
 fs/ubifs/super.c | 11 ++---------
 fs/ubifs/ubifs.h |  4 ++--
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9abe28..b2e5f113337 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
  * to be reused. Garbage collection will cause the number of dirty index nodes
  * to grow, however sufficient space is reserved for the index to ensure the
  * commit will never run out of space.
+ *
+ * Notes about dead watermark. At current UBIFS implementation we assume that
+ * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
+ * and not worth garbage-collecting. The dead watermark is one min. I/O unit
+ * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this in this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depend on how big the
+ * UBIFS nodes GC deals with are. Large nodes make GC waste more space. Indeed,
+ * if GC moves data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which is guaranteed to be reclaimable. If LEB has less space, the GC might
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
  */
 
 #include <linux/pagemap.h>
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 53811e567a6..da99da098ef 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -573,15 +573,8 @@ static int init_constants_early(struct ubifs_info *c)
 	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
 
 	/*
-	 * Initialize dead and dark LEB space watermarks.
-	 *
-	 * Dead space is the space which cannot be used. Its watermark is
-	 * equivalent to min. I/O unit or minimum node size if it is greater
-	 * then min. I/O unit.
-	 *
-	 * Dark space is the space which might be used, or might not, depending
-	 * on which node should be written to the LEB. Its watermark is
-	 * equivalent to maximum UBIFS node size.
+	 * Initialize dead and dark LEB space watermarks. See gc.c for comments
+	 * about these values.
 	 */
 	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
 	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 0881897a420..2e78d6ac007 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
  * LEB properties flags.
  *
  * LPROPS_UNCAT: not categorized
- * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
  * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
- * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
  * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
  * LPROPS_EMPTY: LEB is empty, not taken
  * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
-- 
cgit v1.2.3


From e7f07968c16bdd9480001c0a9de013ba56889cf9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 20 Jan 2009 09:50:19 -0500
Subject: ext4: Fix ext4_free_blocks() w/o a journal when files have indirect
 blocks

When trying to unlink a file with indirect blocks on a filesystem
without a journal, the "circular indirect block" sanity test was
getting falsely triggered.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 49484ba801c..b4386dafeb0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3622,7 +3622,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		 * block pointed to itself, it would have been detached when
 		 * the block was cleared. Check for this instead of OOPSing.
 		 */
-		if (bh2jh(this_bh))
+		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
 			ext4_error(inode->i_sb, __func__,
-- 
cgit v1.2.3


From 4503efd0891c40e30928afb4b23dc3f99c62a6b2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Tue, 20 Jan 2009 15:51:16 -0800
Subject: sysfs: fix problems with binary files

Some sysfs binary files don't like having 0 passed to them as a size.
Fix this up at the root by just returning to the vfs if userspace asks
us for a zero sized buffer.

Thanks to Pavel Roskin for pointing this out.

Reported-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/bin.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 66f6e58a7e4..f2c478c3424 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -63,6 +63,9 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
 
+	if (!bytes)
+		return 0;
+
 	if (size) {
 		if (offs > size)
 			return 0;
@@ -131,6 +134,9 @@ static ssize_t write(struct file *file, const char __user *userbuf,
 	int count = min_t(size_t, bytes, PAGE_SIZE);
 	char *temp;
 
+	if (!bytes)
+		return 0;
+
 	if (size) {
 		if (offs > size)
 			return 0;
-- 
cgit v1.2.3


From c475146d8f3b97e79f9ef88521e28ad40ac07de6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 17:44:34 +0100
Subject: ocfs2: Remove ocfs2_dquot_initialize() and ocfs2_dquot_drop()

Since ->acquire_dquot and ->release_dquot callbacks aren't called under
dqptr_sem anymore, we don't have to start a transaction and obtain locks
so early. So we can just remove all this complicated stuff.

Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/ocfs2/quota_global.c | 169 +-----------------------------------------------
 1 file changed, 2 insertions(+), 167 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 6aff8f2d3e4..f4efa89baee 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -810,171 +810,6 @@ out:
 	return status;
 }
 
-/* This is difficult. We have to lock quota inode and start transaction
- * in this function but we don't want to take the penalty of exlusive
- * quota file lock when we are just going to use cached structures. So
- * we just take read lock check whether we have dquot cached and if so,
- * we don't have to take the write lock... */
-static int ocfs2_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle = NULL;
-	int status = 0;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-	int exclusive = 0;
-	int cnt;
-	qid_t id;
-
-	mlog_entry_void();
-
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (type != -1 && cnt != type)
-			continue;
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 0);
-		if (status < 0)
-			goto out;
-		/* This is just a performance optimization not a reliable test.
-		 * Since we hold an inode lock, noone can actually release
-		 * the structure until we are finished with initialization. */
-		if (inode->i_dquot[cnt] != NODQUOT) {
-			ocfs2_unlock_global_qf(oinfo, 0);
-			continue;
-		}
-		/* When we have inode lock, we know that no dquot_release() can
-		 * run and thus we can safely check whether we need to
-		 * read+modify global file to get quota information or whether
-		 * our node already has it. */
-		if (cnt == USRQUOTA)
-			id = inode->i_uid;
-		else if (cnt == GRPQUOTA)
-			id = inode->i_gid;
-		else
-			BUG();
-		/* Obtain exclusion from quota off... */
-		down_write(&sb_dqopt(sb)->dqptr_sem);
-		exclusive = !dquot_is_cached(sb, id, cnt);
-		up_write(&sb_dqopt(sb)->dqptr_sem);
-		if (exclusive) {
-			status = ocfs2_lock_global_qf(oinfo, 1);
-			if (status < 0) {
-				exclusive = 0;
-				mlog_errno(status);
-				goto out_ilock;
-			}
-			handle = ocfs2_start_trans(OCFS2_SB(sb),
-					ocfs2_calc_qinit_credits(sb, cnt));
-			if (IS_ERR(handle)) {
-				status = PTR_ERR(handle);
-				mlog_errno(status);
-				goto out_ilock;
-			}
-		}
-		dquot_initialize(inode, cnt);
-		if (exclusive) {
-			ocfs2_commit_trans(OCFS2_SB(sb), handle);
-			ocfs2_unlock_global_qf(oinfo, 1);
-		}
-		ocfs2_unlock_global_qf(oinfo, 0);
-	}
-	mlog_exit(0);
-	return 0;
-out_ilock:
-	if (exclusive)
-		ocfs2_unlock_global_qf(oinfo, 1);
-	ocfs2_unlock_global_qf(oinfo, 0);
-out:
-	mlog_exit(status);
-	return status;
-}
-
-static int ocfs2_dquot_drop_slow(struct inode *inode)
-{
-	int status = 0;
-	int cnt;
-	int got_lock[MAXQUOTAS] = {0, 0};
-	handle_t *handle;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 1);
-		if (status < 0)
-			goto out;
-		got_lock[cnt] = 1;
-	}
-	handle = ocfs2_start_trans(OCFS2_SB(sb),
-			ocfs2_calc_qinit_credits(sb, USRQUOTA) +
-			ocfs2_calc_qinit_credits(sb, GRPQUOTA));
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		mlog_errno(status);
-		goto out;
-	}
-	dquot_drop(inode);
-	ocfs2_commit_trans(OCFS2_SB(sb), handle);
-out:
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (got_lock[cnt]) {
-			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-			ocfs2_unlock_global_qf(oinfo, 1);
-		}
-	return status;
-}
-
-/* See the comment before ocfs2_dquot_initialize. */
-static int ocfs2_dquot_drop(struct inode *inode)
-{
-	int status = 0;
-	struct super_block *sb = inode->i_sb;
-	struct ocfs2_mem_dqinfo *oinfo;
-	int exclusive = 0;
-	int cnt;
-	int got_lock[MAXQUOTAS] = {0, 0};
-
-	mlog_entry_void();
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!sb_has_quota_active(sb, cnt))
-			continue;
-		oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-		status = ocfs2_lock_global_qf(oinfo, 0);
-		if (status < 0)
-			goto out;
-		got_lock[cnt] = 1;
-	}
-	/* Lock against anyone releasing references so that when when we check
-	 * we know we are not going to be last ones to release dquot */
-	down_write(&sb_dqopt(sb)->dqptr_sem);
-	/* Urgh, this is a terrible hack :( */
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] != NODQUOT &&
-		    atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
-			exclusive = 1;
-			break;
-		}
-	}
-	if (!exclusive)
-		dquot_drop_locked(inode);
-	up_write(&sb_dqopt(sb)->dqptr_sem);
-out:
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (got_lock[cnt]) {
-			oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
-			ocfs2_unlock_global_qf(oinfo, 0);
-		}
-	/* In case we bailed out because we had to do expensive locking
-	 * do it now... */
-	if (exclusive)
-		status = ocfs2_dquot_drop_slow(inode);
-	mlog_exit(status);
-	return status;
-}
-
 static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
 {
 	struct ocfs2_dquot *dquot =
@@ -991,8 +826,8 @@ static void ocfs2_destroy_dquot(struct dquot *dquot)
 }
 
 struct dquot_operations ocfs2_quota_operations = {
-	.initialize	= ocfs2_dquot_initialize,
-	.drop		= ocfs2_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
-- 
cgit v1.2.3


From 24179f488092267c9a033d7e25ce7a58af50ff79 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Mon, 19 Jan 2009 13:13:33 -0600
Subject: dlm: fix plock notify callback to lockd

We should use the original copy of the file_lock, fl, instead
of the copy, flc in the lockd notify callback.  The range in flc has
been modified by posix_lock_file(), so it will not match a copy of the
lock in lockd.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/plock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index eba87ff3177..502b1ea5ef6 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -168,7 +168,7 @@ static int dlm_plock_callback(struct plock_op *op)
 	notify = xop->callback;
 
 	if (op->info.rv) {
-		notify(flc, NULL, op->info.rv);
+		notify(fl, NULL, op->info.rv);
 		goto out;
 	}
 
@@ -187,7 +187,7 @@ static int dlm_plock_callback(struct plock_op *op)
 			  (unsigned long long)op->info.number, file, fl);
 	}
 
-	rv = notify(flc, NULL, 0);
+	rv = notify(fl, NULL, 0);
 	if (rv) {
 		/* XXX: We need to cancel the fs lock here: */
 		log_print("dlm_plock_callback: lock granted after lock request "
-- 
cgit v1.2.3


From 20d5a39929232a715f29e6cb7e3f0d0c790f41eb Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 21 Jan 2009 11:34:50 -0500
Subject: dlm: initialize file_lock struct in GETLK before copying conflicting
 lock

dlm_posix_get fills out the relevant fields in the file_lock before
returning when there is a lock conflict, but doesn't clean out any of
the other fields in the file_lock.

When nfsd does a NFSv4 lockt call, it sets the fl_lmops to
nfsd_posix_mng_ops before calling the lower fs. When the lock comes back
after testing a lock on GFS2, it still has that field set. This confuses
nfsd into thinking that the file_lock is a nfsd4 lock.

Fix this by making DLM reinitialize the file_lock before copying the
fields from the conflicting lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/plock.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 502b1ea5ef6..894a32d438d 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -304,7 +304,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	if (rv == -ENOENT)
 		rv = 0;
 	else if (rv > 0) {
+		locks_init_lock(fl);
 		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->fl_flags = FL_POSIX;
 		fl->fl_pid = op->info.pid;
 		fl->fl_start = op->info.start;
 		fl->fl_end = op->info.end;
-- 
cgit v1.2.3


From 74e2d06521913443c7e2697037909f5efc200ec5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 21 Jan 2009 15:22:17 +1100
Subject: Long btree pointers are still 64 bit on disk

[XFS] Long btree pointers are still 64 bit on disk

On 32 bit machines with CONFIG_LBD=n, XFS reduces the
in memory size of xfs_fsblock_t to 32 bits so that it
will fit within 32 bit addressing. However, the disk format
for long btree pointers are still 64 bits in size.

The recent btree rewrite failed to take this into account
when initialising new btree blocks, setting sibling pointers
to NULL and checking if they are NULL. Hence checking whether
a 64 bit NULL was the same as a 32 bit NULL was failing,
resulting in NULL sibling pointers failing to be detected
correctly. This showed up as WANT_CORRUPTED_GOTO shutdowns
in xfs_btree_delrec.

Fix this by making all the comparisons and setting of long
pointer btree NULL blocks to the disk format, not the
in memory format. i.e. use NULLDFSBNO.

Reported-by: Alexander Beregalov <a.beregalov@gmail.com>
Reported-by: Jacek Luczak <difrost.kernel@gmail.com>
Reported-by: Danny ter Haar <dth@dth.net>
Tested-by: Jacek Luczak <difrost.kernel@gmail.com>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_btree.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 4681519ded9..e73c332eb23 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -843,7 +843,7 @@ xfs_btree_ptr_is_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+		return be64_to_cpu(ptr->l) == NULLDFSBNO;
 	else
 		return be32_to_cpu(ptr->s) == NULLAGBLOCK;
 }
@@ -854,7 +854,7 @@ xfs_btree_set_ptr_null(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(NULLFSBLOCK);
+		ptr->l = cpu_to_be64(NULLDFSBNO);
 	else
 		ptr->s = cpu_to_be32(NULLAGBLOCK);
 }
@@ -918,8 +918,8 @@ xfs_btree_init_block(
 	new->bb_numrecs = cpu_to_be16(numrecs);
 
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
-		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
 	} else {
 		new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
@@ -971,7 +971,7 @@ xfs_btree_ptr_to_daddr(
 	union xfs_btree_ptr	*ptr)
 {
 	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+		ASSERT(be64_to_cpu(ptr->l) != NULLDFSBNO);
 
 		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
 	} else {
-- 
cgit v1.2.3


From b16ecfe2f985f77901a36ee5a99c7d3400313341 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:22:31 +0300
Subject: fs/Kconfig: move reiserfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 86 +----------------------------------------------------
 fs/reiserfs/Kconfig | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 85 deletions(-)
 create mode 100644 fs/reiserfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 51307b0fdf0..03fde694969 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -27,91 +27,7 @@ config FS_MBCACHE
 	default y if EXT4_FS=y && EXT4_FS_XATTR
 	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
-config REISERFS_FS
-	tristate "Reiserfs support"
-	help
-	  Stores not just filenames but the files themselves in a balanced
-	  tree.  Uses journalling.
-
-	  Balanced trees are more efficient than traditional file system
-	  architectural foundations.
-
-	  In general, ReiserFS is as fast as ext2, but is very efficient with
-	  large directories and small files.  Additional patches are needed
-	  for NFS and quotas, please see <http://www.namesys.com/> for links.
-
-	  It is more easily extended to have features currently found in
-	  database and keyword search systems than block allocation based file
-	  systems are.  The next version will be so extended, and will support
-	  plugins consistent with our motto ``It takes more than a license to
-	  make source code open.''
-
-	  Read <http://www.namesys.com/> to learn more about reiserfs.
-
-	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
-
-	  If you like it, you can pay us to add new features to it that you
-	  need, buy a support contract, or pay us to port it to another OS.
-
-config REISERFS_CHECK
-	bool "Enable reiserfs debug mode"
-	depends on REISERFS_FS
-	help
-	  If you set this to Y, then ReiserFS will perform every check it can
-	  possibly imagine of its internal consistency throughout its
-	  operation.  It will also go substantially slower.  More than once we
-	  have forgotten that this was on, and then gone despondent over the
-	  latest benchmarks.:-) Use of this option allows our team to go all
-	  out in checking for consistency when debugging without fear of its
-	  effect on end users.  If you are on the verge of sending in a bug
-	  report, say Y and you might get a useful error message.  Almost
-	  everyone should say N.
-
-config REISERFS_PROC_INFO
-	bool "Stats in /proc/fs/reiserfs"
-	depends on REISERFS_FS && PROC_FS
-	help
-	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
-	  various ReiserFS statistics and internal data at the expense of
-	  making your kernel or module slightly larger (+8 KB). This also
-	  increases the amount of kernel memory required for each mount.
-	  Almost everyone but ReiserFS developers and people fine-tuning
-	  reiserfs or tracing problems should say N.
-
-config REISERFS_FS_XATTR
-	bool "ReiserFS extended attributes"
-	depends on REISERFS_FS
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config REISERFS_FS_POSIX_ACL
-	bool "ReiserFS POSIX Access Control Lists"
-	depends on REISERFS_FS_XATTR
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config REISERFS_FS_SECURITY
-	bool "ReiserFS Security Labels"
-	depends on REISERFS_FS_XATTR
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the ReiserFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
+source "fs/reiserfs/Kconfig"
 
 config JFS_FS
 	tristate "JFS filesystem support"
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
new file mode 100644
index 00000000000..949b8c6addc
--- /dev/null
+++ b/fs/reiserfs/Kconfig
@@ -0,0 +1,85 @@
+config REISERFS_FS
+	tristate "Reiserfs support"
+	help
+	  Stores not just filenames but the files themselves in a balanced
+	  tree.  Uses journalling.
+
+	  Balanced trees are more efficient than traditional file system
+	  architectural foundations.
+
+	  In general, ReiserFS is as fast as ext2, but is very efficient with
+	  large directories and small files.  Additional patches are needed
+	  for NFS and quotas, please see <http://www.namesys.com/> for links.
+
+	  It is more easily extended to have features currently found in
+	  database and keyword search systems than block allocation based file
+	  systems are.  The next version will be so extended, and will support
+	  plugins consistent with our motto ``It takes more than a license to
+	  make source code open.''
+
+	  Read <http://www.namesys.com/> to learn more about reiserfs.
+
+	  Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com.
+
+	  If you like it, you can pay us to add new features to it that you
+	  need, buy a support contract, or pay us to port it to another OS.
+
+config REISERFS_CHECK
+	bool "Enable reiserfs debug mode"
+	depends on REISERFS_FS
+	help
+	  If you set this to Y, then ReiserFS will perform every check it can
+	  possibly imagine of its internal consistency throughout its
+	  operation.  It will also go substantially slower.  More than once we
+	  have forgotten that this was on, and then gone despondent over the
+	  latest benchmarks.:-) Use of this option allows our team to go all
+	  out in checking for consistency when debugging without fear of its
+	  effect on end users.  If you are on the verge of sending in a bug
+	  report, say Y and you might get a useful error message.  Almost
+	  everyone should say N.
+
+config REISERFS_PROC_INFO
+	bool "Stats in /proc/fs/reiserfs"
+	depends on REISERFS_FS && PROC_FS
+	help
+	  Create under /proc/fs/reiserfs a hierarchy of files, displaying
+	  various ReiserFS statistics and internal data at the expense of
+	  making your kernel or module slightly larger (+8 KB). This also
+	  increases the amount of kernel memory required for each mount.
+	  Almost everyone but ReiserFS developers and people fine-tuning
+	  reiserfs or tracing problems should say N.
+
+config REISERFS_FS_XATTR
+	bool "ReiserFS extended attributes"
+	depends on REISERFS_FS
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config REISERFS_FS_POSIX_ACL
+	bool "ReiserFS POSIX Access Control Lists"
+	depends on REISERFS_FS_XATTR
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config REISERFS_FS_SECURITY
+	bool "ReiserFS Security Labels"
+	depends on REISERFS_FS_XATTR
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the ReiserFS filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
-- 
cgit v1.2.3


From f5c77969b33cc5cbb4534289bf23cb1794f9d37c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:24:27 +0300
Subject: fs/Kconfig: move jfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 51 +--------------------------------------------------
 fs/jfs/Kconfig | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 50 deletions(-)
 create mode 100644 fs/jfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 03fde694969..b39675cc0fc 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,56 +28,7 @@ config FS_MBCACHE
 	default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
 
 source "fs/reiserfs/Kconfig"
-
-config JFS_FS
-	tristate "JFS filesystem support"
-	select NLS
-	help
-	  This is a port of IBM's Journaled Filesystem .  More information is
-	  available in the file <file:Documentation/filesystems/jfs.txt>.
-
-	  If you do not intend to use the JFS filesystem, say N.
-
-config JFS_POSIX_ACL
-	bool "JFS POSIX Access Control Lists"
-	depends on JFS_FS
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config JFS_SECURITY
-	bool "JFS Security Labels"
-	depends on JFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the jfs filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JFS_DEBUG
-	bool "JFS debugging"
-	depends on JFS_FS
-	help
-	  If you are experiencing any problems with the JFS filesystem, say
-	  Y here.  This will result in additional debugging messages to be
-	  written to the system log.  Under normal circumstances, this
-	  results in very little overhead.
-
-config JFS_STATISTICS
-	bool "JFS statistics"
-	depends on JFS_FS
-	help
-	  Enabling this option will cause statistics from the JFS file system
-	  to be made available to the user in the /proc/fs/jfs/ directory.
+source "fs/jfs/Kconfig"
 
 config FS_POSIX_ACL
 # Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4)
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
new file mode 100644
index 00000000000..9ff619a6f9c
--- /dev/null
+++ b/fs/jfs/Kconfig
@@ -0,0 +1,49 @@
+config JFS_FS
+	tristate "JFS filesystem support"
+	select NLS
+	help
+	  This is a port of IBM's Journaled Filesystem .  More information is
+	  available in the file <file:Documentation/filesystems/jfs.txt>.
+
+	  If you do not intend to use the JFS filesystem, say N.
+
+config JFS_POSIX_ACL
+	bool "JFS POSIX Access Control Lists"
+	depends on JFS_FS
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFS_SECURITY
+	bool "JFS Security Labels"
+	depends on JFS_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jfs filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFS_DEBUG
+	bool "JFS debugging"
+	depends on JFS_FS
+	help
+	  If you are experiencing any problems with the JFS filesystem, say
+	  Y here.  This will result in additional debugging messages to be
+	  written to the system log.  Under normal circumstances, this
+	  results in very little overhead.
+
+config JFS_STATISTICS
+	bool "JFS statistics"
+	depends on JFS_FS
+	help
+	  Enabling this option will cause statistics from the JFS file system
+	  to be made available to the user in the /proc/fs/jfs/ directory.
-- 
cgit v1.2.3


From 2fe4371dff3f1a5a1f7d91f1b090076954f4d17e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:26:11 +0300
Subject: fs/Kconfig: move ocfs2 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 87 +-------------------------------------------------------
 fs/ocfs2/Kconfig | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 86 deletions(-)
 create mode 100644 fs/ocfs2/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b39675cc0fc..9fbc43f973d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -49,92 +49,7 @@ config FILE_LOCKING
 
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
-
-config OCFS2_FS
-	tristate "OCFS2 file system support"
-	depends on NET && SYSFS
-	select CONFIGFS_FS
-	select JBD2
-	select CRC32
-	select QUOTA
-	select QUOTA_TREE
-	help
-	  OCFS2 is a general purpose extent based shared disk cluster file
-	  system with many similarities to ext3. It supports 64 bit inode
-	  numbers, and has automatically extending metadata groups which may
-	  also make it attractive for non-clustered use.
-
-	  You'll want to install the ocfs2-tools package in order to at least
-	  get "mount.ocfs2".
-
-	  Project web page:    http://oss.oracle.com/projects/ocfs2
-	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
-	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
-
-	  For more information on OCFS2, see the file
-	  <file:Documentation/filesystems/ocfs2.txt>.
-
-config OCFS2_FS_O2CB
-	tristate "O2CB Kernelspace Clustering"
-	depends on OCFS2_FS
-	default y
-	help
-	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
-	  Cluster Base.  It only requires a very small userspace component
-	  to configure it. This comes with the standard ocfs2-tools package.
-	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
-	  It cannot manage any other cluster applications.
-
-	  It is always safe to say Y here, as the clustering method is
-	  run-time selectable.
-
-config OCFS2_FS_USERSPACE_CLUSTER
-	tristate "OCFS2 Userspace Clustering"
-	depends on OCFS2_FS && DLM
-	default y
-	help
-	  This option will allow OCFS2 to use userspace clustering services
-	  in conjunction with the DLM in fs/dlm.  If you are using a
-	  userspace cluster manager, say Y here.
-
-	  It is safe to say Y, as the clustering method is run-time
-	  selectable.
-
-config OCFS2_FS_STATS
-	bool "OCFS2 statistics"
-	depends on OCFS2_FS
-	default y
-	help
-	  This option allows some fs statistics to be captured. Enabling
-	  this option may increase the memory consumption.
-
-config OCFS2_DEBUG_MASKLOG
-	bool "OCFS2 logging support"
-	depends on OCFS2_FS
-	default y
-	help
-	  The ocfs2 filesystem has an extensive logging system.  The system
-	  allows selection of events to log via files in /sys/o2cb/logmask/.
-	  This option will enlarge your kernel, but it allows debugging of
-	  ocfs2 filesystem issues.
-
-config OCFS2_DEBUG_FS
-	bool "OCFS2 expensive checks"
-	depends on OCFS2_FS
-	default n
-	help
-	  This option will enable expensive consistency checks. Enable
-	  this option for debugging only as it is likely to decrease
-	  performance of the filesystem.
-
-config OCFS2_FS_POSIX_ACL
-	bool "OCFS2 POSIX Access Control Lists"
-	depends on OCFS2_FS
-	select FS_POSIX_ACL
-	default n
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+source "fs/ocfs2/Kconfig"
 
 config BTRFS_FS
 	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
new file mode 100644
index 00000000000..701b7a3a872
--- /dev/null
+++ b/fs/ocfs2/Kconfig
@@ -0,0 +1,85 @@
+config OCFS2_FS
+	tristate "OCFS2 file system support"
+	depends on NET && SYSFS
+	select CONFIGFS_FS
+	select JBD2
+	select CRC32
+	select QUOTA
+	select QUOTA_TREE
+	help
+	  OCFS2 is a general purpose extent based shared disk cluster file
+	  system with many similarities to ext3. It supports 64 bit inode
+	  numbers, and has automatically extending metadata groups which may
+	  also make it attractive for non-clustered use.
+
+	  You'll want to install the ocfs2-tools package in order to at least
+	  get "mount.ocfs2".
+
+	  Project web page:    http://oss.oracle.com/projects/ocfs2
+	  Tools web page:      http://oss.oracle.com/projects/ocfs2-tools
+	  OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+	  For more information on OCFS2, see the file
+	  <file:Documentation/filesystems/ocfs2.txt>.
+
+config OCFS2_FS_O2CB
+	tristate "O2CB Kernelspace Clustering"
+	depends on OCFS2_FS
+	default y
+	help
+	  OCFS2 includes a simple kernelspace clustering package, the OCFS2
+	  Cluster Base.  It only requires a very small userspace component
+	  to configure it. This comes with the standard ocfs2-tools package.
+	  O2CB is limited to maintaining a cluster for OCFS2 file systems.
+	  It cannot manage any other cluster applications.
+
+	  It is always safe to say Y here, as the clustering method is
+	  run-time selectable.
+
+config OCFS2_FS_USERSPACE_CLUSTER
+	tristate "OCFS2 Userspace Clustering"
+	depends on OCFS2_FS && DLM
+	default y
+	help
+	  This option will allow OCFS2 to use userspace clustering services
+	  in conjunction with the DLM in fs/dlm.  If you are using a
+	  userspace cluster manager, say Y here.
+
+	  It is safe to say Y, as the clustering method is run-time
+	  selectable.
+
+config OCFS2_FS_STATS
+	bool "OCFS2 statistics"
+	depends on OCFS2_FS
+	default y
+	help
+	  This option allows some fs statistics to be captured. Enabling
+	  this option may increase the memory consumption.
+
+config OCFS2_DEBUG_MASKLOG
+	bool "OCFS2 logging support"
+	depends on OCFS2_FS
+	default y
+	help
+	  The ocfs2 filesystem has an extensive logging system.  The system
+	  allows selection of events to log via files in /sys/o2cb/logmask/.
+	  This option will enlarge your kernel, but it allows debugging of
+	  ocfs2 filesystem issues.
+
+config OCFS2_DEBUG_FS
+	bool "OCFS2 expensive checks"
+	depends on OCFS2_FS
+	default n
+	help
+	  This option will enable expensive consistency checks. Enable
+	  this option for debugging only as it is likely to decrease
+	  performance of the filesystem.
+
+config OCFS2_FS_POSIX_ACL
+	bool "OCFS2 POSIX Access Control Lists"
+	depends on OCFS2_FS
+	select FS_POSIX_ACL
+	default n
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
-- 
cgit v1.2.3


From 335debee07f2d4187a6073d7764ed56bb2ae52f4 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:27:30 +0300
Subject: fs/Kconfig: move btrfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 20 +-------------------
 fs/btrfs/Kconfig | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 19 deletions(-)
 create mode 100644 fs/btrfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9fbc43f973d..51f2aba92c2 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,25 +50,7 @@ config FILE_LOCKING
 source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
-
-config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
-	select LIBCRC32C
-	select ZLIB_INFLATE
-	select ZLIB_DEFLATE
-	help
-	  Btrfs is a new filesystem with extents, writable snapshotting,
-	  support for multiple devices and many more features.
-
-	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
-	  FINALIZED.  You should say N here unless you are interested in
-	  testing Btrfs with non-critical data.
-
-	  To compile this file system support as a module, choose M here. The
-	  module will be called btrfs.
-
-	  If unsure, say N.
+source "fs/btrfs/Kconfig"
 
 endif # BLOCK
 
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
new file mode 100644
index 00000000000..f8fcf999ea1
--- /dev/null
+++ b/fs/btrfs/Kconfig
@@ -0,0 +1,18 @@
+config BTRFS_FS
+	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
+	depends on EXPERIMENTAL
+	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	help
+	  Btrfs is a new filesystem with extents, writable snapshotting,
+	  support for multiple devices and many more features.
+
+	  Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
+	  FINALIZED.  You should say N here unless you are interested in
+	  testing Btrfs with non-critical data.
+
+	  To compile this file system support as a module, choose M here. The
+	  module will be called btrfs.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 90ffd467933eaf581e11fec51e7ba16fc9bd542d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:31:56 +0300
Subject: fs/Kconfig: move autofs, autofs4 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig         | 44 ++------------------------------------------
 fs/autofs/Kconfig  | 21 +++++++++++++++++++++
 fs/autofs4/Kconfig | 20 ++++++++++++++++++++
 3 files changed, 43 insertions(+), 42 deletions(-)
 create mode 100644 fs/autofs/Kconfig
 create mode 100644 fs/autofs4/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 51f2aba92c2..70527fe6b63 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -112,48 +112,8 @@ config QUOTACTL
 	depends on XFS_QUOTA || QUOTA
 	default y
 
-config AUTOFS_FS
-	tristate "Kernel automounter support"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from the autofs
-	  package; you can find the location in <file:Documentation/Changes>.
-	  You also want to answer Y to "NFS file system support", below.
-
-	  If you want to use the newer version of the automounter with more
-	  features, say N here and say Y to "Kernel automounter v4 support",
-	  below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs.
-
-	  If you are not a part of a fairly large, distributed network, you
-	  probably do not need an automounter, and can say N here.
-
-config AUTOFS4_FS
-	tristate "Kernel automounter version 4 support (also supports v3)"
-	help
-	  The automounter is a tool to automatically mount remote file systems
-	  on demand. This implementation is partially kernel-based to reduce
-	  overhead in the already-mounted case; this is unlike the BSD
-	  automounter (amd), which is a pure user space daemon.
-
-	  To use the automounter you need the user-space tools from
-	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
-	  want to answer Y to "NFS file system support", below.
-
-	  To compile this support as a module, choose M here: the module will be
-	  called autofs4.  You will need to add "alias autofs autofs4" to your
-	  modules configuration file.
-
-	  If you are not a part of a fairly large, distributed network or
-	  don't have a laptop which needs to dynamically reconfigure to the
-	  local network, you probably do not need an automounter, and can say
-	  N here.
+source "fs/autofs/Kconfig"
+source "fs/autofs4/Kconfig"
 
 config FUSE_FS
 	tristate "FUSE (Filesystem in Userspace) support"
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 00000000000..5f3bea90911
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,21 @@
+config AUTOFS_FS
+	tristate "Kernel automounter support"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from the autofs
+	  package; you can find the location in <file:Documentation/Changes>.
+	  You also want to answer Y to "NFS file system support", below.
+
+	  If you want to use the newer version of the automounter with more
+	  features, say N here and say Y to "Kernel automounter v4 support",
+	  below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs.
+
+	  If you are not a part of a fairly large, distributed network, you
+	  probably do not need an automounter, and can say N here.
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
new file mode 100644
index 00000000000..1204d6384d3
--- /dev/null
+++ b/fs/autofs4/Kconfig
@@ -0,0 +1,20 @@
+config AUTOFS4_FS
+	tristate "Kernel automounter version 4 support (also supports v3)"
+	help
+	  The automounter is a tool to automatically mount remote file systems
+	  on demand. This implementation is partially kernel-based to reduce
+	  overhead in the already-mounted case; this is unlike the BSD
+	  automounter (amd), which is a pure user space daemon.
+
+	  To use the automounter you need the user-space tools from
+	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+	  want to answer Y to "NFS file system support", below.
+
+	  To compile this support as a module, choose M here: the module will be
+	  called autofs4.  You will need to add "alias autofs autofs4" to your
+	  modules configuration file.
+
+	  If you are not a part of a fairly large, distributed network or
+	  don't have a laptop which needs to dynamically reconfigure to the
+	  local network, you probably do not need an automounter, and can say
+	  N here.
-- 
cgit v1.2.3


From 3ef7784e47975e31148c25b6fa795949fdc16d9c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:33:25 +0300
Subject: fs/Kconfig: move fuse out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 17 +----------------
 fs/fuse/Kconfig | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)
 create mode 100644 fs/fuse/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 70527fe6b63..8b36059d2b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -114,22 +114,7 @@ config QUOTACTL
 
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
-
-config FUSE_FS
-	tristate "FUSE (Filesystem in Userspace) support"
-	help
-	  With FUSE it is possible to implement a fully functional filesystem
-	  in a userspace program.
-
-	  There's also companion library: libfuse.  This library along with
-	  utilities is available from the FUSE homepage:
-	  <http://fuse.sourceforge.net/>
-
-	  See <file:Documentation/filesystems/fuse.txt> for more information.
-	  See <file:Documentation/Changes> for needed library/utility version.
-
-	  If you want to develop a userspace FS, or if you want to use
-	  a filesystem based on FUSE, answer Y or M.
+source "fs/fuse/Kconfig"
 
 config GENERIC_ACL
 	bool
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 00000000000..0cf160a94ed
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,15 @@
+config FUSE_FS
+	tristate "FUSE (Filesystem in Userspace) support"
+	help
+	  With FUSE it is possible to implement a fully functional filesystem
+	  in a userspace program.
+
+	  There's also a companion library: libfuse.  This library along with
+	  utilities is available from the FUSE homepage:
+	  <http://fuse.sourceforge.net/>
+
+	  See <file:Documentation/filesystems/fuse.txt> for more information.
+	  See <file:Documentation/Changes> for needed library/utility version.
+
+	  If you want to develop a userspace FS, or if you want to use
+	  a filesystem based on FUSE, answer Y or M.
-- 
cgit v1.2.3


From ddfaccd995b2d1bb1df4461ee9403ba9fdcbee04 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:35:21 +0300
Subject: fs/Kconfig: move iso9660, udf out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 60 ++------------------------------------------------------
 fs/isofs/Kconfig | 39 ++++++++++++++++++++++++++++++++++++
 fs/udf/Kconfig   | 18 +++++++++++++++++
 3 files changed, 59 insertions(+), 58 deletions(-)
 create mode 100644 fs/isofs/Kconfig
 create mode 100644 fs/udf/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 8b36059d2b0..b4868b8fd99 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -123,64 +123,8 @@ config GENERIC_ACL
 if BLOCK
 menu "CD-ROM/DVD Filesystems"
 
-config ISO9660_FS
-	tristate "ISO 9660 CDROM file system support"
-	help
-	  This is the standard file system used on CD-ROMs.  It was previously
-	  known as "High Sierra File System" and is called "hsfs" on other
-	  Unix systems.  The so-called Rock-Ridge extensions which allow for
-	  long Unix filenames and symbolic links are also supported by this
-	  driver.  If you have a CD-ROM drive and want to do more with it than
-	  just listen to audio CDs and watch its LEDs, say Y (and read
-	  <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>), thereby
-	  enlarging your kernel by about 27 KB; otherwise say N.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called isofs.
-
-config JOLIET
-	bool "Microsoft Joliet CDROM extensions"
-	depends on ISO9660_FS
-	select NLS
-	help
-	  Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
-	  which allows for long filenames in unicode format (unicode is the
-	  new 16 bit character code, successor to ASCII, which encodes the
-	  characters of almost all languages of the world; see
-	  <http://www.unicode.org/> for more information).  Say Y here if you
-	  want to be able to read Joliet CD-ROMs under Linux.
-
-config ZISOFS
-	bool "Transparent decompression extension"
-	depends on ISO9660_FS
-	select ZLIB_INFLATE
-	help
-	  This is a Linux-specific extension to RockRidge which lets you store
-	  data in compressed form on a CD-ROM and have it transparently
-	  decompressed when the CD-ROM is accessed.  See
-	  <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
-	  necessary to create such a filesystem.  Say Y here if you want to be
-	  able to read such compressed CD-ROMs.
-
-config UDF_FS
-	tristate "UDF file system support"
-	select CRC_ITU_T
-	help
-	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
-	  you intend to mount DVD discs or CDRW's written in packet mode, or
-	  if written to by other UDF utilities, such as DirectCD.
-	  Please read <file:Documentation/filesystems/udf.txt>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called udf.
-
-	  If unsure, say N.
-
-config UDF_NLS
-	bool
-	default y
-	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
+source "fs/isofs/Kconfig"
+source "fs/udf/Kconfig"
 
 endmenu
 endif # BLOCK
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
new file mode 100644
index 00000000000..8ab9878e367
--- /dev/null
+++ b/fs/isofs/Kconfig
@@ -0,0 +1,39 @@
+config ISO9660_FS
+	tristate "ISO 9660 CDROM file system support"
+	help
+	  This is the standard file system used on CD-ROMs.  It was previously
+	  known as "High Sierra File System" and is called "hsfs" on other
+	  Unix systems.  The so-called Rock-Ridge extensions which allow for
+	  long Unix filenames and symbolic links are also supported by this
+	  driver.  If you have a CD-ROM drive and want to do more with it than
+	  just listen to audio CDs and watch its LEDs, say Y (and read
+	  <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>), thereby
+	  enlarging your kernel by about 27 KB; otherwise say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called isofs.
+
+config JOLIET
+	bool "Microsoft Joliet CDROM extensions"
+	depends on ISO9660_FS
+	select NLS
+	help
+	  Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system
+	  which allows for long filenames in unicode format (unicode is the
+	  new 16 bit character code, successor to ASCII, which encodes the
+	  characters of almost all languages of the world; see
+	  <http://www.unicode.org/> for more information).  Say Y here if you
+	  want to be able to read Joliet CD-ROMs under Linux.
+
+config ZISOFS
+	bool "Transparent decompression extension"
+	depends on ISO9660_FS
+	select ZLIB_INFLATE
+	help
+	  This is a Linux-specific extension to RockRidge which lets you store
+	  data in compressed form on a CD-ROM and have it transparently
+	  decompressed when the CD-ROM is accessed.  See
+	  <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools
+	  necessary to create such a filesystem.  Say Y here if you want to be
+	  able to read such compressed CD-ROMs.
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
new file mode 100644
index 00000000000..0e0e99bd6bc
--- /dev/null
+++ b/fs/udf/Kconfig
@@ -0,0 +1,18 @@
+config UDF_FS
+	tristate "UDF file system support"
+	select CRC_ITU_T
+	help
+	  This is the new file system used on some CD-ROMs and DVDs. Say Y if
+	  you intend to mount DVD discs or CD-RWs written in packet mode, or
+	  if written to by other UDF utilities, such as DirectCD.
+	  Please read <file:Documentation/filesystems/udf.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called udf.
+
+	  If unsure, say N.
+
+config UDF_NLS
+	bool
+	default y
+	depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
-- 
cgit v1.2.3


From 1c6ace019bce5e918a3d6cd53948652e14850644 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:37:59 +0300
Subject: fs/Kconfig: move fat out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 98 +---------------------------------------------------------
 fs/fat/Kconfig | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 97 deletions(-)
 create mode 100644 fs/fat/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b4868b8fd99..fdb2c351b4a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -132,103 +132,7 @@ endif # BLOCK
 if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
-config FAT_FS
-	tristate
-	select NLS
-	help
-	  If you want to use one of the FAT-based file systems (the MS-DOS and
-	  VFAT (Windows 95) file systems), then you must say Y or M here
-	  to include FAT support. You will then be able to mount partitions or
-	  diskettes with FAT-based file systems and transparently access the
-	  files on them, i.e. MSDOS files will look and behave just like all
-	  other Unix files.
-
-	  This FAT support is not a file system in itself, it only provides
-	  the foundation for the other file systems. You will have to say Y or
-	  M to at least one of "MSDOS fs support" or "VFAT fs support" in
-	  order to make use of it.
-
-	  Another way to read and write MSDOS floppies and hard drive
-	  partitions from within Linux (but not transparently) is with the
-	  mtools ("man mtools") program suite. You don't need to say Y here in
-	  order to do that.
-
-	  If you need to move large files on floppies between a DOS and a
-	  Linux box, say Y here, mount the floppy under Linux with an MSDOS
-	  file system and use GNU tar's M option. GNU tar is a program
-	  available for Unix and DOS ("man tar" or "info tar").
-
-	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
-	  say Y.
-
-	  To compile this as a module, choose M here: the module will be called
-	  fat.  Note that if you compile the FAT support as a module, you
-	  cannot compile any of the FAT-based file systems into the kernel
-	  -- they will have to be modules as well.
-
-config MSDOS_FS
-	tristate "MSDOS fs support"
-	select FAT_FS
-	help
-	  This allows you to mount MSDOS partitions of your hard drive (unless
-	  they are compressed; to access compressed MSDOS partitions under
-	  Linux, you can either use the DOS emulator DOSEMU, described in the
-	  DOSEMU-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
-	  <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
-	  intend to use dosemu with a non-compressed MSDOS partition, say Y
-	  here) and MSDOS floppies. This means that file access becomes
-	  transparent, i.e. the MSDOS files look and behave just like all
-	  other Unix files.
-
-	  If you have Windows 95 or Windows NT installed on your MSDOS
-	  partitions, you should use the VFAT file system (say Y to "VFAT fs
-	  support" below), or you will not be able to see the long filenames
-	  generated by Windows 95 / Windows NT.
-
-	  This option will enlarge your kernel by about 7 KB. If unsure,
-	  answer Y. This will only work if you said Y to "DOS FAT fs support"
-	  as well. To compile this as a module, choose M here: the module will
-	  be called msdos.
-
-config VFAT_FS
-	tristate "VFAT (Windows-95) fs support"
-	select FAT_FS
-	help
-	  This option provides support for normal Windows file systems with
-	  long filenames.  That includes non-compressed FAT-based file systems
-	  used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
-	  programs from the mtools package.
-
-	  The VFAT support enlarges your kernel by about 10 KB and it only
-	  works if you said Y to the "DOS FAT fs support" above.  Please read
-	  the file <file:Documentation/filesystems/vfat.txt> for details.  If
-	  unsure, say Y.
-
-	  To compile this as a module, choose M here: the module will be called
-	  vfat.
-
-config FAT_DEFAULT_CODEPAGE
-	int "Default codepage for FAT"
-	depends on MSDOS_FS || VFAT_FS
-	default 437
-	help
-	  This option should be set to the codepage of your FAT filesystems.
-	  It can be overridden with the "codepage" mount option.
-	  See <file:Documentation/filesystems/vfat.txt> for more information.
-
-config FAT_DEFAULT_IOCHARSET
-	string "Default iocharset for FAT"
-	depends on VFAT_FS
-	default "iso8859-1"
-	help
-	  Set this to the default input/output character set you'd
-	  like FAT to use. It should probably match the character set
-	  that most of your FAT filesystems use, and can be overridden
-	  with the "iocharset" mount option for FAT filesystems.
-	  Note that "utf8" is not recommended for FAT filesystems.
-	  If unsure, you shouldn't set "utf8" here.
-	  See <file:Documentation/filesystems/vfat.txt> for more information.
+source "fs/fat/Kconfig"
 
 config NTFS_FS
 	tristate "NTFS file system support"
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
new file mode 100644
index 00000000000..d0a69ff2537
--- /dev/null
+++ b/fs/fat/Kconfig
@@ -0,0 +1,97 @@
+config FAT_FS
+	tristate
+	select NLS
+	help
+	  If you want to use one of the FAT-based file systems (the MS-DOS and
+	  VFAT (Windows 95) file systems), then you must say Y or M here
+	  to include FAT support. You will then be able to mount partitions or
+	  diskettes with FAT-based file systems and transparently access the
+	  files on them, i.e. MSDOS files will look and behave just like all
+	  other Unix files.
+
+	  This FAT support is not a file system in itself, it only provides
+	  the foundation for the other file systems. You will have to say Y or
+	  M to at least one of "MSDOS fs support" or "VFAT fs support" in
+	  order to make use of it.
+
+	  Another way to read and write MSDOS floppies and hard drive
+	  partitions from within Linux (but not transparently) is with the
+	  mtools ("man mtools") program suite. You don't need to say Y here in
+	  order to do that.
+
+	  If you need to move large files on floppies between a DOS and a
+	  Linux box, say Y here, mount the floppy under Linux with an MSDOS
+	  file system and use GNU tar's M option. GNU tar is a program
+	  available for Unix and DOS ("man tar" or "info tar").
+
+	  The FAT support will enlarge your kernel by about 37 KB. If unsure,
+	  say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  fat.  Note that if you compile the FAT support as a module, you
+	  cannot compile any of the FAT-based file systems into the kernel
+	  -- they will have to be modules as well.
+
+config MSDOS_FS
+	tristate "MSDOS fs support"
+	select FAT_FS
+	help
+	  This allows you to mount MSDOS partitions of your hard drive (unless
+	  they are compressed; to access compressed MSDOS partitions under
+	  Linux, you can either use the DOS emulator DOSEMU, described in the
+	  DOSEMU-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in
+	  <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you
+	  intend to use dosemu with a non-compressed MSDOS partition, say Y
+	  here) and MSDOS floppies. This means that file access becomes
+	  transparent, i.e. the MSDOS files look and behave just like all
+	  other Unix files.
+
+	  If you have Windows 95 or Windows NT installed on your MSDOS
+	  partitions, you should use the VFAT file system (say Y to "VFAT fs
+	  support" below), or you will not be able to see the long filenames
+	  generated by Windows 95 / Windows NT.
+
+	  This option will enlarge your kernel by about 7 KB. If unsure,
+	  answer Y. This will only work if you said Y to "DOS FAT fs support"
+	  as well. To compile this as a module, choose M here: the module will
+	  be called msdos.
+
+config VFAT_FS
+	tristate "VFAT (Windows-95) fs support"
+	select FAT_FS
+	help
+	  This option provides support for normal Windows file systems with
+	  long filenames.  That includes non-compressed FAT-based file systems
+	  used by Windows 95, Windows 98, Windows NT 4.0, and the Unix
+	  programs from the mtools package.
+
+	  The VFAT support enlarges your kernel by about 10 KB and it only
+	  works if you said Y to the "DOS FAT fs support" above.  Please read
+	  the file <file:Documentation/filesystems/vfat.txt> for details.  If
+	  unsure, say Y.
+
+	  To compile this as a module, choose M here: the module will be called
+	  vfat.
+
+config FAT_DEFAULT_CODEPAGE
+	int "Default codepage for FAT"
+	depends on MSDOS_FS || VFAT_FS
+	default 437
+	help
+	  This option should be set to the codepage of your FAT filesystems.
+	  It can be overridden with the "codepage" mount option.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
+
+config FAT_DEFAULT_IOCHARSET
+	string "Default iocharset for FAT"
+	depends on VFAT_FS
+	default "iso8859-1"
+	help
+	  Set this to the default input/output character set you'd
+	  like FAT to use. It should probably match the character set
+	  that most of your FAT filesystems use, and can be overridden
+	  with the "iocharset" mount option for FAT filesystems.
+	  Note that "utf8" is not recommended for FAT filesystems.
+	  If unsure, you shouldn't set "utf8" here.
+	  See <file:Documentation/filesystems/vfat.txt> for more information.
-- 
cgit v1.2.3


From 9d73ac9e8faffa3b930fcebbf4ebcd25f8061ada Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:39:20 +0300
Subject: fs/Kconfig: move ntfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 80 +--------------------------------------------------------
 fs/ntfs/Kconfig | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 79 deletions(-)
 create mode 100644 fs/ntfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index fdb2c351b4a..f746fd6cb72 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -133,85 +133,7 @@ if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
-
-config NTFS_FS
-	tristate "NTFS file system support"
-	select NLS
-	help
-	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
-	  Saying Y or M here enables read support.  There is partial, but
-	  safe, write support available.  For write support you must also
-	  say Y to "NTFS write support" below.
-
-	  There are also a number of user-space tools available, called
-	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
-	  without NTFS support enabled in the kernel.
-
-	  This is a rewrite from scratch of Linux NTFS support and replaced
-	  the old NTFS code starting with Linux 2.5.11.  A backport to
-	  the Linux 2.4 kernel series is separately available as a patch
-	  from the project web site.
-
-	  For more information see <file:Documentation/filesystems/ntfs.txt>
-	  and <http://www.linux-ntfs.org/>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ntfs.
-
-	  If you are not using Windows NT, 2000, XP or 2003 in addition to
-	  Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
-	bool "NTFS debugging support"
-	depends on NTFS_FS
-	help
-	  If you are experiencing any problems with the NTFS file system, say
-	  Y here.  This will result in additional consistency checks to be
-	  performed by the driver as well as additional debugging messages to
-	  be written to the system log.  Note that debugging messages are
-	  disabled by default.  To enable them, supply the option debug_msgs=1
-	  at the kernel command line when booting the kernel or as an option
-	  to insmod when loading the ntfs module.  Once the driver is active,
-	  you can enable debugging messages by doing (as root):
-	  echo 1 > /proc/sys/fs/ntfs-debug
-	  Replacing the "1" with "0" would disable debug messages.
-
-	  If you leave debugging messages disabled, this results in little
-	  overhead, but enabling debug messages results in very significant
-	  slowdown of the system.
-
-	  When reporting bugs, please try to have available a full dump of
-	  debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
-	bool "NTFS write support"
-	depends on NTFS_FS
-	help
-	  This enables the partial, but safe, write support in the NTFS driver.
-
-	  The only supported operation is overwriting existing files, without
-	  changing the file length.  No file or directory creation, deletion or
-	  renaming is possible.  Note only non-resident files can be written to
-	  so you may find that some very small files (<500 bytes or so) cannot
-	  be written to.
-
-	  While we cannot guarantee that it will not damage any data, we have
-	  so far not received a single report where the driver would have
-	  damaged someones data so we assume it is perfectly safe to use.
-
-	  Note:  While write support is safe in this version (a rewrite from
-	  scratch of the NTFS support), it should be noted that the old NTFS
-	  write support, included in Linux 2.5.10 and before (since 1997),
-	  is not safe.
-
-	  This is currently useful with TopologiLinux.  TopologiLinux is run
-	  on top of any DOS/Microsoft Windows system without partitioning your
-	  hard disk.  Unlike other Linux distributions TopologiLinux does not
-	  need its own partition.  For more information see
-	  <http://topologi-linux.sourceforge.net/>
-
-	  It is perfectly safe to say N here.
+source "fs/ntfs/Kconfig"
 
 endmenu
 endif # BLOCK
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 00000000000..f5a868cc915
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,78 @@
+config NTFS_FS
+	tristate "NTFS file system support"
+	select NLS
+	help
+	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+
+	  Saying Y or M here enables read support.  There is partial, but
+	  safe, write support available.  For write support you must also
+	  say Y to "NTFS write support" below.
+
+	  There are also a number of user-space tools available, called
+	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
+	  without NTFS support enabled in the kernel.
+
+	  This is a rewrite from scratch of Linux NTFS support and replaced
+	  the old NTFS code starting with Linux 2.5.11.  A backport to
+	  the Linux 2.4 kernel series is separately available as a patch
+	  from the project web site.
+
+	  For more information see <file:Documentation/filesystems/ntfs.txt>
+	  and <http://www.linux-ntfs.org/>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ntfs.
+
+	  If you are not using Windows NT, 2000, XP or 2003 in addition to
+	  Linux on your computer it is safe to say N.
+
+config NTFS_DEBUG
+	bool "NTFS debugging support"
+	depends on NTFS_FS
+	help
+	  If you are experiencing any problems with the NTFS file system, say
+	  Y here.  This will result in additional consistency checks to be
+	  performed by the driver as well as additional debugging messages to
+	  be written to the system log.  Note that debugging messages are
+	  disabled by default.  To enable them, supply the option debug_msgs=1
+	  at the kernel command line when booting the kernel or as an option
+	  to insmod when loading the ntfs module.  Once the driver is active,
+	  you can enable debugging messages by doing (as root):
+	  echo 1 > /proc/sys/fs/ntfs-debug
+	  Replacing the "1" with "0" would disable debug messages.
+
+	  If you leave debugging messages disabled, this results in little
+	  overhead, but enabling debug messages results in very significant
+	  slowdown of the system.
+
+	  When reporting bugs, please try to have available a full dump of
+	  debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+	bool "NTFS write support"
+	depends on NTFS_FS
+	help
+	  This enables the partial, but safe, write support in the NTFS driver.
+
+	  The only supported operation is overwriting existing files, without
+	  changing the file length.  No file or directory creation, deletion or
+	  renaming is possible.  Note only non-resident files can be written to
+	  so you may find that some very small files (<500 bytes or so) cannot
+	  be written to.
+
+	  While we cannot guarantee that it will not damage any data, we have
+	  so far not received a single report where the driver would have
+	  damaged someone's data so we assume it is perfectly safe to use.
+
+	  Note:  While write support is safe in this version (a rewrite from
+	  scratch of the NTFS support), it should be noted that the old NTFS
+	  write support, included in Linux 2.5.10 and before (since 1997),
+	  is not safe.
+
+	  This is currently useful with TopologiLinux.  TopologiLinux is run
+	  on top of any DOS/Microsoft Windows system without partitioning your
+	  hard disk.  Unlike other Linux distributions TopologiLinux does not
+	  need its own partition.  For more information see
+	  <http://topologi-linux.sourceforge.net/>
+
+	  It is perfectly safe to say N here.
-- 
cgit v1.2.3


From 5f3a211a8b02222498f134ea961fe29c97a4801f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:40:58 +0300
Subject: fs/Kconfig: move sysfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 25 +------------------------
 fs/sysfs/Kconfig | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 24 deletions(-)
 create mode 100644 fs/sysfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f746fd6cb72..e9103b9862b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -141,30 +141,7 @@ endif # BLOCK
 menu "Pseudo filesystems"
 
 source "fs/proc/Kconfig"
-
-config SYSFS
-	bool "sysfs file system support" if EMBEDDED
-	default y
-	help
-	The sysfs filesystem is a virtual filesystem that the kernel uses to
-	export internal kernel objects, their attributes, and their
-	relationships to one another.
-
-	Users can use sysfs to ascertain useful information about the running
-	kernel, such as the devices the kernel has discovered on each bus and
-	which driver each is bound to. sysfs can also be used to tune devices
-	and other kernel subsystems.
-
-	Some system agents rely on the information in sysfs to operate.
-	/sbin/hotplug uses device and object attributes in sysfs to assist in
-	delegating policy decisions, like persistently naming devices.
-
-	sysfs is currently used by the block subsystem to mount the root
-	partition.  If sysfs is disabled you must specify the boot device on
-	the kernel boot command line via its major and minor numbers.  For
-	example, "root=03:01" for /dev/hda1.
-
-	Designers of embedded systems may wish to say N here to conserve space.
+source "fs/sysfs/Kconfig"
 
 config TMPFS
 	bool "Virtual memory file system support (former shm fs)"
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
new file mode 100644
index 00000000000..f4b67588b9d
--- /dev/null
+++ b/fs/sysfs/Kconfig
@@ -0,0 +1,23 @@
+config SYSFS
+	bool "sysfs file system support" if EMBEDDED
+	default y
+	help
+	The sysfs filesystem is a virtual filesystem that the kernel uses to
+	export internal kernel objects, their attributes, and their
+	relationships to one another.
+
+	Users can use sysfs to ascertain useful information about the running
+	kernel, such as the devices the kernel has discovered on each bus and
+	which driver each is bound to. sysfs can also be used to tune devices
+	and other kernel subsystems.
+
+	Some system agents rely on the information in sysfs to operate.
+	/sbin/hotplug uses device and object attributes in sysfs to assist in
+	delegating policy decisions, like persistently naming devices.
+
+	sysfs is currently used by the block subsystem to mount the root
+	partition.  If sysfs is disabled you must specify the boot device on
+	the kernel boot command line via its major and minor numbers.  For
+	example, "root=03:01" for /dev/hda1.
+
+	Designers of embedded systems may wish to say N here to conserve space.
-- 
cgit v1.2.3


From 4591dabe27ec0f7928fb73d93694698e21dc769e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:42:52 +0300
Subject: fs/Kconfig: move configfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 12 +-----------
 fs/configfs/Kconfig | 11 +++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)
 create mode 100644 fs/configfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e9103b9862b..d7d7f1b9363 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -182,17 +182,7 @@ config HUGETLBFS
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
-config CONFIGFS_FS
-	tristate "Userspace-driven configuration filesystem"
-	depends on SYSFS
-	help
-	  configfs is a ram-based filesystem that provides the converse
-	  of sysfs's functionality. Where sysfs is a filesystem-based
-	  view of kernel objects, configfs is a filesystem-based manager
-	  of kernel objects, or config_items.
-
-	  Both sysfs and configfs can and should exist together on the
-	  same system. One is not a replacement for the other.
+source "fs/configfs/Kconfig"
 
 endmenu
 
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
new file mode 100644
index 00000000000..13587cc97a0
--- /dev/null
+++ b/fs/configfs/Kconfig
@@ -0,0 +1,11 @@
+config CONFIGFS_FS
+	tristate "Userspace-driven configuration filesystem"
+	depends on SYSFS
+	help
+	  configfs is a ram-based filesystem that provides the converse
+	  of sysfs's functionality. Where sysfs is a filesystem-based
+	  view of kernel objects, configfs is a filesystem-based manager
+	  of kernel objects, or config_items.
+
+	  Both sysfs and configfs can and should exist together on the
+	  same system. One is not a replacement for the other.
-- 
cgit v1.2.3


From bc2de2ae67177bc60bb9ab41c97ea4f827d52821 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:48:46 +0300
Subject: fs/Kconfig: move adfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 28 +---------------------------
 fs/adfs/Kconfig | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 27 deletions(-)
 create mode 100644 fs/adfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d7d7f1b9363..e4492c75efe 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -201,33 +201,7 @@ menuconfig MISC_FILESYSTEMS
 
 if MISC_FILESYSTEMS
 
-config ADFS_FS
-	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  The Acorn Disc Filing System is the standard file system of the
-	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
-	  systems and the Acorn Archimedes range of machines. If you say Y
-	  here, Linux will be able to read from ADFS partitions on hard drives
-	  and from ADFS-formatted floppy discs. If you also want to be able to
-	  write to those devices, say Y to "ADFS write support" below.
-
-	  The ADFS partition should be the first partition (i.e.,
-	  /dev/[hs]d?1) on each of your drives. Please read the file
-	  <file:Documentation/filesystems/adfs.txt> for further details.
-
-	  To compile this code as a module, choose M here: the module will be
-	  called adfs.
-
-	  If unsure, say N.
-
-config ADFS_FS_RW
-	bool "ADFS write support (DANGEROUS)"
-	depends on ADFS_FS
-	help
-	  If you say Y here, you will be able to write to ADFS partitions on
-	  hard drives and ADFS-formatted floppy disks. This is experimental
-	  codes, so if you're unsure, say N.
+source "fs/adfs/Kconfig"
 
 config AFFS_FS
 	tristate "Amiga FFS file system support (EXPERIMENTAL)"
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
new file mode 100644
index 00000000000..e55182a7460
--- /dev/null
+++ b/fs/adfs/Kconfig
@@ -0,0 +1,27 @@
+config ADFS_FS
+	tristate "ADFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  The Acorn Disc Filing System is the standard file system of the
+	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
+	  systems and the Acorn Archimedes range of machines. If you say Y
+	  here, Linux will be able to read from ADFS partitions on hard drives
+	  and from ADFS-formatted floppy discs. If you also want to be able to
+	  write to those devices, say Y to "ADFS write support" below.
+
+	  The ADFS partition should be the first partition (i.e.,
+	  /dev/[hs]d?1) on each of your drives. Please read the file
+	  <file:Documentation/filesystems/adfs.txt> for further details.
+
+	  To compile this code as a module, choose M here: the module will be
+	  called adfs.
+
+	  If unsure, say N.
+
+config ADFS_FS_RW
+	bool "ADFS write support (DANGEROUS)"
+	depends on ADFS_FS
+	help
+	  If you say Y here, you will be able to write to ADFS partitions on
+	  hard drives and ADFS-formatted floppy disks. This is experimental
+	  code, so if you're unsure, say N.
-- 
cgit v1.2.3


From 10951bf05d952bf6d13094f66a0dccd11dec311e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:49:44 +0300
Subject: fs/Kconfig: move affs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 23 +----------------------
 fs/affs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/affs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e4492c75efe..3e025af4d8b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -202,28 +202,7 @@ menuconfig MISC_FILESYSTEMS
 if MISC_FILESYSTEMS
 
 source "fs/adfs/Kconfig"
-
-config AFFS_FS
-	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  The Fast File System (FFS) is the common file system used on hard
-	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
-	  if you want to be able to read and write files from and to an Amiga
-	  FFS partition on your hard drive.  Amiga floppies however cannot be
-	  read with this driver due to an incompatibility of the floppy
-	  controller used in an Amiga and the standard floppy controller in
-	  PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
-	  and <file:fs/affs/Changes>.
-
-	  With this driver you can also mount disk files used by Bernd
-	  Schmidt's Un*X Amiga Emulator
-	  (<http://www.freiburg.linux.de/~uae/>).
-	  If you want to do this, you will also need to say Y or M to "Loop
-	  device support", above.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called affs.  If unsure, say N.
+source "fs/affs/Kconfig"
 
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
new file mode 100644
index 00000000000..cfad9afb476
--- /dev/null
+++ b/fs/affs/Kconfig
@@ -0,0 +1,21 @@
+config AFFS_FS
+	tristate "Amiga FFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  The Fast File System (FFS) is the common file system used on hard
+	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20).  Say Y
+	  if you want to be able to read and write files from and to an Amiga
+	  FFS partition on your hard drive.  Amiga floppies however cannot be
+	  read with this driver due to an incompatibility of the floppy
+	  controller used in an Amiga and the standard floppy controller in
+	  PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
+	  and <file:fs/affs/Changes>.
+
+	  With this driver you can also mount disk files used by Bernd
+	  Schmidt's Un*X Amiga Emulator
+	  (<http://www.freiburg.linux.de/~uae/>).
+	  If you want to do this, you will also need to say Y or M to "Loop
+	  device support", above.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called affs.  If unsure, say N.
-- 
cgit v1.2.3


From 295c896cb95de18004ef5e1b53f44c2ad001f936 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:50:50 +0300
Subject: fs/Kconfig: move ecryptfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 13 +------------
 fs/ecryptfs/Kconfig | 11 +++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)
 create mode 100644 fs/ecryptfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3e025af4d8b..1c79baf55db 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -203,18 +203,7 @@ if MISC_FILESYSTEMS
 
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
-
-config ECRYPT_FS
-	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
-	help
-	  Encrypted filesystem that operates on the VFS layer.  See
-	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
-	  eCryptfs.  Userspace components are required and can be
-	  obtained from <http://ecryptfs.sf.net>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ecryptfs.
+source "fs/ecryptfs/Kconfig"
 
 config HFS_FS
 	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
new file mode 100644
index 00000000000..0c754e64232
--- /dev/null
+++ b/fs/ecryptfs/Kconfig
@@ -0,0 +1,11 @@
+config ECRYPT_FS
+	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && KEYS && CRYPTO && NET
+	help
+	  Encrypted filesystem that operates on the VFS layer.  See
+	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
+	  eCryptfs.  Userspace components are required and can be
+	  obtained from <http://ecryptfs.sf.net>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called ecryptfs.
-- 
cgit v1.2.3


From b08bac1f185b2281c3decb4f8e15e8f41f96e974 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:53:24 +0300
Subject: fs/Kconfig: move hfs, hfsplus out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig         | 29 ++---------------------------
 fs/hfs/Kconfig     | 12 ++++++++++++
 fs/hfsplus/Kconfig | 13 +++++++++++++
 3 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 fs/hfs/Kconfig
 create mode 100644 fs/hfsplus/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1c79baf55db..3b48ab4f0b7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -204,33 +204,8 @@ if MISC_FILESYSTEMS
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
-
-config HFS_FS
-	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select NLS
-	help
-	  If you say Y here, you will be able to mount Macintosh-formatted
-	  floppy disks and hard drive partitions with full read-write access.
-	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
-	  the available mount options.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called hfs.
-
-config HFSPLUS_FS
-	tristate "Apple Extended HFS file system support"
-	depends on BLOCK
-	select NLS
-	select NLS_UTF8
-	help
-	  If you say Y here, you will be able to mount extended format
-	  Macintosh-formatted hard drive partitions with full read-write access.
-
-	  This file system is often called HFS+ and was introduced with
-	  MacOS 8. It includes all Mac specific filesystem data such as
-	  data forks and creator codes, but it also has several UNIX
-	  style features such as file ownership and permissions.
+source "fs/hfs/Kconfig"
+source "fs/hfsplus/Kconfig"
 
 config BEFS_FS
 	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
new file mode 100644
index 00000000000..b77c5bc20f8
--- /dev/null
+++ b/fs/hfs/Kconfig
@@ -0,0 +1,12 @@
+config HFS_FS
+	tristate "Apple Macintosh file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select NLS
+	help
+	  If you say Y here, you will be able to mount Macintosh-formatted
+	  floppy disks and hard drive partitions with full read-write access.
+	  Please read <file:Documentation/filesystems/hfs.txt> to learn about
+	  the available mount options.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hfs.
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
new file mode 100644
index 00000000000..a63371815aa
--- /dev/null
+++ b/fs/hfsplus/Kconfig
@@ -0,0 +1,13 @@
+config HFSPLUS_FS
+	tristate "Apple Extended HFS file system support"
+	depends on BLOCK
+	select NLS
+	select NLS_UTF8
+	help
+	  If you say Y here, you will be able to mount extended format
+	  Macintosh-formatted hard drive partitions with full read-write access.
+
+	  This file system is often called HFS+ and was introduced with
+	  MacOS 8. It includes all Mac specific filesystem data such as
+	  data forks and creator codes, but it also has several UNIX
+	  style features such as file ownership and permissions.
-- 
cgit v1.2.3


From 0b09eb32985d5fbec567e83b18db3dec14d1fef9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:54:16 +0300
Subject: fs/Kconfig: move befs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 28 +---------------------------
 fs/befs/Kconfig | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 fs/befs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3b48ab4f0b7..cfddc0a76ad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -206,33 +206,7 @@ source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
 source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
-
-config BEFS_FS
-	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	select NLS
-	help
-	  The BeOS File System (BeFS) is the native file system of Be, Inc's
-	  BeOS. Notable features include support for arbitrary attributes
-	  on files and directories, and database-like indices on selected
-	  attributes. (Also note that this driver doesn't make those features
-	  available at this time). It is a 64 bit filesystem, so it supports
-	  extremely large volumes and files.
-
-	  If you use this filesystem, you should also say Y to at least one
-	  of the NLS (native language support) options below.
-
-	  If you don't know what this is about, say N.
-
-	  To compile this as a module, choose M here: the module will be
-	  called befs.
-
-config BEFS_DEBUG
-	bool "Debug BeFS"
-	depends on BEFS_FS
-	help
-	  If you say Y here, you can use the 'debug' mount option to enable
-	  debugging output from the driver.
+source "fs/befs/Kconfig"
 
 config BFS_FS
 	tristate "BFS file system support (EXPERIMENTAL)"
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
new file mode 100644
index 00000000000..7835d30f211
--- /dev/null
+++ b/fs/befs/Kconfig
@@ -0,0 +1,26 @@
+config BEFS_FS
+	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select NLS
+	help
+	  The BeOS File System (BeFS) is the native file system of Be, Inc's
+	  BeOS. Notable features include support for arbitrary attributes
+	  on files and directories, and database-like indices on selected
+	  attributes. (Also note that this driver doesn't make those features
+	  available at this time). It is a 64 bit filesystem, so it supports
+	  extremely large volumes and files.
+
+	  If you use this filesystem, you should also say Y to at least one
+	  of the NLS (native language support) options below.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be
+	  called befs.
+
+config BEFS_DEBUG
+	bool "Debug BeFS"
+	depends on BEFS_FS
+	help
+	  If you say Y here, you can use the 'debug' mount option to enable
+	  debugging output from the driver.
-- 
cgit v1.2.3


From 0ff423849de3fe98c06d30a8ac73103c8741914c Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:55:13 +0300
Subject: fs/Kconfig: move bfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 23 +----------------------
 fs/bfs/Kconfig | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 22 deletions(-)
 create mode 100644 fs/bfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index cfddc0a76ad..9acf3a2d231 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -207,28 +207,7 @@ source "fs/ecryptfs/Kconfig"
 source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
 source "fs/befs/Kconfig"
-
-config BFS_FS
-	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  Boot File System (BFS) is a file system used under SCO UnixWare to
-	  allow the bootloader access to the kernel image and other important
-	  files during the boot process.  It is usually mounted under /stand
-	  and corresponds to the slice marked as "STAND" in the UnixWare
-	  partition.  You should say Y if you want to read or write the files
-	  on your /stand slice from within Linux.  You then also need to say Y
-	  to "UnixWare slices support", below.  More information about the BFS
-	  file system is contained in the file
-	  <file:Documentation/filesystems/bfs.txt>.
-
-	  If you don't know what this is about, say N.
-
-	  To compile this as a module, choose M here: the module will be called
-	  bfs.  Note that the file system of your root partition (the one
-	  containing the directory /) cannot be compiled as a module.
-
-
+source "fs/bfs/Kconfig"
 
 config EFS_FS
 	tristate "EFS file system support (read only) (EXPERIMENTAL)"
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
new file mode 100644
index 00000000000..c2336c62024
--- /dev/null
+++ b/fs/bfs/Kconfig
@@ -0,0 +1,19 @@
+config BFS_FS
+	tristate "BFS file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  Boot File System (BFS) is a file system used under SCO UnixWare to
+	  allow the bootloader access to the kernel image and other important
+	  files during the boot process.  It is usually mounted under /stand
+	  and corresponds to the slice marked as "STAND" in the UnixWare
+	  partition.  You should say Y if you want to read or write the files
+	  on your /stand slice from within Linux.  You then also need to say Y
+	  to "UnixWare slices support", below.  More information about the BFS
+	  file system is contained in the file
+	  <file:Documentation/filesystems/bfs.txt>.
+
+	  If you don't know what this is about, say N.
+
+	  To compile this as a module, choose M here: the module will be called
+	  bfs.  Note that the file system of your root partition (the one
+	  containing the directory /) cannot be compiled as a module.
-- 
cgit v1.2.3


From 571f0a0bdeeb2d1692751b6c5df15dafb483c7ff Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:56:07 +0300
Subject: fs/Kconfig: move efs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 17 +----------------
 fs/efs/Kconfig | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 16 deletions(-)
 create mode 100644 fs/efs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9acf3a2d231..fad19083285 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -208,22 +208,7 @@ source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
 source "fs/befs/Kconfig"
 source "fs/bfs/Kconfig"
-
-config EFS_FS
-	tristate "EFS file system support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
-	help
-	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
-	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
-	  uses the XFS file system for hard disk partitions however).
-
-	  This implementation only offers read-only access. If you don't know
-	  what all this is about, it's safe to say N. For more information
-	  about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
-
-	  To compile the EFS file system support as a module, choose M here: the
-	  module will be called efs.
-
+source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
new file mode 100644
index 00000000000..6ebfc1c207a
--- /dev/null
+++ b/fs/efs/Kconfig
@@ -0,0 +1,14 @@
+config EFS_FS
+	tristate "EFS file system support (read only) (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	help
+	  EFS is an older file system used for non-ISO9660 CD-ROMs and hard
+	  disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+	  uses the XFS file system for hard disk partitions however).
+
+	  This implementation only offers read-only access. If you don't know
+	  what all this is about, it's safe to say N. For more information
+	  about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
+
+	  To compile the EFS file system support as a module, choose M here: the
+	  module will be called efs.
-- 
cgit v1.2.3


From 2a22783be0fbbd63599dd6aacf8bc2ddab941bf7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:56:54 +0300
Subject: fs/Kconfig: move cramfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig        | 21 +--------------------
 fs/cramfs/Kconfig | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 20 deletions(-)
 create mode 100644 fs/cramfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index fad19083285..d7b84dfed4f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -212,26 +212,7 @@ source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
-
-config CRAMFS
-	tristate "Compressed ROM file system support (cramfs)"
-	depends on BLOCK
-	select ZLIB_INFLATE
-	help
-	  Saying Y here includes support for CramFs (Compressed ROM File
-	  System).  CramFs is designed to be a simple, small, and compressed
-	  file system for ROM based embedded systems.  CramFs is read-only,
-	  limited to 256MB file systems (with 16MB files), and doesn't support
-	  16/32 bits uid/gid, hard links and timestamps.
-
-	  See <file:Documentation/filesystems/cramfs.txt> and
-	  <file:fs/cramfs/README> for further information.
-
-	  To compile this as a module, choose M here: the module will be called
-	  cramfs.  Note that the root file system (the one containing the
-	  directory /) cannot be compiled as a module.
-
-	  If unsure, say N.
+source "fs/cramfs/Kconfig"
 
 config SQUASHFS
 	tristate "SquashFS 4.0 - Squashed file system support"
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
new file mode 100644
index 00000000000..cd06466f365
--- /dev/null
+++ b/fs/cramfs/Kconfig
@@ -0,0 +1,19 @@
+config CRAMFS
+	tristate "Compressed ROM file system support (cramfs)"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for CramFs (Compressed ROM File
+	  System).  CramFs is designed to be a simple, small, and compressed
+	  file system for ROM based embedded systems.  CramFs is read-only,
+	  limited to 256MB file systems (with 16MB files), and doesn't support
+	  16/32 bits uid/gid, hard links and timestamps.
+
+	  See <file:Documentation/filesystems/cramfs.txt> and
+	  <file:fs/cramfs/README> for further information.
+
+	  To compile this as a module, choose M here: the module will be called
+	  cramfs.  Note that the root file system (the one containing the
+	  directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 22635ec9e0cb5afbc1eaa25495ae28da8416aac3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:57:46 +0300
Subject: fs/Kconfig: move squashfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 53 +----------------------------------------------------
 fs/squashfs/Kconfig | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 52 deletions(-)
 create mode 100644 fs/squashfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d7b84dfed4f..d44a698463c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -213,58 +213,7 @@ source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
-
-config SQUASHFS
-	tristate "SquashFS 4.0 - Squashed file system support"
-	depends on BLOCK
-	select ZLIB_INFLATE
-	help
-	  Saying Y here includes support for SquashFS 4.0 (a Compressed
-	  Read-Only File System).  Squashfs is a highly compressed read-only
-	  filesystem for Linux.  It uses zlib compression to compress both
-	  files, inodes and directories.  Inodes in the system are very small
-	  and all blocks are packed to minimise data overhead. Block sizes
-	  greater than 4K are supported up to a maximum of 1 Mbytes (default
-	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
-	  (larger than 4GB), full uid/gid information, hard links and
-	  timestamps.  
-
-	  Squashfs is intended for general read-only filesystem use, for
-	  archival use (i.e. in cases where a .tar.gz file may be used), and in
-	  embedded systems where low overhead is needed.  Further information
-	  and tools are available from http://squashfs.sourceforge.net.
-
-	  If you want to compile this as a module ( = code which can be
-	  inserted in and removed from the running kernel whenever you want),
-	  say M here and read <file:Documentation/modules.txt>.  The module
-	  will be called squashfs.  Note that the root file system (the one
-	  containing the directory /) cannot be compiled as a module.
-
-	  If unsure, say N.
-
-config SQUASHFS_EMBEDDED
-
-	bool "Additional option for memory-constrained systems" 
-	depends on SQUASHFS
-	default n
-	help
-	  Saying Y here allows you to specify cache size.
-
-	  If unsure, say N.
-
-config SQUASHFS_FRAGMENT_CACHE_SIZE
-	int "Number of fragments cached" if SQUASHFS_EMBEDDED
-	depends on SQUASHFS
-	default "3"
-	help
-	  By default SquashFS caches the last 3 fragments read from
-	  the filesystem.  Increasing this amount may mean SquashFS
-	  has to re-read fragments less often from disk, at the expense
-	  of extra system memory.  Decreasing this amount will mean
-	  SquashFS uses less memory at the expense of extra reads from disk.
-
-	  Note there must be at least one cached fragment.  Anything
-	  much more than three will probably not make much difference.
+source "fs/squashfs/Kconfig"
 
 config VXFS_FS
 	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
new file mode 100644
index 00000000000..25a00d19d68
--- /dev/null
+++ b/fs/squashfs/Kconfig
@@ -0,0 +1,51 @@
+config SQUASHFS
+	tristate "SquashFS 4.0 - Squashed file system support"
+	depends on BLOCK
+	select ZLIB_INFLATE
+	help
+	  Saying Y here includes support for SquashFS 4.0 (a Compressed
+	  Read-Only File System).  Squashfs is a highly compressed read-only
+	  filesystem for Linux.  It uses zlib compression to compress
+	  files, inodes and directories.  Inodes in the system are very small
+	  and all blocks are packed to minimise data overhead. Block sizes
+	  greater than 4K are supported up to a maximum of 1 Mbytes (default
+	  block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
+	  (larger than 4GB), full uid/gid information, hard links and
+	  timestamps.  
+
+	  Squashfs is intended for general read-only filesystem use, for
+	  archival use (i.e. in cases where a .tar.gz file may be used), and in
+	  embedded systems where low overhead is needed.  Further information
+	  and tools are available from http://squashfs.sourceforge.net.
+
+	  If you want to compile this as a module ( = code which can be
+	  inserted in and removed from the running kernel whenever you want),
+	  say M here and read <file:Documentation/modules.txt>.  The module
+	  will be called squashfs.  Note that the root file system (the one
+	  containing the directory /) cannot be compiled as a module.
+
+	  If unsure, say N.
+
+config SQUASHFS_EMBEDDED
+
+	bool "Additional option for memory-constrained systems" 
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here allows you to specify cache size.
+
+	  If unsure, say N.
+
+config SQUASHFS_FRAGMENT_CACHE_SIZE
+	int "Number of fragments cached" if SQUASHFS_EMBEDDED
+	depends on SQUASHFS
+	default "3"
+	help
+	  By default SquashFS caches the last 3 fragments read from
+	  the filesystem.  Increasing this amount may mean SquashFS
+	  has to re-read fragments less often from disk, at the expense
+	  of extra system memory.  Decreasing this amount will mean
+	  SquashFS uses less memory at the expense of extra reads from disk.
+
+	  Note there must be at least one cached fragment.  Anything
+	  much more than three will probably not make much difference.
-- 
cgit v1.2.3


From 22135169ddc536b1f7d7f070c7980fe4bcdaa20b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:58:51 +0300
Subject: fs/Kconfig: move vxfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig          | 18 +-----------------
 fs/freevxfs/Kconfig | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)
 create mode 100644 fs/freevxfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d44a698463c..58ab4df5644 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -214,23 +214,7 @@ source "fs/jffs2/Kconfig"
 source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
-
-config VXFS_FS
-	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
-	depends on BLOCK
-	help
-	  FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
-	  file system format.  VERITAS VxFS(TM) is the standard file system
-	  of SCO UnixWare (and possibly others) and optionally available
-	  for Sunsoft Solaris, HP-UX and many other operating systems.
-	  Currently only readonly access is supported.
-
-	  NOTE: the file system type as used by mount(1), mount(2) and
-	  fstab(5) is 'vxfs' as it describes the file system format, not
-	  the actual driver.
-
-	  To compile this as a module, choose M here: the module will be
-	  called freevxfs.  If unsure, say N.
+source "fs/freevxfs/Kconfig"
 
 config MINIX_FS
 	tristate "Minix file system support"
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
new file mode 100644
index 00000000000..8dc1cd5c1ef
--- /dev/null
+++ b/fs/freevxfs/Kconfig
@@ -0,0 +1,16 @@
+config VXFS_FS
+	tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+	depends on BLOCK
+	help
+	  FreeVxFS is a file system driver that supports the VERITAS VxFS(TM)
+	  file system format.  VERITAS VxFS(TM) is the standard file system
+	  of SCO UnixWare (and possibly others) and optionally available
+	  for Sunsoft Solaris, HP-UX and many other operating systems.
+	  Currently only readonly access is supported.
+
+	  NOTE: the file system type as used by mount(1), mount(2) and
+	  fstab(5) is 'vxfs' as it describes the file system format, not
+	  the actual driver.
+
+	  To compile this as a module, choose M here: the module will be
+	  called freevxfs.  If unsure, say N.
-- 
cgit v1.2.3


From 8b1cd7d3c5daaed6c4dec3697c1fc113eb817df0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 10:59:49 +0300
Subject: fs/Kconfig: move minix out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 19 +------------------
 fs/minix/Kconfig | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 18 deletions(-)
 create mode 100644 fs/minix/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 58ab4df5644..3323379fdb3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -215,24 +215,7 @@ source "fs/ubifs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
-
-config MINIX_FS
-	tristate "Minix file system support"
-	depends on BLOCK
-	help
-	  Minix is a simple operating system used in many classes about OS's.
-	  The minix file system (method to organize files on a hard disk
-	  partition or a floppy disk) was the original file system for Linux,
-	  but has been superseded by the second extended file system ext2fs.
-	  You don't want to use the minix file system on your hard disk
-	  because of certain built-in restrictions, but it is sometimes found
-	  on older Linux floppy disks.  This option will enlarge your kernel
-	  by about 28 KB. If unsure, say N.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called minix.  Note that the file system of your root
-	  partition (the one containing the directory /) cannot be compiled as
-	  a module.
+source "fs/minix/Kconfig"
 
 config OMFS_FS
 	tristate "SonicBlue Optimized MPEG File System support"
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
new file mode 100644
index 00000000000..0fd7ca99426
--- /dev/null
+++ b/fs/minix/Kconfig
@@ -0,0 +1,17 @@
+config MINIX_FS
+	tristate "Minix file system support"
+	depends on BLOCK
+	help
+	  Minix is a simple operating system used in many classes about OS's.
+	  The minix file system (method to organize files on a hard disk
+	  partition or a floppy disk) was the original file system for Linux,
+	  but has been superseded by the second extended file system ext2fs.
+	  You don't want to use the minix file system on your hard disk
+	  because of certain built-in restrictions, but it is sometimes found
+	  on older Linux floppy disks.  This option will enlarge your kernel
+	  by about 28 KB. If unsure, say N.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called minix.  Note that the file system of your root
+	  partition (the one containing the directory /) cannot be compiled as
+	  a module.
-- 
cgit v1.2.3


From da55e6f92830df9bba7c87438344479c60d44fdb Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:00:41 +0300
Subject: fs/Kconfig: move omfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 15 +--------------
 fs/omfs/Kconfig | 13 +++++++++++++
 2 files changed, 14 insertions(+), 14 deletions(-)
 create mode 100644 fs/omfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3323379fdb3..da5e8f956a8 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -216,20 +216,7 @@ source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
-
-config OMFS_FS
-	tristate "SonicBlue Optimized MPEG File System support"
-	depends on BLOCK
-	select CRC_ITU_T
-	help
-	  This is the proprietary file system used by the Rio Karma music
-	  player and ReplayTV DVR.  Despite the name, this filesystem is not
-	  more efficient than a standard FS for MPEG files, in fact likely
-	  the opposite is true.  Say Y if you have either of these devices
-	  and wish to mount its disk.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called omfs.  If unsure, say N.
+source "fs/omfs/Kconfig"
 
 config HPFS_FS
 	tristate "OS/2 HPFS file system support"
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
new file mode 100644
index 00000000000..b1b9a0aba6f
--- /dev/null
+++ b/fs/omfs/Kconfig
@@ -0,0 +1,13 @@
+config OMFS_FS
+	tristate "SonicBlue Optimized MPEG File System support"
+	depends on BLOCK
+	select CRC_ITU_T
+	help
+	  This is the proprietary file system used by the Rio Karma music
+	  player and ReplayTV DVR.  Despite the name, this filesystem is not
+	  more efficient than a standard FS for MPEG files, in fact likely
+	  the opposite is true.  Say Y if you have either of these devices
+	  and wish to mount its disk.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called omfs.  If unsure, say N.
-- 
cgit v1.2.3


From 928ea192959f188e6a4de95b293e3973887917b5 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:01:26 +0300
Subject: fs/Kconfig: move hpfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 17 +----------------
 fs/hpfs/Kconfig | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 16 deletions(-)
 create mode 100644 fs/hpfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index da5e8f956a8..9bead7c680d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -217,22 +217,7 @@ source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
-
-config HPFS_FS
-	tristate "OS/2 HPFS file system support"
-	depends on BLOCK
-	help
-	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
-	  is the file system used for organizing files on OS/2 hard disk
-	  partitions. Say Y if you want to be able to read files from and
-	  write files to an OS/2 HPFS partition on your hard drive. OS/2
-	  floppies however are in regular MSDOS format, so you don't need this
-	  option in order to be able to read them. Read
-	  <file:Documentation/filesystems/hpfs.txt>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called hpfs.  If unsure, say N.
-
+source "fs/hpfs/Kconfig"
 
 config QNX4FS_FS
 	tristate "QNX4 file system support (read only)"
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
new file mode 100644
index 00000000000..56bd15c5bf6
--- /dev/null
+++ b/fs/hpfs/Kconfig
@@ -0,0 +1,14 @@
+config HPFS_FS
+	tristate "OS/2 HPFS file system support"
+	depends on BLOCK
+	help
+	  OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
+	  is the file system used for organizing files on OS/2 hard disk
+	  partitions. Say Y if you want to be able to read files from and
+	  write files to an OS/2 HPFS partition on your hard drive. OS/2
+	  floppies however are in regular MSDOS format, so you don't need this
+	  option in order to be able to read them. Read
+	  <file:Documentation/filesystems/hpfs.txt>.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called hpfs.  If unsure, say N.
-- 
cgit v1.2.3


From 4c7415830c7ab465ff54ca7ffc20bfb1b59906c3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:02:21 +0300
Subject: fs/Kconfig: move qnx4 out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 27 +--------------------------
 fs/qnx4/Kconfig | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 26 deletions(-)
 create mode 100644 fs/qnx4/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9bead7c680d..b348d2e8cc6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,32 +218,7 @@ source "fs/freevxfs/Kconfig"
 source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
-
-config QNX4FS_FS
-	tristate "QNX4 file system support (read only)"
-	depends on BLOCK
-	help
-	  This is the file system used by the real-time operating systems
-	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
-	  Further information is available at <http://www.qnx.com/>.
-	  Say Y if you intend to mount QNX hard disks or floppies.
-	  Unless you say Y to "QNX4FS read-write support" below, you will
-	  only be able to read these file systems.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called qnx4.
-
-	  If you don't know whether you need it, then you don't need it:
-	  answer N.
-
-config QNX4FS_RW
-	bool "QNX4FS write support (DANGEROUS)"
-	depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
-	help
-	  Say Y if you want to test write support for QNX4 file systems.
-
-	  It's currently broken, so for now:
-	  answer N.
+source "fs/qnx4/Kconfig"
 
 config ROMFS_FS
 	tristate "ROM file system support"
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
new file mode 100644
index 00000000000..be8e0e1445b
--- /dev/null
+++ b/fs/qnx4/Kconfig
@@ -0,0 +1,25 @@
+config QNX4FS_FS
+	tristate "QNX4 file system support (read only)"
+	depends on BLOCK
+	help
+	  This is the file system used by the real-time operating systems
+	  QNX 4 and QNX 6 (the latter is also called QNX RTP).
+	  Further information is available at <http://www.qnx.com/>.
+	  Say Y if you intend to mount QNX hard disks or floppies.
+	  Unless you say Y to "QNX4FS write support" below, you will
+	  only be able to read these file systems.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called qnx4.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
+
+config QNX4FS_RW
+	bool "QNX4FS write support (DANGEROUS)"
+	depends on QNX4FS_FS && EXPERIMENTAL && BROKEN
+	help
+	  Say Y if you want to test write support for QNX4 file systems.
+
+	  It's currently broken, so for now:
+	  answer N.
-- 
cgit v1.2.3


From 41810246df2e65c66dc1f0da79b282a95b664fc7 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:03:34 +0300
Subject: fs/Kconfig: move romfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 19 +------------------
 fs/romfs/Kconfig | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 18 deletions(-)
 create mode 100644 fs/romfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index b348d2e8cc6..d8672ccdc69 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -219,24 +219,7 @@ source "fs/minix/Kconfig"
 source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
-
-config ROMFS_FS
-	tristate "ROM file system support"
-	depends on BLOCK
-	---help---
-	  This is a very small read-only file system mainly intended for
-	  initial ram disks of installation disks, but it could be used for
-	  other read-only media as well.  Read
-	  <file:Documentation/filesystems/romfs.txt> for details.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called romfs.  Note that the file system of your
-	  root partition (the one containing the directory /) cannot be a
-	  module.
-
-	  If you don't know whether you need it, then you don't need it:
-	  answer N.
-
+source "fs/romfs/Kconfig"
 
 config SYSV_FS
 	tristate "System V/Xenix/V7/Coherent file system support"
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
new file mode 100644
index 00000000000..1a17020f9fa
--- /dev/null
+++ b/fs/romfs/Kconfig
@@ -0,0 +1,16 @@
+config ROMFS_FS
+	tristate "ROM file system support"
+	depends on BLOCK
+	---help---
+	  This is a very small read-only file system mainly intended for
+	  initial ram disks of installation disks, but it could be used for
+	  other read-only media as well.  Read
+	  <file:Documentation/filesystems/romfs.txt> for details.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called romfs.  Note that the file system of your
+	  root partition (the one containing the directory /) cannot be a
+	  module.
+
+	  If you don't know whether you need it, then you don't need it:
+	  answer N.
-- 
cgit v1.2.3


From 8af915ba1d1eae1f9f31fa8c5db8040492dc4785 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:04:23 +0300
Subject: fs/Kconfig: move sysv out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 39 +--------------------------------------
 fs/sysv/Kconfig | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 38 deletions(-)
 create mode 100644 fs/sysv/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index d8672ccdc69..e1cdb831064 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,44 +220,7 @@ source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
-
-config SYSV_FS
-	tristate "System V/Xenix/V7/Coherent file system support"
-	depends on BLOCK
-	help
-	  SCO, Xenix and Coherent are commercial Unix systems for Intel
-	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
-	  here would allow you to read from their floppies and hard disk
-	  partitions.
-
-	  If you have floppies or hard disk partitions like that, it is likely
-	  that they contain binaries from those other Unix systems; in order
-	  to run these binaries, you will want to install linux-abi which is
-	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
-	  UnixWare, Dell Unix and System V programs under Linux.  It is
-	  available via FTP (user: ftp) from
-	  <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
-	  NOTE: that will work only for binaries from Intel-based systems;
-	  PDP ones will have to wait until somebody ports Linux to -11 ;-)
-
-	  If you only intend to mount files from some other Unix over the
-	  network using NFS, you don't need the System V file system support
-	  (but you need NFS file system support obviously).
-
-	  Note that this option is generally not needed for floppies, since a
-	  good portable way to transport files and directories between unixes
-	  (and even other operating systems) is given by the tar program ("man
-	  tar" or preferably "info tar").  Note also that this option has
-	  nothing whatsoever to do with the option "System V IPC". Read about
-	  the System V file system in
-	  <file:Documentation/filesystems/sysv-fs.txt>.
-	  Saying Y here will enlarge your kernel by about 27 KB.
-
-	  To compile this as a module, choose M here: the module will be called
-	  sysv.
-
-	  If you haven't heard about all of this before, it's safe to say N.
-
+source "fs/sysv/Kconfig"
 
 config UFS_FS
 	tristate "UFS file system support (read only)"
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
new file mode 100644
index 00000000000..33aeb4b75db
--- /dev/null
+++ b/fs/sysv/Kconfig
@@ -0,0 +1,36 @@
+config SYSV_FS
+	tristate "System V/Xenix/V7/Coherent file system support"
+	depends on BLOCK
+	help
+	  SCO, Xenix and Coherent are commercial Unix systems for Intel
+	  machines, and Version 7 was used on the DEC PDP-11. Saying Y
+	  here would allow you to read from their floppies and hard disk
+	  partitions.
+
+	  If you have floppies or hard disk partitions like that, it is likely
+	  that they contain binaries from those other Unix systems; in order
+	  to run these binaries, you will want to install linux-abi which is
+	  a set of kernel modules that lets you run SCO, Xenix, Wyse,
+	  UnixWare, Dell Unix and System V programs under Linux.  It is
+	  available via FTP (user: ftp) from
+	  <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>.
+	  NOTE: that will work only for binaries from Intel-based systems;
+	  PDP ones will have to wait until somebody ports Linux to -11 ;-)
+
+	  If you only intend to mount files from some other Unix over the
+	  network using NFS, you don't need the System V file system support
+	  (but you need NFS file system support obviously).
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").  Note also that this option has
+	  nothing whatsoever to do with the option "System V IPC". Read about
+	  the System V file system in
+	  <file:Documentation/filesystems/sysv-fs.txt>.
+	  Saying Y here will enlarge your kernel by about 27 KB.
+
+	  To compile this as a module, choose M here: the module will be called
+	  sysv.
+
+	  If you haven't heard about all of this before, it's safe to say N.
-- 
cgit v1.2.3


From a276a52f9f1b1059bddade71df18ceb6481534a6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:05:02 +0300
Subject: fs/Kconfig: move ufs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 45 +--------------------------------------------
 fs/ufs/Kconfig | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 44 deletions(-)
 create mode 100644 fs/ufs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index e1cdb831064..35941e8a17c 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -221,50 +221,7 @@ source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
-
-config UFS_FS
-	tristate "UFS file system support (read only)"
-	depends on BLOCK
-	help
-	  BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
-	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
-	  Unixes can create and mount hard disk partitions and diskettes using
-	  this file system as well. Saying Y here will allow you to read from
-	  these partitions; if you also want to write to them, say Y to the
-	  experimental "UFS file system write support", below. Please read the
-	  file <file:Documentation/filesystems/ufs.txt> for more information.
-
-          The recently released UFS2 variant (used in FreeBSD 5.x) is
-          READ-ONLY supported.
-
-	  Note that this option is generally not needed for floppies, since a
-	  good portable way to transport files and directories between unixes
-	  (and even other operating systems) is given by the tar program ("man
-	  tar" or preferably "info tar").
-
-	  When accessing NeXTstep files, you may need to convert them from the
-	  NeXT character set to the Latin1 character set; use the program
-	  recode ("info recode") for this purpose.
-
-	  To compile the UFS file system support as a module, choose M here: the
-	  module will be called ufs.
-
-	  If you haven't heard about all of this before, it's safe to say N.
-
-config UFS_FS_WRITE
-	bool "UFS file system write support (DANGEROUS)"
-	depends on UFS_FS && EXPERIMENTAL
-	help
-	  Say Y here if you want to try writing to UFS partitions. This is
-	  experimental, so you should back up your UFS partitions beforehand.
-
-config UFS_DEBUG
-	bool "UFS debugging"
-	depends on UFS_FS
-	help
-	  If you are experiencing any problems with the UFS filesystem, say
-	  Y here.  This will result in _many_ additional debugging messages to be
-	  written to the system log.
+source "fs/ufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
new file mode 100644
index 00000000000..e4f10a40768
--- /dev/null
+++ b/fs/ufs/Kconfig
@@ -0,0 +1,43 @@
+config UFS_FS
+	tristate "UFS file system support (read only)"
+	depends on BLOCK
+	help
+	  BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
+	  OpenBSD and NeXTstep) use a file system called UFS. Some System V
+	  Unixes can create and mount hard disk partitions and diskettes using
+	  this file system as well. Saying Y here will allow you to read from
+	  these partitions; if you also want to write to them, say Y to the
+	  experimental "UFS file system write support", below. Please read the
+	  file <file:Documentation/filesystems/ufs.txt> for more information.
+
+	  The recently released UFS2 variant (used in FreeBSD 5.x) is
+	  READ-ONLY supported.
+
+	  Note that this option is generally not needed for floppies, since a
+	  good portable way to transport files and directories between unixes
+	  (and even other operating systems) is given by the tar program ("man
+	  tar" or preferably "info tar").
+
+	  When accessing NeXTstep files, you may need to convert them from the
+	  NeXT character set to the Latin1 character set; use the program
+	  recode ("info recode") for this purpose.
+
+	  To compile the UFS file system support as a module, choose M here: the
+	  module will be called ufs.
+
+	  If you haven't heard about all of this before, it's safe to say N.
+
+config UFS_FS_WRITE
+	bool "UFS file system write support (DANGEROUS)"
+	depends on UFS_FS && EXPERIMENTAL
+	help
+	  Say Y here if you want to try writing to UFS partitions. This is
+	  experimental, so you should back up your UFS partitions beforehand.
+
+config UFS_DEBUG
+	bool "UFS debugging"
+	depends on UFS_FS
+	help
+	  If you are experiencing any problems with the UFS filesystem, say
+	  Y here.  This will result in _many_ additional debugging messages to be
+	  written to the system log.
-- 
cgit v1.2.3


From 97afe47ac378615d727fc2f0ffa1b58e9837f438 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:07:41 +0300
Subject: fs/Kconfig: move nfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 87 +---------------------------------------------------------
 fs/nfs/Kconfig | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 86 deletions(-)
 create mode 100644 fs/nfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 35941e8a17c..f07c72b7666 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -241,92 +241,7 @@ menuconfig NETWORK_FILESYSTEMS
 
 if NETWORK_FILESYSTEMS
 
-config NFS_FS
-	tristate "NFS client support"
-	depends on INET
-	select LOCKD
-	select SUNRPC
-	select NFS_ACL_SUPPORT if NFS_V3_ACL
-	help
-	  Choose Y here if you want to access files residing on other
-	  computers using Sun's Network File System protocol.  To compile
-	  this file system support as a module, choose M here: the module
-	  will be called nfs.
-
-	  To mount file systems exported by NFS servers, you also need to
-	  install the user space mount.nfs command which can be found in
-	  the Linux nfs-utils package, available from http://linux-nfs.org/.
-	  Information about using the mount command is available in the
-	  mount(8) man page.  More detail about the Linux NFS client
-	  implementation is available via the nfs(5) man page.
-
-	  Below you can choose which versions of the NFS protocol are
-	  available in the kernel to mount NFS servers.  Support for NFS
-	  version 2 (RFC 1094) is always available when NFS_FS is selected.
-
-	  To configure a system which mounts its root file system via NFS
-	  at boot time, say Y here, select "Kernel level IP
-	  autoconfiguration" in the NETWORK menu, and select "Root file
-	  system on NFS" below.  You cannot compile this file system as a
-	  module in this case.
-
-	  If unsure, say N.
-
-config NFS_V3
-	bool "NFS client support for NFS version 3"
-	depends on NFS_FS
-	help
-	  This option enables support for version 3 of the NFS protocol
-	  (RFC 1813) in the kernel's NFS client.
-
-	  If unsure, say Y.
-
-config NFS_V3_ACL
-	bool "NFS client support for the NFSv3 ACL protocol extension"
-	depends on NFS_V3
-	help
-	  Some NFS servers support an auxiliary NFSv3 ACL protocol that
-	  Sun added to Solaris but never became an official part of the
-	  NFS version 3 protocol.  This protocol extension allows
-	  applications on NFS clients to manipulate POSIX Access Control
-	  Lists on files residing on NFS servers.  NFS servers enforce
-	  ACLs on local files whether this protocol is available or not.
-
-	  Choose Y here if your NFS server supports the Solaris NFSv3 ACL
-	  protocol extension and you want your NFS client to allow
-	  applications to access and modify ACLs on files on the server.
-
-	  Most NFS servers don't support the Solaris NFSv3 ACL protocol
-	  extension.  You can choose N here or specify the "noacl" mount
-	  option to prevent your NFS client from trying to use the NFSv3
-	  ACL protocol.
-
-	  If unsure, say N.
-
-config NFS_V4
-	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFS_FS && EXPERIMENTAL
-	select RPCSEC_GSS_KRB5
-	help
-	  This option enables support for version 4 of the NFS protocol
-	  (RFC 3530) in the kernel's NFS client.
-
-	  To mount NFS servers using NFSv4, you also need to install user
-	  space programs which can be found in the Linux nfs-utils package,
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
-
-config ROOT_NFS
-	bool "Root file system on NFS"
-	depends on NFS_FS=y && IP_PNP
-	help
-	  If you want your system to mount its root file system via NFS,
-	  choose Y here.  This is common practice for managing systems
-	  without local permanent storage.  For details, read
-	  <file:Documentation/filesystems/nfsroot.txt>.
-
-	  Most people say N here.
+source "fs/nfs/Kconfig"
 
 config NFSD
 	tristate "NFS server support"
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
new file mode 100644
index 00000000000..36fe20d6eba
--- /dev/null
+++ b/fs/nfs/Kconfig
@@ -0,0 +1,86 @@
+config NFS_FS
+	tristate "NFS client support"
+	depends on INET
+	select LOCKD
+	select SUNRPC
+	select NFS_ACL_SUPPORT if NFS_V3_ACL
+	help
+	  Choose Y here if you want to access files residing on other
+	  computers using Sun's Network File System protocol.  To compile
+	  this file system support as a module, choose M here: the module
+	  will be called nfs.
+
+	  To mount file systems exported by NFS servers, you also need to
+	  install the user space mount.nfs command which can be found in
+	  the Linux nfs-utils package, available from http://linux-nfs.org/.
+	  Information about using the mount command is available in the
+	  mount(8) man page.  More detail about the Linux NFS client
+	  implementation is available via the nfs(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available in the kernel to mount NFS servers.  Support for NFS
+	  version 2 (RFC 1094) is always available when NFS_FS is selected.
+
+	  To configure a system which mounts its root file system via NFS
+	  at boot time, say Y here, select "Kernel level IP
+	  autoconfiguration" in the NETWORK menu, and select "Root file
+	  system on NFS" below.  You cannot compile this file system as a
+	  module in this case.
+
+	  If unsure, say N.
+
+config NFS_V3
+	bool "NFS client support for NFS version 3"
+	depends on NFS_FS
+	help
+	  This option enables support for version 3 of the NFS protocol
+	  (RFC 1813) in the kernel's NFS client.
+
+	  If unsure, say Y.
+
+config NFS_V3_ACL
+	bool "NFS client support for the NFSv3 ACL protocol extension"
+	depends on NFS_V3
+	help
+	  Some NFS servers support an auxiliary NFSv3 ACL protocol that
+	  Sun added to Solaris but never became an official part of the
+	  NFS version 3 protocol.  This protocol extension allows
+	  applications on NFS clients to manipulate POSIX Access Control
+	  Lists on files residing on NFS servers.  NFS servers enforce
+	  ACLs on local files whether this protocol is available or not.
+
+	  Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+	  protocol extension and you want your NFS client to allow
+	  applications to access and modify ACLs on files on the server.
+
+	  Most NFS servers don't support the Solaris NFSv3 ACL protocol
+	  extension.  You can choose N here or specify the "noacl" mount
+	  option to prevent your NFS client from trying to use the NFSv3
+	  ACL protocol.
+
+	  If unsure, say N.
+
+config NFS_V4
+	bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
+	depends on NFS_FS && EXPERIMENTAL
+	select RPCSEC_GSS_KRB5
+	help
+	  This option enables support for version 4 of the NFS protocol
+	  (RFC 3530) in the kernel's NFS client.
+
+	  To mount NFS servers using NFSv4, you also need to install user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say N.
+
+config ROOT_NFS
+	bool "Root file system on NFS"
+	depends on NFS_FS=y && IP_PNP
+	help
+	  If you want your system to mount its root file system via NFS,
+	  choose Y here.  This is common practice for managing systems
+	  without local permanent storage.  For details, read
+	  <file:Documentation/filesystems/nfsroot.txt>.
+
+	  Most people say N here.
-- 
cgit v1.2.3


From e2b329e2002685c1b0fa3c06caadc0936b7f507f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:08:58 +0300
Subject: fs/Kconfig: move nfsd out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 82 +--------------------------------------------------------
 fs/nfsd/Kconfig | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 81 deletions(-)
 create mode 100644 fs/nfsd/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f07c72b7666..acceb6e62bf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -242,87 +242,7 @@ menuconfig NETWORK_FILESYSTEMS
 if NETWORK_FILESYSTEMS
 
 source "fs/nfs/Kconfig"
-
-config NFSD
-	tristate "NFS server support"
-	depends on INET
-	select LOCKD
-	select SUNRPC
-	select EXPORTFS
-	select NFS_ACL_SUPPORT if NFSD_V2_ACL
-	help
-	  Choose Y here if you want to allow other computers to access
-	  files residing on this system using Sun's Network File System
-	  protocol.  To compile the NFS server support as a module,
-	  choose M here: the module will be called nfsd.
-
-	  You may choose to use a user-space NFS server instead, in which
-	  case you can choose N here.
-
-	  To export local file systems using NFS, you also need to install
-	  user space programs which can be found in the Linux nfs-utils
-	  package, available from http://linux-nfs.org/.  More detail about
-	  the Linux NFS server implementation is available via the
-	  exports(5) man page.
-
-	  Below you can choose which versions of the NFS protocol are
-	  available to clients mounting the NFS server on this system.
-	  Support for NFS version 2 (RFC 1094) is always available when
-	  CONFIG_NFSD is selected.
-
-	  If unsure, say N.
-
-config NFSD_V2_ACL
-	bool
-	depends on NFSD
-
-config NFSD_V3
-	bool "NFS server support for NFS version 3"
-	depends on NFSD
-	help
-	  This option enables support in your system's NFS server for
-	  version 3 of the NFS protocol (RFC 1813).
-
-	  If unsure, say Y.
-
-config NFSD_V3_ACL
-	bool "NFS server support for the NFSv3 ACL protocol extension"
-	depends on NFSD_V3
-	select NFSD_V2_ACL
-	help
-	  Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
-	  never became an official part of the NFS version 3 protocol.
-	  This protocol extension allows applications on NFS clients to
-	  manipulate POSIX Access Control Lists on files residing on NFS
-	  servers.  NFS servers enforce POSIX ACLs on local files whether
-	  this protocol is available or not.
-
-	  This option enables support in your system's NFS server for the
-	  NFSv3 ACL protocol extension allowing NFS clients to manipulate
-	  POSIX ACLs on files exported by your system's NFS server.  NFS
-	  clients which support the Solaris NFSv3 ACL protocol can then
-	  access and modify ACLs on your NFS server.
-
-	  To store ACLs on your NFS server, you also need to enable ACL-
-	  related CONFIG options for your local file systems of choice.
-
-	  If unsure, say N.
-
-config NFSD_V4
-	bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
-	depends on NFSD && PROC_FS && EXPERIMENTAL
-	select NFSD_V3
-	select FS_POSIX_ACL
-	select RPCSEC_GSS_KRB5
-	help
-	  This option enables support in your system's NFS server for
-	  version 4 of the NFS protocol (RFC 3530).
-
-	  To export files using NFSv4, you need to install additional user
-	  space programs which can be found in the Linux nfs-utils package,
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
+source "fs/nfsd/Kconfig"
 
 config LOCKD
 	tristate
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
new file mode 100644
index 00000000000..44d7d04dab9
--- /dev/null
+++ b/fs/nfsd/Kconfig
@@ -0,0 +1,80 @@
+config NFSD
+	tristate "NFS server support"
+	depends on INET
+	select LOCKD
+	select SUNRPC
+	select EXPORTFS
+	select NFS_ACL_SUPPORT if NFSD_V2_ACL
+	help
+	  Choose Y here if you want to allow other computers to access
+	  files residing on this system using Sun's Network File System
+	  protocol.  To compile the NFS server support as a module,
+	  choose M here: the module will be called nfsd.
+
+	  You may choose to use a user-space NFS server instead, in which
+	  case you can choose N here.
+
+	  To export local file systems using NFS, you also need to install
+	  user space programs which can be found in the Linux nfs-utils
+	  package, available from http://linux-nfs.org/.  More detail about
+	  the Linux NFS server implementation is available via the
+	  exports(5) man page.
+
+	  Below you can choose which versions of the NFS protocol are
+	  available to clients mounting the NFS server on this system.
+	  Support for NFS version 2 (RFC 1094) is always available when
+	  CONFIG_NFSD is selected.
+
+	  If unsure, say N.
+
+config NFSD_V2_ACL
+	bool
+	depends on NFSD
+
+config NFSD_V3
+	bool "NFS server support for NFS version 3"
+	depends on NFSD
+	help
+	  This option enables support in your system's NFS server for
+	  version 3 of the NFS protocol (RFC 1813).
+
+	  If unsure, say Y.
+
+config NFSD_V3_ACL
+	bool "NFS server support for the NFSv3 ACL protocol extension"
+	depends on NFSD_V3
+	select NFSD_V2_ACL
+	help
+	  Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+	  never became an official part of the NFS version 3 protocol.
+	  This protocol extension allows applications on NFS clients to
+	  manipulate POSIX Access Control Lists on files residing on NFS
+	  servers.  NFS servers enforce POSIX ACLs on local files whether
+	  this protocol is available or not.
+
+	  This option enables support in your system's NFS server for the
+	  NFSv3 ACL protocol extension allowing NFS clients to manipulate
+	  POSIX ACLs on files exported by your system's NFS server.  NFS
+	  clients which support the Solaris NFSv3 ACL protocol can then
+	  access and modify ACLs on your NFS server.
+
+	  To store ACLs on your NFS server, you also need to enable ACL-
+	  related CONFIG options for your local file systems of choice.
+
+	  If unsure, say N.
+
+config NFSD_V4
+	bool "NFS server support for NFS version 4 (EXPERIMENTAL)"
+	depends on NFSD && PROC_FS && EXPERIMENTAL
+	select NFSD_V3
+	select FS_POSIX_ACL
+	select RPCSEC_GSS_KRB5
+	help
+	  This option enables support in your system's NFS server for
+	  version 4 of the NFS protocol (RFC 3530).
+
+	  To export files using NFSv4, you need to install additional user
+	  space programs which can be found in the Linux nfs-utils package,
+	  available from http://linux-nfs.org/.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 9098c24f35f7da6c89a83420acf21e3d7b35151d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:11:56 +0300
Subject: fs/Kconfig: move sunrpc out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig | 80 +-------------------------------------------------------------
 1 file changed, 1 insertion(+), 79 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index acceb6e62bf..1d7c0f6fade 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -264,85 +264,7 @@ config NFS_COMMON
 	depends on NFSD || NFS_FS
 	default y
 
-config SUNRPC
-	tristate
-
-config SUNRPC_GSS
-	tristate
-
-config SUNRPC_XPRT_RDMA
-	tristate
-	depends on SUNRPC && INFINIBAND && EXPERIMENTAL
-	default SUNRPC && INFINIBAND
-	help
-	  This option enables an RPC client transport capability that
-	  allows the NFS client to mount servers via an RDMA-enabled
-	  transport.
-
-	  To compile RPC client RDMA transport support as a module,
-	  choose M here: the module will be called xprtrdma.
-
-	  If unsure, say N.
-
-config SUNRPC_REGISTER_V4
-	bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	default n
-	help
-	  Sun added support for registering RPC services at an IPv6
-	  address by creating two new versions of the rpcbind protocol
-	  (RFC 1833).
-
-	  This option enables support in the kernel RPC server for
-	  registering kernel RPC services via version 4 of the rpcbind
-	  protocol.  If you enable this option, you must run a portmapper
-	  daemon that supports rpcbind protocol version 4.
-
-	  Serving NFS over IPv6 from knfsd (the kernel's NFS server)
-	  requires that you enable this option and use a portmapper that
-	  supports rpcbind version 4.
-
-	  If unsure, say N to get traditional behavior (register kernel
-	  RPC services using only rpcbind version 2).  Distributions
-	  using the legacy Linux portmapper daemon must say N here.
-
-config RPCSEC_GSS_KRB5
-	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	select SUNRPC_GSS
-	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_DES
-	select CRYPTO_CBC
-	help
-	  Choose Y here to enable Secure RPC using the Kerberos version 5
-	  GSS-API mechanism (RFC 1964).
-
-	  Secure RPC calls with Kerberos require an auxiliary user-space
-	  daemon which may be found in the Linux nfs-utils package
-	  available from http://linux-nfs.org/.  In addition, user-space
-	  Kerberos support should be installed.
-
-	  If unsure, say N.
-
-config RPCSEC_GSS_SPKM3
-	tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
-	select SUNRPC_GSS
-	select CRYPTO
-	select CRYPTO_MD5
-	select CRYPTO_DES
-	select CRYPTO_CAST5
-	select CRYPTO_CBC
-	help
-	  Choose Y here to enable Secure RPC using the SPKM3 public key
-	  GSS-API mechansim (RFC 2025).
-
-	  Secure RPC calls with SPKM3 require an auxiliary userspace
-	  daemon which may be found in the Linux nfs-utils package
-	  available from http://linux-nfs.org/.
-
-	  If unsure, say N.
+source "net/sunrpc/Kconfig"
 
 config SMB_FS
 	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-- 
cgit v1.2.3


From 213a41d404d5ed16528df5aa0ed215adcb1e9d66 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:13:16 +0300
Subject: fs/Kconfig: move smbfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 58 +-------------------------------------------------------
 fs/smbfs/Kconfig | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 57 deletions(-)
 create mode 100644 fs/smbfs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 1d7c0f6fade..c05ccea75c3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -265,63 +265,7 @@ config NFS_COMMON
 	default y
 
 source "net/sunrpc/Kconfig"
-
-config SMB_FS
-	tristate "SMB file system support (OBSOLETE, please use CIFS)"
-	depends on INET
-	select NLS
-	help
-	  SMB (Server Message Block) is the protocol Windows for Workgroups
-	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
-	  files and printers over local networks.  Saying Y here allows you to
-	  mount their file systems (often called "shares" in this context) and
-	  access them just like any other Unix directory.  Currently, this
-	  works only if the Windows machines use TCP/IP as the underlying
-	  transport protocol, and not NetBEUI.  For details, read
-	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
-	  available from <http://www.tldp.org/docs.html#howto>.
-
-	  Note: if you just want your box to act as an SMB *server* and make
-	  files and printing services available to Windows clients (which need
-	  to have a TCP/IP stack), you don't need to say Y here; you can use
-	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
-	  for that.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile the SMB support as a module, choose M here:
-	  the module will be called smbfs.  Most people say N, however.
-
-config SMB_NLS_DEFAULT
-	bool "Use a default NLS"
-	depends on SMB_FS
-	help
-	  Enabling this will make smbfs use nls translations by default. You
-	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
-	  settings and you need to give the default nls for the SMB server as
-	  CONFIG_SMB_NLS_REMOTE.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
-config SMB_NLS_REMOTE
-	string "Default Remote NLS Option"
-	depends on SMB_NLS_DEFAULT
-	default "cp437"
-	help
-	  This setting allows you to specify a default value for which
-	  codepage the server uses. If this field is left blank no
-	  translations will be done by default. The local codepage/charset
-	  default to CONFIG_NLS_DEFAULT.
-
-	  The nls settings can be changed at mount time, if your smbmount
-	  supports that, using the codepage and iocharset parameters.
-
-	  smbmount from samba 2.2.0 or later supports this.
-
+source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 
 config NCP_FS
diff --git a/fs/smbfs/Kconfig b/fs/smbfs/Kconfig
new file mode 100644
index 00000000000..e668127c8b2
--- /dev/null
+++ b/fs/smbfs/Kconfig
@@ -0,0 +1,55 @@
+config SMB_FS
+	tristate "SMB file system support (OBSOLETE, please use CIFS)"
+	depends on INET
+	select NLS
+	help
+	  SMB (Server Message Block) is the protocol Windows for Workgroups
+	  (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
+	  files and printers over local networks.  Saying Y here allows you to
+	  mount their file systems (often called "shares" in this context) and
+	  access them just like any other Unix directory.  Currently, this
+	  works only if the Windows machines use TCP/IP as the underlying
+	  transport protocol, and not NetBEUI.  For details, read
+	  <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
+	  available from <http://www.tldp.org/docs.html#howto>.
+
+	  Note: if you just want your box to act as an SMB *server* and make
+	  files and printing services available to Windows clients (which need
+	  to have a TCP/IP stack), you don't need to say Y here; you can use
+	  the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
+	  for that.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile the SMB support as a module, choose M here:
+	  the module will be called smbfs.  Most people say N, however.
+
+config SMB_NLS_DEFAULT
+	bool "Use a default NLS"
+	depends on SMB_FS
+	help
+	  Enabling this will make smbfs use nls translations by default. You
+	  need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
+	  settings and you need to give the default nls for the SMB server as
+	  CONFIG_SMB_NLS_REMOTE.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
+
+config SMB_NLS_REMOTE
+	string "Default Remote NLS Option"
+	depends on SMB_NLS_DEFAULT
+	default "cp437"
+	help
+	  This setting allows you to specify a default value for which
+	  codepage the server uses. If this field is left blank no
+	  translations will be done by default. The local codepage/charset
+	  default to CONFIG_NLS_DEFAULT.
+
+	  The nls settings can be changed at mount time, if your smbmount
+	  supports that, using the codepage and iocharset parameters.
+
+	  smbmount from samba 2.2.0 or later supports this.
-- 
cgit v1.2.3


From 9d7d6447ef455f4561f63bf6e8f6bef58b42a0a3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:14:15 +0300
Subject: fs/Kconfig: move the rest of ncpfs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig       | 22 ----------------------
 fs/ncpfs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index c05ccea75c3..86a4f1173fa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -267,28 +267,6 @@ config NFS_COMMON
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
-
-config NCP_FS
-	tristate "NCP file system support (to mount NetWare volumes)"
-	depends on IPX!=n || INET
-	help
-	  NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
-	  used by Novell NetWare clients to talk to file servers.  It is to
-	  IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
-	  to mount NetWare file server volumes and to access them just like
-	  any other Unix directory.  For details, please read the file
-	  <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
-	  the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
-
-	  You do not have to say Y here if you want your Linux box to act as a
-	  file *server* for Novell NetWare clients.
-
-	  General information about how to connect Linux, Windows machines and
-	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
-
-	  To compile this as a module, choose M here: the module will be called
-	  ncpfs.  Say N unless you are connected to a Novell network.
-
 source "fs/ncpfs/Kconfig"
 
 config CODA_FS
diff --git a/fs/ncpfs/Kconfig b/fs/ncpfs/Kconfig
index 142808427b2..c931cf22a1f 100644
--- a/fs/ncpfs/Kconfig
+++ b/fs/ncpfs/Kconfig
@@ -1,6 +1,27 @@
 #
 # NCP Filesystem configuration
 #
+config NCP_FS
+	tristate "NCP file system support (to mount NetWare volumes)"
+	depends on IPX!=n || INET
+	help
+	  NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
+	  used by Novell NetWare clients to talk to file servers.  It is to
+	  IPX what NFS is to TCP/IP, if that helps.  Saying Y here allows you
+	  to mount NetWare file server volumes and to access them just like
+	  any other Unix directory.  For details, please read the file
+	  <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
+	  the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
+
+	  You do not have to say Y here if you want your Linux box to act as a
+	  file *server* for Novell NetWare clients.
+
+	  General information about how to connect Linux, Windows machines and
+	  Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+	  To compile this as a module, choose M here: the module will be called
+	  ncpfs.  Say N unless you are connected to a Novell network.
+
 config NCPFS_PACKET_SIGNING
 	bool "Packet signatures"
 	depends on NCP_FS
-- 
cgit v1.2.3


From 33a1a6fedf08bbcb4b4df74498d697e7a88d39f2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:15:06 +0300
Subject: fs/Kconfig: move coda out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig      | 23 +----------------------
 fs/coda/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/coda/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 86a4f1173fa..f5cd88790b0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -268,28 +268,7 @@ source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
-
-config CODA_FS
-	tristate "Coda file system support (advanced network fs)"
-	depends on INET
-	help
-	  Coda is an advanced network file system, similar to NFS in that it
-	  enables you to mount file systems of a remote server and access them
-	  with regular Unix commands as if they were sitting on your hard
-	  disk.  Coda has several advantages over NFS: support for
-	  disconnected operation (e.g. for laptops), read/write server
-	  replication, security model for authentication and encryption,
-	  persistent client caches and write back caching.
-
-	  If you say Y here, your Linux box will be able to act as a Coda
-	  *client*.  You will need user level code as well, both for the
-	  client and server.  Servers are currently user level, i.e. they need
-	  no kernel support.  Please read
-	  <file:Documentation/filesystems/coda.txt> and check out the Coda
-	  home page <http://www.coda.cs.cmu.edu/>.
-
-	  To compile the coda client support as a module, choose M here: the
-	  module will be called coda.
+source "fs/coda/Kconfig"
 
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
diff --git a/fs/coda/Kconfig b/fs/coda/Kconfig
new file mode 100644
index 00000000000..c0e5a7fad06
--- /dev/null
+++ b/fs/coda/Kconfig
@@ -0,0 +1,21 @@
+config CODA_FS
+	tristate "Coda file system support (advanced network fs)"
+	depends on INET
+	help
+	  Coda is an advanced network file system, similar to NFS in that it
+	  enables you to mount file systems of a remote server and access them
+	  with regular Unix commands as if they were sitting on your hard
+	  disk.  Coda has several advantages over NFS: support for
+	  disconnected operation (e.g. for laptops), read/write server
+	  replication, security model for authentication and encryption,
+	  persistent client caches and write back caching.
+
+	  If you say Y here, your Linux box will be able to act as a Coda
+	  *client*.  You will need user level code as well, both for the
+	  client and server.  Servers are currently user level, i.e. they need
+	  no kernel support.  Please read
+	  <file:Documentation/filesystems/coda.txt> and check out the Coda
+	  home page <http://www.coda.cs.cmu.edu/>.
+
+	  To compile the coda client support as a module, choose M here: the
+	  module will be called coda.
-- 
cgit v1.2.3


From b2480c7fbfed172e6ec3ba1c8e80f05a3721b24a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:16:02 +0300
Subject: fs/Kconfig: move afs out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/Kconfig     | 23 +----------------------
 fs/afs/Kconfig | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 22 deletions(-)
 create mode 100644 fs/afs/Kconfig

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index f5cd88790b0..0563f9f1ab5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -269,28 +269,7 @@ source "fs/smbfs/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
-
-config AFS_FS
-	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
-	select AF_RXRPC
-	help
-	  If you say Y here, you will get an experimental Andrew File System
-	  driver. It currently only supports unsecured read-only AFS access.
-
-	  See <file:Documentation/filesystems/afs.txt> for more information.
-
-	  If unsure, say N.
-
-config AFS_DEBUG
-	bool "AFS dynamic debugging"
-	depends on AFS_FS
-	help
-	  Say Y here to make runtime controllable debugging messages appear.
-
-	  See <file:Documentation/filesystems/afs.txt> for more information.
-
-	  If unsure, say N.
+source "fs/afs/Kconfig"
 
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
new file mode 100644
index 00000000000..e7b522fe15e
--- /dev/null
+++ b/fs/afs/Kconfig
@@ -0,0 +1,21 @@
+config AFS_FS
+	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	select AF_RXRPC
+	help
+	  If you say Y here, you will get an experimental Andrew File System
+	  driver. It currently only supports unsecured read-only AFS access.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
+
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
-- 
cgit v1.2.3


From 0fcb44088970b18eaf2df4579d64840be6e3bf39 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 22 Jan 2009 11:16:42 +0300
Subject: fs/Kconfig: move 9p out

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/9p/Kconfig | 10 ++++++++++
 fs/Kconfig    | 12 +-----------
 2 files changed, 11 insertions(+), 11 deletions(-)
 create mode 100644 fs/9p/Kconfig

(limited to 'fs')

diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
new file mode 100644
index 00000000000..74e0723e90b
--- /dev/null
+++ b/fs/9p/Kconfig
@@ -0,0 +1,10 @@
+config 9P_FS
+	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
+	depends on INET && NET_9P && EXPERIMENTAL
+	help
+	  If you say Y here, you will get experimental support for
+	  Plan 9 resource sharing via the 9P2000 protocol.
+
+	  See <http://v9fs.sf.net> for more information.
+
+	  If unsure, say N.
diff --git a/fs/Kconfig b/fs/Kconfig
index 0563f9f1ab5..93945dd0b1a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -270,17 +270,7 @@ source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
 source "fs/afs/Kconfig"
-
-config 9P_FS
-	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
-	depends on INET && NET_9P && EXPERIMENTAL
-	help
-	  If you say Y here, you will get experimental support for
-	  Plan 9 resource sharing via the 9P2000 protocol.
-
-	  See <http://v9fs.sf.net> for more information.
-
-	  If unsure, say N.
+source "fs/9p/Kconfig"
 
 endif # NETWORK_FILESYSTEMS
 
-- 
cgit v1.2.3


From 82c1593cad3dfc97661764c8bc62aa1a416e9ea8 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 20 Jan 2009 16:46:02 +0200
Subject: UBIFS: simplify locking

This patch simplifies the lock_[23]_inodes functions. We do not have
to care about locking order, because VFS already orders the @i_mutex
locks and that is enough. Thanks to Al Viro for suggesting this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/dir.c | 92 +++++++++++++++++++++++-----------------------------------
 1 file changed, 36 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index d29b771cce4..f55d523c52b 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
 }
 
 /**
- * lock_2_inodes - lock two UBIFS inodes.
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1->i_ino < inode2->i_ino) {
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
-	} else {
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
-	}
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
 }
 
 /**
- * unlock_2_inodes - unlock two UBIFS inodes inodes.
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  */
 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
 		dentry->d_name.name, inode->i_ino, dir->i_ino);
-
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = check_dir_empty(c, dentry->d_inode);
 	if (err)
 		return err;
@@ -922,59 +926,30 @@ out_budg:
 }
 
 /**
- * lock_3_inodes - lock three UBIFS inodes for rename.
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
  *
- * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
- * be null.
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 			  struct inode *inode3)
 {
-	struct inode *i1, *i2, *i3;
-
-	if (!inode3) {
-		if (inode1 != inode2) {
-			lock_2_inodes(inode1, inode2);
-			return;
-		}
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
-		return;
-	}
-
-	if (inode1 == inode2) {
-		lock_2_inodes(inode1, inode3);
-		return;
-	}
-
-	/* 3 different inodes */
-	if (inode1 < inode2) {
-		i3 = inode2;
-		if (inode1 < inode3) {
-			i1 = inode1;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode1;
-		}
-	} else {
-		i3 = inode1;
-		if (inode2 < inode3) {
-			i1 = inode2;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode2;
-		}
-	}
-	mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
-	lock_2_inodes(i2, i3);
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	if (inode2 != inode1)
+		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+	if (inode3)
+		mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
 }
 
 /**
- * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
 			    struct inode *inode3)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-	if (inode1 != inode2)
-		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
 	if (inode3)
 		mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+	if (inode1 != inode2)
+		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		"dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
 		old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
 		new_dentry->d_name.name, new_dir->i_ino);
+	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+	if (unlink)
+		ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+
 
 	if (unlink && is_dir) {
 		err = check_dir_empty(c, new_inode);
-- 
cgit v1.2.3


From e4d9b6cbfc98d696a28d2c24a3d49768695811ee Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 14:17:36 +0200
Subject: UBIFS: fix LEB list freeing

When freeing the c->idx_gc list, we have to release the LEBs as well,
because we might be called from the re-mount to read-only mode code.
Otherwise the LEBs stay taken forever, which may cause problems when we
re-mount back to R/W mode.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c     | 16 ++++++++++++----
 fs/ubifs/lprops.c |  8 ++++++++
 fs/ubifs/super.c  | 42 +++++++++++++++++++++++++++---------------
 fs/ubifs/ubifs.h  |  2 +-
 4 files changed, 48 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index b2e5f113337..9760154d874 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -830,21 +830,29 @@ out:
  * ubifs_destroy_idx_gc - destroy idx_gc list.
  * @c: UBIFS file-system description object
  *
- * This function destroys the idx_gc list. It is called when unmounting or
- * remounting read-only so locks are not needed.
+ * This function destroys the @c->idx_gc list. It is called when unmounting or
+ * remounting read-only so locks are not needed. Returns zero in case of
+ * success and a negative error code in case of failure.
  */
-void ubifs_destroy_idx_gc(struct ubifs_info *c)
+int ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
+	int ret = 0;
+
 	while (!list_empty(&c->idx_gc)) {
+		int err;
 		struct ubifs_gced_idx_leb *idx_gc;
 
 		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
 				    list);
-		c->idx_gc_cnt -= 1;
+		err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
+					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
+		if (err && !ret)
+			ret = err;
 		list_del(&idx_gc->list);
 		kfree(idx_gc);
 	}
 
+	return ret;
 }
 
 /**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bcece27..68328c59762 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot change properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot update properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
 	lpp = ubifs_lpt_lookup(c, lnum);
 	if (IS_ERR(lpp)) {
 		err = PTR_ERR(lpp);
+		ubifs_err("cannot read properties of LEB %d, error %d",
+			  lnum, err);
 		goto out;
 	}
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index da99da098ef..807bbd3c8b4 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1469,9 +1469,6 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
 	int err, lnum;
 
-	if (c->ro_media)
-		return -EINVAL;
-
 	mutex_lock(&c->umount_mutex);
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
@@ -1605,9 +1602,13 @@ out:
  */
 static void commit_on_unmount(struct ubifs_info *c)
 {
-	struct super_block *sb = c->vfs_sb;
 	long long bud_bytes;
 
+	if (!c->fast_unmount) {
+		dbg_gen("skip committing - fast unmount enabled");
+		return;
+	}
+
 	/*
 	 * This function is called before the background thread is stopped, so
 	 * we may race with ongoing commit, which means we have to take
@@ -1617,8 +1618,11 @@ static void commit_on_unmount(struct ubifs_info *c)
 	bud_bytes = c->bud_bytes;
 	spin_unlock(&c->buds_lock);
 
-	if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
+	if (bud_bytes) {
+		dbg_gen("run commit");
 		ubifs_run_commit(c);
+	} else
+		dbg_gen("journal is empty, do not run commit");
 }
 
 /**
@@ -1633,6 +1637,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
+	ubifs_assert(!c->ro_media);
+
 	commit_on_unmount(c);
 
 	mutex_lock(&c->umount_mutex);
@@ -1646,16 +1652,17 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 		del_timer_sync(&c->jheads[i].wbuf.timer);
 	}
 
-	if (!c->ro_media) {
-		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
-		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
-		c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
-		err = ubifs_write_master(c);
-		if (err)
-			ubifs_ro_mode(c, err);
-	}
+	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+	c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+	err = ubifs_write_master(c);
+	if (err)
+		ubifs_ro_mode(c, err);
+
+	err = ubifs_destroy_idx_gc(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 
-	ubifs_destroy_idx_gc(c);
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
@@ -1754,6 +1761,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount R/W, UBIFS is working in "
+				  "R/O mode");
+			return -EINVAL;
+		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
@@ -2044,7 +2056,7 @@ static void ubifs_kill_sb(struct super_block *sb)
 	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
 	 * in order to be outside BKL.
 	 */
-	if (sb->s_root)
+	if (sb->s_root && !(sb->s_flags & MS_RDONLY))
 		commit_on_unmount(c);
 	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2e78d6ac007..ee9517a7b02 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1593,7 +1593,7 @@ int ubifs_replay_journal(struct ubifs_info *c);
 int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
 int ubifs_gc_start_commit(struct ubifs_info *c);
 int ubifs_gc_end_commit(struct ubifs_info *c);
-void ubifs_destroy_idx_gc(struct ubifs_info *c);
+int ubifs_destroy_idx_gc(struct ubifs_info *c);
 int ubifs_get_idx_gc_leb(struct ubifs_info *c);
 int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
 
-- 
cgit v1.2.3


From 84abf972ccff5c13d10b672972949eba431a6e0e Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 14:54:59 +0200
Subject: UBIFS: add re-mount debugging checks

We observe corrupted space accounting when re-mounting. So add some
debugging checks to catch problems like this.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/budget.c |  35 +++++++++++-----
 fs/ubifs/debug.c  | 120 +++++++++++++++++++++++++++++++++++++++---------------
 fs/ubifs/debug.h  |  36 +++++++++-------
 fs/ubifs/file.c   |   1 -
 fs/ubifs/lprops.c |   4 +-
 fs/ubifs/super.c  |  14 +++++--
 fs/ubifs/ubifs.h  |   3 +-
 7 files changed, 148 insertions(+), 65 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c590b7..f393620890e 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 }
 
 /**
- * ubifs_get_free_space - return amount of free space.
+ * ubifs_get_free_space_nolock - return amount of free space.
  * @c: UBIFS file-system description object
  *
  * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
-long long ubifs_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 {
-	int min_idx_lebs, rsvd_idx_lebs, lebs;
+	int rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	spin_lock(&c->space_lock);
-	min_idx_lebs = c->min_idx_lebs;
-	ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	available = ubifs_calc_available(c, min_idx_lebs);
+	available = ubifs_calc_available(c, c->min_idx_lebs);
 
 	/*
 	 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 	 * Note, the calculations below are similar to what we have in
 	 * 'do_budget_space()', so refer there for comments.
 	 */
-	if (min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	if (c->min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
 	lebs -= rsvd_idx_lebs;
 	available += lebs * (c->dark_wm - c->leb_overhead);
-	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
 		free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 		free = 0;
 	return free;
 }
+
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and retuns amount of free space to report to
+ * user-space.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+	long long free;
+
+	spin_lock(&c->space_lock);
+	free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+
+	return free;
+}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a16c18..9a41f6f245b 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
 	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
 	       c->gc_lnum, c->ihead_lnum);
-	for (i = 0; i < c->jhead_cnt; i++)
-		printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
-		       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+	/* If we are in R/O mode, journal heads do not exist */
+	if (c->jheads)
+		for (i = 0; i < c->jhead_cnt; i++)
+			printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+			       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
 	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
 		bud = rb_entry(rb, struct ubifs_bud, rb);
 		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
 	/* Print budgeting predictions */
 	available = ubifs_calc_available(c, c->min_idx_lebs);
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	if (available > outstanding)
-		free = ubifs_reported_space(c, available - outstanding);
-	else
-		free = 0;
+	free = ubifs_get_free_space_nolock(c);
 	printk(KERN_DEBUG "Budgeting predictions:\n");
 	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
@@ -860,6 +859,65 @@ void dbg_dump_index(struct ubifs_info *c)
 	dbg_walk_index(c, NULL, dump_znode, NULL);
 }
 
+/**
+ * dbg_save_space_info - save information about flash space.
+ * @c: UBIFS file-system description object
+ *
+ * This function saves information about UBIFS free space, dirty space, etc, in
+ * order to check it later.
+ */
+void dbg_save_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+
+	ubifs_get_lp_stats(c, &d->saved_lst);
+
+	spin_lock(&c->space_lock);
+	d->saved_free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * dbg_check_space_info - check flash space information.
+ * @c: UBIFS file-system description object
+ *
+ * This function compares current flash space information with the information
+ * which was saved when the 'dbg_save_space_info()' function was called.
+ * Returns zero if the information has not changed, and %-EINVAL it it has
+ * changed.
+ */
+int dbg_check_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	struct ubifs_lp_stats lst;
+	long long avail, free;
+
+	spin_lock(&c->space_lock);
+	avail = ubifs_calc_available(c, c->min_idx_lebs);
+	spin_unlock(&c->space_lock);
+	free = ubifs_get_free_space(c);
+
+	if (free != d->saved_free) {
+		ubifs_err("free space changed from %lld to %lld",
+			  d->saved_free, free);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_msg("saved lprops statistics dump");
+	dbg_dump_lstats(&d->saved_lst);
+	ubifs_get_lp_stats(c, &lst);
+	ubifs_msg("current lprops statistics dump");
+	dbg_dump_lstats(&d->saved_lst);
+	spin_lock(&c->space_lock);
+	dbg_dump_budg(c);
+	spin_unlock(&c->space_lock);
+	dump_stack();
+	return -EINVAL;
+}
+
 /**
  * dbg_check_synced_i_size - check synchronized inode size.
  * @inode: inode to check
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
  * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
  * contain the stuff specific to particular file-system mounts.
  */
-static struct dentry *debugfs_rootdir;
+static struct dentry *dfs_rootdir;
 
 /**
  * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
  */
 int dbg_debugfs_init(void)
 {
-	debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
-	if (IS_ERR(debugfs_rootdir)) {
-		int err = PTR_ERR(debugfs_rootdir);
+	dfs_rootdir = debugfs_create_dir("ubifs", NULL);
+	if (IS_ERR(dfs_rootdir)) {
+		int err = PTR_ERR(dfs_rootdir);
 		ubifs_err("cannot create \"ubifs\" debugfs directory, "
 			  "error %d\n", err);
 		return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
  */
 void dbg_debugfs_exit(void)
 {
-	debugfs_remove(debugfs_rootdir);
+	debugfs_remove(dfs_rootdir);
 }
 
 static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	struct ubifs_info *c = file->private_data;
 	struct ubifs_debug_info *d = c->dbg;
 
-	if (file->f_path.dentry == d->dump_lprops)
+	if (file->f_path.dentry == d->dfs_dump_lprops)
 		dbg_dump_lprops(c);
-	else if (file->f_path.dentry == d->dump_budg) {
+	else if (file->f_path.dentry == d->dfs_dump_budg) {
 		spin_lock(&c->space_lock);
 		dbg_dump_budg(c);
 		spin_unlock(&c->space_lock);
-	} else if (file->f_path.dentry == d->dump_tnc) {
+	} else if (file->f_path.dentry == d->dfs_dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
 		dbg_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	return count;
 }
 
-static const struct file_operations debugfs_fops = {
+static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
 	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
-	sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
-	d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
-					      debugfs_rootdir);
-	if (IS_ERR(d->debugfs_dir)) {
-		err = PTR_ERR(d->debugfs_dir);
+	sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
+	if (IS_ERR(d->dfs_dir)) {
+		err = PTR_ERR(d->dfs_dir);
 		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
-			  d->debugfs_dir_name, err);
+			  d->dfs_dir_name, err);
 		goto out;
 	}
 
 	fname = "dump_lprops";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_lprops = dent;
+	d->dfs_dump_lprops = dent;
 
 	fname = "dump_budg";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_budg = dent;
+	d->dfs_dump_budg = dent;
 
 	fname = "dump_tnc";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_tnc = dent;
+	d->dfs_dump_tnc = dent;
 
 	return 0;
 
@@ -2531,7 +2585,7 @@ out_remove:
 	err = PTR_ERR(dent);
 	ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
 		  fname, err);
-	debugfs_remove_recursive(d->debugfs_dir);
+	debugfs_remove_recursive(d->dfs_dir);
 out:
 	return err;
 }
@@ -2542,7 +2596,7 @@ out:
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	debugfs_remove_recursive(c->dbg->debugfs_dir);
+	debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d6999f7..c1cd73b2e06 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
  * @chk_lpt_wastage: used by LPT tree size checker
  * @chk_lpt_lebs: used by LPT tree size checker
  * @new_nhead_offs: used by LPT tree size checker
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
  *
- * debugfs_dir_name: name of debugfs directory containing this file-system's
- *                   files
- * debugfs_dir: direntry object of the file-system debugfs directory
- * dump_lprops: "dump lprops" debugfs knob
- * dump_budg: "dump budgeting information" debugfs knob
- * dump_tnc: "dump TNC" debugfs knob
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ *
+ * dfs_dir_name: name of debugfs directory containing this file-system's files
+ * dfs_dir: direntry object of the file-system debugfs directory
+ * dfs_dump_lprops: "dump lprops" debugfs knob
+ * dfs_dump_budg: "dump budgeting information" debugfs knob
+ * dfs_dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
 	void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
 	int new_ihead_lnum;
 	int new_ihead_offs;
 
-	char debugfs_dir_name[100];
-	struct dentry *debugfs_dir;
-	struct dentry *dump_lprops;
-	struct dentry *dump_budg;
-	struct dentry *dump_tnc;
+	struct ubifs_lp_stats saved_lst;
+	long long saved_free;
+
+	char dfs_dir_name[100];
+	struct dentry *dfs_dir;
+	struct dentry *dfs_dump_lprops;
+	struct dentry *dfs_dump_budg;
+	struct dentry *dfs_dump_tnc;
 };
 
 #define ubifs_assert(expr) do {                                                \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
 
 /* Checking functions */
-
+void dbg_save_space_info(struct ubifs_info *c);
+int dbg_check_space_info(struct ubifs_info *c);
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
 #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
+#define dbg_save_space_info(c)                     ({})
+#define dbg_check_space_info(c)                    0
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 17443d97e6f..93b6de51f26 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
 	struct page *page;
 
-
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
 
 	if (unlikely(c->ro_media))
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 68328c59762..4cdd284dea5 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @st: return statistics
  */
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
 {
 	spin_lock(&c->space_lock);
-	memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
 	spin_unlock(&c->space_lock);
 }
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 807bbd3c8b4..5c814a71f33 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1470,6 +1470,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	int err, lnum;
 
 	mutex_lock(&c->umount_mutex);
+	dbg_save_space_info(c);
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
 
@@ -1573,8 +1574,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	c->vfs_sb->s_flags &= ~MS_RDONLY;
 	c->remounting_rw = 0;
 	c->always_chk_crc = 0;
+	err = dbg_check_space_info(c);
 	mutex_unlock(&c->umount_mutex);
-	return 0;
+	return err;
 
 out:
 	vfree(c->orph_buf);
@@ -1629,8 +1631,8 @@ static void commit_on_unmount(struct ubifs_info *c)
  * ubifs_remount_ro - re-mount in read-only mode.
  * @c: UBIFS file-system description object
  *
- * We rely on VFS to have stopped writing. Possibly the background thread could
- * be running a commit, however kthread_stop will wait in that case.
+ * We assume VFS has stopped writing. Possibly the background thread could be
+ * running a commit, however kthread_stop will wait in that case.
  */
 static void ubifs_remount_ro(struct ubifs_info *c)
 {
@@ -1640,13 +1642,14 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	ubifs_assert(!c->ro_media);
 
 	commit_on_unmount(c);
-
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
 		kthread_stop(c->bgt);
 		c->bgt = NULL;
 	}
 
+	dbg_save_space_info(c);
+
 	for (i = 0; i < c->jhead_cnt; i++) {
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
 		del_timer_sync(&c->jheads[i].wbuf.timer);
@@ -1669,6 +1672,9 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
+	err = dbg_check_space_info(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 	mutex_unlock(&c->umount_mutex);
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ee9517a7b02..f1754354029 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1495,6 +1495,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 			 struct ubifs_budget_req *req);
 long long ubifs_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space_nolock(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
 long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1646,7 +1647,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
 					   int free, int dirty, int flags,
 					   int idx_gc_cnt);
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 		      int cat);
 void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
-- 
cgit v1.2.3


From b4978e949104844224ecf786170c9263efa601f3 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Fri, 23 Jan 2009 18:23:03 +0200
Subject: UBIFS: always clean up GC LEB space

When we mount UBIFS, GC LEB may contain out-of-date information,
and UBIFS should update lprops and set free space for this LEB.
Currently UBIFS does this only if mounted R/W. But for R/O mount
we have to do the same, because otherwise we will have incorrect
FS free space reported to user-space.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 5c814a71f33..336073e4c39 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = UBIFS_MAX_NLEN;
 	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
 	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+	ubifs_assert(buf->f_bfree <= c->block_cnt);
 	return 0;
 }
 
@@ -735,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
  * take_gc_lnum - reserve GC LEB.
  * @c: UBIFS file-system description object
  *
- * This function ensures that the LEB reserved for garbage collection is
- * unmapped and is marked as "taken" in lprops. We also have to set free space
- * to LEB size and dirty space to zero, because lprops may contain out-of-date
- * information if the file-system was un-mounted before it has been committed.
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function ensures that the LEB reserved for garbage collection is marked
+ * as "taken" in lprops. We also have to set free space to LEB size and dirty
+ * space to zero, because lprops may contain out-of-date information if the
+ * file-system was un-mounted before it has been committed. This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
  */
 static int take_gc_lnum(struct ubifs_info *c)
 {
@@ -751,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
-	err = ubifs_leb_unmap(c, c->gc_lnum);
-	if (err)
-		return err;
-
 	/* And we have to tell lprops that this LEB is taken */
 	err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
 				  LPROPS_TAKEN, 0, 0);
@@ -1280,10 +1277,19 @@ static int mount_ubifs(struct ubifs_info *c)
 			if (err)
 				goto out_orphans;
 			err = ubifs_rcvry_gc_commit(c);
-		} else
+		} else {
 			err = take_gc_lnum(c);
-		if (err)
-			goto out_orphans;
+			if (err)
+				goto out_orphans;
+
+			/*
+			 * GC LEB may contain garbage if there was an unclean
+			 * reboot, and it should be un-mapped.
+			 */
+			err = ubifs_leb_unmap(c, c->gc_lnum);
+			if (err)
+				return err;
+		}
 
 		err = dbg_check_lprops(c);
 		if (err)
@@ -1292,6 +1298,16 @@ static int mount_ubifs(struct ubifs_info *c)
 		err = ubifs_recover_size(c);
 		if (err)
 			goto out_orphans;
+	} else {
+		/*
+		 * Even if we mount read-only, we have to set space in GC LEB
+		 * to proper value because this affects UBIFS free space
+		 * reporting. We do not want to have a situation when
+		 * re-mounting from R/O to R/W changes amount of free space.
+		 */
+		err = take_gc_lnum(c);
+		if (err)
+			goto out_orphans;
 	}
 
 	spin_lock(&ubifs_infos_lock);
@@ -1316,6 +1332,8 @@ static int mount_ubifs(struct ubifs_info *c)
 		goto out_infos;
 
 	c->always_chk_crc = 0;
+	/* GC LEB has to be empty and taken at this point */
+	ubifs_assert(c->lst.taken_empty_lebs == 1);
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1561,7 +1579,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->need_recovery)
 		err = ubifs_rcvry_gc_commit(c);
 	else
-		err = take_gc_lnum(c);
+		err = ubifs_leb_unmap(c, c->gc_lnum);
 	if (err)
 		goto out;
 
@@ -1786,6 +1804,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		c->bu.buf = NULL;
 	}
 
+	ubifs_assert(c->lst.taken_empty_lebs == 1);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 49d128aa60751a010640f4763d11577e2f508853 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Mon, 26 Jan 2009 10:55:40 +0200
Subject: UBIFS: ensure orphan area head is initialized

When mounting read-only the orphan area head is
not initialized.  It must be initialized when
remounting read/write, but it was not.  This patch
fixes that.

[Artem: sorry, added comment tweaking noise]
Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/orphan.c | 38 +++++++++++++++++++-------------------
 fs/ubifs/super.c  |  6 ++++++
 fs/ubifs/ubifs.h  |  1 +
 3 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403f170..152a7b34a14 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
  * Orphans are accumulated in a rb-tree. When an inode's link count drops to
  * zero, the inode number is added to the rb-tree. It is removed from the tree
  * when the inode is deleted.  Any new orphans that are in the orphan tree when
- * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * the commit is run, are written to the orphan area in 1 or more orphan nodes.
  * If the orphan area is full, it is consolidated to make space.  There is
  * always enough space because validation prevents the user from creating more
  * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
 }
 
 /**
- * do_write_orph_node - write a node
+ * do_write_orph_node - write a node to the orphan head.
  * @c: UBIFS file-system description object
  * @len: length of node
  * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
 }
 
 /**
- * write_orph_node - write an orph node
+ * write_orph_node - write an orphan node.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function builds an orph node from the cnext list and writes it to the
+ * This function builds an orphan node from the cnext list and writes it to the
  * orphan head. On success, %0 is returned, otherwise a negative error code
  * is returned.
  */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 }
 
 /**
- * write_orph_nodes - write orph nodes until there are no more to commit
+ * write_orph_nodes - write orphan nodes until there are no more to commit.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function writes orph nodes for all the orphans to commit. On success,
+ * This function writes orphan nodes for all the orphans to commit. On success,
  * %0 is returned, otherwise a negative error code is returned.
  */
 static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
 }
 
 /**
- * clear_orphans - erase all LEBs used for orphans.
+ * ubifs_clear_orphans - erase all LEBs used for orphans.
  * @c: UBIFS file-system description object
  *
  * If recovery is not required, then the orphans from the previous session
  * are not needed. This function locates the LEBs used to record
  * orphans, and un-maps them.
  */
-static int clear_orphans(struct ubifs_info *c)
+int ubifs_clear_orphans(struct ubifs_info *c)
 {
 	int lnum, err;
 
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
  * do_kill_orphans - remove orphan inodes from the index.
  * @c: UBIFS file-system description object
  * @sleb: scanned LEB
- * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
  * @outofdate: whether the LEB is out of date is returned here
- * @last_flagged: whether the end orph node is encountered
+ * @last_flagged: whether the end orphan node is encountered
  *
  * This function is a helper to the 'kill_orphans()' function. It goes through
  * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		/*
 		 * The commit number on the master node may be less, because
 		 * of a failed commit. If there are several failed commits in a
-		 * row, the commit number written on orph nodes will continue to
-		 * increase (because the commit number is adjusted here) even
+		 * row, the commit number written on orphan nodes will continue
+		 * to increase (because the commit number is adjusted here) even
 		 * though the commit number on the master node stays the same
 		 * because the master node has not been re-written.
 		 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 			c->cmt_no = cmt_no;
 		if (cmt_no < *last_cmt_no && *last_flagged) {
 			/*
-			 * The last orph node had a higher commit number and was
-			 * flagged as the last written for that commit number.
-			 * That makes this orph node, out of date.
+			 * The last orphan node had a higher commit number and
+			 * was flagged as the last written for that commit
+			 * number. That makes this orphan node, out of date.
 			 */
 			if (!first) {
 				ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
 	/*
 	 * Orph nodes always start at c->orph_first and are written to each
 	 * successive LEB in turn. Generally unused LEBs will have been unmapped
-	 * but may contain out of date orph nodes if the unmap didn't go
-	 * through. In addition, the last orph node written for each commit is
+	 * but may contain out of date orphan nodes if the unmap didn't go
+	 * through. In addition, the last orphan node written for each commit is
 	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
-	 * there are orph nodes from the next commit (i.e. the commit did not
+	 * there are orphan nodes from the next commit (i.e. the commit did not
 	 * complete successfully). In that case, no orphans will have been lost
 	 * due to the way that orphans are written, and any orphans added will
 	 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
 	if (unclean)
 		err = kill_orphans(c);
 	else if (!read_only)
-		err = clear_orphans(c);
+		err = ubifs_clear_orphans(c);
 
 	return err;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 336073e4c39..fd7fc7f3b7a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1524,6 +1524,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		err = ubifs_recover_inl_heads(c, c->sbuf);
 		if (err)
 			goto out;
+	} else {
+		/* A readonly mount is not allowed to have orphans */
+		ubifs_assert(c->tot_orphans == 0);
+		err = ubifs_clear_orphans(c);
+		if (err)
+			goto out;
 	}
 
 	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index f1754354029..9999ff0aaa4 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1604,6 +1604,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
 int ubifs_orphan_start_commit(struct ubifs_info *c);
 int ubifs_orphan_end_commit(struct ubifs_info *c);
 int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+int ubifs_clear_orphans(struct ubifs_info *c);
 
 /* lpt.c */
 int ubifs_calc_lpt_geom(struct ubifs_info *c);
-- 
cgit v1.2.3


From bb875b38dc5e343bdb696b2eab8233e4d195e208 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fix NULL deref in fuse_file_alloc()

ff is set to NULL and then dereferenced on line 65.  Compile tested only.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index e8162646a9b..d9fdb7cec53 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -54,7 +54,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 		ff->reserved_req = fuse_request_alloc();
 		if (!ff->reserved_req) {
 			kfree(ff);
-			ff = NULL;
+			return NULL;
 		} else {
 			INIT_LIST_HEAD(&ff->write_entry);
 			atomic_set(&ff->count, 0);
-- 
cgit v1.2.3


From 3ddf1e7f57237ac7c5d5bfb7058f1ea4f970b661 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fix missing fput on error

Fix the leaking file reference if allocation or initialization of
fuse_conn failed.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/inode.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 47c96fdca1a..6893717b653 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -829,15 +829,20 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		return -EINVAL;
 
-	if (file->f_op != &fuse_dev_operations)
+	if (file->f_op != &fuse_dev_operations) {
+		fput(file);
 		return -EINVAL;
+	}
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	if (!fc)
+	if (!fc) {
+		fput(file);
 		return -ENOMEM;
+	}
 
 	err = fuse_conn_init(fc, sb);
 	if (err) {
+		fput(file);
 		kfree(fc);
 		return err;
 	}
-- 
cgit v1.2.3


From c2b8f006909b9bf9e165dfdf3c378527938c4497 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:58 +0100
Subject: fuse: fuse_fill_super error handling cleanup

Clean up error handling for the whole of fuse_fill_super() function.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6893717b653..dc649f6bc3e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -805,16 +805,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	int err;
 	int is_bdev = sb->s_bdev != NULL;
 
+	err = -EINVAL;
 	if (sb->s_flags & MS_MANDLOCK)
-		return -EINVAL;
+		goto err;
 
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
-		return -EINVAL;
+		goto err;
 
 	if (is_bdev) {
 #ifdef CONFIG_BLOCK
+		err = -EINVAL;
 		if (!sb_set_blocksize(sb, d.blksize))
-			return -EINVAL;
+			goto err;
 #endif
 	} else {
 		sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -826,25 +828,22 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_export_op = &fuse_export_operations;
 
 	file = fget(d.fd);
+	err = -EINVAL;
 	if (!file)
-		return -EINVAL;
+		goto err;
 
-	if (file->f_op != &fuse_dev_operations) {
-		fput(file);
-		return -EINVAL;
-	}
+	if (file->f_op != &fuse_dev_operations)
+		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
-	if (!fc) {
-		fput(file);
-		return -ENOMEM;
-	}
+	err = -ENOMEM;
+	if (!fc)
+		goto err_fput;
 
 	err = fuse_conn_init(fc, sb);
 	if (err) {
-		fput(file);
 		kfree(fc);
-		return err;
+		goto err_fput;
 	}
 
 	fc->release = fuse_free_conn;
@@ -859,12 +858,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	err = -ENOMEM;
 	root = fuse_get_root_inode(sb, d.rootmode);
 	if (!root)
-		goto err;
+		goto err_put_conn;
 
 	root_dentry = d_alloc_root(root);
 	if (!root_dentry) {
 		iput(root);
-		goto err;
+		goto err_put_conn;
 	}
 
 	init_req = fuse_request_alloc();
@@ -908,9 +907,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fuse_request_free(init_req);
  err_put_root:
 	dput(root_dentry);
- err:
-	fput(file);
+ err_put_conn:
 	fuse_conn_put(fc);
+ err_fput:
+	fput(file);
+ err:
 	return err;
 }
 
-- 
cgit v1.2.3


From 26c3679101dbccc054dcf370143941844ba70531 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:59 +0100
Subject: fuse: destroy bdi on umount

If a fuse filesystem is unmounted but the device file descriptor
remains open and a new mount reuses the old device number, then the
mount fails with EEXIST and the following warning is printed in the
kernel log:

  WARNING: at fs/sysfs/dir.c:462 sysfs_add_one+0x35/0x3d()
  sysfs: duplicate filename '0:15' can not be created

The cause is that the bdi belonging to the fuse filesystem was
destroyed only after the device file was released.  Fix this by
calling bdi_destroy() from fuse_put_super() instead.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
---
 fs/fuse/dev.c   | 3 ++-
 fs/fuse/inode.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e0c7ada08a1..c4a3d9bbdaa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -281,7 +281,8 @@ __releases(&fc->lock)
 			fc->blocked = 0;
 			wake_up_all(&fc->blocked_waitq);
 		}
-		if (fc->num_background == FUSE_CONGESTION_THRESHOLD) {
+		if (fc->num_background == FUSE_CONGESTION_THRESHOLD &&
+		    fc->connected) {
 			clear_bdi_congested(&fc->bdi, READ);
 			clear_bdi_congested(&fc->bdi, WRITE);
 		}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index dc649f6bc3e..459b73dd45e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -292,6 +292,7 @@ static void fuse_put_super(struct super_block *sb)
 	list_del(&fc->entry);
 	fuse_ctl_remove_conn(fc);
 	mutex_unlock(&fuse_mutex);
+	bdi_destroy(&fc->bdi);
 	fuse_conn_put(fc);
 }
 
@@ -532,7 +533,6 @@ void fuse_conn_put(struct fuse_conn *fc)
 		if (fc->destroy_req)
 			fuse_request_free(fc->destroy_req);
 		mutex_destroy(&fc->inst_mutex);
-		bdi_destroy(&fc->bdi);
 		fc->release(fc);
 	}
 }
-- 
cgit v1.2.3


From f6d47a1761896dcd89e3184399a8962dff17267d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 26 Jan 2009 15:00:59 +0100
Subject: fuse: fix poll notify

Move fuse_copy_finish() to before calling fuse_notify_poll_wakeup().
This is not a big issue because fuse_notify_poll_wakeup() should be
atomic, but it's cleaner this way, and later uses of notification will
need to be able to finish the copying before performing some actions.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dev.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c4a3d9bbdaa..ba76b68c52f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -826,16 +826,21 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
 			    struct fuse_copy_state *cs)
 {
 	struct fuse_notify_poll_wakeup_out outarg;
-	int err;
+	int err = -EINVAL;
 
 	if (size != sizeof(outarg))
-		return -EINVAL;
+		goto err;
 
 	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
 	if (err)
-		return err;
+		goto err;
 
+	fuse_copy_finish(cs);
 	return fuse_notify_poll_wakeup(fc, &outarg);
+
+err:
+	fuse_copy_finish(cs);
+	return err;
 }
 
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
@@ -846,6 +851,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		return fuse_notify_poll(fc, size, cs);
 
 	default:
+		fuse_copy_finish(cs);
 		return -EINVAL;
 	}
 }
@@ -924,7 +930,6 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 	 */
 	if (!oh.unique) {
 		err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
-		fuse_copy_finish(&cs);
 		return err ? err : nbytes;
 	}
 
-- 
cgit v1.2.3


From 6ba87c9b920bea8c2703308d31eb7de925242c30 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Mon, 26 Jan 2009 16:12:20 +0200
Subject: UBIFS: fix assertions

I introduced wrong assertions in one of the previous commits; this
patch fixes them.

Also, initialize debugfs after the debugging check. This is a little
nicer because we want the FS data to be accessible to external users
after everything has been initialized.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index fd7fc7f3b7a..dbfc8871471 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1320,20 +1320,21 @@ static int mount_ubifs(struct ubifs_info *c)
 		else {
 			c->need_recovery = 0;
 			ubifs_msg("recovery completed");
+			/* GC LEB has to be empty and taken at this point */
+			ubifs_assert(c->lst.taken_empty_lebs == 1);
 		}
-	}
+	} else
+		ubifs_assert(c->lst.taken_empty_lebs == 1);
 
-	err = dbg_debugfs_init_fs(c);
+	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
 
-	err = dbg_check_filesystem(c);
+	err = dbg_debugfs_init_fs(c);
 	if (err)
 		goto out_infos;
 
 	c->always_chk_crc = 0;
-	/* GC LEB has to be empty and taken at this point */
-	ubifs_assert(c->lst.taken_empty_lebs == 1);
 
 	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
 		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1663,7 +1664,7 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
-	ubifs_assert(!c->ro_media);
+	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 
 	commit_on_unmount(c);
 	mutex_lock(&c->umount_mutex);
-- 
cgit v1.2.3


From 3632dee2f8b8a9720329f29eeaa4ec4669a3aff8 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Thu, 22 Jan 2009 15:29:45 +0100
Subject: inotify: clean up inotify_read and fix locking problems

If userspace supplies an invalid pointer to a read() of an inotify
instance, the inotify device's event list mutex is unlocked twice.
This causes an imbalance which effectively leaves the data structure
unprotected, and we can trigger oopses by accessing the inotify
instance from different tasks concurrently.

The best fix (contributed largely by Linus) is a total rewrite
of the function in question:

On Thu, Jan 22, 2009 at 7:05 AM, Linus Torvalds wrote:
> The thing to notice is that:
>
>  - locking is done in just one place, and there is no question about it
>   not having an unlock.
>
>  - that whole double-while(1)-loop thing is gone.
>
>  - use multiple functions to make nesting and error handling sane
>
>  - do error testing after doing the things you always need to do, ie do
>   this:
>
>        mutex_lock(..)
>        ret = function_call();
>        mutex_unlock(..)
>
>        .. test ret here ..
>
>   instead of doing conditional exits with unlocking or freeing.
>
> So if the code is written in this way, it may still be buggy, but at least
> it's not buggy because of subtle "forgot to unlock" or "forgot to free"
> issues.
>
> This _always_ unlocks if it locked, and it always frees if it got a
> non-error kevent.

Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Robert Love <rlove@google.com>
Cc: <stable@kernel.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_user.c | 135 +++++++++++++++++++++------------------
 1 file changed, 74 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d53a1838d6e..bed766e435b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -427,10 +427,61 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
 	return ret;
 }
 
+/*
+ * Get an inotify_kernel_event if one exists and is small
+ * enough to fit in "count". Return an error pointer if
+ * not large enough.
+ *
+ * Called with the device ev_mutex held.
+ */
+static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
+						  size_t count)
+{
+	size_t event_size = sizeof(struct inotify_event);
+	struct inotify_kernel_event *kevent;
+
+	if (list_empty(&dev->events))
+		return NULL;
+
+	kevent = inotify_dev_get_event(dev);
+	if (kevent->name)
+		event_size += kevent->event.len;
+
+	if (event_size > count)
+		return ERR_PTR(-EINVAL);
+
+	remove_kevent(dev, kevent);
+	return kevent;
+}
+
+/*
+ * Copy an event to user space, returning how much we copied.
+ *
+ * We already checked that the event size is smaller than the
+ * buffer we had in "get_one_event()" above.
+ */
+static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+				  char __user *buf)
+{
+	size_t event_size = sizeof(struct inotify_event);
+
+	if (copy_to_user(buf, &kevent->event, event_size))
+		return -EFAULT;
+
+	if (kevent->name) {
+		buf += event_size;
+
+		if (copy_to_user(buf, kevent->name, kevent->event.len))
+			return -EFAULT;
+
+		event_size += kevent->event.len;
+	}
+	return event_size;
+}
+
 static ssize_t inotify_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *pos)
 {
-	size_t event_size = sizeof (struct inotify_event);
 	struct inotify_device *dev;
 	char __user *start;
 	int ret;
@@ -440,81 +491,43 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 	dev = file->private_data;
 
 	while (1) {
+		struct inotify_kernel_event *kevent;
 
 		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
 
 		mutex_lock(&dev->ev_mutex);
-		if (!list_empty(&dev->events)) {
-			ret = 0;
-			break;
-		}
+		kevent = get_one_event(dev, count);
 		mutex_unlock(&dev->ev_mutex);
 
-		if (file->f_flags & O_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
+		if (kevent) {
+			ret = PTR_ERR(kevent);
+			if (IS_ERR(kevent))
+				break;
+			ret = copy_event_to_user(kevent, buf);
+			free_kevent(kevent);
+			if (ret < 0)
+				break;
+			buf += ret;
+			count -= ret;
+			continue;
 		}
 
-		schedule();
-	}
-
-	finish_wait(&dev->wq, &wait);
-	if (ret)
-		return ret;
-
-	while (1) {
-		struct inotify_kernel_event *kevent;
-
-		ret = buf - start;
-		if (list_empty(&dev->events))
+		ret = -EAGAIN;
+		if (file->f_flags & O_NONBLOCK)
 			break;
-
-		kevent = inotify_dev_get_event(dev);
-		if (event_size + kevent->event.len > count) {
-			if (ret == 0 && count > 0) {
-				/*
-				 * could not get a single event because we
-				 * didn't have enough buffer space.
-				 */
-				ret = -EINVAL;
-			}
+		ret = -EINTR;
+		if (signal_pending(current))
 			break;
-		}
-		remove_kevent(dev, kevent);
 
-		/*
-		 * Must perform the copy_to_user outside the mutex in order
-		 * to avoid a lock order reversal with mmap_sem.
-		 */
-		mutex_unlock(&dev->ev_mutex);
-
-		if (copy_to_user(buf, &kevent->event, event_size)) {
-			ret = -EFAULT;
+		if (start != buf)
 			break;
-		}
-		buf += event_size;
-		count -= event_size;
-
-		if (kevent->name) {
-			if (copy_to_user(buf, kevent->name, kevent->event.len)){
-				ret = -EFAULT;
-				break;
-			}
-			buf += kevent->event.len;
-			count -= kevent->event.len;
-		}
-
-		free_kevent(kevent);
 
-		mutex_lock(&dev->ev_mutex);
+		schedule();
 	}
-	mutex_unlock(&dev->ev_mutex);
 
+	finish_wait(&dev->wq, &wait);
+	if (start != buf && ret != -EFAULT)
+		ret = buf - start;
 	return ret;
 }
 
-- 
cgit v1.2.3


From fdff73f094e7220602cc3f8959c7230517976412 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 26 Jan 2009 19:06:41 -0500
Subject: ext4: Initialize the new group descriptor when resizing the
 filesystem

Make sure all of the fields of the group descriptor are properly
initialized.  Previously, we allowed the bg_flags field to contain
random garbage, which could trigger non-deterministic behavior,
including a kernel OOPS.

http://bugzilla.kernel.org/show_bug.cgi?id=12433

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/resize.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c328be5d688..c06886abd65 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 					 gdb_off * EXT4_DESC_SIZE(sb));
 
+	memset(gdp, 0, EXT4_DESC_SIZE(sb));
 	ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 	ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 	ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
 	ext4_free_blks_set(sb, gdp, input->free_blocks_count);
 	ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
-	gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
+	gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
 
 	/*
-- 
cgit v1.2.3


From 9fd9784c91db79e953ea3fe3741f885bdc390a72 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Date: Mon, 26 Jan 2009 19:26:26 -0500
Subject: ext4: Fix building with EXT4FS_DEBUG

When bg_free_blocks_count was renamed to bg_free_blocks_count_lo in
560671a0, its uses under EXT4FS_DEBUG were not changed to the helper
ext4_free_blks_count.

Another commit, 498e5f24, also did not change everything needed under
EXT4FS_DEBUG, thus making it spill some warnings related to printing
format.

This commit fixes both issues and makes ext4 build again when
EXT4FS_DEBUG is enabled.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@holoscopio.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 6 +++---
 fs/ext4/extents.c | 2 +-
 fs/ext4/mballoc.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 6bba06b09dd..9a50b8052dc 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -684,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
-		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+		desc_count += ext4_free_blks_count(sb, gdp);
 		brelse(bitmap_bh);
 		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
 			continue;
 
 		x = ext4_count_free(bitmap_bh, sb->s_blocksize);
-		printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n",
-			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+		printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
+			i, ext4_free_blks_count(sb, gdp), x);
 		bitmap_count += x;
 	}
 	brelse(bitmap_bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54bf0623a9a..e2eab196875 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3048,7 +3048,7 @@ retry:
 			WARN_ON(ret <= 0);
 			printk(KERN_ERR "%s: ext4_ext_get_blocks "
 				    "returned error inode#%lu, block=%u, "
-				    "max_blocks=%lu", __func__,
+				    "max_blocks=%u", __func__,
 				    inode->i_ino, block, max_blocks);
 #endif
 			ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 918aec0c8a1..deba54f6cbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		goto out_err;
 
 	ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
-			gdp->bg_free_blocks_count);
+			ext4_free_blks_count(sb, gdp));
 
 	err = ext4_journal_get_write_access(handle, gdp_bh);
 	if (err)
-- 
cgit v1.2.3


From 6f7ab6d458bbfc2f55d295fa3e6b9e69cdb1d517 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Tue, 27 Jan 2009 16:12:31 +0200
Subject: UBIFS: fix no_chk_data_crc

When data CRC checking is disabled, UBIFS returns an incorrect return
code from the 'try_read_node()' function (0, which means CRC error,
instead of 1), which makes the caller re-read the data node again, but
using a different code path, so the second read is fine. Thus, we read
the same node twice. And the result of this is that UBIFS is slower
with the no_chk_data_crc option than it is with the chk_data_crc option.
This patch fixes the problem.

Reported-by: Reuben Dowle <Reuben.Dowle@navico.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/io.c    | 22 ++++++++++++++--------
 fs/ubifs/tnc.c   | 12 ++++++++----
 fs/ubifs/ubifs.h |  2 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01682713af6..e8e632a1dcd 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
  * would have been wasted for padding to the nearest minimal I/O unit boundary.
  * Instead, data first goes to the write-buffer and is flushed when the
  * buffer is full or when it is not used for some time (by timer). This is
- * similarto the mechanism is used by JFFS2.
+ * similar to the mechanism is used by JFFS2.
  *
  * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
  * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
  * @quiet: print no messages
- * @chk_crc: indicates whether to always check the CRC
+ * @must_chk_crc: indicates whether to always check the CRC
  *
  * This function checks node magic number and CRC checksum. This function also
  * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * node length in the common header could cause UBIFS to read memory outside of
  * allocated buffer when checking the CRC checksum.
  *
- * This function returns zero in case of success %-EUCLEAN in case of bad CRC
- * or magic.
+ * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by corresponding UBIFS mount option. However, if
+ * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
+ * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
+ * ignored and CRC is checked.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
  */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc)
+		     int offs, int quiet, int must_chk_crc)
 {
 	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
-	if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
+	     c->no_chk_data_crc)
+		return 0;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f54552..fa28a84c6a1 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
  * This function performs that same function as ubifs_read_node except that
  * it does not require that there is actually a node present and instead
  * the return code indicates if a node was read.
+ *
+ * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
+ * is true (it is controlled by corresponding mount option). However, if
+ * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
+ * checked.
  */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 			 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
-	if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
+		return 1;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
  *
  * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
  * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
- * maxumum possible amount of nodes for bulk-read.
+ * maximum possible amount of nodes for bulk-read.
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 9999ff0aaa4..29dfa816077 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1428,7 +1428,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc);
+		     int offs, int quiet, int must_chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
-- 
cgit v1.2.3


From f0e0059b9c18426cffdcc04161062251a8f9741e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Sun, 25 Jan 2009 20:53:00 -0600
Subject: don't reallocate sxp variable passed into xfs_swapext

fixes kernel.org bugzilla 12538, xfs_fsr fails on 2.6.29-rc kernels

Regression caused by 743bb4650da9e2595d6cedd01c680b5b9398c74a

This was an embarrassing mistake, reallocating the sxp pointer passed
in from the main ioctl switch.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reported-by: Paul Martin <pm@debian.org>
Tested-by: Paul Martin <pm@debian.org>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_dfrag.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee71349..f8278cfcc1d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
 	struct file	*file, *target_file;
 	int		error = 0;
 
-	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-	if (!sxp) {
-		error = XFS_ERROR(ENOMEM);
-		goto out;
-	}
-
 	/* Pull information for the target fd */
 	file = fget((int)sxp->sx_fdtarget);
 	if (!file) {
 		error = XFS_ERROR(EINVAL);
-		goto out_free_sxp;
+		goto out;
 	}
 
 	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
 	fput(target_file);
  out_put_file:
 	fput(file);
- out_free_sxp:
-	kmem_free(sxp);
  out:
 	return error;
 }
-- 
cgit v1.2.3


From bf935a78814cc9b96d09f612912178adc964ce9c Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 20 Jan 2009 19:32:59 -0500
Subject: nfsd: fix null dereference on error path

We're forgetting to check the return value from groups_alloc().

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index c903e04aa21..b860d3484cd 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -49,6 +49,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		new->fsuid = exp->ex_anon_uid;
 		new->fsgid = exp->ex_anon_gid;
 		gi = groups_alloc(0);
+		if (!gi)
+			goto oom;
 	} else if (flags & NFSEXP_ROOTSQUASH) {
 		if (!new->fsuid)
 			new->fsuid = exp->ex_anon_uid;
-- 
cgit v1.2.3


From b914152a6fbd2cd0441bc293ae8b3f3f1a9407b6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 20 Jan 2009 19:34:22 -0500
Subject: nfsd: fix cred leak on every rpc

Since override_creds() took its own reference on new, we need to release
our own reference.

(Note the put_cred on the return value puts the *old* value of
current->creds, not the new passed-in value).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/auth.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index b860d3484cd..5573508f707 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -87,6 +87,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
 		new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
 							new->cap_permitted);
 	put_cred(override_creds(new));
+	put_cred(new);
 	return 0;
 
 oom:
-- 
cgit v1.2.3


From fa82a491275a613b15489aab4b99acecb00958d3 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 22 Jan 2009 14:16:04 -0500
Subject: nfsd: only set file_lock.fl_lmops in nfsd4_lockt if a stateowner is
 found

nfsd4_lockt does a search for a lockstateowner when building the lock
struct to test. If one is found, it'll set fl_owner to it. Regardless of
whether that happens, it'll also set fl_lmops. Given that this lock is
basically a "lightweight" lock that's just used for checking conflicts,
setting fl_lmops is probably not appropriate for it.

This behavior exposed a bug in DLM's GETLK implementation where it
wasn't clearing out the fields in the file_lock before filling in
conflicting lock info. While we were able to fix this in DLM, it
still seems pointless and dangerous to set the fl_lmops this way
when we may have a NULL lockstateowner.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@pig.fieldses.org>
---
 fs/nfsd/nfs4state.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 88db7d3ec12..b6f60f48e94 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2871,7 +2871,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
 	file_lock.fl_pid = current->tgid;
 	file_lock.fl_flags = FL_POSIX;
-	file_lock.fl_lmops = &nfsd_posix_mng_ops;
 
 	file_lock.fl_start = lockt->lt_offset;
 	file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
-- 
cgit v1.2.3


From 4a29d2005b0f28d018d36d209c47f3973a725df5 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Tue, 27 Jan 2009 15:22:54 +0200
Subject: UBIFS: fix LPT out-of-space bug (again)

The function to traverse and dirty the LPT was still not
dirtying all nodes, with the result that the LPT could
run out of space.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/lpt_commit.c | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca9570717..3216a1f277f 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
 }
 
 /**
- * next_pnode - find next pnode.
+ * next_pnode_to_dirty - find next pnode to dirty.
  * @c: UBIFS file-system description object
  * @pnode: pnode
  *
- * This function returns the next pnode or %NULL if there are no more pnodes.
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes.  Note that pnodes that have never been written (lnum == 0) are
+ * skipped.
  */
-static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
-				      struct ubifs_pnode *pnode)
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
+					       struct ubifs_pnode *pnode)
 {
 	struct ubifs_nnode *nnode;
 	int iip;
 
 	/* Try to go right */
 	nnode = pnode->parent;
-	iip = pnode->iip + 1;
-	if (iip < UBIFS_LPT_FANOUT) {
-		/* We assume here that LEB zero is never an LPT LEB */
+	for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
 		if (nnode->nbranch[iip].lnum)
 			return ubifs_get_pnode(c, nnode, iip);
 	}
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 		nnode = nnode->parent;
 		if (!nnode)
 			return NULL;
-		/* We assume here that LEB zero is never an LPT LEB */
-	} while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+		for (; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+       } while (iip >= UBIFS_LPT_FANOUT);
 
 	/* Go right */
 	nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 
 	/* Go down to level 1 */
 	while (nnode->level > 1) {
-		nnode = ubifs_get_nnode(c, nnode, 0);
+		for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+		if (iip >= UBIFS_LPT_FANOUT) {
+			/*
+			 * Should not happen, but we need to keep going
+			 * if it does.
+			 */
+			iip = 0;
+		}
+		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
 			return (void *)nnode;
 	}
 
-	return ubifs_get_pnode(c, nnode, 0);
+	for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+		if (nnode->nbranch[iip].lnum)
+			break;
+	if (iip >= UBIFS_LPT_FANOUT)
+		/* Should not happen, but we need to keep going if it does */
+		iip = 0;
+	return ubifs_get_pnode(c, nnode, iip);
 }
 
 /**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
 	pnode = pnode_lookup(c, 0);
 	while (pnode) {
 		do_make_pnode_dirty(c, pnode);
-		pnode = next_pnode(c, pnode);
+		pnode = next_pnode_to_dirty(c, pnode);
 		if (IS_ERR(pnode))
 			return PTR_ERR(pnode);
 	}
-- 
cgit v1.2.3


From 0496e02d8791e7f06673a19a181be30dad6eff70 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 30 Dec 2008 12:39:16 -0500
Subject: cifs: turn smb_send into a wrapper around smb_sendv

cifs: turn smb_send into a wrapper around smb_sendv

Rename smb_send2 to smb_sendv to make it consistent with kernel naming
conventions for functions that take a vector.

There's no need to have 2 functions to handle sending SMB calls. Turn
smb_send into a wrapper around smb_sendv. This also allows us to
properly mark the socket as needing to be reconnected when there's a
partial send from smb_send.

Also, in practice we always use the address and noblocksnd flag
that's attached to the TCP_Server_Info. There's no need to pass
them in as separate args to smb_sendv.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |   4 +-
 fs/cifs/connect.c   |   4 +-
 fs/cifs/transport.c | 107 ++++++++++------------------------------------------
 3 files changed, 22 insertions(+), 93 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 06f6779988b..382ba629880 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -35,8 +35,8 @@ extern struct smb_hdr *cifs_buf_get(void);
 extern void cifs_buf_release(void *);
 extern struct smb_hdr *cifs_small_buf_get(void);
 extern void cifs_small_buf_release(void *);
-extern int smb_send(struct socket *, struct smb_hdr *,
-			unsigned int /* length */ , struct sockaddr *, bool);
+extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
+			unsigned int /* length */);
 extern unsigned int _GetXid(void);
 extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e9ea394ee07..7419576228f 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1860,9 +1860,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 			smb_buf = (struct smb_hdr *)ses_init_buf;
 			/* sizeof RFC1002_SESSION_REQUEST with no scope */
 			smb_buf->smb_buf_length = 0x81000044;
-			rc = smb_send(socket, smb_buf, 0x44,
-				(struct sockaddr *) &server->addr.sockAddr,
-				server->noblocksnd);
+			rc = smb_send(server, smb_buf, 0x44);
 			kfree(ses_init_buf);
 			msleep(1); /* RFC1001 layer in at least one server
 				      requires very short break before negprot
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 7ebe6599ed3..2c7efd26992 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -154,81 +154,8 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
 	spin_unlock(&GlobalMid_Lock);
 }
 
-int
-smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer,
-	 unsigned int smb_buf_length, struct sockaddr *sin, bool noblocksnd)
-{
-	int rc = 0;
-	int i = 0;
-	struct msghdr smb_msg;
-	struct kvec iov;
-	unsigned len = smb_buf_length + 4;
-
-	if (ssocket == NULL)
-		return -ENOTSOCK; /* BB eventually add reconnect code here */
-	iov.iov_base = smb_buffer;
-	iov.iov_len = len;
-
-	smb_msg.msg_name = sin;
-	smb_msg.msg_namelen = sizeof(struct sockaddr);
-	smb_msg.msg_control = NULL;
-	smb_msg.msg_controllen = 0;
-	if (noblocksnd)
-		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
-	else
-		smb_msg.msg_flags = MSG_NOSIGNAL;
-
-	/* smb header is converted in header_assemble. bcc and rest of SMB word
-	   area, and byte area if necessary, is converted to littleendian in
-	   cifssmb.c and RFC1001 len is converted to bigendian in smb_send
-	   Flags2 is converted in SendReceive */
-
-	smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
-	cFYI(1, ("Sending smb of length %d", smb_buf_length));
-	dump_smb(smb_buffer, len);
-
-	while (len > 0) {
-		rc = kernel_sendmsg(ssocket, &smb_msg, &iov, 1, len);
-		if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
-			i++;
-		/* smaller timeout here than send2 since smaller size */
-		/* Although it may not be required, this also is smaller
-		   oplock break time */
-			if (i > 12) {
-				cERROR(1,
-				   ("sends on sock %p stuck for 7 seconds",
-				    ssocket));
-				rc = -EAGAIN;
-				break;
-			}
-			msleep(1 << i);
-			continue;
-		}
-		if (rc < 0)
-			break;
-		else
-			i = 0; /* reset i after each successful send */
-		iov.iov_base += rc;
-		iov.iov_len -= rc;
-		len -= rc;
-	}
-
-	if (rc < 0) {
-		cERROR(1, ("Error %d sending data on socket to server", rc));
-	} else {
-		rc = 0;
-	}
-
-	/* Don't want to modify the buffer as a
-	   side effect of this call. */
-	smb_buffer->smb_buf_length = smb_buf_length;
-
-	return rc;
-}
-
 static int
-smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
-	  struct sockaddr *sin, bool noblocksnd)
+smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 {
 	int rc = 0;
 	int i = 0;
@@ -243,11 +170,11 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 	if (ssocket == NULL)
 		return -ENOTSOCK; /* BB eventually add reconnect code here */
 
-	smb_msg.msg_name = sin;
+	smb_msg.msg_name = (struct sockaddr *) &server->addr.sockAddr;
 	smb_msg.msg_namelen = sizeof(struct sockaddr);
 	smb_msg.msg_control = NULL;
 	smb_msg.msg_controllen = 0;
-	if (noblocksnd)
+	if (server->noblocksnd)
 		smb_msg.msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
 	else
 		smb_msg.msg_flags = MSG_NOSIGNAL;
@@ -339,6 +266,18 @@ smb_send2(struct TCP_Server_Info *server, struct kvec *iov, int n_vec,
 	return rc;
 }
 
+int
+smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
+	 unsigned int smb_buf_length)
+{
+	struct kvec iov;
+
+	iov.iov_base = smb_buffer;
+	iov.iov_len = smb_buf_length + 4;
+
+	return smb_sendv(server, &iov, 1);
+}
+
 static int wait_for_free_request(struct cifsSesInfo *ses, const int long_op)
 {
 	if (long_op == CIFS_ASYNC_OP) {
@@ -540,9 +479,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send2(ses->server, iov, n_vec,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		       ses->server->noblocksnd);
+	rc = smb_sendv(ses->server, iov, n_vec);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -736,9 +673,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
@@ -879,9 +814,7 @@ send_nt_cancel(struct cifsTconInfo *tcon, struct smb_hdr *in_buf,
 		mutex_unlock(&ses->server->srv_mutex);
 		return rc;
 	}
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-	      (struct sockaddr *) &(ses->server->addr.sockAddr),
-	      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 	mutex_unlock(&ses->server->srv_mutex);
 	return rc;
 }
@@ -973,9 +906,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 #ifdef CONFIG_CIFS_STATS2
 	atomic_inc(&ses->server->inSend);
 #endif
-	rc = smb_send(ses->server->ssocket, in_buf, in_buf->smb_buf_length,
-		      (struct sockaddr *) &(ses->server->addr.sockAddr),
-		      ses->server->noblocksnd);
+	rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
 #ifdef CONFIG_CIFS_STATS2
 	atomic_dec(&ses->server->inSend);
 	midQ->when_sent = jiffies;
-- 
cgit v1.2.3


From 6a7f8d36c00ab7adef5fb633f7805c91e8c1e139 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 12 Jan 2009 21:03:25 +0000
Subject: [CIFS] Rename md5 functions to avoid collision with new rt modules

When rt modules were added they (each) included their own md5
with names which collided with the existing names of cifs's md5 functions.

Renaming cifs's md5 functions so we don't collide with them.

> Stephen Rothwell wrote:
> When CIFS is built-in (=y) and staging/rt28[67]0 =y, there are multiple
> definitions of:
>
> build-r8250.out:(.text+0x1d8ad0): multiple definition of `MD5Init'
> build-r8250.out:(.text+0x1dbb30): multiple definition of `MD5Update'
> build-r8250.out:(.text+0x1db9b0): multiple definition of `MD5Final'
>
> all of which need to have more unique identifiers for their global
> symbols (e.g., rt28_md5_init, cifs_md5_init, foo, blah, bar).
>

CC: Greg K-H <gregkh@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 18 +++++++++---------
 fs/cifs/md5.c         | 38 +++++++++++++++++++-------------------
 fs/cifs/md5.h         |  6 +++---
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d4839cf0cb2..7c9809523f4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -48,11 +48,11 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
 		return -EINVAL;
 
-	MD5Init(&context);
-	MD5Update(&context, (char *)&key->data, key->len);
-	MD5Update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	cifs_MD5_init(&context);
+	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
 
-	MD5Final(signature, &context);
+	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
@@ -96,8 +96,8 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 	if ((iov == NULL) || (signature == NULL) || (key == NULL))
 		return -EINVAL;
 
-	MD5Init(&context);
-	MD5Update(&context, (char *)&key->data, key->len);
+	cifs_MD5_init(&context);
+	cifs_MD5_update(&context, (char *)&key->data, key->len);
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -110,13 +110,13 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			MD5Update(&context, iov[0].iov_base+4,
+			cifs_MD5_update(&context, iov[0].iov_base+4,
 				  iov[0].iov_len-4);
 		} else
-			MD5Update(&context, iov[i].iov_base, iov[i].iov_len);
+			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
 	}
 
-	MD5Final(signature, &context);
+	cifs_MD5_final(signature, &context);
 
 	return 0;
 }
diff --git a/fs/cifs/md5.c b/fs/cifs/md5.c
index 462bbfefd4b..98b66a54c31 100644
--- a/fs/cifs/md5.c
+++ b/fs/cifs/md5.c
@@ -10,8 +10,8 @@
  * with every copy.
  *
  * To compute the message digest of a chunk of bytes, declare an
- * MD5Context structure, pass it to MD5Init, call MD5Update as
- * needed on buffers full of bytes, and then call MD5Final, which
+ * MD5Context structure, pass it to cifs_MD5_init, call cifs_MD5_update as
+ * needed on buffers full of bytes, and then call cifs_MD5_final, which
  * will fill a supplied 16-byte array with the digest.
  */
 
@@ -45,7 +45,7 @@ byteReverse(unsigned char *buf, unsigned longs)
  * initialization constants.
  */
 void
-MD5Init(struct MD5Context *ctx)
+cifs_MD5_init(struct MD5Context *ctx)
 {
 	ctx->buf[0] = 0x67452301;
 	ctx->buf[1] = 0xefcdab89;
@@ -61,7 +61,7 @@ MD5Init(struct MD5Context *ctx)
  * of bytes.
  */
 void
-MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
+cifs_MD5_update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
 {
 	register __u32 t;
 
@@ -110,7 +110,7 @@ MD5Update(struct MD5Context *ctx, unsigned char const *buf, unsigned len)
  * 1 0* (64-bit count of bits processed, MSB-first)
  */
 void
-MD5Final(unsigned char digest[16], struct MD5Context *ctx)
+cifs_MD5_final(unsigned char digest[16], struct MD5Context *ctx)
 {
 	unsigned int count;
 	unsigned char *p;
@@ -165,7 +165,7 @@ MD5Final(unsigned char digest[16], struct MD5Context *ctx)
 
 /*
  * The core of the MD5 algorithm, this alters an existing MD5 hash to
- * reflect the addition of 16 longwords of new data.  MD5Update blocks
+ * reflect the addition of 16 longwords of new data.  cifs_MD5_update blocks
  * the data and converts bytes into longwords for this routine.
  */
 static void
@@ -267,9 +267,9 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		unsigned char tk[16];
 		struct MD5Context tctx;
 
-		MD5Init(&tctx);
-		MD5Update(&tctx, key, key_len);
-		MD5Final(tk, &tctx);
+		cifs_MD5_init(&tctx);
+		cifs_MD5_update(&tctx, key, key_len);
+		cifs_MD5_final(tk, &tctx);
 
 		key = tk;
 		key_len = 16;
@@ -287,8 +287,8 @@ hmac_md5_init_rfc2104(unsigned char *key, int key_len,
 		ctx->k_opad[i] ^= 0x5c;
 	}
 
-	MD5Init(&ctx->ctx);
-	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+	cifs_MD5_init(&ctx->ctx);
+	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 #endif
 
@@ -317,8 +317,8 @@ hmac_md5_init_limK_to_64(const unsigned char *key, int key_len,
 		ctx->k_opad[i] ^= 0x5c;
 	}
 
-	MD5Init(&ctx->ctx);
-	MD5Update(&ctx->ctx, ctx->k_ipad, 64);
+	cifs_MD5_init(&ctx->ctx);
+	cifs_MD5_update(&ctx->ctx, ctx->k_ipad, 64);
 }
 
 /***********************************************************************
@@ -328,7 +328,7 @@ void
 hmac_md5_update(const unsigned char *text, int text_len,
 		struct HMACMD5Context *ctx)
 {
-	MD5Update(&ctx->ctx, text, text_len);	/* then text of datagram */
+	cifs_MD5_update(&ctx->ctx, text, text_len);	/* then text of datagram */
 }
 
 /***********************************************************************
@@ -339,12 +339,12 @@ hmac_md5_final(unsigned char *digest, struct HMACMD5Context *ctx)
 {
 	struct MD5Context ctx_o;
 
-	MD5Final(digest, &ctx->ctx);
+	cifs_MD5_final(digest, &ctx->ctx);
 
-	MD5Init(&ctx_o);
-	MD5Update(&ctx_o, ctx->k_opad, 64);
-	MD5Update(&ctx_o, digest, 16);
-	MD5Final(digest, &ctx_o);
+	cifs_MD5_init(&ctx_o);
+	cifs_MD5_update(&ctx_o, ctx->k_opad, 64);
+	cifs_MD5_update(&ctx_o, digest, 16);
+	cifs_MD5_final(digest, &ctx_o);
 }
 
 /***********************************************************
diff --git a/fs/cifs/md5.h b/fs/cifs/md5.h
index f7d4f4197ba..6fba8cb402f 100644
--- a/fs/cifs/md5.h
+++ b/fs/cifs/md5.h
@@ -20,10 +20,10 @@ struct HMACMD5Context {
 };
 #endif				/* _HMAC_MD5_H */
 
-void MD5Init(struct MD5Context *context);
-void MD5Update(struct MD5Context *context, unsigned char const *buf,
+void cifs_MD5_init(struct MD5Context *context);
+void cifs_MD5_update(struct MD5Context *context, unsigned char const *buf,
 			unsigned len);
-void MD5Final(unsigned char digest[16], struct MD5Context *context);
+void cifs_MD5_final(unsigned char digest[16], struct MD5Context *context);
 
 /* The following definitions come from lib/hmacmd5.c  */
 
-- 
cgit v1.2.3


From 42c245447c8c3f998dfe880aba18b6e5129d2976 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Tue, 13 Jan 2009 22:03:55 +0000
Subject: [CIFS] revalidate parent inode when rmdir done within that directory

When a search is pending of a parent directory, and a child directory
within it is removed, we need to reset the parent directory's time
so that we don't reuse the (now stale) search results.

Thanks to Gunter Kukkukk for reporting this:

> got the following failure notification on irc #samba:
>
> A user was updating from subversion 1.4 to 1.5, where the
> repository is located on a samba share (independent of
> unix extensions = Yes or No).
> svn 1.4 did work, 1.5 does not.
>
> The user did a lot of stracing of subversion - and wrote a
> testapplet to simulate the failing behaviour.
> I've converted the C++ source to C and added some error cases.
>
> When using "./testdir" on a local file system, "result2"
> is always (nil) as expected - cifs vfs behaves different here!
>
>   ./testdir /mnt/cifs/mounted/share
>
> returns a (failing) valid pointer.

Acked-by: Dave Kleikamp <shaggy@us.ibm.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES | 4 +++-
 fs/cifs/inode.c | 5 +++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 080703a15f4..73ac7ebd1df 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ rather than posix (advisory) byte range locks, even though server would
 support posix byte range locks.  Fix query of root inode when prefixpath
 specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows).
+Samba servers (worked to Windows).  Fix rmdir so that pending search
+(readdir) requests do not get invalid results which include the now
+removed directory.
 
 Version 1.55
 ------------
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 5ab9896fdcb..bcf7b518466 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1285,6 +1285,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
 	cifsInode = CIFS_I(direntry->d_inode);
 	cifsInode->time = 0;	/* force revalidate to go get info when
 				   needed */
+
+	cifsInode = CIFS_I(inode);
+	cifsInode->time = 0;	/* force revalidate to get parent dir info
+				   since cached search results now invalid */
+
 	direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime =
 		current_fs_time(inode->i_sb);
 
-- 
cgit v1.2.3


From f818dd55c4a8b3519e203900bde0bb780d36e799 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 19 Jan 2009 02:38:35 +0000
Subject: [CIFS] some cleanup to dir.c prior to addition of posix_open

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 56 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 838d9c720a5..964aad03c5a 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -129,6 +129,17 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+static void setup_cifs_dentry(struct cifsTconInfo *tcon,
+			      struct dentry *direntry,
+			      struct inode *newinode)
+{
+	if (tcon->nocase)
+		direntry->d_op = &cifs_ci_dentry_ops;
+	else
+		direntry->d_op = &cifs_dentry_ops;
+	d_instantiate(direntry, newinode);
+}
+
 /* Inode operations in similar order to how they appear in Linux file fs.h */
 
 int
@@ -139,14 +150,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int xid;
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
+	/* BB below access is too much for the mknod to request */
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	char *full_path = NULL;
 	FILE_ALL_INFO *buf = NULL;
 	struct inode *newinode = NULL;
-	struct cifsFileInfo *pCifsFile = NULL;
 	struct cifsInodeInfo *pCifsInode;
 	int disposition = FILE_OVERWRITE_IF;
 	bool write_only = false;
@@ -154,7 +165,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
@@ -162,6 +173,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
+	mode &= ~current->fs->umask;
+
 	if (nd && (nd->flags & LOOKUP_OPEN)) {
 		int oflags = nd->intent.open.flags;
 
@@ -196,17 +209,15 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		return -ENOMEM;
 	}
 
-	mode &= ~current->fs->umask;
-
 	/*
 	 * if we're not using unix extensions, see if we need to set
 	 * ATTR_READONLY on the create call
 	 */
-	if (!pTcon->unix_ext && (mode & S_IWUGO) == 0)
+	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
 		create_options |= CREATE_OPTION_READONLY;
 
 	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, create_options,
 			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
 			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -215,7 +226,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	if (rc == -EIO) {
 		/* old server, retry the open legacy style */
-		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
 			desiredAccess, create_options,
 			&fileHandle, &oplock, buf, cifs_sb->local_nls,
 			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -225,7 +236,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	} else {
 		/* If Open reported that we actually created a file
 		then we now have to set the mode if possible */
-		if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+		if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
 			struct cifs_unix_set_info_args args = {
 				.mode	= mode,
 				.ctime	= NO_CHANGE_64,
@@ -244,20 +255,20 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 				args.uid = NO_CHANGE_64;
 				args.gid = NO_CHANGE_64;
 			}
-			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
 				cifs_sb->local_nls,
 				cifs_sb->mnt_cifs_flags &
 					CIFS_MOUNT_MAP_SPECIAL_CHR);
 		} else {
 			/* BB implement mode setting via Windows security
 			   descriptors e.g. */
-			/* CIFSSMBWinSetPerms(xid,pTcon,path,mode,-1,-1,nls);*/
+			/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
 
 			/* Could set r/o dos attribute if mode & 0222 == 0 */
 		}
 
 		/* server might mask mode so we have to query for it */
-		if (pTcon->unix_ext)
+		if (tcon->unix_ext)
 			rc = cifs_get_inode_info_unix(&newinode, full_path,
 						 inode->i_sb, xid);
 		else {
@@ -283,22 +294,17 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 		}
 
 		if (rc != 0) {
-			cFYI(1,
-			     ("Create worked but get_inode_info failed rc = %d",
-			      rc));
-		} else {
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
-			d_instantiate(direntry, newinode);
-		}
+			cFYI(1, ("Create worked, get_inode_info failed rc = %d",
+				 rc));
+		} else
+			setup_cifs_dentry(tcon, direntry, newinode);
+
 		if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
 			(!(nd->flags & LOOKUP_OPEN))) {
 			/* mknod case - do not leave file open */
-			CIFSSMBClose(xid, pTcon, fileHandle);
+			CIFSSMBClose(xid, tcon, fileHandle);
 		} else if (newinode) {
-			pCifsFile =
+			struct cifsFileInfo *pCifsFile =
 			   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
 
 			if (pCifsFile == NULL)
@@ -316,7 +322,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 			/* set the following in open now
 				pCifsFile->pfile = file; */
 			write_lock(&GlobalSMBSeslock);
-			list_add(&pCifsFile->tlist, &pTcon->openFileList);
+			list_add(&pCifsFile->tlist, &tcon->openFileList);
 			pCifsInode = CIFS_I(newinode);
 			if (pCifsInode) {
 				/* if readable file instance put first in list*/
-- 
cgit v1.2.3


From da505c386c9f993e43861791dae339b2219cf8dd Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 19 Jan 2009 03:49:35 +0000
Subject: [CIFS] Make socket retry timeouts consistent between blocking and
 nonblocking cases

We have used approximately 15 second timeouts on nonblocking sends in the past, and
also 15 second SMB timeout (waiting for server responses, for most request types).
Now that we can do blocking tcp sends,
make blocking send timeout approximately the same (15 seconds).

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c   |  4 ++--
 fs/cifs/transport.c | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7419576228f..a3537a90a9d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1802,7 +1802,7 @@ ipv4_connect(struct TCP_Server_Info *server)
 	 *  user space buffer
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
-	socket->sk->sk_sndtimeo = 3 * HZ;
+	socket->sk->sk_sndtimeo = 5 * HZ;
 
 	/* make the bufsizes depend on wsize/rsize and max requests */
 	if (server->noautotune) {
@@ -1953,7 +1953,7 @@ ipv6_connect(struct TCP_Server_Info *server)
 	 * user space buffer
 	 */
 	socket->sk->sk_rcvtimeo = 7 * HZ;
-	socket->sk->sk_sndtimeo = 3 * HZ;
+	socket->sk->sk_sndtimeo = 5 * HZ;
 	server->ssocket = socket;
 
 	return rc;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 2c7efd26992..0ad3e2d116a 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -199,7 +199,25 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
 				    n_vec - first_vec, total_len);
 		if ((rc == -ENOSPC) || (rc == -EAGAIN)) {
 			i++;
-			if (i >= 14) {
+			/* if blocking send we try 3 times, since each can block
+			   for 5 seconds. For nonblocking  we have to try more
+			   but wait increasing amounts of time allowing time for
+			   socket to clear.  The overall time we wait in either
+			   case to send on the socket is about 15 seconds.
+			   Similarly we wait for 15 seconds for
+			   a response from the server in SendReceive[2]
+			   for the server to send a response back for
+			   most types of requests (except SMB Write
+			   past end of file which can be slow, and
+			   blocking lock operations). NFS waits slightly longer
+			   than CIFS, but this can make it take longer for
+			   nonresponsive servers to be detected and 15 seconds
+			   is more than enough time for modern networks to
+			   send a packet.  In most cases if we fail to send
+			   after the retries we will kill the socket and
+			   reconnect which may clear the network problem.
+			*/
+			if ((i >= 14) || (!server->noblocksnd && (i > 2))) {
 				cERROR(1,
 				   ("sends on sock %p stuck for 15 seconds",
 				    ssocket));
-- 
cgit v1.2.3


From a9ac49d303f967be0dabd97cb722c4a13109c6c2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 22 Jan 2009 14:43:21 -0500
Subject: cifs: make sure we allocate enough storage for socket address

The sockaddr declared on the stack in cifs_get_tcp_session is too small
for IPv6 addresses. Change it from "struct sockaddr" to "struct
sockaddr_storage" to prevent stack corruption when IPv6 is used.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a3537a90a9d..2209be94305 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1354,7 +1354,7 @@ cifs_parse_mount_options(char *options, const char *devname,
 }
 
 static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr)
+cifs_find_tcp_session(struct sockaddr_storage *addr)
 {
 	struct list_head *tmp;
 	struct TCP_Server_Info *server;
@@ -1374,11 +1374,11 @@ cifs_find_tcp_session(struct sockaddr *addr)
 		if (server->tcpStatus == CifsNew)
 			continue;
 
-		if (addr->sa_family == AF_INET &&
+		if (addr->ss_family == AF_INET &&
 		    (addr4->sin_addr.s_addr !=
 		     server->addr.sockAddr.sin_addr.s_addr))
 			continue;
-		else if (addr->sa_family == AF_INET6 &&
+		else if (addr->ss_family == AF_INET6 &&
 			 memcmp(&server->addr.sockAddr6.sin6_addr,
 				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
 			continue;
@@ -1419,12 +1419,12 @@ static struct TCP_Server_Info *
 cifs_get_tcp_session(struct smb_vol *volume_info)
 {
 	struct TCP_Server_Info *tcp_ses = NULL;
-	struct sockaddr addr;
+	struct sockaddr_storage addr;
 	struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
 	struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
 	int rc;
 
-	memset(&addr, 0, sizeof(struct sockaddr));
+	memset(&addr, 0, sizeof(struct sockaddr_storage));
 
 	if (volume_info->UNCip && volume_info->UNC) {
 		rc = cifs_inet_pton(AF_INET, volume_info->UNCip,
@@ -1435,9 +1435,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 			rc = cifs_inet_pton(AF_INET6, volume_info->UNCip,
 					    &sin_server6->sin6_addr.in6_u);
 			if (rc > 0)
-				addr.sa_family = AF_INET6;
+				addr.ss_family = AF_INET6;
 		} else {
-			addr.sa_family = AF_INET;
+			addr.ss_family = AF_INET;
 		}
 
 		if (rc <= 0) {
@@ -1502,7 +1502,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	tcp_ses->tcpStatus = CifsNew;
 	++tcp_ses->srv_count;
 
-	if (addr.sa_family == AF_INET6) {
+	if (addr.ss_family == AF_INET6) {
 		cFYI(1, ("attempting ipv6 connect"));
 		/* BB should we allow ipv6 on port 139? */
 		/* other OS never observed in Wild doing 139 with v6 */
-- 
cgit v1.2.3


From 3eb14297c4b85af0c5e6605e18d93b6031330d71 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 11:17:24 +0200
Subject: UBIFS: sync wbufs after syncing inodes and pages

All writes go through wbufs so they must be sync'd
after syncing inodes and pages.

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index dbfc8871471..3ddd754262b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -450,16 +450,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	/*
-	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
-	 * do this if it waits for an already running commit.
-	 */
-	for (i = 0; i < c->jhead_cnt; i++) {
-		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		if (err)
-			return err;
-	}
-
 	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
@@ -471,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 */
 	generic_sync_sb_inodes(sb, &wbc);
 
+	/*
+	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
+	 * do this if it waits for an already running commit.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
 	err = ubifs_run_commit(c);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 227c75c91dbfa037d109ab7ef45b7f5ba9cab6d0 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 11:53:51 +0200
Subject: UBIFS: spelling fix 'date' -> 'data'

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/debug.c | 2 +-
 fs/ubifs/gc.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 9a41f6f245b..e975bd82f38 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1407,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
  * @c: UBIFS file-system description object
  * @leaf_cb: called for each leaf node
  * @znode_cb: called for each indexing node
- * @priv: private date which is passed to callbacks
+ * @priv: private data which is passed to callbacks
  *
  * This function walks the UBIFS index and calls the @leaf_cb for each leaf
  * node and @znode_cb for each indexing node. Returns zero in case of success
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9760154d874..bad3339a800 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -401,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 
 		/*
 		 * Don't release the LEB until after the next commit, because
-		 * it may contain date which is needed for recovery. So
+		 * it may contain data which is needed for recovery. So
 		 * although we freed this LEB, it will become usable only after
 		 * the commit.
 		 */
-- 
cgit v1.2.3


From b466f17d780c5b72427f36aef22ecdec9f1d0689 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <ext-adrian.hunter@nokia.com>
Date: Thu, 29 Jan 2009 12:59:33 +0200
Subject: UBIFS: remount ro fixes

- preserve the idx_gc list - it will be needed in the same
state, should UBIFS be remounted rw again
- prevent remounting ro if we have switched to read only
mode (due to a fatal error)

Signed-off-by: Adrian Hunter <ext-adrian.hunter@nokia.com>
Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/gc.c    | 18 +++++-------------
 fs/ubifs/super.c | 14 +++++++-------
 fs/ubifs/ubifs.h |  2 +-
 3 files changed, 13 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index bad3339a800..a711d33b3d3 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -830,29 +830,21 @@ out:
  * ubifs_destroy_idx_gc - destroy idx_gc list.
  * @c: UBIFS file-system description object
  *
- * This function destroys the @c->idx_gc list. It is called when unmounting or
- * remounting read-only so locks are not needed. Returns zero in case of
- * success and a negative error code in case of failure.
+ * This function destroys the @c->idx_gc list. It is called when unmounting
+ * so locks are not needed. Returns zero in case of success and a negative
+ * error code in case of failure.
  */
-int ubifs_destroy_idx_gc(struct ubifs_info *c)
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
-	int ret = 0;
-
 	while (!list_empty(&c->idx_gc)) {
-		int err;
 		struct ubifs_gced_idx_leb *idx_gc;
 
 		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
 				    list);
-		err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
-					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
-		if (err && !ret)
-			ret = err;
+		c->idx_gc_cnt -= 1;
 		list_del(&idx_gc->list);
 		kfree(idx_gc);
 	}
-
-	return ret;
 }
 
 /**
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3ddd754262b..daa679d3a03 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1687,10 +1687,6 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	if (err)
 		ubifs_ro_mode(c, err);
 
-	err = ubifs_destroy_idx_gc(c);
-	if (err)
-		ubifs_ro_mode(c, err);
-
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
@@ -1793,15 +1789,19 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
-			ubifs_msg("cannot re-mount R/W, UBIFS is working in "
-				  "R/O mode");
+			ubifs_msg("cannot re-mount due to prior errors");
 			return -EINVAL;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount due to prior errors");
+			return -EINVAL;
+		}
 		ubifs_remount_ro(c);
+	}
 
 	if (c->bulk_read == 1)
 		bu_init(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 29dfa816077..535f8742679 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1594,7 +1594,7 @@ int ubifs_replay_journal(struct ubifs_info *c);
 int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
 int ubifs_gc_start_commit(struct ubifs_info *c);
 int ubifs_gc_end_commit(struct ubifs_info *c);
-int ubifs_destroy_idx_gc(struct ubifs_info *c);
+void ubifs_destroy_idx_gc(struct ubifs_info *c);
 int ubifs_get_idx_gc_leb(struct ubifs_info *c);
 int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
 
-- 
cgit v1.2.3


From a2b9df3ff691db8e5e521dccd231a8098bbf7416 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 29 Jan 2009 16:22:54 +0200
Subject: UBIFS: return sensible error codes

When mounting/re-mounting, UBIFS returns EINVAL even if the ENOSPC
or EROFS codes are much better, just because we have not found
references to ENOSPC/EROFS in the mount(2) man page. This patch
changes this behaviour and makes UBIFS return the real error code,
because:

1. It is just less confusing and more logical
2. mount is not described in SUSv3, so it seems to be not really
   well-standardized
3. we do not cover all cases, and any random error code that is
   undocumented in the man pages may be returned anyway

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/master.c |  2 +-
 fs/ubifs/super.c  | 11 +++--------
 2 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493bf56..a88f33801b9 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
 	int err, lnum, offs, len;
 
 	if (c->ro_media)
-		return -EINVAL;
+		return -EROFS;
 
 	lnum = UBIFS_MST_LNUM;
 	offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index daa679d3a03..ab85eb8cce7 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1085,12 +1085,7 @@ static int check_free_space(struct ubifs_info *c)
 		ubifs_err("insufficient free space to mount in read/write mode");
 		dbg_dump_budg(c);
 		dbg_dump_lprops(c);
-		/*
-		 * We return %-EINVAL instead of %-ENOSPC because it seems to
-		 * be the closest error code mentioned in the mount function
-		 * documentation.
-		 */
-		return -EINVAL;
+		return -ENOSPC;
 	}
 	return 0;
 }
@@ -1790,7 +1785,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
-			return -EINVAL;
+			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
 		if (err)
@@ -1798,7 +1793,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
-			return -EINVAL;
+			return -EROFS;
 		}
 		ubifs_remount_ro(c);
 	}
-- 
cgit v1.2.3


From 27ad27993313312a4ad0047d0a944c425cd511a5 Mon Sep 17 00:00:00 2001
From: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Date: Thu, 29 Jan 2009 16:34:30 +0200
Subject: UBIFS: remove fast unmounting

This UBIFS feature has never worked properly, and it was a mistake
to add it because we simply have no use-cases. So, let's still accept
the fast_unmount mount option, but ignore it. This does not change
much, because UBIFS commits in sync_fs anyway, and sync_fs is called
while unmounting.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
---
 fs/ubifs/super.c | 50 +++++---------------------------------------------
 fs/ubifs/ubifs.h |  2 --
 2 files changed, 5 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index ab85eb8cce7..1182b66a549 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -957,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		/*
+		 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
+		 * We accepte them in order to be backware-compatible. But this
+		 * should be removed at some point.
+		 */
 		case Opt_fast_unmount:
 			c->mount_opts.unmount_mode = 2;
-			c->fast_unmount = 1;
 			break;
 		case Opt_norm_unmount:
 			c->mount_opts.unmount_mode = 1;
-			c->fast_unmount = 0;
 			break;
 		case Opt_bulk_read:
 			c->mount_opts.bulk_read = 2;
@@ -1359,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
 	       c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
 	       c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
 	       c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
-	dbg_msg("fast unmount:        %d", c->fast_unmount);
 	dbg_msg("big_lpt              %d", c->big_lpt);
 	dbg_msg("log LEBs:            %d (%d - %d)",
 		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1615,38 +1617,6 @@ out:
 	return err;
 }
 
-/**
- * commit_on_unmount - commit the journal when un-mounting.
- * @c: UBIFS file-system description object
- *
- * This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled.
- */
-static void commit_on_unmount(struct ubifs_info *c)
-{
-	long long bud_bytes;
-
-	if (!c->fast_unmount) {
-		dbg_gen("skip committing - fast unmount enabled");
-		return;
-	}
-
-	/*
-	 * This function is called before the background thread is stopped, so
-	 * we may race with ongoing commit, which means we have to take
-	 * @c->bud_lock to access @c->bud_bytes.
-	 */
-	spin_lock(&c->buds_lock);
-	bud_bytes = c->bud_bytes;
-	spin_unlock(&c->buds_lock);
-
-	if (bud_bytes) {
-		dbg_gen("run commit");
-		ubifs_run_commit(c);
-	} else
-		dbg_gen("journal is empty, do not run commit");
-}
-
 /**
  * ubifs_remount_ro - re-mount in read-only mode.
  * @c: UBIFS file-system description object
@@ -1661,7 +1631,6 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 	ubifs_assert(!c->need_recovery);
 	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 
-	commit_on_unmount(c);
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
 		kthread_stop(c->bgt);
@@ -2077,15 +2046,6 @@ out_close:
 
 static void ubifs_kill_sb(struct super_block *sb)
 {
-	struct ubifs_info *c = sb->s_fs_info;
-
-	/*
-	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
-	 * in order to be outside BKL.
-	 */
-	if (sb->s_root && !(sb->s_flags & MS_RDONLY))
-		commit_on_unmount(c);
-	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
 }
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 535f8742679..039a68bee29 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
  * @cs_lock: commit state lock
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
  *
- * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
 	spinlock_t cs_lock;
 	wait_queue_head_t cmt_wq;
 
-	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
-- 
cgit v1.2.3


From df1c46b2b6876d0a1b1b4740f009fa69d95ebbc9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 29 Jan 2009 16:53:35 -0800
Subject: tun: Add some missing TUN compat ioctl translations.

Based upon a report from Michael Tokarev <mjt@tls.msk.ru>:

	Just saw in dmesg:

	ioctl32(kvm:4408): Unknown cmd fd(9) cmd(800454cf){t:'T';sz:4} arg(ffc668e4) on /dev/net/tun

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 fs/compat_ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5235c67e759..c8f8d5904f5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
 		 * cannot be fixed without breaking all existing apps.
 		 */
 		case TUNSETIFF:
+		case TUNGETIFF:
 		case SIOCGIFFLAGS:
 		case SIOCGIFMETRIC:
 		case SIOCGIFMTU:
@@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM)
 COMPATIBLE_IOCTL(TUNSETDEBUG)
 COMPATIBLE_IOCTL(TUNSETPERSIST)
 COMPATIBLE_IOCTL(TUNSETOWNER)
+COMPATIBLE_IOCTL(TUNSETLINK)
+COMPATIBLE_IOCTL(TUNSETGROUP)
+COMPATIBLE_IOCTL(TUNGETFEATURES)
+COMPATIBLE_IOCTL(TUNSETOFFLOAD)
+COMPATIBLE_IOCTL(TUNSETTXFILTER)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc)
 HANDLE_IOCTL(TUNSETIFF, dev_ifsioc)
+HANDLE_IOCTL(TUNGETIFF, dev_ifsioc)
 HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl)
 HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl)
 HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl)
-- 
cgit v1.2.3


From 9df04e1f25effde823a600e755b51475d438f56b Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Thu, 29 Jan 2009 14:25:26 -0800
Subject: epoll: drop max_user_instances and rely only on max_user_watches

Linus suggested to put limits where the money is, and max_user_watches
already does that w/out the need of max_user_instances.  That has the
advantage to mitigate the potential DoS while allowing pretty generous
default behavior.

Allowing top 4% of low memory (per user) to be allocated in epoll watches,
we have:

LOMEM    MAX_WATCHES (per user)
512MB    ~178000
1GB      ~356000
2GB      ~712000

Socket buffer math teaches us that a box with 512MB of lomem will
struggle to hit 180K watches anyway, so there is no more need for a
max_user_instances limit.

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Willy Tarreau <w@1wt.eu>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Bron Gondwana <brong@fastmail.fm>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ba2f9ec7119..011b9b8c90c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -234,8 +234,6 @@ struct ep_pqueue {
 /*
  * Configuration options available inside /proc/sys/fs/epoll/
  */
-/* Maximum number of epoll devices, per user */
-static int max_user_instances __read_mostly;
 /* Maximum number of epoll watched descriptors, per user */
 static int max_user_watches __read_mostly;
 
@@ -260,14 +258,6 @@ static struct kmem_cache *pwq_cache __read_mostly;
 static int zero;
 
 ctl_table epoll_table[] = {
-	{
-		.procname	= "max_user_instances",
-		.data		= &max_user_instances,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.extra1		= &zero,
-	},
 	{
 		.procname	= "max_user_watches",
 		.data		= &max_user_watches,
@@ -491,7 +481,6 @@ static void ep_free(struct eventpoll *ep)
 
 	mutex_unlock(&epmutex);
 	mutex_destroy(&ep->mtx);
-	atomic_dec(&ep->user->epoll_devs);
 	free_uid(ep->user);
 	kfree(ep);
 }
@@ -581,10 +570,6 @@ static int ep_alloc(struct eventpoll **pep)
 	struct eventpoll *ep;
 
 	user = get_current_user();
-	error = -EMFILE;
-	if (unlikely(atomic_read(&user->epoll_devs) >=
-			max_user_instances))
-		goto free_uid;
 	error = -ENOMEM;
 	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
 	if (unlikely(!ep))
@@ -1141,7 +1126,6 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
 			      flags & O_CLOEXEC);
 	if (fd < 0)
 		ep_free(ep);
-	atomic_inc(&ep->user->epoll_devs);
 
 error_return:
 	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
@@ -1366,8 +1350,10 @@ static int __init eventpoll_init(void)
 	struct sysinfo si;
 
 	si_meminfo(&si);
-	max_user_instances = 128;
-	max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) /
+	/*
+	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
+	 */
+	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
 		EP_ITEM_COST;
 
 	/* Initialize the structure used to perform safe poll wait head wake ups */
-- 
cgit v1.2.3


From b9ec63f78b425c0e16cc95605b5d4ff2dc228b97 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 30 Jan 2009 00:00:24 -0500
Subject: ext4: Remove bogus BUG() check in ext4_bmap()

The code to support journal-less ext4 operation added a BUG to
ext4_bmap() which fired if there was no journal and the
EXT4_STATE_JDATA bit was set in the i_state field.  This caused
running the filefrag program (which uses the FIBMAP ioctl) to trigger
a BUG().

The EXT4_STATE_JDATA bit is only used for ext4_bmap(), and it's
harmless for the bit to be set.  We could add a check in
__ext4_journalled_writepage() and ext4_journalled_write_end() to only
set the EXT4_STATE_JDATA bit if the journal is present, but that adds
an extra test and jump instruction.  It's easier to simply remove the
BUG check.

http://bugzilla.kernel.org/show_bug.cgi?id=12568

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@kernel.org
---
 fs/ext4/inode.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b4386dafeb0..03ba20be132 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2821,9 +2821,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 		filemap_write_and_wait(mapping);
 	}
 
-	BUG_ON(!EXT4_JOURNAL(inode) &&
-	       EXT4_I(inode)->i_state & EXT4_STATE_JDATA);
-
 	if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
-- 
cgit v1.2.3


From 7b24fc4d7eb611da367dea3aad45473050aacd6c Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:38 -0500
Subject: block: Don't verify integrity metadata on read error

If we get an I/O error on a read request there is no point in doing a
verify pass on the integrity buffer.  Adjust the completion path
accordingly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 77ebc3c263d..8396d741f80 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -465,7 +465,7 @@ static int bio_integrity_verify(struct bio *bio)
 
 		if (ret) {
 			kunmap_atomic(kaddr, KM_USER0);
-			break;
+			return ret;
 		}
 
 		sectors = bv->bv_len / bi->sector_size;
@@ -493,18 +493,13 @@ static void bio_integrity_verify_fn(struct work_struct *work)
 	struct bio_integrity_payload *bip =
 		container_of(work, struct bio_integrity_payload, bip_work);
 	struct bio *bio = bip->bip_bio;
-	int error = bip->bip_error;
+	int error;
 
-	if (bio_integrity_verify(bio)) {
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		error = -EIO;
-	}
+	error = bio_integrity_verify(bio);
 
 	/* Restore original bio completion handler */
 	bio->bi_end_io = bip->bip_end_io;
-
-	if (bio->bi_end_io)
-		bio->bi_end_io(bio, error);
+	bio_endio(bio, error);
 }
 
 /**
@@ -525,7 +520,17 @@ void bio_integrity_endio(struct bio *bio, int error)
 
 	BUG_ON(bip->bip_bio != bio);
 
-	bip->bip_error = error;
+	/* In case of an I/O error there is no point in verifying the
+	 * integrity metadata.  Restore original bio end_io handler
+	 * and run it.
+	 */
+	if (error) {
+		bio->bi_end_io = bip->bip_end_io;
+		bio_endio(bio, error);
+
+		return;
+	}
+
 	INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
 	queue_work(kintegrityd_wq, &bip->bip_work);
 }
-- 
cgit v1.2.3


From 8ae372e3bb4acaca37ffa2ce54f4cf8dd60a94fa Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sun, 4 Jan 2009 02:43:39 -0500
Subject: block: Remove obsolete BUG_ON

Now that bio_vecs are no longer cleared in bvec_alloc_bs() the following
BUG_ON must go.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio-integrity.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8396d741f80..549b0144da1 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 
 	iv = bip_vec_idx(bip, bip->bip_vcnt);
 	BUG_ON(iv == NULL);
-	BUG_ON(iv->bv_page != NULL);
 
 	iv->bv_page = page;
 	iv->bv_len = len;
-- 
cgit v1.2.3


From ea455f8ab68338ba69f5d3362b342c115bea8e13 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 23:20:31 +0100
Subject: ocfs2: Push out dropping of dentry lock to ocfs2_wq

Dropping the last reference to a dentry lock is a complicated operation that
involves dropping a reference to the inode. The quota code in particular needs
to obtain some quota locks at that point, which can lead to a deadlock. Thus we
defer dropping of the inode reference to ocfs2_wq.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dcache.c | 42 +++++++++++++++++++++++++++++++++++++++---
 fs/ocfs2/dcache.h |  9 ++++++++-
 fs/ocfs2/ocfs2.h  |  6 ++++++
 fs/ocfs2/super.c  |  3 +++
 4 files changed, 56 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c381e8..e9d7c2038c0 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
+#include "super.h"
 
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
 	return ret;
 }
 
+static DEFINE_SPINLOCK(dentry_list_lock);
+
+/* We limit the number of dentry locks to drop in one go. We have
+ * this limit so that we don't starve other users of ocfs2_wq. */
+#define DL_INODE_DROP_COUNT 64
+
+/* Drop inode references from dentry locks */
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+	struct ocfs2_dentry_lock *dl;
+	int drop_count = DL_INODE_DROP_COUNT;
+
+	spin_lock(&dentry_list_lock);
+	while (osb->dentry_lock_list && drop_count--) {
+		dl = osb->dentry_lock_list;
+		osb->dentry_lock_list = dl->dl_next;
+		spin_unlock(&dentry_list_lock);
+		iput(dl->dl_inode);
+		kfree(dl);
+		spin_lock(&dentry_list_lock);
+	}
+	if (osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	spin_unlock(&dentry_list_lock);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -318,16 +347,23 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
-	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-	kfree(dl);
+
+	/* We leave dropping of inode reference to ocfs2_wq as that can
+	 * possibly lead to inode deletion which gets tricky */
+	spin_lock(&dentry_list_lock);
+	if (!osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	dl->dl_next = osb->dentry_lock_list;
+	osb->dentry_lock_list = dl;
+	spin_unlock(&dentry_list_lock);
 }
 
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl)
 {
-	int unlock = 0;
+	int unlock;
 
 	BUG_ON(dl->dl_count == 0);
 
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34d988..d06e16c0664 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,8 +29,13 @@
 extern struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
+	/* Use count of dentry lock */
 	unsigned int		dl_count;
-	u64			dl_parent_blkno;
+	union {
+		/* Linked list of dentry locks to release */
+		struct ocfs2_dentry_lock *dl_next;
+		u64			dl_parent_blkno;
+	};
 
 	/*
 	 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
+void ocfs2_drop_dl_inodes(struct work_struct *work);
+
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
 
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a29ed..077384135f4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
 struct ocfs2_quota_recovery;
+struct ocfs2_dentry_lock;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
 	struct list_head blocked_lock_list;
 	unsigned long blocked_lock_count;
 
+	/* List of dentry locks to release. Anyone can add locks to
+	 * the list, ocfs2_wq processes the list  */
+	struct ocfs2_dentry_lock *dentry_lock_list;
+	struct work_struct dentry_lock_work;
+
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed11345b5..b1cb38fbe80 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1887,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
 	journal->j_state = OCFS2_JOURNAL_FREE;
 
+	INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
+	osb->dentry_lock_list = NULL;
+
 	/* get some pseudo constants for clustersize bits */
 	osb->s_clustersize_bits =
 		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
-- 
cgit v1.2.3


From f8afead7169f0f28a4b421bcbdb510e52a2d094d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 12 Jan 2009 23:20:32 +0100
Subject: ocfs2: Fix possible deadlock in ocfs2_write_dquot()

It could happen that some limit has been set via quotactl() and in parallel
->mark_dirty() is called from another thread doing e.g. dquot_alloc_space(). In
such case ocfs2_write_dquot() must not try to sync the dquot because that needs
global quota lock but that ranks above transaction start.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/quota_global.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f4efa89baee..1ed0f7c8686 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	if (dquot->dq_flags & mask)
 		sync = 1;
 	spin_unlock(&dq_data_lock);
-	if (!sync) {
+	/* This is a slight hack but we can't afford getting global quota
+	 * lock if we already have a transaction started. */
+	if (!sync || journal_current_handle()) {
 		status = ocfs2_write_dquot(dquot);
 		goto out;
 	}
-- 
cgit v1.2.3


From 0e0333429a6280e6eb3c98845e4eed90d5f8078a Mon Sep 17 00:00:00 2001
From: Joel Becker <Joel.Becker@oracle.com>
Date: Wed, 17 Dec 2008 14:23:52 -0800
Subject: configfs: Silence lockdep on mkdir(), rmdir() and
 configfs_depend_item()

When attaching default groups (subdirs) of a new group (in mkdir() or
in configfs_register()), configfs recursively takes inode's mutexes
along the path from the parent of the new group to the default
subdirs. This is needed to ensure that the VFS will not race with
operations on these sub-dirs. This is safe for the following reasons:

- the VFS allows one to lock first an inode and second one of its
  children (The lock subclasses for this pattern are respectively
  I_MUTEX_PARENT and I_MUTEX_CHILD);
- from this rule any inode path can be recursively locked in
  descending order as long as it stays under a single mountpoint and
  does not follow symlinks.

Unfortunately lockdep does not know (yet?) how to handle such
recursion.

I've tried to use Peter Zijlstra's lock_set_subclass() helper to
upgrade i_mutexes from I_MUTEX_CHILD to I_MUTEX_PARENT when we know
that we might recursively lock some of their descendants, but this
usage does not seem to fit the purpose of lock_set_subclass() because
it leads to several i_mutex locked with subclass I_MUTEX_PARENT by
the same task.

>From inside configfs it is not possible to serialize those recursive
locking with a top-level one, because mkdir() and rmdir() are already
called with inodes locked by the VFS. So using some
mutex_lock_nest_lock() is not an option.

I am proposing two solutions:
1) one that wraps recursive mutex_lock()s with
   lockdep_off()/lockdep_on().
2) (as suggested earlier by Peter Zijlstra) one that puts the
   i_mutexes recursively locked in different classes based on their
   depth from the top-level config_group created. This
   induces an arbitrary limit (MAX_LOCK_DEPTH - 2 == 46) on the
   nesting of configfs default groups whenever lockdep is activated
   but this limit looks reasonably high. Unfortunately, this also
   isolates VFS operations on configfs default groups from the others
   and thus lowers the chances to detect locking issues.

This patch implements solution 1).

Solution 2) looks better from lockdep's point of view, but fails with
configfs_depend_item(). This needs to rework the locking
scheme of configfs_depend_item() by removing the variable lock recursion
depth, and I think that it's doable thanks to the configfs_dirent_lock.
For now, let's stick to solution 1).

Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/configfs/dir.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

(limited to 'fs')

diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341f3e8..9c235839114 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -553,12 +553,24 @@ static void detach_groups(struct config_group *group)
 
 		child = sd->s_dentry;
 
+		/*
+		 * Note: we hide this from lockdep since we have no way
+		 * to teach lockdep about recursive
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+		 * in an inode tree, which are valid as soon as
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+		 * parent inode to one of its children.
+		 */
+		lockdep_off();
 		mutex_lock(&child->d_inode->i_mutex);
+		lockdep_on();
 
 		configfs_detach_group(sd->s_element);
 		child->d_inode->i_flags |= S_DEAD;
 
+		lockdep_off();
 		mutex_unlock(&child->d_inode->i_mutex);
+		lockdep_on();
 
 		d_delete(child);
 		dput(child);
@@ -748,11 +760,22 @@ static int configfs_attach_item(struct config_item *parent_item,
 			 * We are going to remove an inode and its dentry but
 			 * the VFS may already have hit and used them. Thus,
 			 * we must lock them as rmdir() would.
+			 *
+			 * Note: we hide this from lockdep since we have no way
+			 * to teach lockdep about recursive
+			 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+			 * in an inode tree, which are valid as soon as
+			 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+			 * parent inode to one of its children.
 			 */
+			lockdep_off();
 			mutex_lock(&dentry->d_inode->i_mutex);
+			lockdep_on();
 			configfs_remove_dir(item);
 			dentry->d_inode->i_flags |= S_DEAD;
+			lockdep_off();
 			mutex_unlock(&dentry->d_inode->i_mutex);
+			lockdep_on();
 			d_delete(dentry);
 		}
 	}
@@ -787,14 +810,25 @@ static int configfs_attach_group(struct config_item *parent_item,
 		 *
 		 * We must also lock the inode to remove it safely in case of
 		 * error, as rmdir() would.
+		 *
+		 * Note: we hide this from lockdep since we have no way
+		 * to teach lockdep about recursive
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+		 * in an inode tree, which are valid as soon as
+		 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+		 * parent inode to one of its children.
 		 */
+		lockdep_off();
 		mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+		lockdep_on();
 		ret = populate_groups(to_config_group(item));
 		if (ret) {
 			configfs_detach_item(item);
 			dentry->d_inode->i_flags |= S_DEAD;
 		}
+		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
+		lockdep_on();
 		if (ret)
 			d_delete(dentry);
 	}
@@ -956,7 +990,17 @@ static int configfs_depend_prep(struct dentry *origin,
 	BUG_ON(!origin || !sd);
 
 	/* Lock this guy on the way down */
+	/*
+	 * Note: we hide this from lockdep since we have no way
+	 * to teach lockdep about recursive
+	 * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path
+	 * in an inode tree, which are valid as soon as
+	 * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a
+	 * parent inode to one of its children.
+	 */
+	lockdep_off();
 	mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+	lockdep_on();
 	if (sd->s_element == target)  /* Boo-yah */
 		goto out;
 
@@ -970,7 +1014,9 @@ static int configfs_depend_prep(struct dentry *origin,
 	}
 
 	/* We looped all our children and didn't find target */
+	lockdep_off();
 	mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+	lockdep_on();
 	ret = -ENOENT;
 
 out:
@@ -990,11 +1036,16 @@ static void configfs_depend_rollback(struct dentry *origin,
 	struct dentry *dentry = item->ci_dentry;
 
 	while (dentry != origin) {
+		/* See comments in configfs_depend_prep() */
+		lockdep_off();
 		mutex_unlock(&dentry->d_inode->i_mutex);
+		lockdep_on();
 		dentry = dentry->d_parent;
 	}
 
+	lockdep_off();
 	mutex_unlock(&origin->d_inode->i_mutex);
+	lockdep_on();
 }
 
 int configfs_depend_item(struct configfs_subsystem *subsys,
@@ -1329,8 +1380,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 			}
 
 			/* Wait until the racing operation terminates */
+			/*
+			 * Note: we hide this from lockdep since we are locked
+			 * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why
+			 * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or
+			 * I_MUTEX_QUOTA are not relevant for the locked inode.
+			 */
+			lockdep_off();
 			mutex_lock(wait_mutex);
 			mutex_unlock(wait_mutex);
+			lockdep_on();
 		}
 	} while (ret == -EAGAIN);
 
-- 
cgit v1.2.3


From 554e7f9e043e29da79c044f7a55efe4fad40701e Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 8 Jan 2009 08:21:43 +0800
Subject: ocfs2: Access the xattr bucket only before modifying it.

In ocfs2_xattr_value_truncate, we may call b-tree codes which will
extend the journal transaction. This has a potential problem: it
may cause the already-accessed-but-not-dirtied buffers to be lost. So we'd
better access the bucket after we call ocfs2_xattr_value_truncate.
And as for the root buffer for the xattr value, b-tree code will
access and dirty it, so we don't need to worry about it.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/xattr.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638af6ac..915039fffe6 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -4729,13 +4729,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	vb.vb_xv = (struct ocfs2_xattr_value_root *)
 		(vb.vb_bh->b_data + offset % blocksize);
 
-	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	/*
 	 * From here on out we have to dirty the bucket.  The generic
 	 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4741,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_dirty;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	xe->xe_value_size = cpu_to_le64(len);
 
-out_dirty:
 	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
 
 out:
-- 
cgit v1.2.3


From a4b91965d39d5d53b470d6aa62cba155a6f3ffe1 Mon Sep 17 00:00:00 2001
From: Sunil Mushran <sunil.mushran@oracle.com>
Date: Thu, 29 Jan 2009 17:12:31 -0800
Subject: ocfs2: Wakeup the downconvert thread after a successful cancel
 convert

When two nodes holding PR locks on a resource concurrently attempt to
upconvert the locks to EX, the master sends a BAST to one of the nodes. This
message tells that node to first cancel convert the upconvert request,
followed by downconvert to a NL. Only when this lock is downconverted to NL,
can the master upconvert the first node's lock to EX.

While the fs was doing the cancel convert, it was forgetting to wake up the
dc thread after a successful cancel, leading to a deadlock.

Reported-and-Tested-by: David Teigland <teigland@redhat.com>
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
---
 fs/ocfs2/dlmglue.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cadd4c4..206a2370876 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2860,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	case OCFS2_UNLOCK_CANCEL_CONVERT:
 		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
 		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
 		break;
 	case OCFS2_UNLOCK_DROP_LOCK:
 		lockres->l_level = DLM_LOCK_IV;
-- 
cgit v1.2.3


From fd4ef231962ab44fd1004e87f9d7c6809f00cd64 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Thu, 29 Jan 2009 15:06:21 -0800
Subject: ocfs2: add quota call to ocfs2_remove_btree_range()

We weren't reclaiming the clusters which get free'd from this function,
so any user punching holes in a file would still have those bytes accounted
against him/her. Add the call to vfs_dq_free_space_nodirty() to fix this.
Interestingly enough, the journal credits calculation already took this into
account.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/ocfs2/alloc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096c9d8..60fe74035db 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5390,6 +5390,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto out;
 	}
 
+	vfs_dq_free_space_nodirty(inode,
+				  ocfs2_clusters_to_bytes(inode->i_sb, len));
+
 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
 				  dealloc);
 	if (ret) {
-- 
cgit v1.2.3


From 6139a2360987f55e4490a7813cf69df74ec8b93a Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 22 Jan 2009 15:37:47 +1100
Subject: xfs: Check buffer lengths in log recovery

Before trying to obtain, read or write a buffer,
check that the buffer length is actually valid. If
it is not valid, then something read in the recovery
process has been corrupted and we should abort
recovery.

Reported-by: Eric Sesterhenn <snakebyte@gmx.de>
Tested-by: Eric Sesterhenn <snakebyte@gmx.de>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Felix Blyakher <felixb@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98bd94..b1047de2fff 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 xfs_buf_t *
 xlog_get_bp(
 	xlog_t		*log,
-	int		num_bblks)
+	int		nbblks)
 {
-	ASSERT(num_bblks > 0);
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_get_bp(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return NULL;
+	}
 
 	if (log->l_sectbb_log) {
-		if (num_bblks > 1)
-			num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-		num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
+		if (nbblks > 1)
+			nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
 	}
-	return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
+	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
 void
@@ -102,6 +107,13 @@ xlog_bread(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bread(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bwrite(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
-- 
cgit v1.2.3


From 43f3f057c56d030546145696627f13f95735be95 Mon Sep 17 00:00:00 2001
From: Felix Blyakher <felixb@sgi.com>
Date: Thu, 22 Jan 2009 21:34:05 -0600
Subject: [XFS] Warn on transaction in flight on read-only remount

Till VFS can correctly support read-only remount without racing,
use WARN_ON instead of BUG_ON on detecting transaction in flight
after quiescing filesystem.

Signed-off-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_sync.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed035354c2..a608e72fa40 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
 	/* flush inodes and push all remaining buffers out to disk */
 	xfs_quiesce_fs(mp);
 
-	ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
-- 
cgit v1.2.3


From 33da8892a2f9e7d4b2d9a35fc80833ba2d2b1aa6 Mon Sep 17 00:00:00 2001
From: Eric Biederman <ebiederm@xmission.com>
Date: Wed, 4 Feb 2009 15:12:25 -0800
Subject: seq_file: move traverse so it can be used from seq_read

In 2.6.25 some /proc files were converted to use the seq_file
infrastructure.  But seq_files do not correctly support pread(), which
broke some userspace applications.

To handle pread correctly we can't assume that f_pos is where we left it
in seq_read.  So move traverse() so that we can eventually use it in
seq_read and thus some day support pread().

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Cc: Paul Turner <pjt@google.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 114 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 57 insertions(+), 57 deletions(-)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1c4dc..2716c12eacf 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -54,6 +54,63 @@ int seq_open(struct file *file, const struct seq_operations *op)
 }
 EXPORT_SYMBOL(seq_open);
 
+static int traverse(struct seq_file *m, loff_t offset)
+{
+	loff_t pos = 0, index;
+	int error = 0;
+	void *p;
+
+	m->version = 0;
+	index = 0;
+	m->count = m->from = 0;
+	if (!offset) {
+		m->index = index;
+		return 0;
+	}
+	if (!m->buf) {
+		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+		if (!m->buf)
+			return -ENOMEM;
+	}
+	p = m->op->start(m, &index);
+	while (p) {
+		error = PTR_ERR(p);
+		if (IS_ERR(p))
+			break;
+		error = m->op->show(m, p);
+		if (error < 0)
+			break;
+		if (unlikely(error)) {
+			error = 0;
+			m->count = 0;
+		}
+		if (m->count == m->size)
+			goto Eoverflow;
+		if (pos + m->count > offset) {
+			m->from = offset - pos;
+			m->count -= m->from;
+			m->index = index;
+			break;
+		}
+		pos += m->count;
+		m->count = 0;
+		if (pos == offset) {
+			index++;
+			m->index = index;
+			break;
+		}
+		p = m->op->next(m, p, &index);
+	}
+	m->op->stop(m, p);
+	return error;
+
+Eoverflow:
+	m->op->stop(m, p);
+	kfree(m->buf);
+	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+	return !m->buf ? -ENOMEM : -EAGAIN;
+}
+
 /**
  *	seq_read -	->read() method for sequential files.
  *	@file: the file to read from
@@ -186,63 +243,6 @@ Efault:
 }
 EXPORT_SYMBOL(seq_read);
 
-static int traverse(struct seq_file *m, loff_t offset)
-{
-	loff_t pos = 0, index;
-	int error = 0;
-	void *p;
-
-	m->version = 0;
-	index = 0;
-	m->count = m->from = 0;
-	if (!offset) {
-		m->index = index;
-		return 0;
-	}
-	if (!m->buf) {
-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
-		if (!m->buf)
-			return -ENOMEM;
-	}
-	p = m->op->start(m, &index);
-	while (p) {
-		error = PTR_ERR(p);
-		if (IS_ERR(p))
-			break;
-		error = m->op->show(m, p);
-		if (error < 0)
-			break;
-		if (unlikely(error)) {
-			error = 0;
-			m->count = 0;
-		}
-		if (m->count == m->size)
-			goto Eoverflow;
-		if (pos + m->count > offset) {
-			m->from = offset - pos;
-			m->count -= m->from;
-			m->index = index;
-			break;
-		}
-		pos += m->count;
-		m->count = 0;
-		if (pos == offset) {
-			index++;
-			m->index = index;
-			break;
-		}
-		p = m->op->next(m, p, &index);
-	}
-	m->op->stop(m, p);
-	return error;
-
-Eoverflow:
-	m->op->stop(m, p);
-	kfree(m->buf);
-	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
-	return !m->buf ? -ENOMEM : -EAGAIN;
-}
-
 /**
  *	seq_lseek -	->llseek() method for sequential files.
  *	@file: the file in question
-- 
cgit v1.2.3


From f01d1d546abb2f4028b5299092f529eefb01253a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 6 Feb 2009 00:30:05 +0300
Subject: seq_file: fix big-enough lseek() + read()

lseek() further than length of the file will leave stale ->index
(second-to-last during iteration). Next seq_read() will not notice
that ->f_pos is big enough to return 0, but will print last item
as if ->f_pos is pointing to it.

Introduced in commit cb510b8172602a66467f3551b4be1911f5a7c8c2
aka "seq_file: more atomicity in traverse()".

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/seq_file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index 2716c12eacf..5267098532b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -102,6 +102,7 @@ static int traverse(struct seq_file *m, loff_t offset)
 		p = m->op->next(m, p, &index);
 	}
 	m->op->stop(m, p);
+	m->index = index;
 	return error;
 
 Eoverflow:
-- 
cgit v1.2.3